#!@PYTHON@ # -*- coding: utf-8 -*- # extract_texi_filenames.py # USAGE: extract_texi_filenames.py [-o OUTDIR] FILES # # -o OUTDIR specifies that output files should rather be written in OUTDIR # # Description: # This script parses the .texi file given and creates a file with the # nodename <=> filename/anchor map. # The idea behind: Unnumbered subsections go into the same file as the # previous numbered section, @translationof gives the original node name, # which is then used for the filename/anchor. # # If this script is run on a file texifile.texi, it produces a file # texifile[.LANG].xref-map with tab-separated entries of the form # NODE\tFILENAME\tANCHOR # LANG is the document language in case it's not 'en' # Note: The filename does not have any extension appended! # This file can then be used by our texi2html init script to determine # the correct file name and anchor for external refs import sys import re import os import getopt options_list, files = getopt.getopt (sys.argv[1:],'o:s:hI:m:', ['output=', 'split=', 'help', 'include=', 'master-map-file=']) help_text = r"""Usage: %(program_name)s [OPTIONS]... TEXIFILE... Extract files names for texinfo (sub)sections from the texinfo files. Options: -h, --help print this help -I, --include=DIRECTORY append DIRECTORY to include search path -m, --master-map-file=FILE use FILE as master map file -o, --output=DIRECTORY write .xref-map files to DIRECTORY -s, --split=MODE split manual according to MODE. Possible values are section and custom (default) """ def help (text): sys.stdout.write ( text) sys.exit (0) outdir = '.' split = "custom" include_path = [] master_map_file = '' initial_map = {} for opt in options_list: o = opt[0] a = opt[1] if o == '-h' or o == '--help': help (help_text % vars ()) if o == '-I' or o == '--include': if os.path.isdir (a): include_path.append (a) elif o == '-o' or o == '--output': outdir = a elif o == '-s' or o == '--split': split = a elif o == '-m' or o == '--master-map-file': if os.path.isfile (a): master_map_file = a else: raise Exception ('unknown option: ' + o) if not os.path.isdir (outdir): if os.path.exists (outdir): os.unlink (outdir) os.makedirs (outdir) include_re = re.compile (r'@include ((?!../lily-).*?\.i?texi)$', re.M) whitespaces = re.compile (r'\s+') section_translation_re = re.compile ('^@(node|(?:unnumbered|appendix)\ (?:(?:sub){0,2}sec)?|top|chapter|(?:sub){0,2}section|\ (?:major|chap|(?:sub){0,2})heading|lydoctitle|translationof) \ (.+)$', re.MULTILINE) external_node_re = re.compile (r'\s+@c\s+external.*') def expand_includes (m, filename): filepath = os.path.join (os.path.dirname (filename), m.group(1)) if os.path.exists (filepath): return extract_sections (filepath)[1] else: for directory in include_path: filepath = os.path.join (directory, m.group(1)) if os.path.exists (filepath): return extract_sections (filepath)[1] print "Unable to locate include file " + filepath return '' lang_re = re.compile (r'^@documentlanguage (.+)', re.M) def extract_sections (filename): result = '' f = open (filename, 'r') page = f.read () f.close() # Search document language m = lang_re.search (page) if m and m.group (1) != 'en': lang_suffix = '.' + m.group (1) else: lang_suffix = '' # Replace all includes by their list of sections and extract all sections page = include_re.sub (lambda m: expand_includes (m, filename), page) sections = section_translation_re.findall (page) for sec in sections: result += "@" + sec[0] + " " + sec[1] + "\n" return (lang_suffix, result) # Convert a given node name to its proper file name (normalization as # explained in the texinfo manual: # http://www.gnu.org/software/texinfo/manual/texinfo/html_node/HTML-Xref-Node-Name-Expansion.html def texinfo_file_name(title): # exception: The top node is always mapped to index.html if title == "Top": return "index" # File name normalization by texinfo (described in the texinfo manual): # 1/2: letters and numbers are left unchanged # 3/4: multiple, leading and trailing whitespace is removed title = title.strip (); title = whitespaces.sub (' ', title) # 5: all remaining spaces are converted to '-' # 6: all other 7- or 8-bit chars are replaced by _xxxx (xxxx=ascii character code) result = '' for index in range(len(title)): char = title[index] if char == ' ': # space -> '-' result += '-' elif ( ('0' <= char and char <= '9' ) or ('A' <= char and char <= 'Z' ) or ('a' <= char and char <= 'z' ) ): # number or letter result += char else: ccode = ord(char) if ccode <= 0xFFFF: result += "_%04x" % ccode else: result += "__%06x" % ccode # 7: if name begins with number, prepend 't_g' (so it starts with a letter) if (result != '') and (ord(result[0]) in range (ord('0'), ord('9'))): result = 't_g' + result return result texinfo_re = re.compile (r'@.*{(.*)}') def remove_texinfo (title): return texinfo_re.sub (r'\1', title) def create_texinfo_anchor (title): return texinfo_file_name (remove_texinfo (title)) unnumbered_re = re.compile (r'unnumbered.+|lydoctitle') file_name_section_level = { 'top': 4, 'chapter':3, 'unnumbered':3, 'appendix':3, 'section':2, 'unnumberedsec':2, 'appendixsec':2, 'subsection':1, 'unnumberedsubsec':1, 'appendixsubsec':1, 'subsubsection':0, 'unnumberedsubsubsec':0, 'appendixsubsubsec':0 } if split in file_name_section_level: splitting_level = file_name_section_level[split] else: splitting_level = -1 def process_sections (filename, lang_suffix, page): sections = section_translation_re.findall (page) basename = os.path.splitext (os.path.basename (filename))[0] p = os.path.join (outdir, basename) + lang_suffix + '.xref-map' f = open (p, 'w') this_title = '' this_filename = 'index' this_anchor = '' this_unnumbered = False had_section = False for sec in sections: if sec[0] == "node": # Write out the cached values to the file and start a new # section: if this_title and this_title != 'Top': f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n") had_section = False this_title = remove_texinfo (sec[1]) this_anchor = create_texinfo_anchor (sec[1]) # delete entry from master map file if this_title in initial_map: del initial_map[this_title] elif sec[0] == "translationof": print sec (original_node, external_node) = external_node_re.subn ('', sec[1]) original_node = remove_texinfo (original_node) # The following binds the translator to use the # translated node name in cross-references in case # it exists if external_node and original_node in initial_map: del initial_map[original_node] anchor = create_texinfo_anchor (sec[1]) # If @translationof is used, it gives the original # node name, which we use for the anchor and the file # name (if it is a numbered node) this_anchor = anchor if not this_unnumbered: this_filename = anchor elif original_node in initial_map: this_filename = initial_map[original_node][2] else: # Some pages might not use a node for every section, so # treat this case here, too: If we already had a section # and encounter another one before the next @node, we # write out the old one and start with the new values if had_section and this_title: f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n") this_title = remove_texinfo (sec[1]) this_anchor = create_texinfo_anchor (sec[1]) had_section = True if split == 'custom': # unnumbered nodes use the previously used file name, # only numbered nodes get their own filename! However, # top-level @unnumbered still get their own file. this_unnumbered = unnumbered_re.match (sec[0]) if not this_unnumbered: this_filename = this_anchor elif split == 'node': this_filename = this_anchor else: if sec[0] in file_name_section_level and \ file_name_section_level[sec[0]] >= splitting_level: this_filename = this_anchor if this_title and this_title != 'Top': f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n") for node in initial_map: f.write ("\t".join (initial_map[node]) + "\n") f.close () xref_map_line_re = re.compile (r'(.*?)\t(.*?)\t(.*?)$') if master_map_file: for line in open (master_map_file): m = xref_map_line_re.match (line) if m: initial_map[m.group (1)] = (m.group (1), m.group (2), m.group (3)) for filename in files: print "extract_texi_filenames.py: Processing %s" % filename (lang_suffix, sections) = extract_sections (filename) process_sections (filename, lang_suffix, sections)