scripts/build/extract_texi_filenames.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263

#!@PYTHON@
# -*- coding: utf-8 -*-
# extract_texi_filenames.py

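# USAGE:  extract_texi_filenames.py [-o OUTDIR] [-s MODE] [-I DIR] [-m MAPFILE] FILES
#
# -o OUTDIR   write the output files to OUTDIR instead of the current directory
# -s MODE     splitting mode used to choose the file names (default: custom)
# -I DIR      also search DIR for @include'd files
# -m MAPFILE  seed the output with the entries of an existing map file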
#
# Description:
# This script parses the .texi file given and creates a file with the
# nodename <=> filename/anchor map.
# The idea behind: Unnumbered subsections go into the same file as the
# previous numbered section, @translationof gives the original node name,
# which is then used for the filename/anchor.
#
# If this script is run on a file texifile.texi, it produces a file
# texifile[.LANG].xref-map with tab-separated entries of the form
#        NODE\tFILENAME\tANCHOR
# LANG is the document language in case it's not 'en'
# Note: The filename does not have any extension appended!
# This file can then be used by our texi2html init script to determine
# the correct file name and anchor for external refs

import sys
import re
import os
import getopt

# Parse the command line.  Every short option has a long spelling; all
# of them except -h/--help take an argument.
options_list, files = getopt.getopt (
    sys.argv[1:],
    'o:s:hI:m:',
    ['output=',
     'split=',
     'help',
     'include=',
     'master-map-file='])

# Usage message.  The %(program_name)s placeholder has to be filled in
# with the %-operator before the text is printed.
help_text = r"""Usage: %(program_name)s [OPTIONS]... TEXIFILE...
Extract files names for texinfo (sub)sections from the texinfo files.

Options:
 -h, --help                     print this help
 -I, --include=DIRECTORY        append DIRECTORY to include search path
 -m, --master-map-file=FILE     use FILE as master map file
 -o, --output=DIRECTORY         write .xref-map files to DIRECTORY
 -s, --split=MODE               split manual according to MODE. Possible values
                                are section and custom (default)
"""

def help (text):
    """Write TEXT to standard output, then end the program with status 0."""
    out = sys.stdout
    out.write (text)
    raise SystemExit (0)

# Defaults for everything that can be set on the command line.
outdir = '.'            # -o: directory that receives the .xref-map files
split = "custom"        # -s: splitting mode
include_path = []       # -I: directories searched for @include'd files
master_map_file = ''    # -m: existing map file used to fill initial_map
initial_map = {}        # node name -> (node name, file name, anchor)
# Name substituted for the %(program_name)s placeholder of help_text.
# Without it, formatting the usage message fails with a KeyError.
program_name = os.path.basename (sys.argv[0])
for opt in options_list:
    o = opt[0]
    a = opt[1]
    if o in ('-h', '--help'):
        # help () does not return: it exits with status 0.
        help (help_text % {'program_name': program_name})
    elif o in ('-I', '--include'):
        # Arguments that are not existing directories are skipped silently.
        if os.path.isdir (a):
            include_path.append (a)
    elif o in ('-o', '--output'):
        outdir = a
    elif o in ('-s', '--split'):
        split = a
    elif o in ('-m', '--master-map-file'):
        # An argument that is not an existing file is ignored silently.
        if os.path.isfile (a):
            master_map_file = a
    else:
        raise Exception ('unknown option: ' + o)


# Make sure the output directory exists.  Anything else that occupies
# its name (e.g. a regular file) is removed first.
if not os.path.isdir (outdir):
    if os.path.exists (outdir):
        os.unlink (outdir)
    os.makedirs (outdir)

# An @include line that names a .texi or .itexi file; group 1 is the
# file name.  The negative lookahead skips names of the "../lily-..."
# kind.
# NOTE(review): the two dots in the lookahead are not escaped, so they
# match any two characters, not just a literal "..".
include_re = re.compile (r'@include ((?!../lily-).*?\.i?texi)$', re.MULTILINE)
# A run of one or more whitespace characters.
whitespaces = re.compile (r'\s+')
# A line consisting of a node, sectioning, heading, @lydoctitle or
# @translationof command followed by a space and its argument; group 1
# is the command name (without '@'), group 2 the argument.
section_translation_re = re.compile (
    r'^@(node'
    r'|(?:unnumbered|appendix)(?:(?:sub){0,2}sec)?'
    r'|top'
    r'|chapter'
    r'|(?:sub){0,2}section'
    r'|(?:major|chap|(?:sub){0,2})heading'
    r'|lydoctitle'
    r'|translationof'
    r') (.+)$',
    re.MULTILINE)
# The trailing "@c external..." comment of a line, together with the
# whitespace in front of it.
external_node_re = re.compile (r'\s+@c\s+external.*')

def expand_includes (m, filename):
    """Return the section list of the file named by an @include line.

    M is a match of include_re; its group 1 is the name of the included
    file.  FILENAME is the file that contains the @include line.

    The include is looked up first relative to the directory of
    FILENAME, then in each directory of include_path, in order.  The
    first candidate that exists is handed to extract_sections and the
    second element of its result is returned.  If no candidate exists,
    a message naming the include is printed and '' is returned.
    """
    include_name = m.group (1)
    search_dirs = [os.path.dirname (filename)] + include_path
    for directory in search_dirs:
        filepath = os.path.join (directory, include_name)
        if os.path.exists (filepath):
            return extract_sections (filepath)[1]
    # Report the include as written in the source, not just the last
    # candidate path that was tried.  A single parenthesised argument
    # is valid both as a Python 2 print statement and as a Python 3
    # print call.
    print ("Unable to locate include file " + include_name)
    return ''

# An @documentlanguage line; group 1 is the rest of the line.
lang_re = re.compile (r'^@documentlanguage (.+)', re.M)

def extract_sections (filename):
    """Collect the section commands of FILENAME and of its includes.

    Returns a tuple (LANG_SUFFIX, SECTIONS).  LANG_SUFFIX is '.' plus
    the argument of the first @documentlanguage line found in FILENAME
    itself, or '' if there is none or if it is 'en'.  SECTIONS is a
    string with one "@COMMAND ARGUMENT" line for every match of
    section_translation_re, after each @include line matched by
    include_re has been replaced by the section list of the included
    file (see expand_includes).
    """
    f = open (filename, 'r')
    try:
        page = f.read ()
    finally:
        # Close the file even if reading fails.
        f.close ()
    # Search document language
    m = lang_re.search (page)
    if m and m.group (1) != 'en':
        lang_suffix = '.' + m.group (1)
    else:
        lang_suffix = ''
    # Replace all includes by their list of sections and extract all sections
    page = include_re.sub (lambda m: expand_includes (m, filename), page)
    result = ''.join (["@" + command + " " + argument + "\n"
                       for (command, argument)
                       in section_translation_re.findall (page)])
    return (lang_suffix, result)

# Convert a given node name to its proper file name.  The normalization
# is explained in the texinfo manual:
# http://www.gnu.org/software/texinfo/manual/texinfo/html_node/HTML-Xref-Node-Name-Expansion.html
def texinfo_file_name(title):
    """Return the file name (without extension) for node name TITLE.

    "Top" maps to "index".  Otherwise surrounding whitespace is
    dropped, every inner run of whitespace becomes a single '-', ASCII
    letters and digits are kept, and every other character becomes
    "_xxxx" (or "__xxxxxx" for codes above 0xFFFF), where the x's are
    the character code in hexadecimal.  A result that starts with a
    digit gets the prefix 't_g'.
    """
    # exception: The top node is always mapped to index.html
    if title == "Top":
        return "index"
    # File name normalization by texinfo (described in the texinfo manual):
    # 1/2: letters and numbers are left unchanged
    # 3/4: multiple, leading and trailing whitespace is removed
    title = ' '.join (title.split ())
    # 5:   all remaining spaces are converted to '-'
    # 6:   all other 7- or 8-bit chars are replaced by _xxxx (xxxx=ascii character code)
    pieces = []
    for char in title:
        if char == ' ': # space -> '-'
            pieces.append ('-')
        elif ('0' <= char <= '9' or
              'A' <= char <= 'Z' or
              'a' <= char <= 'z'):  # number or letter
            pieces.append (char)
        else:
            ccode = ord (char)
            if ccode <= 0xFFFF:
                pieces.append ("_%04x" % ccode)
            else:
                pieces.append ("__%06x" % ccode)
    result = ''.join (pieces)
    # 7: if name begins with number, prepend 't_g' (so it starts with a letter)
    # The comparison includes '9'; a range (ord ('0'), ord ('9')) test
    # would leave it out.
    if result and '0' <= result[0] <= '9':
        result = 't_g' + result
    return result

# A Texinfo command whose brace-delimited argument contains no further
# braces; group 1 is the argument.  The command name may not contain
# whitespace, braces or '@', so a match never reaches across the text
# between two commands.
texinfo_re = re.compile (r'@[^\s{}@]*\{([^{}]*)\}')
def remove_texinfo (title):
    """Return TITLE with every @command{argument} replaced by its argument.

    All commands in TITLE are handled, including nested ones such as
    @code{@var{x}}; text outside the commands is kept unchanged.
    """
    stripped = texinfo_re.sub (r'\1', title)
    # One pass removes only the innermost commands; repeat until
    # nothing changes.  Every substitution shortens the string, so the
    # loop terminates.
    while stripped != title:
        title = stripped
        stripped = texinfo_re.sub (r'\1', title)
    return stripped

def create_texinfo_anchor (title):
    """Return the anchor for TITLE.

    The title is first passed through remove_texinfo and the result is
    then normalized with texinfo_file_name.
    """
    plain_title = remove_texinfo (title)
    return texinfo_file_name (plain_title)

# Section commands that keep the previously used file name in 'custom'
# split mode.  The pattern is tested with .match against the command
# name; because of the '.+', a bare 'unnumbered' does not match.
unnumbered_re = re.compile (r'unnumbered.+|lydoctitle')
# Level of every sectioning command; a larger number is closer to the
# top of the document.
file_name_section_level = {
    'top': 4,
    'chapter': 3, 'unnumbered': 3, 'appendix': 3,
    'section': 2, 'unnumberedsec': 2, 'appendixsec': 2,
    'subsection': 1, 'unnumberedsubsec': 1, 'appendixsubsec': 1,
    'subsubsection': 0, 'unnumberedsubsubsec': 0, 'appendixsubsubsec': 0,
}
# Level that belongs to the requested split mode, or -1 if the mode is
# not the name of one of the commands above.
splitting_level = file_name_section_level.get (split, -1)
def process_sections (filename, lang_suffix, page):
    """Write the node <=> filename/anchor map of one document.

    FILENAME is the name of the processed Texinfo file; its base name
    without extension, followed by LANG_SUFFIX and '.xref-map', is the
    name of the output file, which is created in outdir.  PAGE is the
    text that is scanned with section_translation_re.

    For every node except 'Top' one line NODE<TAB>FILENAME<TAB>ANCHOR
    is written.  The file name of a node depends on the global split
    mode and on the section command that follows the node; a
    @translationof line replaces the anchor (and, for numbered
    sections, the file name) by the one of the original node.
    Finally, all entries that are still in initial_map are appended.

    initial_map is modified: entries for nodes defined in PAGE, and for
    original nodes marked as external, are deleted from it.
    """
    sections = section_translation_re.findall (page)
    basename = os.path.splitext (os.path.basename (filename))[0]
    p = os.path.join (outdir, basename) + lang_suffix + '.xref-map'
    f = open (p, 'w')
    # try/finally: the output file is closed even if processing fails.
    try:
        this_title = ''
        this_filename = 'index'
        this_anchor = ''
        this_unnumbered = False
        had_section = False
        for sec in sections:
            if sec[0] == "node":
                # Write out the cached values to the file and start a new
                # section:
                if this_title and this_title != 'Top':
                    f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
                had_section = False
                this_title = remove_texinfo (sec[1])
                this_anchor = create_texinfo_anchor (sec[1])
                # delete entry from master map file
                if this_title in initial_map:
                    del initial_map[this_title]
            elif sec[0] == "translationof":
                # Parenthesised single argument: prints the same tuple
                # under Python 2 and Python 3.
                print (sec)
                (original_node, external_node) = external_node_re.subn ('', sec[1])
                original_node = remove_texinfo (original_node)
                # The following binds the translator to use the
                # translated node name in cross-references in case
                # it exists
                if external_node and original_node in initial_map:
                    del initial_map[original_node]
                # NOTE(review): the anchor is computed from sec[1], which
                # still contains a trailing "@c external" comment if
                # there was one -- confirm that this is intended.
                anchor = create_texinfo_anchor (sec[1])
                # If @translationof is used, it gives the original
                # node name, which we use for the anchor and the file
                # name (if it is a numbered node)
                this_anchor = anchor
                if not this_unnumbered:
                    this_filename = anchor
                elif original_node in initial_map:
                    # NOTE(review): element 2 of an initial_map entry is
                    # the anchor column of the master map, element 1 the
                    # file name column -- confirm which one is wanted.
                    this_filename = initial_map[original_node][2]
            else:
                # Some pages might not use a node for every section, so
                # treat this case here, too: If we already had a section
                # and encounter another one before the next @node, we
                # write out the old one and start with the new values
                if had_section and this_title:
                    f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")
                    this_title = remove_texinfo (sec[1])
                    this_anchor = create_texinfo_anchor (sec[1])
                had_section = True

                if split == 'custom':
                    # unnumbered nodes use the previously used file name,
                    # only numbered nodes get their own filename! However,
                    # top-level @unnumbered still get their own file.
                    this_unnumbered = unnumbered_re.match (sec[0])
                    if not this_unnumbered:
                        this_filename = this_anchor
                elif split == 'node':
                    this_filename = this_anchor
                else:
                    if sec[0] in file_name_section_level and \
                            file_name_section_level[sec[0]] >= splitting_level:
                        this_filename = this_anchor

        # Flush the entry of the last node.
        if this_title and this_title != 'Top':
            f.write (this_title + "\t" + this_filename + "\t" + this_anchor + "\n")

        # Append the entries that were not superseded above.
        for node in initial_map:
            f.write ("\t".join (initial_map[node]) + "\n")
    finally:
        f.close ()

# One line of a map file: three tab-separated fields (node name, file
# name, anchor).
xref_map_line_re = re.compile (r'(.*?)\t(.*?)\t(.*?)$')
if master_map_file:
    # Fill initial_map from the master map file, keyed by node name.
    # Lines that do not have the three fields are skipped.
    master_map = open (master_map_file)
    try:
        for line in master_map:
            m = xref_map_line_re.match (line)
            if m:
                initial_map[m.group (1)] = m.groups ()
    finally:
        # Close the master map even if reading it fails.
        master_map.close ()

# Produce one .xref-map file per Texinfo file given on the command line.
for filename in files:
    # A single parenthesised argument is valid both as a Python 2 print
    # statement and as a Python 3 print call.
    print ("extract_texi_filenames.py: Processing %s" % filename)
    (lang_suffix, sections) = extract_sections (filename)
    process_sections (filename, lang_suffix, sections)