diff options
Diffstat (limited to 'debian/pyhtml2devhelp.py')
-rw-r--r-- | debian/pyhtml2devhelp.py | 279 |
1 files changed, 279 insertions, 0 deletions
#! /usr/bin/python3
"""Convert a tree of Python HTML documentation into a devhelp-style XML book.

Usage: pyhtml2devhelp.py BASEDIR START_PAGE VERSION

The XML document is written to standard output: a ``<chapters>`` section
built by following the table-of-contents links starting at START_PAGE, and
a ``<functions>`` section built from ``genindex-all.html`` in BASEDIR.
"""

from html.parser import HTMLParser
from xml.sax.saxutils import escape
import os
import re
import sys

# Matches a trailing parenthesised qualifier such as " (in module foo)".
_PAREN_SUFFIX_RE = re.compile(r' \([\w\-\.\s]+\)')


class PyHTMLParser(HTMLParser):
    """Print nested ``<sub>`` elements for the links of one HTML page.

    Links with class ``biglink`` are always emitted; links with class
    ``reference external`` are emitted only for the first such link after
    an ``<li>`` start tag.  Pages listed in ``pages_to_include`` are parsed
    recursively, one indentation level deeper.
    """

    # Pages whose own links are followed recursively.
    pages_to_include = {
        'whatsnew/index.html', 'tutorial/index.html', 'using/index.html',
        'reference/index.html', 'library/index.html', 'howto/index.html',
        'extending/index.html', 'c-api/index.html', 'install/index.html',
        'distutils/index.html',
    }

    # Link targets that never become chapters.
    pages_to_skip = (
        '', 'about.html', 'modindex.html', 'genindex.html', 'glossary.html',
        'search.html', 'contents.html', 'download.html', 'bugs.html',
        'license.html', 'copyright.html',
    )

    def __init__(self, basedir, fn, indent, parents=None):
        """Create a parser for page *fn* (relative to *basedir*).

        *indent* is the nesting depth used for the printed output.
        *parents* is the set of absolute paths of pages currently being
        parsed; nested parsers share the caller's set so that links back
        to a page in progress are skipped.
        """
        HTMLParser.__init__(self, convert_charrefs=True)
        self.basedir = basedir
        self.dir, self.fn = os.path.split(fn)
        self.data = ''
        # A default of set() in the signature would be shared by every
        # parser created without an explicit set, so create it here.
        self.parents = set() if parents is None else parents
        self.link = {}
        self.indent = indent
        self.last_indent = indent - 1
        self.sub_indent = 0
        self.sub_count = 0
        self.next_link = False

    def escape(self, text):
        """Return *text* escaped for use inside a double-quoted XML attribute."""
        return escape(text, {'"': '&quot;'})

    def process_link(self):
        """Print the opening ``<sub>`` element for the current link."""
        new_href = self.escape(os.path.join(self.dir, self.link['href']))
        text = self.escape(self.link['text'])
        indent = self.indent + self.sub_indent
        if self.last_indent == indent:
            # A sibling at the same depth is still open: close it first.
            print('%s</sub>' % (' ' * self.last_indent))
            self.sub_count -= 1
        print('%s<sub link="%s" name="%s">' % (' ' * indent, new_href, text))
        self.sub_count += 1
        self.last_indent = self.indent + self.sub_indent

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self.start_a(attrs)
        elif tag == 'li':
            self.start_li(attrs)

    def handle_endtag(self, tag):
        if tag == 'a':
            self.end_a()
        elif tag == 'li':
            self.end_li()

    def start_li(self, attrs):
        """Enter a list item: one level deeper, next reference link counts."""
        self.sub_indent += 1
        self.next_link = True

    def end_li(self):
        """Leave a list item, closing the ``<sub>`` that is still open."""
        if self.sub_count > 0:
            print('%s</sub>' % (' ' * self.last_indent))
            self.sub_count -= 1
            self.last_indent -= 1
        self.sub_indent -= 1

    def start_a(self, attrs):
        """Record the attributes of an ``<a>`` tag and reset the text buffer."""
        self.link = dict(attrs)
        self.data = ''

    def end_a(self):
        """Decide whether the finished link becomes a ``<sub>`` entry."""
        # Store the raw text; process_link() performs the XML escaping, so
        # escaping here as well would double-escape '&', '<' and '>'.
        self.link['text'] = self.data.replace('\t', '').replace('\n', ' ')

        # Ignore anchors without a usable href attribute.
        href = self.link.get('href')
        if href is None:
            return

        # Do not descend into a page that is currently being parsed.
        if os.path.join(self.basedir, href) in self.parents:
            return
        if href.startswith(('..', 'http:', 'mailto:', 'news:')):
            return
        if href in self.pages_to_skip:
            return

        process = False
        link_class = self.link.get('class')
        if link_class == 'biglink':
            process = True
        elif link_class == 'reference external' and self.next_link:
            # Only the first reference link after an <li> start tag counts.
            process = True
            self.next_link = False

        if process:
            self.process_link()
            if href in self.pages_to_include:
                self.parse_file(os.path.join(self.dir, href))

    def finish(self):
        """Close the last ``<sub>`` element if one is still open."""
        if self.sub_count > 0:
            print('%s</sub>' % (' ' * self.last_indent))

    def handle_data(self, data):
        self.data += data

    def parse_file(self, href):
        """Parse page *href* (relative to basedir) one level deeper."""
        # TODO: determine basedir
        parent = os.path.join(self.basedir, self.fn)
        self.parents.add(parent)
        parser = PyHTMLParser(self.basedir, href, self.indent + 1,
                              self.parents)
        with open(self.basedir + '/' + href, encoding='latin_1') as stream:
            text = stream.read()
        parser.feed(text)
        parser.finish()
        parser.close()
        # A nested parse_file() call may already have removed this entry.
        self.parents.discard(parent)


class PyIdxHTMLParser(HTMLParser):
    """Print one ``<function>`` element per link of the general index page.

    Output starts once an ``<h2 id="_">`` heading has been seen and stops
    at the end of the table that follows the heading whose id equals
    ``last_letter``.
    """

    def __init__(self, basedir, fn, indent):
        """Create a parser for index page *fn*; *indent* is the output depth."""
        HTMLParser.__init__(self, convert_charrefs=True)
        self.basedir = basedir
        self.dir, self.fn = os.path.split(fn)
        self.data = ''
        self.link = {}
        self.indent = indent
        self.active = False
        self.indented = False
        self.nolink = False
        self.header = ''
        self.last_letter = 'Z'
        # Always held in XML-escaped form (see end_dt and process_link).
        self.last_text = ''

    def escape(self, text):
        """Return *text* escaped for use inside a double-quoted XML attribute."""
        return escape(text, {'"': '&quot;'})

    def process_link(self):
        """Print a ``<function>`` element for the current link, if wanted."""
        new_href = self.escape(os.path.join(self.dir, self.link['href']))
        text = self.escape(self.link['text'])
        if not self.active:
            return
        if text.startswith('['):
            return
        if self.link.get('rel', None) in ('prev', 'parent', 'next', 'contents', 'index'):
            return
        if self.indented:
            # Sub-entry: prefix it with the text of the main entry.
            text = self.last_text + ' ' + text
        else:
            # Save it in case we need it again
            self.last_text = _PAREN_SUFFIX_RE.sub('', text)
        indent = self.indent
        print('%s<function link="%s" name="%s"/>' % (' ' * indent, new_href, text))

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self.start_a(attrs)
        elif tag == 'dl':
            self.start_dl(attrs)
        elif tag == 'dt':
            self.start_dt(attrs)
        elif tag == 'h2':
            self.start_h2(attrs)
        elif tag == 'td':
            self.start_td(attrs)
        elif tag == 'table':
            self.start_table(attrs)

    def handle_endtag(self, tag):
        if tag == 'a':
            self.end_a()
        elif tag == 'dl':
            self.end_dl()
        elif tag == 'dt':
            self.end_dt()
        elif tag == 'h2':
            self.end_h2()
        elif tag == 'td':
            self.end_td()
        elif tag == 'table':
            self.end_table()

    def start_dl(self, attrs):
        if self.last_text:
            # Looks like we found the second part to a command
            self.indented = True

    def end_dl(self):
        self.indented = False

    def start_dt(self, attrs):
        self.data = ''
        self.nolink = True

    def end_dt(self):
        if not self.active:
            return
        if self.nolink:
            # Looks like we found the first part to a command.  The text
            # is later written into an XML attribute, so escape it here.
            self.last_text = self.escape(_PAREN_SUFFIX_RE.sub('', self.data))
        self.nolink = False

    def start_h2(self, attrs):
        for k, v in attrs:
            if k == 'id':
                self.header = v
                if v == '_':
                    self.active = True

    def end_h2(self):
        pass

    def start_td(self, attrs):
        self.indented = False
        self.last_text = ''

    def end_td(self):
        pass

    def start_table(self, attrs):
        pass

    def end_table(self):
        if self.header == self.last_letter:
            self.active = False

    def start_a(self, attrs):
        self.nolink = False
        self.link = dict(attrs)
        self.data = ''

    def end_a(self):
        text = self.data.replace('\t', '').replace('\n', ' ')
        text = text.replace("Whats ", "What's ")
        self.link['text'] = text
        # Ignore anchors without a usable href attribute.
        if self.link.get('href') is None:
            return
        self.process_link()

    def handle_data(self, data):
        self.data += data

    def handle_entityref(self, name):
        # not meant to be called while convert_charrefs is true
        raise AssertionError('entityrefs should not be handled any more')


def main():
    """Write the XML book for the documentation tree named on the command line.

    Reads ``sys.argv[1:4]`` as base directory, start page and version.
    """
    base = sys.argv[1]
    fn = sys.argv[2]
    # The version is placed inside double-quoted attributes below.
    version = escape(sys.argv[3], {'"': '&quot;'})

    parser = PyHTMLParser(base, fn, indent=0)
    print('<?xml version="1.0" encoding="iso-8859-1"?>')
    print('<book title="Python %s Documentation" name="Python %s" version="%s" link="index.html">' % (version, version, version))
    print('<chapters>')
    parser.parse_file(fn)
    print('</chapters>')

    print('<functions>')

    fn = 'genindex-all.html'
    parser = PyIdxHTMLParser(base, fn, indent=1)
    with open(base + '/' + fn, encoding='latin_1') as stream:
        text = stream.read()
    parser.feed(text)
    parser.close()

    print('</functions>')
    print('</book>')


if __name__ == '__main__':
    main()