#! /usr/bin/python3 from html.parser import HTMLParser from xml.sax.saxutils import escape import os, sys, re class PyHTMLParser(HTMLParser): pages_to_include = set(('whatsnew/index.html', 'tutorial/index.html', 'using/index.html', 'reference/index.html', 'library/index.html', 'howto/index.html', 'extending/index.html', 'c-api/index.html', 'install/index.html', 'distutils/index.html')) def __init__(self, basedir, fn, indent, parents=set()): HTMLParser.__init__(self, convert_charrefs=True) self.basedir = basedir self.dir, self.fn = os.path.split(fn) self.data = '' self.parents = parents self.link = {} self.indent = indent self.last_indent = indent - 1 self.sub_indent = 0 self.sub_count = 0 self.next_link = False def escape(self, text): return escape(text, {'"': '"'}) def process_link(self): new_href = self.escape(os.path.join(self.dir, self.link['href'])) text = self.escape(self.link['text']) indent = self.indent + self.sub_indent if self.last_indent == indent: print('%s' % (' ' * self.last_indent)) self.sub_count -= 1 print('%s' % (' ' * indent, new_href, text)) self.sub_count += 1 self.last_indent = self.indent + self.sub_indent def handle_starttag(self, tag, attrs): if tag == 'a': self.start_a(attrs) elif tag == 'li': self.start_li(attrs) def handle_endtag(self, tag): if tag == 'a': self.end_a() elif tag == 'li': self.end_li() def start_li(self, attrs): self.sub_indent += 1 self.next_link = True def end_li(self): indent = self.indent + self.sub_indent if self.sub_count > 0: print('%s' % (' ' * self.last_indent)) self.sub_count -= 1 self.last_indent -= 1 self.sub_indent -= 1 def start_a(self, attrs): self.link = {} for attr in attrs: self.link[attr[0]] = attr[1] self.data = '' def end_a(self): process = False text = self.escape(self.data.replace('\t', '').replace('\n', ' ')) self.link['text'] = text # handle a tag without href attribute try: href = self.link['href'] except KeyError: return abs_href = os.path.join(self.basedir, href) if abs_href in self.parents: return if href.startswith('..') or href.startswith('http:') \ or href.startswith('mailto:') or href.startswith('news:'): return if href in ('', 'about.html', 'modindex.html', 'genindex.html', 'glossary.html', 'search.html', 'contents.html', 'download.html', 'bugs.html', 'license.html', 'copyright.html'): return if 'class' in self.link: if self.link['class'] in ('biglink'): process = True if self.link['class'] in ('reference external'): if self.next_link: process = True next_link = False if process == True: self.process_link() if href in self.pages_to_include: self.parse_file(os.path.join(self.dir, href)) def finish(self): if self.sub_count > 0: print('%s' % (' ' * self.last_indent)) def handle_data(self, data): self.data += data def parse_file(self, href): # TODO basedir bestimmen parent = os.path.join(self.basedir, self.fn) self.parents.add(parent) parser = PyHTMLParser(self.basedir, href, self.indent + 1, self.parents) text = open(self.basedir + '/' + href, encoding='latin_1').read() parser.feed(text) parser.finish() parser.close() if parent in self.parents: self.parents.remove(parent) class PyIdxHTMLParser(HTMLParser): def __init__(self, basedir, fn, indent): HTMLParser.__init__(self, convert_charrefs=True) self.basedir = basedir self.dir, self.fn = os.path.split(fn) self.data = '' self.link = {} self.indent = indent self.active = False self.indented = False self.nolink = False self.header = '' self.last_letter = 'Z' self.last_text = '' def escape(self, text): return escape(text, {'"': '"'}) def process_link(self): new_href = self.escape(os.path.join(self.dir, self.link['href'])) text = self.escape(self.link['text']) if not self.active: return if text.startswith('['): return if self.link.get('rel', None) in ('prev', 'parent', 'next', 'contents', 'index'): return if self.indented: text = self.last_text + ' ' + text else: # Save it in case we need it again self.last_text = re.sub(' \([\w\-\.\s]+\)', '', text) indent = self.indent print('%s' % (' ' * indent, new_href, text)) def handle_starttag(self, tag, attrs): if tag == 'a': self.start_a(attrs) elif tag == 'dl': self.start_dl(attrs) elif tag == 'dt': self.start_dt(attrs) elif tag == 'h2': self.start_h2(attrs) elif tag == 'td': self.start_td(attrs) elif tag == 'table': self.start_table(attrs) def handle_endtag(self, tag): if tag == 'a': self.end_a() elif tag == 'dl': self.end_dl() elif tag == 'dt': self.end_dt() elif tag == 'h2': self.end_h2() elif tag == 'td': self.end_td() elif tag == 'table': self.end_table() def start_dl(self, attrs): if self.last_text: # Looks like we found the second part to a command self.indented = True def end_dl(self): self.indented = False def start_dt(self, attrs): self.data = '' self.nolink = True def end_dt(self): if not self.active: return if self.nolink == True: # Looks like we found the first part to a command self.last_text = re.sub(' \([\w\-\.\s]+\)', '', self.data) self.nolink = False def start_h2(self, attrs): for k, v in attrs: if k == 'id': self.header = v if v == '_': self.active = True def end_h2(self): pass def start_td(self, attrs): self.indented = False self.last_text = '' def end_td(self): pass def start_table(self, attrs): pass def end_table(self): if self.header == self.last_letter: self.active = False def start_a(self, attrs): self.nolink = False self.link = {} for attr in attrs: self.link[attr[0]] = attr[1] self.data = '' def end_a(self): text = self.data.replace('\t', '').replace('\n', ' ') text = text.replace("Whats ", "What's ") self.link['text'] = text # handle a tag without href attribute try: href = self.link['href'] except KeyError: return self.process_link() def handle_data(self, data): self.data += data def handle_entityref(self, name): # not meant to be called while convert_charrefs is true raise AssertionError('entityrefs should not be handled any more') def main(): base = sys.argv[1] fn = sys.argv[2] version = escape(sys.argv[3]) parser = PyHTMLParser(base, fn, indent=0) print('') print('' % (version, version, version)) print('') parser.parse_file(fn) print('') print('') fn = 'genindex-all.html' parser = PyIdxHTMLParser(base, fn, indent=1) text = open(base + '/' + fn, encoding='latin_1').read() parser.feed(text) parser.close() print('') print('') main()