summary | refs | log | tree | commit | diff
path: root/debian/pyhtml2devhelp.py
diff options
context:
space:
mode:
Diffstat (limited to 'debian/pyhtml2devhelp.py')
-rw-r--r-- debian/pyhtml2devhelp.py | 279
1 files changed, 279 insertions, 0 deletions
diff --git a/debian/pyhtml2devhelp.py b/debian/pyhtml2devhelp.py
new file mode 100644
index 0000000..7d1c8d9
--- /dev/null
+++ b/debian/pyhtml2devhelp.py
@@ -0,0 +1,279 @@
+#! /usr/bin/python3
+
+from html.parser import HTMLParser
+from xml.sax.saxutils import escape
+import os, sys, re
+
class PyHTMLParser(HTMLParser):
    """Print devhelp ``<sub>`` chapter elements for the links of an HTML page.

    The parser watches ``<li>`` and ``<a>`` tags.  Selected links are written
    to stdout as ``<sub link="..." name="...">`` elements, nested according
    to the ``<li>`` depth, and pages listed in ``pages_to_include`` are
    parsed recursively by a child parser.
    """

    # Pages (relative to the directory of the current file) that are
    # descended into after their own <sub> element has been opened.
    pages_to_include = {
        'whatsnew/index.html', 'tutorial/index.html', 'using/index.html',
        'reference/index.html', 'library/index.html', 'howto/index.html',
        'extending/index.html', 'c-api/index.html', 'install/index.html',
        'distutils/index.html',
    }

    # hrefs starting with one of these never become chapter entries.
    _SKIPPED_PREFIXES = ('..', 'http:', 'https:', 'mailto:', 'news:')

    # hrefs that are ignored outright.
    _SKIPPED_PAGES = ('', 'about.html', 'modindex.html', 'genindex.html',
                      'glossary.html', 'search.html', 'contents.html',
                      'download.html', 'bugs.html', 'license.html',
                      'copyright.html')

    def __init__(self, basedir, fn, indent, parents=None):
        """Create a parser for file *fn* below *basedir*.

        *indent* is the nesting level used for the printed elements.
        *parents* is the set of files currently being parsed; it is shared
        (not copied) with child parsers.  A fresh set is created when it is
        omitted, so separate top-level parsers never share state.
        """
        HTMLParser.__init__(self, convert_charrefs=True)
        self.basedir = basedir
        self.dir, self.fn = os.path.split(fn)
        self.data = ''
        # "is None" rather than "or": an explicitly passed empty set must
        # stay shared with the caller.
        self.parents = set() if parents is None else parents
        self.link = {}
        self.indent = indent
        self.last_indent = indent - 1
        self.sub_indent = 0
        self.sub_count = 0
        self.next_link = False

    def escape(self, text):
        """Return *text* escaped for use inside a double-quoted XML attribute."""
        return escape(text, {'"': '&quot;'})

    def process_link(self):
        """Print the opening ``<sub>`` element for the current link.

        If the previously printed element sits at the same level, it is
        closed first.
        """
        new_href = self.escape(os.path.join(self.dir, self.link['href']))
        text = self.escape(self.link['text'])
        indent = self.indent + self.sub_indent
        if self.last_indent == indent:
            print('%s</sub>' % (' ' * self.last_indent))
            self.sub_count -= 1
        print('%s<sub link="%s" name="%s">' % (' ' * indent, new_href, text))
        self.sub_count += 1
        self.last_indent = indent

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self.start_a(attrs)
        elif tag == 'li':
            self.start_li(attrs)

    def handle_endtag(self, tag):
        if tag == 'a':
            self.end_a()
        elif tag == 'li':
            self.end_li()

    def start_li(self, attrs):
        """Enter a list item: go one level deeper and arm ``next_link``."""
        self.sub_indent += 1
        self.next_link = True

    def end_li(self):
        """Leave a list item, closing the ``<sub>`` element still open for it."""
        if self.sub_count > 0:
            print('%s</sub>' % (' ' * self.last_indent))
            self.sub_count -= 1
            self.last_indent -= 1
        self.sub_indent -= 1

    def start_a(self, attrs):
        """Record the attributes of an ``<a>`` tag and restart text collection."""
        self.link = dict(attrs)
        self.data = ''

    def end_a(self):
        """Decide whether the link that just ended becomes a chapter entry."""
        # Store the raw text; process_link() performs the one and only
        # XML escaping step.
        self.link['text'] = self.data.replace('\t', '').replace('\n', ' ')

        # An <a> without href, or with a valueless href, is not a link.
        href = self.link.get('href')
        if href is None:
            return

        # Do not descend into a file that is currently being parsed.
        if os.path.join(self.basedir, href) in self.parents:
            return
        if href.startswith(self._SKIPPED_PREFIXES):
            return
        if href in self._SKIPPED_PAGES:
            return

        css_class = self.link.get('class')
        process = css_class == 'biglink'
        # Only the first 'reference external' link after an <li> is used.
        if css_class == 'reference external' and self.next_link:
            process = True
            self.next_link = False

        if process:
            self.process_link()
            if href in self.pages_to_include:
                self.parse_file(os.path.join(self.dir, href))

    def finish(self):
        """Close one ``<sub>`` element if any are still open."""
        if self.sub_count > 0:
            print('%s</sub>' % (' ' * self.last_indent))

    def handle_data(self, data):
        self.data += data

    def parse_file(self, href):
        """Parse file *href* (relative to ``basedir``) with a child parser.

        The current file is registered in ``parents`` while the child runs
        so that links pointing back to it are ignored.
        """
        # TODO: determine basedir
        # NOTE(review): the key uses only the base name of the current file
        # (self.fn), while end_a() looks up basedir + href -- confirm that
        # this is intended for files in sub-directories.
        parent = os.path.join(self.basedir, self.fn)
        self.parents.add(parent)
        parser = PyHTMLParser(self.basedir, href, self.indent + 1,
                              self.parents)
        with open(self.basedir + '/' + href, encoding='latin_1') as stream:
            text = stream.read()
        parser.feed(text)
        parser.finish()
        parser.close()
        self.parents.discard(parent)
+
class PyIdxHTMLParser(HTMLParser):
    """Print devhelp ``<function>`` elements for the links of an index page.

    Output starts once an ``<h2>`` whose ``id`` is ``'_'`` has been seen and
    stops at the end of the table that follows the header whose ``id``
    equals ``last_letter``.  Each accepted link is written to stdout as
    ``<function link="..." name="..."/>``.
    """

    # A parenthesised qualifier preceded by a space, e.g. " (in module os)".
    # It is stripped from an entry before the entry is used as the prefix
    # of its sub-entries.
    _QUALIFIER_RE = re.compile(r' \([\w\-\.\s]+\)')

    # Navigation links carrying one of these rel values are not index entries.
    _NAVIGATION_RELS = ('prev', 'parent', 'next', 'contents', 'index')

    def __init__(self, basedir, fn, indent):
        """Create a parser for index file *fn*; *indent* is the output level."""
        HTMLParser.__init__(self, convert_charrefs=True)
        self.basedir = basedir
        self.dir, self.fn = os.path.split(fn)
        self.data = ''
        self.link = {}
        self.indent = indent
        self.active = False
        self.indented = False
        self.nolink = False
        self.header = ''
        self.last_letter = 'Z'
        # Unescaped text of the most recent top-level entry.
        self.last_text = ''

    def escape(self, text):
        """Return *text* escaped for use inside a double-quoted XML attribute."""
        return escape(text, {'"': '&quot;'})

    def process_link(self):
        """Print a ``<function>`` element for the current link, if wanted."""
        new_href = self.escape(os.path.join(self.dir, self.link['href']))
        text = self.link['text']
        if not self.active:
            return
        if text.startswith('['):
            return
        if self.link.get('rel') in self._NAVIGATION_RELS:
            return
        if self.indented:
            # Sub-entry: prefix it with the entry it belongs to.
            text = self.last_text + ' ' + text
        else:
            # Save it in case we need it again
            self.last_text = self._QUALIFIER_RE.sub('', text)
        # Escape exactly once, here, so that a prefix captured from a
        # link-less <dt> is escaped as well.
        print('%s<function link="%s" name="%s"/>'
              % (' ' * self.indent, new_href, self.escape(text)))

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self.start_a(attrs)
        elif tag == 'dl':
            self.start_dl(attrs)
        elif tag == 'dt':
            self.start_dt(attrs)
        elif tag == 'h2':
            self.start_h2(attrs)
        elif tag == 'td':
            self.start_td(attrs)
        elif tag == 'table':
            self.start_table(attrs)

    def handle_endtag(self, tag):
        if tag == 'a':
            self.end_a()
        elif tag == 'dl':
            self.end_dl()
        elif tag == 'dt':
            self.end_dt()
        elif tag == 'h2':
            self.end_h2()
        elif tag == 'td':
            self.end_td()
        elif tag == 'table':
            self.end_table()

    def start_dl(self, attrs):
        if self.last_text:
            # Looks like we found the second part to a command
            self.indented = True

    def end_dl(self):
        self.indented = False

    def start_dt(self, attrs):
        self.data = ''
        # Stays True until an <a> is seen inside this <dt>.
        self.nolink = True

    def end_dt(self):
        if not self.active:
            return
        if self.nolink:
            # Looks like we found the first part to a command
            self.last_text = self._QUALIFIER_RE.sub('', self.data)
        self.nolink = False

    def start_h2(self, attrs):
        """Remember the header id; the ``'_'`` header switches output on."""
        for key, value in attrs:
            if key == 'id':
                self.header = value
                if value == '_':
                    self.active = True

    def end_h2(self):
        pass

    def start_td(self, attrs):
        # A new table cell starts a fresh group of entries.
        self.indented = False
        self.last_text = ''

    def end_td(self):
        pass

    def start_table(self, attrs):
        pass

    def end_table(self):
        # The table after the last letter's header ends the index proper.
        if self.header == self.last_letter:
            self.active = False

    def start_a(self, attrs):
        """Record the attributes of an ``<a>`` tag and restart text collection."""
        self.nolink = False
        self.link = dict(attrs)
        self.data = ''

    def end_a(self):
        """Store the link text and hand real links to ``process_link``."""
        text = self.data.replace('\t', '').replace('\n', ' ')
        text = text.replace("Whats ", "What's ")
        self.link['text'] = text
        # An <a> without href, or with a valueless href, is not a link.
        if self.link.get('href') is None:
            return
        self.process_link()

    def handle_data(self, data):
        self.data += data

    def handle_entityref(self, name):
        # not meant to be called while convert_charrefs is true
        raise AssertionError('entityrefs should not be handled any more')
+
def main():
    """Write a devhelp book description to stdout.

    Command line arguments: the base directory of the HTML documentation,
    the name of the start page inside it, and the Python version used for
    the book's title, name and version attributes.  The chapter tree is
    produced from the start page and the function list from
    ``genindex-all.html`` in the base directory.
    """
    if len(sys.argv) < 4:
        sys.exit('usage: %s BASEDIR STARTPAGE VERSION' % sys.argv[0])
    base = sys.argv[1]
    fn = sys.argv[2]
    # The version ends up inside double-quoted attributes, so quotes must
    # be escaped as well.
    version = escape(sys.argv[3], {'"': '&quot;'})

    parser = PyHTMLParser(base, fn, indent=0)
    # NOTE(review): the declaration promises iso-8859-1, but print() uses
    # whatever encoding stdout has -- confirm how the output is redirected.
    print('<?xml version="1.0" encoding="iso-8859-1"?>')
    print('<book title="Python %s Documentation" name="Python %s" version="%s" link="index.html">' % (version, version, version))
    print('<chapters>')
    parser.parse_file(fn)
    print('</chapters>')

    print('<functions>')

    fn = 'genindex-all.html'
    parser = PyIdxHTMLParser(base, fn, indent=1)
    with open(base + '/' + fn, encoding='latin_1') as stream:
        text = stream.read()
    parser.feed(text)
    parser.close()

    print('</functions>')
    print('</book>')


# Only run when executed as a script, so that importing the module has no
# side effects.
if __name__ == '__main__':
    main()