#!/usr/bin/env python3
"""Convert Cocoa HTML documentation read from stdin into ReST on stdout.

The HTML is lightly rewritten with regular expressions, handed to the
external ``pandoc`` program for the actual conversion, and the resulting
ReST gets one final heading fix-up before being written out.
"""

import re
import subprocess
import sys

_USAGE = """
ns-html2rst - Convert Cocoa HTML documentation into ReST

usage: nshtml2rst < NSString.html > NSString.rst
"""

# The HTML rewrites must be able to match element bodies that span lines.
_HTML_FLAGS = re.MULTILINE | re.DOTALL


def _preprocess_html(html):
    """Return *html* rewritten so that pandoc emits cleaner ReST.

    NOTE(review): the tag names in these patterns were restored from the
    accompanying comments and the surviving pattern fragments -- confirm
    them against the upstream copy of this script.
    """
    # Treat <div class="declaration">...</div> as <pre>...</pre>
    html = re.sub(
        r'<div\s+class="declaration">(.*?)</div>',
        r'<pre>\1</pre>',
        html, flags=_HTML_FLAGS)

    # Strip all attributes from <pre>...</pre> containing class="..."
    # The resulting classes confound ReST
    html = re.sub(
        r'<pre\s[^>]*class=[^>]*>(.*?)</pre>',
        r'<pre>\1</pre>',
        html, flags=_HTML_FLAGS)

    # Remove links from <code>...</code>, which doesn't have a rendering in
    # ReST
    html = re.sub(
        r'<code>(.*?)<a[^>]*?>(.*?)</a>(.*?)</code>',
        r'<code>\1\2\3</code>',
        html, flags=_HTML_FLAGS)

    return html


def _fix_heading_underlines(rst):
    """Return *rst* with every apostrophe-only line turned into carets.

    Each line made up solely of ``'`` characters is replaced by a line of
    the same length made up of ``^`` characters; all other text is kept.
    """
    # HACKETY HACK HACK: Our html documents apparently contain some
    # bogus heading level nesting.  Just fix up the one we know about
    # so that ReST doesn't complain later.
    return re.sub(
        "(^|\n)('+)($|\n)",
        lambda m: m.group(1) + len(m.group(2)) * '^' + m.group(3),
        rst, flags=re.MULTILINE)


def run():
    """Run the converter: HTML on stdin, ReST on stdout.

    Any command-line argument prints the usage text and exits with
    status 0.  If pandoc exits with a non-zero status, nothing is written
    and this process exits with the same status.

    Raises:
        OSError: if the ``pandoc`` executable cannot be started.
    """
    if len(sys.argv) > 1:
        print(_USAGE)
        sys.exit(0)

    html = _preprocess_html(sys.stdin.read())

    # Let pandoc do most of the hard work.  The pipes are opened in text
    # mode (universal_newlines=True) because both the HTML we send and the
    # regex fix-up applied to the result operate on str, not bytes.
    pandoc = subprocess.Popen(
        args=['pandoc', '--reference-links', '-f', 'html', '-t', 'rst'],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )
    rst, _ = pandoc.communicate(html)

    # stderr is not piped, so pandoc's own diagnostics have already reached
    # the user; just make sure the failure is visible in our exit status.
    if pandoc.returncode != 0:
        sys.exit(pandoc.returncode)

    sys.stdout.write(_fix_heading_underlines(rst))


if __name__ == '__main__':
    run()