Blame - scripts/html2markdown.py - chromium.googlesource.com/website

blob: 280223e34b816999294709609d391ce5e5e91bf2 [file] [log] [blame]

Dirk Pranke	7bbb547	2021-11-02 16:33:21 -0700	[diff] [blame]	1	# Copyright 2021 Google LLC
				2	#
				3	# Licensed under the Apache License, Version 2.0 (the "License");
				4	# you may not use this file except in compliance with the License.
				5	# You may obtain a copy of the License at
				6	#
				7	# https://www.apache.org/licenses/LICENSE-2.0
				8	#
				9	# Unless required by applicable law or agreed to in writing, software
				10	# distributed under the License is distributed on an "AS IS" BASIS,
				11	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				12	# See the License for the specific language governing permissions and
				13	# limitations under the License.
				14
				15	"""HTML to Markdown renderer."""
				16
				17	import os
				18	import re
				19	import io
				20	import textwrap
				21	import urllib
				22	import xml.sax
				23
				24
				25	class _Flags:
				26	# Whether to render h1s and h2s with underlined - and =.
				27	underline_headers = False
				28
				29	# The set of characters to escape with \'\\\' in the
				30	# Markdown. This is not the set of all special Markdown
				31	# characters, but rather those characters that tend to
				32	# get misinterpreted as Markdown syntax the most. Blindly
				33	# escaping all special Markdown characters results in ugly
				34	# Markdown.
				35	escape_chars = r'\`*[]'
				36
				37	# Format for italic tags.
				38	italic_format = '*'
				39
				40	# Format for bold tags.
				41	bold_format = '**'
				42
				43	# Format for strikethrough tags.
				44	strike_format = '~~'
				45
				46	# Format for underline tags.
				47	highlight_format = '=='
				48
				49	# Number of spaces to indent an unordered list.
				50	# This total includes the bullet.
				51	# For example, a value of 4 yields '* '
				52	unordered_list_indent = 4
				53
				54	# Number of spaces to indent an ordered list.
				55	# This total includes the number.
				56	# For example, a value of 4 yields '1. '
				57	ordered_list_indent = 4
				58
				59	# The DIV blocks that should be formatted as code.
				60	code_class_regex = r'^sites-codeblock sites-codesnippet-block$'
				61
				62	# The class of DIV blocks used for table of contents.
				63	toc_class_regex = r'^sites-embed-content sites-embed-type-toc$'
				64
				65	# The class of DIV blocks that should be ignored.
				66	ignore_class_regex = r''
				67
				68	# The style of DIV blocks that should be ignored.
				69	ignore_style_regex = r'^display:none;$'
				70
				71	# Format text blocks to the given line width. Set to zero
				72	# to disable line wrapping.
				73	line_width = 80
				74
				75	# Whether to use indented code blocks, if False use fenced.
				76	indented_code_blocks = False
				77
				78	# Whether to use HTML code blocks instead of fenced code
				79	# blocks if source code block includes formatted text.
				80	allow_html_code_blocks = True
				81
				82	# Links that are automatically recognized by the renderer.
				83	shortlinks_regex = r'^http://(ag\|b\|cl\|g\|go\|who)/'
				84
				85	# Print the fragment tree for debugging.
				86	debug_print_tree = False
				87
				88
				89	FLAGS = _Flags()
				90
				91
				92	def _EscapeText(text, reserved_chars):
				93	"""Escapes any reserved characters with a backslash.
				94
				95	Args:
				96	text: The string to escape.
				97	reserved_chars: A string of reserved characters that need to be escaped.
				98
				99	Returns:
				100	The escaped text.
				101	"""
				102	markdown = io.StringIO()
				103	for c in text:
				104	if c in reserved_chars:
				105	markdown.write('\\')
				106	markdown.write(c)
				107	return markdown.getvalue()
				108
				109
				110	def _EscapeContentForHtml(text):
				111	result = io.StringIO()
				112	escapes = {'<': '<', '>': '>'}
				113	for c in text:
				114	result.write(c if c not in escapes else escapes[c])
				115	return result
				116
				117
				118	ENCODED_NEWLINE = '&#%d;' % ord('\n')
				119
				120
				121	def _RestoreEncodedNewlines(text):
				122	return text.replace(ENCODED_NEWLINE, '\n')
				123
				124
				125	def _WrapLine(line, indent):
				126	"""Wraps the line to fit into the column limit.
				127
				128	Args:
				129	line: The string to wrap.
				130	indent: An integer with the number of columns of indentation.
				131
				132	Returns:
				133	The wrapped text.
				134	"""
				135	if FLAGS.line_width > 0:
				136	return ('\n' + ' ' * indent).join(textwrap.wrap(
				137	line,
				138	width=FLAGS.line_width - indent,
				139	break_long_words=False,
				140	break_on_hyphens=False))
				141	return line
				142
				143
				144	class Fragment:
				145	"""Base class for all output fragments.
				146
				147	To generate a line of output, the methods will be called in the following
				148	order:
				149
				150	WriteIndent()
				151	WriteContentIntoParentAndClear()
				152	ConsumeContent() -- for the topmost fragment only
				153	StripLine()
				154	WrapLine()
				155	"""
				156
				157	def __init__(self, indent, prefix, suffix):
				158	self._content = io.StringIO()
				159	self._indent = indent
				160	self._prefix = prefix
				161	self._suffix = suffix
				162	self._parent = None
				163	self._children = []
				164
				165	def __repr__(self):
				166	debug_print = lambda text: text.encode('utf-8') if text else ''
				167	return ('{' +
				168	self.__class__.__name__ +
				169	': indent=' + debug_print(self._indent) +
				170	'; prefix=' + debug_print(self._prefix) +
				171	'; content=' + debug_print(self._content.getvalue()) +
				172	'; suffix=' + debug_print(self._suffix) +
				173	'}')
				174
				175	def SetParent(self, parent):
				176	self._parent = parent
				177
				178	def AddChild(self, node):
				179	self._children.append(node)
				180	node.SetParent(self)
				181	return node
				182
				183	def GetChildren(self):
				184	return self._children
				185
				186	def _AllChildren(self):
				187	all_children = []
				188	def Traverse(fragment):
				189	for c in fragment.GetChildren():
				190	all_children.append(c)
				191	Traverse(c)
				192	Traverse(self)
				193	return all_children
				194
				195	def WriteIndent(self, output):
				196	if self._indent:
				197	output.write(self._indent)
				198
				199	def WriteContentIntoParentAndClear(self):
				200	self._WriteContent(self._parent._content) # pylint: disable=protected-access
				201	self._ClearContent()
				202	self._children = []
				203
				204	def _WriteContent(self, output):
				205	"""Implementation of content rendering. Can be overridden in subclasses."""
				206	self._Write(output, self._prefix, self._content.getvalue(), self._suffix)
				207
				208	def _Write(self, output, prefix, content, suffix):
				209	"""Default implementation of content rendering for reuse by subclasses."""
				210	has_content = bool(content.strip())
				211	if prefix and has_content:
				212	output.write(prefix)
				213	output.write(content)
				214	if suffix and has_content:
				215	output.write(suffix)
				216
				217	def UnsetSuffix(self):
				218	self._suffix = ''
				219
				220	def UnsetPrefix(self):
				221	self._prefix = ''
				222
				223	def _UpdatePrefixAndSuffix(self, prefix, suffix):
				224	if self._prefix:
				225	self._prefix = prefix
				226	if self._suffix:
				227	self._suffix = suffix
				228
				229	def _ClearContent(self):
				230	"""Clears the content. This will only be called after it's been written."""
				231	self._content = io.StringIO()
				232
				233	def ConsumeContent(self):
				234	content = self._content
				235	self._ClearContent()
				236	return content
				237
				238	def Append(self, text):
				239	"""Appends text.
				240
				241	Args:
				242	text: The string to append, it will be escaped.
				243	"""
				244	assert isinstance(text, str)
				245	self._content.write(self.EscapeText(text))
				246
				247	def EscapeText(self, text):
				248	"""Escapes any reserved characters when Append() is called with text.
				249
				250	By default this defers to the parent fragment.
				251
				252	Args:
				253	text: The string to escape.
				254
				255	Returns:
				256	The escaped string.
				257	"""
				258	if self._parent:
				259	return self._parent.EscapeText(text)
				260	return text
				261
				262	def StripLine(self, text):
				263	"""Does any needed stripping of whitespace.
				264
				265	Some blocks (code for example) will want to preserve whitespace, while
				266	others will want to coalesce it together. By default this defers to the
				267	parent fragment.
				268
				269	Args:
				270	text: The string to strip
				271
				272	Returns:
				273	The stripped string.
				274	"""
				275	if self._parent:
				276	return self._parent.StripLine(text)
				277	return text
				278
				279	def WrapLine(self, line, indent):
				280	"""Wraps the line to fit into the column limit, if necessary.
				281
				282	Most blocks (code for example) will want to preserve whitespace and won't
				283	break their output.
				284
				285	Args:
				286	text: The string to wrap.
				287	indent: Indent string.
				288	Returns:
				289	The wrapped string.
				290	"""
				291	del indent
				292	return line
				293
				294	def NeedsToMergeWith(self, text):
				295	del text
				296	return False
				297
				298
				299	class HTML(Fragment):
				300	"""Markdown fragment that consists of just an unescaped HTML string."""
				301
				302	def __init__(self, prefix=None, suffix=None):
				303	super().__init__(indent=None, prefix=prefix, suffix=suffix)
				304
				305	def EscapeText(self, text):
				306	return text
				307
				308
				309	class Href(Fragment):
				310	"""HTML fragment containing an <a href=> tag. Used within table cells.
				311
				312	If the href falls within a table cell, using a Href() element will allow
				313	us to have proper formatting; the Markdown-style Link() element will not
				314	be processed properly.
				315	"""
				316	def __init__(self, href):
				317	super().__init__(indent=None, prefix='<a href="%s">' % href, suffix='</a>')
				318
				319
				320	class Text(Fragment):
				321	"""Markdown fragment that consists of just a string."""
				322
				323	def __init__(self, indent=None, prefix=None, suffix=None):
				324	super().__init__(indent, prefix, suffix)
				325
				326
				327	class IgnoreBlock(Fragment):
				328	"""Markdown fragment that omits all content."""
				329
				330	def __init__(self):
				331	super().__init__(None, None, None)
				332
				333
				334	class TextBlock(Text):
				335	"""A TextBlock coalesces all spaces and escapes all reserved chars."""
				336
				337	def EscapeText(self, text):
				338	text = _EscapeContentForHtml(text).getvalue()
				339	return _EscapeText(text, FLAGS.escape_chars)
				340
				341	def StripLine(self, text):
				342	# Treat newlines as spaces and then coalesce spaces.
				343	text = text.replace('\n', ' ')
				344	# Replace all Unicode nonbreaking spaces with simple spaces. This is safer
				345	# than deletion since spaces are coalesced below anyway.
				346	text = text.replace(chr(160), ' ')
				347
				348	return re.sub(r' +', ' ', text.strip())
				349
				350
				351	class Div(TextBlock):
				352	"""Placeholder that helps with the two-column layout conversion."""
				353
				354	def __init__(self, cls):
				355	self.cls = cls
				356	super().__init__()
				357
				358
				359	class Table(TextBlock):
				360	"""Placeholder that identifies when we're in a (data) table.
				361
				362	(As opposed to a table being used for layout-purposes, which we don't
				363	want to export.)
				364	"""
				365	cls = None
				366
				367
				368	class TD(Text):
Dirk Pranke	7aa0137	2021-11-05 16:16:09 -0700	[diff] [blame]	369	def __init__(self, rowspan, colspan):
				370	prefix = '<td'
				371	if rowspan and str(rowspan) != '1':
				372	prefix += ' rowspan=%s' % rowspan
				373	if colspan and str(colspan) != '1':
				374	prefix += ' colspan=%s' % colspan
				375	prefix += '>'
				376	super().__init__(indent='', prefix=prefix, suffix='</td>')
Dirk Pranke	7bbb547	2021-11-02 16:33:21 -0700	[diff] [blame]	377
				378
				379	class Content(TextBlock):
				380	"""Placeholder that identifies when we're processing the main content."""
				381	cls = None
				382
				383
				384	class WrappedTextBlock(TextBlock):
				385	"""A WrappedTextBlock wraps the output lines to fit into the column limit."""
				386
				387	def WrapLine(self, line, indent):
				388	return _WrapLine(line, len(indent))
				389
				390
				391	class BlockquoteBlock(WrappedTextBlock):
				392	"""A BlockquoteBlock wraps content and prepends each line with '> '.
				393
				394	The generator must emit BlockquoteBlocks with no indent for paragraphs
				395	inside a blockquote. This will allow propagating the final call to WrapLine
				396	up to the outermost BlockquoteBlock which will wrap the lines and prepend
				397	each of them with the indent.
				398	"""
				399
				400	def __init__(self, indent='> '):
				401	super().__init__(indent, None, None)
				402
				403	def WrapLine(self, line, indent):
				404	if not self._indent and self._parent:
				405	return self._parent.WrapLine(line, indent)
				406	wrapped = _WrapLine(line, len(indent))
				407	lines = wrapped.splitlines(True)
				408	return indent.join([l.lstrip() for l in lines])
				409
				410
				411	class CodeBlock(Text):
				412	"""Base class for different code block fragment implementations."""
				413
				414	def EscapeText(self, text):
				415	return text
				416
				417	def StripLine(self, text):
				418	# Completely ignore newlines in code blocks. Sites always uses <br/>.
				419	return text.replace('\n', '')
				420
				421	def ChangeToHtml(self):
				422	content = self._content.getvalue()
				423	if content:
				424	self._content = _EscapeContentForHtml(content)
				425
				426
				427	class IndentedCodeBlock(CodeBlock):
				428	"""A IndentedCodeBlock indents by four spaces."""
				429
				430	def __init__(self, indent=' '):
				431	super().__init__(indent, None, None)
				432
				433
				434	class FencedCodeBlock(CodeBlock):
				435	"""A FencedCodeBlock is fenced with triple backticks (```).
				436
				437	To render correctly, content writing must not happen
				438	unless the end of the source code block has been encountered.
				439	That is, the entire code block from the source HTML must
				440	be rendered in a single write pass.
				441	"""
				442
				443	def __init__(self, indent=None,
				444	prefix='```none' + ENCODED_NEWLINE,
				445	suffix=ENCODED_NEWLINE + '```'):
				446	super().__init__(indent, prefix, suffix)
				447
				448	def WriteIndent(self, output):
				449	# Adjust inner fragments and self before rendering.
				450	if FLAGS.allow_html_code_blocks:
				451	has_formatted_text = False
				452	for c in self._AllChildren():
				453	if isinstance(c, FormattedText):
				454	c.ChangeToHtml()
				455	has_formatted_text = True
				456	if has_formatted_text:
				457	for c in self._AllChildren():
				458	if isinstance(c, CodeBlock):
				459	c.ChangeToHtml()
				460	self._UpdatePrefixAndSuffix(
				461	'<pre><code>', ENCODED_NEWLINE + '</code></pre>')
				462	super().WriteIndent(output)
				463
				464	def StripLine(self, text):
				465	text = super().StripLine(text)
				466	lines = _RestoreEncodedNewlines(text).splitlines()
				467	return '\n'.join([l for l in lines if l])
				468
				469	def WrapLine(self, line, indent):
				470	lines = line.splitlines(True)
				471	return indent.join(lines)
				472
				473
				474	class FencedCodeBlockLine(Text):
				475	"""A line of code inside FencedCodeBlock."""
				476
				477	def __init__(self, indent=None,
				478	prefix=ENCODED_NEWLINE, suffix=ENCODED_NEWLINE):
				479	super().__init__(indent, prefix, suffix)
				480
				481	def StripLine(self, text):
				482	text = super().StripLine(text)
				483	return _RestoreEncodedNewlines(text)
				484
				485
				486	class UnderlinedHeader(TextBlock):
				487	"""Markdown fragment for an underlined section header."""
				488
				489	def __init__(self, char):
				490	super().__init__()
				491	self._char = char
				492
				493	def _WriteContent(self, output):
				494	length = len(self.StripLine(self._content.getvalue()))
				495	if length > 0:
				496	# '\n' will be stripped, so use an encoded '\n' that we can later replace
				497	# after the line is stripped.
				498	self._Write(output,
				499	None,
				500	self._content.getvalue(),
				501	ENCODED_NEWLINE + self._char * length)
				502
				503	def StripLine(self, text):
				504	text = super().StripLine(text)
				505	return _RestoreEncodedNewlines(text)
				506
				507
				508	class FormattedText(Text):
				509	"""Text wrapped in Markdown formatting."""
				510
				511	def __init__(self, fmt):
				512	super().__init__(None, fmt, fmt)
				513
				514	def _Pad(self, bigger, smaller):
				515	return ' ' * (len(bigger) - len(smaller))
				516
				517	def _WriteContent(self, output):
				518	prefix = self._prefix
				519	content = self._content.getvalue()
				520	suffix = self._suffix
				521	if prefix:
				522	# If there are whitespaces immediately after the prefix,
				523	# they must be pushed out before the prefix.
				524	lstripped = content.lstrip()
				525	if len(content) > len(lstripped):
				526	prefix = self._Pad(content, lstripped) + prefix
				527	content = lstripped
				528	if suffix:
				529	# If there are whitespaces immediately before the suffix,
				530	# they must be pushed out after the suffix.
				531	rstripped = content.rstrip()
				532	if len(content) > len(rstripped):
				533	suffix = suffix + self._Pad(content, rstripped)
				534	content = rstripped
				535	self._Write(output, prefix, content, suffix)
				536
				537	def ChangeToHtml(self):
				538	content = self._content.getvalue()
				539	if content:
				540	content = _EscapeContentForHtml(content)
				541
				542
				543	class BoldFormattedText(FormattedText):
				544	"""Text formatted as bold."""
				545
				546	def __init__(self):
				547	super().__init__(FLAGS.bold_format)
				548
				549	def NeedsToMergeWith(self, text):
				550	return isinstance(text, BoldFormattedText)
				551
				552	def ChangeToHtml(self):
				553	super().ChangeToHtml()
				554	self._UpdatePrefixAndSuffix('<b>', '</b>')
				555
				556
				557	class ItalicFormattedText(FormattedText):
				558	"""Text formatted as italic."""
				559
				560	def __init__(self):
				561	super().__init__(FLAGS.italic_format)
				562
				563	def NeedsToMergeWith(self, text):
				564	return isinstance(text, ItalicFormattedText)
				565
				566	def ChangeToHtml(self):
				567	super().ChangeToHtml()
				568	self._UpdatePrefixAndSuffix('<i>', '</i>')
				569
				570
				571	class StrikeThroughFormattedText(FormattedText):
				572	"""Text formatted as strike through."""
				573
				574	def __init__(self):
				575	super().__init__(FLAGS.strike_format)
				576
				577	def NeedsToMergeWith(self, text):
				578	return isinstance(text, StrikeThroughFormattedText)
				579
				580	def ChangeToHtml(self):
				581	super().ChangeToHtml()
				582	self._UpdatePrefixAndSuffix('<s>', '</s>')
				583
				584
				585	class HighlightFormattedText(FormattedText):
				586	"""Highlighted text."""
				587
				588	def __init__(self):
				589	super().__init__(FLAGS.highlight_format)
				590
				591	def NeedsToMergeWith(self, text):
				592	return isinstance(text, HighlightFormattedText)
				593
				594	def ChangeToHtml(self):
				595	super().ChangeToHtml()
				596	self._UpdatePrefixAndSuffix('<u>', '</u>')
				597
				598
				599	class ListItem(Text):
				600	"""Item in a list."""
				601
				602	def __init__(self, bullet):
				603	super().__init__()
				604	self._bullet = bullet
				605
				606	def WriteIndent(self, output):
				607	if self._bullet:
				608	# TODO(dpranke): The original code relied on strings and bytes
				609	# being interchangeable in Python2, so you could seek backwards
				610	# from the current location with a relative offset. You can't
				611	# do that in Python3, apparently.
				612	#
				613	# To get around this for the moment, instead of seeking backwards
				614	# 4 characters, we embed 4 '\b' backspaces, and then have the client
				615	# do a global search and replace of ' \b\b\b\b' with '' instead.
				616	#
				617	# This is awkward, so we should rework this so that this isn't needed.
				618	#
				619	# output.seek(-len(self._bullet), os.SEEK_CUR)
				620	output.write('\b' * len(self._bullet))
				621	output.write(self._bullet)
				622	super().WriteIndent(output)
				623
				624	def _ClearContent(self):
				625	self._bullet = None
				626	super()._ClearContent()
				627
				628	def WrapLine(self, line, indent):
				629	return _WrapLine(line, len(indent))
				630
				631
				632	class Link(Text):
				633	"""Markdown link."""
				634
				635	def __init__(self, href):
				636	super().__init__()
				637	self._href = href
				638	self._url_opener_prefix = ''
				639	self._url_opener_suffix = ''
				640
				641	def MakeAnImage(self, width, height):
				642	self._url_opener_prefix = '!'
				643	if width and height:
				644	self._url_opener_suffix = (
				645	'{{width="{}" height="{}"}}'.format(width, height))
				646
				647	def _IsShortLink(self, text):
				648	if FLAGS.shortlinks_regex and (
				649	re.compile(FLAGS.shortlinks_regex).match(self._href)):
				650	parsed_href = urllib.parse.urlsplit(self._href)
				651	if parsed_href.netloc + parsed_href.path == text:
				652	return True
				653	return None
				654
				655	def _WriteLink(self, output, text):
				656	write_short_link = (not (self._url_opener_prefix or self._url_opener_suffix)
				657	and self._IsShortLink(text))
				658	if write_short_link:
				659	self._Write(output, None, text, None)
				660	else:
				661	self._Write(output,
				662	self._url_opener_prefix + '[',
				663	text,
				664	'](' + self._href + ')' + self._url_opener_suffix)
				665
				666	def _WriteContent(self, output):
				667	text = self._content.getvalue()
				668	if text:
				669	if text.startswith('http://') or text.startswith('https://'):
				670	self._Write(output, '<', text, '>')
				671	else:
				672	self._WriteLink(output, text)
				673
				674
				675	class Image(Text):
				676	"""Image."""
				677
				678	def __init__(self, src, alt, width, height):
				679	super().__init__()
				680	self._src = src
				681	self._alt = alt or 'image'
				682	self._width = width
				683	self._height = height
				684
				685	def _WriteContent(self, output):
				686	tag = '<img alt="%s" src="%s"' % (self._alt, self._src)
				687	if self._height:
				688	tag += ' height=%s' % self._height
				689	if self._width:
				690	tag += ' width=%s' % self._width
				691	tag += '>'
				692	self._Write(output, '', tag, '')
				693
				694
				695	class Code(Text):
				696	"""Inline code."""
				697
				698	def __init__(self):
				699	super().__init__(None, '`', '`')
				700
				701	def EscapeText(self, text):
				702	return text
				703
				704	def _WriteContent(self, output):
				705	prefix = self._prefix
				706	content = self._content.getvalue()
				707	suffix = self._suffix
				708	if '`' in content:
				709	# If a backtick (`) is present inside inline code, the fragment
				710	# must use double backticks.
				711	prefix = suffix = '``'
				712	# Since having content starting or ending with a backtick would emit
				713	# triple backticks which designates a fenced code fragment, pad content
				714	# to avoid this.
				715	if content.startswith('`'):
				716	content = ' ' + content
				717	if content.endswith('`'):
				718	content += ' '
				719	self._Write(output, prefix, content, suffix)
				720
				721	def NeedsToMergeWith(self, text):
				722	return isinstance(text, Code)
				723
				724
				725	class EmbeddedContent(Text):
				726	"""Embedded content: Docs, Drawings, Presentations, etc."""
				727
				728	def __init__(self, href, width, height):
				729	super().__init__()
				730	self._href = href
				731	self._width = width
				732	self._height = height
				733
				734	def _WriteContent(self, output):
				735	parsed_href = urllib.parse.urlsplit(self._href)
				736	if parsed_href.scheme == 'http':
				737	parsed_href = urllib.parse.SplitResult(
				738	'https', parsed_href.netloc, parsed_href.path, parsed_href.query,
				739	parsed_href.fragment)
				740	# Note: 'allow="fullscreen"' is requested for all content for simplicity.
				741	# g3doc server has dedicated logic to deal with these requests.
				742	element = '<iframe src="{}"{} allow="fullscreen" />'.format(
				743	urllib.parse.urlunsplit(parsed_href),
				744	(' width="{}" height="{}"'.format(self._width, self._height) if (
				745	self._width and self._height) else ''))
				746	self._Write(output, None, element, None)
				747
				748
				749	class ListInfo:
				750
				751	def __init__(self, tag):
				752	self.tag = tag # The tag used to start the list
				753	self.item_count = 0 # The number of items in the list
				754
				755
				756	class FragmentTree:
				757	"""Class for managing a tree of fragments.
				758
				759	There is a "scope" formed by nested fragments, e.g.
				760	italic fragment inside bold fragment inside paragraph.
				761	The scope is stored in the stack. For convenience,
				762	the stack always have one element.
				763
				764	Fragments popped out from the scope may be re-added
				765	back into the tree as children of the last fragment.
				766	This allows "chaining" of structured content for future
				767	processing. For example, if there were several bold
				768	fragments inside a paragraph interleaved with fragments
				769	of regular text, all these fragments will end up as
				770	children of the paragraph fragment.
				771
				772	"""
				773
				774	def __init__(self, top_fragment):
				775	self._stack = [top_fragment]
				776
				777	def ActiveFragmentScopeDepth(self):
				778	return len(self._stack) - 1
				779
				780	def StartFragment(self, fragment):
				781	fragment.SetParent(self._stack[-1])
				782	self._stack.append(fragment)
				783	return fragment
				784
				785	def EndFragment(self):
				786	return self._stack.pop()
				787
				788	def AppendFragment(self, fragment):
				789	return self._stack[-1].AddChild(fragment)
				790
				791	def _ApplyRecursivelyToNode(self, node, scope_operation, operation, # pylint: disable=missing-docstring
				792	debug_indent):
				793	if not debug_indent:
				794	for child in node.GetChildren():
				795	self._ApplyRecursivelyToNode(child, scope_operation, operation, None)
				796	else:
				797	debug_indent += ' c '
				798	for child in node.GetChildren():
				799	print(debug_indent + repr(child))
				800	self._ApplyRecursivelyToNode(child, scope_operation, operation,
				801	debug_indent)
				802	operation(node)
				803
				804	def _ApplyRecursivelyToScope(self, nodes, scope_operation, operation, # pylint: disable=missing-docstring
				805	debug_indent):
				806	node = nodes.pop()
				807	scope_operation(node)
				808	if debug_indent:
				809	print(debug_indent + repr(node))
				810	if nodes:
				811	self._ApplyRecursivelyToScope(nodes, scope_operation, operation,
				812	(debug_indent + ' s ' if debug_indent
				813	else None))
				814	self._ApplyRecursivelyToNode(node, scope_operation, operation,
				815	debug_indent)
				816
				817	def ApplyToAllFragments(self, scope_operation, operation):
				818	"""Recursively applies operations to all fragments in the tree.
				819
				820	The omnipresent topmost fragment is excluded. The 'scope_operation'
				821	is applied to every element in the fragment stack in pre-order.
				822	The 'operation' is applied to all fragments in the tree in post-order.
				823
				824	Args:
				825	scope_operation: The operation to apply to fragments in the scope stack.
				826	operation: The operation to apply to all fragments in the tree.
				827	"""
				828	self._ApplyRecursivelyToScope(list(reversed(self._stack[1:])),
				829	scope_operation, operation,
				830	' ' if FLAGS.debug_print_tree else None)
				831
				832	def FindFirstFragmentFromEnd(self, predicate, steps_from_last=0):
				833	sub_stack = self._stack[:-steps_from_last if steps_from_last else None]
				834	return next((node for node in sub_stack if predicate(node)), None)
				835
				836	def PeekFragmentFromStart(self, steps_from_first=0):
				837	return self._stack[steps_from_first]
				838
				839	def PeekFragmentFromEnd(self, steps_from_last=0):
				840	return self._stack[-(steps_from_last + 1)]
				841
				842	def PeekLastAppendedFragment(self):
				843	return (self._stack[-1].GetChildren()[-1]
				844	if self._stack[-1].GetChildren() else None)
				845
				846
				847	class MarkdownGenerator:
				848	"""Generates Markdown based on the series of HTML tags seen.
				849
				850	Each time an opening HTML tag is seen, the appropriate markdown fragment is
				851	created and pushed onto a stack. Any text encountered is appended to the
				852	fragment at the top of the stack. When a closing HTML tag is seen, the stack
				853	is popped and the fragment removed is appended to the new top of the stack.
				854
				855	Markdown is buffered in the fragment stack until an entire line has been
				856	formed, at which point _WriteFragmentsAsLine() is called to write it out. The
				857	content buffered in the stack is cleared, but otherwise the stack remains
				858	unmodified.
				859	"""
				860
				861	def __init__(self, out, url_translator):
				862	self._out = out
				863	self._url_translator = url_translator
				864	self._fragment_tree = FragmentTree(Text())
				865	self._list_info_stack = []
				866	self._pending_newlines = 0
				867	# Initialize the regexps to match nothing (rather than be None).
				868	self._code_class_regex = re.compile(FLAGS.code_class_regex or 'a^')
				869	self._toc_class_regex = re.compile(FLAGS.toc_class_regex or 'a^')
				870	self._ignore_class_regex = re.compile(FLAGS.ignore_class_regex or 'a^')
				871	self._ignore_style_regex = re.compile(FLAGS.ignore_style_regex or 'a^')
				872
				873	def _Push(self, fragment):
				874	"""Sets the parent fragment and pushes it onto the fragment stack.
				875
				876	In the case where there is an IgnoreBlock on the stack, a new IgnoreBlock
				877	is pushed instead.
				878
				879	Args:
				880	fragment: The Fragment object to push on the stack.
				881	"""
				882	if isinstance(self._fragment_tree.PeekFragmentFromEnd(), IgnoreBlock):
				883	# If the top of the stack is IgnoreBlock, push an IgnoreBlock instead.
				884	fragment = IgnoreBlock()
				885	else:
				886	# Check if we need to merge adjacent formatting, e.g.
				887	# instead of boldbold we need to write boldbold,
				888	# as the former is not correct Markdown syntax.
				889	last_appended = self._fragment_tree.PeekLastAppendedFragment()
				890	if last_appended and last_appended.NeedsToMergeWith(fragment):
				891	last_appended.UnsetSuffix()
				892	fragment.UnsetPrefix()
				893
				894	self._fragment_tree.StartFragment(fragment)
				895
				896	def _Pop(self):
				897	"""Pops the fragment stack it to the new top of stack.
				898
				899	If the fragment stack would be empty after popping, then the fragment is
				900	written to the output first.
				901	"""
				902	if self._fragment_tree.ActiveFragmentScopeDepth() > 1:
				903	fragment = self._fragment_tree.EndFragment()
				904	self._fragment_tree.AppendFragment(fragment)
				905	else:
				906	self._WriteFragmentsAsLine(newlines=0)
				907	self._fragment_tree.EndFragment()
				908
				909	def _IsWithinFragmentType(self, fragment_type, steps_from_last=0):
				910	return self._fragment_tree.FindFirstFragmentFromEnd(
				911	lambda fragment: isinstance(fragment, fragment_type),
				912	steps_from_last) is not None
				913
				914	def _LastFragmentIs(self, fragment_type, cls):
				915	fragment = self._fragment_tree.PeekFragmentFromEnd()
				916	return (isinstance(fragment, fragment_type) and fragment.cls == cls)
				917
				918	def Break(self):
				919	if not self._IsWithinFragmentType(FencedCodeBlock):
				920	self._WriteFragmentsAsLine(newlines=1)
				921	else:
				922	fragment = FencedCodeBlockLine(prefix='', suffix='')
				923	self._Push(fragment)
				924	fragment.Append(ENCODED_NEWLINE)
				925	self._Pop()
				926
				927	def HorizontalRule(self):
				928	# Horizontal rule must be preceded and followed by a blank line
				929	self._AddVerticallyPaddedParagraph('---')
				930
				931	def StartDocument(self):
				932	self._Push(WrappedTextBlock())
				933
				934	def EndDocument(self):
				935	self._Pop()
				936
				937	def StartParagraph(self):
				938	self._WriteFragmentsAsLine(newlines=2)
				939
				940	def EndParagraph(self):
				941	self._WriteFragmentsAsLine(newlines=2)
				942
				943	def StartDiv(self, cls, style, ident):
				944	"""Process opening of a div element.
				945
				946	Args:
				947	cls: The class attribute of the element.
				948	style: The style attribute of the element.
				949	ident: The id attribute of the element
				950	"""
				951	if not self._IsWithinFragmentType(FencedCodeBlock):
				952	if self._IsWithinFragmentType(CodeBlock):
				953	self._WriteFragmentsAsLine(newlines=1)
				954	else:
				955	self._WriteFragmentsAsLine(newlines=2)
				956
				957	if ((cls and self._ignore_class_regex.match(cls)) or
				958	style and self._ignore_style_regex.match(style)):
				959	self._Push(IgnoreBlock())
				960	elif self._IsWithinFragmentType(FencedCodeBlock):
				961	self._Push(FencedCodeBlockLine())
				962	elif self._IsWithinFragmentType(CodeBlock):
				963	self._Push(CodeBlock())
				964	elif self._IsWithinFragmentType(BlockquoteBlock):
				965	self._Push(BlockquoteBlock(indent=None))
				966	elif cls and self._toc_class_regex.match(cls):
				967	self._AddTableOfContents()
				968	self._Push(IgnoreBlock()) # Ignore the items inside the Sites TOC
				969	elif cls and self._code_class_regex.match(cls):
				970	if FLAGS.indented_code_blocks:
				971	self._Push(IndentedCodeBlock())
				972	else:
				973	self._Push(FencedCodeBlock())
				974	else:
				975	self._Push(WrappedTextBlock())
				976
				977	def EndDiv(self):
				978	if not self._IsWithinFragmentType(FencedCodeBlock, steps_from_last=1):
				979	if self._IsWithinFragmentType(CodeBlock, steps_from_last=1):
				980	self._WriteFragmentsAsLine(newlines=1)
				981	else:
				982	self._WriteFragmentsAsLine(newlines=2)
				983	self._Pop()
				984
				985	def StartHeader(self, level):
				986	self._WriteFragmentsAsLine(newlines=2)
				987	if level == 1 and FLAGS.underline_headers:
				988	self._Push(UnderlinedHeader('='))
				989	elif level == 2 and FLAGS.underline_headers:
				990	self._Push(UnderlinedHeader('-'))
				991	else:
				992	self._Push(TextBlock(prefix=('#' * level) + ' '))
				993
				994	def EndHeader(self):
				995	self._WriteFragmentsAsLine(newlines=2)
				996	self._Pop()
				997
				998	def StartList(self, tag):
				999	if not self._list_info_stack:
				1000	self._WriteFragmentsAsLine(newlines=2)
				1001	else:
				1002	self._WriteFragmentsAsLine(newlines=1)
				1003	self._list_info_stack.append(ListInfo(tag))
				1004	if tag == 'ol':
				1005	self._Push(Text(' ' * FLAGS.ordered_list_indent))
				1006	else:
				1007	self._Push(Text(' ' * FLAGS.unordered_list_indent))
				1008
				1009	def EndList(self):
				1010	self._list_info_stack.pop()
				1011	if not self._list_info_stack:
				1012	self._WriteFragmentsAsLine(newlines=2)
				1013	else:
				1014	self._WriteFragmentsAsLine(newlines=1)
				1015	self._Pop()
				1016
				1017	def StartListItem(self):
				1018	self._WriteFragmentsAsLine(newlines=1)
				1019	# Google Sites sometimes spits out pages with <li> tags not enclosed within
				1020	# an <ol> or <ul> tag.
				1021	tag = ''
				1022	if self._list_info_stack:
				1023	self._list_info_stack[-1].item_count += 1
				1024	tag = self._list_info_stack[-1].tag
				1025	if tag == 'ol':
				1026	item_count = self._list_info_stack[-1].item_count
				1027	# string.ljust makes room for as many digits as you need.
				1028	prefix = ('%d.' % item_count).ljust(FLAGS.ordered_list_indent)
				1029	self._Push(ListItem(prefix))
				1030	else:
				1031	prefix = '*'.ljust(FLAGS.unordered_list_indent)
				1032	self._Push(ListItem(prefix))
				1033
				1034	def EndListItem(self):
				1035	self._WriteFragmentsAsLine(newlines=1)
				1036	self._Pop()
				1037
				1038	def StartFormat(self, tag):
				1039	# Allowed formatting depends on the surrounding fragment type.
				1040	if self._IsWithinFragmentType(TD) and tag == 'b':
				1041	# TODO(dpranke): This is a hack because I don't yet really understand
				1042	# how the ChangeToHtml() logic works in CodeBlocks, but it seems like
				1043	# we should be able to do something similar to what they do.
				1044	# Also, this should really be rewriting these to <th>s instead.
				1045	self._Push(HTML('<b>', '</b>'))
				1046	return
				1047
				1048	if not self._IsWithinFragmentType(IndentedCodeBlock):
				1049	formats_map = {
				1050	'i': ItalicFormattedText,
				1051	'em': ItalicFormattedText,
				1052	'b': BoldFormattedText,
				1053	'strong': BoldFormattedText,
				1054	'strike': StrikeThroughFormattedText,
				1055	's': StrikeThroughFormattedText,
				1056	'del': StrikeThroughFormattedText,
				1057	'u': HighlightFormattedText,
				1058	'code': Code,
				1059	None: Text,
				1060	}
				1061	if self._IsWithinFragmentType(FencedCodeBlock):
				1062	if FLAGS.allow_html_code_blocks:
				1063	# HTML code block can render formats but must not use Code fragments.
				1064	formats_map['code'] = formats_map[None] = CodeBlock
				1065	else:
				1066	formats_map = {None: CodeBlock}
				1067	else:
				1068	# Inside an indented code block no formatting is allowed.
				1069	formats_map = {None: CodeBlock}
				1070	self._Push(formats_map[tag]() if tag in formats_map
				1071	else formats_map[None]())
				1072
				1073	def EndFormat(self):
				1074	self._Pop()
				1075
				1076	def StartAnchor(self, href):
				1077	if href is not None:
				1078	href = self._url_translator.Translate(href)
				1079	if self._IsWithinFragmentType(TD):
				1080	self._Push(Href(href))
				1081	else:
				1082	self._Push(Link(href))
				1083	else:
				1084	self._Push(Text())
				1085
				1086	def EndAnchor(self):
				1087	self._Pop()
				1088
				1089	def StartBlockquote(self):
				1090	if not self._IsWithinFragmentType(CodeBlock):
				1091	self._WriteFragmentsAsLine(newlines=1)
				1092	self._Push(BlockquoteBlock())
				1093	else:
				1094	self._Push(Text())
				1095
				1096	def EndBlockquote(self):
				1097	if not self._IsWithinFragmentType(CodeBlock):
				1098	self._WriteFragmentsAsLine(newlines=2)
				1099	self._Pop()
				1100
				1101	def Image(self, src, alt, width, height):
				1102	src = self._url_translator.Translate(src)
				1103	self._fragment_tree.AppendFragment(Image(src, alt, width, height))
				1104
				1105	def Iframe(self, src, width, height):
				1106	"""Process an <iframe> element.
				1107
				1108	Sites use <iframe> for embedded content: Docs, Drawings, etc.
				1109	g3doc implements this by supporting <iframe> HTML tag directly.
				1110
				1111	Args:
				1112	src: Source URL.
				1113	width: Element width.
				1114	height: Element height.
				1115	"""
				1116	if False:
				1117	# TODO(dpranke): Figure out if we should support embedded IFRAME tags.
				1118	# For now, we skip over them.
				1119	self._WriteFragmentsAsLine(newlines=2)
				1120	self._Push(EmbeddedContent(src, width, height))
				1121	self._Pop()
				1122
				1123	def StartTable(self, cls):
				1124	if (cls and 'sites-layout-hbox' in cls and
				1125	'sites-layout-name-one-column' not in cls):
				1126	self._AddHTMLBlock('<div class="two-column-container">')
				1127	self._Push(Div(cls='two-column-container'))
				1128	elif (cls and 'sites-layout-name-one-column' in cls):
				1129	pass
				1130	else:
				1131	self._AddHTMLBlock('<table>')
				1132	self._Push(Table())
				1133
				1134	def EndTable(self):
				1135	if self._LastFragmentIs(Div, cls='two-column-container'):
				1136	self._AddHTMLBlock('</div>')
				1137	self._Pop()
				1138	elif self._IsWithinFragmentType(Table):
				1139	self._AddHTMLBlock('</table>')
				1140	self._Pop()
				1141
				1142	def StartTR(self):
				1143	if self._IsWithinFragmentType(Table):
				1144	self._AddHTMLBlock('<tr>')
				1145
				1146	def EndTR(self):
				1147	if self._IsWithinFragmentType(Table):
				1148	self._AddHTMLBlock('</tr>')
				1149
Dirk Pranke	7aa0137	2021-11-05 16:16:09 -0700	[diff] [blame]	1150	def StartTD(self, cls, rowspan, colspan):
Dirk Pranke	7bbb547	2021-11-02 16:33:21 -0700	[diff] [blame]	1151	if self._LastFragmentIs(Div, cls='two-column-container'):
				1152	if cls and ('sites-tile-name-content-1' in cls or
				1153	'sites-tile-name-content-2' in cls):
				1154	self._AddHTMLBlock('<div class="column">')
				1155	self._Push(Div(cls='column'))
				1156	else:
				1157	self._Push(Text())
				1158	elif self._IsWithinFragmentType(Table):
Dirk Pranke	7aa0137	2021-11-05 16:16:09 -0700	[diff] [blame]	1159	self._Push(TD(rowspan, colspan))
Dirk Pranke	7bbb547	2021-11-02 16:33:21 -0700	[diff] [blame]	1160
				1161	def EndTD(self):
				1162	if self._LastFragmentIs(Div, cls='column'):
				1163	self._AddHTMLBlock('</div>')
				1164	self._Pop()
				1165	elif self._IsWithinFragmentType(Table):
				1166	self._Pop()
				1167	self._WriteFragmentsAsLine(newlines=1)
				1168
				1169	def Text(self, text):
				1170	if not isinstance(self._fragment_tree.PeekFragmentFromEnd(), IgnoreBlock):
				1171	fragment = (CodeBlock() if self._IsWithinFragmentType(CodeBlock)
				1172	else Text())
				1173	self._fragment_tree.AppendFragment(fragment)
				1174	fragment.Append(text)
				1175
				1176	def _AddTableOfContents(self):
				1177	# TOC must be preceded and followed by a blank line
				1178	self._AddVerticallyPaddedParagraph('[TOC]')
				1179
				1180	def _AddVerticallyPaddedParagraph(self, text):
				1181	self._WriteFragmentsAsLine(newlines=2)
				1182	fragment = CodeBlock() # Use CodeBlock to prevent escaping
				1183	self._fragment_tree.AppendFragment(fragment)
				1184	fragment.Append(text)
				1185	self._WriteFragmentsAsLine(newlines=2)
				1186
				1187	def _AddHTMLBlock(self, html):
				1188	"""Writes out a block-level string of html."""
				1189	fragment = HTML()
				1190	fragment.Append(html)
				1191	self._fragment_tree.AppendFragment(fragment)
				1192	self._WriteFragmentsAsLine(newlines=1)
				1193
				1194	def _WriteFragmentsAsLine(self, newlines):
				1195	"""Writes out any content currently buffered in the fragment stack.
				1196
				1197	Args:
				1198	newlines: The minimum number of newlines required in the output after this
				1199	line. These newlines won't be written out until the next line with
				1200	content is encountered.
				1201	"""
				1202
				1203	# Generate indent and the content, then clear content in fragments.
				1204	indent = io.StringIO()
				1205	self._fragment_tree.ApplyToAllFragments(
				1206	lambda fragment: fragment.WriteIndent(indent),
				1207	lambda fragment: fragment.WriteContentIntoParentAndClear())
				1208	last_fragment = self._fragment_tree.PeekFragmentFromEnd()
				1209	content = self._fragment_tree.PeekFragmentFromStart().ConsumeContent()
				1210	content = last_fragment.StripLine(content.getvalue())
				1211	indent = indent.getvalue()
				1212	content = last_fragment.WrapLine(content, indent)
				1213
				1214	# Write the content, if any.
				1215	if content:
				1216	self._out.write('\n' * self._pending_newlines)
				1217	self._out.write(indent)
				1218	self._out.write(content)
				1219	self._pending_newlines = newlines
				1220	elif self._pending_newlines > 0 and self._pending_newlines < newlines:
				1221	self._pending_newlines = newlines
				1222
				1223	if FLAGS.debug_print_tree:
				1224	# Separate trees printed during each writing session
				1225	print('-' * 20)
				1226
				1227
				1228	class XhtmlHandler(xml.sax.ContentHandler):
				1229	"""Translates SAX events into MarkdownGenerator calls."""
				1230
				1231	# regex that matches an HTML header tag and extracts the level.
				1232	_HEADER_TAG_RE = re.compile(r'h([1-6])$')
				1233
				1234	def __init__(self, out, url_translator):
				1235	xml.sax.ContentHandler.__init__(self)
				1236	self._generator = MarkdownGenerator(out, url_translator)
				1237
				1238	def startDocument(self):
				1239	self._generator.StartDocument()
				1240
				1241	def endDocument(self):
				1242	self._generator.EndDocument()
				1243
				1244	def startElementNS(self, name, qname, attrs):
				1245	tag = name[1]
				1246	if tag == 'a':
				1247	href = attrs.get((None, 'href'))
				1248	self._generator.StartAnchor(href)
				1249	elif tag == 'br':
				1250	self._generator.Break()
				1251	elif tag == 'hr':
				1252	self._generator.HorizontalRule()
				1253	elif tag == 'li':
				1254	self._generator.StartListItem()
				1255	elif tag == 'div':
				1256	cls = attrs.get((None, 'class'))
				1257	style = attrs.get((None, 'style'))
				1258	ident = attrs.get((None, 'id'))
				1259	self._generator.StartDiv(cls, style, ident)
				1260	elif tag == 'p':
				1261	self._generator.StartParagraph()
				1262	elif tag in ('b', 'code', 'em', 'i', 'strong', 's', 'strike', 'del', 'u'):
				1263	self._generator.StartFormat(tag)
				1264	elif tag in ('ul', 'ol'):
				1265	self._generator.StartList(tag)
				1266	elif tag == 'img':
				1267	src = attrs.get((None, 'src'))
				1268	alt = attrs.get((None, 'alt'))
				1269	width = attrs.get((None, 'width'))
				1270	height = attrs.get((None, 'height'))
				1271	self._generator.Image(src, alt, width, height)
				1272	elif tag == 'blockquote':
				1273	self._generator.StartBlockquote()
				1274	elif tag == 'iframe':
				1275	src = attrs.get((None, 'src'))
				1276	width = attrs.get((None, 'width'))
				1277	height = attrs.get((None, 'height'))
				1278	self._generator.Iframe(src, width, height)
				1279	elif tag == 'table':
				1280	cls = attrs.get((None, 'class'))
				1281	self._generator.StartTable(cls)
				1282	elif tag == 'tr':
				1283	self._generator.StartTR()
				1284	elif tag == 'td':
Dirk Pranke	7aa0137	2021-11-05 16:16:09 -0700	[diff] [blame]	1285	self._generator.StartTD(attrs.get((None, 'class')),
				1286	attrs.get((None, 'rowspan')),
				1287	attrs.get((None, 'colspan')))
Dirk Pranke	7bbb547	2021-11-02 16:33:21 -0700	[diff] [blame]	1288	else:
				1289	match = self._HEADER_TAG_RE.match(tag)
				1290	if match:
				1291	level = int(match.group(1))
				1292	self._generator.StartHeader(level)
				1293
				1294	def endElementNS(self, name, qname):
				1295	tag = name[1]
				1296	if tag == 'a':
				1297	self._generator.EndAnchor()
				1298	elif tag == 'li':
				1299	self._generator.EndListItem()
				1300	elif tag == 'div':
				1301	self._generator.EndDiv()
				1302	elif tag == 'p':
				1303	self._generator.EndParagraph()
				1304	elif tag in ('b', 'code', 'em', 'i', 'strong', 's', 'strike', 'del', 'u'):
				1305	self._generator.EndFormat()
				1306	elif tag in ('ul', 'ol'):
				1307	self._generator.EndList()
				1308	elif tag == 'blockquote':
				1309	self._generator.EndBlockquote()
				1310	elif tag == 'td':
				1311	self._generator.EndTD()
				1312	elif tag == 'tr':
				1313	self._generator.EndTR()
				1314	elif tag == 'table':
				1315	self._generator.EndTable()
				1316	else:
				1317	match = self._HEADER_TAG_RE.match(tag)
				1318	if match:
				1319	self._generator.EndHeader()
				1320
				1321	def characters(self, content):
				1322	self._generator.Text(content)
				1323
				1324
				1325	class DefaultUrlTranslator:
				1326	"""No-op UrlTranslator."""
				1327
				1328	def Translate(self, href):
				1329	return href
				1330
				1331
				1332	def Convert(input_stream, output_stream, url_translator=DefaultUrlTranslator()):
				1333	"""Converts an input stream of xhtml into an output stream of markdown.
				1334
				1335	Args:
				1336	input_stream: filehandle for the XHTML input.
				1337	output_stream: filehandle for the Markdown output.
				1338	url_translator: Callback for translating URLs embedded in the page.
				1339	"""
				1340	parser = xml.sax.make_parser()
				1341	parser.setContentHandler(XhtmlHandler(output_stream, url_translator))
				1342	parser.setFeature(xml.sax.handler.feature_namespaces, 1)
				1343	parser.parse(input_stream)