Blame - scripts/html2markdown.py - chromium.googlesource.com/website

blob: 0ba8381c0486dd30038f638d6845488db28a79dd [file] [log] [blame]

Dirk Pranke	7bbb547	2021-11-02 16:33:21 -0700	[diff] [blame^]	1	# Copyright 2021 Google LLC
				2	#
				3	# Licensed under the Apache License, Version 2.0 (the "License");
				4	# you may not use this file except in compliance with the License.
				5	# You may obtain a copy of the License at
				6	#
				7	# https://www.apache.org/licenses/LICENSE-2.0
				8	#
				9	# Unless required by applicable law or agreed to in writing, software
				10	# distributed under the License is distributed on an "AS IS" BASIS,
				11	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				12	# See the License for the specific language governing permissions and
				13	# limitations under the License.
				14
				15	"""HTML to Markdown renderer."""
				16
import io
import os
import re
import textwrap
import urllib
import urllib.parse
import xml.sax
				23
				24
				25	class _Flags:
				26	# Whether to render h1s and h2s with underlined - and =.
				27	underline_headers = False
				28
				29	# The set of characters to escape with \'\\\' in the
				30	# Markdown. This is not the set of all special Markdown
				31	# characters, but rather those characters that tend to
				32	# get misinterpreted as Markdown syntax the most. Blindly
				33	# escaping all special Markdown characters results in ugly
				34	# Markdown.
				35	escape_chars = r'\`*[]'
				36
				37	# Format for italic tags.
				38	italic_format = '*'
				39
				40	# Format for bold tags.
				41	bold_format = '**'
				42
				43	# Format for strikethrough tags.
				44	strike_format = '~~'
				45
				46	# Format for underline tags.
				47	highlight_format = '=='
				48
				49	# Number of spaces to indent an unordered list.
				50	# This total includes the bullet.
				51	# For example, a value of 4 yields '* '
				52	unordered_list_indent = 4
				53
				54	# Number of spaces to indent an ordered list.
				55	# This total includes the number.
				56	# For example, a value of 4 yields '1. '
				57	ordered_list_indent = 4
				58
				59	# The DIV blocks that should be formatted as code.
				60	code_class_regex = r'^sites-codeblock sites-codesnippet-block$'
				61
				62	# The class of DIV blocks used for table of contents.
				63	toc_class_regex = r'^sites-embed-content sites-embed-type-toc$'
				64
				65	# The class of DIV blocks that should be ignored.
				66	ignore_class_regex = r''
				67
				68	# The style of DIV blocks that should be ignored.
				69	ignore_style_regex = r'^display:none;$'
				70
				71	# Format text blocks to the given line width. Set to zero
				72	# to disable line wrapping.
				73	line_width = 80
				74
				75	# Whether to use indented code blocks, if False use fenced.
				76	indented_code_blocks = False
				77
				78	# Whether to use HTML code blocks instead of fenced code
				79	# blocks if source code block includes formatted text.
				80	allow_html_code_blocks = True
				81
				82	# Links that are automatically recognized by the renderer.
				83	shortlinks_regex = r'^http://(ag\|b\|cl\|g\|go\|who)/'
				84
				85	# Print the fragment tree for debugging.
				86	debug_print_tree = False
				87
				88
# Module-wide settings instance read by the fragments and the generator.
FLAGS = _Flags()
				90
				91
				92	def _EscapeText(text, reserved_chars):
				93	"""Escapes any reserved characters with a backslash.
				94
				95	Args:
				96	text: The string to escape.
				97	reserved_chars: A string of reserved characters that need to be escaped.
				98
				99	Returns:
				100	The escaped text.
				101	"""
				102	markdown = io.StringIO()
				103	for c in text:
				104	if c in reserved_chars:
				105	markdown.write('\\')
				106	markdown.write(c)
				107	return markdown.getvalue()
				108
				109
				110	def _EscapeContentForHtml(text):
				111	result = io.StringIO()
				112	escapes = {'<': '<', '>': '>'}
				113	for c in text:
				114	result.write(c if c not in escapes else escapes[c])
				115	return result
				116
				117
# Numeric character reference for '\n'. Used as a placeholder for newlines
# that must survive whitespace stripping; _RestoreEncodedNewlines() turns it
# back into a real newline afterwards.
ENCODED_NEWLINE = '&#%d;' % ord('\n')
				119
				120
				121	def _RestoreEncodedNewlines(text):
				122	return text.replace(ENCODED_NEWLINE, '\n')
				123
				124
				125	def _WrapLine(line, indent):
				126	"""Wraps the line to fit into the column limit.
				127
				128	Args:
				129	line: The string to wrap.
				130	indent: An integer with the number of columns of indentation.
				131
				132	Returns:
				133	The wrapped text.
				134	"""
				135	if FLAGS.line_width > 0:
				136	return ('\n' + ' ' * indent).join(textwrap.wrap(
				137	line,
				138	width=FLAGS.line_width - indent,
				139	break_long_words=False,
				140	break_on_hyphens=False))
				141	return line
				142
				143
				144	class Fragment:
				145	"""Base class for all output fragments.
				146
				147	To generate a line of output, the methods will be called in the following
				148	order:
				149
				150	WriteIndent()
				151	WriteContentIntoParentAndClear()
				152	ConsumeContent() -- for the topmost fragment only
				153	StripLine()
				154	WrapLine()
				155	"""
				156
				157	def __init__(self, indent, prefix, suffix):
				158	self._content = io.StringIO()
				159	self._indent = indent
				160	self._prefix = prefix
				161	self._suffix = suffix
				162	self._parent = None
				163	self._children = []
				164
				165	def __repr__(self):
				166	debug_print = lambda text: text.encode('utf-8') if text else ''
				167	return ('{' +
				168	self.__class__.__name__ +
				169	': indent=' + debug_print(self._indent) +
				170	'; prefix=' + debug_print(self._prefix) +
				171	'; content=' + debug_print(self._content.getvalue()) +
				172	'; suffix=' + debug_print(self._suffix) +
				173	'}')
				174
				175	def SetParent(self, parent):
				176	self._parent = parent
				177
				178	def AddChild(self, node):
				179	self._children.append(node)
				180	node.SetParent(self)
				181	return node
				182
				183	def GetChildren(self):
				184	return self._children
				185
				186	def _AllChildren(self):
				187	all_children = []
				188	def Traverse(fragment):
				189	for c in fragment.GetChildren():
				190	all_children.append(c)
				191	Traverse(c)
				192	Traverse(self)
				193	return all_children
				194
				195	def WriteIndent(self, output):
				196	if self._indent:
				197	output.write(self._indent)
				198
				199	def WriteContentIntoParentAndClear(self):
				200	self._WriteContent(self._parent._content) # pylint: disable=protected-access
				201	self._ClearContent()
				202	self._children = []
				203
				204	def _WriteContent(self, output):
				205	"""Implementation of content rendering. Can be overridden in subclasses."""
				206	self._Write(output, self._prefix, self._content.getvalue(), self._suffix)
				207
				208	def _Write(self, output, prefix, content, suffix):
				209	"""Default implementation of content rendering for reuse by subclasses."""
				210	has_content = bool(content.strip())
				211	if prefix and has_content:
				212	output.write(prefix)
				213	output.write(content)
				214	if suffix and has_content:
				215	output.write(suffix)
				216
				217	def UnsetSuffix(self):
				218	self._suffix = ''
				219
				220	def UnsetPrefix(self):
				221	self._prefix = ''
				222
				223	def _UpdatePrefixAndSuffix(self, prefix, suffix):
				224	if self._prefix:
				225	self._prefix = prefix
				226	if self._suffix:
				227	self._suffix = suffix
				228
				229	def _ClearContent(self):
				230	"""Clears the content. This will only be called after it's been written."""
				231	self._content = io.StringIO()
				232
				233	def ConsumeContent(self):
				234	content = self._content
				235	self._ClearContent()
				236	return content
				237
				238	def Append(self, text):
				239	"""Appends text.
				240
				241	Args:
				242	text: The string to append, it will be escaped.
				243	"""
				244	assert isinstance(text, str)
				245	self._content.write(self.EscapeText(text))
				246
				247	def EscapeText(self, text):
				248	"""Escapes any reserved characters when Append() is called with text.
				249
				250	By default this defers to the parent fragment.
				251
				252	Args:
				253	text: The string to escape.
				254
				255	Returns:
				256	The escaped string.
				257	"""
				258	if self._parent:
				259	return self._parent.EscapeText(text)
				260	return text
				261
				262	def StripLine(self, text):
				263	"""Does any needed stripping of whitespace.
				264
				265	Some blocks (code for example) will want to preserve whitespace, while
				266	others will want to coalesce it together. By default this defers to the
				267	parent fragment.
				268
				269	Args:
				270	text: The string to strip
				271
				272	Returns:
				273	The stripped string.
				274	"""
				275	if self._parent:
				276	return self._parent.StripLine(text)
				277	return text
				278
				279	def WrapLine(self, line, indent):
				280	"""Wraps the line to fit into the column limit, if necessary.
				281
				282	Most blocks (code for example) will want to preserve whitespace and won't
				283	break their output.
				284
				285	Args:
				286	text: The string to wrap.
				287	indent: Indent string.
				288	Returns:
				289	The wrapped string.
				290	"""
				291	del indent
				292	return line
				293
				294	def NeedsToMergeWith(self, text):
				295	del text
				296	return False
				297
				298
				299	class HTML(Fragment):
				300	"""Markdown fragment that consists of just an unescaped HTML string."""
				301
				302	def __init__(self, prefix=None, suffix=None):
				303	super().__init__(indent=None, prefix=prefix, suffix=suffix)
				304
				305	def EscapeText(self, text):
				306	return text
				307
				308
				309	class Href(Fragment):
				310	"""HTML fragment containing an <a href=> tag. Used within table cells.
				311
				312	If the href falls within a table cell, using a Href() element will allow
				313	us to have proper formatting; the Markdown-style Link() element will not
				314	be processed properly.
				315	"""
				316	def __init__(self, href):
				317	super().__init__(indent=None, prefix='<a href="%s">' % href, suffix='</a>')
				318
				319
				320	class Text(Fragment):
				321	"""Markdown fragment that consists of just a string."""
				322
				323	def __init__(self, indent=None, prefix=None, suffix=None):
				324	super().__init__(indent, prefix, suffix)
				325
				326
				327	class IgnoreBlock(Fragment):
				328	"""Markdown fragment that omits all content."""
				329
				330	def __init__(self):
				331	super().__init__(None, None, None)
				332
				333
				334	class TextBlock(Text):
				335	"""A TextBlock coalesces all spaces and escapes all reserved chars."""
				336
				337	def EscapeText(self, text):
				338	text = _EscapeContentForHtml(text).getvalue()
				339	return _EscapeText(text, FLAGS.escape_chars)
				340
				341	def StripLine(self, text):
				342	# Treat newlines as spaces and then coalesce spaces.
				343	text = text.replace('\n', ' ')
				344	# Replace all Unicode nonbreaking spaces with simple spaces. This is safer
				345	# than deletion since spaces are coalesced below anyway.
				346	text = text.replace(chr(160), ' ')
				347
				348	return re.sub(r' +', ' ', text.strip())
				349
				350
				351	class Div(TextBlock):
				352	"""Placeholder that helps with the two-column layout conversion."""
				353
				354	def __init__(self, cls):
				355	self.cls = cls
				356	super().__init__()
				357
				358
class Table(TextBlock):
    """Placeholder that identifies when we're in a (data) table.

    (As opposed to a table being used for layout purposes, which we don't
    want to export.)
    """
    # Class-level default for the attribute that
    # MarkdownGenerator._LastFragmentIs() compares against.
    cls = None
				366
				367
				368	class TD(Text):
				369	def __init__(self):
				370	super().__init__(indent='', prefix='<td>', suffix='</td>')
				371
				372
class Content(TextBlock):
    """Placeholder that identifies when we're processing the main content."""
    # Class-level default for the attribute that
    # MarkdownGenerator._LastFragmentIs() compares against.
    cls = None
				376
				377
				378	class WrappedTextBlock(TextBlock):
				379	"""A WrappedTextBlock wraps the output lines to fit into the column limit."""
				380
				381	def WrapLine(self, line, indent):
				382	return _WrapLine(line, len(indent))
				383
				384
				385	class BlockquoteBlock(WrappedTextBlock):
				386	"""A BlockquoteBlock wraps content and prepends each line with '> '.
				387
				388	The generator must emit BlockquoteBlocks with no indent for paragraphs
				389	inside a blockquote. This will allow propagating the final call to WrapLine
				390	up to the outermost BlockquoteBlock which will wrap the lines and prepend
				391	each of them with the indent.
				392	"""
				393
				394	def __init__(self, indent='> '):
				395	super().__init__(indent, None, None)
				396
				397	def WrapLine(self, line, indent):
				398	if not self._indent and self._parent:
				399	return self._parent.WrapLine(line, indent)
				400	wrapped = _WrapLine(line, len(indent))
				401	lines = wrapped.splitlines(True)
				402	return indent.join([l.lstrip() for l in lines])
				403
				404
				405	class CodeBlock(Text):
				406	"""Base class for different code block fragment implementations."""
				407
				408	def EscapeText(self, text):
				409	return text
				410
				411	def StripLine(self, text):
				412	# Completely ignore newlines in code blocks. Sites always uses <br/>.
				413	return text.replace('\n', '')
				414
				415	def ChangeToHtml(self):
				416	content = self._content.getvalue()
				417	if content:
				418	self._content = _EscapeContentForHtml(content)
				419
				420
				421	class IndentedCodeBlock(CodeBlock):
				422	"""A IndentedCodeBlock indents by four spaces."""
				423
				424	def __init__(self, indent=' '):
				425	super().__init__(indent, None, None)
				426
				427
				428	class FencedCodeBlock(CodeBlock):
				429	"""A FencedCodeBlock is fenced with triple backticks (```).
				430
				431	To render correctly, content writing must not happen
				432	unless the end of the source code block has been encountered.
				433	That is, the entire code block from the source HTML must
				434	be rendered in a single write pass.
				435	"""
				436
				437	def __init__(self, indent=None,
				438	prefix='```none' + ENCODED_NEWLINE,
				439	suffix=ENCODED_NEWLINE + '```'):
				440	super().__init__(indent, prefix, suffix)
				441
				442	def WriteIndent(self, output):
				443	# Adjust inner fragments and self before rendering.
				444	if FLAGS.allow_html_code_blocks:
				445	has_formatted_text = False
				446	for c in self._AllChildren():
				447	if isinstance(c, FormattedText):
				448	c.ChangeToHtml()
				449	has_formatted_text = True
				450	if has_formatted_text:
				451	for c in self._AllChildren():
				452	if isinstance(c, CodeBlock):
				453	c.ChangeToHtml()
				454	self._UpdatePrefixAndSuffix(
				455	'<pre><code>', ENCODED_NEWLINE + '</code></pre>')
				456	super().WriteIndent(output)
				457
				458	def StripLine(self, text):
				459	text = super().StripLine(text)
				460	lines = _RestoreEncodedNewlines(text).splitlines()
				461	return '\n'.join([l for l in lines if l])
				462
				463	def WrapLine(self, line, indent):
				464	lines = line.splitlines(True)
				465	return indent.join(lines)
				466
				467
				468	class FencedCodeBlockLine(Text):
				469	"""A line of code inside FencedCodeBlock."""
				470
				471	def __init__(self, indent=None,
				472	prefix=ENCODED_NEWLINE, suffix=ENCODED_NEWLINE):
				473	super().__init__(indent, prefix, suffix)
				474
				475	def StripLine(self, text):
				476	text = super().StripLine(text)
				477	return _RestoreEncodedNewlines(text)
				478
				479
				480	class UnderlinedHeader(TextBlock):
				481	"""Markdown fragment for an underlined section header."""
				482
				483	def __init__(self, char):
				484	super().__init__()
				485	self._char = char
				486
				487	def _WriteContent(self, output):
				488	length = len(self.StripLine(self._content.getvalue()))
				489	if length > 0:
				490	# '\n' will be stripped, so use an encoded '\n' that we can later replace
				491	# after the line is stripped.
				492	self._Write(output,
				493	None,
				494	self._content.getvalue(),
				495	ENCODED_NEWLINE + self._char * length)
				496
				497	def StripLine(self, text):
				498	text = super().StripLine(text)
				499	return _RestoreEncodedNewlines(text)
				500
				501
				502	class FormattedText(Text):
				503	"""Text wrapped in Markdown formatting."""
				504
				505	def __init__(self, fmt):
				506	super().__init__(None, fmt, fmt)
				507
				508	def _Pad(self, bigger, smaller):
				509	return ' ' * (len(bigger) - len(smaller))
				510
				511	def _WriteContent(self, output):
				512	prefix = self._prefix
				513	content = self._content.getvalue()
				514	suffix = self._suffix
				515	if prefix:
				516	# If there are whitespaces immediately after the prefix,
				517	# they must be pushed out before the prefix.
				518	lstripped = content.lstrip()
				519	if len(content) > len(lstripped):
				520	prefix = self._Pad(content, lstripped) + prefix
				521	content = lstripped
				522	if suffix:
				523	# If there are whitespaces immediately before the suffix,
				524	# they must be pushed out after the suffix.
				525	rstripped = content.rstrip()
				526	if len(content) > len(rstripped):
				527	suffix = suffix + self._Pad(content, rstripped)
				528	content = rstripped
				529	self._Write(output, prefix, content, suffix)
				530
				531	def ChangeToHtml(self):
				532	content = self._content.getvalue()
				533	if content:
				534	content = _EscapeContentForHtml(content)
				535
				536
				537	class BoldFormattedText(FormattedText):
				538	"""Text formatted as bold."""
				539
				540	def __init__(self):
				541	super().__init__(FLAGS.bold_format)
				542
				543	def NeedsToMergeWith(self, text):
				544	return isinstance(text, BoldFormattedText)
				545
				546	def ChangeToHtml(self):
				547	super().ChangeToHtml()
				548	self._UpdatePrefixAndSuffix('<b>', '</b>')
				549
				550
				551	class ItalicFormattedText(FormattedText):
				552	"""Text formatted as italic."""
				553
				554	def __init__(self):
				555	super().__init__(FLAGS.italic_format)
				556
				557	def NeedsToMergeWith(self, text):
				558	return isinstance(text, ItalicFormattedText)
				559
				560	def ChangeToHtml(self):
				561	super().ChangeToHtml()
				562	self._UpdatePrefixAndSuffix('<i>', '</i>')
				563
				564
				565	class StrikeThroughFormattedText(FormattedText):
				566	"""Text formatted as strike through."""
				567
				568	def __init__(self):
				569	super().__init__(FLAGS.strike_format)
				570
				571	def NeedsToMergeWith(self, text):
				572	return isinstance(text, StrikeThroughFormattedText)
				573
				574	def ChangeToHtml(self):
				575	super().ChangeToHtml()
				576	self._UpdatePrefixAndSuffix('<s>', '</s>')
				577
				578
				579	class HighlightFormattedText(FormattedText):
				580	"""Highlighted text."""
				581
				582	def __init__(self):
				583	super().__init__(FLAGS.highlight_format)
				584
				585	def NeedsToMergeWith(self, text):
				586	return isinstance(text, HighlightFormattedText)
				587
				588	def ChangeToHtml(self):
				589	super().ChangeToHtml()
				590	self._UpdatePrefixAndSuffix('<u>', '</u>')
				591
				592
				593	class ListItem(Text):
				594	"""Item in a list."""
				595
				596	def __init__(self, bullet):
				597	super().__init__()
				598	self._bullet = bullet
				599
				600	def WriteIndent(self, output):
				601	if self._bullet:
				602	# TODO(dpranke): The original code relied on strings and bytes
				603	# being interchangeable in Python2, so you could seek backwards
				604	# from the current location with a relative offset. You can't
				605	# do that in Python3, apparently.
				606	#
				607	# To get around this for the moment, instead of seeking backwards
				608	# 4 characters, we embed 4 '\b' backspaces, and then have the client
				609	# do a global search and replace of ' \b\b\b\b' with '' instead.
				610	#
				611	# This is awkward, so we should rework this so that this isn't needed.
				612	#
				613	# output.seek(-len(self._bullet), os.SEEK_CUR)
				614	output.write('\b' * len(self._bullet))
				615	output.write(self._bullet)
				616	super().WriteIndent(output)
				617
				618	def _ClearContent(self):
				619	self._bullet = None
				620	super()._ClearContent()
				621
				622	def WrapLine(self, line, indent):
				623	return _WrapLine(line, len(indent))
				624
				625
				626	class Link(Text):
				627	"""Markdown link."""
				628
				629	def __init__(self, href):
				630	super().__init__()
				631	self._href = href
				632	self._url_opener_prefix = ''
				633	self._url_opener_suffix = ''
				634
				635	def MakeAnImage(self, width, height):
				636	self._url_opener_prefix = '!'
				637	if width and height:
				638	self._url_opener_suffix = (
				639	'{{width="{}" height="{}"}}'.format(width, height))
				640
				641	def _IsShortLink(self, text):
				642	if FLAGS.shortlinks_regex and (
				643	re.compile(FLAGS.shortlinks_regex).match(self._href)):
				644	parsed_href = urllib.parse.urlsplit(self._href)
				645	if parsed_href.netloc + parsed_href.path == text:
				646	return True
				647	return None
				648
				649	def _WriteLink(self, output, text):
				650	write_short_link = (not (self._url_opener_prefix or self._url_opener_suffix)
				651	and self._IsShortLink(text))
				652	if write_short_link:
				653	self._Write(output, None, text, None)
				654	else:
				655	self._Write(output,
				656	self._url_opener_prefix + '[',
				657	text,
				658	'](' + self._href + ')' + self._url_opener_suffix)
				659
				660	def _WriteContent(self, output):
				661	text = self._content.getvalue()
				662	if text:
				663	if text.startswith('http://') or text.startswith('https://'):
				664	self._Write(output, '<', text, '>')
				665	else:
				666	self._WriteLink(output, text)
				667
				668
				669	class Image(Text):
				670	"""Image."""
				671
				672	def __init__(self, src, alt, width, height):
				673	super().__init__()
				674	self._src = src
				675	self._alt = alt or 'image'
				676	self._width = width
				677	self._height = height
				678
				679	def _WriteContent(self, output):
				680	tag = '<img alt="%s" src="%s"' % (self._alt, self._src)
				681	if self._height:
				682	tag += ' height=%s' % self._height
				683	if self._width:
				684	tag += ' width=%s' % self._width
				685	tag += '>'
				686	self._Write(output, '', tag, '')
				687
				688
				689	class Code(Text):
				690	"""Inline code."""
				691
				692	def __init__(self):
				693	super().__init__(None, '`', '`')
				694
				695	def EscapeText(self, text):
				696	return text
				697
				698	def _WriteContent(self, output):
				699	prefix = self._prefix
				700	content = self._content.getvalue()
				701	suffix = self._suffix
				702	if '`' in content:
				703	# If a backtick (`) is present inside inline code, the fragment
				704	# must use double backticks.
				705	prefix = suffix = '``'
				706	# Since having content starting or ending with a backtick would emit
				707	# triple backticks which designates a fenced code fragment, pad content
				708	# to avoid this.
				709	if content.startswith('`'):
				710	content = ' ' + content
				711	if content.endswith('`'):
				712	content += ' '
				713	self._Write(output, prefix, content, suffix)
				714
				715	def NeedsToMergeWith(self, text):
				716	return isinstance(text, Code)
				717
				718
				719	class EmbeddedContent(Text):
				720	"""Embedded content: Docs, Drawings, Presentations, etc."""
				721
				722	def __init__(self, href, width, height):
				723	super().__init__()
				724	self._href = href
				725	self._width = width
				726	self._height = height
				727
				728	def _WriteContent(self, output):
				729	parsed_href = urllib.parse.urlsplit(self._href)
				730	if parsed_href.scheme == 'http':
				731	parsed_href = urllib.parse.SplitResult(
				732	'https', parsed_href.netloc, parsed_href.path, parsed_href.query,
				733	parsed_href.fragment)
				734	# Note: 'allow="fullscreen"' is requested for all content for simplicity.
				735	# g3doc server has dedicated logic to deal with these requests.
				736	element = '<iframe src="{}"{} allow="fullscreen" />'.format(
				737	urllib.parse.urlunsplit(parsed_href),
				738	(' width="{}" height="{}"'.format(self._width, self._height) if (
				739	self._width and self._height) else ''))
				740	self._Write(output, None, element, None)
				741
				742
				743	class ListInfo:
				744
				745	def __init__(self, tag):
				746	self.tag = tag # The tag used to start the list
				747	self.item_count = 0 # The number of items in the list
				748
				749
				750	class FragmentTree:
				751	"""Class for managing a tree of fragments.
				752
				753	There is a "scope" formed by nested fragments, e.g.
				754	italic fragment inside bold fragment inside paragraph.
				755	The scope is stored in the stack. For convenience,
				756	the stack always have one element.
				757
				758	Fragments popped out from the scope may be re-added
				759	back into the tree as children of the last fragment.
				760	This allows "chaining" of structured content for future
				761	processing. For example, if there were several bold
				762	fragments inside a paragraph interleaved with fragments
				763	of regular text, all these fragments will end up as
				764	children of the paragraph fragment.
				765
				766	"""
				767
				768	def __init__(self, top_fragment):
				769	self._stack = [top_fragment]
				770
				771	def ActiveFragmentScopeDepth(self):
				772	return len(self._stack) - 1
				773
				774	def StartFragment(self, fragment):
				775	fragment.SetParent(self._stack[-1])
				776	self._stack.append(fragment)
				777	return fragment
				778
				779	def EndFragment(self):
				780	return self._stack.pop()
				781
				782	def AppendFragment(self, fragment):
				783	return self._stack[-1].AddChild(fragment)
				784
				785	def _ApplyRecursivelyToNode(self, node, scope_operation, operation, # pylint: disable=missing-docstring
				786	debug_indent):
				787	if not debug_indent:
				788	for child in node.GetChildren():
				789	self._ApplyRecursivelyToNode(child, scope_operation, operation, None)
				790	else:
				791	debug_indent += ' c '
				792	for child in node.GetChildren():
				793	print(debug_indent + repr(child))
				794	self._ApplyRecursivelyToNode(child, scope_operation, operation,
				795	debug_indent)
				796	operation(node)
				797
				798	def _ApplyRecursivelyToScope(self, nodes, scope_operation, operation, # pylint: disable=missing-docstring
				799	debug_indent):
				800	node = nodes.pop()
				801	scope_operation(node)
				802	if debug_indent:
				803	print(debug_indent + repr(node))
				804	if nodes:
				805	self._ApplyRecursivelyToScope(nodes, scope_operation, operation,
				806	(debug_indent + ' s ' if debug_indent
				807	else None))
				808	self._ApplyRecursivelyToNode(node, scope_operation, operation,
				809	debug_indent)
				810
				811	def ApplyToAllFragments(self, scope_operation, operation):
				812	"""Recursively applies operations to all fragments in the tree.
				813
				814	The omnipresent topmost fragment is excluded. The 'scope_operation'
				815	is applied to every element in the fragment stack in pre-order.
				816	The 'operation' is applied to all fragments in the tree in post-order.
				817
				818	Args:
				819	scope_operation: The operation to apply to fragments in the scope stack.
				820	operation: The operation to apply to all fragments in the tree.
				821	"""
				822	self._ApplyRecursivelyToScope(list(reversed(self._stack[1:])),
				823	scope_operation, operation,
				824	' ' if FLAGS.debug_print_tree else None)
				825
				826	def FindFirstFragmentFromEnd(self, predicate, steps_from_last=0):
				827	sub_stack = self._stack[:-steps_from_last if steps_from_last else None]
				828	return next((node for node in sub_stack if predicate(node)), None)
				829
				830	def PeekFragmentFromStart(self, steps_from_first=0):
				831	return self._stack[steps_from_first]
				832
				833	def PeekFragmentFromEnd(self, steps_from_last=0):
				834	return self._stack[-(steps_from_last + 1)]
				835
				836	def PeekLastAppendedFragment(self):
				837	return (self._stack[-1].GetChildren()[-1]
				838	if self._stack[-1].GetChildren() else None)
				839
				840
				841	class MarkdownGenerator:
				842	"""Generates Markdown based on the series of HTML tags seen.
				843
				844	Each time an opening HTML tag is seen, the appropriate markdown fragment is
				845	created and pushed onto a stack. Any text encountered is appended to the
				846	fragment at the top of the stack. When a closing HTML tag is seen, the stack
				847	is popped and the fragment removed is appended to the new top of the stack.
				848
				849	Markdown is buffered in the fragment stack until an entire line has been
				850	formed, at which point _WriteFragmentsAsLine() is called to write it out. The
				851	content buffered in the stack is cleared, but otherwise the stack remains
				852	unmodified.
				853	"""
				854
    def __init__(self, out, url_translator):
        """Sets up an empty fragment tree and compiles the class regexes.

        Args:
          out: Stored as the output of the generator.
          url_translator: Stored for later use by the generator.
        """
        def compile_or_match_nothing(pattern):
            # An unset pattern becomes 'a^', which matches nothing, so the
            # regexps never have to be None.
            return re.compile(pattern or 'a^')

        self._out = out
        self._url_translator = url_translator
        self._fragment_tree = FragmentTree(Text())
        self._list_info_stack = []
        self._pending_newlines = 0
        self._code_class_regex = compile_or_match_nothing(
            FLAGS.code_class_regex)
        self._toc_class_regex = compile_or_match_nothing(
            FLAGS.toc_class_regex)
        self._ignore_class_regex = compile_or_match_nothing(
            FLAGS.ignore_class_regex)
        self._ignore_style_regex = compile_or_match_nothing(
            FLAGS.ignore_style_regex)
				866
    def _Push(self, fragment):
        """Sets the parent fragment and pushes it onto the fragment stack.

        In the case where there is an IgnoreBlock on the stack, a new
        IgnoreBlock is pushed instead.

        Args:
          fragment: The Fragment object to push on the stack.
        """
        tree = self._fragment_tree
        if isinstance(tree.PeekFragmentFromEnd(), IgnoreBlock):
            # Everything nested inside ignored content is ignored as well.
            tree.StartFragment(IgnoreBlock())
            return

        # Check if we need to merge adjacent formatting, e.g. instead of
        # **bold****bold** we need to write **boldbold**, as the former is
        # not correct Markdown syntax.
        previous = tree.PeekLastAppendedFragment()
        if previous and previous.NeedsToMergeWith(fragment):
            previous.UnsetSuffix()
            fragment.UnsetPrefix()
        tree.StartFragment(fragment)
				889
    def _Pop(self):
        """Closes the fragment at the top of the fragment stack.

        While more than one scope is open, the popped fragment is appended
        as a child of the new top of the stack. Otherwise the buffered
        fragments are written to the output first and the fragment is then
        simply removed.
        """
        tree = self._fragment_tree
        if tree.ActiveFragmentScopeDepth() <= 1:
            self._WriteFragmentsAsLine(newlines=0)
            tree.EndFragment()
            return
        finished = tree.EndFragment()
        tree.AppendFragment(finished)
				902
				903	def _IsWithinFragmentType(self, fragment_type, steps_from_last=0):
				904	return self._fragment_tree.FindFirstFragmentFromEnd(
				905	lambda fragment: isinstance(fragment, fragment_type),
				906	steps_from_last) is not None
				907
				908	def _LastFragmentIs(self, fragment_type, cls):
				909	fragment = self._fragment_tree.PeekFragmentFromEnd()
				910	return (isinstance(fragment, fragment_type) and fragment.cls == cls)
				911
    def Break(self):
        """Handles a line break.

        Inside a fenced code block the break becomes an encoded newline
        carried by its own line fragment; everywhere else the buffered
        fragments are written out followed by one newline.
        """
        if self._IsWithinFragmentType(FencedCodeBlock):
            line_break = FencedCodeBlockLine(prefix='', suffix='')
            self._Push(line_break)
            line_break.Append(ENCODED_NEWLINE)
            self._Pop()
            return
        self._WriteFragmentsAsLine(newlines=1)
				920
    def HorizontalRule(self):
        """Emits a Markdown horizontal rule as a padded paragraph."""
        # Horizontal rule must be preceded and followed by a blank line
        rule = '---'
        self._AddVerticallyPaddedParagraph(rule)
				924
				925	def StartDocument(self):
				926	self._Push(WrappedTextBlock())
				927
				928	def EndDocument(self):
				929	self._Pop()
				930
				931	def StartParagraph(self):
				932	self._WriteFragmentsAsLine(newlines=2)
				933
				934	def EndParagraph(self):
				935	self._WriteFragmentsAsLine(newlines=2)
				936
				937	def StartDiv(self, cls, style, ident):
				938	"""Process opening of a div element.
				939
				940	Args:
				941	cls: The class attribute of the element.
				942	style: The style attribute of the element.
				943	ident: The id attribute of the element
				944	"""
				945	if not self._IsWithinFragmentType(FencedCodeBlock):
				946	if self._IsWithinFragmentType(CodeBlock):
				947	self._WriteFragmentsAsLine(newlines=1)
				948	else:
				949	self._WriteFragmentsAsLine(newlines=2)
				950
				951	if ((cls and self._ignore_class_regex.match(cls)) or
				952	style and self._ignore_style_regex.match(style)):
				953	self._Push(IgnoreBlock())
				954	elif self._IsWithinFragmentType(FencedCodeBlock):
				955	self._Push(FencedCodeBlockLine())
				956	elif self._IsWithinFragmentType(CodeBlock):
				957	self._Push(CodeBlock())
				958	elif self._IsWithinFragmentType(BlockquoteBlock):
				959	self._Push(BlockquoteBlock(indent=None))
				960	elif cls and self._toc_class_regex.match(cls):
				961	self._AddTableOfContents()
				962	self._Push(IgnoreBlock()) # Ignore the items inside the Sites TOC
				963	elif cls and self._code_class_regex.match(cls):
				964	if FLAGS.indented_code_blocks:
				965	self._Push(IndentedCodeBlock())
				966	else:
				967	self._Push(FencedCodeBlock())
				968	else:
				969	self._Push(WrappedTextBlock())
				970
				971	def EndDiv(self):
				972	if not self._IsWithinFragmentType(FencedCodeBlock, steps_from_last=1):
				973	if self._IsWithinFragmentType(CodeBlock, steps_from_last=1):
				974	self._WriteFragmentsAsLine(newlines=1)
				975	else:
				976	self._WriteFragmentsAsLine(newlines=2)
				977	self._Pop()
				978
				979	def StartHeader(self, level):
				980	self._WriteFragmentsAsLine(newlines=2)
				981	if level == 1 and FLAGS.underline_headers:
				982	self._Push(UnderlinedHeader('='))
				983	elif level == 2 and FLAGS.underline_headers:
				984	self._Push(UnderlinedHeader('-'))
				985	else:
				986	self._Push(TextBlock(prefix=('#' * level) + ' '))
				987
				988	def EndHeader(self):
				989	self._WriteFragmentsAsLine(newlines=2)
				990	self._Pop()
				991
				992	def StartList(self, tag):
				993	if not self._list_info_stack:
				994	self._WriteFragmentsAsLine(newlines=2)
				995	else:
				996	self._WriteFragmentsAsLine(newlines=1)
				997	self._list_info_stack.append(ListInfo(tag))
				998	if tag == 'ol':
				999	self._Push(Text(' ' * FLAGS.ordered_list_indent))
				1000	else:
				1001	self._Push(Text(' ' * FLAGS.unordered_list_indent))
				1002
				1003	def EndList(self):
				1004	self._list_info_stack.pop()
				1005	if not self._list_info_stack:
				1006	self._WriteFragmentsAsLine(newlines=2)
				1007	else:
				1008	self._WriteFragmentsAsLine(newlines=1)
				1009	self._Pop()
				1010
				1011	def StartListItem(self):
				1012	self._WriteFragmentsAsLine(newlines=1)
				1013	# Google Sites sometimes spits out pages with <li> tags not enclosed within
				1014	# an <ol> or <ul> tag.
				1015	tag = ''
				1016	if self._list_info_stack:
				1017	self._list_info_stack[-1].item_count += 1
				1018	tag = self._list_info_stack[-1].tag
				1019	if tag == 'ol':
				1020	item_count = self._list_info_stack[-1].item_count
				1021	# string.ljust makes room for as many digits as you need.
				1022	prefix = ('%d.' % item_count).ljust(FLAGS.ordered_list_indent)
				1023	self._Push(ListItem(prefix))
				1024	else:
				1025	prefix = '*'.ljust(FLAGS.unordered_list_indent)
				1026	self._Push(ListItem(prefix))
				1027
				1028	def EndListItem(self):
				1029	self._WriteFragmentsAsLine(newlines=1)
				1030	self._Pop()
				1031
				1032	def StartFormat(self, tag):
				1033	# Allowed formatting depends on the surrounding fragment type.
				1034	if self._IsWithinFragmentType(TD) and tag == 'b':
				1035	# TODO(dpranke): This is a hack because I don't yet really understand
				1036	# how the ChangeToHtml() logic works in CodeBlocks, but it seems like
				1037	# we should be able to do something similar to what they do.
				1038	# Also, this should really be rewriting these to <th>s instead.
				1039	self._Push(HTML('<b>', '</b>'))
				1040	return
				1041
				1042	if not self._IsWithinFragmentType(IndentedCodeBlock):
				1043	formats_map = {
				1044	'i': ItalicFormattedText,
				1045	'em': ItalicFormattedText,
				1046	'b': BoldFormattedText,
				1047	'strong': BoldFormattedText,
				1048	'strike': StrikeThroughFormattedText,
				1049	's': StrikeThroughFormattedText,
				1050	'del': StrikeThroughFormattedText,
				1051	'u': HighlightFormattedText,
				1052	'code': Code,
				1053	None: Text,
				1054	}
				1055	if self._IsWithinFragmentType(FencedCodeBlock):
				1056	if FLAGS.allow_html_code_blocks:
				1057	# HTML code block can render formats but must not use Code fragments.
				1058	formats_map['code'] = formats_map[None] = CodeBlock
				1059	else:
				1060	formats_map = {None: CodeBlock}
				1061	else:
				1062	# Inside an indented code block no formatting is allowed.
				1063	formats_map = {None: CodeBlock}
				1064	self._Push(formats_map[tag]() if tag in formats_map
				1065	else formats_map[None]())
				1066
				1067	def EndFormat(self):
				1068	self._Pop()
				1069
				1070	def StartAnchor(self, href):
				1071	if href is not None:
				1072	href = self._url_translator.Translate(href)
				1073	if self._IsWithinFragmentType(TD):
				1074	self._Push(Href(href))
				1075	else:
				1076	self._Push(Link(href))
				1077	else:
				1078	self._Push(Text())
				1079
				1080	def EndAnchor(self):
				1081	self._Pop()
				1082
				1083	def StartBlockquote(self):
				1084	if not self._IsWithinFragmentType(CodeBlock):
				1085	self._WriteFragmentsAsLine(newlines=1)
				1086	self._Push(BlockquoteBlock())
				1087	else:
				1088	self._Push(Text())
				1089
				1090	def EndBlockquote(self):
				1091	if not self._IsWithinFragmentType(CodeBlock):
				1092	self._WriteFragmentsAsLine(newlines=2)
				1093	self._Pop()
				1094
				1095	def Image(self, src, alt, width, height):
				1096	src = self._url_translator.Translate(src)
				1097	self._fragment_tree.AppendFragment(Image(src, alt, width, height))
				1098
				1099	def Iframe(self, src, width, height):
				1100	"""Process an <iframe> element.
				1101
				1102	Sites use <iframe> for embedded content: Docs, Drawings, etc.
				1103	g3doc implements this by supporting <iframe> HTML tag directly.
				1104
				1105	Args:
				1106	src: Source URL.
				1107	width: Element width.
				1108	height: Element height.
				1109	"""
				1110	if False:
				1111	# TODO(dpranke): Figure out if we should support embedded IFRAME tags.
				1112	# For now, we skip over them.
				1113	self._WriteFragmentsAsLine(newlines=2)
				1114	self._Push(EmbeddedContent(src, width, height))
				1115	self._Pop()
				1116
				1117	def StartTable(self, cls):
				1118	if (cls and 'sites-layout-hbox' in cls and
				1119	'sites-layout-name-one-column' not in cls):
				1120	self._AddHTMLBlock('<div class="two-column-container">')
				1121	self._Push(Div(cls='two-column-container'))
				1122	elif (cls and 'sites-layout-name-one-column' in cls):
				1123	pass
				1124	else:
				1125	self._AddHTMLBlock('<table>')
				1126	self._Push(Table())
				1127
				1128	def EndTable(self):
				1129	if self._LastFragmentIs(Div, cls='two-column-container'):
				1130	self._AddHTMLBlock('</div>')
				1131	self._Pop()
				1132	elif self._IsWithinFragmentType(Table):
				1133	self._AddHTMLBlock('</table>')
				1134	self._Pop()
				1135
				1136	def StartTR(self):
				1137	if self._IsWithinFragmentType(Table):
				1138	self._AddHTMLBlock('<tr>')
				1139
				1140	def EndTR(self):
				1141	if self._IsWithinFragmentType(Table):
				1142	self._AddHTMLBlock('</tr>')
				1143
				1144	def StartTD(self, cls):
				1145	if self._LastFragmentIs(Div, cls='two-column-container'):
				1146	if cls and ('sites-tile-name-content-1' in cls or
				1147	'sites-tile-name-content-2' in cls):
				1148	self._AddHTMLBlock('<div class="column">')
				1149	self._Push(Div(cls='column'))
				1150	else:
				1151	self._Push(Text())
				1152	elif self._IsWithinFragmentType(Table):
				1153	self._Push(TD())
				1154
				1155	def EndTD(self):
				1156	if self._LastFragmentIs(Div, cls='column'):
				1157	self._AddHTMLBlock('</div>')
				1158	self._Pop()
				1159	elif self._IsWithinFragmentType(Table):
				1160	self._Pop()
				1161	self._WriteFragmentsAsLine(newlines=1)
				1162
				1163	def Text(self, text):
				1164	if not isinstance(self._fragment_tree.PeekFragmentFromEnd(), IgnoreBlock):
				1165	fragment = (CodeBlock() if self._IsWithinFragmentType(CodeBlock)
				1166	else Text())
				1167	self._fragment_tree.AppendFragment(fragment)
				1168	fragment.Append(text)
				1169
				1170	def _AddTableOfContents(self):
				1171	# TOC must be preceded and followed by a blank line
				1172	self._AddVerticallyPaddedParagraph('[TOC]')
				1173
				1174	def _AddVerticallyPaddedParagraph(self, text):
				1175	self._WriteFragmentsAsLine(newlines=2)
				1176	fragment = CodeBlock() # Use CodeBlock to prevent escaping
				1177	self._fragment_tree.AppendFragment(fragment)
				1178	fragment.Append(text)
				1179	self._WriteFragmentsAsLine(newlines=2)
				1180
				1181	def _AddHTMLBlock(self, html):
				1182	"""Writes out a block-level string of html."""
				1183	fragment = HTML()
				1184	fragment.Append(html)
				1185	self._fragment_tree.AppendFragment(fragment)
				1186	self._WriteFragmentsAsLine(newlines=1)
				1187
				1188	def _WriteFragmentsAsLine(self, newlines):
				1189	"""Writes out any content currently buffered in the fragment stack.
				1190
				1191	Args:
				1192	newlines: The minimum number of newlines required in the output after this
				1193	line. These newlines won't be written out until the next line with
				1194	content is encountered.
				1195	"""
				1196
				1197	# Generate indent and the content, then clear content in fragments.
				1198	indent = io.StringIO()
				1199	self._fragment_tree.ApplyToAllFragments(
				1200	lambda fragment: fragment.WriteIndent(indent),
				1201	lambda fragment: fragment.WriteContentIntoParentAndClear())
				1202	last_fragment = self._fragment_tree.PeekFragmentFromEnd()
				1203	content = self._fragment_tree.PeekFragmentFromStart().ConsumeContent()
				1204	content = last_fragment.StripLine(content.getvalue())
				1205	indent = indent.getvalue()
				1206	content = last_fragment.WrapLine(content, indent)
				1207
				1208	# Write the content, if any.
				1209	if content:
				1210	self._out.write('\n' * self._pending_newlines)
				1211	self._out.write(indent)
				1212	self._out.write(content)
				1213	self._pending_newlines = newlines
				1214	elif self._pending_newlines > 0 and self._pending_newlines < newlines:
				1215	self._pending_newlines = newlines
				1216
				1217	if FLAGS.debug_print_tree:
				1218	# Separate trees printed during each writing session
				1219	print('-' * 20)
				1220
				1221
				1222	class XhtmlHandler(xml.sax.ContentHandler):
				1223	"""Translates SAX events into MarkdownGenerator calls."""
				1224
				1225	# regex that matches an HTML header tag and extracts the level.
				1226	_HEADER_TAG_RE = re.compile(r'h([1-6])$')
				1227
				1228	def __init__(self, out, url_translator):
				1229	xml.sax.ContentHandler.__init__(self)
				1230	self._generator = MarkdownGenerator(out, url_translator)
				1231
				1232	def startDocument(self):
				1233	self._generator.StartDocument()
				1234
				1235	def endDocument(self):
				1236	self._generator.EndDocument()
				1237
				1238	def startElementNS(self, name, qname, attrs):
				1239	tag = name[1]
				1240	if tag == 'a':
				1241	href = attrs.get((None, 'href'))
				1242	self._generator.StartAnchor(href)
				1243	elif tag == 'br':
				1244	self._generator.Break()
				1245	elif tag == 'hr':
				1246	self._generator.HorizontalRule()
				1247	elif tag == 'li':
				1248	self._generator.StartListItem()
				1249	elif tag == 'div':
				1250	cls = attrs.get((None, 'class'))
				1251	style = attrs.get((None, 'style'))
				1252	ident = attrs.get((None, 'id'))
				1253	self._generator.StartDiv(cls, style, ident)
				1254	elif tag == 'p':
				1255	self._generator.StartParagraph()
				1256	elif tag in ('b', 'code', 'em', 'i', 'strong', 's', 'strike', 'del', 'u'):
				1257	self._generator.StartFormat(tag)
				1258	elif tag in ('ul', 'ol'):
				1259	self._generator.StartList(tag)
				1260	elif tag == 'img':
				1261	src = attrs.get((None, 'src'))
				1262	alt = attrs.get((None, 'alt'))
				1263	width = attrs.get((None, 'width'))
				1264	height = attrs.get((None, 'height'))
				1265	self._generator.Image(src, alt, width, height)
				1266	elif tag == 'blockquote':
				1267	self._generator.StartBlockquote()
				1268	elif tag == 'iframe':
				1269	src = attrs.get((None, 'src'))
				1270	width = attrs.get((None, 'width'))
				1271	height = attrs.get((None, 'height'))
				1272	self._generator.Iframe(src, width, height)
				1273	elif tag == 'table':
				1274	cls = attrs.get((None, 'class'))
				1275	self._generator.StartTable(cls)
				1276	elif tag == 'tr':
				1277	self._generator.StartTR()
				1278	elif tag == 'td':
				1279	self._generator.StartTD(attrs.get((None, 'class')))
				1280	else:
				1281	match = self._HEADER_TAG_RE.match(tag)
				1282	if match:
				1283	level = int(match.group(1))
				1284	self._generator.StartHeader(level)
				1285
				1286	def endElementNS(self, name, qname):
				1287	tag = name[1]
				1288	if tag == 'a':
				1289	self._generator.EndAnchor()
				1290	elif tag == 'li':
				1291	self._generator.EndListItem()
				1292	elif tag == 'div':
				1293	self._generator.EndDiv()
				1294	elif tag == 'p':
				1295	self._generator.EndParagraph()
				1296	elif tag in ('b', 'code', 'em', 'i', 'strong', 's', 'strike', 'del', 'u'):
				1297	self._generator.EndFormat()
				1298	elif tag in ('ul', 'ol'):
				1299	self._generator.EndList()
				1300	elif tag == 'blockquote':
				1301	self._generator.EndBlockquote()
				1302	elif tag == 'td':
				1303	self._generator.EndTD()
				1304	elif tag == 'tr':
				1305	self._generator.EndTR()
				1306	elif tag == 'table':
				1307	self._generator.EndTable()
				1308	else:
				1309	match = self._HEADER_TAG_RE.match(tag)
				1310	if match:
				1311	self._generator.EndHeader()
				1312
				1313	def characters(self, content):
				1314	self._generator.Text(content)
				1315
				1316
				1317	class DefaultUrlTranslator:
				1318	"""No-op UrlTranslator."""
				1319
				1320	def Translate(self, href):
				1321	return href
				1322
				1323
				1324	def Convert(input_stream, output_stream, url_translator=DefaultUrlTranslator()):
				1325	"""Converts an input stream of xhtml into an output stream of markdown.
				1326
				1327	Args:
				1328	input_stream: filehandle for the XHTML input.
				1329	output_stream: filehandle for the Markdown output.
				1330	url_translator: Callback for translating URLs embedded in the page.
				1331	"""
				1332	parser = xml.sax.make_parser()
				1333	parser.setContentHandler(XhtmlHandler(output_stream, url_translator))
				1334	parser.setFeature(xml.sax.handler.feature_namespaces, 1)
				1335	parser.parse(input_stream)