blob: 0ba8381c0486dd30038f638d6845488db28a79dd [file] [log] [blame]
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14
15"""HTML to Markdown renderer."""
16
import io
import os
import re
import textwrap
import urllib
import urllib.parse
import xml.sax
23
24
class _Flags:
  """Container for the renderer's tunable options (plain class attributes)."""

  # When True, h1 and h2 headers are rendered as text underlined with
  # '=' and '-' respectively instead of with leading '#' characters.
  underline_headers = False

  # Characters that get a backslash in front of them in the generated
  # Markdown. This is deliberately not every special Markdown character,
  # only the ones most often misread as Markdown syntax; blindly escaping
  # all of them produces ugly Markdown.
  escape_chars = r'\`*[]'

  # Marker written on both sides of italic text.
  italic_format = '*'

  # Marker written on both sides of bold text.
  bold_format = '**'

  # Marker written on both sides of struck-through text.
  strike_format = '~~'

  # Marker written on both sides of highlighted text (used for <u> tags).
  highlight_format = '=='

  # Width, in columns, of one unordered-list indent. The bullet is part of
  # that width, so a value of 4 yields '*   '.
  unordered_list_indent = 4

  # Width, in columns, of one ordered-list indent. The number is part of
  # that width, so a value of 4 yields '1.  '.
  ordered_list_indent = 4

  # Regex matched against a DIV's class; matching DIVs are rendered as code.
  code_class_regex = r'^sites-codeblock sites-codesnippet-block$'

  # Regex matched against a DIV's class to detect a table of contents.
  toc_class_regex = r'^sites-embed-content sites-embed-type-toc$'

  # Regex matched against a DIV's class; matching DIVs are dropped.
  # An empty pattern disables the check.
  ignore_class_regex = r''

  # Regex matched against a DIV's style; matching DIVs are dropped.
  ignore_style_regex = r'^display:none;$'

  # Column limit used when wrapping text blocks. Zero disables wrapping.
  line_width = 80

  # Use indented code blocks when True, fenced code blocks when False.
  indented_code_blocks = False

  # Allow a fenced code block to be emitted as an HTML code block when the
  # source code block contains formatted text.
  allow_html_code_blocks = True

  # Links whose target matches this regex may be written as bare short links.
  shortlinks_regex = r'^http://(ag|b|cl|g|go|who)/'

  # Print the fragment tree while rendering, for debugging.
  debug_print_tree = False


FLAGS = _Flags()
90
91
def _EscapeText(text, reserved_chars):
  """Returns `text` with a backslash inserted before each reserved character.

  Args:
    text: The string to escape.
    reserved_chars: A string holding every character that must be escaped.

  Returns:
    A new string; characters not in `reserved_chars` are copied unchanged.
  """
  return ''.join(
      '\\' + ch if ch in reserved_chars else ch
      for ch in text)
108
109
def _EscapeContentForHtml(text):
  """Entity-escapes '<' and '>' in `text`.

  Args:
    text: The string to escape.

  Returns:
    An io.StringIO (not a str) holding the escaped text, positioned at its
    end so that further writes append. Only '<' and '>' are replaced.
  """
  buf = io.StringIO()
  buf.write(text.replace('<', '&lt;').replace('>', '&gt;'))
  return buf
116
117
# Numeric character reference for '\n'. It stands in for a newline in content
# that will have its real newlines stripped before being written out.
ENCODED_NEWLINE = '&#{:d};'.format(ord('\n'))


def _RestoreEncodedNewlines(text):
  """Returns `text` with every ENCODED_NEWLINE turned back into '\\n'."""
  return '\n'.join(text.split(ENCODED_NEWLINE))
123
124
def _WrapLine(line, indent):
  """Wraps the line to fit into the column limit.

  Continuation lines are prefixed with `indent` spaces; the first line is
  not. Long words and hyphenated words are never split.

  Args:
    line: The string to wrap.
    indent: An integer with the number of columns of indentation.

  Returns:
    The wrapped text, or `line` unchanged when FLAGS.line_width is not
    positive.
  """
  if FLAGS.line_width <= 0:
    return line
  # textwrap.wrap() raises ValueError for a non-positive width, which would
  # happen once the indentation reaches the column limit. Clamp to 1 so that
  # deeply indented text still renders (one word per line) instead of failing.
  width = max(FLAGS.line_width - indent, 1)
  continuation = '\n' + ' ' * indent
  return continuation.join(textwrap.wrap(
      line,
      width=width,
      break_long_words=False,
      break_on_hyphens=False))
142
143
class Fragment:
  """Base class for all output fragments.

  A fragment buffers escaped text in an in-memory stream and knows its
  indent, prefix and suffix strings, its parent and its appended children.

  To generate a line of output, the methods will be called in the following
  order:

    WriteIndent()
    WriteContentIntoParentAndClear()
    ConsumeContent() -- for the topmost fragment only
    StripLine()
    WrapLine()
  """

  def __init__(self, indent, prefix, suffix):
    """Initializes an empty fragment.

    Args:
      indent: String written by WriteIndent(), or None/'' for no indent.
      prefix: String written before non-blank content, or None/''.
      suffix: String written after non-blank content, or None/''.
    """
    self._content = io.StringIO()
    self._indent = indent
    self._prefix = prefix
    self._suffix = suffix
    self._parent = None
    self._children = []

  def __repr__(self):
    # All parts are kept as str: encoding them to bytes (as was done for
    # Python 2) makes the concatenation below raise TypeError on Python 3.
    def show(text):
      return text if text else ''

    return ('{' +
            self.__class__.__name__ +
            ': indent=' + show(self._indent) +
            '; prefix=' + show(self._prefix) +
            '; content=' + show(self._content.getvalue()) +
            '; suffix=' + show(self._suffix) +
            '}')

  def SetParent(self, parent):
    """Records `parent` as the fragment this one writes its content into."""
    self._parent = parent

  def AddChild(self, node):
    """Appends `node` as a child, sets its parent to self and returns it."""
    self._children.append(node)
    node.SetParent(self)
    return node

  def GetChildren(self):
    """Returns the list of direct children (the live list, not a copy)."""
    return self._children

  def _AllChildren(self):
    """Returns all descendants in depth-first, parent-before-child order."""
    all_children = []

    def Traverse(fragment):
      for c in fragment.GetChildren():
        all_children.append(c)
        Traverse(c)

    Traverse(self)
    return all_children

  def WriteIndent(self, output):
    """Writes this fragment's indent to `output`, if it has one."""
    if self._indent:
      output.write(self._indent)

  def WriteContentIntoParentAndClear(self):
    """Renders into the parent's buffer, then drops content and children."""
    self._WriteContent(self._parent._content)  # pylint: disable=protected-access
    self._ClearContent()
    self._children = []

  def _WriteContent(self, output):
    """Implementation of content rendering. Can be overridden in subclasses."""
    self._Write(output, self._prefix, self._content.getvalue(), self._suffix)

  def _Write(self, output, prefix, content, suffix):
    """Default implementation of content rendering for reuse by subclasses.

    The content is always written; the prefix and suffix are written only
    when the content contains something other than whitespace.
    """
    has_content = bool(content.strip())
    if prefix and has_content:
      output.write(prefix)
    output.write(content)
    if suffix and has_content:
      output.write(suffix)

  def UnsetSuffix(self):
    """Removes the suffix so nothing is written after the content."""
    self._suffix = ''

  def UnsetPrefix(self):
    """Removes the prefix so nothing is written before the content."""
    self._prefix = ''

  def _UpdatePrefixAndSuffix(self, prefix, suffix):
    """Replaces the prefix/suffix, but only the ones that are currently set.

    A prefix or suffix that is empty (for example after UnsetPrefix() or
    UnsetSuffix()) stays empty.
    """
    if self._prefix:
      self._prefix = prefix
    if self._suffix:
      self._suffix = suffix

  def _ClearContent(self):
    """Clears the content. This will only be called after it's been written."""
    self._content = io.StringIO()

  def ConsumeContent(self):
    """Returns the current content buffer and starts a fresh, empty one.

    Returns:
      The io.StringIO that held the content (not its string value).
    """
    content = self._content
    self._ClearContent()
    return content

  def Append(self, text):
    """Appends text.

    Args:
      text: The string to append, it will be escaped.
    """
    assert isinstance(text, str)
    self._content.write(self.EscapeText(text))

  def EscapeText(self, text):
    """Escapes any reserved characters when Append() is called with text.

    By default this defers to the parent fragment; without a parent the text
    is returned unchanged.

    Args:
      text: The string to escape.

    Returns:
      The escaped string.
    """
    if self._parent:
      return self._parent.EscapeText(text)
    return text

  def StripLine(self, text):
    """Does any needed stripping of whitespace.

    Some blocks (code for example) will want to preserve whitespace, while
    others will want to coalesce it together. By default this defers to the
    parent fragment; without a parent the text is returned unchanged.

    Args:
      text: The string to strip

    Returns:
      The stripped string.
    """
    if self._parent:
      return self._parent.StripLine(text)
    return text

  def WrapLine(self, line, indent):
    """Wraps the line to fit into the column limit, if necessary.

    Most blocks (code for example) will want to preserve whitespace and won't
    break their output, so the default returns the line unchanged.

    Args:
      line: The string to wrap.
      indent: Indent string.

    Returns:
      The wrapped string.
    """
    del indent
    return line

  def NeedsToMergeWith(self, text):
    """Returns whether this fragment must be merged with fragment `text`.

    The default never merges; formatting fragments override this.
    """
    del text
    return False
297
298
class HTML(Fragment):
  """Fragment holding raw HTML: appended text is stored without escaping."""

  def __init__(self, prefix=None, suffix=None):
    # HTML fragments never carry an indent of their own.
    super().__init__(None, prefix, suffix)

  def EscapeText(self, text):
    """Returns `text` untouched instead of deferring to the parent."""
    return text
307
308
class Href(Fragment):
  """Fragment rendered as an HTML <a href=...> element.

  Meant for anchors inside table cells: there the Markdown-style Link()
  fragment would not be processed properly, while an HTML anchor keeps the
  formatting intact.
  """

  def __init__(self, href):
    opening_tag = '<a href="{}">'.format(href)
    super().__init__(None, opening_tag, '</a>')
318
319
class Text(Fragment):
  """Plain string fragment; indent, prefix and suffix are all optional."""

  def __init__(self, indent=None, prefix=None, suffix=None):
    super().__init__(indent=indent, prefix=prefix, suffix=suffix)
325
326
class IgnoreBlock(Fragment):
  """Fragment used as a marker for content that must be omitted."""

  def __init__(self):
    super().__init__(indent=None, prefix=None, suffix=None)
332
333
class TextBlock(Text):
  """Text fragment that escapes reserved characters and coalesces spaces."""

  def EscapeText(self, text):
    """Entity-escapes '<' and '>', then backslash-escapes FLAGS.escape_chars."""
    html_safe = _EscapeContentForHtml(text).getvalue()
    return _EscapeText(html_safe, FLAGS.escape_chars)

  def StripLine(self, text):
    """Flattens `text` to one line with single spaces and no outer whitespace.

    Newlines and Unicode non-breaking spaces (U+00A0) become plain spaces;
    replacing the latter is safer than deleting them because runs of spaces
    are collapsed afterwards anyway.
    """
    flattened = text.translate({ord('\n'): ' ', 0xA0: ' '})
    return re.sub(r' +', ' ', flattened.strip())
349
350
class Div(TextBlock):
  """Placeholder fragment used by the two-column layout conversion.

  Attributes are documented inline: `cls` is the value passed to the
  constructor.
  """

  def __init__(self, cls):
    super().__init__()
    # Stored as given; the base constructor does not touch this attribute.
    self.cls = cls
357
358
class Table(TextBlock):
  """Placeholder marking that rendering is inside a data table.

  Data tables are distinguished from tables that only serve layout
  purposes, which should not be exported.
  """

  # Class-level default; no instance value is assigned in this class.
  cls = None
366
367
class TD(Text):
  """Table cell fragment, rendered as an HTML <td>...</td> element."""

  def __init__(self):
    super().__init__('', '<td>', '</td>')
371
372
class Content(TextBlock):
  """Placeholder marking that the main content is being processed."""

  # Class-level default; no instance value is assigned in this class.
  cls = None
376
377
class WrappedTextBlock(TextBlock):
  """TextBlock whose output lines are wrapped to fit the column limit."""

  def WrapLine(self, line, indent):
    """Wraps `line`, indenting continuation lines by the width of `indent`."""
    indent_columns = len(indent)
    return _WrapLine(line, indent_columns)
383
384
class BlockquoteBlock(WrappedTextBlock):
  """Wrapped block whose continuation lines are prefixed with the indent.

  The default indent is '> '. Paragraphs inside a blockquote must be emitted
  by the generator as BlockquoteBlocks with no indent: such a block forwards
  WrapLine() to its parent, so the call propagates up to the outermost
  BlockquoteBlock, which wraps the text and prefixes the lines.
  """

  def __init__(self, indent='> '):
    super().__init__(indent=indent, prefix=None, suffix=None)

  def WrapLine(self, line, indent):
    """Wraps `line`, joining the wrapped pieces with the `indent` string."""
    has_own_indent = bool(self._indent)
    if self._parent and not has_own_indent:
      # Indent-less inner block: let an enclosing block do the wrapping.
      return self._parent.WrapLine(line, indent)
    pieces = _WrapLine(line, len(indent)).splitlines(True)
    # _WrapLine() pads continuation lines with spaces; drop that padding and
    # put the indent string in its place.
    return indent.join(piece.lstrip() for piece in pieces)
403
404
class CodeBlock(Text):
  """Common base of the code block fragment implementations."""

  def EscapeText(self, text):
    """Returns `text` unchanged: code is stored without Markdown escaping."""
    return text

  def StripLine(self, text):
    """Removes every newline; line breaks in Sites code always use <br/>."""
    return ''.join(text.split('\n'))

  def ChangeToHtml(self):
    """Swaps non-empty content for a copy with '<' and '>' entity-escaped."""
    current = self._content.getvalue()
    if not current:
      return
    self._content = _EscapeContentForHtml(current)
419
420
class IndentedCodeBlock(CodeBlock):
  """An IndentedCodeBlock indents by four spaces."""

  # The default must be four spaces, as the class contract states: that is
  # the indentation Markdown requires to treat a line as an indented code
  # block. A single-space default would not produce a code block.
  def __init__(self, indent='    '):
    super().__init__(indent, None, None)
426
427
class FencedCodeBlock(CodeBlock):
  """Code block fenced with triple backticks (```).

  For the rendering to be correct nothing may be written until the end of
  the source code block has been reached: the whole code block of the
  source HTML has to be rendered in one single write pass.
  """

  def __init__(self, indent=None,
               prefix='```none' + ENCODED_NEWLINE,
               suffix=ENCODED_NEWLINE + '```'):
    super().__init__(indent=indent, prefix=prefix, suffix=suffix)

  def WriteIndent(self, output):
    """Switches the block to HTML if needed, then writes the indent.

    When HTML code blocks are allowed and any descendant is a FormattedText,
    every FormattedText and CodeBlock descendant is changed to HTML and the
    fence is replaced by <pre><code> ... </code></pre>.
    """
    if FLAGS.allow_html_code_blocks:
      formatted = [node for node in self._AllChildren()
                   if isinstance(node, FormattedText)]
      for node in formatted:
        node.ChangeToHtml()
      if formatted:
        for node in self._AllChildren():
          if isinstance(node, CodeBlock):
            node.ChangeToHtml()
        self._UpdatePrefixAndSuffix(
            '<pre><code>', ENCODED_NEWLINE + '</code></pre>')
    super().WriteIndent(output)

  def StripLine(self, text):
    """Restores encoded newlines and drops the empty lines that result."""
    stripped = super().StripLine(text)
    kept = (row for row in _RestoreEncodedNewlines(stripped).splitlines()
            if row)
    return '\n'.join(kept)

  def WrapLine(self, line, indent):
    """Puts `indent` in front of every line after the first; no re-wrapping."""
    return indent.join(line.splitlines(True))
466
467
class FencedCodeBlockLine(Text):
  """One line of code inside a FencedCodeBlock.

  By default the line is wrapped in encoded newlines, which StripLine()
  turns back into real ones after the parent has done its own stripping.
  """

  def __init__(self, indent=None,
               prefix=ENCODED_NEWLINE, suffix=ENCODED_NEWLINE):
    super().__init__(indent=indent, prefix=prefix, suffix=suffix)

  def StripLine(self, text):
    """Strips via the parent chain, then restores encoded newlines."""
    return _RestoreEncodedNewlines(super().StripLine(text))
478
479
class UnderlinedHeader(TextBlock):
  """Section header rendered as a title line with an underline below it."""

  def __init__(self, char):
    """Args: char: the character the underline is made of."""
    super().__init__()
    self._char = char

  def _WriteContent(self, output):
    """Writes the title followed by an underline of matching length.

    Nothing at all is written when the stripped title is empty.
    """
    title = self._content.getvalue()
    width = len(self.StripLine(title))
    if width <= 0:
      return
    # A real '\n' would be stripped later on, so an encoded newline is
    # written instead; StripLine() restores it once stripping is done.
    underline = ENCODED_NEWLINE + self._char * width
    self._Write(output, None, title, underline)

  def StripLine(self, text):
    """Strips like a TextBlock, then restores encoded newlines."""
    return _RestoreEncodedNewlines(super().StripLine(text))
500
501
class FormattedText(Text):
  """Text wrapped in Markdown formatting."""

  def __init__(self, fmt):
    """Args: fmt: the marker written both before and after the content."""
    super().__init__(None, fmt, fmt)

  def _Pad(self, bigger, smaller):
    """Returns as many spaces as `bigger` is longer than `smaller`."""
    return ' ' * (len(bigger) - len(smaller))

  def _WriteContent(self, output):
    """Writes the content with whitespace moved outside of the markers."""
    prefix = self._prefix
    content = self._content.getvalue()
    suffix = self._suffix
    if prefix:
      # If there are whitespaces immediately after the prefix,
      # they must be pushed out before the prefix.
      lstripped = content.lstrip()
      if len(content) > len(lstripped):
        prefix = self._Pad(content, lstripped) + prefix
        content = lstripped
    if suffix:
      # If there are whitespaces immediately before the suffix,
      # they must be pushed out after the suffix.
      rstripped = content.rstrip()
      if len(content) > len(rstripped):
        suffix = suffix + self._Pad(content, rstripped)
        content = rstripped
    self._Write(output, prefix, content, suffix)

  def ChangeToHtml(self):
    """Entity-escapes '<' and '>' in the buffered content.

    The escaped buffer is stored back on the fragment, matching
    CodeBlock.ChangeToHtml(); assigning it to a local variable only would
    leave the content unescaped in the HTML output. Subclasses additionally
    switch their markers to HTML tags.
    """
    content = self._content.getvalue()
    if content:
      self._content = _EscapeContentForHtml(content)
535
536
class BoldFormattedText(FormattedText):
  """Bold text, using FLAGS.bold_format as its marker."""

  def __init__(self):
    super().__init__(fmt=FLAGS.bold_format)

  def NeedsToMergeWith(self, text):
    """Returns True when `text` is a bold fragment as well."""
    return isinstance(text, BoldFormattedText)

  def ChangeToHtml(self):
    """Runs the base conversion, then switches the markers to <b>...</b>."""
    super().ChangeToHtml()
    opening, closing = '<b>', '</b>'
    self._UpdatePrefixAndSuffix(opening, closing)
549
550
class ItalicFormattedText(FormattedText):
  """Italic text, using FLAGS.italic_format as its marker."""

  def __init__(self):
    super().__init__(fmt=FLAGS.italic_format)

  def NeedsToMergeWith(self, text):
    """Returns True when `text` is an italic fragment as well."""
    return isinstance(text, ItalicFormattedText)

  def ChangeToHtml(self):
    """Runs the base conversion, then switches the markers to <i>...</i>."""
    super().ChangeToHtml()
    opening, closing = '<i>', '</i>'
    self._UpdatePrefixAndSuffix(opening, closing)
563
564
class StrikeThroughFormattedText(FormattedText):
  """Struck-through text, using FLAGS.strike_format as its marker."""

  def __init__(self):
    super().__init__(fmt=FLAGS.strike_format)

  def NeedsToMergeWith(self, text):
    """Returns True when `text` is a strike-through fragment as well."""
    return isinstance(text, StrikeThroughFormattedText)

  def ChangeToHtml(self):
    """Runs the base conversion, then switches the markers to <s>...</s>."""
    super().ChangeToHtml()
    opening, closing = '<s>', '</s>'
    self._UpdatePrefixAndSuffix(opening, closing)
577
578
class HighlightFormattedText(FormattedText):
  """Highlighted text, using FLAGS.highlight_format as its marker."""

  def __init__(self):
    super().__init__(fmt=FLAGS.highlight_format)

  def NeedsToMergeWith(self, text):
    """Returns True when `text` is a highlight fragment as well."""
    return isinstance(text, HighlightFormattedText)

  def ChangeToHtml(self):
    """Runs the base conversion, then switches the markers to <u>...</u>."""
    super().ChangeToHtml()
    opening, closing = '<u>', '</u>'
    self._UpdatePrefixAndSuffix(opening, closing)
591
592
class ListItem(Text):
  """A single list item, introduced once by its bullet or number."""

  def __init__(self, bullet):
    """Args: bullet: the padded bullet/number string for this item."""
    super().__init__()
    self._bullet = bullet

  def WriteIndent(self, output):
    """Writes the pending bullet (if any), then the regular indent."""
    bullet = self._bullet
    if bullet:
      # TODO(dpranke): The original code relied on strings and bytes being
      # interchangeable in Python2, so it could seek backwards from the
      # current location with a relative offset:
      #
      #   output.seek(-len(self._bullet), os.SEEK_CUR)
      #
      # Apparently that can't be done in Python3. As a stopgap one '\b'
      # backspace is emitted per bullet character, and the client is expected
      # to do a global search and replace that removes the backspaces along
      # with the spaces they stand for.
      #
      # This is awkward and should be reworked so that it isn't needed.
      output.write('\b' * len(bullet) + bullet)
    super().WriteIndent(output)

  def _ClearContent(self):
    """Clears the content and forgets the bullet so it is written only once."""
    self._bullet = None
    super()._ClearContent()

  def WrapLine(self, line, indent):
    """Wraps `line`, indenting continuation lines by the width of `indent`."""
    return _WrapLine(line, len(indent))
624
625
class Link(Text):
  """Markdown link, optionally rendered as an image or as a short link."""

  def __init__(self, href):
    """Args: href: the link target."""
    super().__init__()
    self._href = href
    # Written before '[' and after ')' respectively; set by MakeAnImage().
    self._url_opener_prefix = ''
    self._url_opener_suffix = ''

  def MakeAnImage(self, width, height):
    """Turns the link into an image link.

    Args:
      width: Image width; only used when `height` is given too.
      height: Image height; only used when `width` is given too.
    """
    self._url_opener_prefix = '!'
    if width and height:
      self._url_opener_suffix = (
          '{{width="{}" height="{}"}}'.format(width, height))

  def _IsShortLink(self, text):
    """Returns whether `text` is just the short form of the link target.

    That is the case when the target matches FLAGS.shortlinks_regex and its
    host plus path equal `text`.

    Args:
      text: The link text.

    Returns:
      True or False (always a bool).
    """
    if not FLAGS.shortlinks_regex:
      return False
    if not re.match(FLAGS.shortlinks_regex, self._href):
      return False
    parsed_href = urllib.parse.urlsplit(self._href)
    return parsed_href.netloc + parsed_href.path == text

  def _WriteLink(self, output, text):
    """Writes either the bare short link or a full '[text](href)' link."""
    is_plain_link = not (self._url_opener_prefix or self._url_opener_suffix)
    # Images are never shortened, so the short-link check is skipped for them.
    if is_plain_link and self._IsShortLink(text):
      self._Write(output, None, text, None)
    else:
      self._Write(output,
                  self._url_opener_prefix + '[',
                  text,
                  '](' + self._href + ')' + self._url_opener_suffix)

  def _WriteContent(self, output):
    """Writes the link; nothing is written when the link text is empty."""
    text = self._content.getvalue()
    if not text:
      return
    if text.startswith(('http://', 'https://')):
      # The text is a URL itself: write it as an autolink in angle brackets.
      self._Write(output, '<', text, '>')
    else:
      self._WriteLink(output, text)
667
668
class Image(Text):
  """Image, rendered as an HTML <img> tag."""

  def __init__(self, src, alt, width, height):
    """Stores the image attributes; a missing alt text becomes 'image'."""
    super().__init__()
    self._src = src
    self._alt = alt if alt else 'image'
    self._width = width
    self._height = height

  def _WriteContent(self, output):
    """Writes the <img> tag, with height/width only when they are set."""
    pieces = ['<img alt="{}" src="{}"'.format(self._alt, self._src)]
    if self._height:
      pieces.append(' height={}'.format(self._height))
    if self._width:
      pieces.append(' width={}'.format(self._width))
    pieces.append('>')
    self._Write(output, '', ''.join(pieces), '')
687
688
class Code(Text):
  """Inline code, delimited by backticks."""

  def __init__(self):
    super().__init__(indent=None, prefix='`', suffix='`')

  def EscapeText(self, text):
    """Returns `text` unchanged: inline code is not Markdown-escaped."""
    return text

  def _WriteContent(self, output):
    """Writes the code, widening the delimiters if it contains a backtick."""
    opener, closer = self._prefix, self._suffix
    body = self._content.getvalue()
    if '`' in body:
      # Inline code containing a backtick has to be delimited by double
      # backticks.
      opener = closer = '``'
      # A body that starts or ends with a backtick would then produce triple
      # backticks, i.e. a code fence, so pad it with a space on that side.
      if body.startswith('`'):
        body = ' ' + body
      if body.endswith('`'):
        body = body + ' '
    self._Write(output, opener, body, closer)

  def NeedsToMergeWith(self, text):
    """Returns True when `text` is an inline code fragment as well."""
    return isinstance(text, Code)
717
718
class EmbeddedContent(Text):
  """Embedded content (Docs, Drawings, Presentations, etc.) as an iframe."""

  def __init__(self, href, width, height):
    super().__init__()
    self._href = href
    self._width = width
    self._height = height

  def _WriteContent(self, output):
    """Writes an <iframe> element pointing at the embedded content.

    An 'http' source URL is upgraded to 'https'. The size attributes are
    written only when both width and height are set.
    """
    target = urllib.parse.urlsplit(self._href)
    if target.scheme == 'http':
      target = target._replace(scheme='https')
    if self._width and self._height:
      size = ' width="{}" height="{}"'.format(self._width, self._height)
    else:
      size = ''
    # Note: 'allow="fullscreen"' is requested for all content for simplicity.
    # g3doc server has dedicated logic to deal with these requests.
    element = '<iframe src="{}"{} allow="fullscreen" />'.format(
        urllib.parse.urlunsplit(target), size)
    self._Write(output, None, element, None)
741
742
class ListInfo:
  """Bookkeeping record for one list that is currently open."""

  def __init__(self, tag):
    # Number of items seen so far in the list.
    self.item_count = 0
    # The tag that was used to start the list.
    self.tag = tag
748
749
class FragmentTree:
  """Class for managing a tree of fragments.

  There is a "scope" formed by nested fragments, e.g.
  italic fragment inside bold fragment inside paragraph.
  The scope is stored in the stack. For convenience,
  the stack always has at least one element (the topmost fragment).

  Fragments popped out from the scope may be re-added
  back into the tree as children of the last fragment.
  This allows "chaining" of structured content for future
  processing. For example, if there were several bold
  fragments inside a paragraph interleaved with fragments
  of regular text, all these fragments will end up as
  children of the paragraph fragment.
  """

  def __init__(self, top_fragment):
    """Args: top_fragment: the fragment that always stays on the stack."""
    self._stack = [top_fragment]

  def ActiveFragmentScopeDepth(self):
    """Returns the number of stacked fragments, the topmost one excluded."""
    return len(self._stack) - 1

  def StartFragment(self, fragment):
    """Parents `fragment` to the current top, pushes it and returns it."""
    fragment.SetParent(self._stack[-1])
    self._stack.append(fragment)
    return fragment

  def EndFragment(self):
    """Pops and returns the fragment on top of the stack."""
    return self._stack.pop()

  def AppendFragment(self, fragment):
    """Adds `fragment` as a child of the current top and returns it."""
    return self._stack[-1].AddChild(fragment)

  def _ApplyRecursivelyToNode(self, node, scope_operation, operation,  # pylint: disable=missing-docstring
                              debug_indent):
    # Post-order: all children are processed before `node` itself.
    if not debug_indent:
      for child in node.GetChildren():
        self._ApplyRecursivelyToNode(child, scope_operation, operation, None)
    else:
      debug_indent += ' c '
      for child in node.GetChildren():
        print(debug_indent + repr(child))
        self._ApplyRecursivelyToNode(child, scope_operation, operation,
                                     debug_indent)
    operation(node)

  def _ApplyRecursivelyToScope(self, nodes, scope_operation, operation,  # pylint: disable=missing-docstring
                               debug_indent):
    # `nodes` holds the scope with the outermost fragment last. With only the
    # topmost fragment on the stack the scope is empty and there is nothing
    # to apply; popping from it would raise IndexError.
    if not nodes:
      return
    node = nodes.pop()
    scope_operation(node)
    if debug_indent:
      print(debug_indent + repr(node))
    if nodes:
      self._ApplyRecursivelyToScope(nodes, scope_operation, operation,
                                    (debug_indent + ' s ' if debug_indent
                                     else None))
    self._ApplyRecursivelyToNode(node, scope_operation, operation,
                                 debug_indent)

  def ApplyToAllFragments(self, scope_operation, operation):
    """Recursively applies operations to all fragments in the tree.

    The omnipresent topmost fragment is excluded. The 'scope_operation'
    is applied to every element in the fragment stack in pre-order.
    The 'operation' is applied to all fragments in the tree in post-order.

    Args:
      scope_operation: The operation to apply to fragments in the scope stack.
      operation: The operation to apply to all fragments in the tree.
    """
    self._ApplyRecursivelyToScope(list(reversed(self._stack[1:])),
                                  scope_operation, operation,
                                  ' ' if FLAGS.debug_print_tree else None)

  def FindFirstFragmentFromEnd(self, predicate, steps_from_last=0):
    """Finds the innermost stacked fragment that satisfies `predicate`.

    The search starts at the end of the stack (the most recently started
    fragment) and walks towards the topmost fragment.

    Args:
      predicate: Callable taking a fragment and returning a truth value.
      steps_from_last: Number of fragments at the end of the stack to skip.

    Returns:
      The first matching fragment, or None when there is none.
    """
    sub_stack = self._stack[:-steps_from_last if steps_from_last else None]
    return next((node for node in reversed(sub_stack) if predicate(node)),
                None)

  def PeekFragmentFromStart(self, steps_from_first=0):
    """Returns the fragment `steps_from_first` positions above the bottom."""
    return self._stack[steps_from_first]

  def PeekFragmentFromEnd(self, steps_from_last=0):
    """Returns the fragment `steps_from_last` positions below the top."""
    return self._stack[-(steps_from_last + 1)]

  def PeekLastAppendedFragment(self):
    """Returns the newest child of the top fragment, or None without one."""
    children = self._stack[-1].GetChildren()
    return children[-1] if children else None
839
840
841class MarkdownGenerator:
842 """Generates Markdown based on the series of HTML tags seen.
843
844 Each time an opening HTML tag is seen, the appropriate markdown fragment is
845 created and pushed onto a stack. Any text encountered is appended to the
846 fragment at the top of the stack. When a closing HTML tag is seen, the stack
847 is popped and the fragment removed is appended to the new top of the stack.
848
849 Markdown is buffered in the fragment stack until an entire line has been
850 formed, at which point _WriteFragmentsAsLine() is called to write it out. The
851 content buffered in the stack is cleared, but otherwise the stack remains
852 unmodified.
853 """
854
855 def __init__(self, out, url_translator):
856 self._out = out
857 self._url_translator = url_translator
858 self._fragment_tree = FragmentTree(Text())
859 self._list_info_stack = []
860 self._pending_newlines = 0
861 # Initialize the regexps to match nothing (rather than be None).
862 self._code_class_regex = re.compile(FLAGS.code_class_regex or 'a^')
863 self._toc_class_regex = re.compile(FLAGS.toc_class_regex or 'a^')
864 self._ignore_class_regex = re.compile(FLAGS.ignore_class_regex or 'a^')
865 self._ignore_style_regex = re.compile(FLAGS.ignore_style_regex or 'a^')
866
867 def _Push(self, fragment):
868 """Sets the parent fragment and pushes it onto the fragment stack.
869
870 In the case where there is an IgnoreBlock on the stack, a new IgnoreBlock
871 is pushed instead.
872
873 Args:
874 fragment: The Fragment object to push on the stack.
875 """
876 if isinstance(self._fragment_tree.PeekFragmentFromEnd(), IgnoreBlock):
877 # If the top of the stack is IgnoreBlock, push an IgnoreBlock instead.
878 fragment = IgnoreBlock()
879 else:
880 # Check if we need to merge adjacent formatting, e.g.
881 # instead of **bold****bold** we need to write **boldbold**,
882 # as the former is not correct Markdown syntax.
883 last_appended = self._fragment_tree.PeekLastAppendedFragment()
884 if last_appended and last_appended.NeedsToMergeWith(fragment):
885 last_appended.UnsetSuffix()
886 fragment.UnsetPrefix()
887
888 self._fragment_tree.StartFragment(fragment)
889
890 def _Pop(self):
891 """Pops the fragment stack it to the new top of stack.
892
893 If the fragment stack would be empty after popping, then the fragment is
894 written to the output first.
895 """
896 if self._fragment_tree.ActiveFragmentScopeDepth() > 1:
897 fragment = self._fragment_tree.EndFragment()
898 self._fragment_tree.AppendFragment(fragment)
899 else:
900 self._WriteFragmentsAsLine(newlines=0)
901 self._fragment_tree.EndFragment()
902
903 def _IsWithinFragmentType(self, fragment_type, steps_from_last=0):
904 return self._fragment_tree.FindFirstFragmentFromEnd(
905 lambda fragment: isinstance(fragment, fragment_type),
906 steps_from_last) is not None
907
908 def _LastFragmentIs(self, fragment_type, cls):
909 fragment = self._fragment_tree.PeekFragmentFromEnd()
910 return (isinstance(fragment, fragment_type) and fragment.cls == cls)
911
912 def Break(self):
913 if not self._IsWithinFragmentType(FencedCodeBlock):
914 self._WriteFragmentsAsLine(newlines=1)
915 else:
916 fragment = FencedCodeBlockLine(prefix='', suffix='')
917 self._Push(fragment)
918 fragment.Append(ENCODED_NEWLINE)
919 self._Pop()
920
921 def HorizontalRule(self):
922 # Horizontal rule must be preceded and followed by a blank line
923 self._AddVerticallyPaddedParagraph('---')
924
925 def StartDocument(self):
926 self._Push(WrappedTextBlock())
927
928 def EndDocument(self):
929 self._Pop()
930
931 def StartParagraph(self):
932 self._WriteFragmentsAsLine(newlines=2)
933
934 def EndParagraph(self):
935 self._WriteFragmentsAsLine(newlines=2)
936
937 def StartDiv(self, cls, style, ident):
938 """Process opening of a div element.
939
940 Args:
941 cls: The class attribute of the element.
942 style: The style attribute of the element.
943 ident: The id attribute of the element
944 """
945 if not self._IsWithinFragmentType(FencedCodeBlock):
946 if self._IsWithinFragmentType(CodeBlock):
947 self._WriteFragmentsAsLine(newlines=1)
948 else:
949 self._WriteFragmentsAsLine(newlines=2)
950
951 if ((cls and self._ignore_class_regex.match(cls)) or
952 style and self._ignore_style_regex.match(style)):
953 self._Push(IgnoreBlock())
954 elif self._IsWithinFragmentType(FencedCodeBlock):
955 self._Push(FencedCodeBlockLine())
956 elif self._IsWithinFragmentType(CodeBlock):
957 self._Push(CodeBlock())
958 elif self._IsWithinFragmentType(BlockquoteBlock):
959 self._Push(BlockquoteBlock(indent=None))
960 elif cls and self._toc_class_regex.match(cls):
961 self._AddTableOfContents()
962 self._Push(IgnoreBlock()) # Ignore the items inside the Sites TOC
963 elif cls and self._code_class_regex.match(cls):
964 if FLAGS.indented_code_blocks:
965 self._Push(IndentedCodeBlock())
966 else:
967 self._Push(FencedCodeBlock())
968 else:
969 self._Push(WrappedTextBlock())
970
971 def EndDiv(self):
972 if not self._IsWithinFragmentType(FencedCodeBlock, steps_from_last=1):
973 if self._IsWithinFragmentType(CodeBlock, steps_from_last=1):
974 self._WriteFragmentsAsLine(newlines=1)
975 else:
976 self._WriteFragmentsAsLine(newlines=2)
977 self._Pop()
978
979 def StartHeader(self, level):
980 self._WriteFragmentsAsLine(newlines=2)
981 if level == 1 and FLAGS.underline_headers:
982 self._Push(UnderlinedHeader('='))
983 elif level == 2 and FLAGS.underline_headers:
984 self._Push(UnderlinedHeader('-'))
985 else:
986 self._Push(TextBlock(prefix=('#' * level) + ' '))
987
988 def EndHeader(self):
989 self._WriteFragmentsAsLine(newlines=2)
990 self._Pop()
991
992 def StartList(self, tag):
993 if not self._list_info_stack:
994 self._WriteFragmentsAsLine(newlines=2)
995 else:
996 self._WriteFragmentsAsLine(newlines=1)
997 self._list_info_stack.append(ListInfo(tag))
998 if tag == 'ol':
999 self._Push(Text(' ' * FLAGS.ordered_list_indent))
1000 else:
1001 self._Push(Text(' ' * FLAGS.unordered_list_indent))
1002
1003 def EndList(self):
1004 self._list_info_stack.pop()
1005 if not self._list_info_stack:
1006 self._WriteFragmentsAsLine(newlines=2)
1007 else:
1008 self._WriteFragmentsAsLine(newlines=1)
1009 self._Pop()
1010
1011 def StartListItem(self):
1012 self._WriteFragmentsAsLine(newlines=1)
1013 # Google Sites sometimes spits out pages with <li> tags not enclosed within
1014 # an <ol> or <ul> tag.
1015 tag = ''
1016 if self._list_info_stack:
1017 self._list_info_stack[-1].item_count += 1
1018 tag = self._list_info_stack[-1].tag
1019 if tag == 'ol':
1020 item_count = self._list_info_stack[-1].item_count
1021 # string.ljust makes room for as many digits as you need.
1022 prefix = ('%d.' % item_count).ljust(FLAGS.ordered_list_indent)
1023 self._Push(ListItem(prefix))
1024 else:
1025 prefix = '*'.ljust(FLAGS.unordered_list_indent)
1026 self._Push(ListItem(prefix))
1027
1028 def EndListItem(self):
1029 self._WriteFragmentsAsLine(newlines=1)
1030 self._Pop()
1031
1032 def StartFormat(self, tag):
1033 # Allowed formatting depends on the surrounding fragment type.
1034 if self._IsWithinFragmentType(TD) and tag == 'b':
1035 # TODO(dpranke): This is a hack because I don't yet really understand
1036 # how the ChangeToHtml() logic works in CodeBlocks, but it seems like
1037 # we should be able to do something similar to what they do.
1038 # Also, this should really be rewriting these to <th>s instead.
1039 self._Push(HTML('<b>', '</b>'))
1040 return
1041
1042 if not self._IsWithinFragmentType(IndentedCodeBlock):
1043 formats_map = {
1044 'i': ItalicFormattedText,
1045 'em': ItalicFormattedText,
1046 'b': BoldFormattedText,
1047 'strong': BoldFormattedText,
1048 'strike': StrikeThroughFormattedText,
1049 's': StrikeThroughFormattedText,
1050 'del': StrikeThroughFormattedText,
1051 'u': HighlightFormattedText,
1052 'code': Code,
1053 None: Text,
1054 }
1055 if self._IsWithinFragmentType(FencedCodeBlock):
1056 if FLAGS.allow_html_code_blocks:
1057 # HTML code block can render formats but must not use Code fragments.
1058 formats_map['code'] = formats_map[None] = CodeBlock
1059 else:
1060 formats_map = {None: CodeBlock}
1061 else:
1062 # Inside an indented code block no formatting is allowed.
1063 formats_map = {None: CodeBlock}
1064 self._Push(formats_map[tag]() if tag in formats_map
1065 else formats_map[None]())
1066
1067 def EndFormat(self):
1068 self._Pop()
1069
1070 def StartAnchor(self, href):
1071 if href is not None:
1072 href = self._url_translator.Translate(href)
1073 if self._IsWithinFragmentType(TD):
1074 self._Push(Href(href))
1075 else:
1076 self._Push(Link(href))
1077 else:
1078 self._Push(Text())
1079
1080 def EndAnchor(self):
1081 self._Pop()
1082
1083 def StartBlockquote(self):
1084 if not self._IsWithinFragmentType(CodeBlock):
1085 self._WriteFragmentsAsLine(newlines=1)
1086 self._Push(BlockquoteBlock())
1087 else:
1088 self._Push(Text())
1089
1090 def EndBlockquote(self):
1091 if not self._IsWithinFragmentType(CodeBlock):
1092 self._WriteFragmentsAsLine(newlines=2)
1093 self._Pop()
1094
1095 def Image(self, src, alt, width, height):
1096 src = self._url_translator.Translate(src)
1097 self._fragment_tree.AppendFragment(Image(src, alt, width, height))
1098
1099 def Iframe(self, src, width, height):
1100 """Process an <iframe> element.
1101
1102 Sites use <iframe> for embedded content: Docs, Drawings, etc.
1103 g3doc implements this by supporting <iframe> HTML tag directly.
1104
1105 Args:
1106 src: Source URL.
1107 width: Element width.
1108 height: Element height.
1109 """
1110 if False:
1111 # TODO(dpranke): Figure out if we should support embedded IFRAME tags.
1112 # For now, we skip over them.
1113 self._WriteFragmentsAsLine(newlines=2)
1114 self._Push(EmbeddedContent(src, width, height))
1115 self._Pop()
1116
1117 def StartTable(self, cls):
1118 if (cls and 'sites-layout-hbox' in cls and
1119 'sites-layout-name-one-column' not in cls):
1120 self._AddHTMLBlock('<div class="two-column-container">')
1121 self._Push(Div(cls='two-column-container'))
1122 elif (cls and 'sites-layout-name-one-column' in cls):
1123 pass
1124 else:
1125 self._AddHTMLBlock('<table>')
1126 self._Push(Table())
1127
1128 def EndTable(self):
1129 if self._LastFragmentIs(Div, cls='two-column-container'):
1130 self._AddHTMLBlock('</div>')
1131 self._Pop()
1132 elif self._IsWithinFragmentType(Table):
1133 self._AddHTMLBlock('</table>')
1134 self._Pop()
1135
1136 def StartTR(self):
1137 if self._IsWithinFragmentType(Table):
1138 self._AddHTMLBlock('<tr>')
1139
1140 def EndTR(self):
1141 if self._IsWithinFragmentType(Table):
1142 self._AddHTMLBlock('</tr>')
1143
1144 def StartTD(self, cls):
1145 if self._LastFragmentIs(Div, cls='two-column-container'):
1146 if cls and ('sites-tile-name-content-1' in cls or
1147 'sites-tile-name-content-2' in cls):
1148 self._AddHTMLBlock('<div class="column">')
1149 self._Push(Div(cls='column'))
1150 else:
1151 self._Push(Text())
1152 elif self._IsWithinFragmentType(Table):
1153 self._Push(TD())
1154
  def EndTD(self):
    """Closes a <td> element.

    A cell that was rendered as a 'column' div is closed with a literal
    '</div>' and popped. Otherwise, when inside a Table, the cell fragment
    is popped and the buffered content is flushed as a line with one
    newline requested after it. Nothing happens when neither applies.
    """
    if self._LastFragmentIs(Div, cls='column'):
      self._AddHTMLBlock('</div>')
      self._Pop()
    elif self._IsWithinFragmentType(Table):
      # The cell is popped before flushing, the reverse of the
      # flush-then-pop order used by EndHeader() and EndListItem().
      # NOTE(review): presumably so that the cell's closing markup is part
      # of the flushed line -- confirm against the TD fragment.
      self._Pop()
      self._WriteFragmentsAsLine(newlines=1)
1162
1163 def Text(self, text):
1164 if not isinstance(self._fragment_tree.PeekFragmentFromEnd(), IgnoreBlock):
1165 fragment = (CodeBlock() if self._IsWithinFragmentType(CodeBlock)
1166 else Text())
1167 self._fragment_tree.AppendFragment(fragment)
1168 fragment.Append(text)
1169
1170 def _AddTableOfContents(self):
1171 # TOC must be preceded and followed by a blank line
1172 self._AddVerticallyPaddedParagraph('[TOC]')
1173
1174 def _AddVerticallyPaddedParagraph(self, text):
1175 self._WriteFragmentsAsLine(newlines=2)
1176 fragment = CodeBlock() # Use CodeBlock to prevent escaping
1177 self._fragment_tree.AppendFragment(fragment)
1178 fragment.Append(text)
1179 self._WriteFragmentsAsLine(newlines=2)
1180
1181 def _AddHTMLBlock(self, html):
1182 """Writes out a block-level string of html."""
1183 fragment = HTML()
1184 fragment.Append(html)
1185 self._fragment_tree.AppendFragment(fragment)
1186 self._WriteFragmentsAsLine(newlines=1)
1187
1188 def _WriteFragmentsAsLine(self, newlines):
1189 """Writes out any content currently buffered in the fragment stack.
1190
1191 Args:
1192 newlines: The minimum number of newlines required in the output after this
1193 line. These newlines won't be written out until the next line with
1194 content is encountered.
1195 """
1196
1197 # Generate indent and the content, then clear content in fragments.
1198 indent = io.StringIO()
1199 self._fragment_tree.ApplyToAllFragments(
1200 lambda fragment: fragment.WriteIndent(indent),
1201 lambda fragment: fragment.WriteContentIntoParentAndClear())
1202 last_fragment = self._fragment_tree.PeekFragmentFromEnd()
1203 content = self._fragment_tree.PeekFragmentFromStart().ConsumeContent()
1204 content = last_fragment.StripLine(content.getvalue())
1205 indent = indent.getvalue()
1206 content = last_fragment.WrapLine(content, indent)
1207
1208 # Write the content, if any.
1209 if content:
1210 self._out.write('\n' * self._pending_newlines)
1211 self._out.write(indent)
1212 self._out.write(content)
1213 self._pending_newlines = newlines
1214 elif self._pending_newlines > 0 and self._pending_newlines < newlines:
1215 self._pending_newlines = newlines
1216
1217 if FLAGS.debug_print_tree:
1218 # Separate trees printed during each writing session
1219 print('-' * 20)
1220
1221
class XhtmlHandler(xml.sax.ContentHandler):
  """SAX content handler that drives a MarkdownGenerator.

  Every namespace-aware SAX event is mapped onto the matching generator
  call, keyed on the local part of the element name. Elements without a
  mapping are ignored; character data is always forwarded.
  """

  # Matches 'h1' through 'h6'; group 1 captures the header level.
  _HEADER_TAG_RE = re.compile(r'h([1-6])$')

  # Inline formatting tags, all handled by StartFormat()/EndFormat().
  _FORMAT_TAGS = frozenset(
      ('b', 'code', 'em', 'i', 'strong', 's', 'strike', 'del', 'u'))

  # List container tags, handled by StartList()/EndList().
  _LIST_TAGS = frozenset(('ul', 'ol'))

  # Opening tags whose generator method takes no arguments.
  _PLAIN_START_METHODS = {
      'br': 'Break',
      'hr': 'HorizontalRule',
      'li': 'StartListItem',
      'p': 'StartParagraph',
      'blockquote': 'StartBlockquote',
      'tr': 'StartTR',
  }

  # Closing tags that map one-to-one onto a generator method.
  _END_METHODS = {
      'a': 'EndAnchor',
      'li': 'EndListItem',
      'div': 'EndDiv',
      'p': 'EndParagraph',
      'blockquote': 'EndBlockquote',
      'td': 'EndTD',
      'tr': 'EndTR',
      'table': 'EndTable',
  }

  def __init__(self, out, url_translator):
    """Creates the handler and the generator it forwards events to.

    Args:
      out: Passed to MarkdownGenerator as its output.
      url_translator: Passed to MarkdownGenerator as its URL translator.
    """
    super().__init__()
    self._generator = MarkdownGenerator(out, url_translator)

  def startDocument(self):
    """Forwards the start of the document to the generator."""
    self._generator.StartDocument()

  def endDocument(self):
    """Forwards the end of the document to the generator."""
    self._generator.EndDocument()

  def startElementNS(self, name, qname, attrs):
    """Forwards an opening tag to the generator.

    Args:
      name: The element name; index 1 is used as the tag.
      qname: Unused.
      attrs: The element's attributes, keyed by (namespace, name) pairs.
    """
    tag = name[1]

    def attr(attr_name):
      # Every attribute read here is looked up without a namespace.
      return attrs.get((None, attr_name))

    plain_method = self._PLAIN_START_METHODS.get(tag)
    if plain_method is not None:
      getattr(self._generator, plain_method)()
    elif tag in self._FORMAT_TAGS:
      self._generator.StartFormat(tag)
    elif tag in self._LIST_TAGS:
      self._generator.StartList(tag)
    elif tag == 'a':
      self._generator.StartAnchor(attr('href'))
    elif tag == 'div':
      self._generator.StartDiv(attr('class'), attr('style'), attr('id'))
    elif tag == 'img':
      self._generator.Image(
          attr('src'), attr('alt'), attr('width'), attr('height'))
    elif tag == 'iframe':
      self._generator.Iframe(attr('src'), attr('width'), attr('height'))
    elif tag == 'table':
      self._generator.StartTable(attr('class'))
    elif tag == 'td':
      self._generator.StartTD(attr('class'))
    else:
      header = self._HEADER_TAG_RE.match(tag)
      if header:
        self._generator.StartHeader(int(header.group(1)))

  def endElementNS(self, name, qname):
    """Forwards a closing tag to the generator.

    Args:
      name: The element name; index 1 is used as the tag.
      qname: Unused.
    """
    tag = name[1]
    if tag in self._FORMAT_TAGS:
      self._generator.EndFormat()
      return
    if tag in self._LIST_TAGS:
      self._generator.EndList()
      return

    end_method = self._END_METHODS.get(tag)
    if end_method is not None:
      getattr(self._generator, end_method)()
    elif self._HEADER_TAG_RE.match(tag):
      self._generator.EndHeader()

  def characters(self, content):
    """Forwards character data to the generator."""
    self._generator.Text(content)
1315
1316
class DefaultUrlTranslator:
  """URL translator that leaves every URL untouched.

  This is the translator Convert() uses when none is supplied.
  """

  def Translate(self, href):
    """Returns href exactly as it was given.

    Args:
      href: The URL to translate.

    Returns:
      The object passed in as href, unchanged.
    """
    return href
1322
1323
def Convert(input_stream, output_stream, url_translator=DefaultUrlTranslator()):
  """Reads XHTML from one stream and writes it as Markdown to another.

  The input is parsed with a namespace-aware SAX parser whose events are
  handled by an XhtmlHandler.

  Args:
    input_stream: filehandle for the XHTML input.
    output_stream: filehandle for the Markdown output.
    url_translator: Callback object for translating URLs embedded in the
      page. Defaults to a translator that leaves URLs unchanged.
  """
  sax_parser = xml.sax.make_parser()
  handler = XhtmlHandler(output_stream, url_translator)
  sax_parser.setContentHandler(handler)
  # Namespace processing makes the parser deliver the *NS element events
  # that XhtmlHandler implements.
  sax_parser.setFeature(xml.sax.handler.feature_namespaces, True)
  sax_parser.parse(input_stream)