blob: 280223e34b816999294709609d391ce5e5e91bf2 [file] [log] [blame]
Dirk Pranke7bbb5472021-11-02 16:33:21 -07001# Copyright 2021 Google LLC
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# https://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15"""HTML to Markdown renderer."""
16
import io
import os
import re
import textwrap
import urllib
import urllib.parse
import xml.sax
23
24
class _Flags:
    """Renderer settings, kept as plain class attributes.

    The rest of the module reads them through the shared `FLAGS` instance
    created right after this class.
    """

    # ---- Headers -------------------------------------------------------
    # Render h1 and h2 as text underlined with '=' and '-' respectively,
    # instead of using '#' prefixes.
    underline_headers = False

    # ---- Escaping ------------------------------------------------------
    # Characters that get a backslash put in front of them in the Markdown.
    # This is deliberately not every special Markdown character, only the
    # ones most often misread as syntax: blindly escaping all of them
    # results in ugly Markdown.
    escape_chars = r'\`*[]'

    # ---- Inline formatting markers ---------------------------------------
    # Marker placed around italic text.
    italic_format = '*'

    # Marker placed around bold text.
    bold_format = '**'

    # Marker placed around struck-through text.
    strike_format = '~~'

    # Marker placed around highlighted text (used for <u> tags).
    highlight_format = '=='

    # ---- Lists -----------------------------------------------------------
    # Width, in columns, of one unordered-list indent level. The bullet is
    # included in this total, e.g. 4 yields '*' followed by three spaces.
    unordered_list_indent = 4

    # Width, in columns, of one ordered-list indent level. The number is
    # included in this total, e.g. 4 yields '1.' followed by two spaces.
    ordered_list_indent = 4

    # ---- DIV classification ----------------------------------------------
    # Class of DIV blocks that should be formatted as code.
    code_class_regex = r'^sites-codeblock sites-codesnippet-block$'

    # Class of DIV blocks that hold a table of contents.
    toc_class_regex = r'^sites-embed-content sites-embed-type-toc$'

    # Class of DIV blocks that should be ignored (empty: none).
    ignore_class_regex = r''

    # Style of DIV blocks that should be ignored.
    ignore_style_regex = r'^display:none;$'

    # ---- Line wrapping ---------------------------------------------------
    # Column limit for wrapped text blocks; zero disables line wrapping.
    line_width = 80

    # ---- Code blocks -----------------------------------------------------
    # True: indented code blocks. False: fenced code blocks.
    indented_code_blocks = False

    # Emit an HTML code block instead of a fenced one when the source code
    # block contains formatted text.
    allow_html_code_blocks = True

    # ---- Links -----------------------------------------------------------
    # Links that are automatically recognized by the renderer.
    shortlinks_regex = r'^http://(ag|b|cl|g|go|who)/'

    # ---- Debugging -------------------------------------------------------
    # Print the fragment tree while rendering.
    debug_print_tree = False


# Module-wide settings object.
FLAGS = _Flags()
90
91
def _EscapeText(text, reserved_chars):
    """Puts a backslash in front of every reserved character in `text`.

    Args:
      text: The string to escape.
      reserved_chars: A string holding the characters that need escaping.

    Returns:
      A copy of `text` with each reserved character backslash-escaped.
    """
    return ''.join(
        '\\' + char if char in reserved_chars else char for char in text)
108
109
def _EscapeContentForHtml(text):
    """Replaces '<' and '>' in `text` with their HTML entities.

    Only the two angle brackets are rewritten; every other character,
    including '&', is copied through unchanged.

    Args:
      text: The string to escape.

    Returns:
      An io.StringIO (not a str) holding the escaped text, positioned at its
      end so that further writes append to it.
    """
    escaped = io.StringIO()
    escaped.write(text.replace('<', '&lt;').replace('>', '&gt;'))
    return escaped
116
117
# Stand-in for '\n' (its HTML character reference). Used where a real newline
# would be removed by StripLine(); it is turned back into '\n' afterwards.
ENCODED_NEWLINE = '&#{:d};'.format(ord('\n'))


def _RestoreEncodedNewlines(text):
    """Returns `text` with every ENCODED_NEWLINE turned back into '\\n'."""
    return '\n'.join(text.split(ENCODED_NEWLINE))
123
124
def _WrapLine(line, indent):
    """Wraps the line to fit into the column limit.

    Continuation lines are indented by `indent` spaces; the first line is
    returned without indentation. Words are never split, not even on hyphens,
    so a single long word may still exceed the limit.

    Args:
      line: The string to wrap.
      indent: An integer with the number of columns of indentation.

    Returns:
      The wrapped text. The line is returned unchanged when wrapping is
      disabled (FLAGS.line_width <= 0) or when the indentation leaves no
      room inside the column limit.
    """
    if FLAGS.line_width <= 0:
        return line
    available = FLAGS.line_width - indent
    if available <= 0:
        # textwrap.wrap() raises ValueError for a non-positive width, which
        # would abort the whole conversion on deeply indented content.
        return line
    continuation = '\n' + ' ' * indent
    return continuation.join(textwrap.wrap(
        line,
        width=available,
        break_long_words=False,
        break_on_hyphens=False))
142
143
class Fragment:
    """Base class for all output fragments.

    A fragment buffers text together with an optional indent, prefix and
    suffix, and may own child fragments. To generate a line of output, the
    methods will be called in the following order:

      WriteIndent()
      WriteContentIntoParentAndClear()
      ConsumeContent() -- for the topmost fragment only
      StripLine()
      WrapLine()
    """

    def __init__(self, indent, prefix, suffix):
        """Creates an empty fragment without parent or children.

        Args:
          indent: String written by WriteIndent(); None or '' for no indent.
          prefix: String written before non-blank content; None or '' for none.
          suffix: String written after non-blank content; None or '' for none.
        """
        self._content = io.StringIO()
        self._indent = indent
        self._prefix = prefix
        self._suffix = suffix
        self._parent = None
        self._children = []

    def __repr__(self):
        """Returns a one-line debug description of this fragment."""
        # Everything stays `str` here. Encoding the fields to bytes (a
        # Python 2 leftover) makes the concatenation raise TypeError on
        # Python 3 as soon as any field is non-empty.
        return '{{{}: indent={}; prefix={}; content={}; suffix={}}}'.format(
            self.__class__.__name__,
            self._indent or '',
            self._prefix or '',
            self._content.getvalue() or '',
            self._suffix or '')

    def SetParent(self, parent):
        """Sets the fragment this one writes its content into."""
        self._parent = parent

    def AddChild(self, node):
        """Appends `node` as a child, re-parents it to self and returns it."""
        self._children.append(node)
        node.SetParent(self)
        return node

    def GetChildren(self):
        """Returns the list of direct children."""
        return self._children

    def _AllChildren(self):
        """Returns all descendants of this fragment in pre-order."""
        all_children = []

        def Traverse(fragment):
            for c in fragment.GetChildren():
                all_children.append(c)
                Traverse(c)

        Traverse(self)
        return all_children

    def WriteIndent(self, output):
        """Writes the indent, if there is one, to `output`."""
        if self._indent:
            output.write(self._indent)

    def WriteContentIntoParentAndClear(self):
        """Renders this fragment into its parent, then empties it.

        Both the buffered content and the list of children are dropped.
        """
        self._WriteContent(self._parent._content)  # pylint: disable=protected-access
        self._ClearContent()
        self._children = []

    def _WriteContent(self, output):
        """Implementation of content rendering. Can be overridden in subclasses."""
        self._Write(output, self._prefix, self._content.getvalue(), self._suffix)

    def _Write(self, output, prefix, content, suffix):
        """Default implementation of content rendering for reuse by subclasses.

        The content is always written; prefix and suffix are written only when
        the content is not blank.
        """
        has_content = bool(content.strip())
        if prefix and has_content:
            output.write(prefix)
        output.write(content)
        if suffix and has_content:
            output.write(suffix)

    def UnsetSuffix(self):
        """Removes the suffix."""
        self._suffix = ''

    def UnsetPrefix(self):
        """Removes the prefix."""
        self._prefix = ''

    def _UpdatePrefixAndSuffix(self, prefix, suffix):
        """Replaces prefix and suffix, each only if it is currently set.

        A prefix or suffix that was unset stays unset.
        """
        if self._prefix:
            self._prefix = prefix
        if self._suffix:
            self._suffix = suffix

    def _ClearContent(self):
        """Clears the content. This will only be called after it's been written."""
        self._content = io.StringIO()

    def ConsumeContent(self):
        """Returns the content buffer (an io.StringIO) and starts a fresh one."""
        content = self._content
        self._ClearContent()
        return content

    def Append(self, text):
        """Appends text.

        Args:
          text: The string to append, it will be escaped.
        """
        assert isinstance(text, str)
        self._content.write(self.EscapeText(text))

    def EscapeText(self, text):
        """Escapes any reserved characters when Append() is called with text.

        By default this defers to the parent fragment.

        Args:
          text: The string to escape.

        Returns:
          The escaped string.
        """
        if self._parent:
            return self._parent.EscapeText(text)
        return text

    def StripLine(self, text):
        """Does any needed stripping of whitespace.

        Some blocks (code for example) will want to preserve whitespace, while
        others will want to coalesce it together. By default this defers to the
        parent fragment.

        Args:
          text: The string to strip.

        Returns:
          The stripped string.
        """
        if self._parent:
            return self._parent.StripLine(text)
        return text

    def WrapLine(self, line, indent):
        """Wraps the line to fit into the column limit, if necessary.

        Most blocks (code for example) will want to preserve whitespace and
        won't break their output, so the default returns the line unchanged.

        Args:
          line: The string to wrap.
          indent: Indent string.

        Returns:
          The wrapped string.
        """
        del indent
        return line

    def NeedsToMergeWith(self, text):
        """Tells whether this fragment must be merged with the fragment `text`.

        The base implementation never merges.
        """
        del text
        return False
297
298
class HTML(Fragment):
    """Fragment whose content is a raw HTML string that is never escaped."""

    def __init__(self, prefix=None, suffix=None):
        super().__init__(None, prefix, suffix)

    def EscapeText(self, text):
        """Returns `text` untouched; the parent fragment is not consulted."""
        return text
307
308
class Href(Fragment):
    """HTML fragment containing an <a href=> tag. Used within table cells.

    Inside a table cell the Markdown-style Link() element is not processed
    properly, so the anchor is emitted as literal HTML there instead.
    """

    def __init__(self, href):
        opening_tag = '<a href="{}">'.format(href)
        super().__init__(indent=None, prefix=opening_tag, suffix='</a>')
318
319
class Text(Fragment):
    """Markdown fragment that consists of just a string.

    Identical to Fragment, except that every constructor argument is optional.
    """

    def __init__(self, indent=None, prefix=None, suffix=None):
        super().__init__(indent=indent, prefix=prefix, suffix=suffix)
325
326
class IgnoreBlock(Fragment):
    """Markdown fragment that omits all content.

    The class itself only acts as a marker (no indent, prefix or suffix);
    code that handles fragments recognises it by type.
    """

    def __init__(self):
        super().__init__(indent=None, prefix=None, suffix=None)
332
333
class TextBlock(Text):
    """A TextBlock coalesces all spaces and escapes all reserved chars."""

    def EscapeText(self, text):
        """HTML-escapes angle brackets, then backslash-escapes Markdown chars."""
        html_safe = _EscapeContentForHtml(text).getvalue()
        return _EscapeText(html_safe, FLAGS.escape_chars)

    def StripLine(self, text):
        """Collapses the text onto one line with single spaces between words."""
        # Newlines and Unicode non-breaking spaces both become plain spaces.
        # Turning NBSPs into spaces is safer than deleting them, since runs
        # of spaces are coalesced below anyway.
        to_space = str.maketrans({'\n': ' ', chr(160): ' '})
        flattened = text.translate(to_space).strip()
        return re.sub(r' +', ' ', flattened)
349
350
class Div(TextBlock):
    """Placeholder that helps with the two-column layout conversion."""

    def __init__(self, cls):
        # `cls` is the value this fragment is tagged with; it is exposed as a
        # public attribute so callers can compare against it.
        self.cls = cls
        super().__init__()
357
358
class Table(TextBlock):
    """Placeholder that identifies when we're in a (data) table.

    This is in contrast to a table that is only used for layout purposes,
    which we don't want to export.
    """

    # Present so the fragment can be queried for `cls` like a Div.
    cls = None
366
367
class TD(Text):
    """Table cell rendered as a literal HTML <td> element."""

    def __init__(self, rowspan, colspan):
        """Builds the opening tag, adding only spans that are set and not 1."""
        spans = (('rowspan', rowspan), ('colspan', colspan))
        attributes = ''.join(
            ' %s=%s' % (name, value)
            for name, value in spans
            if value and str(value) != '1')
        super().__init__(
            indent='', prefix='<td' + attributes + '>', suffix='</td>')
Dirk Pranke7bbb5472021-11-02 16:33:21 -0700377
378
class Content(TextBlock):
    """Placeholder that identifies when we're processing the main content."""

    # Present so the fragment can be queried for `cls` like a Div.
    cls = None
382
383
class WrappedTextBlock(TextBlock):
    """A WrappedTextBlock wraps the output lines to fit into the column limit."""

    def WrapLine(self, line, indent):
        """Wraps `line`, indenting continuation lines by the width of `indent`."""
        indent_width = len(indent)
        return _WrapLine(line, indent_width)
389
390
class BlockquoteBlock(WrappedTextBlock):
    """A BlockquoteBlock wraps content and prepends each line with '> '.

    The generator must emit BlockquoteBlocks with no indent for paragraphs
    inside a blockquote. That way the final call to WrapLine propagates up to
    the outermost BlockquoteBlock, which wraps the lines and prepends each of
    them with the indent.
    """

    def __init__(self, indent='> '):
        super().__init__(indent=indent, prefix=None, suffix=None)

    def WrapLine(self, line, indent):
        """Wraps `line`, or hands it to the parent when this block has no indent."""
        if self._parent and not self._indent:
            return self._parent.WrapLine(line, indent)
        pieces = _WrapLine(line, len(indent)).splitlines(True)
        # Each continuation line starts with the indent string instead of the
        # spaces _WrapLine() put there.
        return indent.join(piece.lstrip() for piece in pieces)
409
410
class CodeBlock(Text):
    """Base class for different code block fragment implementations."""

    def EscapeText(self, text):
        """Returns `text` unchanged: code is never Markdown-escaped."""
        return text

    def StripLine(self, text):
        """Removes every newline from `text`."""
        # Completely ignore newlines in code blocks. Sites always uses <br/>.
        return ''.join(text.split('\n'))

    def ChangeToHtml(self):
        """Replaces the buffered content with its HTML-escaped form."""
        current = self._content.getvalue()
        if not current:
            return
        self._content = _EscapeContentForHtml(current)
425
426
class IndentedCodeBlock(CodeBlock):
    """A IndentedCodeBlock indents by four spaces."""

    # Four spaces are what Markdown requires for an indented code block; a
    # shorter indent would render the content as ordinary paragraph text.
    def __init__(self, indent='    '):
        """Creates the block.

        Args:
          indent: The string written in front of each code line.
        """
        super().__init__(indent, None, None)
432
433
class FencedCodeBlock(CodeBlock):
    """A FencedCodeBlock is fenced with triple backticks (```).

    To render correctly, content writing must not happen unless the end of
    the source code block has been encountered. That is, the entire code
    block from the source HTML must be rendered in a single write pass.
    """

    def __init__(self, indent=None,
                 prefix='```none' + ENCODED_NEWLINE,
                 suffix=ENCODED_NEWLINE + '```'):
        super().__init__(indent=indent, prefix=prefix, suffix=suffix)

    def WriteIndent(self, output):
        """Switches the block to HTML if needed, then writes the indent.

        When HTML code blocks are allowed and a descendant is FormattedText,
        the formatted fragments and all CodeBlock descendants are changed to
        HTML and the fence becomes <pre><code>...</code></pre>.
        """
        if FLAGS.allow_html_code_blocks:
            descendants = self._AllChildren()
            formatted = [
                node for node in descendants if isinstance(node, FormattedText)]
            for node in formatted:
                node.ChangeToHtml()
            if formatted:
                for node in descendants:
                    if isinstance(node, CodeBlock):
                        node.ChangeToHtml()
                self._UpdatePrefixAndSuffix(
                    '<pre><code>', ENCODED_NEWLINE + '</code></pre>')
        super().WriteIndent(output)

    def StripLine(self, text):
        """Drops raw newlines, decodes encoded ones and removes empty lines."""
        stripped = super().StripLine(text)
        decoded_lines = _RestoreEncodedNewlines(stripped).splitlines()
        return '\n'.join(entry for entry in decoded_lines if entry)

    def WrapLine(self, line, indent):
        """Puts `indent` in front of every line but the first; never wraps."""
        return indent.join(line.splitlines(True))
472
473
class FencedCodeBlockLine(Text):
    """A line of code inside FencedCodeBlock.

    By default the line is surrounded by encoded newlines.
    """

    def __init__(self, indent=None,
                 prefix=ENCODED_NEWLINE, suffix=ENCODED_NEWLINE):
        super().__init__(indent=indent, prefix=prefix, suffix=suffix)

    def StripLine(self, text):
        """Strips via the parent chain, then decodes the encoded newlines."""
        return _RestoreEncodedNewlines(super().StripLine(text))
484
485
class UnderlinedHeader(TextBlock):
    """Markdown fragment for an underlined section header."""

    def __init__(self, char):
        """Creates the header; `char` is the character the underline is made of."""
        super().__init__()
        self._char = char

    def _WriteContent(self, output):
        """Writes the title followed by an underline of the same length."""
        title = self._content.getvalue()
        underline_length = len(self.StripLine(title))
        if underline_length <= 0:
            return
        # '\n' will be stripped, so an encoded '\n' separates title and
        # underline; it is replaced again after the line is stripped.
        underline = ENCODED_NEWLINE + self._char * underline_length
        self._Write(output, None, title, underline)

    def StripLine(self, text):
        """Strips like a TextBlock, then decodes the encoded newlines."""
        return _RestoreEncodedNewlines(super().StripLine(text))
506
507
class FormattedText(Text):
    """Text wrapped in Markdown formatting."""

    def __init__(self, fmt):
        """Creates the fragment; `fmt` is used as both prefix and suffix."""
        super().__init__(None, fmt, fmt)

    def _Pad(self, bigger, smaller):
        """Returns one space per character that `bigger` is longer than `smaller`."""
        return ' ' * (len(bigger) - len(smaller))

    def _WriteContent(self, output):
        """Writes the content with whitespace moved outside the markers."""
        prefix = self._prefix
        content = self._content.getvalue()
        suffix = self._suffix
        if prefix:
            # If there are whitespaces immediately after the prefix,
            # they must be pushed out before the prefix.
            lstripped = content.lstrip()
            if len(content) > len(lstripped):
                prefix = self._Pad(content, lstripped) + prefix
                content = lstripped
        if suffix:
            # If there are whitespaces immediately before the suffix,
            # they must be pushed out after the suffix.
            rstripped = content.rstrip()
            if len(content) > len(rstripped):
                suffix = suffix + self._Pad(content, rstripped)
                content = rstripped
        self._Write(output, prefix, content, suffix)

    def ChangeToHtml(self):
        """Replaces the buffered content with its HTML-escaped form.

        Subclasses additionally swap their Markdown markers for HTML tags.
        """
        content = self._content.getvalue()
        if content:
            # Store the escaped buffer. Assigning it to a local only would
            # leave raw '<' and '>' in the text placed inside the HTML block.
            self._content = _EscapeContentForHtml(content)
541
542
class BoldFormattedText(FormattedText):
    """Text formatted as bold."""

    def __init__(self):
        super().__init__(fmt=FLAGS.bold_format)

    def NeedsToMergeWith(self, text):
        """True when `text` is bold as well."""
        return isinstance(text, BoldFormattedText)

    def ChangeToHtml(self):
        """Escapes the content and switches the markers to <b>...</b>."""
        super().ChangeToHtml()
        opening, closing = '<b>', '</b>'
        self._UpdatePrefixAndSuffix(opening, closing)
555
556
class ItalicFormattedText(FormattedText):
    """Text formatted as italic."""

    def __init__(self):
        super().__init__(fmt=FLAGS.italic_format)

    def NeedsToMergeWith(self, text):
        """True when `text` is italic as well."""
        return isinstance(text, ItalicFormattedText)

    def ChangeToHtml(self):
        """Escapes the content and switches the markers to <i>...</i>."""
        super().ChangeToHtml()
        opening, closing = '<i>', '</i>'
        self._UpdatePrefixAndSuffix(opening, closing)
569
570
class StrikeThroughFormattedText(FormattedText):
    """Text formatted as strike through."""

    def __init__(self):
        super().__init__(fmt=FLAGS.strike_format)

    def NeedsToMergeWith(self, text):
        """True when `text` is struck through as well."""
        return isinstance(text, StrikeThroughFormattedText)

    def ChangeToHtml(self):
        """Escapes the content and switches the markers to <s>...</s>."""
        super().ChangeToHtml()
        opening, closing = '<s>', '</s>'
        self._UpdatePrefixAndSuffix(opening, closing)
583
584
class HighlightFormattedText(FormattedText):
    """Highlighted text."""

    def __init__(self):
        super().__init__(fmt=FLAGS.highlight_format)

    def NeedsToMergeWith(self, text):
        """True when `text` is highlighted as well."""
        return isinstance(text, HighlightFormattedText)

    def ChangeToHtml(self):
        """Escapes the content and switches the markers to <u>...</u>."""
        super().ChangeToHtml()
        opening, closing = '<u>', '</u>'
        self._UpdatePrefixAndSuffix(opening, closing)
597
598
class ListItem(Text):
    """Item in a list."""

    def __init__(self, bullet):
        """Creates the item; `bullet` is the marker text written in front of it."""
        super().__init__()
        self._bullet = bullet

    def WriteIndent(self, output):
        """Writes the bullet, while one is still pending, then the indent."""
        bullet = self._bullet
        if bullet:
            # TODO(dpranke): The original code relied on strings and bytes
            # being interchangeable in Python2, so you could seek backwards
            # from the current location with a relative offset. You can't
            # do that in Python3, apparently.
            #
            # To get around this for the moment, instead of seeking backwards
            # over the indent, one '\b' backspace per bullet character is
            # embedded, and the client is expected to do a global search and
            # replace that removes the spaces together with the backspaces.
            #
            # This is awkward, so we should rework this so that this isn't
            # needed.
            #
            # output.seek(-len(self._bullet), os.SEEK_CUR)
            output.write('\b' * len(bullet))
            output.write(bullet)
        super().WriteIndent(output)

    def _ClearContent(self):
        """Clears the content and forgets the bullet.

        Once the bullet is forgotten, later WriteIndent() calls no longer
        write it.
        """
        self._bullet = None
        super()._ClearContent()

    def WrapLine(self, line, indent):
        """Wraps `line`, indenting continuation lines by the width of `indent`."""
        return _WrapLine(line, len(indent))
630
631
class Link(Text):
    """Markdown link; after MakeAnImage() it renders as an image instead."""

    def __init__(self, href):
        """Creates a link pointing at `href`."""
        super().__init__()
        self._href = href
        self._url_opener_prefix = ''
        self._url_opener_suffix = ''

    def MakeAnImage(self, width, height):
        """Turns the link into an image by prefixing it with '!'.

        Args:
          width: Image width; a size suffix is only added if both are set.
          height: Image height; a size suffix is only added if both are set.
        """
        self._url_opener_prefix = '!'
        if width and height:
            self._url_opener_suffix = (
                '{{width="{}" height="{}"}}'.format(width, height))

    def _IsShortLink(self, text):
        """Tells whether `text` alone is enough to express this link.

        Args:
          text: The link text.

        Returns:
          True if the href matches FLAGS.shortlinks_regex and its host plus
          path equal `text`; False otherwise.
        """
        if not FLAGS.shortlinks_regex:
            return False
        if not re.match(FLAGS.shortlinks_regex, self._href):
            return False
        # Needs the urllib.parse submodule; the file imports it explicitly.
        parsed_href = urllib.parse.urlsplit(self._href)
        return parsed_href.netloc + parsed_href.path == text

    def _WriteLink(self, output, text):
        """Writes either the bare short-link text or '[text](href)'."""
        # Images are never written as short links.
        write_short_link = (
            not (self._url_opener_prefix or self._url_opener_suffix)
            and self._IsShortLink(text))
        if write_short_link:
            self._Write(output, None, text, None)
        else:
            self._Write(output,
                        self._url_opener_prefix + '[',
                        text,
                        '](' + self._href + ')' + self._url_opener_suffix)

    def _WriteContent(self, output):
        """Writes the link; nothing is written when there is no link text."""
        text = self._content.getvalue()
        if not text:
            return
        if text.startswith(('http://', 'https://')):
            # The link text already is a URL: write it as '<url>'.
            self._Write(output, '<', text, '>')
        else:
            self._WriteLink(output, text)
673
674
class Image(Text):
    """Image, rendered as a literal HTML <img> tag."""

    def __init__(self, src, alt, width, height):
        """Stores the attributes; a missing `alt` falls back to 'image'."""
        super().__init__()
        self._src = src
        self._alt = alt or 'image'
        self._width = width
        self._height = height

    def _WriteContent(self, output):
        """Writes the <img> tag; height and width are added only when set."""
        pieces = ['<img alt="%s" src="%s"' % (self._alt, self._src)]
        if self._height:
            pieces.append(' height=%s' % self._height)
        if self._width:
            pieces.append(' width=%s' % self._width)
        pieces.append('>')
        self._Write(output, '', ''.join(pieces), '')
693
694
class Code(Text):
    """Inline code."""

    def __init__(self):
        super().__init__(indent=None, prefix='`', suffix='`')

    def EscapeText(self, text):
        """Returns `text` unchanged: inline code is never escaped."""
        return text

    def _WriteContent(self, output):
        """Writes the code span, switching to '``' if it contains a backtick."""
        content = self._content.getvalue()
        if '`' not in content:
            self._Write(output, self._prefix, content, self._suffix)
            return
        # A backtick (`) inside inline code requires double backticks as
        # delimiters. Content that starts or ends with a backtick would then
        # produce triple backticks, which designate a fenced code fragment,
        # so such content is padded with a space.
        if content.startswith('`'):
            content = ' ' + content
        if content.endswith('`'):
            content += ' '
        self._Write(output, '``', content, '``')

    def NeedsToMergeWith(self, text):
        """True when `text` is inline code as well."""
        return isinstance(text, Code)
723
724
class EmbeddedContent(Text):
    """Embedded content: Docs, Drawings, Presentations, etc."""

    def __init__(self, href, width, height):
        """Stores the address and the requested size of the embedded content."""
        super().__init__()
        self._href = href
        self._width = width
        self._height = height

    def _WriteContent(self, output):
        """Writes an <iframe> element pointing at the content."""
        parts = urllib.parse.urlsplit(self._href)
        if parts.scheme == 'http':
            # Plain http addresses are upgraded to https.
            parts = parts._replace(scheme='https')
        size = ''
        if self._width and self._height:
            size = ' width="{}" height="{}"'.format(self._width, self._height)
        # Note: 'allow="fullscreen"' is requested for all content for
        # simplicity. g3doc server has dedicated logic to deal with these
        # requests.
        element = '<iframe src="{}"{} allow="fullscreen" />'.format(
            urllib.parse.urlunsplit(parts), size)
        self._Write(output, None, element, None)
747
748
class ListInfo:
    """Bookkeeping for one list that is currently open."""

    def __init__(self, tag):
        # Number of items seen so far in the list.
        self.item_count = 0
        # The tag that started the list.
        self.tag = tag
754
755
class FragmentTree:
    """Class for managing a tree of fragments.

    There is a "scope" formed by nested fragments, e.g. italic fragment
    inside bold fragment inside paragraph. The scope is stored in the stack.
    For convenience, the stack always has at least one element, the topmost
    fragment.

    Fragments popped out from the scope may be re-added back into the tree
    as children of the last fragment. This allows "chaining" of structured
    content for future processing. For example, if there were several bold
    fragments inside a paragraph interleaved with fragments of regular text,
    all these fragments will end up as children of the paragraph fragment.
    """

    def __init__(self, top_fragment):
        """Creates a tree whose scope holds only `top_fragment`."""
        self._stack = [top_fragment]

    def ActiveFragmentScopeDepth(self):
        """Returns the number of scope fragments, not counting the topmost one."""
        return len(self._stack) - 1

    def StartFragment(self, fragment):
        """Opens `fragment` as the new innermost scope and returns it."""
        fragment.SetParent(self._stack[-1])
        self._stack.append(fragment)
        return fragment

    def EndFragment(self):
        """Closes the innermost scope and returns its fragment."""
        return self._stack.pop()

    def AppendFragment(self, fragment):
        """Adds `fragment` as a child of the innermost scope and returns it."""
        return self._stack[-1].AddChild(fragment)

    def _ApplyRecursivelyToNode(self, node, scope_operation, operation,
                                debug_indent):
        """Applies `operation` to the subtree of `node` in post-order.

        Args:
          node: Root of the subtree.
          scope_operation: Not used here, only handed on to the recursion.
          operation: Callable applied to every child first, then to `node`.
          debug_indent: Prefix for debug printing; falsy disables printing.
        """
        if not debug_indent:
            for child in node.GetChildren():
                self._ApplyRecursivelyToNode(
                    child, scope_operation, operation, None)
        else:
            debug_indent += ' c '
            for child in node.GetChildren():
                print(debug_indent + repr(child))
                self._ApplyRecursivelyToNode(
                    child, scope_operation, operation, debug_indent)
        operation(node)

    def _ApplyRecursivelyToScope(self, nodes, scope_operation, operation,
                                 debug_indent):
        """Walks the scope from the outermost to the innermost fragment.

        Args:
          nodes: Non-empty list of scope fragments, innermost first; it is
            consumed (popped from the end) by this method.
          scope_operation: Callable applied to each scope fragment on the
            way in.
          operation: Callable applied to each scope fragment and its children
            on the way out.
          debug_indent: Prefix for debug printing; falsy disables printing.
        """
        node = nodes.pop()
        scope_operation(node)
        if debug_indent:
            print(debug_indent + repr(node))
        if nodes:
            self._ApplyRecursivelyToScope(
                nodes, scope_operation, operation,
                (debug_indent + ' s ' if debug_indent else None))
        self._ApplyRecursivelyToNode(
            node, scope_operation, operation, debug_indent)

    def ApplyToAllFragments(self, scope_operation, operation):
        """Recursively applies operations to all fragments in the tree.

        The omnipresent topmost fragment is excluded. The 'scope_operation'
        is applied to every element in the fragment stack in pre-order.
        The 'operation' is applied to all fragments in the tree in post-order.
        When the scope holds nothing but the topmost fragment, this does
        nothing.

        Args:
          scope_operation: The operation to apply to fragments in the scope
            stack.
          operation: The operation to apply to all fragments in the tree.
        """
        scope = list(reversed(self._stack[1:]))
        if not scope:
            # Without this guard the first pop() on the empty list would
            # raise IndexError.
            return
        self._ApplyRecursivelyToScope(
            scope, scope_operation, operation,
            ' ' if FLAGS.debug_print_tree else None)

    def FindFirstFragmentFromEnd(self, predicate, steps_from_last=0):
        """Finds the innermost scope fragment that satisfies `predicate`.

        Args:
          predicate: Callable taking a fragment and returning a truthy value
            for a match.
          steps_from_last: Number of innermost scope fragments to leave out
            of the search.

        Returns:
          The matching fragment closest to the end of the scope stack, or
          None if no fragment matches.
        """
        sub_stack = self._stack[:-steps_from_last if steps_from_last else None]
        # Search backwards so that, as the name promises, the match closest
        # to the end of the stack is found first.
        return next(
            (node for node in reversed(sub_stack) if predicate(node)), None)

    def PeekFragmentFromStart(self, steps_from_first=0):
        """Returns the scope fragment `steps_from_first` away from the top one."""
        return self._stack[steps_from_first]

    def PeekFragmentFromEnd(self, steps_from_last=0):
        """Returns the scope fragment `steps_from_last` away from the innermost."""
        return self._stack[-(steps_from_last + 1)]

    def PeekLastAppendedFragment(self):
        """Returns the last child of the innermost scope, or None if it has none."""
        children = self._stack[-1].GetChildren()
        return children[-1] if children else None
845
846
847class MarkdownGenerator:
848 """Generates Markdown based on the series of HTML tags seen.
849
850 Each time an opening HTML tag is seen, the appropriate markdown fragment is
851 created and pushed onto a stack. Any text encountered is appended to the
852 fragment at the top of the stack. When a closing HTML tag is seen, the stack
853 is popped and the fragment removed is appended to the new top of the stack.
854
855 Markdown is buffered in the fragment stack until an entire line has been
856 formed, at which point _WriteFragmentsAsLine() is called to write it out. The
857 content buffered in the stack is cleared, but otherwise the stack remains
858 unmodified.
859 """
860
861 def __init__(self, out, url_translator):
862 self._out = out
863 self._url_translator = url_translator
864 self._fragment_tree = FragmentTree(Text())
865 self._list_info_stack = []
866 self._pending_newlines = 0
867 # Initialize the regexps to match nothing (rather than be None).
868 self._code_class_regex = re.compile(FLAGS.code_class_regex or 'a^')
869 self._toc_class_regex = re.compile(FLAGS.toc_class_regex or 'a^')
870 self._ignore_class_regex = re.compile(FLAGS.ignore_class_regex or 'a^')
871 self._ignore_style_regex = re.compile(FLAGS.ignore_style_regex or 'a^')
872
873 def _Push(self, fragment):
874 """Sets the parent fragment and pushes it onto the fragment stack.
875
876 In the case where there is an IgnoreBlock on the stack, a new IgnoreBlock
877 is pushed instead.
878
879 Args:
880 fragment: The Fragment object to push on the stack.
881 """
882 if isinstance(self._fragment_tree.PeekFragmentFromEnd(), IgnoreBlock):
883 # If the top of the stack is IgnoreBlock, push an IgnoreBlock instead.
884 fragment = IgnoreBlock()
885 else:
886 # Check if we need to merge adjacent formatting, e.g.
887 # instead of **bold****bold** we need to write **boldbold**,
888 # as the former is not correct Markdown syntax.
889 last_appended = self._fragment_tree.PeekLastAppendedFragment()
890 if last_appended and last_appended.NeedsToMergeWith(fragment):
891 last_appended.UnsetSuffix()
892 fragment.UnsetPrefix()
893
894 self._fragment_tree.StartFragment(fragment)
895
896 def _Pop(self):
897 """Pops the fragment stack it to the new top of stack.
898
899 If the fragment stack would be empty after popping, then the fragment is
900 written to the output first.
901 """
902 if self._fragment_tree.ActiveFragmentScopeDepth() > 1:
903 fragment = self._fragment_tree.EndFragment()
904 self._fragment_tree.AppendFragment(fragment)
905 else:
906 self._WriteFragmentsAsLine(newlines=0)
907 self._fragment_tree.EndFragment()
908
909 def _IsWithinFragmentType(self, fragment_type, steps_from_last=0):
910 return self._fragment_tree.FindFirstFragmentFromEnd(
911 lambda fragment: isinstance(fragment, fragment_type),
912 steps_from_last) is not None
913
914 def _LastFragmentIs(self, fragment_type, cls):
915 fragment = self._fragment_tree.PeekFragmentFromEnd()
916 return (isinstance(fragment, fragment_type) and fragment.cls == cls)
917
918 def Break(self):
919 if not self._IsWithinFragmentType(FencedCodeBlock):
920 self._WriteFragmentsAsLine(newlines=1)
921 else:
922 fragment = FencedCodeBlockLine(prefix='', suffix='')
923 self._Push(fragment)
924 fragment.Append(ENCODED_NEWLINE)
925 self._Pop()
926
927 def HorizontalRule(self):
928 # Horizontal rule must be preceded and followed by a blank line
929 self._AddVerticallyPaddedParagraph('---')
930
931 def StartDocument(self):
932 self._Push(WrappedTextBlock())
933
934 def EndDocument(self):
935 self._Pop()
936
937 def StartParagraph(self):
938 self._WriteFragmentsAsLine(newlines=2)
939
940 def EndParagraph(self):
941 self._WriteFragmentsAsLine(newlines=2)
942
943 def StartDiv(self, cls, style, ident):
944 """Process opening of a div element.
945
946 Args:
947 cls: The class attribute of the element.
948 style: The style attribute of the element.
949 ident: The id attribute of the element
950 """
951 if not self._IsWithinFragmentType(FencedCodeBlock):
952 if self._IsWithinFragmentType(CodeBlock):
953 self._WriteFragmentsAsLine(newlines=1)
954 else:
955 self._WriteFragmentsAsLine(newlines=2)
956
957 if ((cls and self._ignore_class_regex.match(cls)) or
958 style and self._ignore_style_regex.match(style)):
959 self._Push(IgnoreBlock())
960 elif self._IsWithinFragmentType(FencedCodeBlock):
961 self._Push(FencedCodeBlockLine())
962 elif self._IsWithinFragmentType(CodeBlock):
963 self._Push(CodeBlock())
964 elif self._IsWithinFragmentType(BlockquoteBlock):
965 self._Push(BlockquoteBlock(indent=None))
966 elif cls and self._toc_class_regex.match(cls):
967 self._AddTableOfContents()
968 self._Push(IgnoreBlock()) # Ignore the items inside the Sites TOC
969 elif cls and self._code_class_regex.match(cls):
970 if FLAGS.indented_code_blocks:
971 self._Push(IndentedCodeBlock())
972 else:
973 self._Push(FencedCodeBlock())
974 else:
975 self._Push(WrappedTextBlock())
976
977 def EndDiv(self):
978 if not self._IsWithinFragmentType(FencedCodeBlock, steps_from_last=1):
979 if self._IsWithinFragmentType(CodeBlock, steps_from_last=1):
980 self._WriteFragmentsAsLine(newlines=1)
981 else:
982 self._WriteFragmentsAsLine(newlines=2)
983 self._Pop()
984
985 def StartHeader(self, level):
986 self._WriteFragmentsAsLine(newlines=2)
987 if level == 1 and FLAGS.underline_headers:
988 self._Push(UnderlinedHeader('='))
989 elif level == 2 and FLAGS.underline_headers:
990 self._Push(UnderlinedHeader('-'))
991 else:
992 self._Push(TextBlock(prefix=('#' * level) + ' '))
993
994 def EndHeader(self):
995 self._WriteFragmentsAsLine(newlines=2)
996 self._Pop()
997
998 def StartList(self, tag):
999 if not self._list_info_stack:
1000 self._WriteFragmentsAsLine(newlines=2)
1001 else:
1002 self._WriteFragmentsAsLine(newlines=1)
1003 self._list_info_stack.append(ListInfo(tag))
1004 if tag == 'ol':
1005 self._Push(Text(' ' * FLAGS.ordered_list_indent))
1006 else:
1007 self._Push(Text(' ' * FLAGS.unordered_list_indent))
1008
1009 def EndList(self):
1010 self._list_info_stack.pop()
1011 if not self._list_info_stack:
1012 self._WriteFragmentsAsLine(newlines=2)
1013 else:
1014 self._WriteFragmentsAsLine(newlines=1)
1015 self._Pop()
1016
1017 def StartListItem(self):
1018 self._WriteFragmentsAsLine(newlines=1)
1019 # Google Sites sometimes spits out pages with <li> tags not enclosed within
1020 # an <ol> or <ul> tag.
1021 tag = ''
1022 if self._list_info_stack:
1023 self._list_info_stack[-1].item_count += 1
1024 tag = self._list_info_stack[-1].tag
1025 if tag == 'ol':
1026 item_count = self._list_info_stack[-1].item_count
1027 # string.ljust makes room for as many digits as you need.
1028 prefix = ('%d.' % item_count).ljust(FLAGS.ordered_list_indent)
1029 self._Push(ListItem(prefix))
1030 else:
1031 prefix = '*'.ljust(FLAGS.unordered_list_indent)
1032 self._Push(ListItem(prefix))
1033
1034 def EndListItem(self):
1035 self._WriteFragmentsAsLine(newlines=1)
1036 self._Pop()
1037
1038 def StartFormat(self, tag):
1039 # Allowed formatting depends on the surrounding fragment type.
1040 if self._IsWithinFragmentType(TD) and tag == 'b':
1041 # TODO(dpranke): This is a hack because I don't yet really understand
1042 # how the ChangeToHtml() logic works in CodeBlocks, but it seems like
1043 # we should be able to do something similar to what they do.
1044 # Also, this should really be rewriting these to <th>s instead.
1045 self._Push(HTML('<b>', '</b>'))
1046 return
1047
1048 if not self._IsWithinFragmentType(IndentedCodeBlock):
1049 formats_map = {
1050 'i': ItalicFormattedText,
1051 'em': ItalicFormattedText,
1052 'b': BoldFormattedText,
1053 'strong': BoldFormattedText,
1054 'strike': StrikeThroughFormattedText,
1055 's': StrikeThroughFormattedText,
1056 'del': StrikeThroughFormattedText,
1057 'u': HighlightFormattedText,
1058 'code': Code,
1059 None: Text,
1060 }
1061 if self._IsWithinFragmentType(FencedCodeBlock):
1062 if FLAGS.allow_html_code_blocks:
1063 # HTML code block can render formats but must not use Code fragments.
1064 formats_map['code'] = formats_map[None] = CodeBlock
1065 else:
1066 formats_map = {None: CodeBlock}
1067 else:
1068 # Inside an indented code block no formatting is allowed.
1069 formats_map = {None: CodeBlock}
1070 self._Push(formats_map[tag]() if tag in formats_map
1071 else formats_map[None]())
1072
1073 def EndFormat(self):
1074 self._Pop()
1075
1076 def StartAnchor(self, href):
1077 if href is not None:
1078 href = self._url_translator.Translate(href)
1079 if self._IsWithinFragmentType(TD):
1080 self._Push(Href(href))
1081 else:
1082 self._Push(Link(href))
1083 else:
1084 self._Push(Text())
1085
1086 def EndAnchor(self):
1087 self._Pop()
1088
1089 def StartBlockquote(self):
1090 if not self._IsWithinFragmentType(CodeBlock):
1091 self._WriteFragmentsAsLine(newlines=1)
1092 self._Push(BlockquoteBlock())
1093 else:
1094 self._Push(Text())
1095
1096 def EndBlockquote(self):
1097 if not self._IsWithinFragmentType(CodeBlock):
1098 self._WriteFragmentsAsLine(newlines=2)
1099 self._Pop()
1100
1101 def Image(self, src, alt, width, height):
1102 src = self._url_translator.Translate(src)
1103 self._fragment_tree.AppendFragment(Image(src, alt, width, height))
1104
1105 def Iframe(self, src, width, height):
1106 """Process an <iframe> element.
1107
1108 Sites use <iframe> for embedded content: Docs, Drawings, etc.
1109 g3doc implements this by supporting <iframe> HTML tag directly.
1110
1111 Args:
1112 src: Source URL.
1113 width: Element width.
1114 height: Element height.
1115 """
1116 if False:
1117 # TODO(dpranke): Figure out if we should support embedded IFRAME tags.
1118 # For now, we skip over them.
1119 self._WriteFragmentsAsLine(newlines=2)
1120 self._Push(EmbeddedContent(src, width, height))
1121 self._Pop()
1122
1123 def StartTable(self, cls):
1124 if (cls and 'sites-layout-hbox' in cls and
1125 'sites-layout-name-one-column' not in cls):
1126 self._AddHTMLBlock('<div class="two-column-container">')
1127 self._Push(Div(cls='two-column-container'))
1128 elif (cls and 'sites-layout-name-one-column' in cls):
1129 pass
1130 else:
1131 self._AddHTMLBlock('<table>')
1132 self._Push(Table())
1133
1134 def EndTable(self):
1135 if self._LastFragmentIs(Div, cls='two-column-container'):
1136 self._AddHTMLBlock('</div>')
1137 self._Pop()
1138 elif self._IsWithinFragmentType(Table):
1139 self._AddHTMLBlock('</table>')
1140 self._Pop()
1141
1142 def StartTR(self):
1143 if self._IsWithinFragmentType(Table):
1144 self._AddHTMLBlock('<tr>')
1145
1146 def EndTR(self):
1147 if self._IsWithinFragmentType(Table):
1148 self._AddHTMLBlock('</tr>')
1149
Dirk Pranke7aa01372021-11-05 16:16:09 -07001150 def StartTD(self, cls, rowspan, colspan):
Dirk Pranke7bbb5472021-11-02 16:33:21 -07001151 if self._LastFragmentIs(Div, cls='two-column-container'):
1152 if cls and ('sites-tile-name-content-1' in cls or
1153 'sites-tile-name-content-2' in cls):
1154 self._AddHTMLBlock('<div class="column">')
1155 self._Push(Div(cls='column'))
1156 else:
1157 self._Push(Text())
1158 elif self._IsWithinFragmentType(Table):
Dirk Pranke7aa01372021-11-05 16:16:09 -07001159 self._Push(TD(rowspan, colspan))
Dirk Pranke7bbb5472021-11-02 16:33:21 -07001160
1161 def EndTD(self):
1162 if self._LastFragmentIs(Div, cls='column'):
1163 self._AddHTMLBlock('</div>')
1164 self._Pop()
1165 elif self._IsWithinFragmentType(Table):
1166 self._Pop()
1167 self._WriteFragmentsAsLine(newlines=1)
1168
1169 def Text(self, text):
1170 if not isinstance(self._fragment_tree.PeekFragmentFromEnd(), IgnoreBlock):
1171 fragment = (CodeBlock() if self._IsWithinFragmentType(CodeBlock)
1172 else Text())
1173 self._fragment_tree.AppendFragment(fragment)
1174 fragment.Append(text)
1175
1176 def _AddTableOfContents(self):
1177 # TOC must be preceded and followed by a blank line
1178 self._AddVerticallyPaddedParagraph('[TOC]')
1179
1180 def _AddVerticallyPaddedParagraph(self, text):
1181 self._WriteFragmentsAsLine(newlines=2)
1182 fragment = CodeBlock() # Use CodeBlock to prevent escaping
1183 self._fragment_tree.AppendFragment(fragment)
1184 fragment.Append(text)
1185 self._WriteFragmentsAsLine(newlines=2)
1186
1187 def _AddHTMLBlock(self, html):
1188 """Writes out a block-level string of html."""
1189 fragment = HTML()
1190 fragment.Append(html)
1191 self._fragment_tree.AppendFragment(fragment)
1192 self._WriteFragmentsAsLine(newlines=1)
1193
1194 def _WriteFragmentsAsLine(self, newlines):
1195 """Writes out any content currently buffered in the fragment stack.
1196
1197 Args:
1198 newlines: The minimum number of newlines required in the output after this
1199 line. These newlines won't be written out until the next line with
1200 content is encountered.
1201 """
1202
1203 # Generate indent and the content, then clear content in fragments.
1204 indent = io.StringIO()
1205 self._fragment_tree.ApplyToAllFragments(
1206 lambda fragment: fragment.WriteIndent(indent),
1207 lambda fragment: fragment.WriteContentIntoParentAndClear())
1208 last_fragment = self._fragment_tree.PeekFragmentFromEnd()
1209 content = self._fragment_tree.PeekFragmentFromStart().ConsumeContent()
1210 content = last_fragment.StripLine(content.getvalue())
1211 indent = indent.getvalue()
1212 content = last_fragment.WrapLine(content, indent)
1213
1214 # Write the content, if any.
1215 if content:
1216 self._out.write('\n' * self._pending_newlines)
1217 self._out.write(indent)
1218 self._out.write(content)
1219 self._pending_newlines = newlines
1220 elif self._pending_newlines > 0 and self._pending_newlines < newlines:
1221 self._pending_newlines = newlines
1222
1223 if FLAGS.debug_print_tree:
1224 # Separate trees printed during each writing session
1225 print('-' * 20)
1226
1227
class XhtmlHandler(xml.sax.ContentHandler):
  """Translates SAX events into MarkdownGenerator calls.

  Element names arrive as (namespace, localname) pairs; only the local name
  is used to choose the generator call. Unrecognised tags are ignored.
  """

  # Regex that matches an HTML header tag and extracts the level.
  _HEADER_TAG_RE = re.compile(r'h([1-6])$')

  # Inline formatting tags forwarded to StartFormat() / EndFormat().
  _FORMAT_TAGS = ('b', 'code', 'em', 'i', 'strong', 's', 'strike', 'del', 'u')

  # Tags forwarded to StartList() / EndList().
  _LIST_TAGS = ('ul', 'ol')

  def __init__(self, out, url_translator):
    """Creates the handler and the MarkdownGenerator it drives.

    Args:
      out: Output stream handed to the MarkdownGenerator.
      url_translator: URL translator handed to the MarkdownGenerator.
    """
    super().__init__()
    self._generator = MarkdownGenerator(out, url_translator)

  def startDocument(self):
    """Forwards the start-of-document event to the generator."""
    self._generator.StartDocument()

  def endDocument(self):
    """Forwards the end-of-document event to the generator."""
    self._generator.EndDocument()

  def startElementNS(self, name, qname, attrs):
    """Dispatches an opening tag to the matching generator call.

    Args:
      name: Pair whose second item is the tag's local name.
      qname: Unused.
      attrs: Attribute mapping keyed by (namespace, localname) pairs; only
        attributes stored under the None namespace are read.
    """
    tag = name[1]

    def Attr(attr_name):
      # Yields None when the attribute is absent.
      return attrs.get((None, attr_name))

    if tag == 'a':
      self._generator.StartAnchor(Attr('href'))
    elif tag == 'br':
      self._generator.Break()
    elif tag == 'hr':
      self._generator.HorizontalRule()
    elif tag == 'li':
      self._generator.StartListItem()
    elif tag == 'div':
      self._generator.StartDiv(Attr('class'), Attr('style'), Attr('id'))
    elif tag == 'p':
      self._generator.StartParagraph()
    elif tag in self._FORMAT_TAGS:
      self._generator.StartFormat(tag)
    elif tag in self._LIST_TAGS:
      self._generator.StartList(tag)
    elif tag == 'img':
      self._generator.Image(Attr('src'), Attr('alt'), Attr('width'),
                            Attr('height'))
    elif tag == 'blockquote':
      self._generator.StartBlockquote()
    elif tag == 'iframe':
      self._generator.Iframe(Attr('src'), Attr('width'), Attr('height'))
    elif tag == 'table':
      self._generator.StartTable(Attr('class'))
    elif tag == 'tr':
      self._generator.StartTR()
    elif tag == 'td':
      self._generator.StartTD(Attr('class'), Attr('rowspan'), Attr('colspan'))
    else:
      header = self._HEADER_TAG_RE.match(tag)
      if header is not None:
        self._generator.StartHeader(int(header.group(1)))

  def endElementNS(self, name, qname):
    """Dispatches a closing tag to the matching generator call.

    Args:
      name: Pair whose second item is the tag's local name.
      qname: Unused.
    """
    tag = name[1]
    if tag == 'a':
      self._generator.EndAnchor()
    elif tag == 'li':
      self._generator.EndListItem()
    elif tag == 'div':
      self._generator.EndDiv()
    elif tag == 'p':
      self._generator.EndParagraph()
    elif tag in self._FORMAT_TAGS:
      self._generator.EndFormat()
    elif tag in self._LIST_TAGS:
      self._generator.EndList()
    elif tag == 'blockquote':
      self._generator.EndBlockquote()
    elif tag == 'td':
      self._generator.EndTD()
    elif tag == 'tr':
      self._generator.EndTR()
    elif tag == 'table':
      self._generator.EndTable()
    elif self._HEADER_TAG_RE.match(tag) is not None:
      self._generator.EndHeader()

  def characters(self, content):
    """Forwards character data to the generator."""
    self._generator.Text(content)
1323
1324
class DefaultUrlTranslator:
  """UrlTranslator that leaves every URL untouched."""

  def Translate(self, href):
    """Returns href exactly as it was given.

    Args:
      href: The URL to translate.

    Returns:
      The same object that was passed in.
    """
    return href
1330
1331
def Convert(input_stream, output_stream, url_translator=DefaultUrlTranslator()):
  """Converts an input stream of xhtml into an output stream of markdown.

  The input is parsed with a namespace-aware SAX parser whose events are
  handled by an XhtmlHandler writing to output_stream.

  Args:
    input_stream: filehandle for the XHTML input.
    output_stream: filehandle for the Markdown output.
    url_translator: Callback for translating URLs embedded in the page.
  """
  sax_parser = xml.sax.make_parser()
  handler = XhtmlHandler(output_stream, url_translator)
  sax_parser.setContentHandler(handler)
  # Namespace processing makes the parser deliver the *NS element events.
  sax_parser.setFeature(xml.sax.handler.feature_namespaces, 1)
  sax_parser.parse(input_stream)