Yu-Ping Wu | 6a8f3a2 | 2021-11-24 00:45:03 +0000 | [diff] [blame^] | 1 | """ |
| 2 | Python Markdown |
| 3 | |
| 4 | A Python implementation of John Gruber's Markdown. |
| 5 | |
| 6 | Documentation: https://python-markdown.github.io/ |
| 7 | GitHub: https://github.com/Python-Markdown/markdown/ |
| 8 | PyPI: https://pypi.org/project/Markdown/ |
| 9 | |
| 10 | Started by Manfred Stienstra (http://www.dwerg.net/). |
| 11 | Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). |
| 12 | Currently maintained by Waylan Limberg (https://github.com/waylan), |
| 13 | Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). |
| 14 | |
| 15 | Copyright 2007-2020 The Python Markdown Project (v. 1.7 and later) |
| 16 | Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) |
| 17 | Copyright 2004 Manfred Stienstra (the original version) |
| 18 | |
| 19 | License: BSD (see LICENSE.md for details). |
| 20 | """ |
| 21 | |
| 22 | import re |
| 23 | import importlib |
| 24 | import sys |
| 25 | |
| 26 | |
# `import importlib` alone does not guarantee that the `importlib.util` submodule is
# loaded; import it explicitly so the attribute access below cannot raise AttributeError.
import importlib.util

# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
# Users can still do `from html import parser` and get the default behavior.
spec = importlib.util.find_spec('html.parser')
htmlparser = importlib.util.module_from_spec(spec)
spec.loader.exec_module(htmlparser)
# Register the private copy under its own name; the real `html.parser` entry is left untouched.
sys.modules['htmlparser'] = htmlparser

# Monkeypatch HTMLParser to only accept `?>` to close Processing Instructions.
htmlparser.piclose = re.compile(r'\?>')
# Monkeypatch HTMLParser to only recognize entity references with a closing semicolon.
htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
# Monkeypatch HTMLParser to no longer support partial entities. We are always feeding a complete block,
# so the 'incomplete' functionality is unnecessary. As the entityref regex is run right before incomplete,
# and the two regex are the same, then incomplete will simply never match and we avoid the logic within.
htmlparser.incomplete = htmlparser.entityref
# Monkeypatch HTMLParser to not accept a backtick in a tag name, attribute name, or bare value.
htmlparser.locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^`\t\n\r\f />\x00]*       # tag name <= added backtick here
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^`\s/>][^\s/=>]*  # attribute name <= added backtick here
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^`>\s]*          # bare value <= added backtick here
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)

# Match a blank line at the start of a block of text (two newlines).
# The newlines may be preceded by additional whitespace.
blank_line_re = re.compile(r'^([ ]*\n){2}')
| 62 | |
| 63 | |
class HTMLExtractor(htmlparser.HTMLParser):
    """
    Extract raw HTML from text.

    The raw HTML is stored in the `htmlStash` of the Markdown instance passed
    to `md` and the remaining text is stored in `cleandoc` as a list of strings.
    """

    def __init__(self, md, *args, **kwargs):
        """
        Create an extractor bound to the Markdown instance `md`.

        `convert_charrefs` defaults to `False` unless the caller supplies it. All
        remaining positional and keyword arguments are passed to the base parser.
        """
        kwargs.setdefault('convert_charrefs', False)

        # Block tags that should contain no content (self closing)
        self.empty_tags = {'hr'}

        # This calls self.reset
        super().__init__(*args, **kwargs)
        self.md = md

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.inraw = False
        self.intail = False
        self.stack = []  # When inraw==True, stack contains a list of tags
        self._cache = []
        self.cleandoc = []
        # Cache used by `line_offset`: `_line_starts[k]` is the index in
        # `_line_starts_src` at which line `k + 1` begins. The cache is only
        # valid while `self.rawdata` is the very same string object.
        self._line_starts_src = None
        self._line_starts = [0]
        super().reset()

    def close(self):
        """Handle any buffered data."""
        super().close()
        if self.rawdata:
            # Temp fix for https://bugs.python.org/issue41989
            # TODO: remove this when the bug is fixed in all supported Python versions.
            if self.convert_charrefs and not self.cdata_elem:  # pragma: no cover
                self.handle_data(htmlparser.unescape(self.rawdata))
            else:
                self.handle_data(self.rawdata)
        # Handle any unclosed tags.
        if self._cache:
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            self._cache = []

    @property
    def line_offset(self):
        """
        Returns char index in self.rawdata for the start of the current line.

        Line starts are discovered incrementally with `str.find` and memoized, so
        repeated lookups on the same buffer do not rescan it from the beginning.
        """
        if self.lineno <= 1:
            return 0
        rawdata = self.rawdata
        if rawdata is not self._line_starts_src:
            # The buffer was rebound (new feed or truncation); start over.
            self._line_starts_src = rawdata
            self._line_starts = [0]
        starts = self._line_starts
        wanted = self.lineno - 1
        while len(starts) <= wanted:
            newline = rawdata.find('\n', starts[-1])
            if newline == -1:
                break
            starts.append(newline + 1)
        if wanted < len(starts):
            return starts[wanted]
        # Value of self.lineno must exceed total number of lines.
        # Fall back to the index of the last newline (or 0 if there is none).
        last_newline = rawdata.rfind('\n')  # pragma: no cover
        return last_newline if last_newline != -1 else 0  # pragma: no cover

    def at_line_start(self):
        """
        Returns True if current position is at start of line.

        Allows for up to three blank spaces at start of line.
        """
        if self.offset == 0:
            return True
        if self.offset > 3:
            return False
        # Confirm up to first 3 chars are whitespace
        return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == ''

    def get_endtag_text(self, tag):
        """
        Returns the text of the end tag.

        If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
        """
        # Attempt to extract actual tag from raw source text
        start = self.line_offset + self.offset
        m = htmlparser.endendtag.search(self.rawdata, start)
        if m:
            return self.rawdata[start:m.end()]
        else:  # pragma: no cover
            # Failed to extract from raw data. Assume well formed and lowercase.
            return '</{}>'.format(tag)

    def handle_starttag(self, tag, attrs):
        """
        Route a start tag either into the current raw block or into `cleandoc`.

        A block-level tag at the start of a line (or in the tail of a previous
        raw block) opens a new raw block. Tags listed in `empty_tags` are
        forwarded to `handle_startendtag`.
        """
        # Handle tags that should always be empty and do not specify a closing tag
        if tag in self.empty_tags:
            self.handle_startendtag(tag, attrs)
            return

        if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
            # Started a new raw block. Prepare stack.
            self.inraw = True
            self.cleandoc.append('\n')

        text = self.get_starttag_text()
        if self.inraw:
            self.stack.append(tag)
            self._cache.append(text)
        else:
            self.cleandoc.append(text)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # This is presumably a standalone tag in a code span (see #1036).
                self.clear_cdata_mode()

    def handle_endtag(self, tag):
        """
        Handle an end tag.

        Inside a raw block the tag text is cached and the tag stack unwound; when
        the stack empties, the cached block is stashed and a placeholder is added
        to `cleandoc`. Outside a raw block the tag text is passed through as is.
        """
        text = self.get_endtag_text(tag)

        if self.inraw:
            self._cache.append(text)
            if tag in self.stack:
                # Remove tag from stack
                while self.stack:
                    if self.stack.pop() == tag:
                        break
            if not self.stack:
                # End of raw block.
                if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
                    # Preserve blank line and end of raw block.
                    self._cache.append('\n')
                else:
                    # More content exists after endtag.
                    self.intail = True
                # Reset stack.
                self.inraw = False
                self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
                # Insert blank line between this and next line.
                self.cleandoc.append('\n\n')
                self._cache = []
        else:
            self.cleandoc.append(text)

    def handle_data(self, data):
        """Append text to the current raw block, or to `cleandoc` when not in one."""
        # A newline in the data ends the "tail" of the previous raw block.
        if self.intail and '\n' in data:
            self.intail = False
        if self.inraw:
            self._cache.append(data)
        else:
            self.cleandoc.append(data)

    def handle_empty_tag(self, data, is_block):
        """ Handle empty tags (`<data>`). """
        if self.inraw or self.intail:
            # Append this to the existing raw block
            self._cache.append(data)
        elif self.at_line_start() and is_block:
            # Handle this as a standalone raw block
            if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
                # Preserve blank line after tag in raw block.
                data += '\n'
            else:
                # More content exists after tag.
                self.intail = True
            item = self.cleandoc[-1] if self.cleandoc else ''
            # If we only have one newline before block element, add another
            if not item.endswith('\n\n') and item.endswith('\n'):
                self.cleandoc.append('\n')
            self.cleandoc.append(self.md.htmlStash.store(data))
            # Insert blank line between this and next line.
            self.cleandoc.append('\n\n')
        else:
            self.cleandoc.append(data)

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag using the exact source text of the tag."""
        self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

    def handle_charref(self, name):
        """Re-emit a numeric character reference unchanged (never block level)."""
        self.handle_empty_tag('&#{};'.format(name), is_block=False)

    def handle_entityref(self, name):
        """Re-emit a named entity reference unchanged (never block level)."""
        self.handle_empty_tag('&{};'.format(name), is_block=False)

    def handle_comment(self, data):
        """Re-emit a comment, treated as block level."""
        self.handle_empty_tag('<!--{}-->'.format(data), is_block=True)

    def handle_decl(self, data):
        """Re-emit a declaration, treated as block level."""
        self.handle_empty_tag('<!{}>'.format(data), is_block=True)

    def handle_pi(self, data):
        """Re-emit a processing instruction, treated as block level."""
        self.handle_empty_tag('<?{}?>'.format(data), is_block=True)

    def unknown_decl(self, data):
        """Re-emit a marked section; a section starting with `CDATA[` is closed with `]]>`, others with `]>`."""
        end = ']]>' if data.startswith('CDATA[') else ']>'
        self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True)

    def parse_pi(self, i):
        """Parse a processing instruction only where a raw block may begin; otherwise emit `<?` as text."""
        if self.at_line_start() or self.intail:
            return super().parse_pi(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<?')
        return i + 2

    def parse_html_declaration(self, i):
        """Parse a declaration only where a raw block may begin; otherwise emit `<!` as text."""
        if self.at_line_start() or self.intail:
            return super().parse_html_declaration(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<!')
        return i + 2

    # The rest has been copied from base class in standard lib to address #1036.
    # As __starttag_text is private, all references to it must be in this subclass.
    # The last few lines of parse_starttag are reversed so that handle_starttag
    # can override cdata_mode in certain situations (in a code span).
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def parse_starttag(self, i):  # pragma: no cover
        """
        Parse the start tag beginning at index `i` of `self.rawdata`.

        Returns the index just past the tag, or a negative value when the tag is
        not yet complete. Dispatches to `handle_startendtag` or `handle_starttag`.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = htmlparser.attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:  # noqa: E127
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = htmlparser.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Malformed tag ending: pass the whole span through as plain data.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            # *** set cdata_mode first so we can override it in handle_starttag (see #1036) ***
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
            self.handle_starttag(tag, attrs)
        return endpos