blob: c08856ab8896a0790d14c895eb7198ba2b032552 [file] [log] [blame]
Yu-Ping Wu6a8f3a22021-11-24 00:45:03 +00001"""
2Python Markdown
3
4A Python implementation of John Gruber's Markdown.
5
6Documentation: https://python-markdown.github.io/
7GitHub: https://github.com/Python-Markdown/markdown/
8PyPI: https://pypi.org/project/Markdown/
9
10Started by Manfred Stienstra (http://www.dwerg.net/).
11Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
12Currently maintained by Waylan Limberg (https://github.com/waylan),
13Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
14
15Copyright 2007-2020 The Python Markdown Project (v. 1.7 and later)
16Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
17Copyright 2004 Manfred Stienstra (the original version)
18
19License: BSD (see LICENSE.md for details).
20"""
21
import importlib
import importlib.util
import re
import sys
25
26
# Import a private copy of the stdlib `html.parser` module as `htmlparser` so we can
# monkeypatch it. Users can still do `from html import parser` and get the default
# behavior, because the copy is built from the module spec rather than taken from
# `sys.modules['html.parser']`.
# NOTE: `importlib.util` is a submodule; it must be imported explicitly
# (`import importlib.util`) rather than relied upon as a side effect of `import importlib`.
spec = importlib.util.find_spec('html.parser')
htmlparser = importlib.util.module_from_spec(spec)
spec.loader.exec_module(htmlparser)
# Register the patched copy under its own top-level name.
sys.modules['htmlparser'] = htmlparser

# Monkeypatch HTMLParser to only accept `?>` to close Processing Instructions.
htmlparser.piclose = re.compile(r'\?>')
# Monkeypatch HTMLParser to only recognize entity references with a closing semicolon.
htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
# Monkeypatch HTMLParser to no longer support partial entities. We are always feeding a complete block,
# so the 'incomplete' functionality is unnecessary. As the entityref regex is run right before incomplete,
# and the two regex are the same, then incomplete will simply never match and we avoid the logic within.
htmlparser.incomplete = htmlparser.entityref
# Monkeypatch HTMLParser to not accept a backtick in a tag name, attribute name, or bare value.
htmlparser.locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^`\t\n\r\f />\x00]*       # tag name <= added backtick here
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^`\s/>][^\s/=>]*  # attribute name <= added backtick here
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^`>\s]*           # bare value <= added backtick here
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)

# Match a blank line at the start of a block of text (two newlines).
# Each newline may be preceded by additional spaces.
blank_line_re = re.compile(r'^([ ]*\n){2}')
62
63
class HTMLExtractor(htmlparser.HTMLParser):
    """
    Extract raw HTML from text.

    The raw HTML is stored in the `htmlStash` of the Markdown instance passed
    to `md` and the remaining text is stored in `cleandoc` as a list of strings.
    """

    def __init__(self, md, *args, **kwargs):
        """
        Create an extractor bound to the Markdown instance `md`.

        `convert_charrefs` defaults to `False` unless the caller passes it
        explicitly. All other arguments are forwarded to the base parser.
        """
        kwargs.setdefault('convert_charrefs', False)

        # Block tags that should contain no content (self closing)
        self.empty_tags = {'hr'}

        # This calls self.reset
        super().__init__(*args, **kwargs)
        self.md = md

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.inraw = False
        self.intail = False
        self.stack = []  # When inraw==True, stack contains a list of tags
        self._cache = []
        self.cleandoc = []
        super().reset()

    def close(self):
        """Handle any buffered data."""
        super().close()
        if self.rawdata:
            # Temp fix for https://bugs.python.org/issue41989
            # TODO: remove this when the bug is fixed in all supported Python versions.
            if self.convert_charrefs and not self.cdata_elem:  # pragma: no cover
                self.handle_data(htmlparser.unescape(self.rawdata))
            else:
                self.handle_data(self.rawdata)
        # Handle any unclosed tags.
        if self._cache:
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            self._cache = []

    @property
    def line_offset(self):
        """Returns char index in self.rawdata for the start of the current line. """
        rawdata = self.rawdata
        if self.lineno > 1 and '\n' in rawdata:
            # Step over `lineno - 1` newlines with `str.find` instead of
            # building and compiling a new regex for every line number.
            start = 0
            for _ in range(self.lineno - 1):
                newline = rawdata.find('\n', start)
                if newline == -1:  # pragma: no cover
                    # Value of self.lineno must exceed total number of lines.
                    # Fall back to the position of the last newline.
                    # NOTE(review): this is the index of the newline itself, not of
                    # the character after it; kept as-is to preserve behavior.
                    return rawdata.rfind('\n')
                start = newline + 1
            return start
        return 0

    def at_line_start(self):
        """
        Returns True if current position is at start of line.

        Allows for up to three blank spaces at start of line.
        """
        if self.offset == 0:
            return True
        if self.offset > 3:
            return False
        # Confirm up to first 3 chars are whitespace
        return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == ''

    def get_endtag_text(self, tag):
        """
        Returns the text of the end tag.

        If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
        """
        # Attempt to extract actual tag from raw source text
        start = self.line_offset + self.offset
        m = htmlparser.endendtag.search(self.rawdata, start)
        if m:
            return self.rawdata[start:m.end()]
        else:  # pragma: no cover
            # Failed to extract from raw data. Assume well formed and lowercase.
            return '</{}>'.format(tag)

    def handle_starttag(self, tag, attrs):
        """
        Handle a start tag.

        Tags listed in `empty_tags` are routed to `handle_startendtag`. A block-level
        tag found in the tail of a previous block, or at the start of a line while not
        already inside a raw block, opens a new raw block. While inside a raw block the
        tag text is cached; otherwise it is passed through to `cleandoc`.
        """
        # Handle tags that should always be empty and do not specify a closing tag
        if tag in self.empty_tags:
            self.handle_startendtag(tag, attrs)
            return

        if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
            # Started a new raw block. Prepare stack.
            self.inraw = True
            self.cleandoc.append('\n')

        text = self.get_starttag_text()
        if self.inraw:
            self.stack.append(tag)
            self._cache.append(text)
        else:
            self.cleandoc.append(text)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # This is presumably a standalone tag in a code span (see #1036).
                self.clear_cdata_mode()

    def handle_endtag(self, tag):
        """
        Handle an end tag.

        Inside a raw block the tag text is cached and the tag stack is unwound down to
        the matching start tag. When the stack empties, the cached block is stored in
        the `htmlStash` and its placeholder appended to `cleandoc`. Outside a raw block
        the tag text is passed through to `cleandoc`.
        """
        text = self.get_endtag_text(tag)

        if self.inraw:
            self._cache.append(text)
            if tag in self.stack:
                # Remove tag from stack
                while self.stack:
                    if self.stack.pop() == tag:
                        break
            if len(self.stack) == 0:
                # End of raw block.
                if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
                    # Preserve blank line and end of raw block.
                    self._cache.append('\n')
                else:
                    # More content exists after endtag.
                    self.intail = True
                # Reset stack.
                self.inraw = False
                self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
                # Insert blank line between this and next line.
                self.cleandoc.append('\n\n')
                self._cache = []
        else:
            self.cleandoc.append(text)

    def handle_data(self, data):
        """Cache `data` while inside a raw block, otherwise pass it through to `cleandoc`."""
        # A newline in the data ends the "tail" of the previous raw block.
        if self.intail and '\n' in data:
            self.intail = False
        if self.inraw:
            self._cache.append(data)
        else:
            self.cleandoc.append(data)

    def handle_empty_tag(self, data, is_block):
        """ Handle empty tags (`<data>`). """
        if self.inraw or self.intail:
            # Append this to the existing raw block
            self._cache.append(data)
        elif self.at_line_start() and is_block:
            # Handle this as a standalone raw block
            if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
                # Preserve blank line after tag in raw block.
                data += '\n'
            else:
                # More content exists after tag.
                self.intail = True
            item = self.cleandoc[-1] if self.cleandoc else ''
            # If we only have one newline before block element, add another
            if not item.endswith('\n\n') and item.endswith('\n'):
                self.cleandoc.append('\n')
            self.cleandoc.append(self.md.htmlStash.store(data))
            # Insert blank line between this and next line.
            self.cleandoc.append('\n\n')
        else:
            self.cleandoc.append(data)

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag using its original source text."""
        self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

    def handle_charref(self, name):
        """Re-emit a character reference (`&#name;`) as a non-block empty tag."""
        self.handle_empty_tag('&#{};'.format(name), is_block=False)

    def handle_entityref(self, name):
        """Re-emit an entity reference (`&name;`) as a non-block empty tag."""
        self.handle_empty_tag('&{};'.format(name), is_block=False)

    def handle_comment(self, data):
        """Re-emit a comment (`<!--data-->`) as a block-level empty tag."""
        self.handle_empty_tag('<!--{}-->'.format(data), is_block=True)

    def handle_decl(self, data):
        """Re-emit a declaration (`<!data>`) as a block-level empty tag."""
        self.handle_empty_tag('<!{}>'.format(data), is_block=True)

    def handle_pi(self, data):
        """Re-emit a processing instruction (`<?data?>`) as a block-level empty tag."""
        self.handle_empty_tag('<?{}?>'.format(data), is_block=True)

    def unknown_decl(self, data):
        """Re-emit a marked section (`<![data`) as a block-level empty tag."""
        # CDATA sections are closed with `]]>`; anything else with `]>`.
        end = ']]>' if data.startswith('CDATA[') else ']>'
        self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True)

    def parse_pi(self, i):
        """Parse a processing instruction only at a line start or in a block's tail."""
        if self.at_line_start() or self.intail:
            return super().parse_pi(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<?')
        return i + 2

    def parse_html_declaration(self, i):
        """Parse an HTML declaration only at a line start or in a block's tail."""
        if self.at_line_start() or self.intail:
            return super().parse_html_declaration(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<!')
        return i + 2

    # The rest has been copied from base class in standard lib to address #1036.
    # As __starttag_text is private, all references to it must be in this subclass.
    # The last few lines of parse_starttag are reversed so that handle_starttag
    # can override cdata_mode in certain situations (in a code span).
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def parse_starttag(self, i):  # pragma: no cover
        """
        Parse the start tag beginning at index `i` of `self.rawdata`.

        Returns the index just past the tag, or the negative value returned by
        `check_for_whole_start_tag` when the tag is not complete.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = htmlparser.attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:  # noqa: E127
                # Strip the matching pair of surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = htmlparser.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Junk between the last attribute and the closing bracket:
            # pass the whole tag through as plain data.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            # *** set cdata_mode first so we can override it in handle_starttag (see #1036) ***
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
            self.handle_starttag(tag, attrs)
        return endpos