Blame - markdown/htmlparser.py - chromium.googlesource.com/chromium/src/third_party/Python-Markdown

blob: c08856ab8896a0790d14c895eb7198ba2b032552 [file] [log] [blame]

"""
Python Markdown

A Python implementation of John Gruber's Markdown.

Documentation: https://python-markdown.github.io/
GitHub: https://github.com/Python-Markdown/markdown/
PyPI: https://pypi.org/project/Markdown/

Started by Manfred Stienstra (http://www.dwerg.net/).
Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
Currently maintained by Waylan Limberg (https://github.com/waylan),
Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

Copyright 2007-2020 The Python Markdown Project (v. 1.7 and later)
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
Copyright 2004 Manfred Stienstra (the original version)

License: BSD (see LICENSE.md for details).
"""
				21
import importlib
import importlib.util
import re
import sys
				25
				26
				27	# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
				28	# Users can still do `from html import parser` and get the default behavior.
				29	spec = importlib.util.find_spec('html.parser')
				30	htmlparser = importlib.util.module_from_spec(spec)
				31	spec.loader.exec_module(htmlparser)
				32	sys.modules['htmlparser'] = htmlparser
				33
				34	# Monkeypatch HTMLParser to only accept `?>` to close Processing Instructions.
				35	htmlparser.piclose = re.compile(r'\?>')
				36	# Monkeypatch HTMLParser to only recognize entity references with a closing semicolon.
				37	htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
				38	# Monkeypatch HTMLParser to no longer support partial entities. We are always feeding a complete block,
				39	# so the 'incomplete' functionality is unnecessary. As the entityref regex is run right before incomplete,
				40	# and the two regex are the same, then incomplete will simply never match and we avoid the logic within.
				41	htmlparser.incomplete = htmlparser.entityref
				42	# Monkeypatch HTMLParser to not accept a backtick in a tag name, attribute name, or bare value.
				43	htmlparser.locatestarttagend_tolerant = re.compile(r"""
				44	<[a-zA-Z][^`\t\n\r\f />\x00]* # tag name <= added backtick here
				45	(?:[\s/]* # optional whitespace before attribute name
				46	(?:(?<=['"\s/])[^`\s/>][^\s/=>]* # attribute name <= added backtick here
				47	(?:\s=+\s # value indicator
				48	(?:'[^']*' # LITA-enclosed value
				49	\|"[^"]*" # LIT-enclosed value
				50	\|(?!['"])[^`>\s]* # bare value <= added backtick here
				51	)
				52	(?:\s,) # possibly followed by a comma
				53	)?(?:\s\|/(?!>))*
				54	)*
				55	)?
				56	\s* # trailing whitespace
				57	""", re.VERBOSE)
				58
				59	# Match a blank line at the start of a block of text (two newlines).
				60	# The newlines may be preceded by additional whitespace.
				61	blank_line_re = re.compile(r'^([ ]*\n){2}')
				62
				63
				64	class HTMLExtractor(htmlparser.HTMLParser):
				65	"""
				66	Extract raw HTML from text.
				67
				68	The raw HTML is stored in the `htmlStash` of the Markdown instance passed
				69	to `md` and the remaining text is stored in `cleandoc` as a list of strings.
				70	"""
				71
				72	def __init__(self, md, args, *kwargs):
				73	if 'convert_charrefs' not in kwargs:
				74	kwargs['convert_charrefs'] = False
				75
				76	# Block tags that should contain no content (self closing)
				77	self.empty_tags = set(['hr'])
				78
				79	# This calls self.reset
				80	super().__init__(args, *kwargs)
				81	self.md = md
				82
				83	def reset(self):
				84	"""Reset this instance. Loses all unprocessed data."""
				85	self.inraw = False
				86	self.intail = False
				87	self.stack = [] # When inraw==True, stack contains a list of tags
				88	self._cache = []
				89	self.cleandoc = []
				90	super().reset()
				91
				92	def close(self):
				93	"""Handle any buffered data."""
				94	super().close()
				95	if len(self.rawdata):
				96	# Temp fix for https://bugs.python.org/issue41989
				97	# TODO: remove this when the bug is fixed in all supported Python versions.
				98	if self.convert_charrefs and not self.cdata_elem: # pragma: no cover
				99	self.handle_data(htmlparser.unescape(self.rawdata))
				100	else:
				101	self.handle_data(self.rawdata)
				102	# Handle any unclosed tags.
				103	if len(self._cache):
				104	self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
				105	self._cache = []
				106
				107	@property
				108	def line_offset(self):
				109	"""Returns char index in self.rawdata for the start of the current line. """
				110	if self.lineno > 1 and '\n' in self.rawdata:
				111	m = re.match(r'([^\n]*\n){{{}}}'.format(self.lineno-1), self.rawdata)
				112	if m:
				113	return m.end()
				114	else: # pragma: no cover
				115	# Value of self.lineno must exceed total number of lines.
				116	# Find index of begining of last line.
				117	return self.rawdata.rfind('\n')
				118	return 0
				119
				120	def at_line_start(self):
				121	"""
				122	Returns True if current position is at start of line.
				123
				124	Allows for up to three blank spaces at start of line.
				125	"""
				126	if self.offset == 0:
				127	return True
				128	if self.offset > 3:
				129	return False
				130	# Confirm up to first 3 chars are whitespace
				131	return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == ''
				132
				133	def get_endtag_text(self, tag):
				134	"""
				135	Returns the text of the end tag.
				136
				137	If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
				138	"""
				139	# Attempt to extract actual tag from raw source text
				140	start = self.line_offset + self.offset
				141	m = htmlparser.endendtag.search(self.rawdata, start)
				142	if m:
				143	return self.rawdata[start:m.end()]
				144	else: # pragma: no cover
				145	# Failed to extract from raw data. Assume well formed and lowercase.
				146	return '</{}>'.format(tag)
				147
				148	def handle_starttag(self, tag, attrs):
				149	# Handle tags that should always be empty and do not specify a closing tag
				150	if tag in self.empty_tags:
				151	self.handle_startendtag(tag, attrs)
				152	return
				153
				154	if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
				155	# Started a new raw block. Prepare stack.
				156	self.inraw = True
				157	self.cleandoc.append('\n')
				158
				159	text = self.get_starttag_text()
				160	if self.inraw:
				161	self.stack.append(tag)
				162	self._cache.append(text)
				163	else:
				164	self.cleandoc.append(text)
				165	if tag in self.CDATA_CONTENT_ELEMENTS:
				166	# This is presumably a standalone tag in a code span (see #1036).
				167	self.clear_cdata_mode()
				168
				169	def handle_endtag(self, tag):
				170	text = self.get_endtag_text(tag)
				171
				172	if self.inraw:
				173	self._cache.append(text)
				174	if tag in self.stack:
				175	# Remove tag from stack
				176	while self.stack:
				177	if self.stack.pop() == tag:
				178	break
				179	if len(self.stack) == 0:
				180	# End of raw block.
				181	if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
				182	# Preserve blank line and end of raw block.
				183	self._cache.append('\n')
				184	else:
				185	# More content exists after endtag.
				186	self.intail = True
				187	# Reset stack.
				188	self.inraw = False
				189	self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
				190	# Insert blank line between this and next line.
				191	self.cleandoc.append('\n\n')
				192	self._cache = []
				193	else:
				194	self.cleandoc.append(text)
				195
				196	def handle_data(self, data):
				197	if self.intail and '\n' in data:
				198	self.intail = False
				199	if self.inraw:
				200	self._cache.append(data)
				201	else:
				202	self.cleandoc.append(data)
				203
				204	def handle_empty_tag(self, data, is_block):
				205	""" Handle empty tags (`<data>`). """
				206	if self.inraw or self.intail:
				207	# Append this to the existing raw block
				208	self._cache.append(data)
				209	elif self.at_line_start() and is_block:
				210	# Handle this as a standalone raw block
				211	if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
				212	# Preserve blank line after tag in raw block.
				213	data += '\n'
				214	else:
				215	# More content exists after tag.
				216	self.intail = True
				217	item = self.cleandoc[-1] if self.cleandoc else ''
				218	# If we only have one newline before block element, add another
				219	if not item.endswith('\n\n') and item.endswith('\n'):
				220	self.cleandoc.append('\n')
				221	self.cleandoc.append(self.md.htmlStash.store(data))
				222	# Insert blank line between this and next line.
				223	self.cleandoc.append('\n\n')
				224	else:
				225	self.cleandoc.append(data)
				226
				227	def handle_startendtag(self, tag, attrs):
				228	self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))
				229
				230	def handle_charref(self, name):
				231	self.handle_empty_tag('&#{};'.format(name), is_block=False)
				232
				233	def handle_entityref(self, name):
				234	self.handle_empty_tag('&{};'.format(name), is_block=False)
				235
				236	def handle_comment(self, data):
				237	self.handle_empty_tag('<!--{}-->'.format(data), is_block=True)
				238
				239	def handle_decl(self, data):
				240	self.handle_empty_tag('<!{}>'.format(data), is_block=True)
				241
				242	def handle_pi(self, data):
				243	self.handle_empty_tag('<?{}?>'.format(data), is_block=True)
				244
				245	def unknown_decl(self, data):
				246	end = ']]>' if data.startswith('CDATA[') else ']>'
				247	self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True)
				248
				249	def parse_pi(self, i):
				250	if self.at_line_start() or self.intail:
				251	return super().parse_pi(i)
				252	# This is not the beginning of a raw block so treat as plain data
				253	# and avoid consuming any tags which may follow (see #1066).
				254	self.handle_data('<?')
				255	return i + 2
				256
				257	def parse_html_declaration(self, i):
				258	if self.at_line_start() or self.intail:
				259	return super().parse_html_declaration(i)
				260	# This is not the beginning of a raw block so treat as plain data
				261	# and avoid consuming any tags which may follow (see #1066).
				262	self.handle_data('<!')
				263	return i + 2
				264
				265	# The rest has been copied from base class in standard lib to address #1036.
				266	# As __startag_text is private, all references to it must be in this subclass.
				267	# The last few lines of parse_starttag are reversed so that handle_starttag
				268	# can override cdata_mode in certain situations (in a code span).
				269	__starttag_text = None
				270
				271	def get_starttag_text(self):
				272	"""Return full source of start tag: '<...>'."""
				273	return self.__starttag_text
				274
				275	def parse_starttag(self, i): # pragma: no cover
				276	self.__starttag_text = None
				277	endpos = self.check_for_whole_start_tag(i)
				278	if endpos < 0:
				279	return endpos
				280	rawdata = self.rawdata
				281	self.__starttag_text = rawdata[i:endpos]
				282
				283	# Now parse the data between i+1 and j into a tag and attrs
				284	attrs = []
				285	match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
				286	assert match, 'unexpected call to parse_starttag()'
				287	k = match.end()
				288	self.lasttag = tag = match.group(1).lower()
				289	while k < endpos:
				290	m = htmlparser.attrfind_tolerant.match(rawdata, k)
				291	if not m:
				292	break
				293	attrname, rest, attrvalue = m.group(1, 2, 3)
				294	if not rest:
				295	attrvalue = None
				296	elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
				297	attrvalue[:1] == '"' == attrvalue[-1:]: # noqa: E127
				298	attrvalue = attrvalue[1:-1]
				299	if attrvalue:
				300	attrvalue = htmlparser.unescape(attrvalue)
				301	attrs.append((attrname.lower(), attrvalue))
				302	k = m.end()
				303
				304	end = rawdata[k:endpos].strip()
				305	if end not in (">", "/>"):
				306	lineno, offset = self.getpos()
				307	if "\n" in self.__starttag_text:
				308	lineno = lineno + self.__starttag_text.count("\n")
				309	offset = len(self.__starttag_text) \
				310	- self.__starttag_text.rfind("\n") # noqa: E127
				311	else:
				312	offset = offset + len(self.__starttag_text)
				313	self.handle_data(rawdata[i:endpos])
				314	return endpos
				315	if end.endswith('/>'):
				316	# XHTML-style empty tag: <span attr="value" />
				317	self.handle_startendtag(tag, attrs)
				318	else:
				319	# * set cdata_mode first so we can override it in handle_starttag (see #1036) *
				320	if tag in self.CDATA_CONTENT_ELEMENTS:
				321	self.set_cdata_mode(tag)
				322	self.handle_starttag(tag, attrs)
				323	return endpos