Blame - markdown/inlinepatterns.py - chromium.googlesource.com/chromium/src/third_party/Python-Markdown

blob: eb313bd40b3685f0403cad6a52a18ae5bc64acbc [file] [log] [blame]

dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	1	"""
Yu-Ping Wu	6a8f3a2	2021-11-24 00:45:03 +0000	[diff] [blame]	2	Python Markdown
				3
				4	A Python implementation of John Gruber's Markdown.
				5
				6	Documentation: https://python-markdown.github.io/
				7	GitHub: https://github.com/Python-Markdown/markdown/
				8	PyPI: https://pypi.org/project/Markdown/
				9
				10	Started by Manfred Stienstra (http://www.dwerg.net/).
				11	Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
				12	Currently maintained by Waylan Limberg (https://github.com/waylan),
				13	Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
				14
				15	Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
				16	Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
				17	Copyright 2004 Manfred Stienstra (the original version)
				18
				19	License: BSD (see LICENSE.md for details).
				20
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	21	INLINE PATTERNS
				22	=============================================================================
				23
				24	Inline patterns such as emphasis are handled by means of auxiliary
				25	objects, one per pattern. Pattern objects must be instances of classes
				26	that extend markdown.Pattern. Each pattern object uses a single regular
				27	expression and needs to support the following methods:
				28
				29	pattern.getCompiledRegExp() # returns a regular expression
				30
				31	pattern.handleMatch(m) # takes a match object and returns
				32	# an ElementTree element or just plain text
				33
				34	All of python markdown's built-in patterns subclass from Pattern,
				35	but you can add additional patterns that don't.
				36
				37	Also note that all the regular expressions used by inline must
				38	capture the whole block. For this reason, they all start with
				39	'^(.*)' and end with '(.*)!'. For the built-in expressions,
				40	Pattern takes care of adding the "^(.*)" and "(.*)!".
				41
				42	Finally, the order in which regular expressions are applied is very
				43	important - e.g. if we first replace http://.../ links with <a> tags
				44	and _then_ try to replace inline html, we would end up with a mess.
				45	So, we apply the expressions in the following order:
				46
				47	* escape and backticks have to go before everything else, so
				48	that we can preempt any markdown patterns by escaping them.
				49
				50	* then we handle auto-links (must be done before inline html)
				51
				52	* then we handle inline HTML. At this point we will simply
				53	replace all inline HTML strings with a placeholder and add
				54	the actual HTML to a hash.
				55
				56	* then inline images (must be done before links)
				57
				58	* then bracketed links, first regular then reference-style
				59
				60	* finally we apply strong and emphasis
				61	"""
				62
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	63	from . import util
Yu-Ping Wu	6a8f3a2	2021-11-24 00:45:03 +0000	[diff] [blame]	64	from collections import namedtuple
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	65	import re
Yu-Ping Wu	6a8f3a2	2021-11-24 00:45:03 +0000	[diff] [blame]	66	import xml.etree.ElementTree as etree
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	67	try: # pragma: no cover
				68	from html import entities
				69	except ImportError: # pragma: no cover
				70	import htmlentitydefs as entities
				71
				72
def build_inlinepatterns(md, **kwargs):
    """Return a registry holding the default inline processors.

    Each default processor is created and registered in `util.Registry`
    under a name and a numeric priority.

    Keyword arguments:

    * md: The Markdown instance handed to the processors that accept one.
    * kwargs: Accepted for call compatibility; not used here.

    """
    registry = util.Registry()
    # (processor, registered name, priority) - listed in descending priority.
    defaults = (
        (BacktickInlineProcessor(BACKTICK_RE), 'backtick', 190),
        (EscapeInlineProcessor(ESCAPE_RE, md), 'escape', 180),
        (ReferenceInlineProcessor(REFERENCE_RE, md), 'reference', 170),
        (LinkInlineProcessor(LINK_RE, md), 'link', 160),
        (ImageInlineProcessor(IMAGE_LINK_RE, md), 'image_link', 150),
        (ImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'image_reference', 140),
        (ShortReferenceInlineProcessor(REFERENCE_RE, md), 'short_reference', 130),
        (ShortImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'short_image_ref', 125),
        (AutolinkInlineProcessor(AUTOLINK_RE, md), 'autolink', 120),
        (AutomailInlineProcessor(AUTOMAIL_RE, md), 'automail', 110),
        (SubstituteTagInlineProcessor(LINE_BREAK_RE, 'br'), 'linebreak', 100),
        (HtmlInlineProcessor(HTML_RE, md), 'html', 90),
        (HtmlInlineProcessor(ENTITY_RE, md), 'entity', 80),
        (SimpleTextInlineProcessor(NOT_STRONG_RE), 'not_strong', 70),
        (AsteriskProcessor(r'\*'), 'em_strong', 60),
        (UnderscoreProcessor(r'_'), 'em_strong2', 50),
    )
    for processor, name, priority in defaults:
        registry.register(processor, name, priority)
    return registry
				99
Yu-Ping Wu	6a8f3a2	2021-11-24 00:45:03 +0000	[diff] [blame]	100
"""
The actual regular expressions for patterns
-----------------------------------------------------------------------------
"""

# Negative lookbehind: the following `[` must not be preceded by `!`
# (which would make it an image rather than a link).
NOIMG = r'(?<!\!)'

# `e=f()` or ``e=f("`")``
# Group 1: an even run of backslashes directly before backticks.
# Group 2: the opening backtick run; group 3: the code text.
BACKTICK_RE = r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\2(?!`))'

# \<
ESCAPE_RE = r'\\(.)'

# *emphasis*
EMPHASIS_RE = r'(\*)([^\*]+)\1'

# **strong**
STRONG_RE = r'(\*{2})(.+?)\1'

# __smart__strong__
SMART_STRONG_RE = r'(?<!\w)(_{2})(?!_)(.+?)(?<!_)\1(?!\w)'

# _smart_emphasis_
SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\1(?!\w)'

# __strong _em__
SMART_STRONG_EM_RE = r'(?<!\w)(\_)\1(?!\1)(.+?)(?<!\w)\1(?!\1)(.+?)\1{3}(?!\w)'

# ***strongem*** or ***em*strong**
EM_STRONG_RE = r'(\*)\1{2}(.+?)\1(.*?)\1{2}'

# ___strongem___ or ___em_strong__
EM_STRONG2_RE = r'(_)\1{2}(.+?)\1(.*?)\1{2}'

# ***strong**em*
STRONG_EM_RE = r'(\*)\1{2}(.+?)\1{2}(.*?)\1'

# ___strong__em_
STRONG_EM2_RE = r'(_)\1{2}(.+?)\1{2}(.*?)\1'

# **strong*em***
STRONG_EM3_RE = r'(\*)\1(?!\1)([^*]+?)\1(?!\1)(.+?)\1{3}'

# [text](url) or [text](<url>) or [text](url "title")
# Only the opening bracket is matched here; the rest is parsed by the processor.
LINK_RE = NOIMG + r'\['

# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
IMAGE_LINK_RE = r'\!\['

# [Google][3]
REFERENCE_RE = LINK_RE

# ![alt text][2]
IMAGE_REFERENCE_RE = IMAGE_LINK_RE

# stand-alone * or _
NOT_STRONG_RE = r'((^|\s)(\*|_)(\s|$))'

# <http://www.123.com>
AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^<>]*)>'

# <me@example.com>
AUTOMAIL_RE = r'<([^<> !]+@[^@<> ]+)>'

# <...>
HTML_RE = r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|!--(?:(?!<!--|-->).)*--)>)'

# "&#38;" (decimal) or "&#x26;" (hex) or "&amp;" (named)
ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)'

# two spaces at end of line
LINE_BREAK_RE = r'  \n'
				173
				174
				175	def dequote(string):
				176	"""Remove quotes from around a string."""
				177	if ((string.startswith('"') and string.endswith('"')) or
				178	(string.startswith("'") and string.endswith("'"))):
				179	return string[1:-1]
				180	else:
				181	return string
				182
				183
class EmStrongItem(namedtuple('EmStrongItem', ['pattern', 'builder', 'tags'])):
    """Emphasis/strong pattern item.

    Fields:

    * pattern: The compiled regular expression to try.
    * builder: Name of the builder to use (`'single'`, `'double'` or `'double2'`).
    * tags: Tag name, or two comma-separated tag names, passed to the builder.

    """

    # Keep instances as light as a plain namedtuple (no per-instance `__dict__`).
    __slots__ = ()
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	186
				187
				188	"""
				189	The pattern classes
				190	-----------------------------------------------------------------------------
				191	"""
				192
				193
class Pattern:  # pragma: no cover
    """Base class that inline patterns subclass.

    The supplied expression is wrapped as `^(.*?)<pattern>(.*)$`, so group 1
    is the text before the match and the last group is the text after it;
    groups of the supplied expression are therefore numbered from 2.
    """

    # Not referenced in this part of the file; kept for subclasses and callers.
    ANCESTOR_EXCLUDES = tuple()

    def __init__(self, pattern, md=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * md: The Markdown instance, or None

        """
        self.pattern = pattern
        self.compiled_re = re.compile(r"^(.*?)%s(.*)$" % pattern,
                                      re.DOTALL | re.UNICODE)

        self.md = md

    def getCompiledRegExp(self):
        """ Return a compiled regular expression. """
        return self.compiled_re

    def handleMatch(self, m):
        """Return a ElementTree element from the given match.

        Subclasses should override this method.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.

        """
        pass  # pragma: no cover

    def type(self):
        """ Return class name, to define pattern type """
        return self.__class__.__name__

    def unescape(self, text):
        """ Return unescaped text given text with an inline placeholder. """
        try:
            stash = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def get_stash(m):
            key = m.group(1)
            if key in stash:
                value = stash.get(key)
                if isinstance(value, str):
                    return value
                else:
                    # An etree Element - return text content only
                    return ''.join(value.itertext())
        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
				251
				252
class InlineProcessor(Pattern):
    """
    Base class that inline patterns subclass.

    This is the newer style inline processor that uses a more
    efficient and flexible search approach.

    Unlike `Pattern`, the expression is compiled exactly as given (it is not
    wrapped in leading and trailing catch-all groups), and `handleMatch`
    receives the buffer and reports the span it consumed.
    """

    def __init__(self, pattern, md=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * md: The Markdown instance, or None

        """
        self.pattern = pattern
        self.compiled_re = re.compile(pattern, re.DOTALL | re.UNICODE)

        # Api for Markdown to pass safe_mode into instance
        self.safe_mode = False
        self.md = md

    def handleMatch(self, m, data):
        """Return a ElementTree element from the given match and the
        start and end index of the matched text.

        If `start` and/or `end` are returned as `None`, it will be
        assumed that the processor did not find a valid region of text.

        Subclasses should override this method.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.
        * data: The buffer current under analysis

        Returns:

        * el: The ElementTree element, text or None.
        * start: The start of the region that has been matched or None.
        * end: The end of the region that has been matched or None.

        """
        pass  # pragma: no cover
				299
				300
				301	class SimpleTextPattern(Pattern): # pragma: no cover
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	302	""" Return a simple text of group(2) of a Pattern. """
				303	def handleMatch(self, m):
				304	return m.group(2)
				305
				306
class SimpleTextInlineProcessor(InlineProcessor):
    """Inline processor that yields the plain text captured by group(1)."""

    def handleMatch(self, m, data):
        """Return group 1 of `m` together with the span of the whole match."""
        begin, finish = m.start(0), m.end(0)
        return m.group(1), begin, finish
				311
				312
				313	class EscapeInlineProcessor(InlineProcessor):
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	314	""" Return an escaped character. """
				315
Yu-Ping Wu	6a8f3a2	2021-11-24 00:45:03 +0000	[diff] [blame]	316	def handleMatch(self, m, data):
				317	char = m.group(1)
				318	if char in self.md.ESCAPED_CHARS:
				319	return '{}{}{}'.format(util.STX, ord(char), util.ETX), m.start(0), m.end(0)
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	320	else:
Yu-Ping Wu	6a8f3a2	2021-11-24 00:45:03 +0000	[diff] [blame]	321	return None, m.start(0), m.end(0)
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	322
				323
class SimpleTagPattern(Pattern):  # pragma: no cover
    """
    Legacy pattern producing a `tag` element whose text is group(3)
    of the match.

    """

    def __init__(self, pattern, tag):
        """Compile `pattern` via `Pattern` and remember the tag name to emit."""
        Pattern.__init__(self, pattern)
        self.tag = tag

    def handleMatch(self, m):
        """Return a new `self.tag` element holding the text of group 3."""
        node = etree.Element(self.tag)
        node.text = m.group(3)
        return node
				338
				339
class SimpleTagInlineProcessor(InlineProcessor):
    """
    Inline processor producing a `tag` element whose text is group(2)
    of the match.

    """

    def __init__(self, pattern, tag):
        """Compile `pattern` via `InlineProcessor` and remember the tag name."""
        InlineProcessor.__init__(self, pattern)
        self.tag = tag

    def handleMatch(self, m, data):  # pragma: no cover
        """Return a `self.tag` element with group 2 as text, plus the match span."""
        node = etree.Element(self.tag)
        node.text = m.group(2)
        begin, finish = m.start(0), m.end(0)
        return node, begin, finish
				354
				355
				356	class SubstituteTagPattern(SimpleTagPattern): # pragma: no cover
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	357	""" Return an element of type `tag` with no children. """
				358	def handleMatch(self, m):
Yu-Ping Wu	6a8f3a2	2021-11-24 00:45:03 +0000	[diff] [blame]	359	return etree.Element(self.tag)
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	360
				361
class SubstituteTagInlineProcessor(SimpleTagInlineProcessor):
    """Inline processor that replaces the match with an empty `tag` element."""

    def handleMatch(self, m, data):
        """Return a new, childless `self.tag` element and the match span."""
        empty = etree.Element(self.tag)
        return empty, m.start(0), m.end(0)
				366
				367
				368	class BacktickInlineProcessor(InlineProcessor):
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	369	""" Return a `<code>` element containing the matching text. """
				370	def __init__(self, pattern):
Yu-Ping Wu	6a8f3a2	2021-11-24 00:45:03 +0000	[diff] [blame]	371	InlineProcessor.__init__(self, pattern)
				372	self.ESCAPED_BSLASH = '{}{}{}'.format(util.STX, ord('\\'), util.ETX)
				373	self.tag = 'code'
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	374
Yu-Ping Wu	6a8f3a2	2021-11-24 00:45:03 +0000	[diff] [blame]	375	def handleMatch(self, m, data):
				376	if m.group(3):
				377	el = etree.Element(self.tag)
				378	el.text = util.AtomicString(util.code_escape(m.group(3).strip()))
				379	return el, m.start(0), m.end(0)
				380	else:
				381	return m.group(1).replace('\\\\', self.ESCAPED_BSLASH), m.start(0), m.end(0)
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	382
				383
class DoubleTagPattern(SimpleTagPattern):  # pragma: no cover
    """Legacy pattern producing an element of tag2 nested inside tag1.

    `self.tag` holds the two tag names separated by a comma.
    Useful for strong emphasis etc.

    """

    def handleMatch(self, m):
        """Return the outer element; group 3 becomes the inner element's text.

        When the match has exactly five groups, group 4 is stored as the
        inner element's tail.
        """
        outer_tag, inner_tag = self.tag.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.SubElement(outer, inner_tag)
        inner.text = m.group(3)
        if len(m.groups()) == 5:
            inner.tail = m.group(4)
        return outer
				398
				399
class DoubleTagInlineProcessor(SimpleTagInlineProcessor):
    """Inline processor producing an element of tag2 nested inside tag1.

    `self.tag` holds the two tag names separated by a comma.
    Useful for strong emphasis etc.

    """

    def handleMatch(self, m, data):  # pragma: no cover
        """Return the outer element and the match span.

        Group 2 becomes the inner element's text. When the match has exactly
        three groups, group 3 is stored as the inner element's tail.
        """
        outer_tag, inner_tag = self.tag.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.SubElement(outer, inner_tag)
        inner.text = m.group(2)
        if len(m.groups()) == 3:
            inner.tail = m.group(3)
        return outer, m.start(0), m.end(0)
				414
				415
				416	class HtmlInlineProcessor(InlineProcessor):
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	417	""" Store raw inline html and return a placeholder. """
Yu-Ping Wu	6a8f3a2	2021-11-24 00:45:03 +0000	[diff] [blame]	418	def handleMatch(self, m, data):
				419	rawhtml = self.unescape(m.group(1))
				420	place_holder = self.md.htmlStash.store(rawhtml)
				421	return place_holder, m.start(0), m.end(0)
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	422
				423	def unescape(self, text):
				424	""" Return unescaped text given text with an inline placeholder. """
				425	try:
Yu-Ping Wu	6a8f3a2	2021-11-24 00:45:03 +0000	[diff] [blame]	426	stash = self.md.treeprocessors['inline'].stashed_nodes
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	427	except KeyError: # pragma: no cover
				428	return text
				429
				430	def get_stash(m):
				431	id = m.group(1)
				432	value = stash.get(id)
				433	if value is not None:
				434	try:
Yu-Ping Wu	6a8f3a2	2021-11-24 00:45:03 +0000	[diff] [blame]	435	return self.md.serializer(value)
				436	except Exception:
				437	return r'\%s' % value
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	438
				439	return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
				440
				441
class AsteriskProcessor(InlineProcessor):
    """Emphasis processor for handling strong and em matches inside asterisks."""

    # Candidate patterns, tried in this order. Each item names the builder
    # used to turn a match into elements and the tag(s) it should produce.
    PATTERNS = [
        EmStrongItem(re.compile(EM_STRONG_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'),
        EmStrongItem(re.compile(STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'),
        EmStrongItem(re.compile(STRONG_EM3_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'),
        EmStrongItem(re.compile(STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'),
        EmStrongItem(re.compile(EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em')
    ]

    def build_single(self, m, tag, idx):
        """Return single tag."""
        el1 = etree.Element(tag)
        text = m.group(2)
        self.parse_sub_patterns(text, el1, None, idx)
        return el1

    def build_double(self, m, tags, idx):
        """Return double tag."""

        tag1, tag2 = tags.split(",")
        el1 = etree.Element(tag1)
        el2 = etree.Element(tag2)
        text = m.group(2)
        self.parse_sub_patterns(text, el2, None, idx)
        el1.append(el2)
        if len(m.groups()) == 3:
            # Group 3 is the text that follows the inner element inside the outer one.
            text = m.group(3)
            self.parse_sub_patterns(text, el1, el2, idx)
        return el1

    def build_double2(self, m, tags, idx):
        """Return double tags (variant 2): `<strong>text <em>text</em></strong>`."""

        tag1, tag2 = tags.split(",")
        el1 = etree.Element(tag1)
        el2 = etree.Element(tag2)
        text = m.group(2)
        self.parse_sub_patterns(text, el1, None, idx)
        text = m.group(3)
        el1.append(el2)
        self.parse_sub_patterns(text, el2, None, idx)
        return el1

    def parse_sub_patterns(self, data, parent, last, idx):
        """
        Parses sub patterns.

        `data` (`str`):
            text to evaluate.

        `parent` (`etree.Element`):
            Parent to attach text and sub elements to.

        `last` (`etree.Element`):
            Last appended child to parent. Can also be None if parent has no children.

        `idx` (`int`):
            Current pattern index that was used to evaluate the parent.

        """

        offset = 0
        pos = 0

        length = len(data)
        while pos < length:
            # Find the start of potential emphasis or strong tokens
            if self.compiled_re.match(data, pos):
                matched = False
                # See if the we can match an emphasis/strong pattern
                for index, item in enumerate(self.PATTERNS):
                    # Only evaluate patterns that are after what was used on the parent
                    if index <= idx:
                        continue
                    m = item.pattern.match(data, pos)
                    if m:
                        # Append child nodes to parent
                        # Text nodes should be appended to the last
                        # child if present, and if not, it should
                        # be added as the parent's text node.
                        text = data[offset:m.start(0)]
                        if text:
                            if last is not None:
                                last.tail = text
                            else:
                                parent.text = text
                        el = self.build_element(m, item.builder, item.tags, index)
                        parent.append(el)
                        last = el
                        # Move our position past the matched hunk
                        offset = pos = m.end(0)
                        matched = True
                if not matched:
                    # We matched nothing, move on to the next character
                    pos += 1
            else:
                # Increment position as no potential emphasis start was found.
                pos += 1

        # Append any leftover text as a text node.
        text = data[offset:]
        if text:
            if last is not None:
                last.tail = text
            else:
                parent.text = text

    def build_element(self, m, builder, tags, index):
        """Element builder.

        Dispatches on the builder name: `'double2'` and `'double'` use the
        matching `build_*` method; any other name uses `build_single`.
        """

        if builder == 'double2':
            return self.build_double2(m, tags, index)
        elif builder == 'double':
            return self.build_double(m, tags, index)
        else:
            return self.build_single(m, tags, index)

    def handleMatch(self, m, data):
        """Parse patterns.

        Tries each item of `PATTERNS` at the start of match `m` and builds
        the element for the first one that matches. Returns `(el, start, end)`;
        all three are `None` when no pattern matches.
        """

        el = None
        start = None
        end = None

        for index, item in enumerate(self.PATTERNS):
            m1 = item.pattern.match(data, m.start(0))
            if m1:
                start = m1.start(0)
                end = m1.end(0)
                el = self.build_element(m1, item.builder, item.tags, index)
                break
        return el, start, end
				576
				577
				578	class UnderscoreProcessor(AsteriskProcessor):
				579	"""Emphasis processor for handling strong and em matches inside underscores."""
				580
				581	PATTERNS = [
				582	EmStrongItem(re.compile(EM_STRONG2_RE, re.DOTALL \| re.UNICODE), 'double', 'strong,em'),
				583	EmStrongItem(re.compile(STRONG_EM2_RE, re.DOTALL \| re.UNICODE), 'double', 'em,strong'),
				584	EmStrongItem(re.compile(SMART_STRONG_EM_RE, re.DOTALL \| re.UNICODE), 'double2', 'strong,em'),
				585	EmStrongItem(re.compile(SMART_STRONG_RE, re.DOTALL \| re.UNICODE), 'single', 'strong'),
				586	EmStrongItem(re.compile(SMART_EMPHASIS_RE, re.DOTALL \| re.UNICODE), 'single', 'em')
				587	]
				588
				589
				590	class LinkInlineProcessor(InlineProcessor):
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	591	""" Return a link element from the given match. """
Yu-Ping Wu	6a8f3a2	2021-11-24 00:45:03 +0000	[diff] [blame]	592	RE_LINK = re.compile(r'''$\s(?:(<[^<>]>)\s(?:('[^']'\|"[^"]")\s)?$)?''', re.DOTALL \| re.UNICODE)
				593	RE_TITLE_CLEAN = re.compile(r'\s')
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	594
Yu-Ping Wu	6a8f3a2	2021-11-24 00:45:03 +0000	[diff] [blame]	595	def handleMatch(self, m, data):
				596	text, index, handled = self.getText(data, m.end(0))
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	597
Yu-Ping Wu	6a8f3a2	2021-11-24 00:45:03 +0000	[diff] [blame]	598	if not handled:
				599	return None, None, None
				600
				601	href, title, index, handled = self.getLink(data, index)
				602	if not handled:
				603	return None, None, None
				604
				605	el = etree.Element("a")
				606	el.text = text
				607
				608	el.set("href", href)
				609
				610	if title is not None:
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	611	el.set("title", title)
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	612
Yu-Ping Wu	6a8f3a2	2021-11-24 00:45:03 +0000	[diff] [blame]	613	return el, m.start(0), index
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	614
Yu-Ping Wu	6a8f3a2	2021-11-24 00:45:03 +0000	[diff] [blame]	615	def getLink(self, data, index):
				616	"""Parse data between `()` of `[Text]()` allowing recursive `()`. """
				617
				618	href = ''
				619	title = None
				620	handled = False
				621
				622	m = self.RE_LINK.match(data, pos=index)
				623	if m and m.group(1):
				624	# Matches [Text](<link> "title")
				625	href = m.group(1)[1:-1].strip()
				626	if m.group(2):
				627	title = m.group(2)[1:-1]
				628	index = m.end(0)
				629	handled = True
				630	elif m:
				631	# Track bracket nesting and index in string
				632	bracket_count = 1
				633	backtrack_count = 1
				634	start_index = m.end()
				635	index = start_index
				636	last_bracket = -1
				637
				638	# Primary (first found) quote tracking.
				639	quote = None
				640	start_quote = -1
				641	exit_quote = -1
				642	ignore_matches = False
				643
				644	# Secondary (second found) quote tracking.
				645	alt_quote = None
				646	start_alt_quote = -1
				647	exit_alt_quote = -1
				648
				649	# Track last character
				650	last = ''
				651
				652	for pos in range(index, len(data)):
				653	c = data[pos]
				654	if c == '(':
				655	# Count nested (
				656	# Don't increment the bracket count if we are sure we're in a title.
				657	if not ignore_matches:
				658	bracket_count += 1
				659	elif backtrack_count > 0:
				660	backtrack_count -= 1
				661	elif c == ')':
				662	# Match nested ) to (
				663	# Don't decrement if we are sure we are in a title that is unclosed.
				664	if ((exit_quote != -1 and quote == last) or (exit_alt_quote != -1 and alt_quote == last)):
				665	bracket_count = 0
				666	elif not ignore_matches:
				667	bracket_count -= 1
				668	elif backtrack_count > 0:
				669	backtrack_count -= 1
揚帆起航	0f44735	2022-11-28 22:32:11 +0000	[diff] [blame^]	670	# We've found our backup end location if the title doesn't resolve.
Yu-Ping Wu	6a8f3a2	2021-11-24 00:45:03 +0000	[diff] [blame]	671	if backtrack_count == 0:
				672	last_bracket = index + 1
				673
				674	elif c in ("'", '"'):
				675	# Quote has started
				676	if not quote:
				677	# We'll assume we are now in a title.
				678	# Brackets are quoted, so no need to match them (except for the final one).
				679	ignore_matches = True
				680	backtrack_count = bracket_count
				681	bracket_count = 1
				682	start_quote = index + 1
				683	quote = c
				684	# Secondary quote (in case the first doesn't resolve): [text](link'"title")
				685	elif c != quote and not alt_quote:
				686	start_alt_quote = index + 1
				687	alt_quote = c
				688	# Update primary quote match
				689	elif c == quote:
				690	exit_quote = index + 1
				691	# Update secondary quote match
				692	elif alt_quote and c == alt_quote:
				693	exit_alt_quote = index + 1
				694
				695	index += 1
				696
				697	# Link is closed, so let's break out of the loop
				698	if bracket_count == 0:
				699	# Get the title if we closed a title string right before link closed
				700	if exit_quote >= 0 and quote == last:
				701	href = data[start_index:start_quote - 1]
				702	title = ''.join(data[start_quote:exit_quote - 1])
				703	elif exit_alt_quote >= 0 and alt_quote == last:
				704	href = data[start_index:start_alt_quote - 1]
				705	title = ''.join(data[start_alt_quote:exit_alt_quote - 1])
				706	else:
				707	href = data[start_index:index - 1]
				708	break
				709
				710	if c != ' ':
				711	last = c
				712
				713	# We have a scenario: [test](link"notitle)
				714	# When we enter a string, we stop tracking bracket resolution in the main counter,
				715	# but we do keep a backup counter up until we discover where we might resolve all brackets
				716	# if the title string fails to resolve.
				717	if bracket_count != 0 and backtrack_count == 0:
				718	href = data[start_index:last_bracket - 1]
				719	index = last_bracket
				720	bracket_count = 0
				721
				722	handled = bracket_count == 0
				723
				724	if title is not None:
				725	title = self.RE_TITLE_CLEAN.sub(' ', dequote(self.unescape(title.strip())))
				726
				727	href = self.unescape(href).strip()
				728
				729	return href, title, index, handled
				730
    def getText(self, data, index):
        """Collect the text inside the `[]` of a link or image.

        Scanning begins at `index` and keeps a running bracket depth so
        that nested `[...]` pairs stay part of the collected text.

        Returns a tuple `(text, index, handled)`:

        * `text` - the characters gathered before the closing `]`;
        * `index` - the position just past the last character consumed;
        * `handled` - `True` only if the matching `]` was found.
        """
        depth = 1  # one `[` is already open when scanning starts
        collected = []
        stop = len(data)
        while index < stop:
            ch = data[index]
            index += 1
            if ch == '[':
                depth += 1
            elif ch == ']':
                depth -= 1
                if depth == 0:
                    # The matching close bracket is consumed but not kept.
                    break
            collected.append(ch)
        return ''.join(collected), index, depth == 0
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	749
				750
class ImageInlineProcessor(LinkInlineProcessor):
    """Build an `img` element from an inline image match."""

    def handleMatch(self, m, data):
        """Parse the alt text and link that follow the match `m` in `data`.

        Returns `(element, start, end)` on success, or `(None, None, None)`
        when either the bracketed text or the link part cannot be parsed.
        """
        alt, pos, ok = self.getText(data, m.end(0))
        if ok:
            # Only attempt the link part once the bracketed text resolved.
            src, title, pos, ok = self.getLink(data, pos)
        if not ok:
            return None, None, None

        img = etree.Element("img")
        img.set("src", src)
        if title is not None:
            img.set("title", title)
        img.set('alt', self.unescape(alt))

        return img, m.start(0), pos
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	772
				773
class ReferenceInlineProcessor(LinkInlineProcessor):
    """ Match to a stored reference and return link element. """
    # Collapses any run of whitespace (including line breaks) in an id.
    NEWLINE_CLEANUP_RE = re.compile(r'\s+', re.MULTILINE)

    # Matches the `[id]` part, optionally preceded by one whitespace character.
    RE_LINK = re.compile(r'\s?\[([^\]]*)\]', re.DOTALL | re.UNICODE)

    def handleMatch(self, m, data):
        """Resolve a `[text][id]` reference against `self.md.references`.

        Returns `(element, start, end)` when the id is defined,
        `(None, start, end)` when the id is not a known reference, and
        `(None, None, None)` when the text or the id cannot be parsed.
        """
        text, index, handled = self.getText(data, m.end(0))
        if not handled:
            return None, None, None

        ref_id, end, handled = self.evalId(data, index, text)
        if not handled:
            return None, None, None

        # Clean up linebreaks in id
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)
        if ref_id not in self.md.references:  # ignore undefined refs
            return None, m.start(0), end

        href, title = self.md.references[ref_id]

        return self.makeTag(href, title, text), m.start(0), end

    def evalId(self, data, index, text):
        """
        Evaluate the id portion of [ref][id].

        If [ref][] use [ref].

        Returns `(id, end, handled)`; `handled` is `False` (with `id` set
        to `None` and `end` left at `index`) when no `[id]` follows.
        """
        m = self.RE_LINK.match(data, pos=index)
        if not m:
            return None, index, False

        # An empty `[]` falls back to the link text; ids are lowercased.
        ref_id = m.group(1).lower() or text.lower()
        return ref_id, m.end(0), True

    def makeTag(self, href, title, text):
        """Return an `a` element for `href`, with `title` set only if truthy."""
        el = etree.Element('a')

        el.set('href', href)
        if title:
            el.set('title', title)

        el.text = text
        return el
				823
				824
class ShortReferenceInlineProcessor(ReferenceInlineProcessor):
    """Short form of reference: `[google]`."""

    def evalId(self, data, index, text):
        """Use the lowercased link text as the id.

        Nothing further is consumed: `index` is returned unchanged and
        the result is always reported as handled.
        """
        ref_id = text.lower()
        return ref_id, index, True
				831
				832
				833	class ImageReferenceInlineProcessor(ReferenceInlineProcessor):
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	834	""" Match to a stored reference and return img element. """
				835	def makeTag(self, href, title, text):
Yu-Ping Wu	6a8f3a2	2021-11-24 00:45:03 +0000	[diff] [blame]	836	el = etree.Element("img")
				837	el.set("src", href)
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	838	if title:
				839	el.set("title", title)
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	840	el.set("alt", self.unescape(text))
				841	return el
				842
				843
class ShortImageReferenceInlineProcessor(ImageReferenceInlineProcessor):
    """Short form of image reference: `![ref]`."""

    def evalId(self, data, index, text):
        """Use the lowercased bracket text as the id.

        `index` is passed back untouched and the id is always reported
        as handled.
        """
        ref_id = text.lower()
        return ref_id, index, True
				850
				851
				852	class AutolinkInlineProcessor(InlineProcessor):
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	853	""" Return a link Element given an autolink (`<http://example/com>`). """
Yu-Ping Wu	6a8f3a2	2021-11-24 00:45:03 +0000	[diff] [blame]	854	def handleMatch(self, m, data):
				855	el = etree.Element("a")
				856	el.set('href', self.unescape(m.group(1)))
				857	el.text = util.AtomicString(m.group(1))
				858	return el, m.start(0), m.end(0)
dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	859
				860
class AutomailInlineProcessor(InlineProcessor):
    """Turn an automail match into an `a` element with a `mailto:` href."""

    def handleMatch(self, m, data):
        """Return `(element, start, end)` for the address matched by `m`.

        Every character of the visible text and of the `href` is written
        as an entity reference prefixed with `util.AMP_SUBSTITUTE`: named
        entities where one exists for the visible text, numeric ones
        throughout the `href`.
        """
        address = self.unescape(m.group(1))
        if address.startswith("mailto:"):
            # Drop a leading scheme; it is re-added to the href below.
            address = address[len("mailto:"):]

        def codepoint2name(code):
            """Return entity definition by code, or the code if not defined."""
            entity = entities.codepoint2name.get(code)
            if not entity:
                return "%s#%d;" % (util.AMP_SUBSTITUTE, code)
            return "{}{};".format(util.AMP_SUBSTITUTE, entity)

        el = etree.Element('a')
        encoded_text = ''.join(codepoint2name(ord(ch)) for ch in address)
        el.text = util.AtomicString(encoded_text)

        full_target = "mailto:" + address
        encoded_href = "".join(
            util.AMP_SUBSTITUTE + '#%d;' % ord(ch) for ch in full_target
        )
        el.set('href', encoded_href)
        return el, m.start(0), m.end(0)