Blame - markdown/inlinepatterns.py - chromium.googlesource.com/chromium/src/third_party/Python-Markdown

blob: 95d358d7156ccd341f5c5f33a2a48d1255d122dd [file] [log] [blame]

dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame]	1	"""
				2	INLINE PATTERNS
				3	=============================================================================
				4
				5	Inline patterns such as emphasis are handled by means of auxiliary
				6	objects, one per pattern. Pattern objects must be instances of classes
				7	that extend markdown.Pattern. Each pattern object uses a single regular
				8	expression and needs to support the following methods:
				9
				10	pattern.getCompiledRegExp() # returns a regular expression
				11
				12	pattern.handleMatch(m) # takes a match object and returns
				13	# an ElementTree element or just plain text
				14
				15	All of python markdown's built-in patterns subclass from Pattern,
				16	but you can add additional patterns that don't.
				17
				18	Also note that all the regular expressions used by inline must
				19	capture the whole block. For this reason, they all start with
				20	'^(.*)' and end with '(.*)!'. For the built-in expressions,
				21	Pattern takes care of adding the "^(.*)" and "(.*)!".
				22
				23	Finally, the order in which regular expressions are applied is very
				24	important - e.g. if we first replace http://.../ links with <a> tags
				25	and _then_ try to replace inline html, we would end up with a mess.
				26	So, we apply the expressions in the following order:
				27
				28	* escape and backticks have to go before everything else, so
				29	that we can preempt any markdown patterns by escaping them.
				30
				31	* then we handle auto-links (must be done before inline html)
				32
				33	* then we handle inline HTML. At this point we will simply
				34	replace all inline HTML strings with a placeholder and add
				35	the actual HTML to a hash.
				36
				37	* then inline images (must be done before links)
				38
				39	* then bracketed links, first regular then reference-style
				40
				41	* finally we apply strong and emphasis
				42	"""
				43
				44	from __future__ import absolute_import
				45	from __future__ import unicode_literals
				46	from . import util
				47	from . import odict
				48	import re
				49	try: # pragma: no cover
				50	from urllib.parse import urlparse, urlunparse
				51	except ImportError: # pragma: no cover
				52	from urlparse import urlparse, urlunparse
				53	try: # pragma: no cover
				54	from html import entities
				55	except ImportError: # pragma: no cover
				56	import htmlentitydefs as entities
				57
				58
				59	def build_inlinepatterns(md_instance, **kwargs):
				60	""" Build the default set of inline patterns for Markdown. """
				61	inlinePatterns = odict.OrderedDict()
				62	inlinePatterns["backtick"] = BacktickPattern(BACKTICK_RE)
				63	inlinePatterns["escape"] = EscapePattern(ESCAPE_RE, md_instance)
				64	inlinePatterns["reference"] = ReferencePattern(REFERENCE_RE, md_instance)
				65	inlinePatterns["link"] = LinkPattern(LINK_RE, md_instance)
				66	inlinePatterns["image_link"] = ImagePattern(IMAGE_LINK_RE, md_instance)
				67	inlinePatterns["image_reference"] = ImageReferencePattern(
				68	IMAGE_REFERENCE_RE, md_instance
				69	)
				70	inlinePatterns["short_reference"] = ReferencePattern(
				71	SHORT_REF_RE, md_instance
				72	)
				73	inlinePatterns["autolink"] = AutolinkPattern(AUTOLINK_RE, md_instance)
				74	inlinePatterns["automail"] = AutomailPattern(AUTOMAIL_RE, md_instance)
				75	inlinePatterns["linebreak"] = SubstituteTagPattern(LINE_BREAK_RE, 'br')
				76	if md_instance.safeMode != 'escape':
				77	inlinePatterns["html"] = HtmlPattern(HTML_RE, md_instance)
				78	inlinePatterns["entity"] = HtmlPattern(ENTITY_RE, md_instance)
				79	inlinePatterns["not_strong"] = SimpleTextPattern(NOT_STRONG_RE)
				80	inlinePatterns["em_strong"] = DoubleTagPattern(EM_STRONG_RE, 'strong,em')
				81	inlinePatterns["strong_em"] = DoubleTagPattern(STRONG_EM_RE, 'em,strong')
				82	inlinePatterns["strong"] = SimpleTagPattern(STRONG_RE, 'strong')
				83	inlinePatterns["emphasis"] = SimpleTagPattern(EMPHASIS_RE, 'em')
				84	if md_instance.smart_emphasis:
				85	inlinePatterns["emphasis2"] = SimpleTagPattern(SMART_EMPHASIS_RE, 'em')
				86	else:
				87	inlinePatterns["emphasis2"] = SimpleTagPattern(EMPHASIS_2_RE, 'em')
				88	return inlinePatterns
				89
				90	"""
				91	The actual regular expressions for patterns
				92	-----------------------------------------------------------------------------
				93	"""
				94
				95	NOBRACKET = r'[^\]\[]*'
				96	BRK = (
				97	r'\[(' +
				98	(NOBRACKET + r'(\[')*6 +
				99	(NOBRACKET + r'\])')6 +
				100	NOBRACKET + r')\]'
				101	)
				102	NOIMG = r'(?<!\!)'
				103
				104	# `e=f()` or ``e=f("`")``
				105	BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)'
				106
				107	# \<
				108	ESCAPE_RE = r'\\(.)'
				109
				110	# emphasis
				111	EMPHASIS_RE = r'(\)([^\]+)\2'
				112
				113	# strong
				114	STRONG_RE = r'(\*{2}\|_{2})(.+?)\2'
				115
				116	# *strongem* or **emstrong**
				117	EM_STRONG_RE = r'(\\|_)\2{2}(.+?)\2(.?)\2{2}'
				118
				119	# *strongem*
				120	STRONG_EM_RE = r'(\\|_)\2{2}(.+?)\2{2}(.?)\2'
				121
				122	# _smart_emphasis_
				123	SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\2(?!\w)'
				124
				125	# _emphasis_
				126	EMPHASIS_2_RE = r'(_)(.+?)\2'
				127
				128	# [text](url) or [text](<url>) or [text](url "title")
				129	LINK_RE = NOIMG + BRK + \
				130	r'''$\s(<.?>\|((?:(?:\(.?$)\|[^]))?)\s((['"])(.?)\12\s*)?\)'''
				131
				132	# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
				133	IMAGE_LINK_RE = r'\!' + BRK + r'\s$(<.?>\|([^")]+"[^"]"\|[^$]))\)'
				134
				135	# [Google][3]
				136	REFERENCE_RE = NOIMG + BRK + r'\s?\[([^\]]*)\]'
				137
				138	# [Google]
				139	SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]'
				140
				141	# ![alt text][2]
				142	IMAGE_REFERENCE_RE = r'\!' + BRK + '\s?\[([^\]]*)\]'
				143
				144	# stand-alone * or _
				145	NOT_STRONG_RE = r'((^\| )(\*\|_)( \|$))'
				146
				147	# <http://www.123.com>
				148	AUTOLINK_RE = r'<((?:[Ff]\|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>'
				149
				150	# <me@example.com>
				151	AUTOMAIL_RE = r'<([^> \!]@[^> ])>'
				152
				153	# <...>
				154	HTML_RE = r'(\<([a-zA-Z/][^\>]?\|\!--.?--)\>)'
				155
				156	# &
				157	ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'
				158
				159	# two spaces at end of line
				160	LINE_BREAK_RE = r' \n'
				161
				162
				163	def dequote(string):
				164	"""Remove quotes from around a string."""
				165	if ((string.startswith('"') and string.endswith('"')) or
				166	(string.startswith("'") and string.endswith("'"))):
				167	return string[1:-1]
				168	else:
				169	return string
				170
				171
				172	ATTR_RE = re.compile("\{@([^\}])=([^\}])}") # {@id=123}
				173
				174
				175	def handleAttributes(text, parent):
				176	"""Set values of an element based on attribute definitions ({@id=123})."""
				177	def attributeCallback(match):
				178	parent.set(match.group(1), match.group(2).replace('\n', ' '))
				179	return ATTR_RE.sub(attributeCallback, text)
				180
				181
				182	"""
				183	The pattern classes
				184	-----------------------------------------------------------------------------
				185	"""
				186
				187
				188	class Pattern(object):
				189	"""Base class that inline patterns subclass. """
				190
				191	def __init__(self, pattern, markdown_instance=None):
				192	"""
				193	Create an instant of an inline pattern.
				194
				195	Keyword arguments:
				196
				197	* pattern: A regular expression that matches a pattern
				198
				199	"""
				200	self.pattern = pattern
				201	self.compiled_re = re.compile("^(.?)%s(.?)$" % pattern,
				202	re.DOTALL \| re.UNICODE)
				203
				204	# Api for Markdown to pass safe_mode into instance
				205	self.safe_mode = False
				206	if markdown_instance:
				207	self.markdown = markdown_instance
				208
				209	def getCompiledRegExp(self):
				210	""" Return a compiled regular expression. """
				211	return self.compiled_re
				212
				213	def handleMatch(self, m):
				214	"""Return a ElementTree element from the given match.
				215
				216	Subclasses should override this method.
				217
				218	Keyword arguments:
				219
				220	* m: A re match object containing a match of the pattern.
				221
				222	"""
				223	pass # pragma: no cover
				224
				225	def type(self):
				226	""" Return class name, to define pattern type """
				227	return self.__class__.__name__
				228
				229	def unescape(self, text):
				230	""" Return unescaped text given text with an inline placeholder. """
				231	try:
				232	stash = self.markdown.treeprocessors['inline'].stashed_nodes
				233	except KeyError: # pragma: no cover
				234	return text
				235
				236	def itertext(el): # pragma: no cover
				237	' Reimplement Element.itertext for older python versions '
				238	tag = el.tag
				239	if not isinstance(tag, util.string_type) and tag is not None:
				240	return
				241	if el.text:
				242	yield el.text
				243	for e in el:
				244	for s in itertext(e):
				245	yield s
				246	if e.tail:
				247	yield e.tail
				248
				249	def get_stash(m):
				250	id = m.group(1)
				251	if id in stash:
				252	value = stash.get(id)
				253	if isinstance(value, util.string_type):
				254	return value
				255	else:
				256	# An etree Element - return text content only
				257	return ''.join(itertext(value))
				258	return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
				259
				260
				261	class SimpleTextPattern(Pattern):
				262	""" Return a simple text of group(2) of a Pattern. """
				263	def handleMatch(self, m):
				264	return m.group(2)
				265
				266
				267	class EscapePattern(Pattern):
				268	""" Return an escaped character. """
				269
				270	def handleMatch(self, m):
				271	char = m.group(2)
				272	if char in self.markdown.ESCAPED_CHARS:
				273	return '%s%s%s' % (util.STX, ord(char), util.ETX)
				274	else:
				275	return None
				276
				277
				278	class SimpleTagPattern(Pattern):
				279	"""
				280	Return element of type `tag` with a text attribute of group(3)
				281	of a Pattern.
				282
				283	"""
				284	def __init__(self, pattern, tag):
				285	Pattern.__init__(self, pattern)
				286	self.tag = tag
				287
				288	def handleMatch(self, m):
				289	el = util.etree.Element(self.tag)
				290	el.text = m.group(3)
				291	return el
				292
				293
				294	class SubstituteTagPattern(SimpleTagPattern):
				295	""" Return an element of type `tag` with no children. """
				296	def handleMatch(self, m):
				297	return util.etree.Element(self.tag)
				298
				299
				300	class BacktickPattern(Pattern):
				301	""" Return a `<code>` element containing the matching text. """
				302	def __init__(self, pattern):
				303	Pattern.__init__(self, pattern)
				304	self.tag = "code"
				305
				306	def handleMatch(self, m):
				307	el = util.etree.Element(self.tag)
				308	el.text = util.AtomicString(m.group(3).strip())
				309	return el
				310
				311
				312	class DoubleTagPattern(SimpleTagPattern):
				313	"""Return a ElementTree element nested in tag2 nested in tag1.
				314
				315	Useful for strong emphasis etc.
				316
				317	"""
				318	def handleMatch(self, m):
				319	tag1, tag2 = self.tag.split(",")
				320	el1 = util.etree.Element(tag1)
				321	el2 = util.etree.SubElement(el1, tag2)
				322	el2.text = m.group(3)
				323	if len(m.groups()) == 5:
				324	el2.tail = m.group(4)
				325	return el1
				326
				327
				328	class HtmlPattern(Pattern):
				329	""" Store raw inline html and return a placeholder. """
				330	def handleMatch(self, m):
				331	rawhtml = self.unescape(m.group(2))
				332	place_holder = self.markdown.htmlStash.store(rawhtml)
				333	return place_holder
				334
				335	def unescape(self, text):
				336	""" Return unescaped text given text with an inline placeholder. """
				337	try:
				338	stash = self.markdown.treeprocessors['inline'].stashed_nodes
				339	except KeyError: # pragma: no cover
				340	return text
				341
				342	def get_stash(m):
				343	id = m.group(1)
				344	value = stash.get(id)
				345	if value is not None:
				346	try:
				347	return self.markdown.serializer(value)
				348	except:
				349	return '\%s' % value
				350
				351	return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
				352
				353
				354	class LinkPattern(Pattern):
				355	""" Return a link element from the given match. """
				356	def handleMatch(self, m):
				357	el = util.etree.Element("a")
				358	el.text = m.group(2)
				359	title = m.group(13)
				360	href = m.group(9)
				361
				362	if href:
				363	if href[0] == "<":
				364	href = href[1:-1]
				365	el.set("href", self.sanitize_url(self.unescape(href.strip())))
				366	else:
				367	el.set("href", "")
				368
				369	if title:
				370	title = dequote(self.unescape(title))
				371	el.set("title", title)
				372	return el
				373
				374	def sanitize_url(self, url):
				375	"""
				376	Sanitize a url against xss attacks in "safe_mode".
				377
				378	Rather than specifically blacklisting `javascript:alert("XSS")` and all
				379	its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known
				380	safe url formats. Most urls contain a network location, however some
				381	are known not to (i.e.: mailto links). Script urls do not contain a
				382	location. Additionally, for `javascript:...`, the scheme would be
				383	"javascript" but some aliases will appear to `urlparse()` to have no
				384	scheme. On top of that relative links (i.e.: "foo/bar.html") have no
				385	scheme. Therefore we must check "path", "parameters", "query" and
				386	"fragment" for any literal colons. We don't check "scheme" for colons
				387	because it should never have any and "netloc" must allow the form:
				388	`username:password@host:port`.
				389
				390	"""
				391	if not self.markdown.safeMode:
				392	# Return immediately bipassing parsing.
				393	return url
				394
				395	try:
				396	scheme, netloc, path, params, query, fragment = url = urlparse(url)
				397	except ValueError: # pragma: no cover
				398	# Bad url - so bad it couldn't be parsed.
				399	return ''
				400
				401	locless_schemes = ['', 'mailto', 'news']
				402	allowed_schemes = locless_schemes + ['http', 'https', 'ftp', 'ftps']
				403	if scheme not in allowed_schemes:
				404	# Not a known (allowed) scheme. Not safe.
				405	return ''
				406
				407	if netloc == '' and scheme not in locless_schemes: # pragma: no cover
				408	# This should not happen. Treat as suspect.
				409	return ''
				410
				411	for part in url[2:]:
				412	if ":" in part:
				413	# A colon in "path", "parameters", "query"
				414	# or "fragment" is suspect.
				415	return ''
				416
				417	# Url passes all tests. Return url as-is.
				418	return urlunparse(url)
				419
				420
				421	class ImagePattern(LinkPattern):
				422	""" Return a img element from the given match. """
				423	def handleMatch(self, m):
				424	el = util.etree.Element("img")
				425	src_parts = m.group(9).split()
				426	if src_parts:
				427	src = src_parts[0]
				428	if src[0] == "<" and src[-1] == ">":
				429	src = src[1:-1]
				430	el.set('src', self.sanitize_url(self.unescape(src)))
				431	else:
				432	el.set('src', "")
				433	if len(src_parts) > 1:
				434	el.set('title', dequote(self.unescape(" ".join(src_parts[1:]))))
				435
				436	if self.markdown.enable_attributes:
				437	truealt = handleAttributes(m.group(2), el)
				438	else:
				439	truealt = m.group(2)
				440
				441	el.set('alt', self.unescape(truealt))
				442	return el
				443
				444
				445	class ReferencePattern(LinkPattern):
				446	""" Match to a stored reference and return link element. """
				447
				448	NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)
				449
				450	def handleMatch(self, m):
				451	try:
				452	id = m.group(9).lower()
				453	except IndexError:
				454	id = None
				455	if not id:
				456	# if we got something like "[Google][]" or "[Goggle]"
				457	# we'll use "google" as the id
				458	id = m.group(2).lower()
				459
				460	# Clean up linebreaks in id
				461	id = self.NEWLINE_CLEANUP_RE.sub(' ', id)
				462	if id not in self.markdown.references: # ignore undefined refs
				463	return None
				464	href, title = self.markdown.references[id]
				465
				466	text = m.group(2)
				467	return self.makeTag(href, title, text)
				468
				469	def makeTag(self, href, title, text):
				470	el = util.etree.Element('a')
				471
				472	el.set('href', self.sanitize_url(href))
				473	if title:
				474	el.set('title', title)
				475
				476	el.text = text
				477	return el
				478
				479
				480	class ImageReferencePattern(ReferencePattern):
				481	""" Match to a stored reference and return img element. """
				482	def makeTag(self, href, title, text):
				483	el = util.etree.Element("img")
				484	el.set("src", self.sanitize_url(href))
				485	if title:
				486	el.set("title", title)
				487
				488	if self.markdown.enable_attributes:
				489	text = handleAttributes(text, el)
				490
				491	el.set("alt", self.unescape(text))
				492	return el
				493
				494
				495	class AutolinkPattern(Pattern):
				496	""" Return a link Element given an autolink (`<http://example/com>`). """
				497	def handleMatch(self, m):
				498	el = util.etree.Element("a")
				499	el.set('href', self.unescape(m.group(2)))
				500	el.text = util.AtomicString(m.group(2))
				501	return el
				502
				503
				504	class AutomailPattern(Pattern):
				505	"""
				506	Return a mailto link Element given an automail link (`<foo@example.com>`).
				507	"""
				508	def handleMatch(self, m):
				509	el = util.etree.Element('a')
				510	email = self.unescape(m.group(2))
				511	if email.startswith("mailto:"):
				512	email = email[len("mailto:"):]
				513
				514	def codepoint2name(code):
				515	"""Return entity definition by code, or the code if not defined."""
				516	entity = entities.codepoint2name.get(code)
				517	if entity:
				518	return "%s%s;" % (util.AMP_SUBSTITUTE, entity)
				519	else:
				520	return "%s#%d;" % (util.AMP_SUBSTITUTE, code)
				521
				522	letters = [codepoint2name(ord(letter)) for letter in email]
				523	el.text = util.AtomicString(''.join(letters))
				524
				525	mailto = "mailto:" + email
				526	mailto = "".join([util.AMP_SUBSTITUTE + '#%d;' %
				527	ord(letter) for letter in mailto])
				528	el.set('href', mailto)
				529	return el