Blame - markdown/preprocessors.py - chromium.googlesource.com/chromium/src/third_party/Python-Markdown

blob: 7fd38d331fb5685a4c06f23e646c9d4d40e69b8b [file] [log] [blame]

dpranke	b08af21	2015-10-06 17:44:36 -0700	[diff] [blame^]	1	"""
				2	PRE-PROCESSORS
				3	=============================================================================
				4
				5	Preprocessors work on source text before we start doing anything too
				6	complicated.
				7	"""
				8
				9	from __future__ import absolute_import
				10	from __future__ import unicode_literals
				11	from . import util
				12	from . import odict
				13	import re
				14
				15
				16	def build_preprocessors(md_instance, **kwargs):
				17	""" Build the default set of preprocessors used by Markdown. """
				18	preprocessors = odict.OrderedDict()
				19	preprocessors['normalize_whitespace'] = NormalizeWhitespace(md_instance)
				20	if md_instance.safeMode != 'escape':
				21	preprocessors["html_block"] = HtmlBlockPreprocessor(md_instance)
				22	preprocessors["reference"] = ReferencePreprocessor(md_instance)
				23	return preprocessors
				24
				25
				26	class Preprocessor(util.Processor):
				27	"""
				28	Preprocessors are run after the text is broken into lines.
				29
				30	Each preprocessor implements a "run" method that takes a pointer to a
				31	list of lines of the document, modifies it as necessary and returns
				32	either the same pointer or a pointer to a new list.
				33
				34	Preprocessors must extend markdown.Preprocessor.
				35
				36	"""
				37	def run(self, lines):
				38	"""
				39	Each subclass of Preprocessor should override the `run` method, which
				40	takes the document as a list of strings split by newlines and returns
				41	the (possibly modified) list of lines.
				42
				43	"""
				44	pass # pragma: no cover
				45
				46
				47	class NormalizeWhitespace(Preprocessor):
				48	""" Normalize whitespace for consistant parsing. """
				49
				50	def run(self, lines):
				51	source = '\n'.join(lines)
				52	source = source.replace(util.STX, "").replace(util.ETX, "")
				53	source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
				54	source = source.expandtabs(self.markdown.tab_length)
				55	source = re.sub(r'(?<=\n) +\n', '\n', source)
				56	return source.split('\n')
				57
				58
				59	class HtmlBlockPreprocessor(Preprocessor):
				60	"""Remove html blocks from the text and store them for later retrieval."""
				61
				62	right_tag_patterns = ["</%s>", "%s>"]
				63	attrs_pattern = r"""
				64	\s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q) # attr="value"
				65	\| # OR
				66	\s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+) # attr=value
				67	\| # OR
				68	\s+(?P<attr2>[^>"'/= ]+) # attr
				69	"""
				70	left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s))\s\/?\>?' % \
				71	attrs_pattern
				72	attrs_re = re.compile(attrs_pattern, re.VERBOSE)
				73	left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
				74	markdown_in_raw = False
				75
				76	def _get_left_tag(self, block):
				77	m = self.left_tag_re.match(block)
				78	if m:
				79	tag = m.group('tag')
				80	raw_attrs = m.group('attrs')
				81	attrs = {}
				82	if raw_attrs:
				83	for ma in self.attrs_re.finditer(raw_attrs):
				84	if ma.group('attr'):
				85	if ma.group('value'):
				86	attrs[ma.group('attr').strip()] = ma.group('value')
				87	else:
				88	attrs[ma.group('attr').strip()] = ""
				89	elif ma.group('attr1'):
				90	if ma.group('value1'):
				91	attrs[ma.group('attr1').strip()] = ma.group(
				92	'value1'
				93	)
				94	else:
				95	attrs[ma.group('attr1').strip()] = ""
				96	elif ma.group('attr2'):
				97	attrs[ma.group('attr2').strip()] = ""
				98	return tag, len(m.group(0)), attrs
				99	else:
				100	tag = block[1:].split(">", 1)[0].lower()
				101	return tag, len(tag)+2, {}
				102
				103	def _recursive_tagfind(self, ltag, rtag, start_index, block):
				104	while 1:
				105	i = block.find(rtag, start_index)
				106	if i == -1:
				107	return -1
				108	j = block.find(ltag, start_index)
				109	# if no ltag, or rtag found before another ltag, return index
				110	if (j > i or j == -1):
				111	return i + len(rtag)
				112	# another ltag found before rtag, use end of ltag as starting
				113	# point and search again
				114	j = block.find('>', j)
				115	start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
				116	if start_index == -1:
				117	# HTML potentially malformed- ltag has no corresponding
				118	# rtag
				119	return -1
				120
				121	def _get_right_tag(self, left_tag, left_index, block):
				122	for p in self.right_tag_patterns:
				123	tag = p % left_tag
				124	i = self._recursive_tagfind(
				125	"<%s" % left_tag, tag, left_index, block
				126	)
				127	if i > 2:
				128	return tag.lstrip("<").rstrip(">"), i
				129	return block.rstrip()[-left_index:-1].lower(), len(block)
				130
				131	def _equal_tags(self, left_tag, right_tag):
				132	if left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
				133	return True
				134	if ("/" + left_tag) == right_tag:
				135	return True
				136	if (right_tag == "--" and left_tag == "--"):
				137	return True
				138	elif left_tag == right_tag[1:] and right_tag[0] == "/":
				139	return True
				140	else:
				141	return False
				142
				143	def _is_oneliner(self, tag):
				144	return (tag in ['hr', 'hr/'])
				145
				146	def _stringindex_to_listindex(self, stringindex, items):
				147	"""
				148	Same effect as concatenating the strings in items,
				149	finding the character to which stringindex refers in that string,
				150	and returning the index of the item in which that character resides.
				151	"""
				152	items.append('dummy')
				153	i, count = 0, 0
				154	while count <= stringindex:
				155	count += len(items[i])
				156	i += 1
				157	return i - 1
				158
				159	def _nested_markdown_in_html(self, items):
				160	"""Find and process html child elements of the given element block."""
				161	for i, item in enumerate(items):
				162	if self.left_tag_re.match(item):
				163	left_tag, left_index, attrs = \
				164	self._get_left_tag(''.join(items[i:]))
				165	right_tag, data_index = self._get_right_tag(
				166	left_tag, left_index, ''.join(items[i:]))
				167	right_listindex = \
				168	self._stringindex_to_listindex(data_index, items[i:]) + i
				169	if 'markdown' in attrs.keys():
				170	items[i] = items[i][left_index:] # remove opening tag
				171	placeholder = self.markdown.htmlStash.store_tag(
				172	left_tag, attrs, i + 1, right_listindex + 1)
				173	items.insert(i, placeholder)
				174	if len(items) - right_listindex <= 1: # last nest, no tail
				175	right_listindex -= 1
				176	items[right_listindex] = items[right_listindex][
				177	:-len(right_tag) - 2] # remove closing tag
				178	else: # raw html
				179	if len(items) - right_listindex <= 1: # last element
				180	right_listindex -= 1
				181	offset = 1 if i == right_listindex else 0
				182	placeholder = self.markdown.htmlStash.store('\n\n'.join(
				183	items[i:right_listindex + offset]))
				184	del items[i:right_listindex + offset]
				185	items.insert(i, placeholder)
				186	return items
				187
				188	def run(self, lines):
				189	text = "\n".join(lines)
				190	new_blocks = []
				191	text = text.rsplit("\n\n")
				192	items = []
				193	left_tag = ''
				194	right_tag = ''
				195	in_tag = False # flag
				196
				197	while text:
				198	block = text[0]
				199	if block.startswith("\n"):
				200	block = block[1:]
				201	text = text[1:]
				202
				203	if block.startswith("\n"):
				204	block = block[1:]
				205
				206	if not in_tag:
				207	if block.startswith("<") and len(block.strip()) > 1:
				208
				209	if block[1:4] == "!--":
				210	# is a comment block
				211	left_tag, left_index, attrs = "--", 2, {}
				212	else:
				213	left_tag, left_index, attrs = self._get_left_tag(block)
				214	right_tag, data_index = self._get_right_tag(left_tag,
				215	left_index,
				216	block)
				217	# keep checking conditions below and maybe just append
				218
				219	if data_index < len(block) and (util.isBlockLevel(left_tag) or left_tag == '--'):
				220	text.insert(0, block[data_index:])
				221	block = block[:data_index]
				222
				223	if not (util.isBlockLevel(left_tag) or block[1] in ["!", "?", "@", "%"]):
				224	new_blocks.append(block)
				225	continue
				226
				227	if self._is_oneliner(left_tag):
				228	new_blocks.append(block.strip())
				229	continue
				230
				231	if block.rstrip().endswith(">") \
				232	and self._equal_tags(left_tag, right_tag):
				233	if self.markdown_in_raw and 'markdown' in attrs.keys():
				234	block = block[left_index:-len(right_tag) - 2]
				235	new_blocks.append(self.markdown.htmlStash.
				236	store_tag(left_tag, attrs, 0, 2))
				237	new_blocks.extend([block])
				238	else:
				239	new_blocks.append(
				240	self.markdown.htmlStash.store(block.strip()))
				241	continue
				242	else:
				243	# if is block level tag and is not complete
				244	if (not self._equal_tags(left_tag, right_tag)) and \
				245	(util.isBlockLevel(left_tag) or left_tag == "--"):
				246	items.append(block.strip())
				247	in_tag = True
				248	else:
				249	new_blocks.append(
				250	self.markdown.htmlStash.store(block.strip())
				251	)
				252	continue
				253
				254	else:
				255	new_blocks.append(block)
				256
				257	else:
				258	items.append(block)
				259
				260	right_tag, data_index = self._get_right_tag(left_tag, 0, block)
				261
				262	if self._equal_tags(left_tag, right_tag):
				263	# if find closing tag
				264
				265	if data_index < len(block):
				266	# we have more text after right_tag
				267	items[-1] = block[:data_index]
				268	text.insert(0, block[data_index:])
				269
				270	in_tag = False
				271	if self.markdown_in_raw and 'markdown' in attrs.keys():
				272	items[0] = items[0][left_index:]
				273	items[-1] = items[-1][:-len(right_tag) - 2]
				274	if items[len(items) - 1]: # not a newline/empty string
				275	right_index = len(items) + 3
				276	else:
				277	right_index = len(items) + 2
				278	new_blocks.append(self.markdown.htmlStash.store_tag(
				279	left_tag, attrs, 0, right_index))
				280	placeholderslen = len(self.markdown.htmlStash.tag_data)
				281	new_blocks.extend(
				282	self._nested_markdown_in_html(items))
				283	nests = len(self.markdown.htmlStash.tag_data) - \
				284	placeholderslen
				285	self.markdown.htmlStash.tag_data[-1 - nests][
				286	'right_index'] += nests - 2
				287	else:
				288	new_blocks.append(
				289	self.markdown.htmlStash.store('\n\n'.join(items)))
				290	items = []
				291
				292	if items:
				293	if self.markdown_in_raw and 'markdown' in attrs.keys():
				294	items[0] = items[0][left_index:]
				295	items[-1] = items[-1][:-len(right_tag) - 2]
				296	if items[len(items) - 1]: # not a newline/empty string
				297	right_index = len(items) + 3
				298	else:
				299	right_index = len(items) + 2
				300	new_blocks.append(
				301	self.markdown.htmlStash.store_tag(
				302	left_tag, attrs, 0, right_index))
				303	placeholderslen = len(self.markdown.htmlStash.tag_data)
				304	new_blocks.extend(self._nested_markdown_in_html(items))
				305	nests = len(self.markdown.htmlStash.tag_data) - placeholderslen
				306	self.markdown.htmlStash.tag_data[-1 - nests][
				307	'right_index'] += nests - 2
				308	else:
				309	new_blocks.append(
				310	self.markdown.htmlStash.store('\n\n'.join(items)))
				311	new_blocks.append('\n')
				312
				313	new_text = "\n\n".join(new_blocks)
				314	return new_text.split("\n")
				315
				316
				317	class ReferencePreprocessor(Preprocessor):
				318	""" Remove reference definitions from text and store for later use. """
				319
				320	TITLE = r'[ ](\"(.)\"\|\'(.)\'\|$(.)$)[ ]*'
				321	RE = re.compile(
				322	r'^[ ]{0,3}\[([^\]])\]:\s([^ ])[ ](%s)?$' % TITLE, re.DOTALL
				323	)
				324	TITLE_RE = re.compile(r'^%s$' % TITLE)
				325
				326	def run(self, lines):
				327	new_text = []
				328	while lines:
				329	line = lines.pop(0)
				330	m = self.RE.match(line)
				331	if m:
				332	id = m.group(1).strip().lower()
				333	link = m.group(2).lstrip('<').rstrip('>')
				334	t = m.group(5) or m.group(6) or m.group(7)
				335	if not t:
				336	# Check next line for title
				337	tm = self.TITLE_RE.match(lines[0])
				338	if tm:
				339	lines.pop(0)
				340	t = tm.group(2) or tm.group(3) or tm.group(4)
				341	self.markdown.references[id] = (link, t)
				342	else:
				343	new_text.append(line)
				344
				345	return new_text # + "\n"