blob: b0621a8287b68eea8dfee39876e1f23ccbce98d2 [file] [log] [blame]
dprankeb08af212015-10-06 17:44:36 -07001"""
Yu-Ping Wu6a8f3a22021-11-24 00:45:03 +00002Python Markdown
3
4A Python implementation of John Gruber's Markdown.
5
6Documentation: https://python-markdown.github.io/
7GitHub: https://github.com/Python-Markdown/markdown/
8PyPI: https://pypi.org/project/Markdown/
9
10Started by Manfred Stienstra (http://www.dwerg.net/).
11Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
12Currently maintained by Waylan Limberg (https://github.com/waylan),
13Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
14
15Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
16Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
17Copyright 2004 Manfred Stienstra (the original version)
18
19License: BSD (see LICENSE.md for details).
20
dprankeb08af212015-10-06 17:44:36 -070021INLINE PATTERNS
22=============================================================================
23
24Inline patterns such as *emphasis* are handled by means of auxiliary
25objects, one per pattern. Pattern objects must be instances of classes
that extend markdown.Pattern. Each pattern object uses a single regular
expression and needs to support the following methods:
28
29 pattern.getCompiledRegExp() # returns a regular expression
30
31 pattern.handleMatch(m) # takes a match object and returns
32 # an ElementTree element or just plain text
33
34All of python markdown's built-in patterns subclass from Pattern,
35but you can add additional patterns that don't.
36
Also note that all the regular expressions used by inline must
capture the whole block. For this reason, they all start with
'^(.*?)' and end with '(.*)$'. For the built-in expressions,
Pattern takes care of adding the "^(.*?)" and "(.*)$".
41
42Finally, the order in which regular expressions are applied is very
43important - e.g. if we first replace http://.../ links with <a> tags
44and _then_ try to replace inline html, we would end up with a mess.
45So, we apply the expressions in the following order:
46
47* escape and backticks have to go before everything else, so
48 that we can preempt any markdown patterns by escaping them.
49
50* then we handle auto-links (must be done before inline html)
51
52* then we handle inline HTML. At this point we will simply
53 replace all inline HTML strings with a placeholder and add
54 the actual HTML to a hash.
55
56* then inline images (must be done before links)
57
58* then bracketed links, first regular then reference-style
59
60* finally we apply strong and emphasis
61"""
62
dprankeb08af212015-10-06 17:44:36 -070063from . import util
Yu-Ping Wu6a8f3a22021-11-24 00:45:03 +000064from collections import namedtuple
dprankeb08af212015-10-06 17:44:36 -070065import re
Yu-Ping Wu6a8f3a22021-11-24 00:45:03 +000066import xml.etree.ElementTree as etree
dprankeb08af212015-10-06 17:44:36 -070067try: # pragma: no cover
68 from html import entities
69except ImportError: # pragma: no cover
70 import htmlentitydefs as entities
71
72
def build_inlinepatterns(md, **kwargs):
    """ Build the default set of inline patterns for Markdown.

    Keyword arguments:

    * md: The Markdown instance that is handed to every processor
      which takes one.
    * kwargs: Accepted but not used by this function.

    Returns a `util.Registry` in which each default inline processor
    is registered under a name and a numeric priority.

    """
    # NOTE(review): the numbers are presumably what orders the processors
    # (backtick/escape first, emphasis last) -- confirm against
    # `util.Registry`.
    inlinePatterns = util.Registry()
    inlinePatterns.register(BacktickInlineProcessor(BACKTICK_RE), 'backtick', 190)
    inlinePatterns.register(EscapeInlineProcessor(ESCAPE_RE, md), 'escape', 180)
    inlinePatterns.register(ReferenceInlineProcessor(REFERENCE_RE, md), 'reference', 170)
    inlinePatterns.register(LinkInlineProcessor(LINK_RE, md), 'link', 160)
    inlinePatterns.register(ImageInlineProcessor(IMAGE_LINK_RE, md), 'image_link', 150)
    inlinePatterns.register(
        ImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'image_reference', 140
    )
    inlinePatterns.register(
        ShortReferenceInlineProcessor(REFERENCE_RE, md), 'short_reference', 130
    )
    inlinePatterns.register(
        ShortImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'short_image_ref', 125
    )
    inlinePatterns.register(AutolinkInlineProcessor(AUTOLINK_RE, md), 'autolink', 120)
    inlinePatterns.register(AutomailInlineProcessor(AUTOMAIL_RE, md), 'automail', 110)
    inlinePatterns.register(SubstituteTagInlineProcessor(LINE_BREAK_RE, 'br'), 'linebreak', 100)
    inlinePatterns.register(HtmlInlineProcessor(HTML_RE, md), 'html', 90)
    inlinePatterns.register(HtmlInlineProcessor(ENTITY_RE, md), 'entity', 80)
    inlinePatterns.register(SimpleTextInlineProcessor(NOT_STRONG_RE), 'not_strong', 70)
    inlinePatterns.register(AsteriskProcessor(r'\*'), 'em_strong', 60)
    inlinePatterns.register(UnderscoreProcessor(r'_'), 'em_strong2', 50)
    return inlinePatterns
99
Yu-Ping Wu6a8f3a22021-11-24 00:45:03 +0000100
dprankeb08af212015-10-06 17:44:36 -0700101"""
102The actual regular expressions for patterns
103-----------------------------------------------------------------------------
104"""
105
# Negative lookbehind: the following token must not be preceded by `!`
# (used to keep link patterns from matching the start of an image).
NOIMG = r'(?<!\!)'

# `e=f()` or ``e=f("`")``
BACKTICK_RE = r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\2(?!`))'

# \<
ESCAPE_RE = r'\\(.)'

# *emphasis*
EMPHASIS_RE = r'(\*)([^\*]+)\1'

# **strong**
STRONG_RE = r'(\*{2})(.+?)\1'

# __smart__strong__
SMART_STRONG_RE = r'(?<!\w)(_{2})(?!_)(.+?)(?<!_)\1(?!\w)'

# _smart_emphasis_
SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\1(?!\w)'

# __strong _em__
SMART_STRONG_EM_RE = r'(?<!\w)(\_)\1(?!\1)(.+?)(?<!\w)\1(?!\1)(.+?)\1{3}(?!\w)'

# ***strongem*** or ***em*strong**
EM_STRONG_RE = r'(\*)\1{2}(.+?)\1(.*?)\1{2}'

# ___strongem___ or ___em_strong__
EM_STRONG2_RE = r'(_)\1{2}(.+?)\1(.*?)\1{2}'

# ***strong**em*
STRONG_EM_RE = r'(\*)\1{2}(.+?)\1{2}(.*?)\1'

# ___strong__em_
STRONG_EM2_RE = r'(_)\1{2}(.+?)\1{2}(.*?)\1'

# **strong*em***
STRONG_EM3_RE = r'(\*)\1(?!\1)([^*]+?)\1(?!\1)(.+?)\1{3}'

# [text](url) or [text](<url>) or [text](url "title")
# Only the opening bracket is matched here; the processor parses the rest.
LINK_RE = NOIMG + r'\['

# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
IMAGE_LINK_RE = r'\!\['

# [Google][3]
REFERENCE_RE = LINK_RE

# ![alt text][2]
IMAGE_REFERENCE_RE = IMAGE_LINK_RE

# stand-alone * or _
NOT_STRONG_RE = r'((^|\s)(\*|_)(\s|$))'

# <http://www.123.com>
AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^<>]*)>'

# <me@example.com>
AUTOMAIL_RE = r'<([^<> !]*@[^@<> ]*)>'

# <...>
HTML_RE = r'(<([a-zA-Z/][^<>]*|!--(?:(?!<!--|-->).)*--)>)'

# "&#38;" (decimal) or "&#x26;" (hex) or "&amp;" (named)
ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)'

# two spaces at end of line
LINE_BREAK_RE = r'  \n'
173
174
def dequote(string):
    """Remove one pair of matching quotes from around a string.

    The string is returned without its first and last character when it
    is at least two characters long and both ends are the same quote
    character (`"` or `'`). Anything else, including a lone quote
    character or an empty string, is returned unchanged.

    """
    # The length check keeps a single `"` or `'` from counting as both
    # the opening and the closing quote.
    if len(string) >= 2 and string[0] == string[-1] and string[0] in ('"', "'"):
        return string[1:-1]
    return string
182
183
class EmStrongItem(namedtuple('EmStrongItem', ['pattern', 'builder', 'tags'])):
    """Emphasis/strong pattern item.

    Fields:

    * pattern: The compiled regular expression for this item.
    * builder: Name of the builder to use ('single', 'double' or 'double2').
    * tags: The tag name, or two comma separated tag names, to build.

    """
dprankeb08af212015-10-06 17:44:36 -0700186
187
188"""
189The pattern classes
190-----------------------------------------------------------------------------
191"""
192
193
class Pattern:  # pragma: no cover
    """Base class that inline patterns subclass. """

    ANCESTOR_EXCLUDES = tuple()

    def __init__(self, pattern, md=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * md: The Markdown instance this pattern belongs to (optional)

        """
        self.pattern = pattern
        # The pattern is wrapped so that the compiled expression always
        # matches the whole string: group 1 is the text before the
        # pattern and the last group is the text after it.
        self.compiled_re = re.compile(r"^(.*?)%s(.*)$" % pattern,
                                      re.DOTALL | re.UNICODE)

        self.md = md

    @property
    @util.deprecated("Use 'md' instead.")
    def markdown(self):
        # TODO: remove this later
        return self.md

    def getCompiledRegExp(self):
        """ Return a compiled regular expression. """
        return self.compiled_re

    def handleMatch(self, m):
        """Return a ElementTree element from the given match.

        Subclasses should override this method.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.

        """
        pass  # pragma: no cover

    def type(self):
        """ Return class name, to define pattern type """
        return self.__class__.__name__

    def unescape(self, text):
        """ Return unescaped text given text with an inline placeholder. """
        try:
            stash = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            # No 'inline' treeprocessor is registered; nothing to look up.
            return text

        def get_stash(m):
            stash_id = m.group(1)
            # Ids that are not in the stash fall through and return None.
            if stash_id in stash:
                value = stash.get(stash_id)
                if isinstance(value, str):
                    return value
                else:
                    # An etree Element - return text content only
                    return ''.join(value.itertext())
        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
257
258
class InlineProcessor(Pattern):
    """
    Base class that inline patterns subclass.

    This is the newer style inline processor that uses a more
    efficient and flexible search approach.
    """

    def __init__(self, pattern, md=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * md: The Markdown instance this processor belongs to (optional)

        """
        # `Pattern.__init__` is deliberately not called: here the pattern
        # is compiled as given, without the whole-string wrapper.
        self.pattern = pattern
        self.compiled_re = re.compile(pattern, re.DOTALL | re.UNICODE)

        # Api for Markdown to pass safe_mode into instance
        self.safe_mode = False
        self.md = md

    def handleMatch(self, m, data):
        """Return a ElementTree element from the given match and the
        start and end index of the matched text.

        If `start` and/or `end` are returned as `None`, it will be
        assumed that the processor did not find a valid region of text.

        Subclasses should override this method.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.
        * data: The buffer current under analysis

        Returns:

        * el: The ElementTree element, text or None.
        * start: The start of the region that has been matched or None.
        * end: The end of the region that has been matched or None.

        """
        pass  # pragma: no cover
305
306
class SimpleTextPattern(Pattern):  # pragma: no cover
    """ Return a simple text of group(2) of a Pattern. """
    def handleMatch(self, m):
        # Group 1 is the leading text added by `Pattern`, so the
        # pattern's own first group is group 2.
        return m.group(2)
311
312
class SimpleTextInlineProcessor(InlineProcessor):
    """ Return a simple text of group(1) of a Pattern. """
    def handleMatch(self, m, data):
        """ Return group(1) and the span of the whole match. """
        return m.group(1), m.start(0), m.end(0)
317
318
class EscapeInlineProcessor(InlineProcessor):
    """ Return an escaped character. """

    def handleMatch(self, m, data):
        """
        Replace an escaped character with a placeholder.

        If the character is one of `md.ESCAPED_CHARS`, the returned text
        is the character's code point wrapped in `util.STX`/`util.ETX`.
        Otherwise `None` is returned in place of the text. In both
        cases the span of the whole match is returned as well.

        """
        char = m.group(1)
        if char in self.md.ESCAPED_CHARS:
            return '{}{}{}'.format(util.STX, ord(char), util.ETX), m.start(0), m.end(0)
        else:
            return None, m.start(0), m.end(0)
dprankeb08af212015-10-06 17:44:36 -0700328
329
class SimpleTagPattern(Pattern):  # pragma: no cover
    """
    Return element of type `tag` with a text attribute of group(3)
    of a Pattern.

    """
    def __init__(self, pattern, tag):
        """
        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * tag: Name of the element to create for each match

        """
        Pattern.__init__(self, pattern)
        self.tag = tag

    def handleMatch(self, m):
        """ Return a `tag` element whose text is group(3) of the match. """
        el = etree.Element(self.tag)
        el.text = m.group(3)
        return el
344
345
class SimpleTagInlineProcessor(InlineProcessor):
    """
    Return element of type `tag` with a text attribute of group(2)
    of a Pattern.

    """
    def __init__(self, pattern, tag):
        """
        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * tag: Name of the element to create for each match

        """
        InlineProcessor.__init__(self, pattern)
        self.tag = tag

    def handleMatch(self, m, data):  # pragma: no cover
        """ Return a `tag` element with group(2) as text, and the match span. """
        el = etree.Element(self.tag)
        el.text = m.group(2)
        return el, m.start(0), m.end(0)
360
361
class SubstituteTagPattern(SimpleTagPattern):  # pragma: no cover
    """ Return an element of type `tag` with no children. """
    def handleMatch(self, m):
        # The matched text is discarded; only the empty element is kept.
        return etree.Element(self.tag)
dprankeb08af212015-10-06 17:44:36 -0700366
367
class SubstituteTagInlineProcessor(SimpleTagInlineProcessor):
    """ Return an element of type `tag` with no children. """
    def handleMatch(self, m, data):
        """ Return an empty `tag` element and the span of the whole match. """
        return etree.Element(self.tag), m.start(0), m.end(0)
372
373
class BacktickInlineProcessor(InlineProcessor):
    """ Return a `<code>` element containing the matching text. """
    def __init__(self, pattern):
        """
        Keyword arguments:

        * pattern: A regular expression that matches a pattern

        """
        InlineProcessor.__init__(self, pattern)
        # Placeholder substituted for each escaped backslash (`\\`).
        self.ESCAPED_BSLASH = '{}{}{}'.format(util.STX, ord('\\'), util.ETX)
        self.tag = 'code'

    def handleMatch(self, m, data):
        """
        Return a code element, or text with escaped backslashes replaced.

        When group(3) (the text between the backticks) is non-empty, a
        `code` element holding that text, stripped and code-escaped, is
        returned. Otherwise group(1) is returned with every `\\\\`
        replaced by the escaped-backslash placeholder. The span of the
        whole match is returned in both cases.

        """
        if m.group(3):
            el = etree.Element(self.tag)
            el.text = util.AtomicString(util.code_escape(m.group(3).strip()))
            return el, m.start(0), m.end(0)
        else:
            return m.group(1).replace('\\\\', self.ESCAPED_BSLASH), m.start(0), m.end(0)
dprankeb08af212015-10-06 17:44:36 -0700388
389
class DoubleTagPattern(SimpleTagPattern):  # pragma: no cover
    """Return a ElementTree element nested in tag2 nested in tag1.

    Useful for strong emphasis etc.

    """
    def handleMatch(self, m):
        """ Return `<tag1><tag2>group(3)</tag2>[group(4)]</tag1>`. """
        # `self.tag` holds both tag names separated by a comma.
        tag1, tag2 = self.tag.split(",")
        el1 = etree.Element(tag1)
        el2 = etree.SubElement(el1, tag2)
        el2.text = m.group(3)
        # With five groups, group 4 is text that follows the inner element.
        if len(m.groups()) == 5:
            el2.tail = m.group(4)
        return el1
404
405
class DoubleTagInlineProcessor(SimpleTagInlineProcessor):
    """Return a ElementTree element nested in tag2 nested in tag1.

    Useful for strong emphasis etc.

    """
    def handleMatch(self, m, data):  # pragma: no cover
        """ Return `<tag1><tag2>group(2)</tag2>[group(3)]</tag1>` and the match span. """
        # `self.tag` holds both tag names separated by a comma.
        tag1, tag2 = self.tag.split(",")
        el1 = etree.Element(tag1)
        el2 = etree.SubElement(el1, tag2)
        el2.text = m.group(2)
        # With three groups, group 3 is text that follows the inner element.
        if len(m.groups()) == 3:
            el2.tail = m.group(3)
        return el1, m.start(0), m.end(0)
420
421
class HtmlInlineProcessor(InlineProcessor):
    """ Store raw inline html and return a placeholder. """
    def handleMatch(self, m, data):
        """ Stash the unescaped group(1) and return its placeholder and the match span. """
        rawhtml = self.unescape(m.group(1))
        place_holder = self.md.htmlStash.store(rawhtml)
        return place_holder, m.start(0), m.end(0)

    def unescape(self, text):
        """ Return unescaped text given text with an inline placeholder. """
        try:
            stash = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            # No 'inline' treeprocessor is registered; nothing to look up.
            return text

        def get_stash(m):
            stash_id = m.group(1)
            value = stash.get(stash_id)
            # Ids that are not in the stash fall through and return None.
            if value is not None:
                try:
                    return self.md.serializer(value)
                except Exception:
                    # The value could not be serialized; fall back to
                    # its string form prefixed with a backslash.
                    return r'\%s' % value

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
446
447
class AsteriskProcessor(InlineProcessor):
    """Emphasis processor for handling strong and em matches inside asterisks."""

    # Tried in this order; the list index doubles as the pattern's rank
    # when nested content is parsed (see `parse_sub_patterns`).
    PATTERNS = [
        EmStrongItem(re.compile(EM_STRONG_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'),
        EmStrongItem(re.compile(STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'),
        EmStrongItem(re.compile(STRONG_EM3_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'),
        EmStrongItem(re.compile(STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'),
        EmStrongItem(re.compile(EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em')
    ]

    def build_single(self, m, tag, idx):
        """Return single tag."""
        el1 = etree.Element(tag)
        text = m.group(2)
        self.parse_sub_patterns(text, el1, None, idx)
        return el1

    def build_double(self, m, tags, idx):
        """Return double tag."""

        tag1, tag2 = tags.split(",")
        el1 = etree.Element(tag1)
        el2 = etree.Element(tag2)
        text = m.group(2)
        self.parse_sub_patterns(text, el2, None, idx)
        el1.append(el2)
        # A third group is content that follows the inner element
        # inside the outer one.
        if len(m.groups()) == 3:
            text = m.group(3)
            self.parse_sub_patterns(text, el1, el2, idx)
        return el1

    def build_double2(self, m, tags, idx):
        """Return double tags (variant 2): `<strong>text <em>text</em></strong>`."""

        tag1, tag2 = tags.split(",")
        el1 = etree.Element(tag1)
        el2 = etree.Element(tag2)
        text = m.group(2)
        self.parse_sub_patterns(text, el1, None, idx)
        text = m.group(3)
        el1.append(el2)
        self.parse_sub_patterns(text, el2, None, idx)
        return el1

    def parse_sub_patterns(self, data, parent, last, idx):
        """
        Parses sub patterns.

        `data` (`str`):
            text to evaluate.

        `parent` (`etree.Element`):
            Parent to attach text and sub elements to.

        `last` (`etree.Element`):
            Last appended child to parent. Can also be None if parent has no children.

        `idx` (`int`):
            Current pattern index that was used to evaluate the parent.

        """

        offset = 0
        pos = 0

        length = len(data)
        while pos < length:
            # Find the start of potential emphasis or strong tokens
            if self.compiled_re.match(data, pos):
                matched = False
                # See if the we can match an emphasis/strong pattern
                # NOTE(review): there is no `break` after a successful
                # match, so the remaining patterns are still tried at the
                # updated position within the same pass -- confirm that
                # this is intended.
                for index, item in enumerate(self.PATTERNS):
                    # Only evaluate patterns that are after what was used on the parent
                    if index <= idx:
                        continue
                    m = item.pattern.match(data, pos)
                    if m:
                        # Append child nodes to parent
                        # Text nodes should be appended to the last
                        # child if present, and if not, it should
                        # be added as the parent's text node.
                        text = data[offset:m.start(0)]
                        if text:
                            if last is not None:
                                last.tail = text
                            else:
                                parent.text = text
                        el = self.build_element(m, item.builder, item.tags, index)
                        parent.append(el)
                        last = el
                        # Move our position past the matched hunk
                        offset = pos = m.end(0)
                        matched = True
                if not matched:
                    # We matched nothing, move on to the next character
                    pos += 1
            else:
                # Increment position as no potential emphasis start was found.
                pos += 1

        # Append any leftover text as a text node.
        text = data[offset:]
        if text:
            if last is not None:
                last.tail = text
            else:
                parent.text = text

    def build_element(self, m, builder, tags, index):
        """Element builder."""

        if builder == 'double2':
            return self.build_double2(m, tags, index)
        elif builder == 'double':
            return self.build_double(m, tags, index)
        else:
            return self.build_single(m, tags, index)

    def handleMatch(self, m, data):
        """Parse patterns."""

        el = None
        start = None
        end = None

        # `m` only supplies the start position; the first item of
        # `PATTERNS` that matches there decides what is built. If none
        # matches, `(None, None, None)` is returned.
        for index, item in enumerate(self.PATTERNS):
            m1 = item.pattern.match(data, m.start(0))
            if m1:
                start = m1.start(0)
                end = m1.end(0)
                el = self.build_element(m1, item.builder, item.tags, index)
                break
        return el, start, end
582
583
class UnderscoreProcessor(AsteriskProcessor):
    """Emphasis processor for handling strong and em matches inside underscores."""

    # Same builders and tags as the asterisk variant, with the
    # underscore expressions substituted.
    PATTERNS = [
        EmStrongItem(re.compile(EM_STRONG2_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'),
        EmStrongItem(re.compile(STRONG_EM2_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'),
        EmStrongItem(re.compile(SMART_STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'),
        EmStrongItem(re.compile(SMART_STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'),
        EmStrongItem(re.compile(SMART_EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em')
    ]
594
595
class LinkInlineProcessor(InlineProcessor):
    """ Return a link element from the given match. """
    # Matches the opening `(` and, optionally, a complete
    # `<link> "title")` remainder.
    RE_LINK = re.compile(r'''\(\s*(?:(<[^<>]*>)\s*(?:('[^']*'|"[^"]*")\s*)?\))?''', re.DOTALL | re.UNICODE)
    RE_TITLE_CLEAN = re.compile(r'\s')

    def handleMatch(self, m, data):
        """
        Return an `a` element for `[text](href "title")`.

        Returns `(None, None, None)` when either the bracketed text or
        the parenthesized link cannot be parsed. Otherwise returns the
        element, the start of the match and the index just past the link.

        """
        text, index, handled = self.getText(data, m.end(0))

        if not handled:
            return None, None, None

        href, title, index, handled = self.getLink(data, index)
        if not handled:
            return None, None, None

        el = etree.Element("a")
        el.text = text

        el.set("href", href)

        if title is not None:
            el.set("title", title)

        return el, m.start(0), index

    def getLink(self, data, index):
        """Parse data between `()` of `[Text]()` allowing recursive `()`.

        Keyword arguments:

        * data: The buffer under analysis.
        * index: Position in `data` at which the `(` is expected.

        Returns a tuple `(href, title, index, handled)`. `title` is
        `None` when no title was found, `index` is the position after
        the parsed link and `handled` tells whether a link was parsed.

        """

        href = ''
        title = None
        handled = False

        m = self.RE_LINK.match(data, pos=index)
        if m and m.group(1):
            # Matches [Text](<link> "title")
            href = m.group(1)[1:-1].strip()
            if m.group(2):
                title = m.group(2)[1:-1]
            index = m.end(0)
            handled = True
        elif m:
            # Track bracket nesting and index in string
            bracket_count = 1
            backtrack_count = 1
            start_index = m.end()
            index = start_index
            last_bracket = -1

            # Primary (first found) quote tracking.
            quote = None
            start_quote = -1
            exit_quote = -1
            ignore_matches = False

            # Secondary (second found) quote tracking.
            alt_quote = None
            start_alt_quote = -1
            exit_alt_quote = -1

            # Track last character
            last = ''

            # `index` is advanced once per iteration, so it tracks `pos`.
            for pos in range(index, len(data)):
                c = data[pos]
                if c == '(':
                    # Count nested (
                    # Don't increment the bracket count if we are sure we're in a title.
                    if not ignore_matches:
                        bracket_count += 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                elif c == ')':
                    # Match nested ) to (
                    # Don't decrement if we are sure we are in a title that is unclosed.
                    if ((exit_quote != -1 and quote == last) or (exit_alt_quote != -1 and alt_quote == last)):
                        bracket_count = 0
                    elif not ignore_matches:
                        bracket_count -= 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                        # We've found our backup end location if the title doesn't resolve.
                        if backtrack_count == 0:
                            last_bracket = index + 1

                elif c in ("'", '"'):
                    # Quote has started
                    if not quote:
                        # We'll assume we are now in a title.
                        # Brackets are quoted, so no need to match them (except for the final one).
                        ignore_matches = True
                        backtrack_count = bracket_count
                        bracket_count = 1
                        start_quote = index + 1
                        quote = c
                    # Secondary quote (in case the first doesn't resolve): [text](link'"title")
                    elif c != quote and not alt_quote:
                        start_alt_quote = index + 1
                        alt_quote = c
                    # Update primary quote match
                    elif c == quote:
                        exit_quote = index + 1
                    # Update secondary quote match
                    elif alt_quote and c == alt_quote:
                        exit_alt_quote = index + 1

                index += 1

                # Link is closed, so let's break out of the loop
                if bracket_count == 0:
                    # Get the title if we closed a title string right before link closed
                    if exit_quote >= 0 and quote == last:
                        href = data[start_index:start_quote - 1]
                        title = data[start_quote:exit_quote - 1]
                    elif exit_alt_quote >= 0 and alt_quote == last:
                        href = data[start_index:start_alt_quote - 1]
                        title = data[start_alt_quote:exit_alt_quote - 1]
                    else:
                        href = data[start_index:index - 1]
                    break

                if c != ' ':
                    last = c

            # We have a scenario: [test](link"notitle)
            # When we enter a string, we stop tracking bracket resolution in the main counter,
            # but we do keep a backup counter up until we discover where we might resolve all brackets
            # if the title string fails to resolve.
            if bracket_count != 0 and backtrack_count == 0:
                href = data[start_index:last_bracket - 1]
                index = last_bracket
                bracket_count = 0

            handled = bracket_count == 0

        if title is not None:
            title = self.RE_TITLE_CLEAN.sub(' ', dequote(self.unescape(title.strip())))

        href = self.unescape(href).strip()

        return href, title, index, handled

    def getText(self, data, index):
        """Parse the content between `[]` of the start of an image or link
        resolving nested square brackets.

        Returns a tuple `(text, index, handled)`: the text collected
        before the closing bracket, the position after the last
        character consumed, and whether the closing bracket was found.

        """
        bracket_count = 1
        text = []
        for pos in range(index, len(data)):
            c = data[pos]
            if c == ']':
                bracket_count -= 1
            elif c == '[':
                bracket_count += 1
            index += 1
            if bracket_count == 0:
                break
            text.append(c)
        return ''.join(text), index, bracket_count == 0
dprankeb08af212015-10-06 17:44:36 -0700755
756
class ImageInlineProcessor(LinkInlineProcessor):
    """ Return a img element from the given match. """

    def handleMatch(self, m, data):
        """
        Return an `img` element for `![alt](src "title")`.

        Returns `(None, None, None)` when either the bracketed text or
        the parenthesized link cannot be parsed. Otherwise returns the
        element, the start of the match and the index just past the link.

        """
        text, index, handled = self.getText(data, m.end(0))
        if not handled:
            return None, None, None

        src, title, index, handled = self.getLink(data, index)
        if not handled:
            return None, None, None

        el = etree.Element("img")

        el.set("src", src)

        if title is not None:
            el.set("title", title)

        el.set('alt', self.unescape(text))
        return el, m.start(0), index
dprankeb08af212015-10-06 17:44:36 -0700778
779
class ReferenceInlineProcessor(LinkInlineProcessor):
    """ Match to a stored reference and return link element. """
    NEWLINE_CLEANUP_RE = re.compile(r'\s+', re.MULTILINE)

    # The `[id]` part of `[text][id]`, with one optional whitespace
    # character before it.
    RE_LINK = re.compile(r'\s?\[([^\]]*)\]', re.DOTALL | re.UNICODE)

    def handleMatch(self, m, data):
        """
        Return the element built from the reference named by the match.

        Returns `(None, None, None)` when the text or the id cannot be
        parsed. When the id is not in `md.references`, `None` is
        returned in place of the element, together with the span.

        """
        text, index, handled = self.getText(data, m.end(0))
        if not handled:
            return None, None, None

        ref_id, end, handled = self.evalId(data, index, text)
        if not handled:
            return None, None, None

        # Clean up linebreaks in id
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)
        if ref_id not in self.md.references:  # ignore undefined refs
            return None, m.start(0), end

        href, title = self.md.references[ref_id]

        return self.makeTag(href, title, text), m.start(0), end

    def evalId(self, data, index, text):
        """
        Evaluate the id portion of [ref][id].

        If [ref][] use [ref].

        Returns a tuple `(id, end, handled)`; `id` is lower-cased and
        `handled` is False when no `[id]` part follows at `index`.
        """
        m = self.RE_LINK.match(data, pos=index)
        if not m:
            return None, index, False
        else:
            ref_id = m.group(1).lower()
            end = m.end(0)
            if not ref_id:
                ref_id = text.lower()
        return ref_id, end, True

    def makeTag(self, href, title, text):
        """ Return an `a` element with the given href, optional title and text. """
        el = etree.Element('a')

        el.set('href', href)
        if title:
            el.set('title', title)

        el.text = text
        return el
829
830
class ShortReferenceInlineProcessor(ReferenceInlineProcessor):
    """Short form of reference: [google]. """
    def evalId(self, data, index, text):
        """Evaluate the id from of [ref] """

        # The lower-cased text is the id; nothing more is consumed, so
        # `index` is returned unchanged.
        return text.lower(), index, True
837
838
class ImageReferenceInlineProcessor(ReferenceInlineProcessor):
    """ Match to a stored reference and return img element. """
    def makeTag(self, href, title, text):
        """ Return an `img` element with the given src, optional title and alt text. """
        el = etree.Element("img")
        el.set("src", href)
        if title:
            el.set("title", title)
        el.set("alt", self.unescape(text))
        return el
848
849
class ShortImageReferenceInlineProcessor(ImageReferenceInlineProcessor):
    """ Short form of image reference: ![ref]. """
    def evalId(self, data, index, text):
        """Evaluate the id from of [ref] """

        # The lower-cased text is the id; nothing more is consumed, so
        # `index` is returned unchanged.
        return text.lower(), index, True
856
857
class AutolinkInlineProcessor(InlineProcessor):
    """ Return a link Element given an autolink (`<http://example/com>`). """
    def handleMatch(self, m, data):
        """ Return an `a` element whose href and text are group(1), and the match span. """
        el = etree.Element("a")
        el.set('href', self.unescape(m.group(1)))
        el.text = util.AtomicString(m.group(1))
        return el, m.start(0), m.end(0)
dprankeb08af212015-10-06 17:44:36 -0700865
866
class AutomailInlineProcessor(InlineProcessor):
    """
    Return a mailto link Element given an automail link (`<foo@example.com>`).
    """
    def handleMatch(self, m, data):
        """
        Return an `a` element for the address in group(1), and the match span.

        Every character of the link text is written as an entity (named
        when one is defined, numeric otherwise) and every character of
        the `mailto:` href as a numeric entity, each introduced by
        `util.AMP_SUBSTITUTE`.

        """
        el = etree.Element('a')
        email = self.unescape(m.group(1))
        # A leading "mailto:" is dropped here and added back for the href.
        if email.startswith("mailto:"):
            email = email[len("mailto:"):]

        def codepoint2name(code):
            """Return entity definition by code, or the code if not defined."""
            entity = entities.codepoint2name.get(code)
            if entity:
                return "{}{};".format(util.AMP_SUBSTITUTE, entity)
            else:
                return "%s#%d;" % (util.AMP_SUBSTITUTE, code)

        letters = [codepoint2name(ord(letter)) for letter in email]
        el.text = util.AtomicString(''.join(letters))

        mailto = "mailto:" + email
        mailto = "".join([util.AMP_SUBSTITUTE + '#%d;' %
                          ord(letter) for letter in mailto])
        el.set('href', mailto)
        return el, m.start(0), m.end(0)