blob: eb313bd40b3685f0403cad6a52a18ae5bc64acbc [file] [log] [blame]
"""
Python Markdown

A Python implementation of John Gruber's Markdown.

Documentation: https://python-markdown.github.io/
GitHub: https://github.com/Python-Markdown/markdown/
PyPI: https://pypi.org/project/Markdown/

Started by Manfred Stienstra (http://www.dwerg.net/).
Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
Currently maintained by Waylan Limberg (https://github.com/waylan),
Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
Copyright 2004 Manfred Stienstra (the original version)

License: BSD (see LICENSE.md for details).

INLINE PATTERNS
=============================================================================

Inline patterns such as *emphasis* are handled by means of auxiliary
objects, one per pattern. Pattern objects must be instances of classes
that extend markdown.Pattern. Each pattern object uses a single regular
expression and needs to support the following methods:

    pattern.getCompiledRegExp() # returns a regular expression

    pattern.handleMatch(m) # takes a match object and returns
                           # an ElementTree element or just plain text

All of python markdown's built-in patterns subclass from Pattern,
but you can add additional patterns that don't.

Also note that all the regular expressions used by inline must
capture the whole block. For this reason, they all start with
'^(.*?)' and end with '(.*)$'. For built-in expressions,
Pattern takes care of adding the "^(.*?)" and "(.*)$".

Finally, the order in which regular expressions are applied is very
important - e.g. if we first replace http://.../ links with <a> tags
and _then_ try to replace inline html, we would end up with a mess.
So, we apply the expressions in the following order:

* escape and backticks have to go before everything else, so
  that we can preempt any markdown patterns by escaping them.

* then we handle auto-links (must be done before inline html)

* then we handle inline HTML. At this point we will simply
  replace all inline HTML strings with a placeholder and add
  the actual HTML to a hash.

* then inline images (must be done before links)

* then bracketed links, first regular then reference-style

* finally we apply strong and emphasis
"""
62
dprankeb08af212015-10-06 17:44:36 -070063from . import util
Yu-Ping Wu6a8f3a22021-11-24 00:45:03 +000064from collections import namedtuple
dprankeb08af212015-10-06 17:44:36 -070065import re
Yu-Ping Wu6a8f3a22021-11-24 00:45:03 +000066import xml.etree.ElementTree as etree
dprankeb08af212015-10-06 17:44:36 -070067try: # pragma: no cover
68 from html import entities
69except ImportError: # pragma: no cover
70 import htmlentitydefs as entities
71
72
def build_inlinepatterns(md, **kwargs):
    """Build the default set of inline patterns for Markdown.

    Arguments:

    * md: The Markdown instance handed to every processor that needs it.
    * kwargs: Accepted but not used by this function.

    Returns a `util.Registry` in which each built-in inline processor has
    been registered under its name together with its priority number.
    """
    # (processor, name, priority) triples, listed in registration order.
    registrations = (
        (BacktickInlineProcessor(BACKTICK_RE), 'backtick', 190),
        (EscapeInlineProcessor(ESCAPE_RE, md), 'escape', 180),
        (ReferenceInlineProcessor(REFERENCE_RE, md), 'reference', 170),
        (LinkInlineProcessor(LINK_RE, md), 'link', 160),
        (ImageInlineProcessor(IMAGE_LINK_RE, md), 'image_link', 150),
        (ImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'image_reference', 140),
        (ShortReferenceInlineProcessor(REFERENCE_RE, md), 'short_reference', 130),
        (ShortImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'short_image_ref', 125),
        (AutolinkInlineProcessor(AUTOLINK_RE, md), 'autolink', 120),
        (AutomailInlineProcessor(AUTOMAIL_RE, md), 'automail', 110),
        (SubstituteTagInlineProcessor(LINE_BREAK_RE, 'br'), 'linebreak', 100),
        (HtmlInlineProcessor(HTML_RE, md), 'html', 90),
        (HtmlInlineProcessor(ENTITY_RE, md), 'entity', 80),
        (SimpleTextInlineProcessor(NOT_STRONG_RE), 'not_strong', 70),
        (AsteriskProcessor(r'\*'), 'em_strong', 60),
        (UnderscoreProcessor(r'_'), 'em_strong2', 50),
    )

    inlinePatterns = util.Registry()
    for processor, name, priority in registrations:
        inlinePatterns.register(processor, name, priority)
    return inlinePatterns
99
Yu-Ping Wu6a8f3a22021-11-24 00:45:03 +0000100
dprankeb08af212015-10-06 17:44:36 -0700101"""
102The actual regular expressions for patterns
103-----------------------------------------------------------------------------
104"""
105
# Negative lookbehind: the following token must not be preceded by `!`
# (used to tell links apart from images).
NOIMG = r'(?<!\!)'

# `e=f()` or ``e=f("`")``
# Group 1: an even run of backslashes directly before backticks.
# Group 2: the opening backtick run, group 3: the code content.
BACKTICK_RE = r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\2(?!`))'

# \<
ESCAPE_RE = r'\\(.)'

# *emphasis*
EMPHASIS_RE = r'(\*)([^\*]+)\1'

# **strong**
STRONG_RE = r'(\*{2})(.+?)\1'

# __smart__strong__
SMART_STRONG_RE = r'(?<!\w)(_{2})(?!_)(.+?)(?<!_)\1(?!\w)'

# _smart_emphasis_
SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\1(?!\w)'

# __strong _em___
SMART_STRONG_EM_RE = r'(?<!\w)(\_)\1(?!\1)(.+?)(?<!\w)\1(?!\1)(.+?)\1{3}(?!\w)'

# ***strongem*** or ***em*strong**
EM_STRONG_RE = r'(\*)\1{2}(.+?)\1(.*?)\1{2}'

# ___strongem___ or ___em_strong__
EM_STRONG2_RE = r'(_)\1{2}(.+?)\1(.*?)\1{2}'

# ***strong**em*
STRONG_EM_RE = r'(\*)\1{2}(.+?)\1{2}(.*?)\1'

# ___strong__em_
STRONG_EM2_RE = r'(_)\1{2}(.+?)\1{2}(.*?)\1'

# **strong*em***
STRONG_EM3_RE = r'(\*)\1(?!\1)([^*]+?)\1(?!\1)(.+?)\1{3}'

# [text](url) or [text](<url>) or [text](url "title")
# Only the opening bracket is matched here; the processor parses the rest.
LINK_RE = NOIMG + r'\['

# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
IMAGE_LINK_RE = r'\!\['

# [Google][3]
REFERENCE_RE = LINK_RE

# ![alt text][2]
IMAGE_REFERENCE_RE = IMAGE_LINK_RE

# stand-alone * or _
NOT_STRONG_RE = r'((^|\s)(\*|_)(\s|$))'

# <http://www.123.com>
AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^<>]*)>'

# <me@example.com>
AUTOMAIL_RE = r'<([^<> !]+@[^@<> ]+)>'

# <...>
HTML_RE = r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|!--(?:(?!<!--|-->).)*--)>)'

# "&#38;" (decimal) or "&#x26;" (hex) or "&amp;" (named)
ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)'

# two spaces at end of line
# The pattern must contain exactly two spaces before the newline; with a
# single space, any line ending in one trailing space would become a break.
LINE_BREAK_RE = r'  \n'
173
174
def dequote(string):
    """Strip one pair of matching quotes wrapped around `string`.

    If `string` both starts and ends with a double quote, or both starts
    and ends with a single quote, the first and last characters are
    dropped. Otherwise `string` is returned unchanged.
    """
    for quote in ('"', "'"):
        if string.startswith(quote) and string.endswith(quote):
            return string[1:-1]
    return string
182
183
class EmStrongItem(namedtuple('EmStrongItem', ('pattern', 'builder', 'tags'))):
    """Emphasis/strong pattern item.

    A named tuple with three fields: `pattern`, `builder` and `tags`.
    """
dprankeb08af212015-10-06 17:44:36 -0700186
187
188"""
189The pattern classes
190-----------------------------------------------------------------------------
191"""
192
193
class Pattern:  # pragma: no cover
    """Base class that inline patterns subclass.

    The supplied expression is wrapped so that a match spans the whole
    string: a lazy leading group, the pattern itself, and a trailing group.
    """

    ANCESTOR_EXCLUDES = ()

    def __init__(self, pattern, md=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * md: Optional object stored on the instance as `self.md`

        """
        self.pattern = pattern
        self.md = md

        flags = re.DOTALL | re.UNICODE
        self.compiled_re = re.compile(r"^(.*?)%s(.*)$" % pattern, flags)

    def getCompiledRegExp(self):
        """ Return the compiled, wrapped regular expression. """
        return self.compiled_re

    def handleMatch(self, m):
        """Return a ElementTree element from the given match.

        Subclasses should override this method; this base version
        returns None.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.

        """
        pass  # pragma: no cover

    def type(self):
        """ Return the class name, which identifies the pattern type. """
        return self.__class__.__name__

    def unescape(self, text):
        """ Return unescaped text given text with an inline placeholder.

        Each placeholder found by `util.INLINE_PLACEHOLDER_RE` is looked up
        in the stashed nodes of the 'inline' tree processor. A stashed
        string is substituted as is; any other stashed value is replaced by
        the joined result of its `itertext()`. If the 'inline' lookup raises
        KeyError, `text` is returned untouched.
        """
        try:
            stash = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def get_stash(m):
            key = m.group(1)
            if key not in stash:
                # Unknown placeholder: the callback yields None.
                return None
            value = stash.get(key)
            if isinstance(value, str):
                return value
            # An etree Element - return text content only
            return ''.join(value.itertext())

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
251
252
class InlineProcessor(Pattern):
    """
    Base class that inline patterns subclass.

    This is the newer style inline processor that uses a more
    efficient and flexible search approach. Unlike `Pattern`, the
    expression is compiled exactly as given, without any wrapping groups.
    """

    def __init__(self, pattern, md=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * md: Optional object stored on the instance as `self.md`

        """
        self.pattern = pattern
        self.md = md

        # Api for Markdown to pass safe_mode into instance
        self.safe_mode = False

        flags = re.DOTALL | re.UNICODE
        self.compiled_re = re.compile(pattern, flags)

    def handleMatch(self, m, data):
        """Return a ElementTree element from the given match and the
        start and end index of the matched text.

        If `start` and/or `end` are returned as `None`, it will be
        assumed that the processor did not find a valid region of text.

        Subclasses should override this method; this base version
        returns None.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.
        * data: The buffer currently under analysis

        Returns:

        * el: The ElementTree element, text or None.
        * start: The start of the region that has been matched or None.
        * end: The end of the region that has been matched or None.

        """
        pass  # pragma: no cover
299
300
class SimpleTextPattern(Pattern):  # pragma: no cover
    """ Return the plain text captured in group(2) of a Pattern. """

    def handleMatch(self, m):
        # Return group 2 of the match as plain text.
        matched_text = m.group(2)
        return matched_text
305
306
class SimpleTextInlineProcessor(InlineProcessor):
    """ Return the plain text captured in group(1) of a Pattern. """

    def handleMatch(self, m, data):
        # The replaced region is the whole match.
        start, end = m.start(0), m.end(0)
        return m.group(1), start, end
311
312
class EscapeInlineProcessor(InlineProcessor):
    """ Return an escaped character. """

    def handleMatch(self, m, data):
        """Replace a backslash-escaped character with a placeholder.

        When the character after the backslash is one of
        `self.md.ESCAPED_CHARS`, the result is its code point wrapped in
        `util.STX` / `util.ETX`. Otherwise None is returned in place of
        the replacement, together with the span of the match.
        """
        char = m.group(1)
        start, end = m.start(0), m.end(0)
        if char not in self.md.ESCAPED_CHARS:
            return None, start, end
        return '{}{}{}'.format(util.STX, ord(char), util.ETX), start, end
dprankeb08af212015-10-06 17:44:36 -0700322
323
class SimpleTagPattern(Pattern):  # pragma: no cover
    """
    Return element of type `tag` with a text attribute of group(3)
    of a Pattern.

    """

    def __init__(self, pattern, tag):
        """Store `tag` after initialising the base Pattern with `pattern`."""
        Pattern.__init__(self, pattern)
        self.tag = tag

    def handleMatch(self, m):
        """Build a `self.tag` element whose text is group(3) of the match."""
        element = etree.Element(self.tag)
        element.text = m.group(3)
        return element
338
339
class SimpleTagInlineProcessor(InlineProcessor):
    """
    Return element of type `tag` with a text attribute of group(2)
    of a Pattern.

    """

    def __init__(self, pattern, tag):
        """Store `tag` after initialising the base processor with `pattern`."""
        InlineProcessor.__init__(self, pattern)
        self.tag = tag

    def handleMatch(self, m, data):  # pragma: no cover
        """Build a `self.tag` element whose text is group(2) of the match.

        Returns the element together with the span of the whole match.
        """
        element = etree.Element(self.tag)
        element.text = m.group(2)
        return element, m.start(0), m.end(0)
354
355
class SubstituteTagPattern(SimpleTagPattern):  # pragma: no cover
    """ Return an element of type `tag` with no children. """

    def handleMatch(self, m):
        # The match content is ignored; only an empty element is produced.
        element = etree.Element(self.tag)
        return element
dprankeb08af212015-10-06 17:44:36 -0700360
361
class SubstituteTagInlineProcessor(SimpleTagInlineProcessor):
    """ Return an element of type `tag` with no children. """

    def handleMatch(self, m, data):
        # The match content is ignored; an empty element replaces the
        # whole matched span.
        element = etree.Element(self.tag)
        return element, m.start(0), m.end(0)
366
367
class BacktickInlineProcessor(InlineProcessor):
    """ Return a `<code>` element containing the matching text. """

    def __init__(self, pattern):
        """Initialise the processor and precompute the escaped backslash."""
        InlineProcessor.__init__(self, pattern)
        self.ESCAPED_BSLASH = '{}{}{}'.format(util.STX, ord('\\'), util.ETX)
        self.tag = 'code'

    def handleMatch(self, m, data):
        """Handle either a code span or a run of escaped backslashes.

        If group(3) is non-empty, it is stripped, passed through
        `util.code_escape`, and wrapped in a `self.tag` element as an
        `util.AtomicString`. Otherwise group(1) is returned with every
        double backslash replaced by `self.ESCAPED_BSLASH`.
        """
        start, end = m.start(0), m.end(0)
        if not m.group(3):
            return m.group(1).replace('\\\\', self.ESCAPED_BSLASH), start, end

        element = etree.Element(self.tag)
        element.text = util.AtomicString(util.code_escape(m.group(3).strip()))
        return element, start, end
dprankeb08af212015-10-06 17:44:36 -0700382
383
class DoubleTagPattern(SimpleTagPattern):  # pragma: no cover
    """Return a ElementTree element nested in tag2 nested in tag1.

    `self.tag` holds both tag names separated by a comma.
    Useful for strong emphasis etc.

    """

    def handleMatch(self, m):
        """Build `<tag1><tag2>group(3)</tag2></tag1>` from the match.

        When the match has exactly five groups, group(4) becomes the tail
        text of the inner element.
        """
        outer_tag, inner_tag = self.tag.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.SubElement(outer, inner_tag)
        inner.text = m.group(3)
        if len(m.groups()) == 5:
            inner.tail = m.group(4)
        return outer
398
399
class DoubleTagInlineProcessor(SimpleTagInlineProcessor):
    """Return a ElementTree element nested in tag2 nested in tag1.

    `self.tag` holds both tag names separated by a comma.
    Useful for strong emphasis etc.

    """

    def handleMatch(self, m, data):  # pragma: no cover
        """Build `<tag1><tag2>group(2)</tag2></tag1>` from the match.

        When the match has exactly three groups, group(3) becomes the tail
        text of the inner element. Returns the outer element together with
        the span of the whole match.
        """
        outer_tag, inner_tag = self.tag.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.SubElement(outer, inner_tag)
        inner.text = m.group(2)
        if len(m.groups()) == 3:
            inner.tail = m.group(3)
        return outer, m.start(0), m.end(0)
414
415
class HtmlInlineProcessor(InlineProcessor):
    """ Store raw inline html and return a placeholder. """

    def handleMatch(self, m, data):
        """Stash group(1), after unescaping, and return its placeholder."""
        raw_html = self.unescape(m.group(1))
        placeholder = self.md.htmlStash.store(raw_html)
        return placeholder, m.start(0), m.end(0)

    def unescape(self, text):
        """ Return unescaped text given text with an inline placeholder.

        Each placeholder found by `util.INLINE_PLACEHOLDER_RE` is looked up
        in the stashed nodes of the 'inline' tree processor and replaced by
        `self.md.serializer(value)`. If serialising raises, the value is
        formatted after a backslash instead. If the 'inline' lookup raises
        KeyError, `text` is returned untouched.
        """
        try:
            stash = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def get_stash(m):
            value = stash.get(m.group(1))
            if value is None:
                # Nothing stashed under this id: the callback yields None.
                return None
            try:
                return self.md.serializer(value)
            except Exception:
                return r'\%s' % value

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
440
441
class AsteriskProcessor(InlineProcessor):
    """Emphasis processor for handling strong and em matches inside asterisks."""

    # Candidate patterns in the order they are tried. `builder` selects the
    # build_* method and `tags` names the element(s) to create.
    PATTERNS = [
        EmStrongItem(re.compile(EM_STRONG_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'),
        EmStrongItem(re.compile(STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'),
        EmStrongItem(re.compile(STRONG_EM3_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'),
        EmStrongItem(re.compile(STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'),
        EmStrongItem(re.compile(EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em')
    ]

    def build_single(self, m, tag, idx):
        """Return single tag: one `tag` element built from group(2)."""
        element = etree.Element(tag)
        self.parse_sub_patterns(m.group(2), element, None, idx)
        return element

    def build_double(self, m, tags, idx):
        """Return double tag.

        group(2) is parsed into the inner element. When the match has
        exactly three groups, group(3) is parsed into the outer element
        after the inner one.
        """
        outer_tag, inner_tag = tags.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.Element(inner_tag)

        self.parse_sub_patterns(m.group(2), inner, None, idx)
        outer.append(inner)
        if len(m.groups()) == 3:
            self.parse_sub_patterns(m.group(3), outer, inner, idx)
        return outer

    def build_double2(self, m, tags, idx):
        """Return double tags (variant 2): `<strong>text <em>text</em></strong>`."""
        outer_tag, inner_tag = tags.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.Element(inner_tag)

        # group(2) goes into the outer element first, then the inner
        # element is appended and filled from group(3).
        self.parse_sub_patterns(m.group(2), outer, None, idx)
        outer.append(inner)
        self.parse_sub_patterns(m.group(3), inner, None, idx)
        return outer

    def parse_sub_patterns(self, data, parent, last, idx):
        """
        Parses sub patterns.

        `data` (`str`):
            text to evaluate.

        `parent` (`etree.Element`):
            Parent to attach text and sub elements to.

        `last` (`etree.Element`):
            Last appended child to parent. Can also be None if parent has no children.

        `idx` (`int`):
            Current pattern index that was used to evaluate the parent.

        """

        def attach_text(text, after):
            # Text goes on the tail of the last child if there is one,
            # otherwise it becomes the parent's own text. Empty text is
            # never written.
            if not text:
                return
            if after is not None:
                after.tail = text
            else:
                parent.text = text

        offset = pos = 0
        length = len(data)

        while pos < length:
            # Find the start of potential emphasis or strong tokens
            if not self.compiled_re.match(data, pos):
                # No potential emphasis start here, move on one character.
                pos += 1
                continue

            matched = False
            # See if we can match an emphasis/strong pattern
            for index, item in enumerate(self.PATTERNS):
                # Only evaluate patterns that are after what was used on the parent
                if index <= idx:
                    continue
                m = item.pattern.match(data, pos)
                if not m:
                    continue
                # Flush the text between the previous hunk and this match,
                # then attach the newly built element.
                attach_text(data[offset:m.start(0)], last)
                el = self.build_element(m, item.builder, item.tags, index)
                parent.append(el)
                last = el
                # Move our position past the matched hunk.
                # Note: there is no break here, so the remaining patterns
                # are still tried, now at the updated position.
                offset = pos = m.end(0)
                matched = True

            if not matched:
                # We matched nothing, move on to the next character
                pos += 1

        # Append any leftover text as a text node.
        attach_text(data[offset:], last)

    def build_element(self, m, builder, tags, index):
        """Element builder: dispatch on `builder` to the build_* methods.

        'double2' and 'double' select their variants; any other value
        falls back to `build_single`.
        """
        builders = {
            'double2': self.build_double2,
            'double': self.build_double,
        }
        build = builders.get(builder, self.build_single)
        return build(m, tags, index)

    def handleMatch(self, m, data):
        """Parse patterns.

        Tries each entry of `PATTERNS` at the start of `m` and builds the
        element for the first one that matches. Returns
        `(None, None, None)` when none of them match.
        """
        for index, item in enumerate(self.PATTERNS):
            m1 = item.pattern.match(data, m.start(0))
            if m1:
                start, end = m1.start(0), m1.end(0)
                el = self.build_element(m1, item.builder, item.tags, index)
                return el, start, end
        return None, None, None
576
577
class UnderscoreProcessor(AsteriskProcessor):
    """Emphasis processor for handling strong and em matches inside underscores."""

    # Same builders and tags as AsteriskProcessor.PATTERNS, but using the
    # underscore expressions.
    PATTERNS = [
        EmStrongItem(re.compile(EM_STRONG2_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'),
        EmStrongItem(re.compile(STRONG_EM2_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'),
        EmStrongItem(re.compile(SMART_STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'),
        EmStrongItem(re.compile(SMART_STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'),
        EmStrongItem(re.compile(SMART_EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em')
    ]
588
589
class LinkInlineProcessor(InlineProcessor):
    """ Return a link element from the given match. """

    # Matches the opening `(` and, optionally, an angle-bracketed link with
    # an optional quoted title up to the closing `)`.
    RE_LINK = re.compile(r'''\(\s*(?:(<[^<>]*>)\s*(?:('[^']*'|"[^"]*")\s*)?\))?''', re.DOTALL | re.UNICODE)
    RE_TITLE_CLEAN = re.compile(r'\s')

    def handleMatch(self, m, data):
        """Build an `<a>` element for `[text](href "title")`.

        Returns `(None, None, None)` when either the bracketed text or the
        parenthesised link part cannot be parsed.
        """
        text, index, handled = self.getText(data, m.end(0))
        if not handled:
            return None, None, None

        href, title, index, handled = self.getLink(data, index)
        if not handled:
            return None, None, None

        link = etree.Element("a")
        link.text = text
        link.set("href", href)
        if title is not None:
            link.set("title", title)

        return link, m.start(0), index

    def getLink(self, data, index):
        """Parse data between `()` of `[Text]()` allowing recursive `()`.

        Returns a `(href, title, index, handled)` tuple, where `index` is
        the position just past the parsed link and `handled` tells whether
        a closed link was found. `title` is None when no title was parsed.
        """

        href = ''
        title = None
        handled = False

        m = self.RE_LINK.match(data, pos=index)
        if m and m.group(1):
            # Matches [Text](<link> "title")
            href = m.group(1)[1:-1].strip()
            if m.group(2):
                title = m.group(2)[1:-1]
            index = m.end(0)
            handled = True
        elif m:
            # Track bracket nesting and index in string
            bracket_count = 1
            backtrack_count = 1
            start_index = m.end()
            index = start_index
            last_bracket = -1

            # Primary (first found) quote tracking.
            quote = None
            start_quote = -1
            exit_quote = -1
            ignore_matches = False

            # Secondary (second found) quote tracking.
            alt_quote = None
            start_alt_quote = -1
            exit_alt_quote = -1

            # Track last character
            last = ''

            # `index` is advanced in step with `pos`; the `index + 1`
            # values recorded below are positions just past the character.
            for pos in range(index, len(data)):
                c = data[pos]
                if c == '(':
                    # Count nested (
                    # Don't increment the bracket count if we are sure we're in a title.
                    if not ignore_matches:
                        bracket_count += 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                elif c == ')':
                    # Match nested ) to (
                    # Don't decrement if we are sure we are in a title that is unclosed.
                    if ((exit_quote != -1 and quote == last) or (exit_alt_quote != -1 and alt_quote == last)):
                        bracket_count = 0
                    elif not ignore_matches:
                        bracket_count -= 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                        # We've found our backup end location if the title doesn't resolve.
                        if backtrack_count == 0:
                            last_bracket = index + 1

                elif c in ("'", '"'):
                    # Quote has started
                    if not quote:
                        # We'll assume we are now in a title.
                        # Brackets are quoted, so no need to match them (except for the final one).
                        ignore_matches = True
                        backtrack_count = bracket_count
                        bracket_count = 1
                        start_quote = index + 1
                        quote = c
                    # Secondary quote (in case the first doesn't resolve): [text](link'"title")
                    elif c != quote and not alt_quote:
                        start_alt_quote = index + 1
                        alt_quote = c
                    # Update primary quote match
                    elif c == quote:
                        exit_quote = index + 1
                    # Update secondary quote match
                    elif alt_quote and c == alt_quote:
                        exit_alt_quote = index + 1

                index += 1

                # Link is closed, so let's break out of the loop
                if bracket_count == 0:
                    # Get the title if we closed a title string right before link closed
                    if exit_quote >= 0 and quote == last:
                        href = data[start_index:start_quote - 1]
                        title = ''.join(data[start_quote:exit_quote - 1])
                    elif exit_alt_quote >= 0 and alt_quote == last:
                        href = data[start_index:start_alt_quote - 1]
                        title = ''.join(data[start_alt_quote:exit_alt_quote - 1])
                    else:
                        href = data[start_index:index - 1]
                    break

                # Remember the last character that was not a space.
                if c != ' ':
                    last = c

            # We have a scenario: [test](link"notitle)
            # When we enter a string, we stop tracking bracket resolution in the main counter,
            # but we do keep a backup counter up until we discover where we might resolve all brackets
            # if the title string fails to resolve.
            if bracket_count != 0 and backtrack_count == 0:
                href = data[start_index:last_bracket - 1]
                index = last_bracket
                bracket_count = 0

            handled = bracket_count == 0

        if title is not None:
            title = self.RE_TITLE_CLEAN.sub(' ', dequote(self.unescape(title.strip())))

        href = self.unescape(href).strip()

        return href, title, index, handled

    def getText(self, data, index):
        """Parse the content between `[]` of the start of an image or link
        resolving nested square brackets.

        Returns `(text, index, handled)`: the collected text, the position
        just past the last character consumed, and whether the opening
        bracket was closed.
        """
        depth = 1
        chars = []
        for pos in range(index, len(data)):
            ch = data[pos]
            if ch == ']':
                depth -= 1
            elif ch == '[':
                depth += 1
            index += 1
            if depth == 0:
                # The closing bracket itself is consumed but not collected.
                break
            chars.append(ch)
        return ''.join(chars), index, depth == 0
dprankeb08af212015-10-06 17:44:36 -0700749
750
class ImageInlineProcessor(LinkInlineProcessor):
    """ Return a img element from the given match. """

    def handleMatch(self, m, data):
        """Build an `<img>` element for `![alt](src "title")`.

        Returns `(None, None, None)` when either the bracketed text or the
        parenthesised link part cannot be parsed.
        """
        text, index, handled = self.getText(data, m.end(0))
        if not handled:
            return None, None, None

        src, title, index, handled = self.getLink(data, index)
        if not handled:
            return None, None, None

        image = etree.Element("img")
        image.set("src", src)
        if title is not None:
            image.set("title", title)
        # The bracketed text becomes the alt attribute, after unescaping.
        image.set('alt', self.unescape(text))

        return image, m.start(0), index
dprankeb08af212015-10-06 17:44:36 -0700772
773
class ReferenceInlineProcessor(LinkInlineProcessor):
    """ Match to a stored reference and return link element. """

    NEWLINE_CLEANUP_RE = re.compile(r'\s+', re.MULTILINE)

    # Matches the `[id]` part that follows the link text, optionally
    # preceded by one whitespace character.
    RE_LINK = re.compile(r'\s?\[([^\]]*)\]', re.DOTALL | re.UNICODE)

    def handleMatch(self, m, data):
        """Resolve `[text][id]` against `self.md.references`.

        Returns `(None, None, None)` when the text or the id cannot be
        parsed. An id missing from `self.md.references` yields None as the
        element, together with the consumed span.
        """
        text, index, handled = self.getText(data, m.end(0))
        if not handled:
            return None, None, None

        ref_id, end, handled = self.evalId(data, index, text)
        if not handled:
            return None, None, None

        # Clean up linebreaks in id
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)
        references = self.md.references
        if ref_id not in references:  # ignore undefined refs
            return None, m.start(0), end

        href, title = references[ref_id]
        return self.makeTag(href, title, text), m.start(0), end

    def evalId(self, data, index, text):
        """
        Evaluate the id portion of [ref][id].

        If [ref][] use [ref].

        Returns `(id, end, handled)`; the id is lower-cased.
        """
        m = self.RE_LINK.match(data, pos=index)
        if not m:
            return None, index, False

        # An empty id falls back to the link text.
        ref_id = m.group(1).lower() or text.lower()
        return ref_id, m.end(0), True

    def makeTag(self, href, title, text):
        """Build the `<a>` element; a falsy title is left out."""
        link = etree.Element('a')
        link.set('href', href)
        if title:
            link.set('title', title)
        link.text = text
        return link
823
824
class ShortReferenceInlineProcessor(ReferenceInlineProcessor):
    """Short form of reference: [google]. """

    def evalId(self, data, index, text):
        """Evaluate the id of [ref]: the lower-cased link text itself.

        Nothing further is consumed, so `index` is returned unchanged.
        """
        ref_id = text.lower()
        return ref_id, index, True
831
832
class ImageReferenceInlineProcessor(ReferenceInlineProcessor):
    """ Match to a stored reference and return img element. """

    def makeTag(self, href, title, text):
        """Build the `<img>` element; a falsy title is left out.

        `href` becomes the `src` attribute and the unescaped `text`
        becomes the `alt` attribute.
        """
        image = etree.Element("img")
        image.set("src", href)
        if title:
            image.set("title", title)
        image.set("alt", self.unescape(text))
        return image
842
843
class ShortImageReferenceInlineProcessor(ImageReferenceInlineProcessor):
    """ Short form of image reference: ![ref]. """

    def evalId(self, data, index, text):
        """Evaluate the id of ![ref]: the lower-cased text itself.

        Nothing further is consumed, so `index` is returned unchanged.
        """
        ref_id = text.lower()
        return ref_id, index, True
850
851
class AutolinkInlineProcessor(InlineProcessor):
    """ Return a link Element given an autolink (`<http://example/com>`). """

    def handleMatch(self, m, data):
        """Build an `<a>` element from group(1) of the match.

        The href is the unescaped URL; the link text is the URL as
        matched, wrapped in `util.AtomicString`.
        """
        url = m.group(1)
        link = etree.Element("a")
        link.set('href', self.unescape(url))
        link.text = util.AtomicString(url)
        return link, m.start(0), m.end(0)
dprankeb08af212015-10-06 17:44:36 -0700859
860
class AutomailInlineProcessor(InlineProcessor):
    """
    Return a mailto link Element given an automail link (`<foo@example.com>`).
    """

    def handleMatch(self, m, data):
        """Build an `<a>` element whose text and href are entity-encoded.

        Every character of the address is written as an entity reference
        introduced by `util.AMP_SUBSTITUTE`: named where
        `entities.codepoint2name` knows the code point, numeric otherwise.
        The href is "mailto:" plus the address, encoded numerically
        throughout.
        """
        prefix = "mailto:"
        email = self.unescape(m.group(1))
        if email.startswith(prefix):
            email = email[len(prefix):]

        def codepoint2name(code):
            """Return entity definition by code, or the code if not defined."""
            entity = entities.codepoint2name.get(code)
            if not entity:
                return "%s#%d;" % (util.AMP_SUBSTITUTE, code)
            return "{}{};".format(util.AMP_SUBSTITUTE, entity)

        link = etree.Element('a')
        encoded_text = ''.join(codepoint2name(ord(letter)) for letter in email)
        link.text = util.AtomicString(encoded_text)

        mailto = prefix + email
        encoded_href = "".join(
            util.AMP_SUBSTITUTE + '#%d;' % ord(letter) for letter in mailto
        )
        link.set('href', encoded_href)
        return link, m.start(0), m.end(0)
Yu-Ping Wu6a8f3a22021-11-24 00:45:03 +0000886 return el, m.start(0), m.end(0)