third_party/Python-Markdown: Update to 3.3.4
Update Python-Markdown to 3.3.4. Version 3.3.5+ requires
importlib-metadata>=4.4, which requires Python 3.7+ according to
Gentoo's ebuild file. However, the current Python version in the chromium
chroot is 3.6, so updating past 3.3.4 is infeasible.
Update README.chromium.
BUG=chromium:1224332
TEST=python3 md_browser.py -d /mnt/host/source/docs/
Change-Id: Ib25d6db3ec01ddda977396a08cd5c0ff5a1f154b
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/3296244
Reviewed-by: Dirk Pranke <dpranke@google.com>
Commit-Queue: Yu-Ping Wu <yupingso@chromium.org>
Cr-Commit-Position: refs/heads/main@{#944762}
NOKEYCHECK=True
GitOrigin-RevId: c5a7b8d781e5dfb13a6871364540bb2e6e71b062
diff --git a/markdown/__init__.py b/markdown/__init__.py
index 1b86553..e05af10 100644
--- a/markdown/__init__.py
+++ b/markdown/__init__.py
@@ -1,529 +1,61 @@
"""
Python Markdown
-===============
-Python Markdown converts Markdown to HTML and can be used as a library or
-called from the command line.
+A Python implementation of John Gruber's Markdown.
-## Basic usage as a module:
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
- import markdown
- html = markdown.markdown(your_text_string)
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
-See <https://pythonhosted.org/Markdown/> for more
-information and instructions on how to extend the functionality of
-Python Markdown. Read that before you try modifying this file.
-
-## Authors and License
-
-Started by [Manfred Stienstra](http://www.dwerg.net/). Continued and
-maintained by [Yuri Takhteyev](http://www.freewisdom.org), [Waylan
-Limberg](http://achinghead.com/) and [Artem Yunusov](http://blog.splyer.com).
-
-Contact: markdown@freewisdom.org
-
-Copyright 2007-2013 The Python Markdown Project (v. 1.7 and later)
-Copyright 200? Django Software Foundation (OrderedDict implementation)
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
Copyright 2004 Manfred Stienstra (the original version)
-License: BSD (see LICENSE for details).
+License: BSD (see LICENSE.md for details).
"""
-from __future__ import absolute_import
-from __future__ import unicode_literals
-from .__version__ import version, version_info # noqa
-import codecs
import sys
-import logging
-import warnings
-import importlib
-from . import util
-from .preprocessors import build_preprocessors
-from .blockprocessors import build_block_parser
-from .treeprocessors import build_treeprocessors
-from .inlinepatterns import build_inlinepatterns
-from .postprocessors import build_postprocessors
-from .extensions import Extension
-from .serializers import to_html_string, to_xhtml_string
+
+# TODO: Remove this check at some point in the future.
+# (also remove flake8's 'ignore E402' comments below)
+if sys.version_info[0] < 3: # pragma: no cover
+ raise ImportError('A recent version of Python 3 is required.')
+
+from .core import Markdown, markdown, markdownFromFile # noqa: E402
+from .util import PY37 # noqa: E402
+from .pep562 import Pep562 # noqa: E402
+from .__meta__ import __version__, __version_info__ # noqa: E402
+import warnings # noqa: E402
+
+# For backward compatibility as some extensions expect it...
+from .extensions import Extension # noqa
__all__ = ['Markdown', 'markdown', 'markdownFromFile']
+__deprecated__ = {
+ "version": ("__version__", __version__),
+ "version_info": ("__version_info__", __version_info__)
+}
-logger = logging.getLogger('MARKDOWN')
+def __getattr__(name):
+ """Get attribute."""
-class Markdown(object):
- """Convert Markdown to HTML."""
+ deprecated = __deprecated__.get(name)
+ if deprecated:
+ warnings.warn(
+ "'{}' is deprecated. Use '{}' instead.".format(name, deprecated[0]),
+ category=DeprecationWarning,
+ stacklevel=(3 if PY37 else 4)
+ )
+ return deprecated[1]
+ raise AttributeError("module '{}' has no attribute '{}'".format(__name__, name))
- doc_tag = "div" # Element used to wrap document - later removed
- option_defaults = {
- 'html_replacement_text': '[HTML_REMOVED]',
- 'tab_length': 4,
- 'enable_attributes': True,
- 'smart_emphasis': True,
- 'lazy_ol': True,
- }
-
- output_formats = {
- 'html': to_html_string,
- 'html4': to_html_string,
- 'html5': to_html_string,
- 'xhtml': to_xhtml_string,
- 'xhtml1': to_xhtml_string,
- 'xhtml5': to_xhtml_string,
- }
-
- ESCAPED_CHARS = ['\\', '`', '*', '_', '{', '}', '[', ']',
- '(', ')', '>', '#', '+', '-', '.', '!']
-
- def __init__(self, *args, **kwargs):
- """
- Creates a new Markdown instance.
-
- Keyword arguments:
-
- * extensions: A list of extensions.
- If they are of type string, the module mdx_name.py will be loaded.
- If they are a subclass of markdown.Extension, they will be used
- as-is.
- * extension_configs: Configuration settings for extensions.
- * output_format: Format of output. Supported formats are:
- * "xhtml1": Outputs XHTML 1.x. Default.
- * "xhtml5": Outputs XHTML style tags of HTML 5
- * "xhtml": Outputs latest supported version of XHTML
- (currently XHTML 1.1).
- * "html4": Outputs HTML 4
- * "html5": Outputs HTML style tags of HTML 5
- * "html": Outputs latest supported version of HTML
- (currently HTML 4).
- Note that it is suggested that the more specific formats ("xhtml1"
- and "html4") be used as "xhtml" or "html" may change in the future
- if it makes sense at that time.
- * safe_mode: Deprecated! Disallow raw html. One of "remove", "replace"
- or "escape".
- * html_replacement_text: Deprecated! Text used when safe_mode is set
- to "replace".
- * tab_length: Length of tabs in the source. Default: 4
- * enable_attributes: Enable the conversion of attributes. Default: True
- * smart_emphasis: Treat `_connected_words_` intelligently Default: True
- * lazy_ol: Ignore number of first item of ordered lists. Default: True
-
- """
-
- # For backward compatibility, loop through old positional args
- pos = ['extensions', 'extension_configs', 'safe_mode', 'output_format']
- for c, arg in enumerate(args):
- if pos[c] not in kwargs:
- kwargs[pos[c]] = arg
- if c+1 == len(pos): # pragma: no cover
- # ignore any additional args
- break
- if len(args):
- warnings.warn('Positional arguments are deprecated in Markdown. '
- 'Use keyword arguments only.',
- DeprecationWarning)
-
- # Loop through kwargs and assign defaults
- for option, default in self.option_defaults.items():
- setattr(self, option, kwargs.get(option, default))
-
- self.safeMode = kwargs.get('safe_mode', False)
- if self.safeMode and 'enable_attributes' not in kwargs:
- # Disable attributes in safeMode when not explicitly set
- self.enable_attributes = False
-
- if 'safe_mode' in kwargs:
- warnings.warn('"safe_mode" is deprecated in Python-Markdown. '
- 'Use an HTML sanitizer (like '
- 'Bleach http://bleach.readthedocs.org/) '
- 'if you are parsing untrusted markdown text. '
- 'See the 2.6 release notes for more info',
- DeprecationWarning)
-
- if 'html_replacement_text' in kwargs:
- warnings.warn('The "html_replacement_text" keyword is '
- 'deprecated along with "safe_mode".',
- DeprecationWarning)
-
- self.registeredExtensions = []
- self.docType = ""
- self.stripTopLevelTags = True
-
- self.build_parser()
-
- self.references = {}
- self.htmlStash = util.HtmlStash()
- self.registerExtensions(extensions=kwargs.get('extensions', []),
- configs=kwargs.get('extension_configs', {}))
- self.set_output_format(kwargs.get('output_format', 'xhtml1'))
- self.reset()
-
- def build_parser(self):
- """ Build the parser from the various parts. """
- self.preprocessors = build_preprocessors(self)
- self.parser = build_block_parser(self)
- self.inlinePatterns = build_inlinepatterns(self)
- self.treeprocessors = build_treeprocessors(self)
- self.postprocessors = build_postprocessors(self)
- return self
-
- def registerExtensions(self, extensions, configs):
- """
- Register extensions with this instance of Markdown.
-
- Keyword arguments:
-
- * extensions: A list of extensions, which can either
- be strings or objects. See the docstring on Markdown.
- * configs: A dictionary mapping module names to config options.
-
- """
- for ext in extensions:
- if isinstance(ext, util.string_type):
- ext = self.build_extension(ext, configs.get(ext, {}))
- if isinstance(ext, Extension):
- ext.extendMarkdown(self, globals())
- logger.debug(
- 'Successfully loaded extension "%s.%s".'
- % (ext.__class__.__module__, ext.__class__.__name__)
- )
- elif ext is not None:
- raise TypeError(
- 'Extension "%s.%s" must be of type: "markdown.Extension"'
- % (ext.__class__.__module__, ext.__class__.__name__))
-
- return self
-
- def build_extension(self, ext_name, configs):
- """Build extension by name, then return the module.
-
- The extension name may contain arguments as part of the string in the
- following format: "extname(key1=value1,key2=value2)"
-
- """
-
- configs = dict(configs)
-
- # Parse extensions config params (ignore the order)
- pos = ext_name.find("(") # find the first "("
- if pos > 0:
- ext_args = ext_name[pos+1:-1]
- ext_name = ext_name[:pos]
- pairs = [x.split("=") for x in ext_args.split(",")]
- configs.update([(x.strip(), y.strip()) for (x, y) in pairs])
- warnings.warn('Setting configs in the Named Extension string is '
- 'deprecated. It is recommended that you '
- 'pass an instance of the extension class to '
- 'Markdown or use the "extension_configs" keyword. '
- 'The current behavior will raise an error in version 2.7. '
- 'See the Release Notes for Python-Markdown version '
- '2.6 for more info.', DeprecationWarning)
-
- # Get class name (if provided): `path.to.module:ClassName`
- ext_name, class_name = ext_name.split(':', 1) \
- if ':' in ext_name else (ext_name, '')
-
- # Try loading the extension first from one place, then another
- try:
- # Assume string uses dot syntax (`path.to.some.module`)
- module = importlib.import_module(ext_name)
- logger.debug(
- 'Successfuly imported extension module "%s".' % ext_name
- )
- # For backward compat (until deprecation)
- # check that this is an extension.
- if ('.' not in ext_name and not (hasattr(module, 'makeExtension') or
- (class_name and hasattr(module, class_name)))):
- # We have a name conflict
- # eg: extensions=['tables'] and PyTables is installed
- raise ImportError
- except ImportError:
- # Preppend `markdown.extensions.` to name
- module_name = '.'.join(['markdown.extensions', ext_name])
- try:
- module = importlib.import_module(module_name)
- logger.debug(
- 'Successfuly imported extension module "%s".' %
- module_name
- )
- warnings.warn('Using short names for Markdown\'s builtin '
- 'extensions is deprecated. Use the '
- 'full path to the extension with Python\'s dot '
- 'notation (eg: "%s" instead of "%s"). The '
- 'current behavior will raise an error in version '
- '2.7. See the Release Notes for '
- 'Python-Markdown version 2.6 for more info.' %
- (module_name, ext_name),
- DeprecationWarning)
- except ImportError:
- # Preppend `mdx_` to name
- module_name_old_style = '_'.join(['mdx', ext_name])
- try:
- module = importlib.import_module(module_name_old_style)
- logger.debug(
- 'Successfuly imported extension module "%s".' %
- module_name_old_style)
- warnings.warn('Markdown\'s behavior of prepending "mdx_" '
- 'to an extension name is deprecated. '
- 'Use the full path to the '
- 'extension with Python\'s dot notation '
- '(eg: "%s" instead of "%s"). The current '
- 'behavior will raise an error in version 2.7. '
- 'See the Release Notes for Python-Markdown '
- 'version 2.6 for more info.' %
- (module_name_old_style, ext_name),
- DeprecationWarning)
- except ImportError as e:
- message = "Failed loading extension '%s' from '%s', '%s' " \
- "or '%s'" % (ext_name, ext_name, module_name,
- module_name_old_style)
- e.args = (message,) + e.args[1:]
- raise
-
- if class_name:
- # Load given class name from module.
- return getattr(module, class_name)(**configs)
- else:
- # Expect makeExtension() function to return a class.
- try:
- return module.makeExtension(**configs)
- except AttributeError as e:
- message = e.args[0]
- message = "Failed to initiate extension " \
- "'%s': %s" % (ext_name, message)
- e.args = (message,) + e.args[1:]
- raise
-
- def registerExtension(self, extension):
- """ This gets called by the extension """
- self.registeredExtensions.append(extension)
- return self
-
- def reset(self):
- """
- Resets all state variables so that we can start with a new text.
- """
- self.htmlStash.reset()
- self.references.clear()
-
- for extension in self.registeredExtensions:
- if hasattr(extension, 'reset'):
- extension.reset()
-
- return self
-
- def set_output_format(self, format):
- """ Set the output format for the class instance. """
- self.output_format = format.lower()
- try:
- self.serializer = self.output_formats[self.output_format]
- except KeyError as e:
- valid_formats = list(self.output_formats.keys())
- valid_formats.sort()
- message = 'Invalid Output Format: "%s". Use one of %s.' \
- % (self.output_format,
- '"' + '", "'.join(valid_formats) + '"')
- e.args = (message,) + e.args[1:]
- raise
- return self
-
- def convert(self, source):
- """
- Convert markdown to serialized XHTML or HTML.
-
- Keyword arguments:
-
- * source: Source text as a Unicode string.
-
- Markdown processing takes place in five steps:
-
- 1. A bunch of "preprocessors" munge the input text.
- 2. BlockParser() parses the high-level structural elements of the
- pre-processed text into an ElementTree.
- 3. A bunch of "treeprocessors" are run against the ElementTree. One
- such treeprocessor runs InlinePatterns against the ElementTree,
- detecting inline markup.
- 4. Some post-processors are run against the text after the ElementTree
- has been serialized into text.
- 5. The output is written to a string.
-
- """
-
- # Fixup the source text
- if not source.strip():
- return '' # a blank unicode string
-
- try:
- source = util.text_type(source)
- except UnicodeDecodeError as e:
- # Customise error message while maintaining original trackback
- e.reason += '. -- Note: Markdown only accepts unicode input!'
- raise
-
- # Split into lines and run the line preprocessors.
- self.lines = source.split("\n")
- for prep in self.preprocessors.values():
- self.lines = prep.run(self.lines)
-
- # Parse the high-level elements.
- root = self.parser.parseDocument(self.lines).getroot()
-
- # Run the tree-processors
- for treeprocessor in self.treeprocessors.values():
- newRoot = treeprocessor.run(root)
- if newRoot is not None:
- root = newRoot
-
- # Serialize _properly_. Strip top-level tags.
- output = self.serializer(root)
- if self.stripTopLevelTags:
- try:
- start = output.index(
- '<%s>' % self.doc_tag) + len(self.doc_tag) + 2
- end = output.rindex('</%s>' % self.doc_tag)
- output = output[start:end].strip()
- except ValueError: # pragma: no cover
- if output.strip().endswith('<%s />' % self.doc_tag):
- # We have an empty document
- output = ''
- else:
- # We have a serious problem
- raise ValueError('Markdown failed to strip top-level '
- 'tags. Document=%r' % output.strip())
-
- # Run the text post-processors
- for pp in self.postprocessors.values():
- output = pp.run(output)
-
- return output.strip()
-
- def convertFile(self, input=None, output=None, encoding=None):
- """Converts a Markdown file and returns the HTML as a Unicode string.
-
- Decodes the file using the provided encoding (defaults to utf-8),
- passes the file content to markdown, and outputs the html to either
- the provided stream or the file with provided name, using the same
- encoding as the source file. The 'xmlcharrefreplace' error handler is
- used when encoding the output.
-
- **Note:** This is the only place that decoding and encoding of Unicode
- takes place in Python-Markdown. (All other code is Unicode-in /
- Unicode-out.)
-
- Keyword arguments:
-
- * input: File object or path. Reads from stdin if `None`.
- * output: File object or path. Writes to stdout if `None`.
- * encoding: Encoding of input and output files. Defaults to utf-8.
-
- """
-
- encoding = encoding or "utf-8"
-
- # Read the source
- if input:
- if isinstance(input, util.string_type):
- input_file = codecs.open(input, mode="r", encoding=encoding)
- else:
- input_file = codecs.getreader(encoding)(input)
- text = input_file.read()
- input_file.close()
- else:
- text = sys.stdin.read()
- if not isinstance(text, util.text_type):
- text = text.decode(encoding)
-
- text = text.lstrip('\ufeff') # remove the byte-order mark
-
- # Convert
- html = self.convert(text)
-
- # Write to file or stdout
- if output:
- if isinstance(output, util.string_type):
- output_file = codecs.open(output, "w",
- encoding=encoding,
- errors="xmlcharrefreplace")
- output_file.write(html)
- output_file.close()
- else:
- writer = codecs.getwriter(encoding)
- output_file = writer(output, errors="xmlcharrefreplace")
- output_file.write(html)
- # Don't close here. User may want to write more.
- else:
- # Encode manually and write bytes to stdout.
- html = html.encode(encoding, "xmlcharrefreplace")
- try:
- # Write bytes directly to buffer (Python 3).
- sys.stdout.buffer.write(html)
- except AttributeError:
- # Probably Python 2, which works with bytes by default.
- sys.stdout.write(html)
-
- return self
-
-
-"""
-EXPORTED FUNCTIONS
-=============================================================================
-
-Those are the two functions we really mean to export: markdown() and
-markdownFromFile().
-"""
-
-
-def markdown(text, *args, **kwargs):
- """Convert a Markdown string to HTML and return HTML as a Unicode string.
-
- This is a shortcut function for `Markdown` class to cover the most
- basic use case. It initializes an instance of Markdown, loads the
- necessary extensions and runs the parser on the given text.
-
- Keyword arguments:
-
- * text: Markdown formatted text as Unicode or ASCII string.
- * Any arguments accepted by the Markdown class.
-
- Returns: An HTML document as a string.
-
- """
- md = Markdown(*args, **kwargs)
- return md.convert(text)
-
-
-def markdownFromFile(*args, **kwargs):
- """Read markdown code from a file and write it to a file or a stream.
-
- This is a shortcut function which initializes an instance of Markdown,
- and calls the convertFile method rather than convert.
-
- Keyword arguments:
-
- * input: a file name or readable object.
- * output: a file name or writable object.
- * encoding: Encoding of input and output.
- * Any arguments accepted by the Markdown class.
-
- """
- # For backward compatibility loop through positional args
- pos = ['input', 'output', 'extensions', 'encoding']
- c = 0
- for arg in args:
- if pos[c] not in kwargs:
- kwargs[pos[c]] = arg
- c += 1
- if c == len(pos):
- break
- if len(args):
- warnings.warn('Positional arguments are depreacted in '
- 'Markdown and will raise an error in version 2.7. '
- 'Use keyword arguments only.',
- DeprecationWarning)
-
- md = Markdown(**kwargs)
- md.convertFile(kwargs.get('input', None),
- kwargs.get('output', None),
- kwargs.get('encoding', None))
+if not PY37:
+ Pep562(__name__)
diff --git a/markdown/__main__.py b/markdown/__main__.py
index 17bfa9f..7d78b7e 100644
--- a/markdown/__main__.py
+++ b/markdown/__main__.py
@@ -1,7 +1,22 @@
"""
-COMMAND-LINE SPECIFIC STUFF
-=============================================================================
+Python Markdown
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
"""
import sys
@@ -10,9 +25,17 @@
import warnings
import markdown
try:
- import yaml
+ # We use `unsafe_load` because users may need to pass in actual Python
+ # objects. As this is only available from the CLI, the user has much
+ # worse problems if an attacker can use this as an attack vector.
+ from yaml import unsafe_load as yaml_load
except ImportError: # pragma: no cover
- import json as yaml
+ try:
+ # Fall back to PyYAML <5.1
+ from yaml import load as yaml_load
+ except ImportError:
+ # Fall back to JSON
+ from json import load as yaml_load
import logging
from logging import DEBUG, WARNING, CRITICAL
@@ -27,8 +50,8 @@
usage = """%prog [options] [INPUTFILE]
(STDIN is assumed if no INPUTFILE is given)"""
desc = "A Python implementation of John Gruber's Markdown. " \
- "https://pythonhosted.org/Markdown/"
- ver = "%%prog %s" % markdown.version
+ "https://Python-Markdown.github.io/"
+ ver = "%%prog %s" % markdown.__version__
parser = optparse.OptionParser(usage=usage, description=desc, version=ver)
parser.add_option("-f", "--file", dest="filename", default=None,
@@ -36,13 +59,9 @@
metavar="OUTPUT_FILE")
parser.add_option("-e", "--encoding", dest="encoding",
help="Encoding for input and output files.",)
- parser.add_option("-s", "--safe", dest="safe", default=False,
- metavar="SAFE_MODE",
- help="Deprecated! 'replace', 'remove' or 'escape' HTML "
- "tags in input")
parser.add_option("-o", "--output_format", dest="output_format",
- default='xhtml1', metavar="OUTPUT_FORMAT",
- help="'xhtml1' (default), 'html4' or 'html5'.")
+ default='xhtml', metavar="OUTPUT_FORMAT",
+ help="Use output format 'xhtml' (default) or 'html'.")
parser.add_option("-n", "--no_lazy_ol", dest="lazy_ol",
action='store_false', default=True,
help="Observe number of first item of ordered lists.")
@@ -85,7 +104,7 @@
options.configfile, mode="r", encoding=options.encoding
) as fp:
try:
- extension_configs = yaml.load(fp)
+ extension_configs = yaml_load(fp)
except Exception as e:
message = "Failed parsing extension config file: %s" % \
options.configfile
@@ -102,10 +121,6 @@
'lazy_ol': options.lazy_ol
}
- if options.safe:
- # Avoid deprecation warning if user didn't set option
- opts['safe_mode'] = options.safe
-
return opts, options.verbose
@@ -132,5 +147,5 @@
if __name__ == '__main__': # pragma: no cover
# Support running module as a commandline command.
- # Python 2.7 & 3.x do: `python -m markdown [options] [args]`.
+ # `python -m markdown [options] [args]`.
run()
diff --git a/markdown/__meta__.py b/markdown/__meta__.py
new file mode 100644
index 0000000..79ae2f8
--- /dev/null
+++ b/markdown/__meta__.py
@@ -0,0 +1,49 @@
+"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+"""
+
+# __version_info__ format:
+# (major, minor, patch, dev/alpha/beta/rc/final, #)
+# (1, 1, 2, 'dev', 0) => "1.1.2.dev0"
+# (1, 1, 2, 'alpha', 1) => "1.1.2a1"
+# (1, 2, 0, 'beta', 2) => "1.2b2"
+# (1, 2, 0, 'rc', 4) => "1.2rc4"
+# (1, 2, 0, 'final', 0) => "1.2"
+__version_info__ = (3, 3, 4, 'final', 0)
+
+
+def _get_version(version_info):
+ " Returns a PEP 440-compliant version number from version_info. "
+ assert len(version_info) == 5
+ assert version_info[3] in ('dev', 'alpha', 'beta', 'rc', 'final')
+
+ parts = 2 if version_info[2] == 0 else 3
+ v = '.'.join(map(str, version_info[:parts]))
+
+ if version_info[3] == 'dev':
+ v += '.dev' + str(version_info[4])
+ elif version_info[3] != 'final':
+ mapping = {'alpha': 'a', 'beta': 'b', 'rc': 'rc'}
+ v += mapping[version_info[3]] + str(version_info[4])
+
+ return v
+
+
+__version__ = _get_version(__version_info__)
diff --git a/markdown/__version__.py b/markdown/__version__.py
deleted file mode 100644
index 3442504..0000000
--- a/markdown/__version__.py
+++ /dev/null
@@ -1,29 +0,0 @@
-#
-# markdown/__version__.py
-#
-# version_info should conform to PEP 386
-# (major, minor, micro, alpha/beta/rc/final, #)
-# (1, 1, 2, 'alpha', 0) => "1.1.2.dev"
-# (1, 2, 0, 'beta', 2) => "1.2b2"
-version_info = (2, 6, 2, 'final', 0)
-
-
-def _get_version():
- " Returns a PEP 386-compliant version number from version_info. "
- assert len(version_info) == 5
- assert version_info[3] in ('alpha', 'beta', 'rc', 'final')
-
- parts = 2 if version_info[2] == 0 else 3
- main = '.'.join(map(str, version_info[:parts]))
-
- sub = ''
- if version_info[3] == 'alpha' and version_info[4] == 0:
- # TODO: maybe append some sort of git info here??
- sub = '.dev'
- elif version_info[3] != 'final':
- mapping = {'alpha': 'a', 'beta': 'b', 'rc': 'c'}
- sub = mapping[version_info[3]] + str(version_info[4])
-
- return str(main + sub)
-
-version = _get_version()
diff --git a/markdown/blockparser.py b/markdown/blockparser.py
index 32d3254..39219fd 100644
--- a/markdown/blockparser.py
+++ b/markdown/blockparser.py
@@ -1,7 +1,26 @@
-from __future__ import unicode_literals
-from __future__ import absolute_import
+"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+"""
+
+import xml.etree.ElementTree as etree
from . import util
-from . import odict
class State(list):
@@ -45,10 +64,16 @@
looping through them and creating an ElementTree object.
"""
- def __init__(self, markdown):
- self.blockprocessors = odict.OrderedDict()
+ def __init__(self, md):
+ self.blockprocessors = util.Registry()
self.state = State()
- self.markdown = markdown
+ self.md = md
+
+ @property
+ @util.deprecated("Use 'md' instead.")
+ def markdown(self):
+ # TODO: remove this later
+ return self.md
def parseDocument(self, lines):
""" Parse a markdown document into an ElementTree.
@@ -61,9 +86,9 @@
"""
# Create a ElementTree from the lines
- self.root = util.etree.Element(self.markdown.doc_tag)
+ self.root = etree.Element(self.md.doc_tag)
self.parseChunk(self.root, '\n'.join(lines))
- return util.etree.ElementTree(self.root)
+ return etree.ElementTree(self.root)
def parseChunk(self, parent, text):
""" Parse a chunk of markdown text and attach to given etree node.
@@ -93,7 +118,7 @@
"""
while blocks:
- for processor in self.blockprocessors.values():
+ for processor in self.blockprocessors:
if processor.test(parent, blocks[0]):
if processor.run(parent, blocks) is not False:
# run returns True or None
diff --git a/markdown/blockprocessors.py b/markdown/blockprocessors.py
index 29db022..8518e50 100644
--- a/markdown/blockprocessors.py
+++ b/markdown/blockprocessors.py
@@ -1,4 +1,23 @@
"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+
CORE MARKDOWN BLOCKPARSER
===========================================================================
@@ -6,35 +25,34 @@
itself with inline elements such as **bold** or *italics*, but rather just
catches blocks, lists, quotes, etc.
-The BlockParser is made up of a bunch of BlockProssors, each handling a
+The BlockParser is made up of a bunch of BlockProcessors, each handling a
different type of block. Extensions may add/replace/remove BlockProcessors
as they need to alter how markdown blocks are parsed.
"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import unicode_literals
import logging
import re
+import xml.etree.ElementTree as etree
from . import util
from .blockparser import BlockParser
logger = logging.getLogger('MARKDOWN')
-def build_block_parser(md_instance, **kwargs):
+def build_block_parser(md, **kwargs):
""" Build the default block parser used by Markdown. """
- parser = BlockParser(md_instance)
- parser.blockprocessors['empty'] = EmptyBlockProcessor(parser)
- parser.blockprocessors['indent'] = ListIndentProcessor(parser)
- parser.blockprocessors['code'] = CodeBlockProcessor(parser)
- parser.blockprocessors['hashheader'] = HashHeaderProcessor(parser)
- parser.blockprocessors['setextheader'] = SetextHeaderProcessor(parser)
- parser.blockprocessors['hr'] = HRProcessor(parser)
- parser.blockprocessors['olist'] = OListProcessor(parser)
- parser.blockprocessors['ulist'] = UListProcessor(parser)
- parser.blockprocessors['quote'] = BlockQuoteProcessor(parser)
- parser.blockprocessors['paragraph'] = ParagraphProcessor(parser)
+ parser = BlockParser(md)
+ parser.blockprocessors.register(EmptyBlockProcessor(parser), 'empty', 100)
+ parser.blockprocessors.register(ListIndentProcessor(parser), 'indent', 90)
+ parser.blockprocessors.register(CodeBlockProcessor(parser), 'code', 80)
+ parser.blockprocessors.register(HashHeaderProcessor(parser), 'hashheader', 70)
+ parser.blockprocessors.register(SetextHeaderProcessor(parser), 'setextheader', 60)
+ parser.blockprocessors.register(HRProcessor(parser), 'hr', 50)
+ parser.blockprocessors.register(OListProcessor(parser), 'olist', 40)
+ parser.blockprocessors.register(UListProcessor(parser), 'ulist', 30)
+ parser.blockprocessors.register(BlockQuoteProcessor(parser), 'quote', 20)
+ parser.blockprocessors.register(ReferenceProcessor(parser), 'reference', 15)
+ parser.blockprocessors.register(ParagraphProcessor(parser), 'paragraph', 10)
return parser
@@ -51,7 +69,7 @@
def __init__(self, parser):
self.parser = parser
- self.tab_length = parser.markdown.tab_length
+ self.tab_length = parser.md.tab_length
def lastChild(self, parent):
""" Return the last child of an etree element. """
@@ -60,13 +78,15 @@
else:
return None
- def detab(self, text):
+ def detab(self, text, length=None):
""" Remove a tab from the front of each line of the given text. """
+ if length is None:
+ length = self.tab_length
newtext = []
lines = text.split('\n')
for line in lines:
- if line.startswith(' '*self.tab_length):
- newtext.append(line[self.tab_length:])
+ if line.startswith(' ' * length):
+ newtext.append(line[length:])
elif not line.strip():
newtext.append('')
else:
@@ -141,7 +161,7 @@
LIST_TYPES = ['ul', 'ol']
def __init__(self, *args):
- BlockProcessor.__init__(self, *args)
+ super().__init__(*args)
self.INDENT_RE = re.compile(r'^(([ ]{%s})+)' % self.tab_length)
def test(self, parent, block):
@@ -178,7 +198,7 @@
# If the parent li has text, that text needs to be moved to a p
# The p must be 'inserted' at beginning of list in the event
# that other children already exist i.e.; a nested sublist.
- p = util.etree.Element('p')
+ p = etree.Element('p')
p.text = sibling[-1].text
sibling[-1].text = ''
sibling[-1].insert(0, p)
@@ -189,7 +209,7 @@
def create_item(self, parent, block):
""" Create a new li and parse the block with it as the parent. """
- li = util.etree.SubElement(parent, 'li')
+ li = etree.SubElement(parent, 'li')
self.parser.parseBlocks(li, [block])
def get_level(self, parent, block):
@@ -239,14 +259,14 @@
code = sibling[0]
block, theRest = self.detab(block)
code.text = util.AtomicString(
- '%s\n%s\n' % (code.text, block.rstrip())
+ '{}\n{}\n'.format(code.text, util.code_escape(block.rstrip()))
)
else:
# This is a new codeblock. Create the elements and insert text.
- pre = util.etree.SubElement(parent, 'pre')
- code = util.etree.SubElement(pre, 'code')
+ pre = etree.SubElement(parent, 'pre')
+ code = etree.SubElement(pre, 'code')
block, theRest = self.detab(block)
- code.text = util.AtomicString('%s\n' % block.rstrip())
+ code.text = util.AtomicString('%s\n' % util.code_escape(block.rstrip()))
if theRest:
# This block contained unindented line(s) after the first indented
# line. Insert these lines as the first block of the master blocks
@@ -259,7 +279,7 @@
RE = re.compile(r'(^|\n)[ ]{0,3}>[ ]?(.*)')
def test(self, parent, block):
- return bool(self.RE.search(block))
+ return bool(self.RE.search(block)) and not util.nearing_recursion_limit()
def run(self, parent, blocks):
block = blocks.pop(0)
@@ -268,7 +288,7 @@
before = block[:m.start()] # Lines before blockquote
# Pass lines before blockquote in recursively for parsing forst.
self.parser.parseBlocks(parent, [before])
- # Remove ``> `` from begining of each line.
+ # Remove ``> `` from beginning of each line.
block = '\n'.join(
[self.clean(line) for line in block[m.start():].split('\n')]
)
@@ -278,7 +298,7 @@
quote = sibling
else:
# This is a new blockquote. Create a new parent element.
- quote = util.etree.SubElement(parent, 'blockquote')
+ quote = etree.SubElement(parent, 'blockquote')
# Recursively parse block with blockquote as parent.
# change parser state so blockquotes embedded in lists use p tags
self.parser.state.set('blockquote')
@@ -300,20 +320,27 @@
""" Process ordered list blocks. """
TAG = 'ol'
- # Detect an item (``1. item``). ``group(1)`` contains contents of item.
- RE = re.compile(r'^[ ]{0,3}\d+\.[ ]+(.*)')
- # Detect items on secondary lines. they can be of either list type.
- CHILD_RE = re.compile(r'^[ ]{0,3}((\d+\.)|[*+-])[ ]+(.*)')
- # Detect indented (nested) items of either type
- INDENT_RE = re.compile(r'^[ ]{4,7}((\d+\.)|[*+-])[ ]+.*')
# The integer (python string) with which the lists starts (default=1)
# Eg: If list is intialized as)
# 3. Item
# The ol tag will get starts="3" attribute
STARTSWITH = '1'
+ # Lazy ol - ignore startswith
+ LAZY_OL = True
# List of allowed sibling tags.
SIBLING_TAGS = ['ol', 'ul']
+ def __init__(self, parser):
+ super().__init__(parser)
+ # Detect an item (``1. item``). ``group(1)`` contains contents of item.
+ self.RE = re.compile(r'^[ ]{0,%d}\d+\.[ ]+(.*)' % (self.tab_length - 1))
+ # Detect items on secondary lines. they can be of either list type.
+ self.CHILD_RE = re.compile(r'^[ ]{0,%d}((\d+\.)|[*+-])[ ]+(.*)' %
+ (self.tab_length - 1))
+ # Detect indented (nested) items of either type
+ self.INDENT_RE = re.compile(r'^[ ]{%d,%d}((\d+\.)|[*+-])[ ]+.*' %
+ (self.tab_length, self.tab_length * 2 - 1))
+
def test(self, parent, block):
return bool(self.RE.match(block))
@@ -331,7 +358,7 @@
# since it's possible there are other children for this
# sibling, we can't just SubElement the p, we need to
# insert it as the first item.
- p = util.etree.Element('p')
+ p = etree.Element('p')
p.text = lst[-1].text
lst[-1].text = ''
lst[-1].insert(0, p)
@@ -339,12 +366,12 @@
# likely only when a header is not followed by a blank line
lch = self.lastChild(lst[-1])
if lch is not None and lch.tail:
- p = util.etree.SubElement(lst[-1], 'p')
+ p = etree.SubElement(lst[-1], 'p')
p.text = lch.tail.lstrip()
lch.tail = ''
# parse first block differently as it gets wrapped in a p.
- li = util.etree.SubElement(lst, 'li')
+ li = etree.SubElement(lst, 'li')
self.parser.state.set('looselist')
firstitem = items.pop(0)
self.parser.parseBlocks(li, [firstitem])
@@ -358,9 +385,9 @@
lst = parent
else:
# This is a new list so create parent with appropriate tag.
- lst = util.etree.SubElement(parent, self.TAG)
+ lst = etree.SubElement(parent, self.TAG)
# Check if a custom start integer is set
- if not self.parser.markdown.lazy_ol and self.STARTSWITH != '1':
+ if not self.LAZY_OL and self.STARTSWITH != '1':
lst.attrib['start'] = self.STARTSWITH
self.parser.state.set('list')
@@ -372,7 +399,7 @@
self.parser.parseBlocks(lst[-1], [item])
else:
# New item. Create li and parse with it as parent
- li = util.etree.SubElement(lst, 'li')
+ li = etree.SubElement(lst, 'li')
self.parser.parseBlocks(li, [item])
self.parser.state.reset()
@@ -386,7 +413,7 @@
# Check first item for the start index
if not items and self.TAG == 'ol':
# Detect the integer value of first list item
- INTEGER_RE = re.compile('(\d+)')
+ INTEGER_RE = re.compile(r'(\d+)')
self.STARTSWITH = INTEGER_RE.match(m.group(1)).group()
# Append to the list
items.append(m.group(3))
@@ -394,12 +421,12 @@
# This is an indented (possibly nested) item.
if items[-1].startswith(' '*self.tab_length):
# Previous item was indented. Append to that item.
- items[-1] = '%s\n%s' % (items[-1], line)
+ items[-1] = '{}\n{}'.format(items[-1], line)
else:
items.append(line)
else:
# This is another line of previous item. Append to that item.
- items[-1] = '%s\n%s' % (items[-1], line)
+ items[-1] = '{}\n{}'.format(items[-1], line)
return items
@@ -407,14 +434,18 @@
""" Process unordered list blocks. """
TAG = 'ul'
- RE = re.compile(r'^[ ]{0,3}[*+-][ ]+(.*)')
+
+ def __init__(self, parser):
+ super().__init__(parser)
+ # Detect an item (``* item``). ``group(1)`` contains contents of item.
+ self.RE = re.compile(r'^[ ]{0,%d}[*+-][ ]+(.*)' % (self.tab_length - 1))
class HashHeaderProcessor(BlockProcessor):
""" Process Hash Headers. """
# Detect a header at start of any line in block
- RE = re.compile(r'(^|\n)(?P<level>#{1,6})(?P<header>.*?)#*(\n|$)')
+ RE = re.compile(r'(?:^|\n)(?P<level>#{1,6})(?P<header>(?:\\.|[^\\])*?)#*(?:\n|$)')
def test(self, parent, block):
return bool(self.RE.search(block))
@@ -431,7 +462,7 @@
# recursively parse this lines as a block.
self.parser.parseBlocks(parent, [before])
# Create header using named groups from RE
- h = util.etree.SubElement(parent, 'h%d' % len(m.group('level')))
+ h = etree.SubElement(parent, 'h%d' % len(m.group('level')))
h.text = m.group('header').strip()
if after:
# Insert remaining lines as first block for future parsing.
@@ -457,7 +488,7 @@
level = 1
else:
level = 2
- h = util.etree.SubElement(parent, 'h%d' % level)
+ h = etree.SubElement(parent, 'h%d' % level)
h.text = lines[0].strip()
if len(lines) > 2:
# Block contains additional lines. Add to master blocks for later.
@@ -467,16 +498,15 @@
class HRProcessor(BlockProcessor):
""" Process Horizontal Rules. """
- RE = r'^[ ]{0,3}((-+[ ]{0,2}){3,}|(_+[ ]{0,2}){3,}|(\*+[ ]{0,2}){3,})[ ]*'
+ # Python's re module doesn't officially support atomic grouping. However you can fake it.
+ # See https://stackoverflow.com/a/13577411/866026
+ RE = r'^[ ]{0,3}(?=(?P<atomicgroup>(-+[ ]{0,2}){3,}|(_+[ ]{0,2}){3,}|(\*+[ ]{0,2}){3,}))(?P=atomicgroup)[ ]*$'
# Detect hr on any line of a block.
SEARCH_RE = re.compile(RE, re.MULTILINE)
def test(self, parent, block):
m = self.SEARCH_RE.search(block)
- # No atomic grouping in python so we simulate it here for performance.
- # The regex only matches what would be in the atomic group - the HR.
- # Then check if we are at end of block or if next char is a newline.
- if m and (m.end() == len(block) or block[m.end()] == '\n'):
+ if m:
# Save match object on class instance so we can use it later.
self.match = m
return True
@@ -484,15 +514,16 @@
def run(self, parent, blocks):
block = blocks.pop(0)
+ match = self.match
# Check for lines in block before hr.
- prelines = block[:self.match.start()].rstrip('\n')
+ prelines = block[:match.start()].rstrip('\n')
if prelines:
# Recursively parse lines before hr so they get parsed first.
self.parser.parseBlocks(parent, [prelines])
# create hr
- util.etree.SubElement(parent, 'hr')
+ etree.SubElement(parent, 'hr')
# check for lines in block after hr.
- postlines = block[self.match.end():].lstrip('\n')
+ postlines = block[match.end():].lstrip('\n')
if postlines:
# Add lines after hr to master blocks for later parsing.
blocks.insert(0, postlines)
@@ -521,10 +552,39 @@
len(sibling) and sibling[0].tag == 'code'):
# Last block is a codeblock. Append to preserve whitespace.
sibling[0].text = util.AtomicString(
- '%s%s' % (sibling[0].text, filler)
+ '{}{}'.format(sibling[0].text, filler)
)
+class ReferenceProcessor(BlockProcessor):
+ """ Process link references. """
+ RE = re.compile(
+ r'^[ ]{0,3}\[([^\]]*)\]:[ ]*\n?[ ]*([^\s]+)[ ]*\n?[ ]*((["\'])(.*)\4|\((.*)\))?[ ]*$', re.MULTILINE
+ )
+
+ def test(self, parent, block):
+ return True
+
+ def run(self, parent, blocks):
+ block = blocks.pop(0)
+ m = self.RE.search(block)
+ if m:
+ id = m.group(1).strip().lower()
+ link = m.group(2).lstrip('<').rstrip('>')
+ title = m.group(5) or m.group(6)
+ self.parser.md.references[id] = (link, title)
+ if block[m.end():].strip():
+ # Add any content after match back to blocks as separate block
+ blocks.insert(0, block[m.end():].lstrip('\n'))
+ if block[:m.start()].strip():
+ # Add any content before match back to blocks as separate block
+ blocks.insert(0, block[:m.start()].rstrip('\n'))
+ return True
+ # No match. Restore block.
+ blocks.insert(0, block)
+ return False
+
+
class ParagraphProcessor(BlockProcessor):
""" Process Paragraph blocks. """
@@ -548,16 +608,16 @@
if sibling is not None:
# Insetrt after sibling.
if sibling.tail:
- sibling.tail = '%s\n%s' % (sibling.tail, block)
+ sibling.tail = '{}\n{}'.format(sibling.tail, block)
else:
sibling.tail = '\n%s' % block
else:
# Append to parent.text
if parent.text:
- parent.text = '%s\n%s' % (parent.text, block)
+ parent.text = '{}\n{}'.format(parent.text, block)
else:
parent.text = block.lstrip()
else:
# Create a regular paragraph
- p = util.etree.SubElement(parent, 'p')
+ p = etree.SubElement(parent, 'p')
p.text = block.lstrip()
diff --git a/markdown/core.py b/markdown/core.py
new file mode 100644
index 0000000..2f7f2d5
--- /dev/null
+++ b/markdown/core.py
@@ -0,0 +1,407 @@
+"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+"""
+
+import codecs
+import sys
+import logging
+import importlib
+from . import util
+from .preprocessors import build_preprocessors
+from .blockprocessors import build_block_parser
+from .treeprocessors import build_treeprocessors
+from .inlinepatterns import build_inlinepatterns
+from .postprocessors import build_postprocessors
+from .extensions import Extension
+from .serializers import to_html_string, to_xhtml_string
+
+__all__ = ['Markdown', 'markdown', 'markdownFromFile']
+
+
+logger = logging.getLogger('MARKDOWN')
+
+
+class Markdown:
+ """Convert Markdown to HTML."""
+
+ doc_tag = "div" # Element used to wrap document - later removed
+
+ output_formats = {
+ 'html': to_html_string,
+ 'xhtml': to_xhtml_string,
+ }
+
+ def __init__(self, **kwargs):
+ """
+ Creates a new Markdown instance.
+
+ Keyword arguments:
+
+ * extensions: A list of extensions.
+ If an item is an instance of a subclass of `markdown.extensions.Extension`, the instance will be used
+ as-is. If an item is of type string, first an entry point will be loaded. If that fails, the string is
+ assumed to use Python dot notation (`path.to.module:ClassName`) to load a markdown.Extension subclass. If
+ no class is specified, then a `makeExtension` function is called within the specified module.
+ * extension_configs: Configuration settings for extensions.
+ * output_format: Format of output. Supported formats are:
+ * "xhtml": Outputs XHTML style tags. Default.
+ * "html": Outputs HTML style tags.
+ * tab_length: Length of tabs in the source. Default: 4
+
+ """
+
+ self.tab_length = kwargs.get('tab_length', 4)
+
+ self.ESCAPED_CHARS = ['\\', '`', '*', '_', '{', '}', '[', ']',
+ '(', ')', '>', '#', '+', '-', '.', '!']
+
+ self.block_level_elements = [
+ # Elements which are invalid to wrap in a `<p>` tag.
+ # See https://w3c.github.io/html/grouping-content.html#the-p-element
+ 'address', 'article', 'aside', 'blockquote', 'details', 'div', 'dl',
+ 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3',
+ 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'main', 'menu', 'nav', 'ol',
+ 'p', 'pre', 'section', 'table', 'ul',
+ # Other elements which Markdown should not be mucking up the contents of.
+ 'canvas', 'colgroup', 'dd', 'body', 'dt', 'group', 'iframe', 'li', 'legend',
+ 'math', 'map', 'noscript', 'output', 'object', 'option', 'progress', 'script',
+ 'style', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'video'
+ ]
+
+ self.registeredExtensions = []
+ self.docType = ""
+ self.stripTopLevelTags = True
+
+ self.build_parser()
+
+ self.references = {}
+ self.htmlStash = util.HtmlStash()
+ self.registerExtensions(extensions=kwargs.get('extensions', []),
+ configs=kwargs.get('extension_configs', {}))
+ self.set_output_format(kwargs.get('output_format', 'xhtml'))
+ self.reset()
+
+ def build_parser(self):
+ """ Build the parser from the various parts. """
+ self.preprocessors = build_preprocessors(self)
+ self.parser = build_block_parser(self)
+ self.inlinePatterns = build_inlinepatterns(self)
+ self.treeprocessors = build_treeprocessors(self)
+ self.postprocessors = build_postprocessors(self)
+ return self
+
+ def registerExtensions(self, extensions, configs):
+ """
+ Register extensions with this instance of Markdown.
+
+ Keyword arguments:
+
+ * extensions: A list of extensions, which can either
+ be strings or objects.
+ * configs: A dictionary mapping extension names to config options.
+
+ """
+ for ext in extensions:
+ if isinstance(ext, str):
+ ext = self.build_extension(ext, configs.get(ext, {}))
+ if isinstance(ext, Extension):
+ ext._extendMarkdown(self)
+ logger.debug(
+ 'Successfully loaded extension "%s.%s".'
+ % (ext.__class__.__module__, ext.__class__.__name__)
+ )
+ elif ext is not None:
+ raise TypeError(
+ 'Extension "{}.{}" must be of type: "{}.{}"'.format(
+ ext.__class__.__module__, ext.__class__.__name__,
+ Extension.__module__, Extension.__name__
+ )
+ )
+ return self
+
+ def build_extension(self, ext_name, configs):
+ """
+ Build extension from a string name, then return an instance.
+
+ First attempt to load an entry point. The string name must be registered as an entry point in the
+ `markdown.extensions` group which points to a subclass of the `markdown.extensions.Extension` class.
+ If multiple distributions have registered the same name, the first one found is returned.
+
+ If no entry point is found, assume dot notation (`path.to.module:ClassName`). Load the specified class and
+ return an instance. If no class is specified, import the module and call a `makeExtension` function and return
+ the Extension instance returned by that function.
+ """
+ configs = dict(configs)
+
+ entry_points = [ep for ep in util.INSTALLED_EXTENSIONS if ep.name == ext_name]
+ if entry_points:
+ ext = entry_points[0].load()
+ return ext(**configs)
+
+ # Get class name (if provided): `path.to.module:ClassName`
+ ext_name, class_name = ext_name.split(':', 1) if ':' in ext_name else (ext_name, '')
+
+ try:
+ module = importlib.import_module(ext_name)
+ logger.debug(
+ 'Successfully imported extension module "%s".' % ext_name
+ )
+ except ImportError as e:
+ message = 'Failed loading extension "%s".' % ext_name
+ e.args = (message,) + e.args[1:]
+ raise
+
+ if class_name:
+ # Load given class name from module.
+ return getattr(module, class_name)(**configs)
+ else:
+ # Expect makeExtension() function to return an Extension instance.
+ try:
+ return module.makeExtension(**configs)
+ except AttributeError as e:
+ message = e.args[0]
+ message = "Failed to initiate extension " \
+ "'%s': %s" % (ext_name, message)
+ e.args = (message,) + e.args[1:]
+ raise
+
+ def registerExtension(self, extension):
+ """ This gets called by the extension """
+ self.registeredExtensions.append(extension)
+ return self
+
+ def reset(self):
+ """
+ Resets all state variables so that we can start with a new text.
+ """
+ self.htmlStash.reset()
+ self.references.clear()
+
+ for extension in self.registeredExtensions:
+ if hasattr(extension, 'reset'):
+ extension.reset()
+
+ return self
+
+ def set_output_format(self, format):
+ """ Set the output format for the class instance. """
+ self.output_format = format.lower().rstrip('145') # ignore num
+ try:
+ self.serializer = self.output_formats[self.output_format]
+ except KeyError as e:
+ valid_formats = list(self.output_formats.keys())
+ valid_formats.sort()
+ message = 'Invalid Output Format: "%s". Use one of %s.' \
+ % (self.output_format,
+ '"' + '", "'.join(valid_formats) + '"')
+ e.args = (message,) + e.args[1:]
+ raise
+ return self
+
+ def is_block_level(self, tag):
+ """Check if the tag is a block level HTML tag."""
+ if isinstance(tag, str):
+ return tag.lower().rstrip('/') in self.block_level_elements
+ # Some ElementTree tags are not strings, so return False.
+ return False
+
+ def convert(self, source):
+ """
+ Convert markdown to serialized XHTML or HTML.
+
+ Keyword arguments:
+
+ * source: Source text as a Unicode string.
+
+ Markdown processing takes place in five steps:
+
+ 1. A bunch of "preprocessors" munge the input text.
+ 2. BlockParser() parses the high-level structural elements of the
+ pre-processed text into an ElementTree.
+ 3. A bunch of "treeprocessors" are run against the ElementTree. One
+ such treeprocessor runs InlinePatterns against the ElementTree,
+ detecting inline markup.
+ 4. Some post-processors are run against the text after the ElementTree
+ has been serialized into text.
+ 5. The output is written to a string.
+
+ """
+
+ # Fixup the source text
+ if not source.strip():
+ return '' # a blank unicode string
+
+ try:
+ source = str(source)
+ except UnicodeDecodeError as e: # pragma: no cover
+ # Customise error message while maintaining original traceback
+ e.reason += '. -- Note: Markdown only accepts unicode input!'
+ raise
+
+ # Split into lines and run the line preprocessors.
+ self.lines = source.split("\n")
+ for prep in self.preprocessors:
+ self.lines = prep.run(self.lines)
+
+ # Parse the high-level elements.
+ root = self.parser.parseDocument(self.lines).getroot()
+
+ # Run the tree-processors
+ for treeprocessor in self.treeprocessors:
+ newRoot = treeprocessor.run(root)
+ if newRoot is not None:
+ root = newRoot
+
+ # Serialize _properly_. Strip top-level tags.
+ output = self.serializer(root)
+ if self.stripTopLevelTags:
+ try:
+ start = output.index(
+ '<%s>' % self.doc_tag) + len(self.doc_tag) + 2
+ end = output.rindex('</%s>' % self.doc_tag)
+ output = output[start:end].strip()
+ except ValueError as e: # pragma: no cover
+ if output.strip().endswith('<%s />' % self.doc_tag):
+ # We have an empty document
+ output = ''
+ else:
+ # We have a serious problem
+ raise ValueError('Markdown failed to strip top-level '
+ 'tags. Document=%r' % output.strip()) from e
+
+ # Run the text post-processors
+ for pp in self.postprocessors:
+ output = pp.run(output)
+
+ return output.strip()
+
+ def convertFile(self, input=None, output=None, encoding=None):
+ """Converts a markdown file and writes the HTML to a file or stream.
+
+ Decodes the file using the provided encoding (defaults to utf-8),
+ passes the file content to markdown, and outputs the html to either
+ the provided stream or the file with provided name, using the same
+ encoding as the source file. The 'xmlcharrefreplace' error handler is
+ used when encoding the output.
+
+ **Note:** This is the only place that decoding and encoding of unicode
+ takes place in Python-Markdown. (All other code is unicode-in /
+ unicode-out.)
+
+ Keyword arguments:
+
+ * input: File object or path. Reads from stdin if `None`.
+ * output: File object or path. Writes to stdout if `None`.
+ * encoding: Encoding of input and output files. Defaults to utf-8.
+
+ """
+
+ encoding = encoding or "utf-8"
+
+ # Read the source
+ if input:
+ if isinstance(input, str):
+ input_file = codecs.open(input, mode="r", encoding=encoding)
+ else:
+ input_file = codecs.getreader(encoding)(input)
+ text = input_file.read()
+ input_file.close()
+ else:
+ text = sys.stdin.read()
+ if not isinstance(text, str): # pragma: no cover
+ text = text.decode(encoding)
+
+ text = text.lstrip('\ufeff') # remove the byte-order mark
+
+ # Convert
+ html = self.convert(text)
+
+ # Write to file or stdout
+ if output:
+ if isinstance(output, str):
+ output_file = codecs.open(output, "w",
+ encoding=encoding,
+ errors="xmlcharrefreplace")
+ output_file.write(html)
+ output_file.close()
+ else:
+ writer = codecs.getwriter(encoding)
+ output_file = writer(output, errors="xmlcharrefreplace")
+ output_file.write(html)
+ # Don't close here. User may want to write more.
+ else:
+ # Encode manually and write bytes to stdout.
+ html = html.encode(encoding, "xmlcharrefreplace")
+ try:
+ # Write bytes directly to buffer (Python 3).
+ sys.stdout.buffer.write(html)
+ except AttributeError: # pragma: no cover
+ # Probably Python 2, which works with bytes by default.
+ sys.stdout.write(html)
+
+ return self
+
+
+"""
+EXPORTED FUNCTIONS
+=============================================================================
+
+Those are the two functions we really mean to export: markdown() and
+markdownFromFile().
+"""
+
+
+def markdown(text, **kwargs):
+ """Convert a markdown string to HTML and return HTML as a unicode string.
+
+ This is a shortcut function for `Markdown` class to cover the most
+ basic use case. It initializes an instance of Markdown, loads the
+ necessary extensions and runs the parser on the given text.
+
+ Keyword arguments:
+
+ * text: Markdown formatted text as Unicode or ASCII string.
+ * Any arguments accepted by the Markdown class.
+
+ Returns: An HTML document as a string.
+
+ """
+ md = Markdown(**kwargs)
+ return md.convert(text)
+
+
+def markdownFromFile(**kwargs):
+ """Read markdown code from a file and write it to a file or a stream.
+
+ This is a shortcut function which initializes an instance of Markdown,
+ and calls the convertFile method rather than convert.
+
+ Keyword arguments:
+
+ * input: a file name or readable object.
+ * output: a file name or writable object.
+ * encoding: Encoding of input and output.
+ * Any arguments accepted by the Markdown class.
+
+ """
+ md = Markdown(**kwargs)
+ md.convertFile(kwargs.get('input', None),
+ kwargs.get('output', None),
+ kwargs.get('encoding', None))
diff --git a/markdown/extensions/__init__.py b/markdown/extensions/__init__.py
index 6e7a08a..4bc8e5f 100644
--- a/markdown/extensions/__init__.py
+++ b/markdown/extensions/__init__.py
@@ -1,14 +1,29 @@
"""
-Extensions
------------------------------------------------------------------------------
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
"""
-from __future__ import unicode_literals
-from ..util import parseBoolValue
import warnings
+from ..util import parseBoolValue
-class Extension(object):
+class Extension:
""" Base class for extensions to subclass. """
# Default config -- to be overriden by a subclass
@@ -20,34 +35,8 @@
# if a default is not set here.
config = {}
- def __init__(self, *args, **kwargs):
+ def __init__(self, **kwargs):
""" Initiate Extension and set up configs. """
-
- # check for configs arg for backward compat.
- # (there only ever used to be one so we use arg[0])
- if len(args):
- if args[0] is not None:
- self.setConfigs(args[0])
- warnings.warn('Extension classes accepting positional args is '
- 'pending Deprecation. Each setting should be '
- 'passed into the Class as a keyword. Positional '
- 'args are deprecated and will raise '
- 'an error in version 2.7. See the Release Notes for '
- 'Python-Markdown version 2.6 for more info.',
- DeprecationWarning)
- # check for configs kwarg for backward compat.
- if 'configs' in kwargs.keys():
- if kwargs['configs'] is not None:
- self.setConfigs(kwargs.pop('configs', {}))
- warnings.warn('Extension classes accepting a dict on the single '
- 'keyword "config" is pending Deprecation. Each '
- 'setting should be passed into the Class as a '
- 'keyword directly. The "config" keyword is '
- 'deprecated and raise an error in '
- 'version 2.7. See the Release Notes for '
- 'Python-Markdown version 2.6 for more info.',
- DeprecationWarning)
- # finally, use kwargs
self.setConfigs(kwargs)
def getConfig(self, key, default=''):
@@ -59,7 +48,7 @@
def getConfigs(self):
""" Return all configs settings as a dict. """
- return dict([(key, self.getConfig(key)) for key in self.config.keys()])
+ return {key: self.getConfig(key) for key in self.config.keys()}
def getConfigInfo(self):
""" Return all config descriptions as a list of tuples. """
@@ -81,7 +70,25 @@
for key, value in items:
self.setConfig(key, value)
- def extendMarkdown(self, md, md_globals):
+ def _extendMarkdown(self, *args):
+ """ Private wrapper around extendMarkdown. """
+ md = args[0]
+ try:
+ self.extendMarkdown(md)
+ except TypeError as e:
+ if "missing 1 required positional argument" in str(e):
+ # Must be a 2.x extension. Pass in a dummy md_globals.
+ self.extendMarkdown(md, {})
+ warnings.warn(
+ "The 'md_globals' parameter of '{}.{}.extendMarkdown' is "
+ "deprecated.".format(self.__class__.__module__, self.__class__.__name__),
+ category=DeprecationWarning,
+ stacklevel=2
+ )
+ else:
+ raise
+
+ def extendMarkdown(self, md):
"""
Add the various proccesors and patterns to the Markdown Instance.
diff --git a/markdown/extensions/abbr.py b/markdown/extensions/abbr.py
index 353d126..9879314 100644
--- a/markdown/extensions/abbr.py
+++ b/markdown/extensions/abbr.py
@@ -4,7 +4,7 @@
This extension adds abbreviation handling to Python-Markdown.
-See <https://pythonhosted.org/Markdown/extensions/abbreviations.html>
+See <https://Python-Markdown.github.io/extensions/abbreviations>
for documentation.
Oringinal code Copyright 2007-2008 [Waylan Limberg](http://achinghead.com/) and
@@ -12,50 +12,58 @@
All changes Copyright 2008-2014 The Python Markdown Project
-License: [BSD](http://www.opensource.org/licenses/bsd-license.php)
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
'''
-from __future__ import absolute_import
-from __future__ import unicode_literals
from . import Extension
-from ..preprocessors import Preprocessor
-from ..inlinepatterns import Pattern
-from ..util import etree, AtomicString
+from ..blockprocessors import BlockProcessor
+from ..inlinepatterns import InlineProcessor
+from ..util import AtomicString
import re
-
-# Global Vars
-ABBR_REF_RE = re.compile(r'[*]\[(?P<abbr>[^\]]*)\][ ]?:\s*(?P<title>.*)')
+import xml.etree.ElementTree as etree
class AbbrExtension(Extension):
""" Abbreviation Extension for Python-Markdown. """
- def extendMarkdown(self, md, md_globals):
+ def extendMarkdown(self, md):
""" Insert AbbrPreprocessor before ReferencePreprocessor. """
- md.preprocessors.add('abbr', AbbrPreprocessor(md), '<reference')
+ md.parser.blockprocessors.register(AbbrPreprocessor(md.parser), 'abbr', 16)
-class AbbrPreprocessor(Preprocessor):
+class AbbrPreprocessor(BlockProcessor):
""" Abbreviation Preprocessor - parse text for abbr references. """
- def run(self, lines):
+ RE = re.compile(r'^[*]\[(?P<abbr>[^\]]*)\][ ]?:[ ]*\n?[ ]*(?P<title>.*)$', re.MULTILINE)
+
+ def test(self, parent, block):
+ return True
+
+ def run(self, parent, blocks):
'''
Find and remove all Abbreviation references from the text.
Each reference is set as a new AbbrPattern in the markdown instance.
'''
- new_text = []
- for line in lines:
- m = ABBR_REF_RE.match(line)
- if m:
- abbr = m.group('abbr').strip()
- title = m.group('title').strip()
- self.markdown.inlinePatterns['abbr-%s' % abbr] = \
- AbbrPattern(self._generate_pattern(abbr), title)
- else:
- new_text.append(line)
- return new_text
+ block = blocks.pop(0)
+ m = self.RE.search(block)
+ if m:
+ abbr = m.group('abbr').strip()
+ title = m.group('title').strip()
+ self.parser.md.inlinePatterns.register(
+ AbbrInlineProcessor(self._generate_pattern(abbr), title), 'abbr-%s' % abbr, 2
+ )
+ if block[m.end():].strip():
+ # Add any content after match back to blocks as separate block
+ blocks.insert(0, block[m.end():].lstrip('\n'))
+ if block[:m.start()].strip():
+ # Add any content before match back to blocks as separate block
+ blocks.insert(0, block[:m.start()].rstrip('\n'))
+ return True
+ # No match. Restore block.
+ blocks.insert(0, block)
+ return False
def _generate_pattern(self, text):
'''
@@ -73,19 +81,19 @@
return r'(?P<abbr>\b%s\b)' % (r''.join(chars))
-class AbbrPattern(Pattern):
+class AbbrInlineProcessor(InlineProcessor):
""" Abbreviation inline pattern. """
def __init__(self, pattern, title):
- super(AbbrPattern, self).__init__(pattern)
+ super().__init__(pattern)
self.title = title
- def handleMatch(self, m):
+ def handleMatch(self, m, data):
abbr = etree.Element('abbr')
abbr.text = AtomicString(m.group('abbr'))
abbr.set('title', self.title)
- return abbr
+ return abbr, m.start(0), m.end(0)
-def makeExtension(*args, **kwargs):
- return AbbrExtension(*args, **kwargs)
+def makeExtension(**kwargs): # pragma: no cover
+ return AbbrExtension(**kwargs)
diff --git a/markdown/extensions/admonition.py b/markdown/extensions/admonition.py
index 76e0fb5..9c66b4f 100644
--- a/markdown/extensions/admonition.py
+++ b/markdown/extensions/admonition.py
@@ -6,68 +6,141 @@
[rST]: http://docutils.sourceforge.net/docs/ref/rst/directives.html#specific-admonitions # noqa
-See <https://pythonhosted.org/Markdown/extensions/admonition.html>
+See <https://Python-Markdown.github.io/extensions/admonition>
for documentation.
-Original code Copyright [Tiago Serafim](http://www.tiagoserafim.com/).
+Original code Copyright [Tiago Serafim](https://www.tiagoserafim.com/).
All changes Copyright The Python Markdown Project
-License: [BSD](http://www.opensource.org/licenses/bsd-license.php)
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
"""
-from __future__ import absolute_import
-from __future__ import unicode_literals
from . import Extension
from ..blockprocessors import BlockProcessor
-from ..util import etree
+import xml.etree.ElementTree as etree
import re
class AdmonitionExtension(Extension):
""" Admonition extension for Python-Markdown. """
- def extendMarkdown(self, md, md_globals):
+ def extendMarkdown(self, md):
""" Add Admonition to Markdown instance. """
md.registerExtension(self)
- md.parser.blockprocessors.add('admonition',
- AdmonitionProcessor(md.parser),
- '_begin')
+ md.parser.blockprocessors.register(AdmonitionProcessor(md.parser), 'admonition', 105)
class AdmonitionProcessor(BlockProcessor):
CLASSNAME = 'admonition'
CLASSNAME_TITLE = 'admonition-title'
- RE = re.compile(r'(?:^|\n)!!!\ ?([\w\-]+)(?:\ "(.*?)")?')
+ RE = re.compile(r'(?:^|\n)!!! ?([\w\-]+(?: +[\w\-]+)*)(?: +"(.*?)")? *(?:\n|$)')
+ RE_SPACES = re.compile(' +')
+
+ def __init__(self, parser):
+ """Initialization."""
+
+ super().__init__(parser)
+
+ self.current_sibling = None
+ self.content_indention = 0
+
+ def parse_content(self, parent, block):
+ """Get sibling admontion.
+
+ Retrieve the appropriate sibling element. This can get tricky when
+ dealing with lists.
+
+ """
+
+ old_block = block
+ the_rest = ''
+
+ # We already acquired the block via test
+ if self.current_sibling is not None:
+ sibling = self.current_sibling
+ block, the_rest = self.detab(block, self.content_indent)
+ self.current_sibling = None
+ self.content_indent = 0
+ return sibling, block, the_rest
+
+ sibling = self.lastChild(parent)
+
+ if sibling is None or sibling.get('class', '').find(self.CLASSNAME) == -1:
+ sibling = None
+ else:
+ # If the last child is a list and the content is indented sufficiently
+ # to be under it, then the content's sibling is in the list.
+ last_child = self.lastChild(sibling)
+ indent = 0
+ while last_child:
+ if (
+ sibling and block.startswith(' ' * self.tab_length * 2) and
+ last_child and last_child.tag in ('ul', 'ol', 'dl')
+ ):
+
+ # The expectation is that we'll find an <li> or <dt>.
+ # We should get its last child as well.
+ sibling = self.lastChild(last_child)
+ last_child = self.lastChild(sibling) if sibling else None
+
+ # Context has been lost at this point, so we must adjust the
+ # text's indentation level so it will be evaluated correctly
+ # under the list.
+ block = block[self.tab_length:]
+ indent += self.tab_length
+ else:
+ last_child = None
+
+ if not block.startswith(' ' * self.tab_length):
+ sibling = None
+
+ if sibling is not None:
+ indent += self.tab_length
+ block, the_rest = self.detab(old_block, indent)
+ self.current_sibling = sibling
+ self.content_indent = indent
+
+ return sibling, block, the_rest
def test(self, parent, block):
- sibling = self.lastChild(parent)
- return self.RE.search(block) or \
- (block.startswith(' ' * self.tab_length) and sibling is not None and
- sibling.get('class', '').find(self.CLASSNAME) != -1)
+
+ if self.RE.search(block):
+ return True
+ else:
+ return self.parse_content(parent, block)[0] is not None
def run(self, parent, blocks):
- sibling = self.lastChild(parent)
block = blocks.pop(0)
m = self.RE.search(block)
if m:
- block = block[m.end() + 1:] # removes the first line
-
- block, theRest = self.detab(block)
+ if m.start() > 0:
+ self.parser.parseBlocks(parent, [block[:m.start()]])
+ block = block[m.end():] # removes the first line
+ block, theRest = self.detab(block)
+ else:
+ sibling, block, theRest = self.parse_content(parent, block)
if m:
klass, title = self.get_class_and_title(m)
div = etree.SubElement(parent, 'div')
- div.set('class', '%s %s' % (self.CLASSNAME, klass))
+ div.set('class', '{} {}'.format(self.CLASSNAME, klass))
if title:
p = etree.SubElement(div, 'p')
p.text = title
p.set('class', self.CLASSNAME_TITLE)
else:
+ # Sibling is a list item, but its content should be wrapped in <p>
+ if sibling.tag in ('li', 'dd') and sibling.text:
+ text = sibling.text
+ sibling.text = ''
+ p = etree.SubElement(sibling, 'p')
+ p.text = text
+
div = sibling
self.parser.parseChunk(div, block)
@@ -80,11 +153,12 @@
def get_class_and_title(self, match):
klass, title = match.group(1).lower(), match.group(2)
+ klass = self.RE_SPACES.sub(' ', klass)
if title is None:
# no title was provided, use the capitalized classname as title
# e.g.: `!!! note` will render
# `<p class="admonition-title">Note</p>`
- title = klass.capitalize()
+ title = klass.split(' ', 1)[0].capitalize()
elif title == '':
# an explicit blank title should not be rendered
# e.g.: `!!! warning ""` will *not* render `p` with a title
@@ -92,5 +166,5 @@
return klass, title
-def makeExtension(*args, **kwargs):
- return AdmonitionExtension(*args, **kwargs)
+def makeExtension(**kwargs): # pragma: no cover
+ return AdmonitionExtension(**kwargs)
diff --git a/markdown/extensions/attr_list.py b/markdown/extensions/attr_list.py
index 683bdf8..9a67551 100644
--- a/markdown/extensions/attr_list.py
+++ b/markdown/extensions/attr_list.py
@@ -6,43 +6,34 @@
[maruku](http://maruku.rubyforge.org/proposal.html#attribute_lists)'s
feature of the same name.
-See <https://pythonhosted.org/Markdown/extensions/attr_list.html>
+See <https://Python-Markdown.github.io/extensions/attr_list>
for documentation.
Original code Copyright 2011 [Waylan Limberg](http://achinghead.com/).
All changes Copyright 2011-2014 The Python Markdown Project
-License: [BSD](http://www.opensource.org/licenses/bsd-license.php)
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
"""
-from __future__ import absolute_import
-from __future__ import unicode_literals
from . import Extension
from ..treeprocessors import Treeprocessor
-from ..util import isBlockLevel
import re
-try:
- Scanner = re.Scanner
-except AttributeError: # pragma: no cover
- # must be on Python 2.4
- from sre import Scanner
-
def _handle_double_quote(s, t):
- k, v = t.split('=')
+ k, v = t.split('=', 1)
return k, v.strip('"')
def _handle_single_quote(s, t):
- k, v = t.split('=')
+ k, v = t.split('=', 1)
return k, v.strip("'")
def _handle_key_value(s, t):
- return t.split('=')
+ return t.split('=', 1)
def _handle_word(s, t):
@@ -52,10 +43,11 @@
return 'id', t[1:]
return t, t
-_scanner = Scanner([
- (r'[^ ]+=".*?"', _handle_double_quote),
- (r"[^ ]+='.*?'", _handle_single_quote),
- (r'[^ ]+=[^ =]+', _handle_key_value),
+
+_scanner = re.Scanner([
+ (r'[^ =]+=".*?"', _handle_double_quote),
+ (r"[^ =]+='.*?'", _handle_single_quote),
+ (r'[^ =]+=[^ =]+', _handle_key_value),
(r'[^ =]+', _handle_word),
(r' ', None)
])
@@ -72,10 +64,10 @@
class AttrListTreeprocessor(Treeprocessor):
- BASE_RE = r'\{\:?([^\}]*)\}'
- HEADER_RE = re.compile(r'[ ]+%s[ ]*$' % BASE_RE)
- BLOCK_RE = re.compile(r'\n[ ]*%s[ ]*$' % BASE_RE)
- INLINE_RE = re.compile(r'^%s' % BASE_RE)
+ BASE_RE = r'\{\:?[ ]*([^\}\n ][^\}\n]*)[ ]*\}'
+ HEADER_RE = re.compile(r'[ ]+{}[ ]*$'.format(BASE_RE))
+ BLOCK_RE = re.compile(r'\n[ ]*{}[ ]*$'.format(BASE_RE))
+ INLINE_RE = re.compile(r'^{}'.format(BASE_RE))
NAME_RE = re.compile(r'[^A-Z_a-z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02ff'
r'\u0370-\u037d\u037f-\u1fff\u200c-\u200d'
r'\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff'
@@ -83,12 +75,12 @@
r'\:\-\.0-9\u00b7\u0300-\u036f\u203f-\u2040]+')
def run(self, doc):
- for elem in doc.getiterator():
- if isBlockLevel(elem.tag):
+ for elem in doc.iter():
+ if self.md.is_block_level(elem.tag):
# Block level: check for attrs on last line of text
RE = self.BLOCK_RE
- if isheader(elem) or elem.tag == 'dt':
- # header or def-term: check for attrs at end of line
+ if isheader(elem) or elem.tag in ['dt', 'td', 'th']:
+ # header, def-term, or table cell: check for attrs at end of element
RE = self.HEADER_RE
if len(elem) and elem.tag == 'li':
# special case list items. children may include a ul or ol.
@@ -128,8 +120,6 @@
elif elem.text:
# no children. Get from text.
m = RE.search(elem.text)
- if not m and elem.tag == 'td':
- m = re.search(self.BASE_RE, elem.text)
if m:
self.assign_attrs(elem, m.group(1))
elem.text = elem.text[:m.start()]
@@ -151,7 +141,7 @@
# add to class
cls = elem.get('class')
if cls:
- elem.set('class', '%s %s' % (cls, v))
+ elem.set('class', '{} {}'.format(cls, v))
else:
elem.set('class', v)
else:
@@ -161,17 +151,16 @@
def sanitize_name(self, name):
"""
Sanitize name as 'an XML Name, minus the ":"'.
- See http://www.w3.org/TR/REC-xml-names/#NT-NCName
+ See https://www.w3.org/TR/REC-xml-names/#NT-NCName
"""
return self.NAME_RE.sub('_', name)
class AttrListExtension(Extension):
- def extendMarkdown(self, md, md_globals):
- md.treeprocessors.add(
- 'attr_list', AttrListTreeprocessor(md), '>prettify'
- )
+ def extendMarkdown(self, md):
+ md.treeprocessors.register(AttrListTreeprocessor(md), 'attr_list', 8)
+ md.registerExtension(self)
-def makeExtension(*args, **kwargs):
- return AttrListExtension(*args, **kwargs)
+def makeExtension(**kwargs): # pragma: no cover
+ return AttrListExtension(**kwargs)
diff --git a/markdown/extensions/codehilite.py b/markdown/extensions/codehilite.py
index 0657c37..9eed561 100644
--- a/markdown/extensions/codehilite.py
+++ b/markdown/extensions/codehilite.py
@@ -4,28 +4,27 @@
Adds code/syntax highlighting to standard Python-Markdown code blocks.
-See <https://pythonhosted.org/Markdown/extensions/code_hilite.html>
+See <https://Python-Markdown.github.io/extensions/code_hilite>
for documentation.
Original code Copyright 2006-2008 [Waylan Limberg](http://achinghead.com/).
All changes Copyright 2008-2014 The Python Markdown Project
-License: [BSD](http://www.opensource.org/licenses/bsd-license.php)
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
"""
-from __future__ import absolute_import
-from __future__ import unicode_literals
from . import Extension
from ..treeprocessors import Treeprocessor
+from ..util import parseBoolValue
-try:
+try: # pragma: no cover
from pygments import highlight
from pygments.lexers import get_lexer_by_name, guess_lexer
from pygments.formatters import get_formatter_by_name
pygments = True
-except ImportError:
+except ImportError: # pragma: no cover
pygments = False
@@ -40,52 +39,78 @@
try:
return list(map(int, expr.split()))
- except ValueError:
+ except ValueError: # pragma: no cover
return []
# ------------------ The Main CodeHilite Class ----------------------
-class CodeHilite(object):
+class CodeHilite:
"""
- Determine language of source code, and pass it into pygments hilighter.
+ Determine language of source code, and pass it on to the Pygments highlighter.
- Basic Usage:
- >>> code = CodeHilite(src = 'some text')
- >>> html = code.hilite()
+ Usage:
+ code = CodeHilite(src=some_code, lang='python')
+ html = code.hilite()
+ Arguments:
* src: Source string or any object with a .readline attribute.
- * linenums: (Boolean) Set line numbering to 'on' (True),
- 'off' (False) or 'auto'(None). Set to 'auto' by default.
+ * lang: String name of Pygments lexer to use for highlighting. Default: `None`.
- * guess_lang: (Boolean) Turn language auto-detection
- 'on' or 'off' (on by default).
+ * guess_lang: Auto-detect which lexer to use. Ignored if `lang` is set to a valid
+ value. Default: `True`.
- * css_class: Set class name of wrapper div ('codehilite' by default).
+ * use_pygments: Pass code to pygments for code highlighting. If `False`, the code is
+ instead wrapped for highlighting by a JavaScript library. Default: `True`.
- * hl_lines: (List of integers) Lines to emphasize, 1-indexed.
+ * linenums: An alias to Pygments `linenos` formatter option. Default: `None`.
- Low Level Usage:
- >>> code = CodeHilite()
- >>> code.src = 'some text' # String or anything with a .readline attr.
- >>> code.linenos = True # Turns line numbering on or of.
- >>> html = code.hilite()
+ * css_class: An alias to Pygments `cssclass` formatter option. Default: 'codehilite'.
+
+ * lang_prefix: Prefix prepended to the language when `use_pygments` is `False`.
+ Default: "language-".
+
+ Other Options:
+ Any other options are accepted and passed on to the lexer and formatter. Therefore,
+ valid options include any options which are accepted by the `html` formatter or
+ whichever lexer the code's language uses. Note that most lexers do not have any
+ options. However, a few have very useful options, such as PHP's `startinline` option.
+ Any invalid options are ignored without error.
+
+ Formatter options: https://pygments.org/docs/formatters/#HtmlFormatter
+ Lexer Options: https://pygments.org/docs/lexers/
+
+ Advanced Usage:
+ code = CodeHilite(
+ src = some_code,
+ lang = 'php',
+ startinline = True, # Lexer option. Snippet does not start with `<?php`.
+ linenostart = 42, # Formatter option. Snippet starts on line 42.
+ hl_lines = [45, 49, 50], # Formatter option. Highlight lines 45, 49, and 50.
+ linenos = 'inline' # Formatter option. Avoid alignment problems.
+ )
+ html = code.hilite()
"""
- def __init__(self, src=None, linenums=None, guess_lang=True,
- css_class="codehilite", lang=None, style='default',
- noclasses=False, tab_length=4, hl_lines=None, use_pygments=True):
+ def __init__(self, src, **options):
self.src = src
- self.lang = lang
- self.linenums = linenums
- self.guess_lang = guess_lang
- self.css_class = css_class
- self.style = style
- self.noclasses = noclasses
- self.tab_length = tab_length
- self.hl_lines = hl_lines or []
- self.use_pygments = use_pygments
+ self.lang = options.pop('lang', None)
+ self.guess_lang = options.pop('guess_lang', True)
+ self.use_pygments = options.pop('use_pygments', True)
+ self.lang_prefix = options.pop('lang_prefix', 'language-')
+
+ if 'linenos' not in options:
+ options['linenos'] = options.pop('linenums', None)
+ if 'cssclass' not in options:
+ options['cssclass'] = options.pop('css_class', 'codehilite')
+ if 'wrapcode' not in options:
+ # Override pygments default
+ options['wrapcode'] = True
+ # Disallow use of `full` option
+ options['full'] = False
+
+ self.options = options
def hilite(self):
"""
@@ -105,21 +130,16 @@
if pygments and self.use_pygments:
try:
- lexer = get_lexer_by_name(self.lang)
+ lexer = get_lexer_by_name(self.lang, **self.options)
except ValueError:
try:
if self.guess_lang:
- lexer = guess_lexer(self.src)
+ lexer = guess_lexer(self.src, **self.options)
else:
- lexer = get_lexer_by_name('text')
- except ValueError:
- lexer = get_lexer_by_name('text')
- formatter = get_formatter_by_name('html',
- linenos=self.linenums,
- cssclass=self.css_class,
- style=self.style,
- noclasses=self.noclasses,
- hl_lines=self.hl_lines)
+ lexer = get_lexer_by_name('text', **self.options)
+ except ValueError: # pragma: no cover
+ lexer = get_lexer_by_name('text', **self.options)
+ formatter = get_formatter_by_name('html', **self.options)
return highlight(self.src, lexer, formatter)
else:
# just escape and build markup usable by JS highlighting libs
@@ -129,27 +149,30 @@
txt = txt.replace('"', '"')
classes = []
if self.lang:
- classes.append('language-%s' % self.lang)
- if self.linenums:
+ classes.append('{}{}'.format(self.lang_prefix, self.lang))
+ if self.options['linenos']:
classes.append('linenums')
class_str = ''
if classes:
- class_str = ' class="%s"' % ' '.join(classes)
- return '<pre class="%s"><code%s>%s</code></pre>\n' % \
- (self.css_class, class_str, txt)
+ class_str = ' class="{}"'.format(' '.join(classes))
+ return '<pre class="{}"><code{}>{}\n</code></pre>\n'.format(
+ self.options['cssclass'],
+ class_str,
+ txt
+ )
def _parseHeader(self):
"""
- Determines language of a code block from shebang line and whether said
- line should be removed or left in place. If the sheband line contains a
- path (even a single /) then it is assumed to be a real shebang line and
- left alone. However, if no path is given (e.i.: #!python or :::python)
- then it is assumed to be a mock shebang for language identifitation of
- a code fragment and removed from the code block prior to processing for
- code highlighting. When a mock shebang (e.i: #!python) is found, line
- numbering is turned on. When colons are found in place of a shebang
- (e.i.: :::python), line numbering is left in the current state - off
- by default.
+ Determines language of a code block from shebang line and whether the
+ said line should be removed or left in place. If the shebang line
+ contains a path (even a single /) then it is assumed to be a real
+ shebang line and left alone. However, if no path is given
+ (e.i.: #!python or :::python) then it is assumed to be a mock shebang
+ for language identification of a code fragment and removed from the
+ code block prior to processing for code highlighting. When a mock
+ shebang (e.i: #!python) is found, line numbering is turned on. When
+ colons are found in place of a shebang (e.i.: :::python), line
+ numbering is left in the current state - off by default.
Also parses optional list of highlight lines, like:
@@ -166,7 +189,7 @@
c = re.compile(r'''
(?:(?:^::+)|(?P<shebang>^[#]!)) # Shebang or 2 or more colons
(?P<path>(?:/\w+)*[/ ])? # Zero or 1 path
- (?P<lang>[\w+-]*) # The language
+ (?P<lang>[\w#.+-]*) # The language
\s* # Arbitrary whitespace
# Optional highlight lines, single- or double-quote-delimited
(hl_lines=(?P<quot>"|')(?P<hl_lines>.*?)(?P=quot))?
@@ -177,16 +200,16 @@
# we have a match
try:
self.lang = m.group('lang').lower()
- except IndexError:
+ except IndexError: # pragma: no cover
self.lang = None
if m.group('path'):
# path exists - restore first line
lines.insert(0, fl)
- if self.linenums is None and m.group('shebang'):
+ if self.options['linenos'] is None and m.group('shebang'):
# Overridable and Shebang exists - use line numbers
- self.linenums = True
+ self.options['linenos'] = True
- self.hl_lines = parse_hl_lines(m.group('hl_lines'))
+ self.options['hl_lines'] = parse_hl_lines(m.group('hl_lines'))
else:
# No match
lines.insert(0, fl)
@@ -200,23 +223,27 @@
class HiliteTreeprocessor(Treeprocessor):
""" Hilight source code in code blocks. """
+ def code_unescape(self, text):
+ """Unescape code."""
+ text = text.replace("<", "<")
+ text = text.replace(">", ">")
+ # Escaped '&' should be replaced at the end to avoid
+ # conflicting with < and >.
+ text = text.replace("&", "&")
+ return text
+
def run(self, root):
""" Find code blocks and store in htmlStash. """
blocks = root.iter('pre')
for block in blocks:
if len(block) == 1 and block[0].tag == 'code':
code = CodeHilite(
- block[0].text,
- linenums=self.config['linenums'],
- guess_lang=self.config['guess_lang'],
- css_class=self.config['css_class'],
- style=self.config['pygments_style'],
- noclasses=self.config['noclasses'],
- tab_length=self.markdown.tab_length,
- use_pygments=self.config['use_pygments']
+ self.code_unescape(block[0].text),
+ tab_length=self.md.tab_length,
+ style=self.config.pop('pygments_style', 'default'),
+ **self.config
)
- placeholder = self.markdown.htmlStash.store(code.hilite(),
- safe=True)
+ placeholder = self.md.htmlStash.store(code.hilite())
# Clear codeblock in etree instance
block.clear()
# Change to p element which will later
@@ -228,11 +255,11 @@
class CodeHiliteExtension(Extension):
""" Add source code hilighting to markdown codeblocks. """
- def __init__(self, *args, **kwargs):
+ def __init__(self, **kwargs):
# define default configs
self.config = {
'linenums': [None,
- "Use lines numbers. True=yes, False=no, None=auto"],
+ "Use lines numbers. True|table|inline=yes, False=no, None=auto"],
'guess_lang': [True,
"Automatic language detection - Default: True"],
'css_class': ["codehilite",
@@ -247,19 +274,34 @@
'use_pygments': [True,
'Use Pygments to Highlight code blocks. '
'Disable if using a JavaScript library. '
- 'Default: True']
+ 'Default: True'],
+ 'lang_prefix': [
+ 'language-',
+ 'Prefix prepended to the language when use_pygments is false. Default: "language-"'
+ ]
}
- super(CodeHiliteExtension, self).__init__(*args, **kwargs)
+ for key, value in kwargs.items():
+ if key in self.config:
+ self.setConfig(key, value)
+ else:
+ # manually set unknown keywords.
+ if isinstance(value, str):
+ try:
+ # Attempt to parse str as a bool value
+ value = parseBoolValue(value, preserve_none=True)
+ except ValueError:
+ pass # Assume it's not a bool value. Use as-is.
+ self.config[key] = [value, '']
- def extendMarkdown(self, md, md_globals):
+ def extendMarkdown(self, md):
""" Add HilitePostprocessor to Markdown instance. """
hiliter = HiliteTreeprocessor(md)
hiliter.config = self.getConfigs()
- md.treeprocessors.add("hilite", hiliter, "<inline")
+ md.treeprocessors.register(hiliter, 'hilite', 30)
md.registerExtension(self)
-def makeExtension(*args, **kwargs):
- return CodeHiliteExtension(*args, **kwargs)
+def makeExtension(**kwargs): # pragma: no cover
+ return CodeHiliteExtension(**kwargs)
diff --git a/markdown/extensions/def_list.py b/markdown/extensions/def_list.py
index 77cca6e..0e8e452 100644
--- a/markdown/extensions/def_list.py
+++ b/markdown/extensions/def_list.py
@@ -4,22 +4,20 @@
Adds parsing of Definition Lists to Python-Markdown.
-See <https://pythonhosted.org/Markdown/extensions/definition_lists.html>
+See <https://Python-Markdown.github.io/extensions/definition_lists>
for documentation.
Original code Copyright 2008 [Waylan Limberg](http://achinghead.com)
All changes Copyright 2008-2014 The Python Markdown Project
-License: [BSD](http://www.opensource.org/licenses/bsd-license.php)
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
"""
-from __future__ import absolute_import
-from __future__ import unicode_literals
from . import Extension
from ..blockprocessors import BlockProcessor, ListIndentProcessor
-from ..util import etree
+import xml.etree.ElementTree as etree
import re
@@ -36,8 +34,8 @@
raw_block = blocks.pop(0)
m = self.RE.search(raw_block)
- terms = [l.strip() for l in
- raw_block[:m.start()].split('\n') if l.strip()]
+ terms = [term.strip() for term in
+ raw_block[:m.start()].split('\n') if term.strip()]
block = raw_block[m.end():]
no_indent = self.NO_INDENT_RE.match(block)
if no_indent:
@@ -45,13 +43,13 @@
else:
d, theRest = self.detab(block)
if d:
- d = '%s\n%s' % (m.group(2), d)
+ d = '{}\n{}'.format(m.group(2), d)
else:
d = m.group(2)
sibling = self.lastChild(parent)
if not terms and sibling is None:
# This is not a definition item. Most likely a paragraph that
- # starts with a colon at the begining of a document or list.
+ # starts with a colon at the beginning of a document or list.
blocks.insert(0, raw_block)
return False
if not terms and sibling.tag == 'p':
@@ -59,7 +57,7 @@
state = 'looselist'
terms = sibling.text.split('\n')
parent.remove(sibling)
- # Aquire new sibling
+ # Acquire new sibling
sibling = self.lastChild(parent)
else:
state = 'list'
@@ -89,11 +87,13 @@
class DefListIndentProcessor(ListIndentProcessor):
""" Process indented children of definition list items. """
- ITEM_TYPES = ['dd']
- LIST_TYPES = ['dl']
+ # Definition lists need to be aware of all list types
+ ITEM_TYPES = ['dd', 'li']
+ LIST_TYPES = ['dl', 'ol', 'ul']
def create_item(self, parent, block):
- """ Create a new dd and parse the block with it as the parent. """
+ """ Create a new dd or li (depending on parent) and parse the block with it as the parent. """
+
dd = etree.SubElement(parent, 'dd')
self.parser.parseBlocks(dd, [block])
@@ -101,15 +101,11 @@
class DefListExtension(Extension):
""" Add definition lists to Markdown. """
- def extendMarkdown(self, md, md_globals):
+ def extendMarkdown(self, md):
""" Add an instance of DefListProcessor to BlockParser. """
- md.parser.blockprocessors.add('defindent',
- DefListIndentProcessor(md.parser),
- '>indent')
- md.parser.blockprocessors.add('deflist',
- DefListProcessor(md.parser),
- '>ulist')
+ md.parser.blockprocessors.register(DefListIndentProcessor(md.parser), 'defindent', 85)
+ md.parser.blockprocessors.register(DefListProcessor(md.parser), 'deflist', 25)
-def makeExtension(*args, **kwargs):
- return DefListExtension(*args, **kwargs)
+def makeExtension(**kwargs): # pragma: no cover
+ return DefListExtension(**kwargs)
diff --git a/markdown/extensions/extra.py b/markdown/extensions/extra.py
index de5db03..ebd168c 100644
--- a/markdown/extensions/extra.py
+++ b/markdown/extensions/extra.py
@@ -20,113 +20,39 @@
variable defined below, but be aware that such changes may be lost
when you upgrade to any future version of Python-Markdown.
-See <https://pythonhosted.org/Markdown/extensions/extra.html>
+See <https://Python-Markdown.github.io/extensions/extra>
for documentation.
Copyright The Python Markdown Project
-License: [BSD](http://www.opensource.org/licenses/bsd-license.php)
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
"""
-from __future__ import absolute_import
-from __future__ import unicode_literals
from . import Extension
-from ..blockprocessors import BlockProcessor
-from .. import util
-import re
extensions = [
- 'markdown.extensions.smart_strong',
- 'markdown.extensions.fenced_code',
- 'markdown.extensions.footnotes',
- 'markdown.extensions.attr_list',
- 'markdown.extensions.def_list',
- 'markdown.extensions.tables',
- 'markdown.extensions.abbr'
+ 'fenced_code',
+ 'footnotes',
+ 'attr_list',
+ 'def_list',
+ 'tables',
+ 'abbr',
+ 'md_in_html'
]
class ExtraExtension(Extension):
""" Add various extensions to Markdown class."""
- def __init__(self, *args, **kwargs):
+ def __init__(self, **kwargs):
""" config is a dumb holder which gets passed to actual ext later. """
- self.config = kwargs.pop('configs', {})
- self.config.update(kwargs)
+ self.config = kwargs
- def extendMarkdown(self, md, md_globals):
+ def extendMarkdown(self, md):
""" Register extension instances. """
md.registerExtensions(extensions, self.config)
- if not md.safeMode:
- # Turn on processing of markdown text within raw html
- md.preprocessors['html_block'].markdown_in_raw = True
- md.parser.blockprocessors.add('markdown_block',
- MarkdownInHtmlProcessor(md.parser),
- '_begin')
- md.parser.blockprocessors.tag_counter = -1
- md.parser.blockprocessors.contain_span_tags = re.compile(
- r'^(p|h[1-6]|li|dd|dt|td|th|legend|address)$', re.IGNORECASE)
-def makeExtension(*args, **kwargs):
- return ExtraExtension(*args, **kwargs)
-
-
-class MarkdownInHtmlProcessor(BlockProcessor):
- """Process Markdown Inside HTML Blocks."""
- def test(self, parent, block):
- return block == util.TAG_PLACEHOLDER % \
- str(self.parser.blockprocessors.tag_counter + 1)
-
- def _process_nests(self, element, block):
- """Process the element's child elements in self.run."""
- # Build list of indexes of each nest within the parent element.
- nest_index = [] # a list of tuples: (left index, right index)
- i = self.parser.blockprocessors.tag_counter + 1
- while len(self._tag_data) > i and self._tag_data[i]['left_index']:
- left_child_index = self._tag_data[i]['left_index']
- right_child_index = self._tag_data[i]['right_index']
- nest_index.append((left_child_index - 1, right_child_index))
- i += 1
-
- # Create each nest subelement.
- for i, (left_index, right_index) in enumerate(nest_index[:-1]):
- self.run(element, block[left_index:right_index],
- block[right_index:nest_index[i + 1][0]], True)
- self.run(element, block[nest_index[-1][0]:nest_index[-1][1]], # last
- block[nest_index[-1][1]:], True) # nest
-
- def run(self, parent, blocks, tail=None, nest=False):
- self._tag_data = self.parser.markdown.htmlStash.tag_data
-
- self.parser.blockprocessors.tag_counter += 1
- tag = self._tag_data[self.parser.blockprocessors.tag_counter]
-
- # Create Element
- markdown_value = tag['attrs'].pop('markdown')
- element = util.etree.SubElement(parent, tag['tag'], tag['attrs'])
-
- # Slice Off Block
- if nest:
- self.parser.parseBlocks(parent, tail) # Process Tail
- block = blocks[1:]
- else: # includes nests since a third level of nesting isn't supported
- block = blocks[tag['left_index'] + 1: tag['right_index']]
- del blocks[:tag['right_index']]
-
- # Process Text
- if (self.parser.blockprocessors.contain_span_tags.match( # Span Mode
- tag['tag']) and markdown_value != 'block') or \
- markdown_value == 'span':
- element.text = '\n'.join(block)
- else: # Block Mode
- i = self.parser.blockprocessors.tag_counter + 1
- if len(self._tag_data) > i and self._tag_data[i]['left_index']:
- first_subelement_index = self._tag_data[i]['left_index'] - 1
- self.parser.parseBlocks(
- element, block[:first_subelement_index])
- if not nest:
- block = self._process_nests(element, block)
- else:
- self.parser.parseBlocks(element, block)
+def makeExtension(**kwargs): # pragma: no cover
+ return ExtraExtension(**kwargs)
diff --git a/markdown/extensions/fenced_code.py b/markdown/extensions/fenced_code.py
index 4af8891..716b467 100644
--- a/markdown/extensions/fenced_code.py
+++ b/markdown/extensions/fenced_code.py
@@ -4,7 +4,7 @@
This extension adds Fenced Code Blocks to Python-Markdown.
-See <https://pythonhosted.org/Markdown/extensions/fenced_code_blocks.html>
+See <https://Python-Markdown.github.io/extensions/fenced_code_blocks>
for documentation.
Original code Copyright 2007-2008 [Waylan Limberg](http://achinghead.com/).
@@ -12,93 +12,160 @@
All changes Copyright 2008-2014 The Python Markdown Project
-License: [BSD](http://www.opensource.org/licenses/bsd-license.php)
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
"""
-from __future__ import absolute_import
-from __future__ import unicode_literals
+
+from textwrap import dedent
from . import Extension
from ..preprocessors import Preprocessor
from .codehilite import CodeHilite, CodeHiliteExtension, parse_hl_lines
+from .attr_list import get_attrs, AttrListExtension
+from ..util import parseBoolValue
import re
class FencedCodeExtension(Extension):
+ def __init__(self, **kwargs):
+ self.config = {
+ 'lang_prefix': ['language-', 'Prefix prepended to the language. Default: "language-"']
+ }
+ super().__init__(**kwargs)
- def extendMarkdown(self, md, md_globals):
+ def extendMarkdown(self, md):
""" Add FencedBlockPreprocessor to the Markdown instance. """
md.registerExtension(self)
- md.preprocessors.add('fenced_code_block',
- FencedBlockPreprocessor(md),
- ">normalize_whitespace")
+ md.preprocessors.register(FencedBlockPreprocessor(md, self.getConfigs()), 'fenced_code_block', 25)
class FencedBlockPreprocessor(Preprocessor):
- FENCED_BLOCK_RE = re.compile(r'''
-(?P<fence>^(?:~{3,}|`{3,}))[ ]* # Opening ``` or ~~~
-(\{?\.?(?P<lang>[a-zA-Z0-9_+-]*))?[ ]* # Optional {, and lang
-# Optional highlight lines, single- or double-quote-delimited
-(hl_lines=(?P<quot>"|')(?P<hl_lines>.*?)(?P=quot))?[ ]*
-}?[ ]*\n # Optional closing }
-(?P<code>.*?)(?<=\n)
-(?P=fence)[ ]*$''', re.MULTILINE | re.DOTALL | re.VERBOSE)
- CODE_WRAP = '<pre><code%s>%s</code></pre>'
- LANG_TAG = ' class="%s"'
+ FENCED_BLOCK_RE = re.compile(
+ dedent(r'''
+ (?P<fence>^(?:~{3,}|`{3,}))[ ]* # opening fence
+ ((\{(?P<attrs>[^\}\n]*)\})?| # (optional {attrs} or
+ (\.?(?P<lang>[\w#.+-]*))?[ ]* # optional (.)lang
+ (hl_lines=(?P<quot>"|')(?P<hl_lines>.*?)(?P=quot))?) # optional hl_lines)
+ [ ]*\n # newline (end of opening fence)
+ (?P<code>.*?)(?<=\n) # the code block
+ (?P=fence)[ ]*$ # closing fence
+ '''),
+ re.MULTILINE | re.DOTALL | re.VERBOSE
+ )
- def __init__(self, md):
- super(FencedBlockPreprocessor, self).__init__(md)
-
- self.checked_for_codehilite = False
+ def __init__(self, md, config):
+ super().__init__(md)
+ self.config = config
+ self.checked_for_deps = False
self.codehilite_conf = {}
+ self.use_attr_list = False
+ # List of options to convert to bool values
+ self.bool_options = [
+ 'linenums',
+ 'guess_lang',
+ 'noclasses',
+ 'use_pygments'
+ ]
def run(self, lines):
""" Match and store Fenced Code Blocks in the HtmlStash. """
- # Check for code hilite extension
- if not self.checked_for_codehilite:
- for ext in self.markdown.registeredExtensions:
+ # Check for dependent extensions
+ if not self.checked_for_deps:
+ for ext in self.md.registeredExtensions:
if isinstance(ext, CodeHiliteExtension):
- self.codehilite_conf = ext.config
- break
+ self.codehilite_conf = ext.getConfigs()
+ if isinstance(ext, AttrListExtension):
+ self.use_attr_list = True
- self.checked_for_codehilite = True
+ self.checked_for_deps = True
text = "\n".join(lines)
while 1:
m = self.FENCED_BLOCK_RE.search(text)
if m:
- lang = ''
- if m.group('lang'):
- lang = self.LANG_TAG % m.group('lang')
+ lang, id, classes, config = None, '', [], {}
+ if m.group('attrs'):
+ id, classes, config = self.handle_attrs(get_attrs(m.group('attrs')))
+ if len(classes):
+ lang = classes.pop(0)
+ else:
+ if m.group('lang'):
+ lang = m.group('lang')
+ if m.group('hl_lines'):
+ # Support hl_lines outside of attrs for backward-compatibility
+ config['hl_lines'] = parse_hl_lines(m.group('hl_lines'))
# If config is not empty, then the codehighlite extension
# is enabled, so we call it to highlight the code
- if self.codehilite_conf:
+ if self.codehilite_conf and self.codehilite_conf['use_pygments'] and config.get('use_pygments', True):
+ local_config = self.codehilite_conf.copy()
+ local_config.update(config)
+ # Combine classes with cssclass. Ensure cssclass is at end
+ # as pygments appends a suffix under certain circumstances.
+ # Ignore ID as Pygments does not offer an option to set it.
+ if classes:
+ local_config['css_class'] = '{} {}'.format(
+ ' '.join(classes),
+ local_config['css_class']
+ )
highliter = CodeHilite(
m.group('code'),
- linenums=self.codehilite_conf['linenums'][0],
- guess_lang=self.codehilite_conf['guess_lang'][0],
- css_class=self.codehilite_conf['css_class'][0],
- style=self.codehilite_conf['pygments_style'][0],
- lang=(m.group('lang') or None),
- noclasses=self.codehilite_conf['noclasses'][0],
- hl_lines=parse_hl_lines(m.group('hl_lines'))
+ lang=lang,
+ style=local_config.pop('pygments_style', 'default'),
+ **local_config
)
code = highliter.hilite()
else:
- code = self.CODE_WRAP % (lang,
- self._escape(m.group('code')))
+ id_attr = lang_attr = class_attr = kv_pairs = ''
+ if lang:
+ lang_attr = ' class="{}{}"'.format(self.config.get('lang_prefix', 'language-'), lang)
+ if classes:
+ class_attr = ' class="{}"'.format(' '.join(classes))
+ if id:
+ id_attr = ' id="{}"'.format(id)
+ if self.use_attr_list and config and not config.get('use_pygments', False):
+ # Only assign key/value pairs to code element if attr_list ext is enabled, key/value pairs
+ # were defined on the code block, and the `use_pygments` key was not set to True. The
+ # `use_pygments` key could be either set to False or not defined. It is omitted from output.
+ kv_pairs = ' ' + ' '.join(
+ '{k}="{v}"'.format(k=k, v=v) for k, v in config.items() if k != 'use_pygments'
+ )
+ code = '<pre{id}{cls}><code{lang}{kv}>{code}</code></pre>'.format(
+ id=id_attr,
+ cls=class_attr,
+ lang=lang_attr,
+ kv=kv_pairs,
+ code=self._escape(m.group('code'))
+ )
- placeholder = self.markdown.htmlStash.store(code, safe=True)
- text = '%s\n%s\n%s' % (text[:m.start()],
- placeholder,
- text[m.end():])
+ placeholder = self.md.htmlStash.store(code)
+ text = '{}\n{}\n{}'.format(text[:m.start()],
+ placeholder,
+ text[m.end():])
else:
break
return text.split("\n")
+ def handle_attrs(self, attrs):
+ """ Return tuple: (id, [list, of, classes], {configs}) """
+ id = ''
+ classes = []
+ configs = {}
+ for k, v in attrs:
+ if k == 'id':
+ id = v
+ elif k == '.':
+ classes.append(v)
+ elif k == 'hl_lines':
+ configs[k] = parse_hl_lines(v)
+ elif k in self.bool_options:
+ configs[k] = parseBoolValue(v, fail_on_errors=False, preserve_none=True)
+ else:
+ configs[k] = v
+ return id, classes, configs
+
def _escape(self, txt):
""" basic html escaping """
txt = txt.replace('&', '&')
@@ -108,5 +175,5 @@
return txt
-def makeExtension(*args, **kwargs):
- return FencedCodeExtension(*args, **kwargs)
+def makeExtension(**kwargs): # pragma: no cover
+ return FencedCodeExtension(**kwargs)
diff --git a/markdown/extensions/footnotes.py b/markdown/extensions/footnotes.py
index d8caae2..f6f4c85 100644
--- a/markdown/extensions/footnotes.py
+++ b/markdown/extensions/footnotes.py
@@ -4,36 +4,35 @@
Adds footnote handling to Python-Markdown.
-See <https://pythonhosted.org/Markdown/extensions/footnotes.html>
+See <https://Python-Markdown.github.io/extensions/footnotes>
for documentation.
Copyright The Python Markdown Project
-License: [BSD](http://www.opensource.org/licenses/bsd-license.php)
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
"""
-from __future__ import absolute_import
-from __future__ import unicode_literals
from . import Extension
-from ..preprocessors import Preprocessor
-from ..inlinepatterns import Pattern
+from ..blockprocessors import BlockProcessor
+from ..inlinepatterns import InlineProcessor
from ..treeprocessors import Treeprocessor
from ..postprocessors import Postprocessor
-from ..util import etree, text_type
-from ..odict import OrderedDict
+from .. import util
+from collections import OrderedDict
import re
+import copy
+import xml.etree.ElementTree as etree
-FN_BACKLINK_TEXT = "zz1337820767766393qq"
-NBSP_PLACEHOLDER = "qq3936677670287331zz"
-DEF_RE = re.compile(r'[ ]{0,3}\[\^([^\]]*)\]:\s*(.*)')
-TABBED_RE = re.compile(r'((\t)|( ))(.*)')
+FN_BACKLINK_TEXT = util.STX + "zz1337820767766393qq" + util.ETX
+NBSP_PLACEHOLDER = util.STX + "qq3936677670287331zz" + util.ETX
+RE_REF_ID = re.compile(r'(fnref)(\d+)')
class FootnoteExtension(Extension):
""" Footnote Extension. """
- def __init__(self, *args, **kwargs):
+ def __init__(self, **kwargs):
""" Setup configs. """
self.config = {
@@ -47,44 +46,77 @@
"BACKLINK_TEXT":
["↩",
"The text string that links from the footnote "
- "to the reader's place."]
+ "to the reader's place."],
+ "BACKLINK_TITLE":
+ ["Jump back to footnote %d in the text",
+ "The text string used for the title HTML attribute "
+ "of the backlink. %d will be replaced by the "
+ "footnote number."],
+ "SEPARATOR":
+ [":",
+ "Footnote separator."]
}
- super(FootnoteExtension, self).__init__(*args, **kwargs)
+ super().__init__(**kwargs)
# In multiple invocations, emit links that don't get tangled.
self.unique_prefix = 0
+ self.found_refs = {}
+ self.used_refs = set()
self.reset()
- def extendMarkdown(self, md, md_globals):
+ def extendMarkdown(self, md):
""" Add pieces to Markdown. """
md.registerExtension(self)
self.parser = md.parser
self.md = md
- # Insert a preprocessor before ReferencePreprocessor
- md.preprocessors.add(
- "footnote", FootnotePreprocessor(self), "<reference"
- )
+ # Insert a blockprocessor before ReferencePreprocessor
+ md.parser.blockprocessors.register(FootnoteBlockProcessor(self), 'footnote', 17)
+
# Insert an inline pattern before ImageReferencePattern
FOOTNOTE_RE = r'\[\^([^\]]*)\]' # blah blah [^1] blah
- md.inlinePatterns.add(
- "footnote", FootnotePattern(FOOTNOTE_RE, self), "<reference"
- )
+ md.inlinePatterns.register(FootnoteInlineProcessor(FOOTNOTE_RE, self), 'footnote', 175)
# Insert a tree-processor that would actually add the footnote div
# This must be before all other treeprocessors (i.e., inline and
# codehilite) so they can run on the the contents of the div.
- md.treeprocessors.add(
- "footnote", FootnoteTreeprocessor(self), "_begin"
- )
- # Insert a postprocessor after amp_substitute oricessor
- md.postprocessors.add(
- "footnote", FootnotePostprocessor(self), ">amp_substitute"
- )
+ md.treeprocessors.register(FootnoteTreeprocessor(self), 'footnote', 50)
+
+ # Insert a tree-processor that will run after inline is done.
+ # In this tree-processor we want to check our duplicate footnote tracker
+ # And add additional backrefs to the footnote pointing back to the
+ # duplicated references.
+ md.treeprocessors.register(FootnotePostTreeprocessor(self), 'footnote-duplicate', 15)
+
+ # Insert a postprocessor after amp_substitute processor
+ md.postprocessors.register(FootnotePostprocessor(self), 'footnote', 25)
def reset(self):
""" Clear footnotes on reset, and prepare for distinct document. """
self.footnotes = OrderedDict()
self.unique_prefix += 1
+ self.found_refs = {}
+ self.used_refs = set()
+
+ def unique_ref(self, reference, found=False):
+ """ Get a unique reference if there are duplicates. """
+ if not found:
+ return reference
+
+ original_ref = reference
+ while reference in self.used_refs:
+ ref, rest = reference.split(self.get_separator(), 1)
+ m = RE_REF_ID.match(ref)
+ if m:
+ reference = '%s%d%s%s' % (m.group(1), int(m.group(2))+1, self.get_separator(), rest)
+ else:
+ reference = '%s%d%s%s' % (ref, 2, self.get_separator(), rest)
+
+ self.used_refs.add(reference)
+ if original_ref in self.found_refs:
+ self.found_refs[original_ref] += 1
+ else:
+ self.found_refs[original_ref] = 1
+ return reference
def findFootnotesPlaceholder(self, root):
""" Return ElementTree Element that contains Footnote placeholder. """
@@ -96,7 +128,9 @@
if child.tail:
if child.tail.find(self.getConfig("PLACE_MARKER")) > -1:
return child, element, False
- finder(child)
+ child_res = finder(child)
+ if child_res is not None:
+ return child_res
return None
res = finder(root)
@@ -107,24 +141,22 @@
self.footnotes[id] = text
def get_separator(self):
- if self.md.output_format in ['html5', 'xhtml5']:
- return '-'
- return ':'
+ """ Get the footnote separator. """
+ return self.getConfig("SEPARATOR")
def makeFootnoteId(self, id):
""" Return footnote link id. """
if self.getConfig("UNIQUE_IDS"):
return 'fn%s%d-%s' % (self.get_separator(), self.unique_prefix, id)
else:
- return 'fn%s%s' % (self.get_separator(), id)
+ return 'fn{}{}'.format(self.get_separator(), id)
- def makeFootnoteRefId(self, id):
+ def makeFootnoteRefId(self, id, found=False):
""" Return footnote back-link id. """
if self.getConfig("UNIQUE_IDS"):
- return 'fnref%s%d-%s' % (self.get_separator(),
- self.unique_prefix, id)
+ return self.unique_ref('fnref%s%d-%s' % (self.get_separator(), self.unique_prefix, id), found)
else:
- return 'fnref%s%s' % (self.get_separator(), id)
+ return self.unique_ref('fnref{}{}'.format(self.get_separator(), id), found)
def makeFootnotesDiv(self, root):
""" Return div of footnotes as et Element. """
@@ -136,24 +168,28 @@
div.set('class', 'footnote')
etree.SubElement(div, "hr")
ol = etree.SubElement(div, "ol")
+ surrogate_parent = etree.Element("div")
- for id in self.footnotes.keys():
+ for index, id in enumerate(self.footnotes.keys(), start=1):
li = etree.SubElement(ol, "li")
li.set("id", self.makeFootnoteId(id))
- self.parser.parseChunk(li, self.footnotes[id])
+ # Parse footnote with surrogate parent as li cannot be used.
+ # List block handlers have special logic to deal with li.
+ # When we are done parsing, we will copy everything over to li.
+ self.parser.parseChunk(surrogate_parent, self.footnotes[id])
+ for el in list(surrogate_parent):
+ li.append(el)
+ surrogate_parent.remove(el)
backlink = etree.Element("a")
backlink.set("href", "#" + self.makeFootnoteRefId(id))
- if self.md.output_format not in ['html5', 'xhtml5']:
- backlink.set("rev", "footnote") # Invalid in HTML5
backlink.set("class", "footnote-backref")
backlink.set(
"title",
- "Jump back to footnote %d in the text" %
- (self.footnotes.index(id)+1)
+ self.getConfig("BACKLINK_TITLE") % (index)
)
backlink.text = FN_BACKLINK_TEXT
- if li.getchildren():
+ if len(li):
node = li[-1]
if node.tag == "p":
node.text = node.text + NBSP_PLACEHOLDER
@@ -164,119 +200,166 @@
return div
-class FootnotePreprocessor(Preprocessor):
+class FootnoteBlockProcessor(BlockProcessor):
""" Find all footnote references and store for later use. """
+ RE = re.compile(r'^[ ]{0,3}\[\^([^\]]*)\]:[ ]*(.*)$', re.MULTILINE)
+
+ def __init__(self, footnotes):
+ super().__init__(footnotes.parser)
+ self.footnotes = footnotes
+
+ def test(self, parent, block):
+ return True
+
+ def run(self, parent, blocks):
+ """ Find, set, and remove footnote definitions. """
+ block = blocks.pop(0)
+ m = self.RE.search(block)
+ if m:
+ id = m.group(1)
+ fn_blocks = [m.group(2)]
+
+ # Handle rest of block
+ therest = block[m.end():].lstrip('\n')
+ m2 = self.RE.search(therest)
+ if m2:
+ # Another footnote exists in the rest of this block.
+ # Any content before match is continuation of this footnote, which may be lazily indented.
+ before = therest[:m2.start()].rstrip('\n')
+ fn_blocks[0] = '\n'.join([fn_blocks[0], self.detab(before)]).lstrip('\n')
+ # Add back to blocks everything from beginning of match forward for next iteration.
+ blocks.insert(0, therest[m2.start():])
+ else:
+ # All remaining lines of block are continuation of this footnote, which may be lazily indented.
+ fn_blocks[0] = '\n'.join([fn_blocks[0], self.detab(therest)]).strip('\n')
+
+ # Check for child elements in remaining blocks.
+ fn_blocks.extend(self.detectTabbed(blocks))
+
+ footnote = "\n\n".join(fn_blocks)
+ self.footnotes.setFootnote(id, footnote.rstrip())
+
+ if block[:m.start()].strip():
+ # Add any content before match back to blocks as separate block
+ blocks.insert(0, block[:m.start()].rstrip('\n'))
+ return True
+ # No match. Restore block.
+ blocks.insert(0, block)
+ return False
+
+ def detectTabbed(self, blocks):
+ """ Find indented text and remove indent before further processing.
+
+ Returns: a list of blocks with indentation removed.
+ """
+ fn_blocks = []
+ while blocks:
+ if blocks[0].startswith(' '*4):
+ block = blocks.pop(0)
+ # Check for new footnotes within this block and split at new footnote.
+ m = self.RE.search(block)
+ if m:
+ # Another footnote exists in this block.
+ # Any content before match is continuation of this footnote, which may be lazily indented.
+ before = block[:m.start()].rstrip('\n')
+ fn_blocks.append(self.detab(before))
+ # Add back to blocks everything from beginning of match forward for next iteration.
+ blocks.insert(0, block[m.start():])
+ # End of this footnote.
+ break
+ else:
+ # Entire block is part of this footnote.
+ fn_blocks.append(self.detab(block))
+ else:
+ # End of this footnote.
+ break
+ return fn_blocks
+
+ def detab(self, block):
+ """ Remove one level of indent from a block.
+
+ Preserve lazily indented blocks by only removing indent from indented lines.
+ """
+ lines = block.split('\n')
+ for i, line in enumerate(lines):
+ if line.startswith(' '*4):
+ lines[i] = line[4:]
+ return '\n'.join(lines)
+
+
+class FootnoteInlineProcessor(InlineProcessor):
+ """ InlinePattern for footnote markers in a document's body text. """
+
+ def __init__(self, pattern, footnotes):
+ super().__init__(pattern)
+ self.footnotes = footnotes
+
+ def handleMatch(self, m, data):
+ id = m.group(1)
+ if id in self.footnotes.footnotes.keys():
+ sup = etree.Element("sup")
+ a = etree.SubElement(sup, "a")
+ sup.set('id', self.footnotes.makeFootnoteRefId(id, found=True))
+ a.set('href', '#' + self.footnotes.makeFootnoteId(id))
+ a.set('class', 'footnote-ref')
+ a.text = str(list(self.footnotes.footnotes.keys()).index(id) + 1)
+ return sup, m.start(0), m.end(0)
+ else:
+ return None, None, None
+
+
+class FootnotePostTreeprocessor(Treeprocessor):
+ """ Amend footnote div with duplicates. """
+
def __init__(self, footnotes):
self.footnotes = footnotes
- def run(self, lines):
- """
- Loop through lines and find, set, and remove footnote definitions.
-
- Keywords:
-
- * lines: A list of lines of text
-
- Return: A list of lines of text with footnote definitions removed.
-
- """
- newlines = []
- i = 0
- while True:
- m = DEF_RE.match(lines[i])
- if m:
- fn, _i = self.detectTabbed(lines[i+1:])
- fn.insert(0, m.group(2))
- i += _i-1 # skip past footnote
- self.footnotes.setFootnote(m.group(1), "\n".join(fn))
- else:
- newlines.append(lines[i])
- if len(lines) > i+1:
- i += 1
- else:
+ def add_duplicates(self, li, duplicates):
+ """ Adjust current li and add the duplicates: fnref2, fnref3, etc. """
+ for link in li.iter('a'):
+ # Find the link that needs to be duplicated.
+ if link.attrib.get('class', '') == 'footnote-backref':
+ ref, rest = link.attrib['href'].split(self.footnotes.get_separator(), 1)
+ # Duplicate link the number of times we need to
+ # and point them to the appropriate references.
+ links = []
+ for index in range(2, duplicates + 1):
+ sib_link = copy.deepcopy(link)
+ sib_link.attrib['href'] = '%s%d%s%s' % (ref, index, self.footnotes.get_separator(), rest)
+ links.append(sib_link)
+ self.offset += 1
+ # Add all the new duplicate links.
+ el = list(li)[-1]
+ for link in links:
+ el.append(link)
break
- return newlines
- def detectTabbed(self, lines):
- """ Find indented text and remove indent before further proccesing.
+ def get_num_duplicates(self, li):
+ """ Get the number of duplicate refs of the footnote. """
+ fn, rest = li.attrib.get('id', '').split(self.footnotes.get_separator(), 1)
+ link_id = '{}ref{}{}'.format(fn, self.footnotes.get_separator(), rest)
+ return self.footnotes.found_refs.get(link_id, 0)
- Keyword arguments:
+ def handle_duplicates(self, parent):
+ """ Find duplicate footnotes and format and add the duplicates. """
+ for li in list(parent):
+ # Check number of duplicate footnotes and insert
+ # additional links if needed.
+ count = self.get_num_duplicates(li)
+ if count > 1:
+ self.add_duplicates(li, count)
- * lines: an array of strings
-
- Returns: a list of post processed items and the index of last line.
-
- """
- items = []
- blank_line = False # have we encountered a blank line yet?
- i = 0 # to keep track of where we are
-
- def detab(line):
- match = TABBED_RE.match(line)
- if match:
- return match.group(4)
-
- for line in lines:
- if line.strip(): # Non-blank line
- detabbed_line = detab(line)
- if detabbed_line:
- items.append(detabbed_line)
- i += 1
- continue
- elif not blank_line and not DEF_RE.match(line):
- # not tabbed but still part of first par.
- items.append(line)
- i += 1
- continue
- else:
- return items, i+1
-
- else: # Blank line: _maybe_ we are done.
- blank_line = True
- i += 1 # advance
-
- # Find the next non-blank line
- for j in range(i, len(lines)):
- if lines[j].strip():
- next_line = lines[j]
- break
- else:
- break # There is no more text; we are done.
-
- # Check if the next non-blank line is tabbed
- if detab(next_line): # Yes, more work to do.
- items.append("")
- continue
- else:
- break # No, we are done.
- else:
- i += 1
-
- return items, i
-
-
-class FootnotePattern(Pattern):
- """ InlinePattern for footnote markers in a document's body text. """
-
- def __init__(self, pattern, footnotes):
- super(FootnotePattern, self).__init__(pattern)
- self.footnotes = footnotes
-
- def handleMatch(self, m):
- id = m.group(2)
- if id in self.footnotes.footnotes.keys():
- sup = etree.Element("sup")
- a = etree.SubElement(sup, "a")
- sup.set('id', self.footnotes.makeFootnoteRefId(id))
- a.set('href', '#' + self.footnotes.makeFootnoteId(id))
- if self.footnotes.md.output_format not in ['html5', 'xhtml5']:
- a.set('rel', 'footnote') # invalid in HTML5
- a.set('class', 'footnote-ref')
- a.text = text_type(self.footnotes.footnotes.index(id) + 1)
- return sup
- else:
- return None
+ def run(self, root):
+ """ Crawl the footnote div and add missing duplicate footnotes. """
+ self.offset = 0
+ for div in root.iter('div'):
+ if div.attrib.get('class', '') == 'footnote':
+ # Footnotes should be under the first ordered list under
+ # the footnote div. So once we find it, quit.
+ for ol in div.iter('ol'):
+ self.handle_duplicates(ol)
+ break
class FootnoteTreeprocessor(Treeprocessor):
@@ -291,7 +374,7 @@
result = self.footnotes.findFootnotesPlaceholder(root)
if result:
child, parent, isText = result
- ind = parent.getchildren().index(child)
+ ind = list(parent).index(child)
if isText:
parent.remove(child)
parent.insert(ind, footnotesDiv)
@@ -314,6 +397,6 @@
return text.replace(NBSP_PLACEHOLDER, " ")
-def makeExtension(*args, **kwargs):
+def makeExtension(**kwargs): # pragma: no cover
""" Return an instance of the FootnoteExtension """
- return FootnoteExtension(*args, **kwargs)
+ return FootnoteExtension(**kwargs)
diff --git a/markdown/extensions/headerid.py b/markdown/extensions/headerid.py
deleted file mode 100644
index 2cb20b9..0000000
--- a/markdown/extensions/headerid.py
+++ /dev/null
@@ -1,97 +0,0 @@
-"""
-HeaderID Extension for Python-Markdown
-======================================
-
-Auto-generate id attributes for HTML headers.
-
-See <https://pythonhosted.org/Markdown/extensions/header_id.html>
-for documentation.
-
-Original code Copyright 2007-2011 [Waylan Limberg](http://achinghead.com/).
-
-All changes Copyright 2011-2014 The Python Markdown Project
-
-License: [BSD](http://www.opensource.org/licenses/bsd-license.php)
-
-"""
-
-from __future__ import absolute_import
-from __future__ import unicode_literals
-from . import Extension
-from ..treeprocessors import Treeprocessor
-from ..util import parseBoolValue
-from .toc import slugify, unique, stashedHTML2text
-import warnings
-
-
-class HeaderIdTreeprocessor(Treeprocessor):
- """ Assign IDs to headers. """
-
- IDs = set()
-
- def run(self, doc):
- start_level, force_id = self._get_meta()
- slugify = self.config['slugify']
- sep = self.config['separator']
- for elem in doc:
- if elem.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
- if force_id:
- if "id" in elem.attrib:
- id = elem.get('id')
- else:
- id = stashedHTML2text(''.join(elem.itertext()), self.md)
- id = slugify(id, sep)
- elem.set('id', unique(id, self.IDs))
- if start_level:
- level = int(elem.tag[-1]) + start_level
- if level > 6:
- level = 6
- elem.tag = 'h%d' % level
-
- def _get_meta(self):
- """ Return meta data suported by this ext as a tuple """
- level = int(self.config['level']) - 1
- force = parseBoolValue(self.config['forceid'])
- if hasattr(self.md, 'Meta'):
- if 'header_level' in self.md.Meta:
- level = int(self.md.Meta['header_level'][0]) - 1
- if 'header_forceid' in self.md.Meta:
- force = parseBoolValue(self.md.Meta['header_forceid'][0])
- return level, force
-
-
-class HeaderIdExtension(Extension):
- def __init__(self, *args, **kwargs):
- # set defaults
- self.config = {
- 'level': ['1', 'Base level for headers.'],
- 'forceid': ['True', 'Force all headers to have an id.'],
- 'separator': ['-', 'Word separator.'],
- 'slugify': [slugify, 'Callable to generate anchors']
- }
-
- super(HeaderIdExtension, self).__init__(*args, **kwargs)
-
- warnings.warn(
- 'The HeaderId Extension is pending deprecation. Use the TOC Extension instead.',
- PendingDeprecationWarning
- )
-
- def extendMarkdown(self, md, md_globals):
- md.registerExtension(self)
- self.processor = HeaderIdTreeprocessor()
- self.processor.md = md
- self.processor.config = self.getConfigs()
- if 'attr_list' in md.treeprocessors.keys():
- # insert after attr_list treeprocessor
- md.treeprocessors.add('headerid', self.processor, '>attr_list')
- else:
- # insert after 'prettify' treeprocessor.
- md.treeprocessors.add('headerid', self.processor, '>prettify')
-
- def reset(self):
- self.processor.IDs = set()
-
-
-def makeExtension(*args, **kwargs):
- return HeaderIdExtension(*args, **kwargs)
diff --git a/markdown/extensions/legacy_attrs.py b/markdown/extensions/legacy_attrs.py
new file mode 100644
index 0000000..b51d778
--- /dev/null
+++ b/markdown/extensions/legacy_attrs.py
@@ -0,0 +1,67 @@
+"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+
+Legacy Attributes Extension
+===========================
+
+An extension to Python Markdown which implements legacy attributes.
+
+Prior to Python-Markdown version 3.0, the Markdown class had an `enable_attributes`
+keyword which was on by default and provided for attributes to be defined for elements
+using the format `{@key=value}`. This extension is provided as a replacement for
+backward compatibility. New documents should be authored using attr_lists. However,
+numerous documents exist which have been using the old attribute format for many
+years. This extension can be used to continue to render those documents correctly.
+"""
+
+import re
+from markdown.treeprocessors import Treeprocessor, isString
+from markdown.extensions import Extension
+
+
+ATTR_RE = re.compile(r'\{@([^\}]*)=([^\}]*)}') # {@id=123}
+
+
+class LegacyAttrs(Treeprocessor):
+ def run(self, doc):
+ """Find and set values of attributes ({@key=value}). """
+ for el in doc.iter():
+ alt = el.get('alt', None)
+ if alt is not None:
+ el.set('alt', self.handleAttributes(el, alt))
+ if el.text and isString(el.text):
+ el.text = self.handleAttributes(el, el.text)
+ if el.tail and isString(el.tail):
+ el.tail = self.handleAttributes(el, el.tail)
+
+ def handleAttributes(self, el, txt):
+ """ Set attributes and return text without definitions. """
+ def attributeCallback(match):
+ el.set(match.group(1), match.group(2).replace('\n', ' '))
+ return ATTR_RE.sub(attributeCallback, txt)
+
+
+class LegacyAttrExtension(Extension):
+ def extendMarkdown(self, md):
+ md.treeprocessors.register(LegacyAttrs(md), 'legacyattrs', 15)
+
+
+def makeExtension(**kwargs): # pragma: no cover
+ return LegacyAttrExtension(**kwargs)
diff --git a/markdown/extensions/legacy_em.py b/markdown/extensions/legacy_em.py
new file mode 100644
index 0000000..7fddb77
--- /dev/null
+++ b/markdown/extensions/legacy_em.py
@@ -0,0 +1,49 @@
+'''
+Legacy Em Extension for Python-Markdown
+=======================================
+
+This extension provides legacy behavior for _connected_words_.
+
+Copyright 2015-2018 The Python Markdown Project
+
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
+
+'''
+
+from . import Extension
+from ..inlinepatterns import UnderscoreProcessor, EmStrongItem, EM_STRONG2_RE, STRONG_EM2_RE
+import re
+
+# _emphasis_
+EMPHASIS_RE = r'(_)([^_]+)\1'
+
+# __strong__
+STRONG_RE = r'(_{2})(.+?)\1'
+
+# __strong_em___
+STRONG_EM_RE = r'(_)\1(?!\1)([^_]+?)\1(?!\1)(.+?)\1{3}'
+
+
+class LegacyUnderscoreProcessor(UnderscoreProcessor):
+ """Emphasis processor for handling strong and em matches inside underscores."""
+
+ PATTERNS = [
+ EmStrongItem(re.compile(EM_STRONG2_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'),
+ EmStrongItem(re.compile(STRONG_EM2_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'),
+ EmStrongItem(re.compile(STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'),
+ EmStrongItem(re.compile(STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'),
+ EmStrongItem(re.compile(EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em')
+ ]
+
+
+class LegacyEmExtension(Extension):
+ """ Add legacy_em extension to Markdown class."""
+
+ def extendMarkdown(self, md):
+ """ Modify inline patterns. """
+ md.inlinePatterns.register(LegacyUnderscoreProcessor(r'_'), 'em_strong2', 50)
+
+
+def makeExtension(**kwargs): # pragma: no cover
+ """ Return an instance of the LegacyEmExtension """
+ return LegacyEmExtension(**kwargs)
diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py
new file mode 100644
index 0000000..86cf00d
--- /dev/null
+++ b/markdown/extensions/md_in_html.py
@@ -0,0 +1,363 @@
+"""
+Python-Markdown Markdown in HTML Extension
+===============================
+
+An implementation of [PHP Markdown Extra](http://michelf.com/projects/php-markdown/extra/)'s
+parsing of Markdown syntax in raw HTML.
+
+See <https://Python-Markdown.github.io/extensions/md_in_html>
+for documentation.
+
+Copyright The Python Markdown Project
+
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
+
+"""
+
+from . import Extension
+from ..blockprocessors import BlockProcessor
+from ..preprocessors import Preprocessor
+from ..postprocessors import RawHtmlPostprocessor
+from .. import util
+from ..htmlparser import HTMLExtractor, blank_line_re
+import xml.etree.ElementTree as etree
+
+
+class HTMLExtractorExtra(HTMLExtractor):
+ """
+ Override HTMLExtractor and create etree Elements for any elements which should have content parsed as Markdown.
+ """
+
+ def __init__(self, md, *args, **kwargs):
+ # All block-level tags.
+ self.block_level_tags = set(md.block_level_elements.copy())
+ # Block-level tags in which the content only gets span level parsing
+ self.span_tags = set(
+ ['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'td', 'th']
+ )
+ # Block-level tags which never get their content parsed.
+ self.raw_tags = set(['canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea'])
+ # Block-level tags in which the content gets parsed as blocks
+ super().__init__(md, *args, **kwargs)
+
+ self.block_tags = set(self.block_level_tags) - (self.span_tags | self.raw_tags | self.empty_tags)
+ self.span_and_blocks_tags = self.block_tags | self.span_tags
+
+ def reset(self):
+ """Reset this instance. Loses all unprocessed data."""
+ self.mdstack = [] # When markdown=1, stack contains a list of tags
+ self.treebuilder = etree.TreeBuilder()
+ self.mdstate = [] # one of 'block', 'span', 'off', or None
+ super().reset()
+
+ def close(self):
+ """Handle any buffered data."""
+ super().close()
+ # Handle any unclosed tags.
+ if self.mdstack:
+ # Close the outermost parent. handle_endtag will close all unclosed children.
+ self.handle_endtag(self.mdstack[0])
+
+ def get_element(self):
+ """ Return element from treebuilder and reset treebuilder for later use. """
+ element = self.treebuilder.close()
+ self.treebuilder = etree.TreeBuilder()
+ return element
+
+ def get_state(self, tag, attrs):
+ """ Return state from tag and `markdown` attr. One of 'block', 'span', or 'off'. """
+ md_attr = attrs.get('markdown', '0')
+ if md_attr == 'markdown':
+ # `<tag markdown>` is the same as `<tag markdown='1'>`.
+ md_attr = '1'
+ parent_state = self.mdstate[-1] if self.mdstate else None
+ if parent_state == 'off' or (parent_state == 'span' and md_attr != '0'):
+ # Only use the parent state if it is more restrictive than the markdown attribute.
+ md_attr = parent_state
+ if ((md_attr == '1' and tag in self.block_tags) or
+ (md_attr == 'block' and tag in self.span_and_blocks_tags)):
+ return 'block'
+ elif ((md_attr == '1' and tag in self.span_tags) or
+ (md_attr == 'span' and tag in self.span_and_blocks_tags)):
+ return 'span'
+ elif tag in self.block_level_tags:
+ return 'off'
+ else: # pragma: no cover
+ return None
+
+ def handle_starttag(self, tag, attrs):
+ # Handle tags that should always be empty and do not specify a closing tag
+ if tag in self.empty_tags and (self.at_line_start() or self.intail):
+ attrs = {key: value if value is not None else key for key, value in attrs}
+ if "markdown" in attrs:
+ attrs.pop('markdown')
+ element = etree.Element(tag, attrs)
+ data = etree.tostring(element, encoding='unicode', method='html')
+ else:
+ data = self.get_starttag_text()
+ self.handle_empty_tag(data, True)
+ return
+
+ if tag in self.block_level_tags and (self.at_line_start() or self.intail):
+ # Valueless attr (ex: `<tag checked>`) results in `[('checked', None)]`.
+ # Convert to `{'checked': 'checked'}`.
+ attrs = {key: value if value is not None else key for key, value in attrs}
+ state = self.get_state(tag, attrs)
+ if self.inraw or (state in [None, 'off'] and not self.mdstack):
+ # fall back to default behavior
+ attrs.pop('markdown', None)
+ super().handle_starttag(tag, attrs)
+ else:
+ if 'p' in self.mdstack and tag in self.block_level_tags:
+ # Close unclosed 'p' tag
+ self.handle_endtag('p')
+ self.mdstate.append(state)
+ self.mdstack.append(tag)
+ attrs['markdown'] = state
+ self.treebuilder.start(tag, attrs)
+ else:
+ # Span level tag
+ if self.inraw:
+ super().handle_starttag(tag, attrs)
+ else:
+ text = self.get_starttag_text()
+ if self.mdstate and self.mdstate[-1] == "off":
+ self.handle_data(self.md.htmlStash.store(text))
+ else:
+ self.handle_data(text)
+ if tag in self.CDATA_CONTENT_ELEMENTS:
+ # This is presumably a standalone tag in a code span (see #1036).
+ self.clear_cdata_mode()
+
+ def handle_endtag(self, tag):
+ if tag in self.block_level_tags:
+ if self.inraw:
+ super().handle_endtag(tag)
+ elif tag in self.mdstack:
+ # Close element and any unclosed children
+ while self.mdstack:
+ item = self.mdstack.pop()
+ self.mdstate.pop()
+ self.treebuilder.end(item)
+ if item == tag:
+ break
+ if not self.mdstack:
+ # Last item in stack is closed. Stash it
+ element = self.get_element()
+ # Get last entry to see if it ends in newlines
+ # If it is an element, assume there is no newlines
+ item = self.cleandoc[-1] if self.cleandoc else ''
+ # If we only have one newline before block element, add another
+ if not item.endswith('\n\n') and item.endswith('\n'):
+ self.cleandoc.append('\n')
+ self.cleandoc.append(self.md.htmlStash.store(element))
+ self.cleandoc.append('\n\n')
+ self.state = []
+ # Check if element has a tail
+ if not blank_line_re.match(
+ self.rawdata[self.line_offset + self.offset + len(self.get_endtag_text(tag)):]):
+ # More content exists after endtag.
+ self.intail = True
+ else:
+ # Treat orphan closing tag as a span level tag.
+ text = self.get_endtag_text(tag)
+ if self.mdstate and self.mdstate[-1] == "off":
+ self.handle_data(self.md.htmlStash.store(text))
+ else:
+ self.handle_data(text)
+ else:
+ # Span level tag
+ if self.inraw:
+ super().handle_endtag(tag)
+ else:
+ text = self.get_endtag_text(tag)
+ if self.mdstate and self.mdstate[-1] == "off":
+ self.handle_data(self.md.htmlStash.store(text))
+ else:
+ self.handle_data(text)
+
+ def handle_startendtag(self, tag, attrs):
+ if tag in self.empty_tags:
+ attrs = {key: value if value is not None else key for key, value in attrs}
+ if "markdown" in attrs:
+ attrs.pop('markdown')
+ element = etree.Element(tag, attrs)
+ data = etree.tostring(element, encoding='unicode', method='html')
+ else:
+ data = self.get_starttag_text()
+ else:
+ data = self.get_starttag_text()
+ self.handle_empty_tag(data, is_block=self.md.is_block_level(tag))
+
+ def handle_data(self, data):
+ if self.intail and '\n' in data:
+ self.intail = False
+ if self.inraw or not self.mdstack:
+ super().handle_data(data)
+ else:
+ self.treebuilder.data(data)
+
+ def handle_empty_tag(self, data, is_block):
+ if self.inraw or not self.mdstack:
+ super().handle_empty_tag(data, is_block)
+ else:
+ if self.at_line_start() and is_block:
+ self.handle_data('\n' + self.md.htmlStash.store(data) + '\n\n')
+ else:
+ self.handle_data(self.md.htmlStash.store(data))
+
+ def parse_pi(self, i):
+ if self.at_line_start() or self.intail or self.mdstack:
+ # The same override exists in HTMLExtractor without the check
+ # for mdstack. Therefore, use HTMLExtractor's parent instead.
+ return super(HTMLExtractor, self).parse_pi(i)
+ # This is not the beginning of a raw block so treat as plain data
+ # and avoid consuming any tags which may follow (see #1066).
+ self.handle_data('<?')
+ return i + 2
+
+ def parse_html_declaration(self, i):
+ if self.at_line_start() or self.intail or self.mdstack:
+ # The same override exists in HTMLExtractor without the check
+ # for mdstack. Therefore, use HTMLExtractor's parent instead.
+ return super(HTMLExtractor, self).parse_html_declaration(i)
+ # This is not the beginning of a raw block so treat as plain data
+ # and avoid consuming any tags which may follow (see #1066).
+ self.handle_data('<!')
+ return i + 2
+
+
+class HtmlBlockPreprocessor(Preprocessor):
+ """Remove html blocks from the text and store them for later retrieval."""
+
+ def run(self, lines):
+ source = '\n'.join(lines)
+ parser = HTMLExtractorExtra(self.md)
+ parser.feed(source)
+ parser.close()
+ return ''.join(parser.cleandoc).split('\n')
+
+
+class MarkdownInHtmlProcessor(BlockProcessor):
+ """Process Markdown Inside HTML Blocks which have been stored in the HtmlStash."""
+
+ def test(self, parent, block):
+ # Always return True. `run` will return `False` if not a valid match.
+ return True
+
+ def parse_element_content(self, element):
+ """
+ Recursively parse the text content of an etree Element as Markdown.
+
+ Any block level elements generated from the Markdown will be inserted as children of the element in place
+ of the text content. All `markdown` attributes are removed. For any elements in which Markdown parsing has
+ been disabled, the text content of it and its children are wrapped in an `AtomicString`.
+ """
+
+ md_attr = element.attrib.pop('markdown', 'off')
+
+ if md_attr == 'block':
+ # Parse content as block level
+ # The order in which the different parts are parsed (text, children, tails) is important here as the
+ # order of elements needs to be preserved. We can't be inserting items at a later point in the current
+ # iteration as we don't want to do raw processing on elements created from parsing Markdown text (for
+ # example). Therefore, the order of operations is children, tails, text.
+
+ # Recursively parse existing children from raw HTML
+ for child in list(element):
+ self.parse_element_content(child)
+
+ # Parse Markdown text in tail of children. Do this separately to avoid raw HTML parsing.
+ # Save the position of each item to be inserted later in reverse.
+ tails = []
+ for pos, child in enumerate(element):
+ if child.tail:
+ block = child.tail.rstrip('\n')
+ child.tail = ''
+ # Use a dummy placeholder element.
+ dummy = etree.Element('div')
+ self.parser.parseBlocks(dummy, block.split('\n\n'))
+ children = list(dummy)
+ children.reverse()
+ tails.append((pos + 1, children))
+
+ # Insert the elements created from the tails in reverse.
+ tails.reverse()
+ for pos, tail in tails:
+ for item in tail:
+ element.insert(pos, item)
+
+ # Parse Markdown text content. Do this last to avoid raw HTML parsing.
+ if element.text:
+ block = element.text.rstrip('\n')
+ element.text = ''
+ # Use a dummy placeholder element as the content needs to get inserted before existing children.
+ dummy = etree.Element('div')
+ self.parser.parseBlocks(dummy, block.split('\n\n'))
+ children = list(dummy)
+ children.reverse()
+ for child in children:
+ element.insert(0, child)
+
+ elif md_attr == 'span':
+ # Span level parsing will be handled by inlineprocessors.
+ # Walk children here to remove any `markdown` attributes.
+ for child in list(element):
+ self.parse_element_content(child)
+
+ else:
+ # Disable inline parsing for everything else
+ if element.text is None:
+ element.text = ''
+ element.text = util.AtomicString(element.text)
+ for child in list(element):
+ self.parse_element_content(child)
+ if child.tail:
+ child.tail = util.AtomicString(child.tail)
+
+ def run(self, parent, blocks):
+ m = util.HTML_PLACEHOLDER_RE.match(blocks[0])
+ if m:
+ index = int(m.group(1))
+ element = self.parser.md.htmlStash.rawHtmlBlocks[index]
+ if isinstance(element, etree.Element):
+ # We have a matched element. Process it.
+ blocks.pop(0)
+ self.parse_element_content(element)
+ parent.append(element)
+ # Cleanup stash. Replace element with empty string to avoid confusing postprocessor.
+ self.parser.md.htmlStash.rawHtmlBlocks.pop(index)
+ self.parser.md.htmlStash.rawHtmlBlocks.insert(index, '')
+ # Confirm the match to the blockparser.
+ return True
+ # No match found.
+ return False
+
+
+class MarkdownInHTMLPostprocessor(RawHtmlPostprocessor):
+ def stash_to_string(self, text):
+ """ Override default to handle any etree elements still in the stash. """
+ if isinstance(text, etree.Element):
+ return self.md.serializer(text)
+ else:
+ return str(text)
+
+
+class MarkdownInHtmlExtension(Extension):
+ """Add Markdown parsing in HTML to Markdown class."""
+
+ def extendMarkdown(self, md):
+ """ Register extension instances. """
+
+ # Replace raw HTML preprocessor
+ md.preprocessors.register(HtmlBlockPreprocessor(md), 'html_block', 20)
+ # Add blockprocessor which handles the placeholders for etree elements
+ md.parser.blockprocessors.register(
+ MarkdownInHtmlProcessor(md.parser), 'markdown_block', 105
+ )
+ # Replace raw HTML postprocessor
+ md.postprocessors.register(MarkdownInHTMLPostprocessor(md), 'raw_html', 30)
+
+
+def makeExtension(**kwargs): # pragma: no cover
+ return MarkdownInHtmlExtension(**kwargs)
diff --git a/markdown/extensions/meta.py b/markdown/extensions/meta.py
index 711235e..10dee11 100644
--- a/markdown/extensions/meta.py
+++ b/markdown/extensions/meta.py
@@ -4,19 +4,17 @@
This extension adds Meta Data handling to markdown.
-See <https://pythonhosted.org/Markdown/extensions/meta_data.html>
+See <https://Python-Markdown.github.io/extensions/meta_data>
for documentation.
Original code Copyright 2007-2008 [Waylan Limberg](http://achinghead.com).
All changes Copyright 2008-2014 The Python Markdown Project
-License: [BSD](http://www.opensource.org/licenses/bsd-license.php)
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
"""
-from __future__ import absolute_import
-from __future__ import unicode_literals
from . import Extension
from ..preprocessors import Preprocessor
import re
@@ -34,11 +32,14 @@
class MetaExtension (Extension):
""" Meta-Data extension for Python-Markdown. """
- def extendMarkdown(self, md, md_globals):
+ def extendMarkdown(self, md):
""" Add MetaPreprocessor to Markdown instance. """
- md.preprocessors.add("meta",
- MetaPreprocessor(md),
- ">normalize_whitespace")
+ md.registerExtension(self)
+ self.md = md
+ md.preprocessors.register(MetaPreprocessor(md), 'meta', 27)
+
+ def reset(self):
+ self.md.Meta = {}
class MetaPreprocessor(Preprocessor):
@@ -70,9 +71,9 @@
else:
lines.insert(0, line)
break # no meta data - done
- self.markdown.Meta = meta
+ self.md.Meta = meta
return lines
-def makeExtension(*args, **kwargs):
- return MetaExtension(*args, **kwargs)
+def makeExtension(**kwargs): # pragma: no cover
+ return MetaExtension(**kwargs)
diff --git a/markdown/extensions/nl2br.py b/markdown/extensions/nl2br.py
index 8acd60c..6c7491b 100644
--- a/markdown/extensions/nl2br.py
+++ b/markdown/extensions/nl2br.py
@@ -5,31 +5,29 @@
A Python-Markdown extension to treat newlines as hard breaks; like
GitHub-flavored Markdown does.
-See <https://pythonhosted.org/Markdown/extensions/nl2br.html>
+See <https://Python-Markdown.github.io/extensions/nl2br>
for documentation.
-Oringinal code Copyright 2011 [Brian Neal](http://deathofagremmie.com/)
+Original code Copyright 2011 [Brian Neal](https://deathofagremmie.com/)
All changes Copyright 2011-2014 The Python Markdown Project
-License: [BSD](http://www.opensource.org/licenses/bsd-license.php)
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
"""
-from __future__ import absolute_import
-from __future__ import unicode_literals
from . import Extension
-from ..inlinepatterns import SubstituteTagPattern
+from ..inlinepatterns import SubstituteTagInlineProcessor
BR_RE = r'\n'
class Nl2BrExtension(Extension):
- def extendMarkdown(self, md, md_globals):
- br_tag = SubstituteTagPattern(BR_RE, 'br')
- md.inlinePatterns.add('nl', br_tag, '_end')
+ def extendMarkdown(self, md):
+ br_tag = SubstituteTagInlineProcessor(BR_RE, 'br')
+ md.inlinePatterns.register(br_tag, 'nl', 5)
-def makeExtension(*args, **kwargs):
- return Nl2BrExtension(*args, **kwargs)
+def makeExtension(**kwargs): # pragma: no cover
+ return Nl2BrExtension(**kwargs)
diff --git a/markdown/extensions/sane_lists.py b/markdown/extensions/sane_lists.py
index 213c8a6..e27eb18 100644
--- a/markdown/extensions/sane_lists.py
+++ b/markdown/extensions/sane_lists.py
@@ -4,19 +4,17 @@
Modify the behavior of Lists in Python-Markdown to act in a sane manor.
-See <https://pythonhosted.org/Markdown/extensions/sane_lists.html>
+See <https://Python-Markdown.github.io/extensions/sane_lists>
for documentation.
Original code Copyright 2011 [Waylan Limberg](http://achinghead.com)
All changes Copyright 2011-2014 The Python Markdown Project
-License: [BSD](http://www.opensource.org/licenses/bsd-license.php)
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
"""
-from __future__ import absolute_import
-from __future__ import unicode_literals
from . import Extension
from ..blockprocessors import OListProcessor, UListProcessor
import re
@@ -24,24 +22,33 @@
class SaneOListProcessor(OListProcessor):
- CHILD_RE = re.compile(r'^[ ]{0,3}((\d+\.))[ ]+(.*)')
SIBLING_TAGS = ['ol']
+ LAZY_OL = False
+
+ def __init__(self, parser):
+ super().__init__(parser)
+ self.CHILD_RE = re.compile(r'^[ ]{0,%d}((\d+\.))[ ]+(.*)' %
+ (self.tab_length - 1))
class SaneUListProcessor(UListProcessor):
- CHILD_RE = re.compile(r'^[ ]{0,3}(([*+-]))[ ]+(.*)')
SIBLING_TAGS = ['ul']
+ def __init__(self, parser):
+ super().__init__(parser)
+ self.CHILD_RE = re.compile(r'^[ ]{0,%d}(([*+-]))[ ]+(.*)' %
+ (self.tab_length - 1))
+
class SaneListExtension(Extension):
""" Add sane lists to Markdown. """
- def extendMarkdown(self, md, md_globals):
+ def extendMarkdown(self, md):
""" Override existing Processors. """
- md.parser.blockprocessors['olist'] = SaneOListProcessor(md.parser)
- md.parser.blockprocessors['ulist'] = SaneUListProcessor(md.parser)
+ md.parser.blockprocessors.register(SaneOListProcessor(md.parser), 'olist', 40)
+ md.parser.blockprocessors.register(SaneUListProcessor(md.parser), 'ulist', 30)
-def makeExtension(*args, **kwargs):
- return SaneListExtension(*args, **kwargs)
+def makeExtension(**kwargs): # pragma: no cover
+ return SaneListExtension(**kwargs)
diff --git a/markdown/extensions/smart_strong.py b/markdown/extensions/smart_strong.py
deleted file mode 100644
index 58570bb..0000000
--- a/markdown/extensions/smart_strong.py
+++ /dev/null
@@ -1,41 +0,0 @@
-'''
-Smart_Strong Extension for Python-Markdown
-==========================================
-
-This extention adds smarter handling of double underscores within words.
-
-See <https://pythonhosted.org/Markdown/extensions/smart_strong.html>
-for documentation.
-
-Original code Copyright 2011 [Waylan Limberg](http://achinghead.com)
-
-All changes Copyright 2011-2014 The Python Markdown Project
-
-License: [BSD](http://www.opensource.org/licenses/bsd-license.php)
-
-'''
-
-from __future__ import absolute_import
-from __future__ import unicode_literals
-from . import Extension
-from ..inlinepatterns import SimpleTagPattern
-
-SMART_STRONG_RE = r'(?<!\w)(_{2})(?!_)(.+?)(?<!_)\2(?!\w)'
-STRONG_RE = r'(\*{2})(.+?)\2'
-
-
-class SmartEmphasisExtension(Extension):
- """ Add smart_emphasis extension to Markdown class."""
-
- def extendMarkdown(self, md, md_globals):
- """ Modify inline patterns. """
- md.inlinePatterns['strong'] = SimpleTagPattern(STRONG_RE, 'strong')
- md.inlinePatterns.add(
- 'strong2',
- SimpleTagPattern(SMART_STRONG_RE, 'strong'),
- '>emphasis2'
- )
-
-
-def makeExtension(*args, **kwargs):
- return SmartEmphasisExtension(*args, **kwargs)
diff --git a/markdown/extensions/smarty.py b/markdown/extensions/smarty.py
index 46e54c1..894805f 100644
--- a/markdown/extensions/smarty.py
+++ b/markdown/extensions/smarty.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
'''
Smarty extension for Python-Markdown
====================================
@@ -6,18 +5,18 @@
Adds conversion of ASCII dashes, quotes and ellipses to their HTML
entity equivalents.
-See <https://pythonhosted.org/Markdown/extensions/smarty.html>
+See <https://Python-Markdown.github.io/extensions/smarty>
for documentation.
Author: 2013, Dmitry Shachnev <mitya57@gmail.com>
All changes Copyright 2013-2014 The Python Markdown Project
-License: [BSD](http://www.opensource.org/licenses/bsd-license.php)
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
SmartyPants license:
- Copyright (c) 2003 John Gruber <http://daringfireball.net/>
+ Copyright (c) 2003 John Gruber <https://daringfireball.net/>
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -81,26 +80,25 @@
'''
-from __future__ import unicode_literals
from . import Extension
-from ..inlinepatterns import HtmlPattern
-from ..odict import OrderedDict
+from ..inlinepatterns import HtmlInlineProcessor, HTML_RE
from ..treeprocessors import InlineProcessor
+from ..util import Registry, deprecated
# Constants for quote education.
punctClass = r"""[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]"""
endOfWordClass = r"[\s.,;:!?)]"
-closeClass = "[^\ \t\r\n\[\{\(\-\u0002\u0003]"
+closeClass = r"[^\ \t\r\n\[\{\(\-\u0002\u0003]"
openingQuotesBase = (
- '(\s' # a whitespace char
- '| ' # or a non-breaking space entity
- '|--' # or dashes
- '|–|—' # or unicode
- '|&[mn]dash;' # or named dash entities
- '|–|—' # or decimal entities
- ')'
+ r'(\s' # a whitespace char
+ r'| ' # or a non-breaking space entity
+ r'|--' # or dashes
+ r'|–|—' # or unicode
+ r'|&[mn]dash;' # or named dash entities
+ r'|–|—' # or decimal entities
+ r')'
)
substitutions = {
@@ -144,29 +142,37 @@
closingSingleQuotesRegex2 = r"(?<=%s)'(\s|s\b)" % closeClass
# All remaining quotes should be opening ones
-remainingSingleQuotesRegex = "'"
-remainingDoubleQuotesRegex = '"'
+remainingSingleQuotesRegex = r"'"
+remainingDoubleQuotesRegex = r'"'
+
+HTML_STRICT_RE = HTML_RE + r'(?!\>)'
-class SubstituteTextPattern(HtmlPattern):
- def __init__(self, pattern, replace, markdown_instance):
+class SubstituteTextPattern(HtmlInlineProcessor):
+ def __init__(self, pattern, replace, md):
""" Replaces matches with some text. """
- HtmlPattern.__init__(self, pattern)
+ HtmlInlineProcessor.__init__(self, pattern)
self.replace = replace
- self.markdown = markdown_instance
+ self.md = md
- def handleMatch(self, m):
+ @property
+ @deprecated("Use 'md' instead.")
+ def markdown(self):
+ # TODO: remove this later
+ return self.md
+
+ def handleMatch(self, m, data):
result = ''
for part in self.replace:
if isinstance(part, int):
result += m.group(part)
else:
- result += self.markdown.htmlStash.store(part, safe=True)
- return result
+ result += self.md.htmlStash.store(part)
+ return result, m.start(0), m.end(0)
class SmartyExtension(Extension):
- def __init__(self, *args, **kwargs):
+ def __init__(self, **kwargs):
self.config = {
'smart_quotes': [True, 'Educate quotes'],
'smart_angled_quotes': [False, 'Educate angled quotes'],
@@ -174,17 +180,16 @@
'smart_ellipses': [True, 'Educate ellipses'],
'substitutions': [{}, 'Overwrite default substitutions'],
}
- super(SmartyExtension, self).__init__(*args, **kwargs)
+ super().__init__(**kwargs)
self.substitutions = dict(substitutions)
self.substitutions.update(self.getConfig('substitutions', default={}))
- def _addPatterns(self, md, patterns, serie):
+ def _addPatterns(self, md, patterns, serie, priority):
for ind, pattern in enumerate(patterns):
pattern += (md,)
pattern = SubstituteTextPattern(*pattern)
- after = ('>smarty-%s-%d' % (serie, ind - 1) if ind else '_begin')
name = 'smarty-%s-%d' % (serie, ind)
- self.inlinePatterns.add(name, pattern, after)
+ self.inlinePatterns.register(pattern, name, priority-ind)
def educateDashes(self, md):
emDashesPattern = SubstituteTextPattern(
@@ -193,16 +198,14 @@
enDashesPattern = SubstituteTextPattern(
r'(?<!-)--(?!-)', (self.substitutions['ndash'],), md
)
- self.inlinePatterns.add('smarty-em-dashes', emDashesPattern, '_begin')
- self.inlinePatterns.add(
- 'smarty-en-dashes', enDashesPattern, '>smarty-em-dashes'
- )
+ self.inlinePatterns.register(emDashesPattern, 'smarty-em-dashes', 50)
+ self.inlinePatterns.register(enDashesPattern, 'smarty-en-dashes', 45)
def educateEllipses(self, md):
ellipsesPattern = SubstituteTextPattern(
r'(?<!\.)\.{3}(?!\.)', (self.substitutions['ellipsis'],), md
)
- self.inlinePatterns.add('smarty-ellipses', ellipsesPattern, '_begin')
+ self.inlinePatterns.register(ellipsesPattern, 'smarty-ellipses', 10)
def educateAngledQuotes(self, md):
leftAngledQuotePattern = SubstituteTextPattern(
@@ -211,14 +214,8 @@
rightAngledQuotePattern = SubstituteTextPattern(
r'\>\>', (self.substitutions['right-angle-quote'],), md
)
- self.inlinePatterns.add(
- 'smarty-left-angle-quotes', leftAngledQuotePattern, '_begin'
- )
- self.inlinePatterns.add(
- 'smarty-right-angle-quotes',
- rightAngledQuotePattern,
- '>smarty-left-angle-quotes'
- )
+ self.inlinePatterns.register(leftAngledQuotePattern, 'smarty-left-angle-quotes', 40)
+ self.inlinePatterns.register(rightAngledQuotePattern, 'smarty-right-angle-quotes', 35)
def educateQuotes(self, md):
lsquo = self.substitutions['left-single-quote']
@@ -231,33 +228,36 @@
(doubleQuoteSetsRe, (ldquo + lsquo,)),
(singleQuoteSetsRe, (lsquo + ldquo,)),
(decadeAbbrRe, (rsquo,)),
- (openingSingleQuotesRegex, (2, lsquo)),
+ (openingSingleQuotesRegex, (1, lsquo)),
(closingSingleQuotesRegex, (rsquo,)),
- (closingSingleQuotesRegex2, (rsquo, 2)),
+ (closingSingleQuotesRegex2, (rsquo, 1)),
(remainingSingleQuotesRegex, (lsquo,)),
- (openingDoubleQuotesRegex, (2, ldquo)),
+ (openingDoubleQuotesRegex, (1, ldquo)),
(closingDoubleQuotesRegex, (rdquo,)),
(closingDoubleQuotesRegex2, (rdquo,)),
(remainingDoubleQuotesRegex, (ldquo,))
)
- self._addPatterns(md, patterns, 'quotes')
+ self._addPatterns(md, patterns, 'quotes', 30)
- def extendMarkdown(self, md, md_globals):
+ def extendMarkdown(self, md):
configs = self.getConfigs()
- self.inlinePatterns = OrderedDict()
+ self.inlinePatterns = Registry()
if configs['smart_ellipses']:
self.educateEllipses(md)
if configs['smart_quotes']:
self.educateQuotes(md)
if configs['smart_angled_quotes']:
self.educateAngledQuotes(md)
+ # Override HTML_RE from inlinepatterns.py so that it does not
+ # process tags with duplicate closing quotes.
+ md.inlinePatterns.register(HtmlInlineProcessor(HTML_STRICT_RE, md), 'html', 90)
if configs['smart_dashes']:
self.educateDashes(md)
inlineProcessor = InlineProcessor(md)
inlineProcessor.inlinePatterns = self.inlinePatterns
- md.treeprocessors.add('smarty', inlineProcessor, '_end')
+ md.treeprocessors.register(inlineProcessor, 'smarty', 2)
md.ESCAPED_CHARS.extend(['"', "'"])
-def makeExtension(*args, **kwargs):
- return SmartyExtension(*args, **kwargs)
+def makeExtension(**kwargs): # pragma: no cover
+ return SmartyExtension(**kwargs)
diff --git a/markdown/extensions/tables.py b/markdown/extensions/tables.py
index 368321d..4b027bb 100644
--- a/markdown/extensions/tables.py
+++ b/markdown/extensions/tables.py
@@ -4,46 +4,83 @@
Added parsing of tables to Python-Markdown.
-See <https://pythonhosted.org/Markdown/extensions/tables.html>
+See <https://Python-Markdown.github.io/extensions/tables>
for documentation.
Original code Copyright 2009 [Waylan Limberg](http://achinghead.com)
All changes Copyright 2008-2014 The Python Markdown Project
-License: [BSD](http://www.opensource.org/licenses/bsd-license.php)
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
"""
-from __future__ import absolute_import
-from __future__ import unicode_literals
from . import Extension
from ..blockprocessors import BlockProcessor
-from ..util import etree
+import xml.etree.ElementTree as etree
+import re
+PIPE_NONE = 0
+PIPE_LEFT = 1
+PIPE_RIGHT = 2
class TableProcessor(BlockProcessor):
""" Process Tables. """
+ RE_CODE_PIPES = re.compile(r'(?:(\\\\)|(\\`+)|(`+)|(\\\|)|(\|))')
+ RE_END_BORDER = re.compile(r'(?<!\\)(?:\\\\)*\|$')
+
+ def __init__(self, parser):
+ self.border = False
+ self.separator = ''
+ super().__init__(parser)
+
def test(self, parent, block):
- rows = block.split('\n')
- return (len(rows) > 1 and '|' in rows[0] and
- '|' in rows[1] and '-' in rows[1] and
- rows[1].strip()[0] in ['|', ':', '-'])
+ """
+ Ensure first two rows (column header and separator row) are valid table rows.
+
+ Keep border check and separator row to avoid repeating the work.
+ """
+ is_table = False
+ rows = [row.strip(' ') for row in block.split('\n')]
+ if len(rows) > 1:
+ header0 = rows[0]
+ self.border = PIPE_NONE
+ if header0.startswith('|'):
+ self.border |= PIPE_LEFT
+ if self.RE_END_BORDER.search(header0) is not None:
+ self.border |= PIPE_RIGHT
+ row = self._split_row(header0)
+ row0_len = len(row)
+ is_table = row0_len > 1
+
+ # Each row in a single column table needs at least one pipe.
+ if not is_table and row0_len == 1 and self.border:
+ for index in range(1, len(rows)):
+ is_table = rows[index].startswith('|')
+ if not is_table:
+ is_table = self.RE_END_BORDER.search(rows[index]) is not None
+ if not is_table:
+ break
+
+ if is_table:
+ row = self._split_row(rows[1])
+ is_table = (len(row) == row0_len) and set(''.join(row)) <= set('|:- ')
+ if is_table:
+ self.separator = row
+
+ return is_table
def run(self, parent, blocks):
""" Parse a table block and build table. """
block = blocks.pop(0).split('\n')
- header = block[0].strip()
- seperator = block[1].strip()
+ header = block[0].strip(' ')
rows = [] if len(block) < 3 else block[2:]
- # Get format type (bordered by pipes or not)
- border = False
- if header.startswith('|'):
- border = True
+
# Get alignment of columns
align = []
- for c in self._split_row(seperator, border):
+ for c in self.separator:
+ c = c.strip(' ')
if c.startswith(':') and c.endswith(':'):
align.append('center')
elif c.startswith(':'):
@@ -52,51 +89,135 @@
align.append('right')
else:
align.append(None)
+
# Build table
table = etree.SubElement(parent, 'table')
thead = etree.SubElement(table, 'thead')
- self._build_row(header, thead, align, border)
+ self._build_row(header, thead, align)
tbody = etree.SubElement(table, 'tbody')
- for row in rows:
- self._build_row(row.strip(), tbody, align, border)
+ if len(rows) == 0:
+ # Handle empty table
+ self._build_empty_row(tbody, align)
+ else:
+ for row in rows:
+ self._build_row(row.strip(' '), tbody, align)
- def _build_row(self, row, parent, align, border):
+ def _build_empty_row(self, parent, align):
+ """Build an empty row."""
+ tr = etree.SubElement(parent, 'tr')
+ count = len(align)
+ while count:
+ etree.SubElement(tr, 'td')
+ count -= 1
+
+ def _build_row(self, row, parent, align):
""" Given a row of text, build table cells. """
tr = etree.SubElement(parent, 'tr')
tag = 'td'
if parent.tag == 'thead':
tag = 'th'
- cells = self._split_row(row, border)
+ cells = self._split_row(row)
# We use align here rather than cells to ensure every row
# contains the same number of columns.
for i, a in enumerate(align):
c = etree.SubElement(tr, tag)
try:
- c.text = cells[i].strip()
+ c.text = cells[i].strip(' ')
except IndexError: # pragma: no cover
c.text = ""
if a:
c.set('align', a)
- def _split_row(self, row, border):
+ def _split_row(self, row):
""" split a row of text into list of cells. """
- if border:
+ if self.border:
if row.startswith('|'):
row = row[1:]
- if row.endswith('|'):
- row = row[:-1]
- return row.split('|')
+ row = self.RE_END_BORDER.sub('', row)
+ return self._split(row)
+
+ def _split(self, row):
+ """ split a row of text with some code into a list of cells. """
+ elements = []
+ pipes = []
+ tics = []
+ tic_points = []
+ tic_region = []
+ good_pipes = []
+
+ # Parse row
+ # Throw out \\, and \|
+ for m in self.RE_CODE_PIPES.finditer(row):
+ # Store ` data (len, start_pos, end_pos)
+ if m.group(2):
+ # \`+
+ # Store length of each tic group: subtract \
+ tics.append(len(m.group(2)) - 1)
+ # Store start of group, end of group, and escape length
+ tic_points.append((m.start(2), m.end(2) - 1, 1))
+ elif m.group(3):
+ # `+
+ # Store length of each tic group
+ tics.append(len(m.group(3)))
+ # Store start of group, end of group, and escape length
+ tic_points.append((m.start(3), m.end(3) - 1, 0))
+ # Store pipe location
+ elif m.group(5):
+ pipes.append(m.start(5))
+
+ # Pair up tics according to size if possible
+ # Subtract the escape length *only* from the opening.
+ # Walk through tic list and see if tic has a close.
+ # Store the tic region (start of region, end of region).
+ pos = 0
+ tic_len = len(tics)
+ while pos < tic_len:
+ try:
+ tic_size = tics[pos] - tic_points[pos][2]
+ if tic_size == 0:
+ raise ValueError
+ index = tics[pos + 1:].index(tic_size) + 1
+ tic_region.append((tic_points[pos][0], tic_points[pos + index][1]))
+ pos += index + 1
+ except ValueError:
+ pos += 1
+
+ # Resolve pipes. Check if they are within a tic pair region.
+ # Walk through pipes comparing them to each region.
+ # - If pipe position is less than a region, it isn't in a region
+ # - If it is within a region, we don't want it, so throw it out
+ # - If we didn't throw it out, it must be a table pipe
+ for pipe in pipes:
+ throw_out = False
+ for region in tic_region:
+ if pipe < region[0]:
+ # Pipe is not in a region
+ break
+ elif region[0] <= pipe <= region[1]:
+ # Pipe is within a code region. Throw it out.
+ throw_out = True
+ break
+ if not throw_out:
+ good_pipes.append(pipe)
+
+ # Split row according to table delimiters.
+ pos = 0
+ for pipe in good_pipes:
+ elements.append(row[pos:pipe])
+ pos = pipe + 1
+ elements.append(row[pos:])
+ return elements
class TableExtension(Extension):
""" Add tables to Markdown. """
- def extendMarkdown(self, md, md_globals):
+ def extendMarkdown(self, md):
""" Add an instance of TableProcessor to BlockParser. """
- md.parser.blockprocessors.add('table',
- TableProcessor(md.parser),
- '<hashheader')
+ if '|' not in md.ESCAPED_CHARS:
+ md.ESCAPED_CHARS.append('|')
+ md.parser.blockprocessors.register(TableProcessor(md.parser), 'table', 75)
-def makeExtension(*args, **kwargs):
- return TableExtension(*args, **kwargs)
+def makeExtension(**kwargs): # pragma: no cover
+ return TableExtension(**kwargs)
diff --git a/markdown/extensions/toc.py b/markdown/extensions/toc.py
index b3cf898..d64ec16 100644
--- a/markdown/extensions/toc.py
+++ b/markdown/extensions/toc.py
@@ -2,31 +2,37 @@
Table of Contents Extension for Python-Markdown
===============================================
-See <https://pythonhosted.org/Markdown/extensions/toc.html>
+See <https://Python-Markdown.github.io/extensions/toc>
for documentation.
-Oringinal code Copyright 2008 [Jack Miller](http://codezen.org)
+Original code Copyright 2008 [Jack Miller](https://codezen.org/)
All changes Copyright 2008-2014 The Python Markdown Project
-License: [BSD](http://www.opensource.org/licenses/bsd-license.php)
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
"""
-from __future__ import absolute_import
-from __future__ import unicode_literals
from . import Extension
from ..treeprocessors import Treeprocessor
-from ..util import etree, parseBoolValue, AMP_SUBSTITUTE, HTML_PLACEHOLDER_RE, string_type
+from ..util import code_escape, parseBoolValue, AMP_SUBSTITUTE, HTML_PLACEHOLDER_RE, AtomicString
+from ..postprocessors import UnescapePostprocessor
import re
+import html
import unicodedata
+import xml.etree.ElementTree as etree
-def slugify(value, separator):
+def slugify(value, separator, encoding='ascii'):
""" Slugify a string, to make it URL friendly. """
- value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
- value = re.sub('[^\w\s-]', '', value.decode('ascii')).strip().lower()
- return re.sub('[%s\s]+' % separator, separator, value)
+ value = unicodedata.normalize('NFKD', value).encode(encoding, 'ignore')
+ value = re.sub(r'[^\w\s-]', '', value.decode(encoding)).strip().lower()
+ return re.sub(r'[{}\s]+'.format(separator), separator, value)
+
+
+def slugify_unicode(value, separator):
+ """ Slugify a string, to make it URL friendly while preserving Unicode characters. """
+ return slugify(value, separator, 'utf-8')
IDCOUNT_RE = re.compile(r'^(.*)_([0-9]+)$')
@@ -44,22 +50,41 @@
return id
-def stashedHTML2text(text, md):
+def get_name(el):
+ """Get title name."""
+
+ text = []
+ for c in el.itertext():
+ if isinstance(c, AtomicString):
+ text.append(html.unescape(c))
+ else:
+ text.append(c)
+ return ''.join(text).strip()
+
+
+def stashedHTML2text(text, md, strip_entities=True):
""" Extract raw HTML from stash, reduce to plain text and swap with placeholder. """
def _html_sub(m):
""" Substitute raw html with plain text. """
try:
- raw, safe = md.htmlStash.rawHtmlBlocks[int(m.group(1))]
+ raw = md.htmlStash.rawHtmlBlocks[int(m.group(1))]
except (IndexError, TypeError): # pragma: no cover
return m.group(0)
- if md.safeMode and not safe: # pragma: no cover
- return ''
- # Strip out tags and entities - leaveing text
- return re.sub(r'(<[^>]+>)|(&[\#a-zA-Z0-9]+;)', '', raw)
+ # Strip out tags and/or entities - leaving text
+ res = re.sub(r'(<[^>]+>)', '', raw)
+ if strip_entities:
+ res = re.sub(r'(&[\#a-zA-Z0-9]+;)', '', res)
+ return res
return HTML_PLACEHOLDER_RE.sub(_html_sub, text)
+def unescape(text):
+ """ Unescape escaped text. """
+ c = UnescapePostprocessor()
+ return c.run(text)
+
+
def nest_toc_tokens(toc_list):
"""Given an unsorted list with errors and skips, return a nested one.
[{'level': 1}, {'level': 2}]
@@ -125,7 +150,7 @@
class TocTreeprocessor(Treeprocessor):
def __init__(self, md, config):
- super(TocTreeprocessor, self).__init__(md)
+ super().__init__(md)
self.marker = config["marker"]
self.title = config["title"]
@@ -133,17 +158,29 @@
self.slugify = config["slugify"]
self.sep = config["separator"]
self.use_anchors = parseBoolValue(config["anchorlink"])
+ self.anchorlink_class = config["anchorlink_class"]
self.use_permalinks = parseBoolValue(config["permalink"], False)
if self.use_permalinks is None:
self.use_permalinks = config["permalink"]
-
+ self.permalink_class = config["permalink_class"]
+ self.permalink_title = config["permalink_title"]
self.header_rgx = re.compile("[Hh][123456]")
+ if isinstance(config["toc_depth"], str) and '-' in config["toc_depth"]:
+ self.toc_top, self.toc_bottom = [int(x) for x in config["toc_depth"].split('-')]
+ else:
+ self.toc_top = 1
+ self.toc_bottom = int(config["toc_depth"])
- def iterparent(self, root):
- ''' Iterator wrapper to get parent and child all at once. '''
- for parent in root.iter():
- for child in parent:
- yield parent, child
+ def iterparent(self, node):
+ ''' Iterator wrapper to get allowed parent and child all at once. '''
+
+ # We do not allow the marker inside a header as that
+ # would cause an endless loop of placing a new TOC
+ # inside previously generated TOC.
+ for child in node:
+ if not self.header_rgx.match(child.tag) and child.tag not in ['pre', 'code']:
+ yield node, child
+ yield from self.iterparent(child)
def replace_marker(self, root, elem):
''' Replace marker with elem. '''
@@ -155,11 +192,7 @@
# To keep the output from screwing up the
# validation by putting a <div> inside of a <p>
# we actually replace the <p> in its entirety.
- # We do not allow the marker inside a header as that
- # would causes an enless loop of placing a new TOC
- # inside previously generated TOC.
- if c.text and c.text.strip() == self.marker and \
- not self.header_rgx.match(c.tag) and c.tag not in ['pre', 'code']:
+ if c.text and c.text.strip() == self.marker:
for i in range(len(p)):
if p[i] == c:
p[i] = elem
@@ -176,11 +209,12 @@
anchor = etree.Element("a")
anchor.text = c.text
anchor.attrib["href"] = "#" + elem_id
- anchor.attrib["class"] = "toclink"
+ anchor.attrib["class"] = self.anchorlink_class
c.text = ""
for elem in c:
anchor.append(elem)
- c.remove(elem)
+ while len(c):
+ c.remove(c[0])
c.append(anchor)
def add_permalink(self, c, elem_id):
@@ -189,8 +223,9 @@
if self.use_permalinks is True
else self.use_permalinks)
permalink.attrib["href"] = "#" + elem_id
- permalink.attrib["class"] = "headerlink"
- permalink.attrib["title"] = "Permanent link"
+ permalink.attrib["class"] = self.permalink_class
+ if self.permalink_title:
+ permalink.attrib["title"] = self.permalink_title
c.append(permalink)
def build_toc_div(self, toc_list):
@@ -217,9 +252,10 @@
return ul
build_etree_ul(toc_list, div)
- prettify = self.markdown.treeprocessors.get('prettify')
- if prettify:
- prettify.run(div)
+
+ if 'prettify' in self.md.treeprocessors:
+ self.md.treeprocessors['prettify'].run(div)
+
return div
def run(self, doc):
@@ -231,42 +267,52 @@
toc_tokens = []
for el in doc.iter():
- if isinstance(el.tag, string_type) and self.header_rgx.match(el.tag):
+ if isinstance(el.tag, str) and self.header_rgx.match(el.tag):
self.set_level(el)
- text = ''.join(el.itertext()).strip()
+ text = get_name(el)
# Do not override pre-existing ids
if "id" not in el.attrib:
- innertext = stashedHTML2text(text, self.markdown)
+ innertext = unescape(stashedHTML2text(text, self.md))
el.attrib["id"] = unique(self.slugify(innertext, self.sep), used_ids)
- toc_tokens.append({
- 'level': int(el.tag[-1]),
- 'id': el.attrib["id"],
- 'name': text
- })
+ if int(el.tag[-1]) >= self.toc_top and int(el.tag[-1]) <= self.toc_bottom:
+ toc_tokens.append({
+ 'level': int(el.tag[-1]),
+ 'id': el.attrib["id"],
+ 'name': unescape(stashedHTML2text(
+ code_escape(el.attrib.get('data-toc-label', text)),
+ self.md, strip_entities=False
+ ))
+ })
+
+ # Remove the data-toc-label attribute as it is no longer needed
+ if 'data-toc-label' in el.attrib:
+ del el.attrib['data-toc-label']
if self.use_anchors:
self.add_anchor(el, el.attrib["id"])
- if self.use_permalinks:
+ if self.use_permalinks not in [False, None]:
self.add_permalink(el, el.attrib["id"])
- div = self.build_toc_div(nest_toc_tokens(toc_tokens))
+ toc_tokens = nest_toc_tokens(toc_tokens)
+ div = self.build_toc_div(toc_tokens)
if self.marker:
self.replace_marker(doc, div)
# serialize and attach to markdown instance.
- toc = self.markdown.serializer(div)
- for pp in self.markdown.postprocessors.values():
+ toc = self.md.serializer(div)
+ for pp in self.md.postprocessors:
toc = pp.run(toc)
- self.markdown.toc = toc
+ self.md.toc_tokens = toc_tokens
+ self.md.toc = toc
class TocExtension(Extension):
TreeProcessorClass = TocTreeprocessor
- def __init__(self, *args, **kwargs):
+ def __init__(self, **kwargs):
self.config = {
"marker": ['[TOC]',
'Text to find and replace with Table of Contents - '
@@ -277,19 +323,35 @@
"anchorlink": [False,
"True if header should be a self link - "
"Defaults to False"],
+ "anchorlink_class": ['toclink',
+ 'CSS class(es) used for the link. '
+ 'Defaults to "toclink"'],
"permalink": [0,
"True or link text if a Sphinx-style permalink should "
"be added - Defaults to False"],
+ "permalink_class": ['headerlink',
+ 'CSS class(es) used for the link. '
+ 'Defaults to "headerlink"'],
+ "permalink_title": ["Permanent link",
+ "Title attribute of the permalink - "
+ "Defaults to 'Permanent link'"],
"baselevel": ['1', 'Base level for headers.'],
"slugify": [slugify,
"Function to generate anchors based on header text - "
"Defaults to the headerid ext's slugify function."],
- 'separator': ['-', 'Word separator. Defaults to "-".']
+ 'separator': ['-', 'Word separator. Defaults to "-".'],
+ "toc_depth": [6,
+ 'Define the range of section levels to include in'
+ 'the Table of Contents. A single integer (b) defines'
+ 'the bottom section level (<h1>..<hb>) only.'
+ 'A string consisting of two digits separated by a hyphen'
+ 'in between ("2-5"), define the top (t) and the'
+ 'bottom (b) (<ht>..<hb>). Defaults to `6` (bottom).'],
}
- super(TocExtension, self).__init__(*args, **kwargs)
+ super().__init__(**kwargs)
- def extendMarkdown(self, md, md_globals):
+ def extendMarkdown(self, md):
md.registerExtension(self)
self.md = md
self.reset()
@@ -299,11 +361,12 @@
# by the header id extension) if both are used. Same goes for
# attr_list extension. This must come last because we don't want
# to redefine ids after toc is created. But we do want toc prettified.
- md.treeprocessors.add("toc", tocext, "_end")
+ md.treeprocessors.register(tocext, 'toc', 5)
def reset(self):
self.md.toc = ''
+ self.md.toc_tokens = []
-def makeExtension(*args, **kwargs):
- return TocExtension(*args, **kwargs)
+def makeExtension(**kwargs): # pragma: no cover
+ return TocExtension(**kwargs)
diff --git a/markdown/extensions/wikilinks.py b/markdown/extensions/wikilinks.py
index 94e1b67..cddee7a 100644
--- a/markdown/extensions/wikilinks.py
+++ b/markdown/extensions/wikilinks.py
@@ -4,34 +4,32 @@
Converts [[WikiLinks]] to relative links.
-See <https://pythonhosted.org/Markdown/extensions/wikilinks.html>
+See <https://Python-Markdown.github.io/extensions/wikilinks>
for documentation.
Original code Copyright [Waylan Limberg](http://achinghead.com/).
All changes Copyright The Python Markdown Project
-License: [BSD](http://www.opensource.org/licenses/bsd-license.php)
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
'''
-from __future__ import absolute_import
-from __future__ import unicode_literals
from . import Extension
-from ..inlinepatterns import Pattern
-from ..util import etree
+from ..inlinepatterns import InlineProcessor
+import xml.etree.ElementTree as etree
import re
def build_url(label, base, end):
""" Build a url from the label, a base, and an end. """
clean_label = re.sub(r'([ ]+_)|(_[ ]+)|([ ]+)', '_', label)
- return '%s%s%s' % (base, clean_label, end)
+ return '{}{}{}'.format(base, clean_label, end)
class WikiLinkExtension(Extension):
- def __init__(self, *args, **kwargs):
+ def __init__(self, **kwargs):
self.config = {
'base_url': ['/', 'String to append to beginning or URL.'],
'end_url': ['/', 'String to append to end of URL.'],
@@ -39,27 +37,27 @@
'build_url': [build_url, 'Callable formats URL from label.'],
}
- super(WikiLinkExtension, self).__init__(*args, **kwargs)
+ super().__init__(**kwargs)
- def extendMarkdown(self, md, md_globals):
+ def extendMarkdown(self, md):
self.md = md
# append to end of inline patterns
WIKILINK_RE = r'\[\[([\w0-9_ -]+)\]\]'
- wikilinkPattern = WikiLinks(WIKILINK_RE, self.getConfigs())
+ wikilinkPattern = WikiLinksInlineProcessor(WIKILINK_RE, self.getConfigs())
wikilinkPattern.md = md
- md.inlinePatterns.add('wikilink', wikilinkPattern, "<not_strong")
+ md.inlinePatterns.register(wikilinkPattern, 'wikilink', 75)
-class WikiLinks(Pattern):
+class WikiLinksInlineProcessor(InlineProcessor):
def __init__(self, pattern, config):
- super(WikiLinks, self).__init__(pattern)
+ super().__init__(pattern)
self.config = config
- def handleMatch(self, m):
- if m.group(2).strip():
+ def handleMatch(self, m, data):
+ if m.group(1).strip():
base_url, end_url, html_class = self._getMeta()
- label = m.group(2).strip()
+ label = m.group(1).strip()
url = self.config['build_url'](label, base_url, end_url)
a = etree.Element('a')
a.text = label
@@ -68,7 +66,7 @@
a.set('class', html_class)
else:
a = ''
- return a
+ return a, m.start(0), m.end(0)
def _getMeta(self):
""" Return meta data or config data. """
@@ -85,5 +83,5 @@
return base_url, end_url, html_class
-def makeExtension(*args, **kwargs):
- return WikiLinkExtension(*args, **kwargs)
+def makeExtension(**kwargs): # pragma: no cover
+ return WikiLinkExtension(**kwargs)
diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py
new file mode 100644
index 0000000..c08856a
--- /dev/null
+++ b/markdown/htmlparser.py
@@ -0,0 +1,323 @@
+"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2020 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+"""
+
+import re
+import importlib
+import sys
+
+
+# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
+# Users can still do `from html import parser` and get the default behavior.
+spec = importlib.util.find_spec('html.parser')
+htmlparser = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(htmlparser)
+sys.modules['htmlparser'] = htmlparser
+
+# Monkeypatch HTMLParser to only accept `?>` to close Processing Instructions.
+htmlparser.piclose = re.compile(r'\?>')
+# Monkeypatch HTMLParser to only recognize entity references with a closing semicolon.
+htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
+# Monkeypatch HTMLParser to no longer support partial entities. We are always feeding a complete block,
+# so the 'incomplete' functionality is unnecessary. As the entityref regex is run right before incomplete,
+# and the two regex are the same, then incomplete will simply never match and we avoid the logic within.
+htmlparser.incomplete = htmlparser.entityref
+# Monkeypatch HTMLParser to not accept a backtick in a tag name, attribute name, or bare value.
+htmlparser.locatestarttagend_tolerant = re.compile(r"""
+ <[a-zA-Z][^`\t\n\r\f />\x00]* # tag name <= added backtick here
+ (?:[\s/]* # optional whitespace before attribute name
+ (?:(?<=['"\s/])[^`\s/>][^\s/=>]* # attribute name <= added backtick here
+ (?:\s*=+\s* # value indicator
+ (?:'[^']*' # LITA-enclosed value
+ |"[^"]*" # LIT-enclosed value
+ |(?!['"])[^`>\s]* # bare value <= added backtick here
+ )
+ (?:\s*,)* # possibly followed by a comma
+ )?(?:\s|/(?!>))*
+ )*
+ )?
+ \s* # trailing whitespace
+""", re.VERBOSE)
+
+# Match a blank line at the start of a block of text (two newlines).
+# The newlines may be preceded by additional whitespace.
+blank_line_re = re.compile(r'^([ ]*\n){2}')
+
+
+class HTMLExtractor(htmlparser.HTMLParser):
+ """
+ Extract raw HTML from text.
+
+ The raw HTML is stored in the `htmlStash` of the Markdown instance passed
+ to `md` and the remaining text is stored in `cleandoc` as a list of strings.
+ """
+
+ def __init__(self, md, *args, **kwargs):
+ if 'convert_charrefs' not in kwargs:
+ kwargs['convert_charrefs'] = False
+
+ # Block tags that should contain no content (self closing)
+ self.empty_tags = set(['hr'])
+
+ # This calls self.reset
+ super().__init__(*args, **kwargs)
+ self.md = md
+
+ def reset(self):
+ """Reset this instance. Loses all unprocessed data."""
+ self.inraw = False
+ self.intail = False
+ self.stack = [] # When inraw==True, stack contains a list of tags
+ self._cache = []
+ self.cleandoc = []
+ super().reset()
+
+ def close(self):
+ """Handle any buffered data."""
+ super().close()
+ if len(self.rawdata):
+ # Temp fix for https://bugs.python.org/issue41989
+ # TODO: remove this when the bug is fixed in all supported Python versions.
+ if self.convert_charrefs and not self.cdata_elem: # pragma: no cover
+ self.handle_data(htmlparser.unescape(self.rawdata))
+ else:
+ self.handle_data(self.rawdata)
+ # Handle any unclosed tags.
+ if len(self._cache):
+ self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
+ self._cache = []
+
+ @property
+ def line_offset(self):
+ """Returns char index in self.rawdata for the start of the current line. """
+ if self.lineno > 1 and '\n' in self.rawdata:
+ m = re.match(r'([^\n]*\n){{{}}}'.format(self.lineno-1), self.rawdata)
+ if m:
+ return m.end()
+ else: # pragma: no cover
+ # Value of self.lineno must exceed total number of lines.
+ # Find index of beginning of last line.
+ return self.rawdata.rfind('\n')
+ return 0
+
+ def at_line_start(self):
+ """
+ Returns True if current position is at start of line.
+
+ Allows for up to three blank spaces at start of line.
+ """
+ if self.offset == 0:
+ return True
+ if self.offset > 3:
+ return False
+ # Confirm up to first 3 chars are whitespace
+ return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == ''
+
+ def get_endtag_text(self, tag):
+ """
+ Returns the text of the end tag.
+
+ If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
+ """
+ # Attempt to extract actual tag from raw source text
+ start = self.line_offset + self.offset
+ m = htmlparser.endendtag.search(self.rawdata, start)
+ if m:
+ return self.rawdata[start:m.end()]
+ else: # pragma: no cover
+ # Failed to extract from raw data. Assume well formed and lowercase.
+ return '</{}>'.format(tag)
+
+ def handle_starttag(self, tag, attrs):
+ # Handle tags that should always be empty and do not specify a closing tag
+ if tag in self.empty_tags:
+ self.handle_startendtag(tag, attrs)
+ return
+
+ if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
+ # Started a new raw block. Prepare stack.
+ self.inraw = True
+ self.cleandoc.append('\n')
+
+ text = self.get_starttag_text()
+ if self.inraw:
+ self.stack.append(tag)
+ self._cache.append(text)
+ else:
+ self.cleandoc.append(text)
+ if tag in self.CDATA_CONTENT_ELEMENTS:
+ # This is presumably a standalone tag in a code span (see #1036).
+ self.clear_cdata_mode()
+
+ def handle_endtag(self, tag):
+ text = self.get_endtag_text(tag)
+
+ if self.inraw:
+ self._cache.append(text)
+ if tag in self.stack:
+ # Remove tag from stack
+ while self.stack:
+ if self.stack.pop() == tag:
+ break
+ if len(self.stack) == 0:
+ # End of raw block.
+ if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
+ # Preserve blank line and end of raw block.
+ self._cache.append('\n')
+ else:
+ # More content exists after endtag.
+ self.intail = True
+ # Reset stack.
+ self.inraw = False
+ self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
+ # Insert blank line between this and next line.
+ self.cleandoc.append('\n\n')
+ self._cache = []
+ else:
+ self.cleandoc.append(text)
+
+ def handle_data(self, data):
+ if self.intail and '\n' in data:
+ self.intail = False
+ if self.inraw:
+ self._cache.append(data)
+ else:
+ self.cleandoc.append(data)
+
+ def handle_empty_tag(self, data, is_block):
+ """ Handle empty tags (`<data>`). """
+ if self.inraw or self.intail:
+ # Append this to the existing raw block
+ self._cache.append(data)
+ elif self.at_line_start() and is_block:
+ # Handle this as a standalone raw block
+ if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
+ # Preserve blank line after tag in raw block.
+ data += '\n'
+ else:
+ # More content exists after tag.
+ self.intail = True
+ item = self.cleandoc[-1] if self.cleandoc else ''
+ # If we only have one newline before block element, add another
+ if not item.endswith('\n\n') and item.endswith('\n'):
+ self.cleandoc.append('\n')
+ self.cleandoc.append(self.md.htmlStash.store(data))
+ # Insert blank line between this and next line.
+ self.cleandoc.append('\n\n')
+ else:
+ self.cleandoc.append(data)
+
+ def handle_startendtag(self, tag, attrs):
+ self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))
+
+ def handle_charref(self, name):
+ self.handle_empty_tag('&#{};'.format(name), is_block=False)
+
+ def handle_entityref(self, name):
+ self.handle_empty_tag('&{};'.format(name), is_block=False)
+
+ def handle_comment(self, data):
+ self.handle_empty_tag('<!--{}-->'.format(data), is_block=True)
+
+ def handle_decl(self, data):
+ self.handle_empty_tag('<!{}>'.format(data), is_block=True)
+
+ def handle_pi(self, data):
+ self.handle_empty_tag('<?{}?>'.format(data), is_block=True)
+
+ def unknown_decl(self, data):
+ end = ']]>' if data.startswith('CDATA[') else ']>'
+ self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True)
+
+ def parse_pi(self, i):
+ if self.at_line_start() or self.intail:
+ return super().parse_pi(i)
+ # This is not the beginning of a raw block so treat as plain data
+ # and avoid consuming any tags which may follow (see #1066).
+ self.handle_data('<?')
+ return i + 2
+
+ def parse_html_declaration(self, i):
+ if self.at_line_start() or self.intail:
+ return super().parse_html_declaration(i)
+ # This is not the beginning of a raw block so treat as plain data
+ # and avoid consuming any tags which may follow (see #1066).
+ self.handle_data('<!')
+ return i + 2
+
+ # The rest has been copied from base class in standard lib to address #1036.
+ # As __starttag_text is private, all references to it must be in this subclass.
+ # The last few lines of parse_starttag are reversed so that handle_starttag
+ # can override cdata_mode in certain situations (in a code span).
+ __starttag_text = None
+
+ def get_starttag_text(self):
+ """Return full source of start tag: '<...>'."""
+ return self.__starttag_text
+
+ def parse_starttag(self, i): # pragma: no cover
+ self.__starttag_text = None
+ endpos = self.check_for_whole_start_tag(i)
+ if endpos < 0:
+ return endpos
+ rawdata = self.rawdata
+ self.__starttag_text = rawdata[i:endpos]
+
+ # Now parse the data between i+1 and j into a tag and attrs
+ attrs = []
+ match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
+ assert match, 'unexpected call to parse_starttag()'
+ k = match.end()
+ self.lasttag = tag = match.group(1).lower()
+ while k < endpos:
+ m = htmlparser.attrfind_tolerant.match(rawdata, k)
+ if not m:
+ break
+ attrname, rest, attrvalue = m.group(1, 2, 3)
+ if not rest:
+ attrvalue = None
+ elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
+ attrvalue[:1] == '"' == attrvalue[-1:]: # noqa: E127
+ attrvalue = attrvalue[1:-1]
+ if attrvalue:
+ attrvalue = htmlparser.unescape(attrvalue)
+ attrs.append((attrname.lower(), attrvalue))
+ k = m.end()
+
+ end = rawdata[k:endpos].strip()
+ if end not in (">", "/>"):
+ lineno, offset = self.getpos()
+ if "\n" in self.__starttag_text:
+ lineno = lineno + self.__starttag_text.count("\n")
+ offset = len(self.__starttag_text) \
+ - self.__starttag_text.rfind("\n") # noqa: E127
+ else:
+ offset = offset + len(self.__starttag_text)
+ self.handle_data(rawdata[i:endpos])
+ return endpos
+ if end.endswith('/>'):
+ # XHTML-style empty tag: <span attr="value" />
+ self.handle_startendtag(tag, attrs)
+ else:
+ # *** set cdata_mode first so we can override it in handle_starttag (see #1036) ***
+ if tag in self.CDATA_CONTENT_ELEMENTS:
+ self.set_cdata_mode(tag)
+ self.handle_starttag(tag, attrs)
+ return endpos
diff --git a/markdown/inlinepatterns.py b/markdown/inlinepatterns.py
index 95d358d..b0621a8 100644
--- a/markdown/inlinepatterns.py
+++ b/markdown/inlinepatterns.py
@@ -1,4 +1,23 @@
"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+
INLINE PATTERNS
=============================================================================
@@ -41,120 +60,113 @@
* finally we apply strong and emphasis
"""
-from __future__ import absolute_import
-from __future__ import unicode_literals
from . import util
-from . import odict
+from collections import namedtuple
import re
-try: # pragma: no cover
- from urllib.parse import urlparse, urlunparse
-except ImportError: # pragma: no cover
- from urlparse import urlparse, urlunparse
+import xml.etree.ElementTree as etree
try: # pragma: no cover
from html import entities
except ImportError: # pragma: no cover
import htmlentitydefs as entities
-def build_inlinepatterns(md_instance, **kwargs):
+def build_inlinepatterns(md, **kwargs):
""" Build the default set of inline patterns for Markdown. """
- inlinePatterns = odict.OrderedDict()
- inlinePatterns["backtick"] = BacktickPattern(BACKTICK_RE)
- inlinePatterns["escape"] = EscapePattern(ESCAPE_RE, md_instance)
- inlinePatterns["reference"] = ReferencePattern(REFERENCE_RE, md_instance)
- inlinePatterns["link"] = LinkPattern(LINK_RE, md_instance)
- inlinePatterns["image_link"] = ImagePattern(IMAGE_LINK_RE, md_instance)
- inlinePatterns["image_reference"] = ImageReferencePattern(
- IMAGE_REFERENCE_RE, md_instance
+ inlinePatterns = util.Registry()
+ inlinePatterns.register(BacktickInlineProcessor(BACKTICK_RE), 'backtick', 190)
+ inlinePatterns.register(EscapeInlineProcessor(ESCAPE_RE, md), 'escape', 180)
+ inlinePatterns.register(ReferenceInlineProcessor(REFERENCE_RE, md), 'reference', 170)
+ inlinePatterns.register(LinkInlineProcessor(LINK_RE, md), 'link', 160)
+ inlinePatterns.register(ImageInlineProcessor(IMAGE_LINK_RE, md), 'image_link', 150)
+ inlinePatterns.register(
+ ImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'image_reference', 140
)
- inlinePatterns["short_reference"] = ReferencePattern(
- SHORT_REF_RE, md_instance
+ inlinePatterns.register(
+ ShortReferenceInlineProcessor(REFERENCE_RE, md), 'short_reference', 130
)
- inlinePatterns["autolink"] = AutolinkPattern(AUTOLINK_RE, md_instance)
- inlinePatterns["automail"] = AutomailPattern(AUTOMAIL_RE, md_instance)
- inlinePatterns["linebreak"] = SubstituteTagPattern(LINE_BREAK_RE, 'br')
- if md_instance.safeMode != 'escape':
- inlinePatterns["html"] = HtmlPattern(HTML_RE, md_instance)
- inlinePatterns["entity"] = HtmlPattern(ENTITY_RE, md_instance)
- inlinePatterns["not_strong"] = SimpleTextPattern(NOT_STRONG_RE)
- inlinePatterns["em_strong"] = DoubleTagPattern(EM_STRONG_RE, 'strong,em')
- inlinePatterns["strong_em"] = DoubleTagPattern(STRONG_EM_RE, 'em,strong')
- inlinePatterns["strong"] = SimpleTagPattern(STRONG_RE, 'strong')
- inlinePatterns["emphasis"] = SimpleTagPattern(EMPHASIS_RE, 'em')
- if md_instance.smart_emphasis:
- inlinePatterns["emphasis2"] = SimpleTagPattern(SMART_EMPHASIS_RE, 'em')
- else:
- inlinePatterns["emphasis2"] = SimpleTagPattern(EMPHASIS_2_RE, 'em')
+ inlinePatterns.register(
+ ShortImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'short_image_ref', 125
+ )
+ inlinePatterns.register(AutolinkInlineProcessor(AUTOLINK_RE, md), 'autolink', 120)
+ inlinePatterns.register(AutomailInlineProcessor(AUTOMAIL_RE, md), 'automail', 110)
+ inlinePatterns.register(SubstituteTagInlineProcessor(LINE_BREAK_RE, 'br'), 'linebreak', 100)
+ inlinePatterns.register(HtmlInlineProcessor(HTML_RE, md), 'html', 90)
+ inlinePatterns.register(HtmlInlineProcessor(ENTITY_RE, md), 'entity', 80)
+ inlinePatterns.register(SimpleTextInlineProcessor(NOT_STRONG_RE), 'not_strong', 70)
+ inlinePatterns.register(AsteriskProcessor(r'\*'), 'em_strong', 60)
+ inlinePatterns.register(UnderscoreProcessor(r'_'), 'em_strong2', 50)
return inlinePatterns
+
"""
The actual regular expressions for patterns
-----------------------------------------------------------------------------
"""
-NOBRACKET = r'[^\]\[]*'
-BRK = (
- r'\[(' +
- (NOBRACKET + r'(\[')*6 +
- (NOBRACKET + r'\])*')*6 +
- NOBRACKET + r')\]'
-)
NOIMG = r'(?<!\!)'
# `e=f()` or ``e=f("`")``
-BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)'
+BACKTICK_RE = r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\2(?!`))'
# \<
ESCAPE_RE = r'\\(.)'
# *emphasis*
-EMPHASIS_RE = r'(\*)([^\*]+)\2'
+EMPHASIS_RE = r'(\*)([^\*]+)\1'
# **strong**
-STRONG_RE = r'(\*{2}|_{2})(.+?)\2'
+STRONG_RE = r'(\*{2})(.+?)\1'
-# ***strongem*** or ***em*strong**
-EM_STRONG_RE = r'(\*|_)\2{2}(.+?)\2(.*?)\2{2}'
-
-# ***strong**em*
-STRONG_EM_RE = r'(\*|_)\2{2}(.+?)\2{2}(.*?)\2'
+# __smart__strong__
+SMART_STRONG_RE = r'(?<!\w)(_{2})(?!_)(.+?)(?<!_)\1(?!\w)'
# _smart_emphasis_
-SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\2(?!\w)'
+SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\1(?!\w)'
-# _emphasis_
-EMPHASIS_2_RE = r'(_)(.+?)\2'
+# __strong _em__
+SMART_STRONG_EM_RE = r'(?<!\w)(\_)\1(?!\1)(.+?)(?<!\w)\1(?!\1)(.+?)\1{3}(?!\w)'
+
+# ***strongem*** or ***em*strong**
+EM_STRONG_RE = r'(\*)\1{2}(.+?)\1(.*?)\1{2}'
+
+# ___strongem___ or ___em_strong__
+EM_STRONG2_RE = r'(_)\1{2}(.+?)\1(.*?)\1{2}'
+
+# ***strong**em*
+STRONG_EM_RE = r'(\*)\1{2}(.+?)\1{2}(.*?)\1'
+
+# ___strong__em_
+STRONG_EM2_RE = r'(_)\1{2}(.+?)\1{2}(.*?)\1'
+
+# **strong*em***
+STRONG_EM3_RE = r'(\*)\1(?!\1)([^*]+?)\1(?!\1)(.+?)\1{3}'
# [text](url) or [text](<url>) or [text](url "title")
-LINK_RE = NOIMG + BRK + \
- r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)'''
+LINK_RE = NOIMG + r'\['
# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
-IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^")]+"[^"]*"|[^\)]*))\)'
+IMAGE_LINK_RE = r'\!\['
# [Google][3]
-REFERENCE_RE = NOIMG + BRK + r'\s?\[([^\]]*)\]'
-
-# [Google]
-SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]'
+REFERENCE_RE = LINK_RE
# ![alt text][2]
-IMAGE_REFERENCE_RE = r'\!' + BRK + '\s?\[([^\]]*)\]'
+IMAGE_REFERENCE_RE = IMAGE_LINK_RE
# stand-alone * or _
-NOT_STRONG_RE = r'((^| )(\*|_)( |$))'
+NOT_STRONG_RE = r'((^|\s)(\*|_)(\s|$))'
# <http://www.123.com>
-AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>'
+AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^<>]*)>'
# <me@example.com>
-AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>'
+AUTOMAIL_RE = r'<([^<> !]*@[^@<> ]*)>'
# <...>
-HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)'
+HTML_RE = r'(<([a-zA-Z/][^<>]*|!--(?:(?!<!--|-->).)*--)>)'
-# &
-ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'
+# "&#38;" (decimal) or "&#x26;" (hex) or "&amp;" (named)
+ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)'
# two spaces at end of line
LINE_BREAK_RE = r' \n'
@@ -169,14 +181,8 @@
return string
-ATTR_RE = re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123}
-
-
-def handleAttributes(text, parent):
- """Set values of an element based on attribute definitions ({@id=123})."""
- def attributeCallback(match):
- parent.set(match.group(1), match.group(2).replace('\n', ' '))
- return ATTR_RE.sub(attributeCallback, text)
+class EmStrongItem(namedtuple('EmStrongItem', ['pattern', 'builder', 'tags'])):
+ """Emphasis/strong pattern item."""
"""
@@ -185,10 +191,12 @@
"""
-class Pattern(object):
+class Pattern: # pragma: no cover
"""Base class that inline patterns subclass. """
- def __init__(self, pattern, markdown_instance=None):
+ ANCESTOR_EXCLUDES = tuple()
+
+ def __init__(self, pattern, md=None):
"""
Create an instant of an inline pattern.
@@ -198,13 +206,16 @@
"""
self.pattern = pattern
- self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern,
+ self.compiled_re = re.compile(r"^(.*?)%s(.*)$" % pattern,
re.DOTALL | re.UNICODE)
- # Api for Markdown to pass safe_mode into instance
- self.safe_mode = False
- if markdown_instance:
- self.markdown = markdown_instance
+ self.md = md
+
+ @property
+ @util.deprecated("Use 'md' instead.")
+ def markdown(self):
+ # TODO: remove this later
+ return self.md
def getCompiledRegExp(self):
""" Return a compiled regular expression. """
@@ -229,53 +240,94 @@
def unescape(self, text):
""" Return unescaped text given text with an inline placeholder. """
try:
- stash = self.markdown.treeprocessors['inline'].stashed_nodes
+ stash = self.md.treeprocessors['inline'].stashed_nodes
except KeyError: # pragma: no cover
return text
- def itertext(el): # pragma: no cover
- ' Reimplement Element.itertext for older python versions '
- tag = el.tag
- if not isinstance(tag, util.string_type) and tag is not None:
- return
- if el.text:
- yield el.text
- for e in el:
- for s in itertext(e):
- yield s
- if e.tail:
- yield e.tail
-
def get_stash(m):
id = m.group(1)
if id in stash:
value = stash.get(id)
- if isinstance(value, util.string_type):
+ if isinstance(value, str):
return value
else:
# An etree Element - return text content only
- return ''.join(itertext(value))
+ return ''.join(value.itertext())
return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
-class SimpleTextPattern(Pattern):
+class InlineProcessor(Pattern):
+ """
+ Base class that inline patterns subclass.
+
+ This is the newer style inline processor that uses a more
+ efficient and flexible search approach.
+ """
+
+ def __init__(self, pattern, md=None):
+ """
+ Create an instance of an inline pattern.
+
+ Keyword arguments:
+
+ * pattern: A regular expression that matches a pattern
+
+ """
+ self.pattern = pattern
+ self.compiled_re = re.compile(pattern, re.DOTALL | re.UNICODE)
+
+ # Api for Markdown to pass safe_mode into instance
+ self.safe_mode = False
+ self.md = md
+
+ def handleMatch(self, m, data):
+ """Return an ElementTree element from the given match and the
+ start and end index of the matched text.
+
+ If `start` and/or `end` are returned as `None`, it will be
+ assumed that the processor did not find a valid region of text.
+
+ Subclasses should override this method.
+
+ Keyword arguments:
+
+ * m: A re match object containing a match of the pattern.
+ * data: The buffer currently under analysis
+
+ Returns:
+
+ * el: The ElementTree element, text or None.
+ * start: The start of the region that has been matched or None.
+ * end: The end of the region that has been matched or None.
+
+ """
+ pass # pragma: no cover
+
+
+class SimpleTextPattern(Pattern): # pragma: no cover
""" Return a simple text of group(2) of a Pattern. """
def handleMatch(self, m):
return m.group(2)
-class EscapePattern(Pattern):
+class SimpleTextInlineProcessor(InlineProcessor):
+ """ Return a simple text of group(1) of a Pattern. """
+ def handleMatch(self, m, data):
+ return m.group(1), m.start(0), m.end(0)
+
+
+class EscapeInlineProcessor(InlineProcessor):
""" Return an escaped character. """
- def handleMatch(self, m):
- char = m.group(2)
- if char in self.markdown.ESCAPED_CHARS:
- return '%s%s%s' % (util.STX, ord(char), util.ETX)
+ def handleMatch(self, m, data):
+ char = m.group(1)
+ if char in self.md.ESCAPED_CHARS:
+ return '{}{}{}'.format(util.STX, ord(char), util.ETX), m.start(0), m.end(0)
else:
- return None
+ return None, m.start(0), m.end(0)
-class SimpleTagPattern(Pattern):
+class SimpleTagPattern(Pattern): # pragma: no cover
"""
Return element of type `tag` with a text attribute of group(3)
of a Pattern.
@@ -286,30 +338,56 @@
self.tag = tag
def handleMatch(self, m):
- el = util.etree.Element(self.tag)
+ el = etree.Element(self.tag)
el.text = m.group(3)
return el
-class SubstituteTagPattern(SimpleTagPattern):
+class SimpleTagInlineProcessor(InlineProcessor):
+ """
+ Return element of type `tag` with a text attribute of group(2)
+ of a Pattern.
+
+ """
+ def __init__(self, pattern, tag):
+ InlineProcessor.__init__(self, pattern)
+ self.tag = tag
+
+ def handleMatch(self, m, data): # pragma: no cover
+ el = etree.Element(self.tag)
+ el.text = m.group(2)
+ return el, m.start(0), m.end(0)
+
+
+class SubstituteTagPattern(SimpleTagPattern): # pragma: no cover
""" Return an element of type `tag` with no children. """
def handleMatch(self, m):
- return util.etree.Element(self.tag)
+ return etree.Element(self.tag)
-class BacktickPattern(Pattern):
+class SubstituteTagInlineProcessor(SimpleTagInlineProcessor):
+ """ Return an element of type `tag` with no children. """
+ def handleMatch(self, m, data):
+ return etree.Element(self.tag), m.start(0), m.end(0)
+
+
+class BacktickInlineProcessor(InlineProcessor):
""" Return a `<code>` element containing the matching text. """
def __init__(self, pattern):
- Pattern.__init__(self, pattern)
- self.tag = "code"
+ InlineProcessor.__init__(self, pattern)
+ self.ESCAPED_BSLASH = '{}{}{}'.format(util.STX, ord('\\'), util.ETX)
+ self.tag = 'code'
- def handleMatch(self, m):
- el = util.etree.Element(self.tag)
- el.text = util.AtomicString(m.group(3).strip())
- return el
+ def handleMatch(self, m, data):
+ if m.group(3):
+ el = etree.Element(self.tag)
+ el.text = util.AtomicString(util.code_escape(m.group(3).strip()))
+ return el, m.start(0), m.end(0)
+ else:
+ return m.group(1).replace('\\\\', self.ESCAPED_BSLASH), m.start(0), m.end(0)
-class DoubleTagPattern(SimpleTagPattern):
+class DoubleTagPattern(SimpleTagPattern): # pragma: no cover
"""Return a ElementTree element nested in tag2 nested in tag1.
Useful for strong emphasis etc.
@@ -317,25 +395,41 @@
"""
def handleMatch(self, m):
tag1, tag2 = self.tag.split(",")
- el1 = util.etree.Element(tag1)
- el2 = util.etree.SubElement(el1, tag2)
+ el1 = etree.Element(tag1)
+ el2 = etree.SubElement(el1, tag2)
el2.text = m.group(3)
if len(m.groups()) == 5:
el2.tail = m.group(4)
return el1
-class HtmlPattern(Pattern):
+class DoubleTagInlineProcessor(SimpleTagInlineProcessor):
+ """Return an ElementTree element nested in tag2 nested in tag1.
+
+ Useful for strong emphasis etc.
+
+ """
+ def handleMatch(self, m, data): # pragma: no cover
+ tag1, tag2 = self.tag.split(",")
+ el1 = etree.Element(tag1)
+ el2 = etree.SubElement(el1, tag2)
+ el2.text = m.group(2)
+ if len(m.groups()) == 3:
+ el2.tail = m.group(3)
+ return el1, m.start(0), m.end(0)
+
+
+class HtmlInlineProcessor(InlineProcessor):
""" Store raw inline html and return a placeholder. """
- def handleMatch(self, m):
- rawhtml = self.unescape(m.group(2))
- place_holder = self.markdown.htmlStash.store(rawhtml)
- return place_holder
+ def handleMatch(self, m, data):
+ rawhtml = self.unescape(m.group(1))
+ place_holder = self.md.htmlStash.store(rawhtml)
+ return place_holder, m.start(0), m.end(0)
def unescape(self, text):
""" Return unescaped text given text with an inline placeholder. """
try:
- stash = self.markdown.treeprocessors['inline'].stashed_nodes
+ stash = self.md.treeprocessors['inline'].stashed_nodes
except KeyError: # pragma: no cover
return text
@@ -344,132 +438,389 @@
value = stash.get(id)
if value is not None:
try:
- return self.markdown.serializer(value)
- except:
- return '\%s' % value
+ return self.md.serializer(value)
+ except Exception:
+ return r'\%s' % value
return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
-class LinkPattern(Pattern):
+class AsteriskProcessor(InlineProcessor):
+ """Emphasis processor for handling strong and em matches inside asterisks."""
+
+ PATTERNS = [
+ EmStrongItem(re.compile(EM_STRONG_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'),
+ EmStrongItem(re.compile(STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'),
+ EmStrongItem(re.compile(STRONG_EM3_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'),
+ EmStrongItem(re.compile(STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'),
+ EmStrongItem(re.compile(EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em')
+ ]
+
+ def build_single(self, m, tag, idx):
+ """Return single tag."""
+ el1 = etree.Element(tag)
+ text = m.group(2)
+ self.parse_sub_patterns(text, el1, None, idx)
+ return el1
+
+ def build_double(self, m, tags, idx):
+ """Return double tag."""
+
+ tag1, tag2 = tags.split(",")
+ el1 = etree.Element(tag1)
+ el2 = etree.Element(tag2)
+ text = m.group(2)
+ self.parse_sub_patterns(text, el2, None, idx)
+ el1.append(el2)
+ if len(m.groups()) == 3:
+ text = m.group(3)
+ self.parse_sub_patterns(text, el1, el2, idx)
+ return el1
+
+ def build_double2(self, m, tags, idx):
+ """Return double tags (variant 2): `<strong>text <em>text</em></strong>`."""
+
+ tag1, tag2 = tags.split(",")
+ el1 = etree.Element(tag1)
+ el2 = etree.Element(tag2)
+ text = m.group(2)
+ self.parse_sub_patterns(text, el1, None, idx)
+ text = m.group(3)
+ el1.append(el2)
+ self.parse_sub_patterns(text, el2, None, idx)
+ return el1
+
+ def parse_sub_patterns(self, data, parent, last, idx):
+ """
+ Parses sub patterns.
+
+ `data` (`str`):
+ text to evaluate.
+
+ `parent` (`etree.Element`):
+ Parent to attach text and sub elements to.
+
+ `last` (`etree.Element`):
+ Last appended child to parent. Can also be None if parent has no children.
+
+ `idx` (`int`):
+ Current pattern index that was used to evaluate the parent.
+
+ """
+
+ offset = 0
+ pos = 0
+
+ length = len(data)
+ while pos < length:
+ # Find the start of potential emphasis or strong tokens
+ if self.compiled_re.match(data, pos):
+ matched = False
+ # See if we can match an emphasis/strong pattern
+ for index, item in enumerate(self.PATTERNS):
+ # Only evaluate patterns that are after what was used on the parent
+ if index <= idx:
+ continue
+ m = item.pattern.match(data, pos)
+ if m:
+ # Append child nodes to parent
+ # Text nodes should be appended to the last
+ # child if present, and if not, it should
+ # be added as the parent's text node.
+ text = data[offset:m.start(0)]
+ if text:
+ if last is not None:
+ last.tail = text
+ else:
+ parent.text = text
+ el = self.build_element(m, item.builder, item.tags, index)
+ parent.append(el)
+ last = el
+ # Move our position past the matched hunk
+ offset = pos = m.end(0)
+ matched = True
+ if not matched:
+ # We matched nothing, move on to the next character
+ pos += 1
+ else:
+ # Increment position as no potential emphasis start was found.
+ pos += 1
+
+ # Append any leftover text as a text node.
+ text = data[offset:]
+ if text:
+ if last is not None:
+ last.tail = text
+ else:
+ parent.text = text
+
+ def build_element(self, m, builder, tags, index):
+ """Element builder."""
+
+ if builder == 'double2':
+ return self.build_double2(m, tags, index)
+ elif builder == 'double':
+ return self.build_double(m, tags, index)
+ else:
+ return self.build_single(m, tags, index)
+
+ def handleMatch(self, m, data):
+ """Parse patterns."""
+
+ el = None
+ start = None
+ end = None
+
+ for index, item in enumerate(self.PATTERNS):
+ m1 = item.pattern.match(data, m.start(0))
+ if m1:
+ start = m1.start(0)
+ end = m1.end(0)
+ el = self.build_element(m1, item.builder, item.tags, index)
+ break
+ return el, start, end
+
+
+class UnderscoreProcessor(AsteriskProcessor):
+ """Emphasis processor for handling strong and em matches inside underscores."""
+
+ PATTERNS = [
+ EmStrongItem(re.compile(EM_STRONG2_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'),
+ EmStrongItem(re.compile(STRONG_EM2_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'),
+ EmStrongItem(re.compile(SMART_STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'),
+ EmStrongItem(re.compile(SMART_STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'),
+ EmStrongItem(re.compile(SMART_EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em')
+ ]
+
+
+class LinkInlineProcessor(InlineProcessor):
""" Return a link element from the given match. """
- def handleMatch(self, m):
- el = util.etree.Element("a")
- el.text = m.group(2)
- title = m.group(13)
- href = m.group(9)
+ RE_LINK = re.compile(r'''\(\s*(?:(<[^<>]*>)\s*(?:('[^']*'|"[^"]*")\s*)?\))?''', re.DOTALL | re.UNICODE)
+ RE_TITLE_CLEAN = re.compile(r'\s')
- if href:
- if href[0] == "<":
- href = href[1:-1]
- el.set("href", self.sanitize_url(self.unescape(href.strip())))
- else:
- el.set("href", "")
+ def handleMatch(self, m, data):
+ text, index, handled = self.getText(data, m.end(0))
- if title:
- title = dequote(self.unescape(title))
+ if not handled:
+ return None, None, None
+
+ href, title, index, handled = self.getLink(data, index)
+ if not handled:
+ return None, None, None
+
+ el = etree.Element("a")
+ el.text = text
+
+ el.set("href", href)
+
+ if title is not None:
el.set("title", title)
- return el
- def sanitize_url(self, url):
- """
- Sanitize a url against xss attacks in "safe_mode".
+ return el, m.start(0), index
- Rather than specifically blacklisting `javascript:alert("XSS")` and all
- its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known
- safe url formats. Most urls contain a network location, however some
- are known not to (i.e.: mailto links). Script urls do not contain a
- location. Additionally, for `javascript:...`, the scheme would be
- "javascript" but some aliases will appear to `urlparse()` to have no
- scheme. On top of that relative links (i.e.: "foo/bar.html") have no
- scheme. Therefore we must check "path", "parameters", "query" and
- "fragment" for any literal colons. We don't check "scheme" for colons
- because it *should* never have any and "netloc" must allow the form:
- `username:password@host:port`.
+ def getLink(self, data, index):
+ """Parse data between `()` of `[Text]()` allowing recursive `()`. """
+
+ href = ''
+ title = None
+ handled = False
+
+ m = self.RE_LINK.match(data, pos=index)
+ if m and m.group(1):
+ # Matches [Text](<link> "title")
+ href = m.group(1)[1:-1].strip()
+ if m.group(2):
+ title = m.group(2)[1:-1]
+ index = m.end(0)
+ handled = True
+ elif m:
+ # Track bracket nesting and index in string
+ bracket_count = 1
+ backtrack_count = 1
+ start_index = m.end()
+ index = start_index
+ last_bracket = -1
+
+ # Primary (first found) quote tracking.
+ quote = None
+ start_quote = -1
+ exit_quote = -1
+ ignore_matches = False
+
+ # Secondary (second found) quote tracking.
+ alt_quote = None
+ start_alt_quote = -1
+ exit_alt_quote = -1
+
+ # Track last character
+ last = ''
+
+ for pos in range(index, len(data)):
+ c = data[pos]
+ if c == '(':
+ # Count nested (
+ # Don't increment the bracket count if we are sure we're in a title.
+ if not ignore_matches:
+ bracket_count += 1
+ elif backtrack_count > 0:
+ backtrack_count -= 1
+ elif c == ')':
+ # Match nested ) to (
+ # Don't decrement if we are sure we are in a title that is unclosed.
+ if ((exit_quote != -1 and quote == last) or (exit_alt_quote != -1 and alt_quote == last)):
+ bracket_count = 0
+ elif not ignore_matches:
+ bracket_count -= 1
+ elif backtrack_count > 0:
+ backtrack_count -= 1
+ # We've found our backup end location if the title doesn't resolve.
+ if backtrack_count == 0:
+ last_bracket = index + 1
+
+ elif c in ("'", '"'):
+ # Quote has started
+ if not quote:
+ # We'll assume we are now in a title.
+ # Brackets are quoted, so no need to match them (except for the final one).
+ ignore_matches = True
+ backtrack_count = bracket_count
+ bracket_count = 1
+ start_quote = index + 1
+ quote = c
+ # Secondary quote (in case the first doesn't resolve): [text](link'"title")
+ elif c != quote and not alt_quote:
+ start_alt_quote = index + 1
+ alt_quote = c
+ # Update primary quote match
+ elif c == quote:
+ exit_quote = index + 1
+ # Update secondary quote match
+ elif alt_quote and c == alt_quote:
+ exit_alt_quote = index + 1
+
+ index += 1
+
+ # Link is closed, so let's break out of the loop
+ if bracket_count == 0:
+ # Get the title if we closed a title string right before link closed
+ if exit_quote >= 0 and quote == last:
+ href = data[start_index:start_quote - 1]
+ title = ''.join(data[start_quote:exit_quote - 1])
+ elif exit_alt_quote >= 0 and alt_quote == last:
+ href = data[start_index:start_alt_quote - 1]
+ title = ''.join(data[start_alt_quote:exit_alt_quote - 1])
+ else:
+ href = data[start_index:index - 1]
+ break
+
+ if c != ' ':
+ last = c
+
+ # We have a scenario: [test](link"notitle)
+ # When we enter a string, we stop tracking bracket resolution in the main counter,
+ # but we do keep a backup counter up until we discover where we might resolve all brackets
+ # if the title string fails to resolve.
+ if bracket_count != 0 and backtrack_count == 0:
+ href = data[start_index:last_bracket - 1]
+ index = last_bracket
+ bracket_count = 0
+
+ handled = bracket_count == 0
+
+ if title is not None:
+ title = self.RE_TITLE_CLEAN.sub(' ', dequote(self.unescape(title.strip())))
+
+ href = self.unescape(href).strip()
+
+ return href, title, index, handled
+
+ def getText(self, data, index):
+ """Parse the content between `[]` of the start of an image or link
+ resolving nested square brackets.
"""
- if not self.markdown.safeMode:
- # Return immediately bipassing parsing.
- return url
-
- try:
- scheme, netloc, path, params, query, fragment = url = urlparse(url)
- except ValueError: # pragma: no cover
- # Bad url - so bad it couldn't be parsed.
- return ''
-
- locless_schemes = ['', 'mailto', 'news']
- allowed_schemes = locless_schemes + ['http', 'https', 'ftp', 'ftps']
- if scheme not in allowed_schemes:
- # Not a known (allowed) scheme. Not safe.
- return ''
-
- if netloc == '' and scheme not in locless_schemes: # pragma: no cover
- # This should not happen. Treat as suspect.
- return ''
-
- for part in url[2:]:
- if ":" in part:
- # A colon in "path", "parameters", "query"
- # or "fragment" is suspect.
- return ''
-
- # Url passes all tests. Return url as-is.
- return urlunparse(url)
+ bracket_count = 1
+ text = []
+ for pos in range(index, len(data)):
+ c = data[pos]
+ if c == ']':
+ bracket_count -= 1
+ elif c == '[':
+ bracket_count += 1
+ index += 1
+ if bracket_count == 0:
+ break
+ text.append(c)
+ return ''.join(text), index, bracket_count == 0
-class ImagePattern(LinkPattern):
+class ImageInlineProcessor(LinkInlineProcessor):
""" Return a img element from the given match. """
- def handleMatch(self, m):
- el = util.etree.Element("img")
- src_parts = m.group(9).split()
- if src_parts:
- src = src_parts[0]
- if src[0] == "<" and src[-1] == ">":
- src = src[1:-1]
- el.set('src', self.sanitize_url(self.unescape(src)))
- else:
- el.set('src', "")
- if len(src_parts) > 1:
- el.set('title', dequote(self.unescape(" ".join(src_parts[1:]))))
- if self.markdown.enable_attributes:
- truealt = handleAttributes(m.group(2), el)
- else:
- truealt = m.group(2)
+ def handleMatch(self, m, data):
+ text, index, handled = self.getText(data, m.end(0))
+ if not handled:
+ return None, None, None
- el.set('alt', self.unescape(truealt))
- return el
+ src, title, index, handled = self.getLink(data, index)
+ if not handled:
+ return None, None, None
+
+ el = etree.Element("img")
+
+ el.set("src", src)
+
+ if title is not None:
+ el.set("title", title)
+
+ el.set('alt', self.unescape(text))
+ return el, m.start(0), index
-class ReferencePattern(LinkPattern):
+class ReferenceInlineProcessor(LinkInlineProcessor):
""" Match to a stored reference and return link element. """
+ NEWLINE_CLEANUP_RE = re.compile(r'\s+', re.MULTILINE)
- NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)
+ RE_LINK = re.compile(r'\s?\[([^\]]*)\]', re.DOTALL | re.UNICODE)
- def handleMatch(self, m):
- try:
- id = m.group(9).lower()
- except IndexError:
- id = None
- if not id:
- # if we got something like "[Google][]" or "[Goggle]"
- # we'll use "google" as the id
- id = m.group(2).lower()
+ def handleMatch(self, m, data):
+ text, index, handled = self.getText(data, m.end(0))
+ if not handled:
+ return None, None, None
+
+ id, end, handled = self.evalId(data, index, text)
+ if not handled:
+ return None, None, None
# Clean up linebreaks in id
id = self.NEWLINE_CLEANUP_RE.sub(' ', id)
- if id not in self.markdown.references: # ignore undefined refs
- return None
- href, title = self.markdown.references[id]
+ if id not in self.md.references: # ignore undefined refs
+ return None, m.start(0), end
- text = m.group(2)
- return self.makeTag(href, title, text)
+ href, title = self.md.references[id]
+
+ return self.makeTag(href, title, text), m.start(0), end
+
+ def evalId(self, data, index, text):
+ """
+ Evaluate the id portion of [ref][id].
+
+ If [ref][] use [ref].
+ """
+ m = self.RE_LINK.match(data, pos=index)
+ if not m:
+ return None, index, False
+ else:
+ id = m.group(1).lower()
+ end = m.end(0)
+ if not id:
+ id = text.lower()
+ return id, end, True
def makeTag(self, href, title, text):
- el = util.etree.Element('a')
+ el = etree.Element('a')
- el.set('href', self.sanitize_url(href))
+ el.set('href', href)
if title:
el.set('title', title)
@@ -477,37 +828,49 @@
return el
-class ImageReferencePattern(ReferencePattern):
+class ShortReferenceInlineProcessor(ReferenceInlineProcessor):
+ """Short form of reference: [google]. """
+ def evalId(self, data, index, text):
+ """Evaluate the id of [ref] """
+
+ return text.lower(), index, True
+
+
+class ImageReferenceInlineProcessor(ReferenceInlineProcessor):
""" Match to a stored reference and return img element. """
def makeTag(self, href, title, text):
- el = util.etree.Element("img")
- el.set("src", self.sanitize_url(href))
+ el = etree.Element("img")
+ el.set("src", href)
if title:
el.set("title", title)
-
- if self.markdown.enable_attributes:
- text = handleAttributes(text, el)
-
el.set("alt", self.unescape(text))
return el
-class AutolinkPattern(Pattern):
+class ShortImageReferenceInlineProcessor(ImageReferenceInlineProcessor):
+ """ Short form of image reference: ![ref]. """
+ def evalId(self, data, index, text):
+ """Evaluate the id of [ref] """
+
+ return text.lower(), index, True
+
+
+class AutolinkInlineProcessor(InlineProcessor):
""" Return a link Element given an autolink (`<http://example/com>`). """
- def handleMatch(self, m):
- el = util.etree.Element("a")
- el.set('href', self.unescape(m.group(2)))
- el.text = util.AtomicString(m.group(2))
- return el
+ def handleMatch(self, m, data):
+ el = etree.Element("a")
+ el.set('href', self.unescape(m.group(1)))
+ el.text = util.AtomicString(m.group(1))
+ return el, m.start(0), m.end(0)
-class AutomailPattern(Pattern):
+class AutomailInlineProcessor(InlineProcessor):
"""
Return a mailto link Element given an automail link (`<foo@example.com>`).
"""
- def handleMatch(self, m):
- el = util.etree.Element('a')
- email = self.unescape(m.group(2))
+ def handleMatch(self, m, data):
+ el = etree.Element('a')
+ email = self.unescape(m.group(1))
if email.startswith("mailto:"):
email = email[len("mailto:"):]
@@ -515,7 +878,7 @@
"""Return entity definition by code, or the code if not defined."""
entity = entities.codepoint2name.get(code)
if entity:
- return "%s%s;" % (util.AMP_SUBSTITUTE, entity)
+ return "{}{};".format(util.AMP_SUBSTITUTE, entity)
else:
return "%s#%d;" % (util.AMP_SUBSTITUTE, code)
@@ -526,4 +889,4 @@
mailto = "".join([util.AMP_SUBSTITUTE + '#%d;' %
ord(letter) for letter in mailto])
el.set('href', mailto)
- return el
+ return el, m.start(0), m.end(0)
diff --git a/markdown/odict.py b/markdown/odict.py
deleted file mode 100644
index 584ad7c..0000000
--- a/markdown/odict.py
+++ /dev/null
@@ -1,191 +0,0 @@
-from __future__ import unicode_literals
-from __future__ import absolute_import
-from . import util
-from copy import deepcopy
-
-
-class OrderedDict(dict):
- """
- A dictionary that keeps its keys in the order in which they're inserted.
-
- Copied from Django's SortedDict with some modifications.
-
- """
- def __new__(cls, *args, **kwargs):
- instance = super(OrderedDict, cls).__new__(cls, *args, **kwargs)
- instance.keyOrder = []
- return instance
-
- def __init__(self, data=None):
- if data is None or isinstance(data, dict):
- data = data or []
- super(OrderedDict, self).__init__(data)
- self.keyOrder = list(data) if data else []
- else:
- super(OrderedDict, self).__init__()
- super_set = super(OrderedDict, self).__setitem__
- for key, value in data:
- # Take the ordering from first key
- if key not in self:
- self.keyOrder.append(key)
- # But override with last value in data (dict() does this)
- super_set(key, value)
-
- def __deepcopy__(self, memo):
- return self.__class__([(key, deepcopy(value, memo))
- for key, value in self.items()])
-
- def __copy__(self):
- # The Python's default copy implementation will alter the state
- # of self. The reason for this seems complex but is likely related to
- # subclassing dict.
- return self.copy()
-
- def __setitem__(self, key, value):
- if key not in self:
- self.keyOrder.append(key)
- super(OrderedDict, self).__setitem__(key, value)
-
- def __delitem__(self, key):
- super(OrderedDict, self).__delitem__(key)
- self.keyOrder.remove(key)
-
- def __iter__(self):
- return iter(self.keyOrder)
-
- def __reversed__(self):
- return reversed(self.keyOrder)
-
- def pop(self, k, *args):
- result = super(OrderedDict, self).pop(k, *args)
- try:
- self.keyOrder.remove(k)
- except ValueError:
- # Key wasn't in the dictionary in the first place. No problem.
- pass
- return result
-
- def popitem(self):
- result = super(OrderedDict, self).popitem()
- self.keyOrder.remove(result[0])
- return result
-
- def _iteritems(self):
- for key in self.keyOrder:
- yield key, self[key]
-
- def _iterkeys(self):
- for key in self.keyOrder:
- yield key
-
- def _itervalues(self):
- for key in self.keyOrder:
- yield self[key]
-
- if util.PY3: # pragma: no cover
- items = _iteritems
- keys = _iterkeys
- values = _itervalues
- else: # pragma: no cover
- iteritems = _iteritems
- iterkeys = _iterkeys
- itervalues = _itervalues
-
- def items(self):
- return [(k, self[k]) for k in self.keyOrder]
-
- def keys(self):
- return self.keyOrder[:]
-
- def values(self):
- return [self[k] for k in self.keyOrder]
-
- def update(self, dict_):
- for k in dict_:
- self[k] = dict_[k]
-
- def setdefault(self, key, default):
- if key not in self:
- self.keyOrder.append(key)
- return super(OrderedDict, self).setdefault(key, default)
-
- def value_for_index(self, index):
- """Returns the value of the item at the given zero-based index."""
- return self[self.keyOrder[index]]
-
- def insert(self, index, key, value):
- """Inserts the key, value pair before the item with the given index."""
- if key in self.keyOrder:
- n = self.keyOrder.index(key)
- del self.keyOrder[n]
- if n < index:
- index -= 1
- self.keyOrder.insert(index, key)
- super(OrderedDict, self).__setitem__(key, value)
-
- def copy(self):
- """Returns a copy of this object."""
- # This way of initializing the copy means it works for subclasses, too.
- return self.__class__(self)
-
- def __repr__(self):
- """
- Replaces the normal dict.__repr__ with a version that returns the keys
- in their Ordered order.
- """
- return '{%s}' % ', '.join(
- ['%r: %r' % (k, v) for k, v in self._iteritems()]
- )
-
- def clear(self):
- super(OrderedDict, self).clear()
- self.keyOrder = []
-
- def index(self, key):
- """ Return the index of a given key. """
- try:
- return self.keyOrder.index(key)
- except ValueError:
- raise ValueError("Element '%s' was not found in OrderedDict" % key)
-
- def index_for_location(self, location):
- """ Return index or None for a given location. """
- if location == '_begin':
- i = 0
- elif location == '_end':
- i = None
- elif location.startswith('<') or location.startswith('>'):
- i = self.index(location[1:])
- if location.startswith('>'):
- if i >= len(self):
- # last item
- i = None
- else:
- i += 1
- else:
- raise ValueError('Not a valid location: "%s". Location key '
- 'must start with a ">" or "<".' % location)
- return i
-
- def add(self, key, value, location):
- """ Insert by key location. """
- i = self.index_for_location(location)
- if i is not None:
- self.insert(i, key, value)
- else:
- self.__setitem__(key, value)
-
- def link(self, key, location):
- """ Change location of an existing item. """
- n = self.keyOrder.index(key)
- del self.keyOrder[n]
- try:
- i = self.index_for_location(location)
- if i is not None:
- self.keyOrder.insert(i, key)
- else:
- self.keyOrder.append(key)
- except Exception as e:
- # restore to prevent data loss and reraise
- self.keyOrder.insert(n, key)
- raise e
diff --git a/markdown/pep562.py b/markdown/pep562.py
new file mode 100644
index 0000000..b130d3b
--- /dev/null
+++ b/markdown/pep562.py
@@ -0,0 +1,245 @@
+"""
+Backport of PEP 562.
+
+https://pypi.org/search/?q=pep562
+
+Licensed under MIT
+Copyright (c) 2018 Isaac Muse <isaacmuse@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
+and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+IN THE SOFTWARE.
+"""
+import sys
+from collections import namedtuple
+import re
+
+__all__ = ('Pep562',)
+
+RE_VER = re.compile(
+ r'''(?x)
+ (?P<major>\d+)(?:\.(?P<minor>\d+))?(?:\.(?P<micro>\d+))?
+ (?:(?P<type>a|b|rc)(?P<pre>\d+))?
+ (?:\.post(?P<post>\d+))?
+ (?:\.dev(?P<dev>\d+))?
+ '''
+)
+
+REL_MAP = {
+ ".dev": "",
+ ".dev-alpha": "a",
+ ".dev-beta": "b",
+ ".dev-candidate": "rc",
+ "alpha": "a",
+ "beta": "b",
+ "candidate": "rc",
+ "final": ""
+}
+
+DEV_STATUS = {
+ ".dev": "2 - Pre-Alpha",
+ ".dev-alpha": "2 - Pre-Alpha",
+ ".dev-beta": "2 - Pre-Alpha",
+ ".dev-candidate": "2 - Pre-Alpha",
+ "alpha": "3 - Alpha",
+ "beta": "4 - Beta",
+ "candidate": "4 - Beta",
+ "final": "5 - Production/Stable"
+}
+
+PRE_REL_MAP = {"a": 'alpha', "b": 'beta', "rc": 'candidate'}
+
+
+class Version(namedtuple("Version", ["major", "minor", "micro", "release", "pre", "post", "dev"])):
+ """
+ Get the version (PEP 440).
+
+ A biased approach to the PEP 440 semantic version.
+
+ Provides a tuple structure which is sorted for comparisons `v1 > v2` etc.
+ (major, minor, micro, release type, pre-release build, post-release build, development release build)
+ Release types are named in such a way that they are comparable with ease.
+ Accessors to check if a development, pre-release, or post-release build. Also provides accessor to get
+ development status for setup files.
+
+ How it works (currently):
+
+ - You must specify a release type as either `final`, `alpha`, `beta`, or `candidate`.
+ - To define a development release, you can use either `.dev`, `.dev-alpha`, `.dev-beta`, or `.dev-candidate`.
+ The dot is used to ensure all development specifiers are sorted before `alpha`.
+ You can specify a `dev` number for development builds, but do not have to as implicit development releases
+ are allowed.
+ - You must specify a `pre` value greater than zero if using a prerelease as this project (not PEP 440) does not
+ allow implicit prereleases.
+ - You can optionally set `post` to a value greater than zero to make the build a post release. While post releases
+ are technically allowed in prereleases, it is strongly discouraged, so we are rejecting them. It should be
+ noted that we do not allow `post0` even though PEP 440 does not restrict this. This project specifically
+ does not allow implicit post releases.
+ - It should be noted that we do not support epochs `1!` or local versions `+some-custom.version-1`.
+
+ Acceptable version releases:
+
+ ```
+ Version(1, 0, 0, "final") 1.0
+ Version(1, 2, 0, "final") 1.2
+ Version(1, 2, 3, "final") 1.2.3
+ Version(1, 2, 0, "alpha", pre=4) 1.2a4
+ Version(1, 2, 0, "beta", pre=4) 1.2b4
+ Version(1, 2, 0, "candidate", pre=4) 1.2rc4
+ Version(1, 2, 0, "final", post=1) 1.2.post1
+ Version(1, 2, 3, ".dev") 1.2.3.dev0
+ Version(1, 2, 3, ".dev", dev=1) 1.2.3.dev1
+ ```
+
+ """
+
+ def __new__(cls, major, minor, micro, release="final", pre=0, post=0, dev=0):
+ """Validate version info."""
+
+ # Ensure all parts are positive integers.
+ for value in (major, minor, micro, pre, post):
+ if not (isinstance(value, int) and value >= 0):
+ raise ValueError("All version parts except 'release' should be integers.")
+
+ if release not in REL_MAP:
+ raise ValueError("'{}' is not a valid release type.".format(release))
+
+ # Ensure valid pre-release (we do not allow implicit pre-releases).
+ if ".dev-candidate" < release < "final":
+ if pre == 0:
+ raise ValueError("Implicit pre-releases not allowed.")
+ elif dev:
+ raise ValueError("Version is not a development release.")
+ elif post:
+ raise ValueError("Post-releases are not allowed with pre-releases.")
+
+ # Ensure valid development or development/pre release
+ elif release < "alpha":
+ if release > ".dev" and pre == 0:
+ raise ValueError("Implicit pre-release not allowed.")
+ elif post:
+ raise ValueError("Post-releases are not allowed with pre-releases.")
+
+ # Ensure a valid normal release
+ else:
+ if pre:
+ raise ValueError("Version is not a pre-release.")
+ elif dev:
+ raise ValueError("Version is not a development release.")
+
+ return super().__new__(cls, major, minor, micro, release, pre, post, dev)
+
+ def _is_pre(self):
+ """Is prerelease."""
+
+ return self.pre > 0
+
+ def _is_dev(self):
+ """Is development."""
+
+ return bool(self.release < "alpha")
+
+ def _is_post(self):
+ """Is post."""
+
+ return self.post > 0
+
+ def _get_dev_status(self): # pragma: no cover
+ """Get development status string."""
+
+ return DEV_STATUS[self.release]
+
+ def _get_canonical(self):
+ """Get the canonical output string."""
+
+ # Assemble major, minor, micro version and append `pre`, `post`, or `dev` if needed.
+ if self.micro == 0:
+ ver = "{}.{}".format(self.major, self.minor)
+ else:
+ ver = "{}.{}.{}".format(self.major, self.minor, self.micro)
+ if self._is_pre():
+ ver += '{}{}'.format(REL_MAP[self.release], self.pre)
+ if self._is_post():
+ ver += ".post{}".format(self.post)
+ if self._is_dev():
+ ver += ".dev{}".format(self.dev)
+
+ return ver
+
+
+def parse_version(ver, pre=False):
+ """Parse version into a comparable Version tuple."""
+
+ m = RE_VER.match(ver)
+
+ # Handle major, minor, micro
+ major = int(m.group('major'))
+ minor = int(m.group('minor')) if m.group('minor') else 0
+ micro = int(m.group('micro')) if m.group('micro') else 0
+
+ # Handle pre releases
+ if m.group('type'):
+ release = PRE_REL_MAP[m.group('type')]
+ pre = int(m.group('pre'))
+ else:
+ release = "final"
+ pre = 0
+
+ # Handle development releases
+ dev = m.group('dev') if m.group('dev') else 0
+ if m.group('dev'):
+ dev = int(m.group('dev'))
+ release = '.dev-' + release if pre else '.dev'
+ else:
+ dev = 0
+
+ # Handle post
+ post = int(m.group('post')) if m.group('post') else 0
+
+ return Version(major, minor, micro, release, pre, post, dev)
+
+
+class Pep562:
+ """
+ Backport of PEP 562 <https://pypi.org/search/?q=pep562>.
+
+ Wraps the module in a class that exposes the mechanics to override `__dir__` and `__getattr__`.
+ The given module will be searched for overrides of `__dir__` and `__getattr__` and use them when needed.
+ """
+
+ def __init__(self, name):
+ """Acquire `__getattr__` and `__dir__` from the named module, then replace it in `sys.modules` with this wrapper."""
+
+ self._module = sys.modules[name]
+ self._get_attr = getattr(self._module, '__getattr__', None)
+ self._get_dir = getattr(self._module, '__dir__', None)
+ sys.modules[name] = self
+
+ def __dir__(self):
+ """Return the overridden `dir` if one was provided, else apply `dir` to the module."""
+
+ return self._get_dir() if self._get_dir else dir(self._module)
+
+ def __getattr__(self, name):
+ """Attempt to retrieve the attribute from the module, and if missing, use the overridden function if present."""
+
+ try:
+ return getattr(self._module, name)
+ except AttributeError:
+ if self._get_attr:
+ return self._get_attr(name)
+ raise
+
+
+__version_info__ = Version(1, 0, 0, "final")
+__version__ = __version_info__._get_canonical()
diff --git a/markdown/postprocessors.py b/markdown/postprocessors.py
index 2d4dcb5..2e572f6 100644
--- a/markdown/postprocessors.py
+++ b/markdown/postprocessors.py
@@ -1,4 +1,23 @@
"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+
POST-PROCESSORS
=============================================================================
@@ -8,19 +27,17 @@
"""
-from __future__ import absolute_import
-from __future__ import unicode_literals
+from collections import OrderedDict
from . import util
-from . import odict
import re
-def build_postprocessors(md_instance, **kwargs):
+def build_postprocessors(md, **kwargs):
""" Build the default postprocessors for Markdown. """
- postprocessors = odict.OrderedDict()
- postprocessors["raw_html"] = RawHtmlPostprocessor(md_instance)
- postprocessors["amp_substitute"] = AndSubstitutePostprocessor()
- postprocessors["unescape"] = UnescapePostprocessor()
+ postprocessors = util.Registry()
+ postprocessors.register(RawHtmlPostprocessor(md), 'raw_html', 30)
+ postprocessors.register(AndSubstitutePostprocessor(), 'amp_substitute', 20)
+ postprocessors.register(UnescapePostprocessor(), 'unescape', 10)
return postprocessors
@@ -49,34 +66,37 @@
""" Restore raw html to the document. """
def run(self, text):
- """ Iterate over html stash and restore "safe" html. """
- for i in range(self.markdown.htmlStash.html_counter):
- html, safe = self.markdown.htmlStash.rawHtmlBlocks[i]
- if self.markdown.safeMode and not safe:
- if str(self.markdown.safeMode).lower() == 'escape':
- html = self.escape(html)
- elif str(self.markdown.safeMode).lower() == 'remove':
- html = ''
- else:
- html = self.markdown.html_replacement_text
- if (self.isblocklevel(html) and
- (safe or not self.markdown.safeMode)):
- text = text.replace(
- "<p>%s</p>" %
- (self.markdown.htmlStash.get_placeholder(i)),
- html + "\n"
- )
- text = text.replace(
- self.markdown.htmlStash.get_placeholder(i), html
- )
- return text
+ """ Iterate over html stash and restore html. """
+ replacements = OrderedDict()
+ for i in range(self.md.htmlStash.html_counter):
+ html = self.stash_to_string(self.md.htmlStash.rawHtmlBlocks[i])
+ if self.isblocklevel(html):
+ replacements["<p>{}</p>".format(
+ self.md.htmlStash.get_placeholder(i))] = html
+ replacements[self.md.htmlStash.get_placeholder(i)] = html
- def escape(self, html):
- """ Basic html escaping """
- html = html.replace('&', '&amp;')
- html = html.replace('<', '&lt;')
- html = html.replace('>', '&gt;')
- return html.replace('"', '&quot;')
+ def substitute_match(m):
+ key = m.group(0)
+
+ if key not in replacements:
+ if key[3:-4] in replacements:
+ return f'<p>{ replacements[key[3:-4]] }</p>'
+ else:
+ return key
+
+ return replacements[key]
+
+ if replacements:
+ base_placeholder = util.HTML_PLACEHOLDER % r'([0-9]+)'
+ pattern = re.compile(f'<p>{ base_placeholder }</p>|{ base_placeholder }')
+ processed_text = pattern.sub(substitute_match, text)
+ else:
+ return text
+
+ if processed_text == text:
+ return processed_text
+ else:
+ return self.run(processed_text)
def isblocklevel(self, html):
m = re.match(r'^\<\/?([^ >]+)', html)
@@ -84,9 +104,13 @@
if m.group(1)[0] in ('!', '?', '@', '%'):
# Comment, php etc...
return True
- return util.isBlockLevel(m.group(1))
+ return self.md.is_block_level(m.group(1))
return False
+ def stash_to_string(self, text):
+ """ Convert a stashed object to a string. """
+ return str(text)
+
class AndSubstitutePostprocessor(Postprocessor):
""" Restore valid entities """
@@ -99,10 +123,10 @@
class UnescapePostprocessor(Postprocessor):
""" Restore escaped chars """
- RE = re.compile('%s(\d+)%s' % (util.STX, util.ETX))
+ RE = re.compile(r'{}(\d+){}'.format(util.STX, util.ETX))
def unescape(self, m):
- return util.int2str(int(m.group(1)))
+ return chr(int(m.group(1)))
def run(self, text):
return self.RE.sub(self.unescape, text)
diff --git a/markdown/preprocessors.py b/markdown/preprocessors.py
index 7fd38d3..e1023c5 100644
--- a/markdown/preprocessors.py
+++ b/markdown/preprocessors.py
@@ -1,4 +1,23 @@
"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+
PRE-PROCESSORS
=============================================================================
@@ -6,20 +25,16 @@
complicated.
"""
-from __future__ import absolute_import
-from __future__ import unicode_literals
from . import util
-from . import odict
+from .htmlparser import HTMLExtractor
import re
-def build_preprocessors(md_instance, **kwargs):
+def build_preprocessors(md, **kwargs):
""" Build the default set of preprocessors used by Markdown. """
- preprocessors = odict.OrderedDict()
- preprocessors['normalize_whitespace'] = NormalizeWhitespace(md_instance)
- if md_instance.safeMode != 'escape':
- preprocessors["html_block"] = HtmlBlockPreprocessor(md_instance)
- preprocessors["reference"] = ReferencePreprocessor(md_instance)
+ preprocessors = util.Registry()
+ preprocessors.register(NormalizeWhitespace(md), 'normalize_whitespace', 30)
+ preprocessors.register(HtmlBlockPreprocessor(md), 'html_block', 20)
return preprocessors
@@ -45,13 +60,13 @@
class NormalizeWhitespace(Preprocessor):
- """ Normalize whitespace for consistant parsing. """
+ """ Normalize whitespace for consistent parsing. """
def run(self, lines):
source = '\n'.join(lines)
source = source.replace(util.STX, "").replace(util.ETX, "")
source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
- source = source.expandtabs(self.markdown.tab_length)
+ source = source.expandtabs(self.md.tab_length)
source = re.sub(r'(?<=\n) +\n', '\n', source)
return source.split('\n')
@@ -59,287 +74,9 @@
class HtmlBlockPreprocessor(Preprocessor):
"""Remove html blocks from the text and store them for later retrieval."""
- right_tag_patterns = ["</%s>", "%s>"]
- attrs_pattern = r"""
- \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q) # attr="value"
- | # OR
- \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+) # attr=value
- | # OR
- \s+(?P<attr2>[^>"'/= ]+) # attr
- """
- left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % \
- attrs_pattern
- attrs_re = re.compile(attrs_pattern, re.VERBOSE)
- left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
- markdown_in_raw = False
-
- def _get_left_tag(self, block):
- m = self.left_tag_re.match(block)
- if m:
- tag = m.group('tag')
- raw_attrs = m.group('attrs')
- attrs = {}
- if raw_attrs:
- for ma in self.attrs_re.finditer(raw_attrs):
- if ma.group('attr'):
- if ma.group('value'):
- attrs[ma.group('attr').strip()] = ma.group('value')
- else:
- attrs[ma.group('attr').strip()] = ""
- elif ma.group('attr1'):
- if ma.group('value1'):
- attrs[ma.group('attr1').strip()] = ma.group(
- 'value1'
- )
- else:
- attrs[ma.group('attr1').strip()] = ""
- elif ma.group('attr2'):
- attrs[ma.group('attr2').strip()] = ""
- return tag, len(m.group(0)), attrs
- else:
- tag = block[1:].split(">", 1)[0].lower()
- return tag, len(tag)+2, {}
-
- def _recursive_tagfind(self, ltag, rtag, start_index, block):
- while 1:
- i = block.find(rtag, start_index)
- if i == -1:
- return -1
- j = block.find(ltag, start_index)
- # if no ltag, or rtag found before another ltag, return index
- if (j > i or j == -1):
- return i + len(rtag)
- # another ltag found before rtag, use end of ltag as starting
- # point and search again
- j = block.find('>', j)
- start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
- if start_index == -1:
- # HTML potentially malformed- ltag has no corresponding
- # rtag
- return -1
-
- def _get_right_tag(self, left_tag, left_index, block):
- for p in self.right_tag_patterns:
- tag = p % left_tag
- i = self._recursive_tagfind(
- "<%s" % left_tag, tag, left_index, block
- )
- if i > 2:
- return tag.lstrip("<").rstrip(">"), i
- return block.rstrip()[-left_index:-1].lower(), len(block)
-
- def _equal_tags(self, left_tag, right_tag):
- if left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
- return True
- if ("/" + left_tag) == right_tag:
- return True
- if (right_tag == "--" and left_tag == "--"):
- return True
- elif left_tag == right_tag[1:] and right_tag[0] == "/":
- return True
- else:
- return False
-
- def _is_oneliner(self, tag):
- return (tag in ['hr', 'hr/'])
-
- def _stringindex_to_listindex(self, stringindex, items):
- """
- Same effect as concatenating the strings in items,
- finding the character to which stringindex refers in that string,
- and returning the index of the item in which that character resides.
- """
- items.append('dummy')
- i, count = 0, 0
- while count <= stringindex:
- count += len(items[i])
- i += 1
- return i - 1
-
- def _nested_markdown_in_html(self, items):
- """Find and process html child elements of the given element block."""
- for i, item in enumerate(items):
- if self.left_tag_re.match(item):
- left_tag, left_index, attrs = \
- self._get_left_tag(''.join(items[i:]))
- right_tag, data_index = self._get_right_tag(
- left_tag, left_index, ''.join(items[i:]))
- right_listindex = \
- self._stringindex_to_listindex(data_index, items[i:]) + i
- if 'markdown' in attrs.keys():
- items[i] = items[i][left_index:] # remove opening tag
- placeholder = self.markdown.htmlStash.store_tag(
- left_tag, attrs, i + 1, right_listindex + 1)
- items.insert(i, placeholder)
- if len(items) - right_listindex <= 1: # last nest, no tail
- right_listindex -= 1
- items[right_listindex] = items[right_listindex][
- :-len(right_tag) - 2] # remove closing tag
- else: # raw html
- if len(items) - right_listindex <= 1: # last element
- right_listindex -= 1
- offset = 1 if i == right_listindex else 0
- placeholder = self.markdown.htmlStash.store('\n\n'.join(
- items[i:right_listindex + offset]))
- del items[i:right_listindex + offset]
- items.insert(i, placeholder)
- return items
-
def run(self, lines):
- text = "\n".join(lines)
- new_blocks = []
- text = text.rsplit("\n\n")
- items = []
- left_tag = ''
- right_tag = ''
- in_tag = False # flag
-
- while text:
- block = text[0]
- if block.startswith("\n"):
- block = block[1:]
- text = text[1:]
-
- if block.startswith("\n"):
- block = block[1:]
-
- if not in_tag:
- if block.startswith("<") and len(block.strip()) > 1:
-
- if block[1:4] == "!--":
- # is a comment block
- left_tag, left_index, attrs = "--", 2, {}
- else:
- left_tag, left_index, attrs = self._get_left_tag(block)
- right_tag, data_index = self._get_right_tag(left_tag,
- left_index,
- block)
- # keep checking conditions below and maybe just append
-
- if data_index < len(block) and (util.isBlockLevel(left_tag) or left_tag == '--'):
- text.insert(0, block[data_index:])
- block = block[:data_index]
-
- if not (util.isBlockLevel(left_tag) or block[1] in ["!", "?", "@", "%"]):
- new_blocks.append(block)
- continue
-
- if self._is_oneliner(left_tag):
- new_blocks.append(block.strip())
- continue
-
- if block.rstrip().endswith(">") \
- and self._equal_tags(left_tag, right_tag):
- if self.markdown_in_raw and 'markdown' in attrs.keys():
- block = block[left_index:-len(right_tag) - 2]
- new_blocks.append(self.markdown.htmlStash.
- store_tag(left_tag, attrs, 0, 2))
- new_blocks.extend([block])
- else:
- new_blocks.append(
- self.markdown.htmlStash.store(block.strip()))
- continue
- else:
- # if is block level tag and is not complete
- if (not self._equal_tags(left_tag, right_tag)) and \
- (util.isBlockLevel(left_tag) or left_tag == "--"):
- items.append(block.strip())
- in_tag = True
- else:
- new_blocks.append(
- self.markdown.htmlStash.store(block.strip())
- )
- continue
-
- else:
- new_blocks.append(block)
-
- else:
- items.append(block)
-
- right_tag, data_index = self._get_right_tag(left_tag, 0, block)
-
- if self._equal_tags(left_tag, right_tag):
- # if find closing tag
-
- if data_index < len(block):
- # we have more text after right_tag
- items[-1] = block[:data_index]
- text.insert(0, block[data_index:])
-
- in_tag = False
- if self.markdown_in_raw and 'markdown' in attrs.keys():
- items[0] = items[0][left_index:]
- items[-1] = items[-1][:-len(right_tag) - 2]
- if items[len(items) - 1]: # not a newline/empty string
- right_index = len(items) + 3
- else:
- right_index = len(items) + 2
- new_blocks.append(self.markdown.htmlStash.store_tag(
- left_tag, attrs, 0, right_index))
- placeholderslen = len(self.markdown.htmlStash.tag_data)
- new_blocks.extend(
- self._nested_markdown_in_html(items))
- nests = len(self.markdown.htmlStash.tag_data) - \
- placeholderslen
- self.markdown.htmlStash.tag_data[-1 - nests][
- 'right_index'] += nests - 2
- else:
- new_blocks.append(
- self.markdown.htmlStash.store('\n\n'.join(items)))
- items = []
-
- if items:
- if self.markdown_in_raw and 'markdown' in attrs.keys():
- items[0] = items[0][left_index:]
- items[-1] = items[-1][:-len(right_tag) - 2]
- if items[len(items) - 1]: # not a newline/empty string
- right_index = len(items) + 3
- else:
- right_index = len(items) + 2
- new_blocks.append(
- self.markdown.htmlStash.store_tag(
- left_tag, attrs, 0, right_index))
- placeholderslen = len(self.markdown.htmlStash.tag_data)
- new_blocks.extend(self._nested_markdown_in_html(items))
- nests = len(self.markdown.htmlStash.tag_data) - placeholderslen
- self.markdown.htmlStash.tag_data[-1 - nests][
- 'right_index'] += nests - 2
- else:
- new_blocks.append(
- self.markdown.htmlStash.store('\n\n'.join(items)))
- new_blocks.append('\n')
-
- new_text = "\n\n".join(new_blocks)
- return new_text.split("\n")
-
-
-class ReferencePreprocessor(Preprocessor):
- """ Remove reference definitions from text and store for later use. """
-
- TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
- RE = re.compile(
- r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL
- )
- TITLE_RE = re.compile(r'^%s$' % TITLE)
-
- def run(self, lines):
- new_text = []
- while lines:
- line = lines.pop(0)
- m = self.RE.match(line)
- if m:
- id = m.group(1).strip().lower()
- link = m.group(2).lstrip('<').rstrip('>')
- t = m.group(5) or m.group(6) or m.group(7)
- if not t:
- # Check next line for title
- tm = self.TITLE_RE.match(lines[0])
- if tm:
- lines.pop(0)
- t = tm.group(2) or tm.group(3) or tm.group(4)
- self.markdown.references[id] = (link, t)
- else:
- new_text.append(line)
-
- return new_text # + "\n"
+ source = '\n'.join(lines)
+ parser = HTMLExtractor(self.md)
+ parser.feed(source)
+ parser.close()
+ return ''.join(parser.cleandoc).split('\n')
diff --git a/markdown/serializers.py b/markdown/serializers.py
index 1e8d9dd..59bab18 100644
--- a/markdown/serializers.py
+++ b/markdown/serializers.py
@@ -6,7 +6,7 @@
# Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved.
#
# fredrik@pythonware.com
-# http://www.pythonware.com
+# https://www.pythonware.com/
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
@@ -37,55 +37,28 @@
# --------------------------------------------------------------------
-from __future__ import absolute_import
-from __future__ import unicode_literals
-from . import util
-ElementTree = util.etree.ElementTree
-QName = util.etree.QName
-if hasattr(util.etree, 'test_comment'): # pragma: no cover
- Comment = util.etree.test_comment
-else: # pragma: no cover
- Comment = util.etree.Comment
-PI = util.etree.PI
-ProcessingInstruction = util.etree.ProcessingInstruction
+from xml.etree.ElementTree import ProcessingInstruction
+from xml.etree.ElementTree import Comment, ElementTree, QName
+import re
__all__ = ['to_html_string', 'to_xhtml_string']
HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
- "img", "input", "isindex", "link", "meta" "param")
+ "img", "input", "isindex", "link", "meta", "param")
+RE_AMP = re.compile(r'&(?!(?:\#[0-9]+|\#x[0-9a-f]+|[0-9a-z]+);)', re.I)
try:
HTML_EMPTY = set(HTML_EMPTY)
except NameError: # pragma: no cover
pass
-_namespace_map = {
- # "well-known" namespace prefixes
- "http://www.w3.org/XML/1998/namespace": "xml",
- "http://www.w3.org/1999/xhtml": "html",
- "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
- "http://schemas.xmlsoap.org/wsdl/": "wsdl",
- # xml schema
- "http://www.w3.org/2001/XMLSchema": "xs",
- "http://www.w3.org/2001/XMLSchema-instance": "xsi",
- # dublic core
- "http://purl.org/dc/elements/1.1/": "dc",
-}
-
def _raise_serialization_error(text): # pragma: no cover
raise TypeError(
- "cannot serialize %r (type %s)" % (text, type(text).__name__)
+ "cannot serialize {!r} (type {})".format(text, type(text).__name__)
)
-def _encode(text, encoding):
- try:
- return text.encode(encoding, "xmlcharrefreplace")
- except (TypeError, AttributeError): # pragma: no cover
- _raise_serialization_error(text)
-
-
def _escape_cdata(text):
# escape character data
try:
@@ -93,7 +66,8 @@
# shorter than 500 character, or so. assume that's, by far,
# the most common case in most applications.
if "&" in text:
- text = text.replace("&", "&amp;")
+ # Only replace & when not part of an entity
+ text = RE_AMP.sub('&amp;', text)
if "<" in text:
text = text.replace("<", "&lt;")
if ">" in text:
@@ -107,7 +81,8 @@
# escape attribute value
try:
if "&" in text:
- text = text.replace("&", "&amp;")
+ # Only replace & when not part of an entity
+ text = RE_AMP.sub('&amp;', text)
if "<" in text:
text = text.replace("<", "&lt;")
if ">" in text:
@@ -125,7 +100,8 @@
# escape attribute value
try:
if "&" in text:
- text = text.replace("&", "&amp;")
+ # Only replace & when not part of an entity
+ text = RE_AMP.sub('&amp;', text)
if "<" in text:
text = text.replace("<", "&lt;")
if ">" in text:
@@ -137,142 +113,73 @@
_raise_serialization_error(text)
-def _serialize_html(write, elem, qnames, namespaces, format):
+def _serialize_html(write, elem, format):
tag = elem.tag
text = elem.text
if tag is Comment:
write("<!--%s-->" % _escape_cdata(text))
elif tag is ProcessingInstruction:
write("<?%s?>" % _escape_cdata(text))
+ elif tag is None:
+ if text:
+ write(_escape_cdata(text))
+ for e in elem:
+ _serialize_html(write, e, format)
else:
- tag = qnames[tag]
- if tag is None:
- if text:
- write(_escape_cdata(text))
- for e in elem:
- _serialize_html(write, e, qnames, None, format)
- else:
- write("<" + tag)
- items = elem.items()
- if items or namespaces:
- items = sorted(items) # lexical order
- for k, v in items:
- if isinstance(k, QName):
- k = k.text
- if isinstance(v, QName):
- v = qnames[v.text]
- else:
- v = _escape_attrib_html(v)
- if qnames[k] == v and format == 'html':
- # handle boolean attributes
- write(" %s" % v)
- else:
- write(" %s=\"%s\"" % (qnames[k], v))
- if namespaces:
- items = namespaces.items()
- items.sort(key=lambda x: x[1]) # sort on prefix
- for v, k in items:
- if k:
- k = ":" + k
- write(" xmlns%s=\"%s\"" % (k, _escape_attrib(v)))
- if format == "xhtml" and tag.lower() in HTML_EMPTY:
- write(" />")
+ namespace_uri = None
+ if isinstance(tag, QName):
+ # QNAME objects store their data as a string: `{uri}tag`
+ if tag.text[:1] == "{":
+ namespace_uri, tag = tag.text[1:].split("}", 1)
else:
- write(">")
- if text:
- if tag.lower() in ["script", "style"]:
- write(text)
- else:
- write(_escape_cdata(text))
- for e in elem:
- _serialize_html(write, e, qnames, None, format)
- if tag.lower() not in HTML_EMPTY:
- write("</" + tag + ">")
+ raise ValueError('QName objects must define a tag.')
+ write("<" + tag)
+ items = elem.items()
+ if items:
+ items = sorted(items) # lexical order
+ for k, v in items:
+ if isinstance(k, QName):
+ # Assume a text only QName
+ k = k.text
+ if isinstance(v, QName):
+ # Assume a text only QName
+ v = v.text
+ else:
+ v = _escape_attrib_html(v)
+ if k == v and format == 'html':
+ # handle boolean attributes
+ write(" %s" % v)
+ else:
+ write(' {}="{}"'.format(k, v))
+ if namespace_uri:
+ write(' xmlns="%s"' % (_escape_attrib(namespace_uri)))
+ if format == "xhtml" and tag.lower() in HTML_EMPTY:
+ write(" />")
+ else:
+ write(">")
+ if text:
+ if tag.lower() in ["script", "style"]:
+ write(text)
+ else:
+ write(_escape_cdata(text))
+ for e in elem:
+ _serialize_html(write, e, format)
+ if tag.lower() not in HTML_EMPTY:
+ write("</" + tag + ">")
if elem.tail:
write(_escape_cdata(elem.tail))
-def _write_html(root,
- encoding=None,
- default_namespace=None,
- format="html"):
+def _write_html(root, format="html"):
assert root is not None
data = []
write = data.append
- qnames, namespaces = _namespaces(root, default_namespace)
- _serialize_html(write, root, qnames, namespaces, format)
- if encoding is None:
- return "".join(data)
- else:
- return _encode("".join(data))
+ _serialize_html(write, root, format)
+ return "".join(data)
# --------------------------------------------------------------------
-# serialization support
-
-def _namespaces(elem, default_namespace=None):
- # identify namespaces used in this tree
-
- # maps qnames to *encoded* prefix:local names
- qnames = {None: None}
-
- # maps uri:s to prefixes
- namespaces = {}
- if default_namespace:
- namespaces[default_namespace] = ""
-
- def add_qname(qname):
- # calculate serialized qname representation
- try:
- if qname[:1] == "{":
- uri, tag = qname[1:].split("}", 1)
- prefix = namespaces.get(uri)
- if prefix is None:
- prefix = _namespace_map.get(uri)
- if prefix is None:
- prefix = "ns%d" % len(namespaces)
- if prefix != "xml":
- namespaces[uri] = prefix
- if prefix:
- qnames[qname] = "%s:%s" % (prefix, tag)
- else:
- qnames[qname] = tag # default element
- else:
- if default_namespace:
- raise ValueError(
- "cannot use non-qualified names with "
- "default_namespace option"
- )
- qnames[qname] = qname
- except TypeError: # pragma: no cover
- _raise_serialization_error(qname)
-
- # populate qname and namespaces table
- try:
- iterate = elem.iter
- except AttributeError:
- iterate = elem.getiterator # cET compatibility
- for elem in iterate():
- tag = elem.tag
- if isinstance(tag, QName) and tag.text not in qnames:
- add_qname(tag.text)
- elif isinstance(tag, util.string_type):
- if tag not in qnames:
- add_qname(tag)
- elif tag is not None and tag is not Comment and tag is not PI:
- _raise_serialization_error(tag)
- for key, value in elem.items():
- if isinstance(key, QName):
- key = key.text
- if key not in qnames:
- add_qname(key)
- if isinstance(value, QName) and value.text not in qnames:
- add_qname(value.text)
- text = elem.text
- if isinstance(text, QName) and text.text not in qnames:
- add_qname(text.text)
- return qnames, namespaces
-
+# public functions
def to_html_string(element):
return _write_html(ElementTree(element).getroot(), format="html")
diff --git a/markdown/test_tools.py b/markdown/test_tools.py
new file mode 100644
index 0000000..21ae1a7
--- /dev/null
+++ b/markdown/test_tools.py
@@ -0,0 +1,220 @@
+"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+"""
+
+import os
+import sys
+import unittest
+import textwrap
+from . import markdown, Markdown, util
+
+try:
+ import tidylib
+except ImportError:
+ tidylib = None
+
+__all__ = ['TestCase', 'LegacyTestCase', 'Kwargs']
+
+
+class TestCase(unittest.TestCase):
+ """
+ A unittest.TestCase subclass with helpers for testing Markdown output.
+
+ Define `default_kwargs` as a dict of keywords to pass to Markdown for each
+ test. The defaults can be overridden on individual tests.
+
+ The `assertMarkdownRenders` method accepts the source text, the expected
+ output, and any keywords to pass to Markdown. The `default_kwargs` are used
+ except where overridden by `kwargs`. The output and expected output are passed
+ to `TestCase.assertMultiLineEqual`. An AssertionError is raised with a diff
+ if the actual output does not equal the expected output.
+
+ The `dedent` method is available to dedent triple-quoted strings if
+ necessary.
+
+ In all other respects, behaves as unittest.TestCase.
+ """
+
+ default_kwargs = {}
+
+ def assertMarkdownRenders(self, source, expected, expected_attrs=None, **kwargs):
+ """
+ Test that source Markdown text renders to expected output with given keywords.
+
+ `expected_attrs` accepts a dict. Each key should be the name of an attribute
+ on the `Markdown` instance and the value should be the expected value after
+ the source text is parsed by Markdown. After the expected output is tested,
+ the expected value for each attribute is compared against the actual
+ attribute of the `Markdown` instance using `TestCase.assertEqual`.
+ """
+
+ expected_attrs = expected_attrs or {}
+ kws = self.default_kwargs.copy()
+ kws.update(kwargs)
+ md = Markdown(**kws)
+ output = md.convert(source)
+ self.assertMultiLineEqual(output, expected)
+ for key, value in expected_attrs.items():
+ self.assertEqual(getattr(md, key), value)
+
+ def dedent(self, text):
+ """
+ Dedent text.
+ """
+
+ # TODO: If/when actual output ends with a newline, then use:
+ # return textwrap.dedent(text.strip('\n'))
+ return textwrap.dedent(text).strip()
+
+
+class recursionlimit:
+ """
+ A context manager which temporarily modifies the Python recursion limit.
+
+ The testing framework, coverage, etc. may add an arbitrary number of levels to the depth. To maintain consistency
+ in the tests, the current stack depth is determined when called, then added to the provided limit.
+
+ Example usage:
+
+ with recursionlimit(20):
+ # test code here
+
+ See https://stackoverflow.com/a/50120316/866026
+ """
+
+ def __init__(self, limit):
+ self.limit = util._get_stack_depth() + limit
+ self.old_limit = sys.getrecursionlimit()
+
+ def __enter__(self):
+ sys.setrecursionlimit(self.limit)
+
+ def __exit__(self, type, value, tb):
+ sys.setrecursionlimit(self.old_limit)
+
+
+#########################
+# Legacy Test Framework #
+#########################
+
+
+class Kwargs(dict):
+ """ A dict like class for holding keyword arguments. """
+ pass
+
+
+def _normalize_whitespace(text):
+ """ Normalize whitespace for a string of html using tidylib. """
+ output, errors = tidylib.tidy_fragment(text, options={
+ 'drop_empty_paras': 0,
+ 'fix_backslash': 0,
+ 'fix_bad_comments': 0,
+ 'fix_uri': 0,
+ 'join_styles': 0,
+ 'lower_literals': 0,
+ 'merge_divs': 0,
+ 'output_xhtml': 1,
+ 'quote_ampersand': 0,
+ 'newline': 'LF'
+ })
+ return output
+
+
+class LegacyTestMeta(type):
+ def __new__(cls, name, bases, dct):
+
+ def generate_test(infile, outfile, normalize, kwargs):
+ def test(self):
+ with open(infile, encoding="utf-8") as f:
+ input = f.read()
+ with open(outfile, encoding="utf-8") as f:
+ # Normalize line endings
+ # (on Windows, git may have altered line endings).
+ expected = f.read().replace("\r\n", "\n")
+ output = markdown(input, **kwargs)
+ if tidylib and normalize:
+ try:
+ expected = _normalize_whitespace(expected)
+ output = _normalize_whitespace(output)
+ except OSError:
+ self.skipTest("Tidylib's c library not available.")
+ elif normalize:
+ self.skipTest('Tidylib not available.')
+ self.assertMultiLineEqual(output, expected)
+ return test
+
+ location = dct.get('location', '')
+ exclude = dct.get('exclude', [])
+ normalize = dct.get('normalize', False)
+ input_ext = dct.get('input_ext', '.txt')
+ output_ext = dct.get('output_ext', '.html')
+ kwargs = dct.get('default_kwargs', Kwargs())
+
+ if os.path.isdir(location):
+ for file in os.listdir(location):
+ infile = os.path.join(location, file)
+ if os.path.isfile(infile):
+ tname, ext = os.path.splitext(file)
+ if ext == input_ext:
+ outfile = os.path.join(location, tname + output_ext)
+ tname = tname.replace(' ', '_').replace('-', '_')
+ kws = kwargs.copy()
+ if tname in dct:
+ kws.update(dct[tname])
+ test_name = 'test_%s' % tname
+ if tname not in exclude:
+ dct[test_name] = generate_test(infile, outfile, normalize, kws)
+ else:
+ dct[test_name] = unittest.skip('Excluded')(lambda: None)
+
+ return type.__new__(cls, name, bases, dct)
+
+
+class LegacyTestCase(unittest.TestCase, metaclass=LegacyTestMeta):
+ """
+ A `unittest.TestCase` subclass for running Markdown's legacy file-based tests.
+
+ A subclass should define various properties which point to a directory of
+ text-based test files and define various behaviors/defaults for those tests.
+ The following properties are supported:
+
+ location: A path to the directory of test files. An absolute path is preferred.
+ exclude: A list of tests to exclude. Each test name should comprise the filename
+ without an extension.
+ normalize: A boolean value indicating if the HTML should be normalized.
+ Default: `False`.
+ input_ext: A string containing the file extension of input files. Default: `.txt`.
+ output_ext: A string containing the file extension of expected output files.
+ Default: `.html`.
+ default_kwargs: A `Kwargs` instance which stores the default set of keyword
+ arguments for all test files in the directory.
+
+ In addition, properties can be defined for each individual set of test files within
+ the directory. The property should be given the name of the file without the file
+ extension. Any spaces and dashes in the filename should be replaced with
+ underscores. The value of the property should be a `Kwargs` instance which
+ contains the keyword arguments that should be passed to `Markdown` for that
+ test file. The keyword arguments will "update" the `default_kwargs`.
+
+ When the class instance is created, it will walk the given directory and create
+ a separate unittest for each set of test files using the naming scheme:
+ `test_filename`. One unittest will be run for each set of input and output files.
+ """
+ pass
diff --git a/markdown/treeprocessors.py b/markdown/treeprocessors.py
index d06f192..eb6bf41 100644
--- a/markdown/treeprocessors.py
+++ b/markdown/treeprocessors.py
@@ -1,22 +1,41 @@
-from __future__ import unicode_literals
-from __future__ import absolute_import
+"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+"""
+
+import xml.etree.ElementTree as etree
from . import util
-from . import odict
from . import inlinepatterns
-def build_treeprocessors(md_instance, **kwargs):
+def build_treeprocessors(md, **kwargs):
""" Build the default treeprocessors for Markdown. """
- treeprocessors = odict.OrderedDict()
- treeprocessors["inline"] = InlineProcessor(md_instance)
- treeprocessors["prettify"] = PrettifyTreeprocessor(md_instance)
+ treeprocessors = util.Registry()
+ treeprocessors.register(InlineProcessor(md), 'inline', 20)
+ treeprocessors.register(PrettifyTreeprocessor(md), 'prettify', 10)
return treeprocessors
def isString(s):
""" Check if it's string """
if not isinstance(s, util.AtomicString):
- return isinstance(s, util.string_type)
+ return isinstance(s, str)
return False
@@ -52,8 +71,15 @@
self.__placeholder_length = 4 + len(self.__placeholder_prefix) \
+ len(self.__placeholder_suffix)
self.__placeholder_re = util.INLINE_PLACEHOLDER_RE
- self.markdown = md
+ self.md = md
self.inlinePatterns = md.inlinePatterns
+ self.ancestors = []
+
+ @property
+ @util.deprecated("Use 'md' instead.")
+ def markdown(self):
+ # TODO: remove this later
+ return self.md
def __makePlaceholder(self, type):
""" Generate a placeholder """
@@ -100,10 +126,11 @@
"""
if not isinstance(data, util.AtomicString):
startIndex = 0
- while patternIndex < len(self.inlinePatterns):
+ count = len(self.inlinePatterns)
+ while patternIndex < count:
data, matched, startIndex = self.__applyPattern(
- self.inlinePatterns.value_for_index(patternIndex),
- data, patternIndex, startIndex)
+ self.inlinePatterns[patternIndex], data, patternIndex, startIndex
+ )
if not matched:
patternIndex += 1
return data
@@ -138,7 +165,7 @@
childResult.reverse()
for newChild in childResult:
- node.insert(pos, newChild)
+ node.insert(pos, newChild[0])
def __processPlaceholders(self, data, parent, isText=True):
"""
@@ -155,10 +182,10 @@
def linkText(text):
if text:
if result:
- if result[-1].tail:
- result[-1].tail += text
+ if result[-1][0].tail:
+ result[-1][0].tail += text
else:
- result[-1].tail = text
+ result[-1][0].tail = text
elif not isText:
if parent.tail:
parent.tail += text
@@ -199,7 +226,7 @@
continue
strartIndex = phEndIndex
- result.append(node)
+ result.append((node, self.ancestors[:]))
else: # wrong placeholder
end = index + len(self.__placeholder_prefix)
@@ -230,16 +257,38 @@
Returns: String with placeholders instead of ElementTree elements.
"""
- match = pattern.getCompiledRegExp().match(data[startIndex:])
- leftData = data[:startIndex]
+ new_style = isinstance(pattern, inlinepatterns.InlineProcessor)
+
+ for exclude in pattern.ANCESTOR_EXCLUDES:
+ if exclude.lower() in self.ancestors:
+ return data, False, 0
+
+ if new_style:
+ match = None
+ # Since handleMatch may reject our first match,
+ # we iterate over the buffer looking for matches
+ # until we can't find any more.
+ for match in pattern.getCompiledRegExp().finditer(data, startIndex):
+ node, start, end = pattern.handleMatch(match, data)
+ if start is None or end is None:
+ startIndex += match.end(0)
+ match = None
+ continue
+ break
+ else: # pragma: no cover
+ match = pattern.getCompiledRegExp().match(data[startIndex:])
+ leftData = data[:startIndex]
if not match:
return data, False, 0
- node = pattern.handleMatch(match)
+ if not new_style: # pragma: no cover
+ node = pattern.handleMatch(match)
+ start = match.start(0)
+ end = match.end(0)
if node is None:
- return data, True, len(leftData)+match.span(len(match.groups()))[0]
+ return data, True, end
if not isString(node):
if not isinstance(node.text, util.AtomicString):
@@ -247,9 +296,11 @@
for child in [node] + list(node):
if not isString(node):
if child.text:
+ self.ancestors.append(child.tag.lower())
child.text = self.__handleInline(
child.text, patternIndex + 1
)
+ self.ancestors.pop()
if child.tail:
child.tail = self.__handleInline(
child.tail, patternIndex
@@ -257,11 +308,25 @@
placeholder = self.__stashNode(node, pattern.type())
- return "%s%s%s%s" % (leftData,
- match.group(1),
- placeholder, match.groups()[-1]), True, 0
+ if new_style:
+ return "{}{}{}".format(data[:start],
+ placeholder, data[end:]), True, 0
+ else: # pragma: no cover
+ return "{}{}{}{}".format(leftData,
+ match.group(1),
+ placeholder, match.groups()[-1]), True, 0
- def run(self, tree):
+ def __build_ancestors(self, parent, parents):
+ """Build the ancestor list."""
+ ancestors = []
+ while parent is not None:
+ if parent is not None:
+ ancestors.append(parent.tag.lower())
+ parent = self.parent_map.get(parent)
+ ancestors.reverse()
+ parents.extend(ancestors)
+
+ def run(self, tree, ancestors=None):
"""Apply inline patterns to a parsed Markdown tree.
Iterate over ElementTree, find elements with inline tag, apply inline
@@ -274,31 +339,45 @@
Arguments:
* tree: ElementTree object, representing Markdown tree.
+ * ancestors: List of parent tag names that precede the tree node (if needed).
Returns: ElementTree object with applied inline patterns.
"""
self.stashed_nodes = {}
- stack = [tree]
+ # Ensure a valid parent list, but copy passed in lists
+ # to ensure we don't have the user accidentally change it on us.
+ tree_parents = [] if ancestors is None else ancestors[:]
+
+ self.parent_map = {c: p for p in tree.iter() for c in p}
+ stack = [(tree, tree_parents)]
while stack:
- currElement = stack.pop()
+ currElement, parents = stack.pop()
+
+ self.ancestors = parents
+ self.__build_ancestors(currElement, self.ancestors)
+
insertQueue = []
for child in currElement:
if child.text and not isinstance(
child.text, util.AtomicString
):
+ self.ancestors.append(child.tag.lower())
text = child.text
child.text = None
lst = self.__processPlaceholders(
self.__handleInline(text), child
)
+ for item in lst:
+ self.parent_map[item[0]] = child
stack += lst
insertQueue.append((child, lst))
+ self.ancestors.pop()
if child.tail:
tail = self.__handleInline(child.tail)
- dumby = util.etree.Element('d')
+ dumby = etree.Element('d')
child.tail = None
tailResult = self.__processPlaceholders(tail, dumby, False)
if dumby.tail:
@@ -306,30 +385,16 @@
pos = list(currElement).index(child) + 1
tailResult.reverse()
for newChild in tailResult:
- currElement.insert(pos, newChild)
+ self.parent_map[newChild[0]] = currElement
+ currElement.insert(pos, newChild[0])
if len(child):
- stack.append(child)
+ self.parent_map[child] = currElement
+ stack.append((child, self.ancestors[:]))
for element, lst in insertQueue:
- if self.markdown.enable_attributes:
- if element.text and isString(element.text):
- element.text = inlinepatterns.handleAttributes(
- element.text, element
- )
- i = 0
- for newChild in lst:
- if self.markdown.enable_attributes:
- # Processing attributes
- if newChild.tail and isString(newChild.tail):
- newChild.tail = inlinepatterns.handleAttributes(
- newChild.tail, element
- )
- if newChild.text and isString(newChild.text):
- newChild.text = inlinepatterns.handleAttributes(
- newChild.text, newChild
- )
+ for i, obj in enumerate(lst):
+ newChild = obj[0]
element.insert(i, newChild)
- i += 1
return tree
@@ -340,12 +405,12 @@
""" Recursively add linebreaks to ElementTree children. """
i = "\n"
- if util.isBlockLevel(elem.tag) and elem.tag not in ['code', 'pre']:
+ if self.md.is_block_level(elem.tag) and elem.tag not in ['code', 'pre']:
if (not elem.text or not elem.text.strip()) \
- and len(elem) and util.isBlockLevel(elem[0].tag):
+ and len(elem) and self.md.is_block_level(elem[0].tag):
elem.text = i
for e in elem:
- if util.isBlockLevel(e.tag):
+ if self.md.is_block_level(e.tag):
self._prettifyETree(e)
if not elem.tail or not elem.tail.strip():
elem.tail = i
@@ -356,16 +421,16 @@
""" Add linebreaks to ElementTree root object. """
self._prettifyETree(root)
- # Do <br />'s seperately as they are often in the middle of
+ # Do <br />'s separately as they are often in the middle of
# inline content and missed by _prettifyETree.
- brs = root.getiterator('br')
+ brs = root.iter('br')
for br in brs:
if not br.tail or not br.tail.strip():
br.tail = '\n'
else:
br.tail = '\n%s' % br.tail
# Clean up extra empty lines at end of code blocks.
- pres = root.getiterator('pre')
+ pres = root.iter('pre')
for pre in pres:
if len(pre) and pre[0].tag == 'code':
pre[0].text = util.AtomicString(pre[0].text.rstrip() + '\n')
diff --git a/markdown/util.py b/markdown/util.py
index d3d48f0..2cb2317 100644
--- a/markdown/util.py
+++ b/markdown/util.py
@@ -1,23 +1,50 @@
-# -*- coding: utf-8 -*-
-from __future__ import unicode_literals
+"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+"""
+
import re
import sys
+from collections import namedtuple
+from functools import wraps
+import warnings
+import xml.etree.ElementTree
+from .pep562 import Pep562
+from itertools import count
+
+try:
+ from importlib import metadata
+except ImportError:
+ # <PY38 use backport
+ import importlib_metadata as metadata
+
+PY37 = (3, 7) <= sys.version_info
-"""
-Python 3 Stuff
-=============================================================================
-"""
-PY3 = sys.version_info[0] == 3
-
-if PY3: # pragma: no cover
- string_type = str
- text_type = str
- int2str = chr
-else: # pragma: no cover
- string_type = basestring # noqa
- text_type = unicode # noqa
- int2str = unichr # noqa
+# TODO: Remove deprecated variables in a future release.
+__deprecated__ = {
+ 'etree': ('xml.etree.ElementTree', xml.etree.ElementTree),
+ 'string_type': ('str', str),
+ 'text_type': ('str', str),
+ 'int2str': ('chr', chr),
+ 'iterrange': ('range', range)
+}
"""
@@ -26,15 +53,19 @@
"""
-BLOCK_LEVEL_ELEMENTS = re.compile(
- "^(p|div|h[1-6]|blockquote|pre|table|dl|ol|ul"
- "|script|noscript|form|fieldset|iframe|math"
- "|hr|hr/|style|li|dt|dd|thead|tbody"
- "|tr|th|td|section|footer|header|group|figure"
- "|figcaption|aside|article|canvas|output"
- "|progress|video|nav)$",
- re.IGNORECASE
-)
+BLOCK_LEVEL_ELEMENTS = [
+ # Elements which are invalid to wrap in a `<p>` tag.
+ # See https://w3c.github.io/html/grouping-content.html#the-p-element
+ 'address', 'article', 'aside', 'blockquote', 'details', 'div', 'dl',
+ 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3',
+ 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'main', 'menu', 'nav', 'ol',
+ 'p', 'pre', 'section', 'table', 'ul',
+ # Other elements which Markdown should not be mucking up the contents of.
+ 'canvas', 'colgroup', 'dd', 'body', 'dt', 'group', 'iframe', 'li', 'legend',
+ 'math', 'map', 'noscript', 'output', 'object', 'option', 'progress', 'script',
+ 'style', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'video'
+]
+
# Placeholders
STX = '\u0002' # Use STX ("Start of text") for start-of-placeholder
ETX = '\u0003' # Use ETX ("End of text") for end-of-placeholder
@@ -52,6 +83,8 @@
-----------------------------------------------------------------------------
"""
+# Only load extension entry_points once.
+INSTALLED_EXTENSIONS = metadata.entry_points().get('markdown.extensions', ())
RTL_BIDI_RANGES = (
('\u0590', '\u07FF'),
# Hebrew (0590-05FF), Arabic (0600-06FF),
@@ -60,23 +93,6 @@
('\u2D30', '\u2D7F') # Tifinagh
)
-# Extensions should use "markdown.util.etree" instead of "etree" (or do `from
-# markdown.util import etree`). Do not import it by yourself.
-
-try: # pragma: no cover
- # Is the C implementation of ElementTree available?
- import xml.etree.cElementTree as etree
- from xml.etree.ElementTree import Comment
- # Serializers (including ours) test with non-c Comment
- etree.test_comment = Comment
- if etree.VERSION < "1.0.5":
- raise RuntimeError("cElementTree version 1.0.5 or higher is required.")
-except (ImportError, RuntimeError): # pragma: no cover
- # Use the Python implementation of ElementTree?
- import xml.etree.ElementTree as etree
- if etree.VERSION < "1.1":
- raise RuntimeError("ElementTree version 1.1 or higher is required")
-
"""
AUXILIARY GLOBAL FUNCTIONS
@@ -84,10 +100,30 @@
"""
+def deprecated(message, stacklevel=2):
+ """
+ Raise a DeprecationWarning when wrapped function/method is called.
+
+ Borrowed from https://stackoverflow.com/a/48632082/866026
+ """
+ def deprecated_decorator(func):
+ @wraps(func)
+ def deprecated_func(*args, **kwargs):
+ warnings.warn(
+ "'{}' is deprecated. {}".format(func.__name__, message),
+ category=DeprecationWarning,
+ stacklevel=stacklevel
+ )
+ return func(*args, **kwargs)
+ return deprecated_func
+ return deprecated_decorator
+
+
+@deprecated("Use 'Markdown.is_block_level' instead.")
def isBlockLevel(tag):
"""Check if the tag is a block level HTML tag."""
- if isinstance(tag, string_type):
- return BLOCK_LEVEL_ELEMENTS.match(tag)
+ if isinstance(tag, str):
+ return tag.lower().rstrip('/') in BLOCK_LEVEL_ELEMENTS
# Some ElementTree tags are not strings, so return False.
return False
@@ -97,7 +133,7 @@
returns True or False. If preserve_none=True, returns True, False,
or None. If parsing was not successful, raises ValueError, or, if
fail_on_errors=False, returns None."""
- if not isinstance(value, string_type):
+ if not isinstance(value, str):
if preserve_none and value is None:
return value
return bool(value)
@@ -111,24 +147,57 @@
raise ValueError('Cannot parse bool value: %r' % value)
+def code_escape(text):
+ """Escape code."""
+ if "&" in text:
+ text = text.replace("&", "&amp;")
+ if "<" in text:
+ text = text.replace("<", "&lt;")
+ if ">" in text:
+ text = text.replace(">", "&gt;")
+ return text
+
+
+def _get_stack_depth(size=2):
+ """Get stack size for caller's frame.
+ See https://stackoverflow.com/a/47956089/866026
+ """
+ frame = sys._getframe(size)
+
+ for size in count(size):
+ frame = frame.f_back
+ if not frame:
+ return size
+
+
+def nearing_recursion_limit():
+ """Return true if current stack depth is within 100 of maximum limit."""
+ return sys.getrecursionlimit() - _get_stack_depth() < 100
+
+
"""
MISC AUXILIARY CLASSES
=============================================================================
"""
-class AtomicString(text_type):
+class AtomicString(str):
"""A string which should not be further processed."""
pass
-class Processor(object):
- def __init__(self, markdown_instance=None):
- if markdown_instance:
- self.markdown = markdown_instance
+class Processor:
+ def __init__(self, md=None):
+ self.md = md
+
+ @property
+ @deprecated("Use 'md' instead.")
+ def markdown(self):
+ # TODO: remove this later
+ return self.md
-class HtmlStash(object):
+class HtmlStash:
"""
This class is used for stashing HTML objects that we extract
in the beginning and replace with place-holders.
@@ -141,7 +210,7 @@
self.tag_counter = 0
self.tag_data = [] # list of dictionaries in the order tags appear
- def store(self, html, safe=False):
+ def store(self, html):
"""
Saves an HTML segment for later reinsertion. Returns a
placeholder string that needs to be inserted into the
@@ -150,12 +219,11 @@
Keyword arguments:
* html: an html segment
- * safe: label an html segment as safe for safemode
Returns : a placeholder string
"""
- self.rawHtmlBlocks.append((html, safe))
+ self.rawHtmlBlocks.append(html)
placeholder = self.get_placeholder(self.html_counter)
self.html_counter += 1
return placeholder
@@ -175,3 +243,240 @@
placeholder = TAG_PLACEHOLDER % str(self.tag_counter)
self.tag_counter += 1 # equal to the tag's index in self.tag_data
return placeholder
+
+
+# Used internally by `Registry` for each item in its sorted list.
+# Provides an easier to read API when editing the code later.
+# For example, `item.name` is more clear than `item[0]`.
+_PriorityItem = namedtuple('PriorityItem', ['name', 'priority'])
+
+
+class Registry:
+ """
+ A priority sorted registry.
+
+ A `Registry` instance provides two public methods to alter the data of the
+ registry: `register` and `deregister`. Use `register` to add items and
+ `deregister` to remove items. See each method for specifics.
+
+ When registering an item, a "name" and a "priority" must be provided. All
+ items are automatically sorted by "priority" from highest to lowest. The
+ "name" is used to remove ("deregister") and get items.
+
+ A `Registry` instance is like a list (which maintains order) when reading
+ data. You may iterate over the items, get an item and get a count (length)
+ of all items. You may also check that the registry contains an item.
+
+ When getting an item you may use either the index of the item or the
+ string-based "name". For example:
+
+ registry = Registry()
+ registry.register(SomeItem(), 'itemname', 20)
+ # Get the item by index
+ item = registry[0]
+ # Get the item by name
+ item = registry['itemname']
+
+ When checking that the registry contains an item, you may use either the
+ string-based "name", or a reference to the actual item. For example:
+
+ someitem = SomeItem()
+ registry.register(someitem, 'itemname', 20)
+ # Contains the name
+ assert 'itemname' in registry
+ # Contains the item instance
+ assert someitem in registry
+
+ The method `get_index_for_name` is also available to obtain the index of
+ an item using that item's assigned "name".
+ """
+
+ def __init__(self):
+ self._data = {}
+ self._priority = []
+ self._is_sorted = False
+
+ def __contains__(self, item):
+ if isinstance(item, str):
+ # Check if an item exists by this name.
+ return item in self._data.keys()
+ # Check if this instance exists.
+ return item in self._data.values()
+
+ def __iter__(self):
+ self._sort()
+ return iter([self._data[k] for k, p in self._priority])
+
+ def __getitem__(self, key):
+ self._sort()
+ if isinstance(key, slice):
+ data = Registry()
+ for k, p in self._priority[key]:
+ data.register(self._data[k], k, p)
+ return data
+ if isinstance(key, int):
+ return self._data[self._priority[key].name]
+ return self._data[key]
+
+ def __len__(self):
+ return len(self._priority)
+
+ def __repr__(self):
+ return '<{}({})>'.format(self.__class__.__name__, list(self))
+
+ def get_index_for_name(self, name):
+ """
+ Return the index of the given name.
+ """
+ if name in self:
+ self._sort()
+ return self._priority.index(
+ [x for x in self._priority if x.name == name][0]
+ )
+ raise ValueError('No item named "{}" exists.'.format(name))
+
+ def register(self, item, name, priority):
+ """
+ Add an item to the registry with the given name and priority.
+
+ Parameters:
+
+ * `item`: The item being registered.
+ * `name`: A string used to reference the item.
+ * `priority`: An integer or float used to sort against all items.
+
+ If an item is registered with a "name" which already exists, the
+ existing item is replaced with the new item. Tread carefully as the
+ old item is lost with no way to recover it. The new item will be
+ sorted according to its priority and will **not** retain the position
+ of the old item.
+ """
+ if name in self:
+ # Remove existing item of same name first
+ self.deregister(name)
+ self._is_sorted = False
+ self._data[name] = item
+ self._priority.append(_PriorityItem(name, priority))
+
+ def deregister(self, name, strict=True):
+ """
+ Remove an item from the registry.
+
+ Set `strict=False` to fail silently.
+ """
+ try:
+ index = self.get_index_for_name(name)
+ del self._priority[index]
+ del self._data[name]
+ except ValueError:
+ if strict:
+ raise
+
+ def _sort(self):
+ """
+ Sort the registry by priority from highest to lowest.
+
+ This method is called internally and should never be explicitly called.
+ """
+ if not self._is_sorted:
+ self._priority.sort(key=lambda item: item.priority, reverse=True)
+ self._is_sorted = True
+
+ # Deprecated Methods which provide a smooth transition from OrderedDict
+
+ def __setitem__(self, key, value):
+ """ Register item with priority 5 less than lowest existing priority. """
+ if isinstance(key, str):
+ warnings.warn(
+ 'Using setitem to register a processor or pattern is deprecated. '
+ 'Use the `register` method instead.',
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ if key in self:
+ # Key already exists, replace without altering priority
+ self._data[key] = value
+ return
+ if len(self) == 0:
+ # This is the first item. Set priority to 50.
+ priority = 50
+ else:
+ self._sort()
+ priority = self._priority[-1].priority - 5
+ self.register(value, key, priority)
+ else:
+ raise TypeError
+
+ def __delitem__(self, key):
+ """ Deregister an item by name. """
+ if key in self:
+ self.deregister(key)
+ warnings.warn(
+ 'Using del to remove a processor or pattern is deprecated. '
+ 'Use the `deregister` method instead.',
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ else:
+ raise KeyError('Cannot delete key {}, not registered.'.format(key))
+
+ def add(self, key, value, location):
+ """ Register a key by location. """
+ if len(self) == 0:
+ # This is the first item. Set priority to 50.
+ priority = 50
+ elif location == '_begin':
+ self._sort()
+ # Set priority 5 greater than highest existing priority
+ priority = self._priority[0].priority + 5
+ elif location == '_end':
+ self._sort()
+ # Set priority 5 less than lowest existing priority
+ priority = self._priority[-1].priority - 5
+ elif location.startswith('<') or location.startswith('>'):
+ # Set priority halfway between existing priorities.
+ i = self.get_index_for_name(location[1:])
+ if location.startswith('<'):
+ after = self._priority[i].priority
+ if i > 0:
+ before = self._priority[i-1].priority
+ else:
+ # Location is first item
+ before = after + 10
+ else:
+ # location.startswith('>')
+ before = self._priority[i].priority
+ if i < len(self) - 1:
+ after = self._priority[i+1].priority
+ else:
+ # location is last item
+ after = before - 10
+ priority = before - ((before - after) / 2)
+ else:
+ raise ValueError('Not a valid location: "%s". Location key '
+ 'must start with a ">" or "<".' % location)
+ self.register(value, key, priority)
+ warnings.warn(
+ 'Using the add method to register a processor or pattern is deprecated. '
+ 'Use the `register` method instead.',
+ DeprecationWarning,
+ stacklevel=2,
+ )
+
+
+def __getattr__(name):
+ """Get attribute."""
+
+ deprecated = __deprecated__.get(name)
+ if deprecated:
+ warnings.warn(
+ "'{}' is deprecated. Use '{}' instead.".format(name, deprecated[0]),
+ category=DeprecationWarning,
+ stacklevel=(3 if PY37 else 4)
+ )
+ return deprecated[1]
+ raise AttributeError("module '{}' has no attribute '{}'".format(__name__, name))
+
+
+if not PY37:
+ Pep562(__name__)