dpranke | b08af21 | 2015-10-06 17:44:36 -0700 | [diff] [blame^] | 1 | """ |
| 2 | PRE-PROCESSORS |
| 3 | ============================================================================= |
| 4 | |
| 5 | Preprocessors work on source text before we start doing anything too |
| 6 | complicated. |
| 7 | """ |
| 8 | |
| 9 | from __future__ import absolute_import |
| 10 | from __future__ import unicode_literals |
| 11 | from . import util |
| 12 | from . import odict |
| 13 | import re |
| 14 | |
| 15 | |
def build_preprocessors(md_instance, **kwargs):
    """
    Build the default set of preprocessors used by Markdown.

    Returns an ``odict.OrderedDict`` holding, in insertion order,
    ``normalize_whitespace``, ``html_block`` and ``reference``. The
    ``html_block`` entry is left out when ``md_instance.safeMode`` equals
    ``'escape'``. Extra keyword arguments are accepted and ignored.
    """
    keep_raw_html = md_instance.safeMode != 'escape'

    registry = odict.OrderedDict()
    registry['normalize_whitespace'] = NormalizeWhitespace(md_instance)
    if keep_raw_html:
        registry["html_block"] = HtmlBlockPreprocessor(md_instance)
    registry["reference"] = ReferencePreprocessor(md_instance)
    return registry
| 24 | |
| 25 | |
class Preprocessor(util.Processor):
    """
    Base class for preprocessors, which run after the text has been broken
    into lines.

    A preprocessor provides a "run" method that receives the document as a
    list of lines, changes it as needed, and hands back either that same
    list or a new one.

    Preprocessors must extend markdown.Preprocessor.

    """
    def run(self, lines):
        """
        Process the document; meant to be overridden by every subclass.

        `lines` is the document as a list of strings split by newlines. An
        override returns the (possibly modified) list of lines. This base
        implementation does nothing and returns None.

        """
        pass  # pragma: no cover
| 45 | |
| 46 | |
class NormalizeWhitespace(Preprocessor):
    """ Normalize whitespace for consistent parsing. """

    def run(self, lines):
        """
        Return the document as a list of lines with whitespace normalized.

        The lines are joined, every occurrence of ``util.STX`` and
        ``util.ETX`` is removed, ``\\r\\n`` and ``\\r`` become ``\\n``, two
        newlines are appended, tabs are expanded to
        ``self.markdown.tab_length`` columns, and any line that follows a
        newline and consists only of spaces is emptied.
        """
        text = '\n'.join(lines)
        for marker in (util.STX, util.ETX):
            text = text.replace(marker, "")
        # "\r\n" must be handled before the lone "\r" so that it collapses to
        # a single newline.
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        text = (text + "\n\n").expandtabs(self.markdown.tab_length)
        return re.sub(r'(?<=\n) +\n', '\n', text).split('\n')
| 57 | |
| 58 | |
class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval."""

    # Closing-tag templates, filled with the opening tag's name and tried in
    # this order by _get_right_tag.
    right_tag_patterns = ["</%s>", "%s>"]
    # One attribute in any of three spellings. Both regexes built from this
    # are compiled with re.VERBOSE, so the layout whitespace and the "#"
    # comments below are not part of the match (the spaces inside the
    # character classes still are).
    attrs_pattern = r"""
        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)   # attr="value"
        |                                                         # OR
        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)               # attr=value
        |                                                         # OR
        \s+(?P<attr2>[^>"'/= ]+)                                  # attr
        """
    # "<", a tag name, any number of attributes, then an optional "/" and an
    # optional ">".
    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % \
        attrs_pattern
    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
    # When true, elements carrying a "markdown" attribute are stored with
    # htmlStash.store_tag and their content is left in the text instead of
    # being stashed as one raw block (see run).
    markdown_in_raw = False

    def _get_left_tag(self, block):
        """
        Parse the opening tag at the start of ``block``.

        Returns ``(tag, length, attrs)``. When ``left_tag_re`` matches,
        ``tag`` is the matched name (case preserved), ``length`` is the
        length of the whole match and ``attrs`` maps each attribute name to
        its value ("" for a bare or empty attribute). Otherwise ``tag`` is
        the lower-cased text between the first character and the first ">",
        ``length`` is ``len(tag) + 2`` and ``attrs`` is empty.
        """
        opening = self.left_tag_re.match(block)
        if not opening:
            name = block[1:].split(">", 1)[0].lower()
            return name, len(name) + 2, {}

        parsed = {}
        attr_text = opening.group('attrs')
        if attr_text:
            # (name group, value group) for each alternative of
            # attrs_pattern; a bare attribute has no value group.
            spellings = (('attr', 'value'), ('attr1', 'value1'),
                         ('attr2', None))
            for found in self.attrs_re.finditer(attr_text):
                for name_group, value_group in spellings:
                    key = found.group(name_group)
                    if not key:
                        continue
                    value = found.group(value_group) if value_group else None
                    parsed[key.strip()] = value or ""
                    break
        return opening.group('tag'), len(opening.group(0)), parsed

    def _recursive_tagfind(self, ltag, rtag, start_index, block):
        """
        Find the ``rtag`` that closes the element whose content starts at
        ``start_index``, skipping over nested ``ltag`` ... ``rtag`` pairs.

        Returns the index just past that ``rtag``, or -1 when ``block`` has
        no matching ``rtag``.
        """
        cursor = start_index
        while True:
            close_at = block.find(rtag, cursor)
            if close_at == -1:
                return -1
            open_at = block.find(ltag, cursor)
            # if no ltag, or rtag found before another ltag, return index
            if open_at == -1 or open_at > close_at:
                return close_at + len(rtag)
            # another ltag found before rtag, use end of ltag as starting
            # point and search again
            nested_start = block.find('>', open_at) + 1
            cursor = self._recursive_tagfind(ltag, rtag, nested_start, block)
            if cursor == -1:
                # HTML potentially malformed- ltag has no corresponding
                # rtag
                return -1

    def _get_right_tag(self, left_tag, left_index, block):
        """
        Locate the closing tag for ``left_tag``, searching ``block`` from
        ``left_index``.

        Each template in ``right_tag_patterns`` is tried in turn; the first
        one found with an end index above 2 gives ``(name, end_index)``,
        where ``name`` is the closing tag without its leading "<" and
        trailing ">". When none is found, the result is the lower-cased
        slice ``block.rstrip()[-left_index:-1]`` together with
        ``len(block)``.
        """
        opener = "<%s" % left_tag
        for template in self.right_tag_patterns:
            closer = template % left_tag
            end_index = self._recursive_tagfind(
                opener, closer, left_index, block)
            if end_index > 2:
                return closer.lstrip("<").rstrip(">"), end_index
        return block.rstrip()[-left_index:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """
        Return True when ``right_tag`` is an acceptable close for
        ``left_tag``.

        That is the case when ``left_tag`` starts with "?", "@" or "%", when
        ``right_tag`` is "/" followed by ``left_tag``, or when both are "--"
        (the comment marker used by run). An empty ``left_tag`` is compared
        like any other name instead of raising IndexError.
        """
        # Slicing rather than indexing keeps an empty tag from raising.
        if left_tag[:1] in ('?', '@', '%'):  # handle PHP, etc.
            return True
        if right_tag == "/" + left_tag:
            return True
        return left_tag == "--" and right_tag == "--"

    def _is_oneliner(self, tag):
        """Return True when ``tag`` is 'hr' or 'hr/'."""
        return tag in ('hr', 'hr/')

    def _stringindex_to_listindex(self, stringindex, items):
        """
        Same effect as concatenating the strings in items,
        finding the character to which stringindex refers in that string,
        and returning the index of the item in which that character resides.

        An index at or past the end of the concatenation yields
        ``len(items)``, the slot just after the last item. ``items`` is only
        read, never modified.
        """
        if stringindex < 0:
            # Kept for backward compatibility: a negative index has always
            # produced -1.
            return -1
        consumed = 0
        for position, item in enumerate(items):
            consumed += len(item)
            if consumed > stringindex:
                return position
        return len(items)

    def _nested_markdown_in_html(self, items):
        """Find and process html child elements of the given element block."""
        # NOTE: items is modified in place while it is being enumerated, so
        # placeholders inserted below shift the entries visited afterwards.
        for i, item in enumerate(items):
            if self.left_tag_re.match(item):
                # Treat everything from this item onwards as one string to
                # find where the child element ends.
                left_tag, left_index, attrs = \
                    self._get_left_tag(''.join(items[i:]))
                right_tag, data_index = self._get_right_tag(
                    left_tag, left_index, ''.join(items[i:]))
                # Index, within the full items list, of the item holding the
                # character at data_index.
                right_listindex = \
                    self._stringindex_to_listindex(data_index, items[i:]) + i
                if 'markdown' in attrs:
                    items[i] = items[i][left_index:]  # remove opening tag
                    # NOTE(review): the last two arguments look like list
                    # positions of the element's start and end - confirm
                    # against htmlStash.store_tag.
                    placeholder = self.markdown.htmlStash.store_tag(
                        left_tag, attrs, i + 1, right_listindex + 1)
                    items.insert(i, placeholder)
                    if len(items) - right_listindex <= 1:  # last nest, no tail
                        right_listindex -= 1
                    items[right_listindex] = items[right_listindex][
                        :-len(right_tag) - 2]  # remove closing tag
                else:  # raw html
                    if len(items) - right_listindex <= 1:  # last element
                        right_listindex -= 1
                    # A child contained in a single item still has to
                    # consume that one item.
                    offset = 1 if i == right_listindex else 0
                    placeholder = self.markdown.htmlStash.store('\n\n'.join(
                        items[i:right_listindex + offset]))
                    del items[i:right_listindex + offset]
                    items.insert(i, placeholder)
        return items

    def _store_pending_items(self, items, left_tag, left_index, right_tag,
                             attrs, new_blocks):
        """Stash the blocks collected for one element and extend new_blocks."""
        if self.markdown_in_raw and 'markdown' in attrs:
            # Cut the opening tag off the first item and the closing tag
            # (name plus its two delimiter characters) off the last one.
            items[0] = items[0][left_index:]
            items[-1] = items[-1][:-len(right_tag) - 2]
            if items[len(items) - 1]:  # not a newline/empty string
                right_index = len(items) + 3
            else:
                right_index = len(items) + 2
            new_blocks.append(self.markdown.htmlStash.store_tag(
                left_tag, attrs, 0, right_index))
            placeholderslen = len(self.markdown.htmlStash.tag_data)
            new_blocks.extend(
                self._nested_markdown_in_html(items))
            # Number of tag_data entries added while handling the children.
            nests = len(self.markdown.htmlStash.tag_data) - \
                placeholderslen
            # NOTE(review): -1 - nests is taken to address the entry made by
            # the store_tag call above, assuming every store_tag call appends
            # exactly one tag_data entry - confirm against htmlStash.
            self.markdown.htmlStash.tag_data[-1 - nests][
                'right_index'] += nests - 2
        else:
            new_blocks.append(
                self.markdown.htmlStash.store('\n\n'.join(items)))

    def run(self, lines):
        """
        Replace html blocks in the document with values returned by
        ``self.markdown.htmlStash``.

        The text is split on blank lines into blocks. A block that opens
        with a block-level tag (per ``util.isBlockLevel``), a comment, or a
        "!", "?", "@" or "%" construct is handed to ``htmlStash.store``; an
        element that spans several blocks is collected in ``items`` until
        its closing tag turns up. Everything else is passed through
        unchanged. Returns the resulting document as a list of lines.
        """
        text = "\n".join(lines)
        new_blocks = []
        text = text.rsplit("\n\n")
        items = []
        left_tag = ''
        right_tag = ''
        # Bound up front so they are always defined when passed to
        # _store_pending_items; overwritten as soon as a tag is parsed.
        left_index = 0
        attrs = {}
        in_tag = False  # flag

        while text:
            block = text[0]
            if block.startswith("\n"):
                block = block[1:]
            text = text[1:]

            if block.startswith("\n"):
                block = block[1:]

            if not in_tag:
                if block.startswith("<") and len(block.strip()) > 1:

                    if block[1:4] == "!--":
                        # is a comment block
                        left_tag, left_index, attrs = "--", 2, {}
                    else:
                        left_tag, left_index, attrs = self._get_left_tag(block)
                    right_tag, data_index = self._get_right_tag(left_tag,
                                                                left_index,
                                                                block)
                    # keep checking conditions below and maybe just append

                    # Text following the closing tag goes back to the front
                    # of the queue to be handled as a block of its own.
                    if data_index < len(block) and (util.isBlockLevel(left_tag) or left_tag == '--'):
                        text.insert(0, block[data_index:])
                        block = block[:data_index]

                    # Neither block-level nor a "!", "?", "@", "%" construct:
                    # leave the block in the text untouched.
                    if not (util.isBlockLevel(left_tag) or block[1] in ["!", "?", "@", "%"]):
                        new_blocks.append(block)
                        continue

                    if self._is_oneliner(left_tag):
                        new_blocks.append(block.strip())
                        continue

                    if block.rstrip().endswith(">") \
                            and self._equal_tags(left_tag, right_tag):
                        # The element opens and closes within this block.
                        if self.markdown_in_raw and 'markdown' in attrs:
                            block = block[left_index:-len(right_tag) - 2]
                            new_blocks.append(self.markdown.htmlStash.
                                              store_tag(left_tag, attrs, 0, 2))
                            new_blocks.extend([block])
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip()))
                        continue
                    else:
                        # if is block level tag and is not complete
                        if (not self._equal_tags(left_tag, right_tag)) and \
                           (util.isBlockLevel(left_tag) or left_tag == "--"):
                            items.append(block.strip())
                            in_tag = True
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip())
                            )
                        continue

                else:
                    new_blocks.append(block)

            else:
                items.append(block)

                right_tag, data_index = self._get_right_tag(left_tag, 0, block)

                if self._equal_tags(left_tag, right_tag):
                    # if find closing tag

                    if data_index < len(block):
                        # we have more text after right_tag
                        items[-1] = block[:data_index]
                        text.insert(0, block[data_index:])

                    in_tag = False
                    self._store_pending_items(
                        items, left_tag, left_index, right_tag, attrs,
                        new_blocks)
                    items = []

        if items:
            # The input ended inside an element that was never closed: stash
            # what was collected so far.
            self._store_pending_items(
                items, left_tag, left_index, right_tag, attrs, new_blocks)
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")
| 315 | |
| 316 | |
class ReferencePreprocessor(Preprocessor):
    """ Remove reference definitions from text and store for later use. """

    # Optional title: "...", '...' or (...), with surrounding spaces allowed.
    TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
    # A definition line: up to three spaces, "[id]:", the link, then an
    # optional title on the same line.
    RE = re.compile(
        r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL
    )
    # A line consisting of nothing but a title.
    TITLE_RE = re.compile(r'^%s$' % TITLE)

    def run(self, lines):
        """
        Collect reference definitions and return the remaining lines.

        Every line matching ``RE`` is dropped from the output and recorded
        as ``self.markdown.references[id] = (link, title)``, where ``id`` is
        stripped and lower-cased and ``link`` has leading "<" and trailing
        ">" removed. When the definition has no title of its own and the
        following line is a bare title, that line is consumed as the title.
        ``title`` is None when no title is found.

        ``lines`` is only read; a definition on the very last line is
        handled without raising IndexError.
        """
        new_text = []
        total = len(lines)
        position = 0
        while position < total:
            line = lines[position]
            position += 1
            m = self.RE.match(line)
            if not m:
                new_text.append(line)
                continue

            ref_id = m.group(1).strip().lower()
            link = m.group(2).lstrip('<').rstrip('>')
            # Groups 5-7 are the contents of "...", '...' and (...).
            title = m.group(5) or m.group(6) or m.group(7)
            if not title and position < total:
                # Check next line for title
                tm = self.TITLE_RE.match(lines[position])
                if tm:
                    position += 1
                    title = tm.group(2) or tm.group(3) or tm.group(4)
            self.markdown.references[ref_id] = (link, title)

        return new_text