# markdown/serializers.py
| 2 | # |
# Add x/html serialization to ElementTree
| 4 | # Taken from ElementTree 1.3 preview with slight modifications |
| 5 | # |
| 6 | # Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved. |
| 7 | # |
| 8 | # fredrik@pythonware.com |
| 9 | # http://www.pythonware.com |
| 10 | # |
| 11 | # -------------------------------------------------------------------- |
| 12 | # The ElementTree toolkit is |
| 13 | # |
| 14 | # Copyright (c) 1999-2007 by Fredrik Lundh |
| 15 | # |
| 16 | # By obtaining, using, and/or copying this software and/or its |
| 17 | # associated documentation, you agree that you have read, understood, |
| 18 | # and will comply with the following terms and conditions: |
| 19 | # |
| 20 | # Permission to use, copy, modify, and distribute this software and |
| 21 | # its associated documentation for any purpose and without fee is |
| 22 | # hereby granted, provided that the above copyright notice appears in |
| 23 | # all copies, and that both that copyright notice and this permission |
| 24 | # notice appear in supporting documentation, and that the name of |
| 25 | # Secret Labs AB or the author not be used in advertising or publicity |
| 26 | # pertaining to distribution of the software without specific, written |
| 27 | # prior permission. |
| 28 | # |
| 29 | # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD |
| 30 | # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- |
| 31 | # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR |
| 32 | # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY |
| 33 | # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, |
| 34 | # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS |
| 35 | # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE |
| 36 | # OF THIS SOFTWARE. |
| 37 | # -------------------------------------------------------------------- |
| 38 | |
| 39 | |
| 40 | from __future__ import absolute_import |
| 41 | from __future__ import unicode_literals |
| 42 | from . import util |
# Local aliases for the ElementTree API exposed through ``util.etree``.
ElementTree = util.etree.ElementTree
QName = util.etree.QName
# Use ``util.etree.test_comment`` as the comment factory when the etree
# object provides it; otherwise use the regular ``Comment`` factory.
Comment = (  # pragma: no cover
    util.etree.test_comment
    if hasattr(util.etree, 'test_comment')
    else util.etree.Comment
)
PI = util.etree.PI
ProcessingInstruction = util.etree.ProcessingInstruction

__all__ = ['to_html_string', 'to_xhtml_string']
| 53 | |
# Tag names of HTML elements that are serialized without a closing tag.
# Kept as a set for fast membership tests.
# NOTE: "meta" and "param" must be separate entries; a missing comma would
# silently concatenate them into the single string "metaparam".
HTML_EMPTY = set((
    "area", "base", "basefont", "br", "col", "frame", "hr",
    "img", "input", "isindex", "link", "meta", "param",
))
| 61 | |
| 62 | _namespace_map = { |
| 63 | # "well-known" namespace prefixes |
| 64 | "http://www.w3.org/XML/1998/namespace": "xml", |
| 65 | "http://www.w3.org/1999/xhtml": "html", |
| 66 | "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf", |
| 67 | "http://schemas.xmlsoap.org/wsdl/": "wsdl", |
| 68 | # xml schema |
| 69 | "http://www.w3.org/2001/XMLSchema": "xs", |
| 70 | "http://www.w3.org/2001/XMLSchema-instance": "xsi", |
| 71 | # dublic core |
| 72 | "http://purl.org/dc/elements/1.1/": "dc", |
| 73 | } |
| 74 | |
| 75 | |
| 76 | def _raise_serialization_error(text): # pragma: no cover |
| 77 | raise TypeError( |
| 78 | "cannot serialize %r (type %s)" % (text, type(text).__name__) |
| 79 | ) |
| 80 | |
| 81 | |
| 82 | def _encode(text, encoding): |
| 83 | try: |
| 84 | return text.encode(encoding, "xmlcharrefreplace") |
| 85 | except (TypeError, AttributeError): # pragma: no cover |
| 86 | _raise_serialization_error(text) |
| 87 | |
| 88 | |
| 89 | def _escape_cdata(text): |
| 90 | # escape character data |
| 91 | try: |
| 92 | # it's worth avoiding do-nothing calls for strings that are |
| 93 | # shorter than 500 character, or so. assume that's, by far, |
| 94 | # the most common case in most applications. |
| 95 | if "&" in text: |
| 96 | text = text.replace("&", "&") |
| 97 | if "<" in text: |
| 98 | text = text.replace("<", "<") |
| 99 | if ">" in text: |
| 100 | text = text.replace(">", ">") |
| 101 | return text |
| 102 | except (TypeError, AttributeError): # pragma: no cover |
| 103 | _raise_serialization_error(text) |
| 104 | |
| 105 | |
| 106 | def _escape_attrib(text): |
| 107 | # escape attribute value |
| 108 | try: |
| 109 | if "&" in text: |
| 110 | text = text.replace("&", "&") |
| 111 | if "<" in text: |
| 112 | text = text.replace("<", "<") |
| 113 | if ">" in text: |
| 114 | text = text.replace(">", ">") |
| 115 | if "\"" in text: |
| 116 | text = text.replace("\"", """) |
| 117 | if "\n" in text: |
| 118 | text = text.replace("\n", " ") |
| 119 | return text |
| 120 | except (TypeError, AttributeError): # pragma: no cover |
| 121 | _raise_serialization_error(text) |
| 122 | |
| 123 | |
| 124 | def _escape_attrib_html(text): |
| 125 | # escape attribute value |
| 126 | try: |
| 127 | if "&" in text: |
| 128 | text = text.replace("&", "&") |
| 129 | if "<" in text: |
| 130 | text = text.replace("<", "<") |
| 131 | if ">" in text: |
| 132 | text = text.replace(">", ">") |
| 133 | if "\"" in text: |
| 134 | text = text.replace("\"", """) |
| 135 | return text |
| 136 | except (TypeError, AttributeError): # pragma: no cover |
| 137 | _raise_serialization_error(text) |
| 138 | |
| 139 | |
def _serialize_html(write, elem, qnames, namespaces, format):
    """Recursively serialize *elem* and its subtree as HTML or XHTML.

    Arguments:
        write: callable invoked with each output fragment (a string).
        elem: the element to serialize.
        qnames: mapping from tag/attribute names to their serialized
            form, as produced by ``_namespaces``.
        namespaces: mapping of namespace URI -> prefix to declare on this
            element, or a false value for none. Child elements are always
            serialized with ``None`` so declarations appear only once.
        format: ``"html"`` or ``"xhtml"``; selects boolean-attribute
            minimization (html) and ``" />"`` empty-tag syntax (xhtml).
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        tag = qnames[tag]
        if tag is None:
            # No tag of its own: emit only the text and the children.
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None, format)
        else:
            write("<" + tag)
            items = elem.items()
            if items or namespaces:
                items = sorted(items)  # lexical order
                for k, v in items:
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v)
                    if qnames[k] == v and format == 'html':
                        # handle boolean attributes
                        write(" %s" % v)
                    else:
                        write(" %s=\"%s\"" % (qnames[k], v))
                if namespaces:
                    # sorted() rather than .items().sort(): on Python 3
                    # dict.items() returns a view without a sort() method.
                    ns_items = sorted(
                        namespaces.items(), key=lambda x: x[1]
                    )  # sort on prefix
                    for uri, prefix in ns_items:
                        if prefix:
                            prefix = ":" + prefix
                        write(" xmlns%s=\"%s\"" % (
                            prefix, _escape_attrib(uri)
                        ))
            if format == "xhtml" and tag.lower() in HTML_EMPTY:
                write(" />")
            else:
                write(">")
                if text:
                    if tag.lower() in ["script", "style"]:
                        # script/style content is written unescaped
                        write(text)
                    else:
                        write(_escape_cdata(text))
                for e in elem:
                    _serialize_html(write, e, qnames, None, format)
                if tag.lower() not in HTML_EMPTY:
                    write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))
| 193 | |
| 194 | |
def _write_html(root,
                encoding=None,
                default_namespace=None,
                format="html"):
    """Serialize the tree rooted at *root* and return the result.

    Arguments:
        root: root element of the tree; must not be ``None``.
        encoding: when ``None`` (the default) the joined text is returned
            as is; otherwise it is passed through ``_encode`` with this
            encoding.
        default_namespace: forwarded to ``_namespaces``.
        format: ``"html"`` or ``"xhtml"``, forwarded to
            ``_serialize_html``.
    """
    assert root is not None
    data = []
    write = data.append
    qnames, namespaces = _namespaces(root, default_namespace)
    _serialize_html(write, root, qnames, namespaces, format)
    if encoding is None:
        return "".join(data)
    else:
        # _encode() requires the target encoding as its second argument.
        return _encode("".join(data), encoding)
| 208 | |
| 209 | |
| 210 | # -------------------------------------------------------------------- |
| 211 | # serialization support |
| 212 | |
def _namespaces(elem, default_namespace=None):
    """Identify the qualified names and namespaces used in a tree.

    Walks *elem* and all of its descendants, registering every tag,
    attribute key, and QName-valued attribute value or text.

    Arguments:
        elem: root element of the tree to scan.
        default_namespace: optional namespace URI mapped to the empty
            prefix. When given, non-qualified names raise ``ValueError``.

    Returns:
        A ``(qnames, namespaces)`` tuple: ``qnames`` maps each name to
        its serialized ``prefix:local`` (or plain) form, ``namespaces``
        maps namespace URIs to prefixes.
    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] == "{":
                uri, tag = qname[1:].split("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        prefix = "ns%d" % len(namespaces)
                    if prefix != "xml":
                        # the "xml" prefix is never recorded for declaration
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = "%s:%s" % (prefix, tag)
                else:
                    qnames[qname] = tag  # default element
            else:
                if default_namespace:
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                    )
                qnames[qname] = qname
        except TypeError:  # pragma: no cover
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    try:
        iterate = elem.iter
    except AttributeError:
        iterate = elem.getiterator  # cET compatibility
    for node in iterate():
        tag = node.tag
        if isinstance(tag, QName):
            # Nested check: a QName tag that is already registered must
            # not fall through to the "unserializable tag" branch below.
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, util.string_type):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = node.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces
| 275 | |
| 276 | |
def to_html_string(element):
    """Return *element* and its subtree serialized in ``"html"`` format."""
    root = ElementTree(element).getroot()
    return _write_html(root, format="html")
| 279 | |
| 280 | |
def to_xhtml_string(element):
    """Return *element* and its subtree serialized in ``"xhtml"`` format."""
    root = ElementTree(element).getroot()
    return _write_html(root, format="xhtml")