blob: 1e8d9dd288f00ea0d5e001a22eb5004001cbcbf5 [file] [log] [blame]
# markdown/serializers.py
#
# Add x/html serialization to ElementTree
# Taken from ElementTree 1.3 preview with slight modifications
5#
6# Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved.
7#
8# fredrik@pythonware.com
9# http://www.pythonware.com
10#
11# --------------------------------------------------------------------
12# The ElementTree toolkit is
13#
14# Copyright (c) 1999-2007 by Fredrik Lundh
15#
16# By obtaining, using, and/or copying this software and/or its
17# associated documentation, you agree that you have read, understood,
18# and will comply with the following terms and conditions:
19#
20# Permission to use, copy, modify, and distribute this software and
21# its associated documentation for any purpose and without fee is
22# hereby granted, provided that the above copyright notice appears in
23# all copies, and that both that copyright notice and this permission
24# notice appear in supporting documentation, and that the name of
25# Secret Labs AB or the author not be used in advertising or publicity
26# pertaining to distribution of the software without specific, written
27# prior permission.
28#
29# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
30# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
31# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
32# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
33# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
34# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
35# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
36# OF THIS SOFTWARE.
37# --------------------------------------------------------------------
38
39
40from __future__ import absolute_import
41from __future__ import unicode_literals
42from . import util
# Local aliases for the ElementTree names the serializer relies on.
ElementTree = util.etree.ElementTree
QName = util.etree.QName
# Use the ``test_comment`` attribute when the etree module exposes one,
# otherwise fall back to the regular ``Comment`` factory.
Comment = (  # pragma: no cover
    util.etree.test_comment
    if hasattr(util.etree, 'test_comment')
    else util.etree.Comment
)
PI = util.etree.PI
ProcessingInstruction = util.etree.ProcessingInstruction

# Public API of this module.
__all__ = ['to_html_string', 'to_xhtml_string']
53
# Tags serialized without a closing tag ("<br>" in HTML, "<br />" in XHTML).
# Each name must be its own element: a missing comma between two adjacent
# string literals silently concatenates them into a single bogus tag name.
HTML_EMPTY = set((
    "area", "base", "basefont", "br", "col", "frame", "hr",
    "img", "input", "isindex", "link", "meta", "param",
))
61
62_namespace_map = {
63 # "well-known" namespace prefixes
64 "http://www.w3.org/XML/1998/namespace": "xml",
65 "http://www.w3.org/1999/xhtml": "html",
66 "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
67 "http://schemas.xmlsoap.org/wsdl/": "wsdl",
68 # xml schema
69 "http://www.w3.org/2001/XMLSchema": "xs",
70 "http://www.w3.org/2001/XMLSchema-instance": "xsi",
71 # dublic core
72 "http://purl.org/dc/elements/1.1/": "dc",
73}
74
75
76def _raise_serialization_error(text): # pragma: no cover
77 raise TypeError(
78 "cannot serialize %r (type %s)" % (text, type(text).__name__)
79 )
80
81
82def _encode(text, encoding):
83 try:
84 return text.encode(encoding, "xmlcharrefreplace")
85 except (TypeError, AttributeError): # pragma: no cover
86 _raise_serialization_error(text)
87
88
89def _escape_cdata(text):
90 # escape character data
91 try:
92 # it's worth avoiding do-nothing calls for strings that are
93 # shorter than 500 character, or so. assume that's, by far,
94 # the most common case in most applications.
95 if "&" in text:
96 text = text.replace("&", "&")
97 if "<" in text:
98 text = text.replace("<", "&lt;")
99 if ">" in text:
100 text = text.replace(">", "&gt;")
101 return text
102 except (TypeError, AttributeError): # pragma: no cover
103 _raise_serialization_error(text)
104
105
106def _escape_attrib(text):
107 # escape attribute value
108 try:
109 if "&" in text:
110 text = text.replace("&", "&amp;")
111 if "<" in text:
112 text = text.replace("<", "&lt;")
113 if ">" in text:
114 text = text.replace(">", "&gt;")
115 if "\"" in text:
116 text = text.replace("\"", "&quot;")
117 if "\n" in text:
118 text = text.replace("\n", "&#10;")
119 return text
120 except (TypeError, AttributeError): # pragma: no cover
121 _raise_serialization_error(text)
122
123
124def _escape_attrib_html(text):
125 # escape attribute value
126 try:
127 if "&" in text:
128 text = text.replace("&", "&amp;")
129 if "<" in text:
130 text = text.replace("<", "&lt;")
131 if ">" in text:
132 text = text.replace(">", "&gt;")
133 if "\"" in text:
134 text = text.replace("\"", "&quot;")
135 return text
136 except (TypeError, AttributeError): # pragma: no cover
137 _raise_serialization_error(text)
138
139
def _serialize_html(write, elem, qnames, namespaces, format):
    """Recursively serialize *elem* and its children as HTML or XHTML.

    write -- callable invoked with each output string fragment.
    elem -- the element to serialize.
    qnames -- mapping from tag / attribute names to their serialized form.
    namespaces -- mapping of namespace URI to prefix whose ``xmlns``
        declarations are written on this element, or a false value for
        none (children are always serialized with ``None``).
    format -- ``"html"`` or ``"xhtml"``; selects boolean-attribute
        minimization and the empty-element syntax.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        tag = qnames[tag]
        if tag is None:
            # No tag of its own: emit only the text and the children.
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None, format)
        else:
            write("<" + tag)
            items = elem.items()
            if items or namespaces:
                items = sorted(items)  # lexical order
                for k, v in items:
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v)
                    if qnames[k] == v and format == 'html':
                        # handle boolean attributes
                        write(" %s" % v)
                    else:
                        write(" %s=\"%s\"" % (qnames[k], v))
                if namespaces:
                    # dict.items() returns a view without .sort() on
                    # Python 3, so build the ordered list with sorted().
                    ns_items = sorted(
                        namespaces.items(), key=lambda x: x[1]
                    )  # sort on prefix
                    for v, k in ns_items:
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (k, _escape_attrib(v)))
            if format == "xhtml" and tag.lower() in HTML_EMPTY:
                write(" />")
            else:
                write(">")
                if text:
                    if tag.lower() in ["script", "style"]:
                        # script/style content is written unescaped
                        write(text)
                    else:
                        write(_escape_cdata(text))
                for e in elem:
                    _serialize_html(write, e, qnames, None, format)
                if tag.lower() not in HTML_EMPTY:
                    write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))
193
194
def _write_html(root,
                encoding=None,
                default_namespace=None,
                format="html"):
    """Serialize the tree rooted at *root* and return the result.

    root -- root element of the tree; must not be ``None``.
    encoding -- when ``None`` the joined text is returned as is,
        otherwise it is passed through ``_encode`` with this encoding.
    default_namespace -- forwarded to ``_namespaces``.
    format -- ``"html"`` or ``"xhtml"``, forwarded to ``_serialize_html``.
    """
    assert root is not None
    data = []
    write = data.append
    qnames, namespaces = _namespaces(root, default_namespace)
    _serialize_html(write, root, qnames, namespaces, format)
    if encoding is None:
        return "".join(data)
    # _encode() needs the target encoding as its second argument;
    # calling it with the text alone raises TypeError.
    return _encode("".join(data), encoding)
208
209
210# --------------------------------------------------------------------
211# serialization support
212
def _namespaces(elem, default_namespace=None):
    """Identify the qualified names and namespaces used in a tree.

    Iterates over *elem* and the elements beneath it, registering every
    tag, every attribute key, and every ``QName`` used as an attribute
    value or as element text.

    Returns a ``(qnames, namespaces)`` tuple:

    qnames -- maps each registered name to its serialized
        ``prefix:local`` (or plain) form; ``None`` maps to ``None``.
    namespaces -- maps namespace URIs to the prefixes chosen for them.

    Raises ``ValueError`` when *default_namespace* is given and a
    non-qualified name is encountered; unsupported tag objects are
    reported through ``_raise_serialization_error``.
    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] == "{":
                uri, tag = qname[1:].split("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        prefix = "ns%d" % len(namespaces)
                    if prefix != "xml":
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = "%s:%s" % (prefix, tag)
                else:
                    qnames[qname] = tag  # default element
            else:
                if default_namespace:
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                    )
                qnames[qname] = qname
        except TypeError:  # pragma: no cover
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    try:
        iterate = elem.iter
    except AttributeError:
        iterate = elem.getiterator  # cET compatibility
    for node in iterate():
        tag = node.tag
        if isinstance(tag, QName):
            # A QName tag that is already registered is not an error, so
            # the membership test must not decide which branch is taken.
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, util.string_type):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = node.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces
275
276
def to_html_string(element):
    """Return *element* and its subtree serialized in ``"html"`` format."""
    root = ElementTree(element).getroot()
    return _write_html(root, format="html")
279
280
def to_xhtml_string(element):
    """Return *element* and its subtree serialized in ``"xhtml"`` format."""
    root = ElementTree(element).getroot()
    return _write_html(root, format="xhtml")