# Add x/html serialization to `ElementTree`
# Taken from ElementTree 1.3 preview with slight modifications
#
# Copyright (c) 1999-2007 by Fredrik Lundh.  All rights reserved.
#
# fredrik@pythonware.com
# https://www.pythonware.com/
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2007 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------

"""Serialize `xml.etree.ElementTree` elements to HTML or XHTML strings.

The public entry points are `to_html_string` and `to_xhtml_string`. Both
walk an element tree and return the markup as a single string; they differ
only in how empty elements and boolean attributes are written.
"""

from xml.etree.ElementTree import ProcessingInstruction
from xml.etree.ElementTree import Comment, ElementTree, QName
import re

__all__ = ['to_html_string', 'to_xhtml_string']

# Elements that never get a closing tag (and are written as `<tag />` in
# XHTML output).
HTML_EMPTY = {
    "area", "base", "basefont", "br", "col", "frame", "hr", "img", "input",
    "isindex", "link", "meta", "param",
}

# Matches an ampersand that is NOT the start of a named, decimal or
# hexadecimal character reference, so existing entities are left alone.
RE_AMP = re.compile(r'&(?!(?:\#[0-9]+|\#x[0-9a-f]+|[0-9a-z]+);)', re.I)

# Elements whose text content is written out verbatim, without escaping.
_RAW_TEXT_TAGS = ("script", "style")


def _raise_serialization_error(text):  # pragma: no cover
    """Raise a `TypeError` reporting that `text` cannot be serialized."""
    raise TypeError(
        "cannot serialize {!r} (type {})".format(text, type(text).__name__)
    )


def _escape_cdata(text):
    """Return `text` escaped for use as element character data.

    Escapes `&` (unless it already starts a character reference), `<`
    and `>`. Raises `TypeError` if `text` is not string-like.
    """
    try:
        # Each replacement is guarded by a cheap membership test so that
        # the common case (nothing to escape) does no extra work.
        if "&" in text:
            # Only replace & when not part of an entity
            text = RE_AMP.sub('&amp;', text)
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        return text
    except (TypeError, AttributeError):  # pragma: no cover
        _raise_serialization_error(text)


def _escape_attrib(text):
    """Return `text` escaped for use inside a double-quoted XML attribute.

    In addition to the character-data escapes, double quotes become
    `&quot;` and newlines become the character reference `&#10;` so they
    survive attribute-value normalization. Raises `TypeError` if `text`
    is not string-like.
    """
    try:
        if "&" in text:
            # Only replace & when not part of an entity
            text = RE_AMP.sub('&amp;', text)
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        if "\n" in text:
            text = text.replace("\n", "&#10;")
        return text
    except (TypeError, AttributeError):  # pragma: no cover
        _raise_serialization_error(text)


def _escape_attrib_html(text):
    """Return `text` escaped for use inside a double-quoted HTML attribute.

    Same as `_escape_attrib` except that newlines are left untouched.
    Raises `TypeError` if `text` is not string-like.
    """
    try:
        if "&" in text:
            # Only replace & when not part of an entity
            text = RE_AMP.sub('&amp;', text)
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        return text
    except (TypeError, AttributeError):  # pragma: no cover
        _raise_serialization_error(text)


def _serialize_attributes(write, elem, format):
    """Write the attributes of `elem`, sorted lexically by (name, value)."""
    for k, v in sorted(elem.items()):
        if isinstance(k, QName):
            # Assume a text only `QName`
            k = k.text
        if isinstance(v, QName):
            # Assume a text only `QName`
            v = v.text
        else:
            v = _escape_attrib_html(v)
        if k == v and format == 'html':
            # handle boolean attributes: `checked="checked"` -> `checked`
            write(" %s" % v)
        else:
            write(' {}="{}"'.format(k, v))


def _serialize_html(write, elem, format):
    """Recursively serialize `elem` and its tail by calling `write`.

    `write` is called with successive string fragments. `format` is
    either `"html"` or `"xhtml"`; it controls how empty elements and
    boolean attributes are written.

    Raises `ValueError` if the element's tag is a `QName` that carries no
    `{uri}` prefix.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    elif tag is None:
        # A tag-less element contributes only its text and children.
        if text:
            write(_escape_cdata(text))
        for e in elem:
            _serialize_html(write, e, format)
    else:
        namespace_uri = None
        if isinstance(tag, QName):
            # `QNAME` objects store their data as a string: `{uri}tag`
            if tag.text[:1] == "{":
                namespace_uri, tag = tag.text[1:].split("}", 1)
            else:
                raise ValueError('QName objects must define a tag.')
        write("<" + tag)
        _serialize_attributes(write, elem, format)
        if namespace_uri:
            write(' xmlns="%s"' % (_escape_attrib(namespace_uri)))
        lowered = tag.lower()
        if format == "xhtml" and lowered in HTML_EMPTY:
            write(" />")
        else:
            write(">")
            if text:
                if lowered in _RAW_TEXT_TAGS:
                    # Script and style content must not be entity-escaped.
                    write(text)
                else:
                    write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, format)
            if lowered not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))


def _write_html(root, format="html"):
    """Serialize the tree rooted at `root` and return it as one string."""
    # Kept as an assertion so the exception type seen by callers passing
    # `None` does not change.
    assert root is not None
    data = []
    _serialize_html(data.append, root, format)
    return "".join(data)


# --------------------------------------------------------------------
# public functions


def to_html_string(element):
    """Return `element` and its descendants serialized as HTML."""
    return _write_html(ElementTree(element).getroot(), format="html")


def to_xhtml_string(element):
    """Return `element` and its descendants serialized as XHTML."""
    return _write_html(ElementTree(element).getroot(), format="xhtml")