Warning

This document is for an old release of Galaxy. You can alternatively view this page in the latest release if it exists or view the top of the latest release's documentation.

Source code for galaxy.util.sanitize_html

"""
HTML Sanitizer (ripped from feedparser)
"""

import re
import sgmllib

from six import unichr

from galaxy.util import unicodify


# reversable htmlentitydefs mappings for Python 2.2
try:
    from htmlentitydefs import name2codepoint, codepoint2name
except:
    import htmlentitydefs
    name2codepoint = {}
    codepoint2name = {}
    for (name, codepoint) in htmlentitydefs.entitydefs.iteritems():
        if codepoint.startswith('&#'):
            codepoint = unichr(int(codepoint[2:-1]))
        name2codepoint[name] = ord(codepoint)
        codepoint2name[ord(codepoint)] = name


_cp1252 = {
    unichr(128): unichr(8364),  # euro sign
    unichr(130): unichr(8218),  # single low-9 quotation mark
    unichr(131): unichr(402),  # latin small letter f with hook
    unichr(132): unichr(8222),  # double low-9 quotation mark
    unichr(133): unichr(8230),  # horizontal ellipsis
    unichr(134): unichr(8224),  # dagger
    unichr(135): unichr(8225),  # double dagger
    unichr(136): unichr(710),  # modifier letter circumflex accent
    unichr(137): unichr(8240),  # per mille sign
    unichr(138): unichr(352),  # latin capital letter s with caron
    unichr(139): unichr(8249),  # single left-pointing angle quotation mark
    unichr(140): unichr(338),  # latin capital ligature oe
    unichr(142): unichr(381),  # latin capital letter z with caron
    unichr(145): unichr(8216),  # left single quotation mark
    unichr(146): unichr(8217),  # right single quotation mark
    unichr(147): unichr(8220),  # left double quotation mark
    unichr(148): unichr(8221),  # right double quotation mark
    unichr(149): unichr(8226),  # bullet
    unichr(150): unichr(8211),  # en dash
    unichr(151): unichr(8212),  # em dash
    unichr(152): unichr(732),  # small tilde
    unichr(153): unichr(8482),  # trade mark sign
    unichr(154): unichr(353),  # latin small letter s with caron
    unichr(155): unichr(8250),  # single right-pointing angle quotation mark
    unichr(156): unichr(339),  # latin small ligature oe
    unichr(158): unichr(382),  # latin small letter z with caron
    unichr(159): unichr(376)}  # latin capital letter y with diaeresis


class _BaseHTMLProcessor(sgmllib.SGMLParser):
    special = re.compile('''[<>'"]''')
    bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
                           'img', 'input', 'isindex', 'link', 'meta', 'param']

    def __init__(self, encoding, type):
        self.encoding = encoding
        self.type = type
        # if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
        sgmllib.SGMLParser.__init__(self)

    def reset(self):
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def _shorttag_replace(self, match):
        tag = match.group(1)
        if tag in self.elements_no_end_tag:
            return '<' + tag + ' />'
        else:
            return '<' + tag + '></' + tag + '>'

    def parse_starttag(self, i):
        j = sgmllib.SGMLParser.parse_starttag(self, i)
        if self.type == 'application/xhtml+xml':
            if j > 2 and self.rawdata[j - 2:j] == '/>':
                self.unknown_endtag(self.lasttag)
        return j

    def feed(self, data):
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
        data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
        sgmllib.SGMLParser.feed(self, data)
        sgmllib.SGMLParser.close(self)

    def normalize_attrs(self, attrs):
        if not attrs:
            return attrs
        # utility method to be called by descendants
        attrs = dict([(k.lower(), v) for k, v in attrs]).items()
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        attrs.sort()
        return attrs

    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
        uattrs = []
        strattrs = ''
        if attrs:
            for key, value in attrs:
                value = value.replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;')
                value = self.bare_ampersand.sub("&amp;", value)
                uattrs.append((key, value))
            strattrs = ''.join([' %s="%s"' % (k, v) for k, v in uattrs])
        if tag in self.elements_no_end_tag:
            self.pieces.append('<%s%s />' % (tag, strattrs))
        else:
            self.pieces.append('<%s%s>' % (tag, strattrs))

    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be 'pre'
        # Reconstruct the original end tag.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%(tag)s>" % locals())

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        # Reconstruct the original character reference.
        if ref.startswith('x'):
            value = unichr(int(ref[1:], 16))
        else:
            value = unichr(int(ref))

        if value in _cp1252.keys():
            self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
        else:
            self.pieces.append('&#%(ref)s;' % locals())

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        # Reconstruct the original entity reference.
        if ref in name2codepoint:
            self.pieces.append('&%(ref)s;' % locals())
        else:
            self.pieces.append('&amp;%(ref)s' % locals())

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        self.pieces.append(text)

    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        self.pieces.append('<!--%(text)s-->' % locals())

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append('<?%(text)s>' % locals())

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self.pieces.append('<!%(text)s>' % locals())

    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match

    def _scan_name(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.handle_data(rawdata)
#            self.updatepos(declstartpos, i)
            return None, -1

    def convert_charref(self, name):
        return '&#%s;' % name

    def convert_entityref(self, name):
        return '&%s;' % name

    def output(self):
        '''Return processed HTML as a single string'''
        return ''.join(self.pieces)


class _HTMLSanitizer(_BaseHTMLProcessor):
    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'article',
                           'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas',
                           'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'command',
                           'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir',
                           'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 'figure', 'footer',
                           'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
                           'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map',
                           'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup',
                           'option', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
                           'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', 'sub',
                           'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead',
                           'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
                             'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
                             'background', 'balance', 'bgcolor', 'bgproperties', 'border',
                             'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
                             'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
                             'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols',
                             'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
                             'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
                             'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
                             'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
                             'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
                             'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
                             'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
                             'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
                             'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
                             'prompt', 'pqg', 'radiogroup', 'readonly', 'rel', 'repeat-max',
                             'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 'rows',
                             'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src',
                             'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 'template',
                             'title', 'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign',
                             'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap',
                             'xml:lang']

    unacceptable_elements_with_end_tag = ['script', 'applet', 'style']

    acceptable_css_properties = ['azimuth', 'background-color',
                                 'border-bottom-color', 'border-collapse', 'border-color',
                                 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
                                 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
                                 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
                                 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
                                 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
                                 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
                                 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
                                 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
                                 'white-space', 'width']

    # survey of common keywords found in feeds
    acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
                               'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
                               'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
                               'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
                               'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
                               'transparent', 'underline', 'white', 'yellow']

    valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
                                  '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')

    mathml_elements = ['annotation', 'annotation-xml', 'maction', 'math',
                       'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded',
                       'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle',
                       'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
                       'munderover', 'none', 'semantics']

    mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
                         'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth',
                         'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows',
                         'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness',
                         'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant',
                         'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign',
                         'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
                         'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href',
                         'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink']

    # svgtiny - foreignObject + linearGradient + radialGradient + stop
    svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
                    'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject',
                    'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
                    'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath',
                    'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop',
                    'svg', 'switch', 'text', 'title', 'tspan', 'use']

    # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
    svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
                      'arabic-form', 'ascent', 'attributeName', 'attributeType',
                      'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
                      'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
                      'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
                      'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
                      'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
                      'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
                      'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines',
                      'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid',
                      'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max',
                      'min', 'name', 'offset', 'opacity', 'orient', 'origin',
                      'overline-position', 'overline-thickness', 'panose-1', 'path',
                      'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY',
                      'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
                      'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
                      'stop-color', 'stop-opacity', 'strikethrough-position',
                      'strikethrough-thickness', 'stroke', 'stroke-dasharray',
                      'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
                      'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage',
                      'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
                      'underline-position', 'underline-thickness', 'unicode', 'unicode-range',
                      'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width',
                      'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
                      'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
                      'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1',
                      'y2', 'zoomAndPan']

    svg_attr_map = None
    svg_elem_map = None

    acceptable_svg_properties = ['fill', 'fill-opacity', 'fill-rule',
                                 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
                                 'stroke-opacity']

    def reset(self):
        _BaseHTMLProcessor.reset(self)
        self.unacceptablestack = 0
        self.mathmlOK = 0
        self.svgOK = 0

    def unknown_starttag(self, tag, attrs):
        acceptable_attributes = self.acceptable_attributes
        keymap = {}
        if tag not in self.acceptable_elements or self.svgOK:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1

            # not otherwise acceptable, perhaps it is MathML or SVG?
            if tag == 'math' and ('xmlns', 'http://www.w3.org/1998/Math/MathML') in attrs:
                self.mathmlOK += 1
            if tag == 'svg' and ('xmlns', 'http://www.w3.org/2000/svg') in attrs:
                self.svgOK += 1

            # chose acceptable attributes based on tag class, else bail
            if self.mathmlOK and tag in self.mathml_elements:
                acceptable_attributes = self.mathml_attributes
            elif self.svgOK and tag in self.svg_elements:
                # for most vocabularies, lowercasing is a good idea.  Many
                # svg elements, however, are camel case
                if not self.svg_attr_map:
                    lower = [attr.lower() for attr in self.svg_attributes]
                    mix = [a for a in self.svg_attributes if a not in lower]
                    self.svg_attributes = lower
                    self.svg_attr_map = dict([(a.lower(), a) for a in mix])

                    lower = [attr.lower() for attr in self.svg_elements]
                    mix = [a for a in self.svg_elements if a not in lower]
                    self.svg_elements = lower
                    self.svg_elem_map = dict([(a.lower(), a) for a in mix])
                acceptable_attributes = self.svg_attributes
                tag = self.svg_elem_map.get(tag, tag)
                keymap = self.svg_attr_map
            elif tag not in self.acceptable_elements:
                return

        # declare xlink namespace, if needed
        if self.mathmlOK or self.svgOK:
            if filter(lambda n, v: n.startswith('xlink:'), attrs):
                if not ('xmlns:xlink', 'http://www.w3.org/1999/xlink') in attrs:
                    attrs.append(('xmlns:xlink', 'http://www.w3.org/1999/xlink'))

        clean_attrs = []
        for key, value in self.normalize_attrs(attrs):
            if key == "href" and value.strip().startswith("javascript"):
                pass
            elif key in acceptable_attributes:
                key = keymap.get(key, key)
                clean_attrs.append((key, value))
            elif key == 'style':
                pass
                # clean_value = self.sanitize_style(value)
                # if clean_value: clean_attrs.append((key,clean_value))
        _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs)

    def unknown_endtag(self, tag):
        if tag not in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack -= 1
            if self.mathmlOK and tag in self.mathml_elements:
                if tag == 'math' and self.mathmlOK:
                    self.mathmlOK -= 1
            elif self.svgOK and tag in self.svg_elements:
                tag = self.svg_elem_map.get(tag, tag)
                if tag == 'svg' and self.svgOK:
                    self.svgOK -= 1
            else:
                return
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        pass

    def handle_decl(self, text):
        pass

    def handle_data(self, text):
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)

    def sanitize_style(self, style):
        # disallow urls
        style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet
        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
            return ''
        if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue
            if prop.lower() in self.acceptable_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background', 'border', 'margin', 'padding']:
                for keyword in value.split():
                    if keyword not in self.acceptable_css_keywords and \
                            not self.valid_css_values.match(keyword):
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
                clean.append(prop + ': ' + value + ';')
        return ' '.join(clean)


[docs]def sanitize_html(htmlSource, encoding="utf-8", type="text/html"): p = _HTMLSanitizer(encoding, type) p.feed(unicodify(htmlSource, encoding)) data = p.output() data = data.strip().replace('\r\n', '\n') return data