Source code for dp_tornado.helper.web.html

# -*- coding: utf-8 -*-


from dp_tornado.engine.helper import Helper as dpHelper

try:
    # py 2.x
    import HTMLParser
    html_parser = HTMLParser.HTMLParser()

except:
    # py 3.4-
    try:
        import html.parser
        html_parser = html.parser.HTMLParser()
    except:
        # py 3.4+
        import html as html_parser

try:
    import htmltag
except:
    htmltag = None

import re

from bs4 import BeautifulSoup


re_html_tag = re.compile("(?i)<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[^'\">\s]+))?)+\s*|\s*)\/?>")
re_on_attrs = re.compile('.*\s+(on[a-z]+\s*=).*')


[docs]class HtmlHelper(dpHelper):
[docs]    def validate(self, s):
        s_id = '_____dp_s_xss_____'
        s = '<div id="%s">%s</div>' % (s_id, s)
        s = BeautifulSoup(s, 'lxml')

        s = str(s.find(id=s_id))
        return s[s.find('>')+1:s.rfind('<')]

[docs]    def strip_xss(self, s, whitelist=None):
        if whitelist is None:
            whitelist = (
                'a', 'abbr', 'aside', 'audio', 'bdi', 'bdo', 'blockquote', 'canvas',
                'caption', 'code', 'col', 'colgroup', 'data', 'dd', 'del',
                'details', 'div', 'dl', 'dt', 'em', 'figcaption', 'figure', 'h1',
                'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'ins', 'kbd', 'li',
                'mark', 'ol', 'p', 'pre', 'q', 'rp', 'rt', 'ruby', 's', 'samp',
                'small', 'source', 'span', 'strong', 'sub', 'summary', 'sup',
                'table', 'td', 'th', 'time', 'tr', 'track', 'u', 'ul', 'var',
                'video', 'wbr', 'b', 'br', 'site', 'font')

        elif not whitelist:
            whitelist = None

        s_id = '_____dp_s_xss_____'
        s = '<div id="%s">%s</div>' % (s_id, s)
        s = BeautifulSoup(s, 'lxml')

        self._strip_xss(s.find(id=s_id), whitelist)

        s = str(s.find(id=s_id))
        return s[s.find('>')+1:s.rfind('<')]

    def _strip_xss(self, elements, whitelist):
        for e in elements.find_all():
            found = False

            if whitelist and e.name not in whitelist:
                found = True
            elif e.attrs:
                for ea_k, ea_v in e.attrs.items():
                    s_tag = ea_k.strip().lower()

                    if s_tag.startswith('on') or s_tag.startswith('seeksegmenttime') or s_tag.startswith('fscommand'):
                        found = True
                        break

                    if self.helper.misc.type.check.array(ea_v):
                        s_val = ' '.join(ea_v).strip()
                    else:
                        s_val = ea_v.strip()

                    for xe in self.helper.string.whitespace:
                        s_val = s_val.replace(xe, '')

                    if s_val.find('javascript:') != -1 or s_val.find('vbscript:') != -1:
                        found = True
                        break

            if found and e.parent:
                e.unwrap()

            self._strip_xss(e, whitelist)

[docs]    def strip_tags(self, text):
        return re.sub('<[^<]+?>', '', text)

[docs]    def unescape(self, text):
        return html_parser.unescape(text)

[docs]    def escape(self, s, quote=False):
        s = s.replace("&", "&amp;")
        s = s.replace("<", "&lt;")
        s = s.replace(">", "&gt;")

        if quote:
            s = s.replace('"', "&quot;")

        return s