Add HtmlSanitizer python module to Melange repository.
author Pawel Solyga <Pawel.Solyga@gmail.com>
date Mon, 18 May 2009 14:49:43 +0200
changeset 2324 9698749e2375
parent 2323 b3daada52dd3
child 2325 a72a1490a876
Add HtmlSanitizer python module to Melange repository.
app/htmlsanitizer/HtmlSanitizer.py
app/htmlsanitizer/LICENSE-HtmlSanitizer
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/app/htmlsanitizer/HtmlSanitizer.py	Mon May 18 14:49:43 2009 +0200
@@ -0,0 +1,575 @@
+# -*- coding: UTF-8 -*-
+"""
+some input filters, for regularising the html fragments from screen scraping
+and browser-based editors into some semblance of sanity
+
+TODO: turn the messy setting[method_name]=True filter syntax into a list of
+cleaning methods to invoke, so that they can be invoked in a specific order
+and multiple times.
+
+AUTHORS:
+Dan MacKinlay - https://launchpad.net/~dan-possumpalace
+Collin Grady - http://launchpad.net/~collin-collingrady
+Andreas Gustafsson - https://bugs.launchpad.net/~gson
+Håkan W - https://launchpad.net/~hwaara-gmail
+"""
+
+import BeautifulSoup
+import re
+import sys
+
+# Python 2.4 compatibility
+try: any
+except NameError:
+    def any(iterable):
+        for element in iterable:
+            if element:
+                return True
+        return False
+
+"""
+html5lib compatibility. Basically, we need to know that this still works whether html5lib
+is imported or not. Should run complete suites of tests for both possible configs -
+or test in virtual environments, but for now a basic sanity check will do.
+>>> if html5:
+...     c = Cleaner(html5=False)
+...     c(u'<p>foo</p>')
+u'<p>foo</p>'
+"""
+try:
+    import html5lib
+    from html5lib import sanitizer, treebuilders
+    parser = html5lib.HTMLParser(
+        tree=treebuilders.getTreeBuilder("beautifulsoup"),
+        tokenizer=sanitizer.HTMLSanitizer
+    )
+    html5 = True
+except ImportError:
+    html5 = False
+
+ANTI_JS_RE=re.compile('j\s*a\s*v\s*a\s*s\s*c\s*r\s*i\s*p\s*t\s*:', re.IGNORECASE)
+#These tags and attrs are sufficiently liberal to let microformats through...
+#it ruthlessly culls all the rdf, dublin core metadata and so on.
+valid_tags = dict.fromkeys('p i em strong b u a h1 h2 h3 pre abbr br img dd dt ol ul li span sub sup ins del blockquote table tr td th address cite'.split()) #div?
+valid_attrs = dict.fromkeys('href src rel title'.split())
+valid_schemes = dict.fromkeys('http https'.split())
+elem_map = {'b' : 'strong', 'i': 'em'}
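+#e.g. (illustrative note): rename_tags uses this map to turn <b> into
+#<strong> and <i> into <em>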
+attrs_considered_links = dict.fromkeys("src href".split()) #should include
+#courtesy http://developer.mozilla.org/en/docs/HTML:Block-level_elements
+block_elements = dict.fromkeys(["p", "h1","h2", "h3", "h4", "h5", "h6", "ol", "ul", "pre", "address", "blockquote", "dl", "div", "fieldset", "form", "hr", "noscript", "table"])
+
+#convenient default filter lists.
+paranoid_filters = ["strip_comments", "strip_tags", "strip_attrs",
+  "strip_schemes", "rename_tags", "wrap_string", "strip_empty_tags", ]
+complete_filters = ["strip_comments", "rename_tags", "strip_tags", "strip_attrs",
+    "strip_cdata", "strip_schemes",  "wrap_string", "strip_empty_tags", "rebase_links", "reparse"]
+
+#set some conservative default string processings
+default_settings = {
+    "filters" : paranoid_filters,
+    "block_elements" : block_elements, #xml or None for a more liberal version
+    "convert_entities" : "html", #xml or None for a more liberal version
+    "valid_tags" : valid_tags,
+    "valid_attrs" : valid_attrs,
+    "valid_schemes" : valid_schemes,
+    "attrs_considered_links" : attrs_considered_links,
+    "elem_map" : elem_map,
+    "wrapping_element" : "p",
+    "auto_clean" : False,
+    "original_url" : "",
+    "new_url" : "",
+    "html5" : html5
+}
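+#e.g. (illustrative): Cleaner(valid_tags=dict.fromkeys('p a em strong'.split()))
+#overrides just the tag whitelist while keeping the other defaults above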
+#processes I'd like but haven't implemented            
+#"encode_xml_specials", "ensure complete xhtml doc", "ensure_xhtml_fragment_only"
+# and some handling of permitted namespaces for tags. for RDF, say. maybe.
+
+XML_ENTITIES = { u"'" : u"&apos;",
+                 u'"' : u"&quot;",
+                 u"&" : u"&amp;",
+                 u"<" : u"&lt;",
+                 u">" : u"&gt;"
+               }
+LINE_EXTRACTION_RE = re.compile(".+", re.MULTILINE)
+BR_EXTRACTION_RE = re.compile("</?br ?/?>", re.MULTILINE)
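+#illustrative (added note, not executed): LINE_EXTRACTION_RE.findall("a\nb")
+#gives ['a', 'b'], one match per non-empty line; BR_EXTRACTION_RE matches
+#variants such as <br>, <br/>, <br /> and </br>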
+
+class Stop:
+    """
+    handy class that we use as a stop input for our state machine in lieu of falling
+    off the end of lists
+    """
+    pass
+
+
+class Cleaner(object):
+    r"""
+    powerful and slow arbitrary HTML sanitisation. can deal (i hope) with most XSS
+    vectors and layout-breaking badness.
+    Probably overkill for content from trusted sources; defaults are accordingly
+    set to be paranoid.
+    >>> bad_html = '<p style="forbidden markup"><!-- XSS attack -->content</p'
+    >>> good_html = u'<p>content</p>'
+    >>> c = Cleaner()
+    >>> c.string = bad_html
+    >>> c.clean()
+    >>> c.string == good_html
+    True
+    
+    Also supports shorthand syntax:
+    >>> c = Cleaner()
+    >>> c(bad_html) == c(good_html)
+    True
+    """
+    
+    def __init__(self, string_or_soup="", *args,  **kwargs):
+        self.settings=default_settings.copy()
+        self.settings.update(kwargs)
+        if args :
+            self.settings['filters'] = args
+        super(Cleaner, self).__init__() #object.__init__ takes no arguments
+        self.string = string_or_soup
+    
+    def __call__(self, string = None, **kwargs):
+        """
+        convenience method allowing one-step calling of an instance and returning
+        a cleaned string.
+        
+        TODO: make this method preserve internal state - perhaps by creating
+        a new instance.
+        
+        >>> s = 'input string'
+        >>> c1 = Cleaner(s, auto_clean=True)
+        >>> c2 = Cleaner("")
+        >>> c1.string == c2(s)
+        True
+        
+        """
+        self.settings.update(kwargs)
+        if string is not None :
+            self.string = string
+        self.clean()
+        return self.string
+    
+    def _set_contents(self, string_or_soup):
+        if isinstance(string_or_soup, BeautifulSoup.BeautifulSoup) :
+            self._set_soup(string_or_soup)
+        else :
+            self._set_string(string_or_soup)
+    
+    def _set_string(self, html_fragment_string):
+        if self.settings['html5']:
+            s = parser.parse(html_fragment_string).body
+        else:
+            s = BeautifulSoup.BeautifulSoup(
+                    html_fragment_string,
+                    convertEntities=self.settings['convert_entities'])
+        self._set_soup(s)
+        
+    def _set_soup(self, soup):
+        """
+        Does all the work of set_string, but bypasses a potential autoclean to avoid 
+        loops upon internal string setting ops.
+        """
+        self._soup = BeautifulSoup.BeautifulSoup(
+            '<rootrootroot></rootrootroot>'
+        )
+        self.root=self._soup.contents[0]
+        
+        if len(soup.contents) :
+            backwards_soup = [i for i in soup.contents]
+            backwards_soup.reverse()
+        else :
+            backwards_soup = []
+        for i in backwards_soup :
+            i.extract()
+            self.root.insert(0, i)
+    
+    def set_string(self, string) :
+        ur"""
+            sets the string to process and does the necessary input encoding too
+        really intended to be invoked as a property.
+        note the godawful rootrootroot element which we need because the
+        BeautifulSoup object has all the same methods as a Tag, but
+        behaves differently, silently failing on some inserts and appends
+        
+        >>> c = Cleaner(convert_entities="html")
+        >>> c.string = '&eacute;'
+        >>> c.string
+        u'\xe9'
+        >>> c = Cleaner(convert_entities="xml")
+        >>> c.string = u'&eacute;'
+        >>> c.string
+        u'&eacute;'
+        """
+        self._set_string(string)
+        if len(string) and self.settings['auto_clean'] : self.clean()
+        
+    def get_string(self):
+        #encoding=None asks renderContents for unicode output rather than
+        #utf-8-encoded bytes, which unicode() could not decode implicitly
+        return unicode(self.root.renderContents(encoding=None))
+    
+    string = property(get_string, set_string)
+    
+    def clean(self):
+        """
+        invoke all cleaning processes stipulated in the settings
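+        e.g. (illustrative): Cleaner("", "strip_comments", "strip_tags")
+        runs exactly those two filters, in that order.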
+        """
+        for method in self.settings['filters'] :
+            try :
+                getattr(self, method)()
+            except NotImplementedError :
+                sys.stderr.write('Warning: called unimplemented method %s\n' % method)
+    
+    def strip_comments(self):
+        r"""
+        XHTML comments are used as an XSS attack vector. they must die.
+        
+        >>> c = Cleaner("", "strip_comments")
+        >>> c('<p>text<!-- comment --> More text</p>')
+        u'<p>text More text</p>'
+        """
+        for comment in self.root.findAll(
+            text = lambda text: isinstance(text, BeautifulSoup.Comment)):
+            comment.extract()
+            
+    def strip_cdata(self):
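+        r"""
+        strip CDATA sections - another place for markup to hide.
+        illustrative example (a sketch, not an executed doctest; assumes
+        BeautifulSoup parses the section below into a CData node):
+            c = Cleaner("", "strip_cdata")
+            c('<p>A<![CDATA[<script>bad()</script>]]>B</p>')
+            # -> u'<p>AB</p>'
+        """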
+        for cdata in self.root.findAll(
+          text = lambda text: isinstance(text, BeautifulSoup.CData)):
+            cdata.extract()
+    
+    def strip_tags(self):
+        r"""
+        ill-considered tags break our layout. they must die.
+        >>> c = Cleaner("", "strip_tags", auto_clean=True)
+        >>> c.string = '<div>A <strong>B C</strong></div>'
+        >>> c.string
+        u'A <strong>B C</strong>'
+        >>> c.string = '<div>A <div>B C</div></div>'
+        >>> c.string
+        u'A B C'
+        >>> c.string = '<div>A <br /><div>B C</div></div>'
+        >>> c.string
+        u'A <br />B C'
+        >>> c.string = '<p>A <div>B C</div></p>'
+        >>> c.string
+        u'<p>A B C</p>'
+        >>> c.string = 'A<div>B<div>C<div>D</div>E</div>F</div>G'
+        >>> c.string
+        u'ABCDEFG'
+        >>> c.string = '<div>B<div>C<div>D</div>E</div>F</div>'
+        >>> c.string
+        u'BCDEF'
+        """
+        # Beautiful Soup doesn't support dynamic .findAll results when the tree is
+        # modified in place.
+        # going backwards doesn't seem to help.
+        # so find one at a time
+        while True :
+            next_bad_tag = self.root.find(
+              lambda tag : tag.name not in self.settings['valid_tags']
+            )
+            if next_bad_tag :
+                self.disgorge_elem(next_bad_tag)
+            else:
+                break
+    
+    def strip_attrs(self):
+        """
+        preserve only those attributes we need in the soup
+        >>> c = Cleaner("", "strip_attrs")
+        >>> c('<div title="v" bad="v">A <strong title="v" bad="v">B C</strong></div>')
+        u'<div title="v">A <strong title="v">B C</strong></div>'
+        """
+        for tag in self.root.findAll(True):
+            tag.attrs = [(attr, val) for attr, val in tag.attrs
+                         if attr in self.settings['valid_attrs']]
+    
+    def _all_links(self):
+        """
+        finds all tags with link attributes sequentially. safe against modification
+        of said attributes in-place.
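+
+        illustrative usage (a sketch, not an executed doctest):
+            for tag in self._all_links():
+                print tag.get('href'), tag.get('src')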
+        """
+        start = self.root
+        while True: 
+            tag = start.findNext(
+              lambda tag : any(
+                tag.get(i) for i in self.settings['attrs_considered_links']
+              ))
+            if tag: 
+                start = tag
+                yield tag
+            else :
+                break
+            
+    def strip_schemes(self):
+        """
+        >>> c = Cleaner("", "strip_schemes")
+        >>> c('<img src="javascript:alert();" />')
+        u'<img />'
+        >>> c('<a href="javascript:alert();">foo</a>')
+        u'<a>foo</a>'
+        """
+        for tag in self._all_links() :
+            for key in self.settings['attrs_considered_links'] :
+                scheme_bits = tag.get(key, u"").split(u':',1)
+                if len(scheme_bits) == 1 :
+                    pass #relative link
+                else:
+                    if scheme_bits[0] not in self.settings['valid_schemes'] :
+                        del(tag[key])
+    
+    def br_to_p(self):
+        """
+        >>> c = Cleaner("", "br_to_p")
+        >>> c('<p>A<br />B</p>')
+        u'<p>A</p><p>B</p>'
+        >>> c('A<br />B')
+        u'<p>A</p><p>B</p>'
+        """
+        #copy, so the br/p additions don't leak into the shared settings dict
+        block_elems = self.settings['block_elements'].copy()
+        block_elems['br'] = None
+        block_elems['p'] = None
+        
+        while True :
+            next_br = self.root.find('br')
+            if not next_br: break
+            parent = next_br.parent
+            self.wrap_string('p', start_at=parent, block_elems = block_elems)
+            while True:
+                useless_br=parent.find('br', recursive=False)
+                if not useless_br: break
+                useless_br.extract()        
+            if parent.name == 'p':
+                self.disgorge_elem(parent)
+    
+    def rename_tags(self):
+        """
+        >>> c = Cleaner("", "rename_tags", elem_map={'i': 'em'})
+        >>> c('<b>A<i>B</i></b>')
+        u'<b>A<em>B</em></b>'
+        """
+        for tag in self.root.findAll(self.settings['elem_map']) :
+            tag.name = self.settings['elem_map'][tag.name]
+        
+    def wrap_string(self, wrapping_element = None, start_at=None, block_elems=None):
+        """
+        takes an html fragment, which may or may not have a single containing
+        element, and guarantees the tag name of the topmost elements.
+        TODO: is there some simpler way than a state machine to do this simple thing?
+        >>> c = Cleaner("", "wrap_string")
+        >>> c('A <strong>B C</strong>D')
+        u'<p>A <strong>B C</strong>D</p>'
+        >>> c('A <p>B C</p>D')
+        u'<p>A </p><p>B C</p><p>D</p>'
+        """
+        if not start_at : start_at = self.root
+        if not block_elems : block_elems = self.settings['block_elements']
+        e = (wrapping_element or self.settings['wrapping_element'])
+        paragraph_list = []
+        children = [elem for elem in start_at.contents]
+        children.append(Stop())
+        
+        last_state = 'block'
+        paragraph = BeautifulSoup.Tag(self._soup, e)
+        
+        for node in children :
+            if isinstance(node, Stop) :
+                state = 'end'
+            elif hasattr(node, 'name') and node.name in block_elems:
+                state = 'block'
+            else:
+                state = 'inline'
+                
+            if last_state == 'block' and state == 'inline':
+                #collate inline elements
+                paragraph = BeautifulSoup.Tag(self._soup, e)
+                
+            if state == 'inline' :
+                paragraph.append(node)
+                
+            if (state != 'inline') and last_state == 'inline' :
+                paragraph_list.append(paragraph)
+                
+            if state == 'block' :
+                paragraph_list.append(node)
+            
+            last_state = state
+        
+        #can't use append since it doesn't work on empty elements...
+        paragraph_list.reverse()
+        for paragraph in paragraph_list:
+            start_at.insert(0, paragraph)
+        
+    def strip_empty_tags(self):
+        """
+        strip out all empty tags
+        TODO: depth-first search
+        >>> c = Cleaner("", "strip_empty_tags")
+        >>> c('<p>A</p><p></p><p>B</p><p></p>')
+        u'<p>A</p><p>B</p>'
+        >>> c('<p><a></a></p>')
+        u'<p></p>'
+        """
+        tag = self.root
+        while True:
+            next_tag = tag.findNext(True)
+            if not next_tag: break
+            if next_tag.contents or next_tag.attrs:
+                tag = next_tag
+                continue
+            next_tag.extract()
+        
+    def rebase_links(self, original_url="", new_url ="") :
+        if not original_url : original_url = self.settings.get('original_url', '')
+        if not new_url : new_url = self.settings.get('new_url', '')
+        raise NotImplementedError
+    
+    # Because of its internal character set handling,
+    # the following will not work in Beautiful soup and is hopefully redundant.
+    # def encode_xml_specials(self, original_url="", new_url ="") :
+    #     """
+    #     BeautifulSoup will let some dangerous xml entities hang around
+    #     in the navigable strings. destroy all monsters.
+    #     >>> c = Cleaner(auto_clean=True, encode_xml_specials=True)
+    #     >>> c('<<<<<')
+    #     u'&lt;&lt;&lt;&lt;'
+    #     """
+    #     for string in self.root.findAll(text=True) :
+    #         sys.stderr.write("root" +"\n")
+    #         sys.stderr.write(str(self.root) +"\n")
+    #         sys.stderr.write("parent" +"\n")
+    #         sys.stderr.write(str(string.parent) +"\n")
+    #         new_string = unicode(string)
+    #         sys.stderr.write(string +"\n")
+    #         for special_char in XML_ENTITIES.keys() :
+    #             sys.stderr.write(special_char +"\n")
+    #         string.replaceWith(
+    #           new_string.replace(special_char, XML_ENTITIES[special_char])
+    #         )
+        
+        
+    def disgorge_elem(self, elem):
+        """
+        removes the given element from the soup and replaces it with its own
+        contents. actually tricky, since you can't replace an element with a
+        list of elements using replaceWith.
+        >>> disgorgeable_string = '<body>A <em>B</em> C</body>'
+        >>> c = Cleaner()
+        >>> c.string = disgorgeable_string
+        >>> elem = c._soup.find('em')
+        >>> c.disgorge_elem(elem)
+        >>> c.string
+        u'<body>A B C</body>'
+        >>> c.string = disgorgeable_string
+        >>> elem = c._soup.find('body')
+        >>> c.disgorge_elem(elem)
+        >>> c.string
+        u'A <em>B</em> C'
+        >>> c.string = '<div>A <div id="inner">B C</div></div>'
+        >>> elem = c._soup.find(id="inner")
+        >>> c.disgorge_elem(elem)
+        >>> c.string
+        u'<div>A B C</div>'
+        """
+        if elem == self.root :
+            raise AttributeError("Can't disgorge root")
+                      
+        # With in-place modification, BeautifulSoup occasionally can return
+        # elements that think they are orphans. this lib is full of
+        # workarounds, but it's worth checking.
+        parent = elem.parent
+        if parent is None :
+            raise AttributeError("AAAAAAAAGH! NO PARENTS! DEATH!")
+        
+        index = None
+        for i in range(len(parent.contents)) :
+            if parent.contents[i] == elem :
+                index = i
+                break
+                
+        elem.extract()
+        
+        #the following commented-out approach breaks horribly, sporadically:
+        # for i in range(len(elem.contents)) :
+        #     elem.contents[i].extract()
+        #     parent.contents.insert(index+i, elem.contents[i])
+        # return
+        self._safe_inject(parent, index, elem.contents)
+        
+    def _safe_inject(self, dest, dest_index, node_list):
+        #BeautifulSoup result sets look like lists but don't behave right,
+        #i.e. empty ones are still True
+        if not len(node_list) : return
+        node_list = [i for i in node_list]
+        node_list.reverse()
+        for i in node_list :
+            dest.insert(dest_index, i)
+
+        
+class Htmlator(object) :
+    """
+    converts a string into a series of html paragraphs
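+
+    illustrative usage (a sketch, not a doctest; make_links is not yet
+    implemented, so switch it off):
+        h = Htmlator(convert_newlines=True, make_links=False)
+        h("one\ntwo")   # -> u'<p>one</p><p>two</p>'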
+    """
+    settings = {
+        "encode_xml_specials" : True,
+        "is_plaintext" : True,
+        "convert_newlines" : False,
+        "make_links" : True,
+        "auto_convert" : False,
+        "valid_schemes" : valid_schemes,
+    }
+    def __init__(self, string = "",  **kwargs):
+        #copy first so per-instance kwargs don't mutate the class-level dict
+        self.settings = self.settings.copy()
+        self.settings.update(kwargs)
+        super(Htmlator, self).__init__()
+        self.string = string
+    
+    def _set_string(self, string):
+        #store on an internal attribute; assigning to self.string here would
+        #recurse through the property setter defined below
+        self._string = string
+        if self.settings['auto_convert'] : self.convert()
+
+    def _get_string(self):
+        return unicode(self._string)
+    
+    string = property(_get_string, _set_string)
+    
+    def __call__(self, string):
+        """
+        convenience method supporting one-step calling of an instance
+        as a string cleaning function
+        """
+        self.string = string
+        self.convert()
+        return self.string
+        
+    def convert(self):
+        for method in ["encode_xml_specials", "convert_newlines",
+          "make_links"] :
+            if self.settings[method] :
+                getattr(self, method)()
+    
+    def encode_xml_specials(self) :
+        for char in XML_ENTITIES.keys() :
+            #replace() returns a new string; reassign, via _string so
+            #auto_convert isn't re-triggered
+            self._string = self._string.replace(char, XML_ENTITIES[char])
+        
+    def make_links(self):
+        raise NotImplementedError
+        
+    def convert_newlines(self) :
+        self._string = ''.join([
+            '<p>' + line + '</p>'
+            for line in LINE_EXTRACTION_RE.findall(self._string)
+        ])
+        
+def _test():
+    import doctest
+    doctest.testmod()
+
+if __name__ == "__main__":
+    _test()
+
+
+# def cast_input_to_soup(fn):
+#     """
+#     Decorate function to handle strings as BeautifulSoups transparently
+#     """
+#     def stringy_version(input, *args, **kwargs) :
+#         if not isinstance(input,BeautifulSoup) :
+#             input=BeautifulSoup(input)
+#         return fn(input, *args, **kwargs)
+#     return stringy_version
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/app/htmlsanitizer/LICENSE-HtmlSanitizer	Mon May 18 14:49:43 2009 +0200
@@ -0,0 +1,23 @@
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+  1. Redistributions of source code must retain the above copyright notice,
+     this list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright 
+     notice, this list of conditions and the following disclaimer in 
+     the documentation and/or other materials provided with the distribution.
+
+  3. The names of the authors may not be used to endorse or promote products
+     derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JCRAFT,
+INC. OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.