# -*- coding: UTF-8 -*-
"""
some input filters, for regularising the html fragments from screen scraping
and browser-based editors into some semblance of sanity

TODO: turn the messy settings[method_name]=True filter syntax into a list of
cleaning methods to invoke, so that they can be invoked in a specific order
and multiple times.

AUTHORS:
Dan MacKinlay - https://launchpad.net/~dan-possumpalace
Collin Grady - http://launchpad.net/~collin-collingrady
Andreas Gustafsson - https://bugs.launchpad.net/~gson
Håkan W - https://launchpad.net/~hwaara-gmail
"""

import BeautifulSoup
import re
import sys

# Python 2.4 compatibility
try: any
except NameError:
    def any(iterable):
        for element in iterable:
            if element:
                return True
        return False

"""
html5lib compatibility. Basically, we need to know that this still works
whether html5lib is imported or not. Should run complete suites of tests for
both possible configs - or test in virtual environments, but for now a basic
sanity check will do.
>>> if html5:
...     c=Cleaner(html5=False)
...     c(u'<p>foo</p>')
u'<p>foo</p>'
"""
try:
    import html5lib
    from html5lib import sanitizer, treebuilders
    parser = html5lib.HTMLParser(
        tree=treebuilders.getTreeBuilder("beautifulsoup"),
        tokenizer=sanitizer.HTMLSanitizer
    )
    html5 = True
except ImportError:
    html5 = False

ANTI_JS_RE = re.compile(r'j\s*a\s*v\s*a\s*s\s*c\s*r\s*i\s*p\s*t\s*:',
    re.IGNORECASE)
#These tags and attrs are sufficiently liberal to let microformats through...
#it ruthlessly culls all the rdf, dublin core metadata and so on.
valid_tags = dict.fromkeys('p i em strong b u a h1 h2 h3 pre abbr br img dd dt ol ul li span sub sup ins del blockquote table tr td th address cite'.split()) #div?
valid_attrs = dict.fromkeys('href src rel title'.split())
valid_schemes = dict.fromkeys('http https'.split())
elem_map = {'b' : 'strong', 'i': 'em'}
attrs_considered_links = dict.fromkeys("src href".split()) #should include
#courtesy http://developer.mozilla.org/en/docs/HTML:Block-level_elements
block_elements = dict.fromkeys(["p", "h1", "h2", "h3", "h4", "h5", "h6",
    "ol", "ul", "pre", "address", "blockquote", "dl", "div", "fieldset",
    "form", "hr", "noscript", "table"])

#convenient default filter lists.
paranoid_filters = ["strip_comments", "strip_tags", "strip_attrs",
    "strip_schemes", "rename_tags", "wrap_string", "strip_empty_tags", ]
complete_filters = ["strip_comments", "rename_tags", "strip_tags",
    "strip_attrs", "strip_cdata", "strip_schemes", "wrap_string",
    "strip_empty_tags", "rebase_links", "reparse"]

#set some conservative default string processings
default_settings = {
    "filters" : paranoid_filters,
    "block_elements" : block_elements,
    "convert_entities" : "html", #xml or None for a more liberal version
    "valid_tags" : valid_tags,
    "valid_attrs" : valid_attrs,
    "valid_schemes" : valid_schemes,
    "attrs_considered_links" : attrs_considered_links,
    "elem_map" : elem_map,
    "wrapping_element" : "p",
    "auto_clean" : False,
    "original_url" : "",
    "new_url" : "",
    "html5" : html5
}
#processes I'd like but haven't implemented
#"encode_xml_specials", "ensure complete xhtml doc", "ensure_xhtml_fragment_only"
# and some handling of permitted namespaces for tags. for RDF, say. maybe.
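
# A small configuration sketch (the values are illustrative, but only names
# defined above are used): any of the defaults can be overridden per instance,
# e.g. to additionally let <div> through the tag filter:
#
#   permissive_tags = dict(valid_tags, div=None)
#   c = Cleaner(valid_tags=permissive_tags)
#   c('<div onclick="evil()">hi</div>')   # expected: u'<div>hi</div>'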

XML_ENTITIES = { u"'" : u"&apos;",
                 u'"' : u"&quot;",
                 u"&" : u"&amp;",
                 u"<" : u"&lt;",
                 u">" : u"&gt;" }
LINE_EXTRACTION_RE = re.compile(".+", re.MULTILINE)
BR_EXTRACTION_RE = re.compile("</?br ?/?>", re.MULTILINE)


class Stop:
    """
    handy class that we use as a stop input for our state machine, in lieu of
    falling off the end of lists
    """
    pass


class Cleaner(object):
    r"""
    powerful and slow arbitrary HTML sanitisation. can deal (i hope) with most
    XSS vectors and layout-breaking badness.
    Probably overkill for content from trusted sources; defaults are
    accordingly set to be paranoid.
    >>> bad_html = '<p style="forbidden markup"><!-- XSS attack -->content</p'
    >>> good_html = u'<p>content</p>'
    >>> c = Cleaner()
    >>> c.string = bad_html
    >>> c.clean()
    >>> c.string == good_html
    True

    Also supports shorthand syntax:

    >>> c = Cleaner()
    >>> c(bad_html) == c(good_html)
    True
    """

    def __init__(self, string_or_soup="", *args, **kwargs):
        self.settings = default_settings.copy()
        self.settings.update(kwargs)
        if args :
            self.settings['filters'] = args
        super(Cleaner, self).__init__()
        self.string = string_or_soup

    def __call__(self, string = None, **kwargs):
        """
        convenience method allowing one-step calling of an instance and
        returning a cleaned string.

        TODO: make this method preserve internal state - perhaps by creating
        a new instance.

        >>> s = 'input string'
        >>> c1 = Cleaner(s, auto_clean=True)
        >>> c2 = Cleaner("")
        >>> c1.string == c2(s)
        True
        """
        self.settings.update(kwargs)
        if string is not None :
            self.string = string
        self.clean()
        return self.string

    def _set_contents(self, string_or_soup):
        if isinstance(string_or_soup, BeautifulSoup.BeautifulSoup) :
            self._set_soup(string_or_soup)
        else :
            self._set_string(string_or_soup)

    def _set_string(self, html_fragment_string):
        if self.settings['html5']:
            s = parser.parse(html_fragment_string).body
        else:
            s = BeautifulSoup.BeautifulSoup(
                html_fragment_string,
                convertEntities=self.settings['convert_entities'])
        self._set_soup(s)

    def _set_soup(self, soup):
        """
        Does all the work of set_string, but bypasses a potential autoclean to
        avoid loops upon internal string setting ops.
        """
        self._soup = BeautifulSoup.BeautifulSoup(
            '<rootrootroot></rootrootroot>'
        )
        self.root = self._soup.contents[0]

        if len(soup.contents) :
            backwards_soup = [i for i in soup.contents]
            backwards_soup.reverse()
        else :
            backwards_soup = []
        for i in backwards_soup :
            i.extract()
            self.root.insert(0, i)

    def set_string(self, string) :
        ur"""
        sets the string to process and does the necessary input encoding too.
        really intended to be invoked as a property.
        note the godawful rootrootroot element, which we need because the
        BeautifulSoup object has all the same methods as a Tag but behaves
        differently, silently failing on some inserts and appends

        >>> c = Cleaner(convert_entities="html")
        >>> c.string = '&eacute;'
        >>> c.string
        u'\xe9'
        >>> c = Cleaner(convert_entities="xml")
        >>> c.string = u'&eacute;'
        >>> c.string
        u'&eacute;'
        """
        self._set_string(string)
        if len(string) and self.settings['auto_clean'] :
            self.clean()

    def get_string(self):
        return self.root.renderContents().decode('utf-8')

    string = property(get_string, set_string)

    def clean(self):
        """
        invoke all cleaning processes stipulated in the settings
        """
        for method in self.settings['filters'] :
            try :
                getattr(self, method)()
            except NotImplementedError :
                sys.stderr.write(
                    'Warning, called unimplemented method %s\n' % method)
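
    # Individual filters can also be run by hand once `string` is set; a
    # minimal sketch (relies only on the default settings above):
    #
    #   c = Cleaner("")
    #   c.string = '<p onclick="x()">hi<!-- hidden --></p>'
    #   c.strip_comments(); c.strip_attrs()
    #   c.string   # expected: u'<p>hi</p>'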

    def strip_comments(self):
        r"""
        XHTML comments are used as an XSS attack vector. they must die.

        >>> c = Cleaner("", "strip_comments")
        >>> c('<p>text<!-- comment --> More text</p>')
        u'<p>text More text</p>'
        """
        for comment in self.root.findAll(
            text = lambda text: isinstance(text, BeautifulSoup.Comment)):
            comment.extract()

    def strip_cdata(self):
        for cdata in self.root.findAll(
            text = lambda text: isinstance(text, BeautifulSoup.CData)):
            cdata.extract()

    def strip_tags(self):
        r"""
        ill-considered tags break our layout. they must die.
        >>> c = Cleaner("", "strip_tags", auto_clean=True)
        >>> c.string = '<div>A <strong>B C</strong></div>'
        >>> c.string
        u'A <strong>B C</strong>'
        >>> c.string = '<div>A <div>B C</div></div>'
        >>> c.string
        u'A B C'
        >>> c.string = '<div>A <br /><div>B C</div></div>'
        >>> c.string
        u'A <br />B C'
        >>> c.string = '<p>A <div>B C</div></p>'
        >>> c.string
        u'<p>A B C</p>'
        >>> c.string = 'A<div>B<div>C<div>D</div>E</div>F</div>G'
        >>> c.string
        u'ABCDEFG'
        >>> c.string = '<div>B<div>C<div>D</div>E</div>F</div>'
        >>> c.string
        u'BCDEF'
        """
        # Beautiful Soup doesn't support dynamic .findAll results when the
        # tree is modified in place.
        # going backwards doesn't seem to help.
        # so find one at a time
        while True :
            next_bad_tag = self.root.find(
                lambda tag : not tag.name in (self.settings['valid_tags'])
            )
            if next_bad_tag :
                self.disgorge_elem(next_bad_tag)
            else:
                break

    def strip_attrs(self):
        """
        preserve only those attributes we need in the soup

        >>> c = Cleaner("", "strip_attrs")
        >>> c('<div title="v" bad="v">A <strong title="v" bad="v">B C</strong></div>')
        u'<div title="v">A <strong title="v">B C</strong></div>'
        """
        for tag in self.root.findAll(True):
            tag.attrs = [(attr, val) for attr, val in tag.attrs
                         if attr in self.settings['valid_attrs']]

    def _all_links(self):
        """
        finds all tags with link attributes sequentially. safe against
        modification of said attributes in-place.
        """
        start = self.root
        while True:
            tag = start.findNext(
                lambda tag : any(
                    tag.get(i) for i in self.settings['attrs_considered_links']
                ))
            if tag:
                start = tag
                yield tag
            else :
                break

    def strip_schemes(self):
        """
        >>> c = Cleaner("", "strip_schemes")
        >>> c('<img src="javascript:alert();" />')
        u'<img />'
        >>> c('<a href="javascript:alert();">foo</a>')
        u'<a>foo</a>'
        """
        for tag in self._all_links() :
            for key in self.settings['attrs_considered_links'] :
                scheme_bits = tag.get(key, u"").split(u':', 1)
                if len(scheme_bits) == 1 :
                    pass #relative link
                else:
                    if not scheme_bits[0] in self.settings['valid_schemes'] :
                        del(tag[key])
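
    # The scheme whitelist comes from the `valid_schemes` setting; a sketch of
    # also permitting mailto links (an assumption, not one of the defaults):
    #
    #   c = Cleaner("", "strip_schemes",
    #               valid_schemes=dict.fromkeys("http https mailto".split()))
    #   c('<a href="mailto:foo@example.com">mail</a>')
    #   # expected: u'<a href="mailto:foo@example.com">mail</a>'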

    def br_to_p(self):
        """
        >>> c = Cleaner("", "br_to_p")
        >>> c('<p>A<br />B</p>')
        u'<p>A</p><p>B</p>'
        >>> c('A<br />B')
        u'<p>A</p><p>B</p>'
        """
        block_elems = self.settings['block_elements']
        block_elems['br'] = None
        block_elems['p'] = None

        while True :
            next_br = self.root.find('br')
            if not next_br: break
            parent = next_br.parent
            self.wrap_string('p', start_at=parent, block_elems = block_elems)
            while True:
                useless_br = parent.find('br', recursive=False)
                if not useless_br: break
                useless_br.extract()
            if parent.name == 'p':
                self.disgorge_elem(parent)

    def rename_tags(self):
        """
        >>> c = Cleaner("", "rename_tags", elem_map={'i': 'em'})
        >>> c('<b>A<i>B</i></b>')
        u'<b>A<em>B</em></b>'
        """
        for tag in self.root.findAll(self.settings['elem_map']) :
            tag.name = self.settings['elem_map'][tag.name]

    def wrap_string(self, wrapping_element = None, start_at=None,
                    block_elems=None):
        """
        takes an html fragment, which may or may not have a single containing
        element, and guarantees what the tag names of the topmost elements
        will be.

        TODO: is there some simpler way than a state machine to do this
        simple thing?

        >>> c = Cleaner("", "wrap_string")
        >>> c('A <strong>B C</strong>D')
        u'<p>A <strong>B C</strong>D</p>'
        >>> c('A <p>B C</p>D')
        u'<p>A </p><p>B C</p><p>D</p>'
        """
        if not start_at : start_at = self.root
        if not block_elems : block_elems = self.settings['block_elements']
        e = (wrapping_element or self.settings['wrapping_element'])
        paragraph_list = []
        children = [elem for elem in start_at.contents]
        children.append(Stop())

        last_state = 'block'
        paragraph = BeautifulSoup.Tag(self._soup, e)

        for node in children :
            if isinstance(node, Stop) :
                state = 'end'
            elif hasattr(node, 'name') and node.name in block_elems:
                state = 'block'
            else:
                state = 'inline'

            if last_state == 'block' and state == 'inline':
                #collate inline elements
                paragraph = BeautifulSoup.Tag(self._soup, e)

            if state == 'inline' :
                paragraph.append(node)

            if ((state != 'inline') and last_state == 'inline') :
                paragraph_list.append(paragraph)

            if state == 'block' :
                paragraph_list.append(node)

            last_state = state

        #can't use append since it doesn't work on empty elements...
        paragraph_list.reverse()
        for paragraph in paragraph_list:
            start_at.insert(0, paragraph)

    def strip_empty_tags(self):
        """
        strip out all empty tags
        TODO: depth-first search
        >>> c = Cleaner("", "strip_empty_tags")
        >>> c('<p>A</p><p></p><p>B</p><p></p>')
        u'<p>A</p><p>B</p>'
        >>> c('<p><a></a></p>')
        u'<p></p>'
        """
        tag = self.root
        while True:
            next_tag = tag.findNext(True)
            if not next_tag: break
            if next_tag.contents or next_tag.attrs:
                tag = next_tag
                continue
            next_tag.extract()

    def rebase_links(self, original_url="", new_url ="") :
        if not original_url :
            original_url = self.settings.get('original_url', '')
        if not new_url :
            new_url = self.settings.get('new_url', '')
        raise NotImplementedError
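
    # One plausible direction for rebase_links, left as a commented sketch
    # (urlparse is stdlib; the rest is an assumption about the intended
    # behaviour, not part of the original module):
    #
    #   import urlparse
    #   for tag in self._all_links():
    #       for key in self.settings['attrs_considered_links']:
    #           if tag.get(key):
    #               tag[key] = urlparse.urljoin(new_url, tag[key])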

    # Because of its internal character set handling, the following will not
    # work in Beautiful Soup and is hopefully redundant.
    # def encode_xml_specials(self, original_url="", new_url ="") :
    #     """
    #     BeautifulSoup will let some dangerous xml entities hang around
    #     in the navigable strings. destroy all monsters.
    #     >>> c = Cleaner(auto_clean=True, encode_xml_specials=True)
    #     >>> c('<<<<<')
    #     u'&lt;&lt;&lt;&lt;'
    #     """
    #     for string in self.root.findAll(text=True) :
    #         sys.stderr.write("root" + "\n")
    #         sys.stderr.write(str(self.root) + "\n")
    #         sys.stderr.write("parent" + "\n")
    #         sys.stderr.write(str(string.parent) + "\n")
    #         new_string = unicode(string)
    #         sys.stderr.write(string + "\n")
    #         for special_char in XML_ENTITIES.keys() :
    #             sys.stderr.write(special_char + "\n")
    #             string.replaceWith(
    #                 new_string.replace(special_char, XML_ENTITIES[special_char])
    #             )

    def disgorge_elem(self, elem):
        """
        removes the given element from the soup and replaces it with its own
        contents.
        actually tricky, since you can't replace an element with a list of
        elements using replaceWith
        >>> disgorgeable_string = '<body>A <em>B</em> C</body>'
        >>> c = Cleaner()
        >>> c.string = disgorgeable_string
        >>> elem = c._soup.find('em')
        >>> c.disgorge_elem(elem)
        >>> c.string
        u'<body>A B C</body>'
        >>> c.string = disgorgeable_string
        >>> elem = c._soup.find('body')
        >>> c.disgorge_elem(elem)
        >>> c.string
        u'A <em>B</em> C'
        >>> c.string = '<div>A <div id="inner">B C</div></div>'
        >>> elem = c._soup.find(id="inner")
        >>> c.disgorge_elem(elem)
        >>> c.string
        u'<div>A B C</div>'
        """
        if elem == self.root :
            raise AttributeError("Can't disgorge root")
        # With in-place modification, BeautifulSoup occasionally can return
        # elements that think they are orphans.
        # this lib is full of workarounds, but it's worth checking
        parent = elem.parent
        if parent == None:
            raise AttributeError("AAAAAAAAGH! NO PARENTS! DEATH!")
        i = None
        for i in range(len(parent.contents)) :
            if parent.contents[i] == elem :
                index = i
                break

        elem.extract()
        #the following method breaks horribly, sporadically.
        # for i in range(len(elem.contents)) :
        #     elem.contents[i].extract()
        #     parent.contents.insert(index+i, elem.contents[i])
        # return
        self._safe_inject(parent, index, elem.contents)

    def _safe_inject(self, dest, dest_index, node_list):
        #BeautifulSoup result sets look like lists but don't behave right -
        #i.e. empty ones are still True
        if not len(node_list) : return
        node_list = [i for i in node_list]
        node_list.reverse()
        for i in node_list :
            dest.insert(dest_index, i)


class Htmlator(object) :
    """
    converts a string into a series of html paragraphs
    """
    settings = {
        "encode_xml_specials" : True,
        "is_plaintext" : True,
        "convert_newlines" : False,
        "make_links" : True,
        "auto_convert" : False,
        "valid_schemes" : valid_schemes,
    }

    def __init__(self, string = "", **kwargs):
        # copy the class-level defaults so per-instance overrides don't leak
        # into other instances
        self.settings = self.settings.copy()
        self.settings.update(kwargs)
        super(Htmlator, self).__init__()
        self.string = string

    def _set_string(self, string):
        self._string = string
        if self.settings['auto_convert'] :
            self.convert()

    def _get_string(self):
        return unicode(self._string)

    string = property(_get_string, _set_string)

    def __call__(self, string):
        """
        convenience method supporting one-step calling of an instance
        as a string cleaning function
        """
        self.string = string
        self.convert()
        return self.string

    def convert(self):
        for method in ["encode_xml_specials", "convert_newlines",
                       "make_links"] :
            if self.settings[method] :
                getattr(self, method)()

    def encode_xml_specials(self) :
        # ampersands must be encoded first, or the other replacements get
        # double-encoded
        self.string = self.string.replace(u"&", XML_ENTITIES[u"&"])
        for char in XML_ENTITIES.keys() :
            if char != u"&" :
                self.string = self.string.replace(char, XML_ENTITIES[char])

    def make_links(self):
        raise NotImplementedError

    def convert_newlines(self) :
        self.string = ''.join([
            '<p>' + line + '</p>'
            for line in LINE_EXTRACTION_RE.findall(self.string)
        ])


def _test():
    import doctest
    doctest.testmod()

if __name__ == "__main__":
    _test()

# def cast_input_to_soup(fn):
#     """
#     Decorate function to handle strings as BeautifulSoups transparently
#     """
#     def stringy_version(input, *args, **kwargs) :
#         if not isinstance(input, BeautifulSoup) :
#             input = BeautifulSoup(input)
#         return fn(input, *args, **kwargs)
#     return stringy_version
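
# A minimal end-to-end sketch of the Htmlator flow (make_links is still
# unimplemented, so it is switched off here; the input is illustrative only,
# and the expected output assumes the bug fixes noted in the class above):
#
#   h = Htmlator(convert_newlines=True, make_links=False)
#   h("one & two\nthree")
#   # expected: u'<p>one &amp; two</p><p>three</p>'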