app/htmlsanitizer/HtmlSanitizer.py
changeset 2324 9698749e2375
child 2555 b7f14c803619
equal deleted inserted replaced
2323:b3daada52dd3 2324:9698749e2375
       
     1 # -*- coding: UTF-8 -*-
       
     2 """
       
     3 some input filters, for regularising the html fragments from screen scraping and 
       
     4 browser-based editors into some semblance of sanity
       
     5 
       
     6 TODO: turn the messy setting[method_name]=True filter syntax into a list of cleaning methods to invoke, so that they can be invoked in a specific order and multiple times.
       
     7 
       
     8 AUTHORS:
       
     9 Dan MacKinlay - https://launchpad.net/~dan-possumpalace
       
    10 Collin Grady - http://launchpad.net/~collin-collingrady
       
    11 Andreas Gustafsson - https://bugs.launchpad.net/~gson
       
    12 Håkan W - https://launchpad.net/~hwaara-gmail
       
    13 """
       
    14 
       
    15 import BeautifulSoup
       
    16 import re
       
    17 import sys
       
    18 
       
    19 # Python 2.4 compatibility
       
    20 try: any
       
    21 except NameError:
       
    22     def any(iterable):
       
    23         for element in iterable:
       
    24             if element:
       
    25                 return True
       
    26         return False
       
    27 
       
    28 """
       
    29 html5lib compatibility. Basically, we need to know that this still works whether html5lib
       
    30 is imported or not. Should run complete suites of tests for both possible configs -
       
    31 or test in virtual environments, but for now a basic sanity check will do.
       
    32 >>> if html5:
       
    33 >>>     c=Cleaner(html5=False)
       
    34 >>>     c(u'<p>foo</p>)
       
    35 u'<p>foo</p>'
       
    36 """
       
    37 try:
       
    38     import html5lib
       
    39     from html5lib import sanitizer, treebuilders
       
    40     parser = html5lib.HTMLParser(
       
    41         tree=treebuilders.getTreeBuilder("beautifulsoup"),
       
    42         tokenizer=sanitizer.HTMLSanitizer
       
    43     )
       
    44     html5 = True
       
    45 except ImportError:
       
    46     html5 = False
       
    47 
       
    48 ANTI_JS_RE=re.compile('j\s*a\s*v\s*a\s*s\s*c\s*r\s*i\s*p\s*t\s*:', re.IGNORECASE)
       
    49 #These tags and attrs are sufficently liberal to let microformats through...
       
    50 #it ruthlessly culls all the rdf, dublin core metadata and so on.
       
    51 valid_tags = dict.fromkeys('p i em strong b u a h1 h2 h3 pre abbr br img dd dt ol ul li span sub sup ins del blockquote table tr td th address cite'.split()) #div?
       
    52 valid_attrs = dict.fromkeys('href src rel title'.split())
       
    53 valid_schemes = dict.fromkeys('http https'.split())
       
    54 elem_map = {'b' : 'strong', 'i': 'em'}
       
    55 attrs_considered_links = dict.fromkeys("src href".split()) #should include
       
    56 #courtesy http://developer.mozilla.org/en/docs/HTML:Block-level_elements
       
    57 block_elements = dict.fromkeys(["p", "h1","h2", "h3", "h4", "h5", "h6", "ol", "ul", "pre", "address", "blockquote", "dl", "div", "fieldset", "form", "hr", "noscript", "table"])
       
    58 
       
    59 #convenient default filter lists.
       
    60 paranoid_filters = ["strip_comments", "strip_tags", "strip_attrs",
       
    61   "strip_schemes", "rename_tags", "wrap_string", "strip_empty_tags", "strip_empty_tags", ]
       
    62 complete_filters = ["strip_comments", "rename_tags", "strip_tags", "strip_attrs",
       
    63     "strip_cdata", "strip_schemes",  "wrap_string", "strip_empty_tags", "rebase_links", "reparse"]
       
    64 
       
    65 #set some conservative default string processings
       
    66 default_settings = {
       
    67     "filters" : paranoid_filters,
       
    68     "block_elements" : block_elements, #xml or None for a more liberal version
       
    69     "convert_entities" : "html", #xml or None for a more liberal version
       
    70     "valid_tags" : valid_tags,
       
    71     "valid_attrs" : valid_attrs,
       
    72     "valid_schemes" : valid_schemes,
       
    73     "attrs_considered_links" : attrs_considered_links,
       
    74     "elem_map" : elem_map,
       
    75     "wrapping_element" : "p",
       
    76     "auto_clean" : False,
       
    77     "original_url" : "",
       
    78     "new_url" : "",
       
    79     "html5" : html5
       
    80 }
       
    81 #processes I'd like but haven't implemented            
       
    82 #"encode_xml_specials", "ensure complete xhtml doc", "ensure_xhtml_fragment_only"
       
    83 # and some handling of permitted namespaces for tags. for RDF, say. maybe.
       
    84 
       
    85 XML_ENTITIES = { u"'" : u"&apos;",
       
    86                  u'"' : u"&quot;",
       
    87                  u"&" : u"&amp;",
       
    88                  u"<" : u"&lt;",
       
    89                  u">" : u"&gt;"
       
    90                }
       
    91 LINE_EXTRACTION_RE = re.compile(".+", re.MULTILINE)
       
    92 BR_EXTRACTION_RE = re.compile("</?br ?/?>", re.MULTILINE)
       
    93 
       
    94 class Stop:
       
    95     """
       
    96     handy class that we use as a stop input for our state machine in lieu of falling
       
    97     off the end of lists
       
    98     """
       
    99     pass
       
   100 
       
   101 
       
class Cleaner(object):
    r"""
    powerful and slow arbitrary HTML sanitisation. can deal (i hope) with most XSS
    vectors and layout-breaking badness.
    Probably overkill for content from trusted sources; defaults are accordingly
    set to be paranoid.

    Typical use: assign markup to ``string`` and call ``clean()``; reading
    ``string`` back gives the rendered unicode result. The cleaning steps are
    the method names listed in ``settings['filters']``, run in that order.

    >>> bad_html = '<p style="forbidden markup"><!-- XSS attach -->content</p'
    >>> good_html = u'<p>content</p>'
    >>> c = Cleaner()
    >>> c.string = bad_html
    >>> c.clean()
    >>> c.string == good_html
    True
    
    Also supports shorthand syntax:
    >>> c = Cleaner()
    >>> c(bad_html) == c(good_html)
    True
    """
       
   121     
       
   122     def __init__(self, string_or_soup="", *args,  **kwargs):
       
   123         self.settings=default_settings.copy()
       
   124         self.settings.update(kwargs)
       
   125         if args :
       
   126             self.settings['filters'] = args
       
   127         super(Cleaner, self).__init__(string_or_soup, *args, **kwargs)
       
   128         self.string = string_or_soup
       
   129     
       
   130     def __call__(self, string = None, **kwargs):
       
   131         """
       
   132         convenience method allowing one-step calling of an instance and returning
       
   133         a cleaned string.
       
   134         
       
   135         TODO: make this method preserve internal state- perhaps by creating a new
       
   136         instance.
       
   137         
       
   138         >>> s = 'input string'
       
   139         >>> c1 = Cleaner(s, auto_clean=True)
       
   140         >>> c2 = Cleaner("")
       
   141         >>> c1.string == c2(s)
       
   142         True
       
   143         
       
   144         """
       
   145         self.settings.update(kwargs)
       
   146         if not string == None :
       
   147             self.string = string
       
   148         self.clean()
       
   149         return self.string
       
   150     
       
   151     def _set_contents(self, string_or_soup):
       
   152         if isinstance(string_or_soup, BeautifulSoup.BeautifulSoup) :
       
   153             self._set_soup(string_or_soup)
       
   154         else :
       
   155             self._set_string(string_or_soup)
       
   156     
       
   157     def _set_string(self, html_fragment_string):
       
   158         if self.settings['html5']:
       
   159             s = parser.parse(html_fragment_string).body
       
   160         else:
       
   161             s = BeautifulSoup.BeautifulSoup(
       
   162                     html_fragment_string,
       
   163                     convertEntities=self.settings['convert_entities'])
       
   164         self._set_soup(s)
       
   165         
       
   166     def _set_soup(self, soup):
       
   167         """
       
   168         Does all the work of set_string, but bypasses a potential autoclean to avoid 
       
   169         loops upon internal string setting ops.
       
   170         """
       
   171         self._soup = BeautifulSoup.BeautifulSoup(
       
   172             '<rootrootroot></rootrootroot>'
       
   173         )
       
   174         self.root=self._soup.contents[0]
       
   175         
       
   176         if len(soup.contents) :
       
   177             backwards_soup = [i for i in soup.contents]
       
   178             backwards_soup.reverse()
       
   179         else :
       
   180             backwards_soup = []
       
   181         for i in backwards_soup :
       
   182             i.extract()
       
   183             self.root.insert(0, i)
       
   184     
       
   185     def set_string(self, string) :
       
   186         ur"""
       
   187             sets the string to process and does the necessary input encoding too
       
   188         really intended to be invoked as a property.
       
   189         note the godawful rootrootroot element which we need because the
       
   190         BeautifulSoup object has all the same methods as a Tag, but
       
   191         behaves differently, silently failing on some inserts and appends
       
   192         
       
   193         >>> c = Cleaner(convert_entities="html")
       
   194         >>> c.string = '&eacute;'
       
   195         >>> c.string
       
   196         u'\xe9'
       
   197         >>> c = Cleaner(convert_entities="xml")
       
   198         >>> c.string = u'&eacute;'
       
   199         >>> c.string
       
   200         u'&eacute;'
       
   201         """
       
   202         self._set_string(string)
       
   203         if len(string) and self.settings['auto_clean'] : self.clean()
       
   204         
       
   205     def get_string(self):
       
   206         return unicode(self.root.renderContents())
       
   207     
       
   208     string = property(get_string, set_string)
       
   209     
       
   210     def clean(self):
       
   211         """
       
   212         invoke all cleaning processes stipulated in the settings
       
   213         """
       
   214         for method in self.settings['filters'] :
       
   215             try :
       
   216                 getattr(self, method)()
       
   217             except NotImplementedError :
       
   218                 sys.stderr.write('Warning, called unimplemented method %s' % method + '\n')
       
   219     
       
   220     def strip_comments(self):
       
   221         r"""
       
   222         XHTML comments are used as an XSS attack vector. they must die.
       
   223         
       
   224         >>> c = Cleaner("", "strip_comments")
       
   225         >>> c('<p>text<!-- comment --> More text</p>')
       
   226         u'<p>text More text</p>'
       
   227         """
       
   228         for comment in self.root.findAll(
       
   229             text = lambda text: isinstance(text, BeautifulSoup.Comment)):
       
   230             comment.extract()
       
   231             
       
   232     def strip_cdata(self):
       
   233         for cdata in self.root.findAll(
       
   234           text = lambda text: isinstance(text, BeautifulSoup.CData)):
       
   235             cdata.extract()
       
   236     
       
   237     def strip_tags(self):
       
   238         r"""
       
   239         ill-considered tags break our layout. they must die.
       
   240         >>> c = Cleaner("", "strip_tags", auto_clean=True)
       
   241         >>> c.string = '<div>A <strong>B C</strong></div>'
       
   242         >>> c.string
       
   243         u'A <strong>B C</strong>'
       
   244         >>> c.string = '<div>A <div>B C</div></div>'
       
   245         >>> c.string
       
   246         u'A B C'
       
   247         >>> c.string = '<div>A <br /><div>B C</div></div>'
       
   248         >>> c.string
       
   249         u'A <br />B C'
       
   250         >>> c.string = '<p>A <div>B C</div></p>'
       
   251         >>> c.string
       
   252         u'<p>A B C</p>'
       
   253         >>> c.string = 'A<div>B<div>C<div>D</div>E</div>F</div>G'
       
   254         >>> c.string
       
   255         u'ABCDEFG'
       
   256         >>> c.string = '<div>B<div>C<div>D</div>E</div>F</div>'
       
   257         >>> c.string
       
   258         u'BCDEF'
       
   259         """
       
   260         # Beautiful Soup doesn't support dynamic .findAll results when the tree is
       
   261         # modified in place.
       
   262         # going backwards doesn't seem to help.
       
   263         # so find one at a time
       
   264         while True :
       
   265             next_bad_tag = self.root.find(
       
   266               lambda tag : not tag.name in (self.settings['valid_tags'])
       
   267             )
       
   268             if next_bad_tag :                
       
   269                 self.disgorge_elem(next_bad_tag)
       
   270             else:
       
   271                 break
       
   272     
       
   273     def strip_attrs(self):
       
   274         """
       
   275         preserve only those attributes we need in the soup
       
   276         >>> c = Cleaner("", "strip_attrs")
       
   277         >>> c('<div title="v" bad="v">A <strong title="v" bad="v">B C</strong></div>')
       
   278         u'<div title="v">A <strong title="v">B C</strong></div>'
       
   279         """
       
   280         for tag in self.root.findAll(True):
       
   281             tag.attrs = [(attr, val) for attr, val in tag.attrs
       
   282                          if attr in self.settings['valid_attrs']]
       
   283     
       
   284     def _all_links(self):
       
   285         """
       
   286         finds all tags with link attributes sequentially. safe against modification
       
   287         of said attributes in-place.
       
   288         """
       
   289         start = self.root
       
   290         while True: 
       
   291             tag = start.findNext(
       
   292               lambda tag : any(
       
   293                 [(tag.get(i) for i in self.settings['attrs_considered_links'])]
       
   294               ))
       
   295             if tag: 
       
   296                 start = tag
       
   297                 yield tag
       
   298             else :
       
   299                 break
       
   300             
       
   301     def strip_schemes(self):
       
   302         """
       
   303         >>> c = Cleaner("", "strip_schemes")
       
   304         >>> c('<img src="javascript:alert();" />')
       
   305         u'<img />'
       
   306         >>> c('<a href="javascript:alert();">foo</a>')
       
   307         u'<a>foo</a>'
       
   308         """
       
   309         for tag in self._all_links() :
       
   310             for key in self.settings['attrs_considered_links'] :
       
   311                 scheme_bits = tag.get(key, u"").split(u':',1)
       
   312                 if len(scheme_bits) == 1 : 
       
   313                     pass #relative link
       
   314                 else:
       
   315 		    if not scheme_bits[0] in self.settings['valid_schemes'] :
       
   316 			del(tag[key])
       
   317     
       
   318     def br_to_p(self):
       
   319         """
       
   320         >>> c = Cleaner("", "br_to_p")
       
   321         >>> c('<p>A<br />B</p>')
       
   322         u'<p>A</p><p>B</p>'
       
   323         >>> c('A<br />B')
       
   324         u'<p>A</p><p>B</p>'
       
   325         """
       
   326         block_elems = self.settings['block_elements']
       
   327         block_elems['br'] = None
       
   328         block_elems['p'] = None
       
   329         
       
   330         while True :
       
   331             next_br = self.root.find('br')
       
   332             if not next_br: break
       
   333             parent = next_br.parent
       
   334             self.wrap_string('p', start_at=parent, block_elems = block_elems)
       
   335             while True:
       
   336                 useless_br=parent.find('br', recursive=False)
       
   337                 if not useless_br: break
       
   338                 useless_br.extract()        
       
   339             if parent.name == 'p':
       
   340                 self.disgorge_elem(parent)
       
   341     
       
   342     def rename_tags(self):
       
   343         """
       
   344         >>> c = Cleaner("", "rename_tags", elem_map={'i': 'em'})
       
   345         >>> c('<b>A<i>B</i></b>')
       
   346         u'<b>A<em>B</em></b>'
       
   347         """
       
   348         for tag in self.root.findAll(self.settings['elem_map']) :
       
   349             tag.name = self.settings['elem_map'][tag.name]
       
   350         
       
   351     def wrap_string(self, wrapping_element = None, start_at=None, block_elems=None):
       
   352         """
       
   353         takes an html fragment, which may or may not have a single containing element,
       
   354         and guarantees what the tag name of the topmost elements are.
       
   355         TODO: is there some simpler way than a state machine to do this simple thing?
       
   356         >>> c = Cleaner("", "wrap_string")
       
   357         >>> c('A <strong>B C</strong>D')
       
   358         u'<p>A <strong>B C</strong>D</p>'
       
   359         >>> c('A <p>B C</p>D')
       
   360         u'<p>A </p><p>B C</p><p>D</p>'
       
   361         """
       
   362         if not start_at : start_at = self.root
       
   363         if not block_elems : block_elems = self.settings['block_elements']
       
   364         e = (wrapping_element or self.settings['wrapping_element'])
       
   365         paragraph_list = []
       
   366         children = [elem for elem in start_at.contents]
       
   367         children.append(Stop())
       
   368         
       
   369         last_state = 'block'
       
   370         paragraph = BeautifulSoup.Tag(self._soup, e)
       
   371         
       
   372         for node in children :
       
   373             if isinstance(node, Stop) :
       
   374                 state = 'end'
       
   375             elif hasattr(node, 'name') and node.name in block_elems:
       
   376                 state = 'block'
       
   377             else:
       
   378                 state = 'inline'
       
   379                 
       
   380             if last_state == 'block' and state == 'inline':
       
   381                 #collate inline elements
       
   382                 paragraph = BeautifulSoup.Tag(self._soup, e)
       
   383                 
       
   384             if state == 'inline' :
       
   385                 paragraph.append(node)
       
   386                 
       
   387             if ((state <> 'inline') and last_state == 'inline') :
       
   388                 paragraph_list.append(paragraph)
       
   389                 
       
   390             if state == 'block' :
       
   391                 paragraph_list.append(node)
       
   392             
       
   393             last_state = state
       
   394         
       
   395         #can't use append since it doesn't work on empty elements...
       
   396         paragraph_list.reverse()
       
   397         for paragraph in paragraph_list:
       
   398             start_at.insert(0, paragraph)
       
   399         
       
   400     def strip_empty_tags(self):
       
   401         """
       
   402         strip out all empty tags
       
   403         TODO: depth-first search
       
   404         >>> c = Cleaner("", "strip_empty_tags")
       
   405         >>> c('<p>A</p><p></p><p>B</p><p></p>')
       
   406         u'<p>A</p><p>B</p>'
       
   407         >>> c('<p><a></a></p>')
       
   408         u'<p></p>'
       
   409         """
       
   410         tag = self.root
       
   411         while True:
       
   412             next_tag = tag.findNext(True)
       
   413             if not next_tag: break
       
   414             if next_tag.contents or next_tag.attrs:
       
   415                 tag = next_tag
       
   416                 continue
       
   417             next_tag.extract()
       
   418         
       
   419     def rebase_links(self, original_url="", new_url ="") :
       
   420         if not original_url : original_url = self.settings.get('original_url', '')
       
   421         if not new_url : new_url = self.settings.get('new_url', '')
       
   422         raise NotImplementedError
       
   423     
       
   424     # Because of its internal character set handling,
       
   425     # the following will not work in Beautiful soup and is hopefully redundant.
       
   426     # def encode_xml_specials(self, original_url="", new_url ="") :
       
   427     #     """
       
   428     #     BeautifulSoup will let some dangerous xml entities hang around
       
   429     #     in the navigable strings. destroy all monsters.
       
   430     #     >>> c = Cleaner(auto_clean=True, encode_xml_specials=True)
       
   431     #     >>> c('<<<<<')
       
   432     #     u'&lt;&lt;&lt;&lt;'
       
   433     #     """
       
   434     #     for string in self.root.findAll(text=True) :
       
   435     #         sys.stderr.write("root" +"\n")
       
   436     #         sys.stderr.write(str(self.root) +"\n")
       
   437     #         sys.stderr.write("parent" +"\n")
       
   438     #         sys.stderr.write(str(string.parent) +"\n")
       
   439     #         new_string = unicode(string)
       
   440     #         sys.stderr.write(string +"\n")
       
   441     #         for special_char in XML_ENTITIES.keys() :
       
   442     #             sys.stderr.write(special_char +"\n")
       
   443     #         string.replaceWith(
       
   444     #           new_string.replace(special_char, XML_ENTITIES[special_char])
       
   445     #         )
       
   446         
       
   447         
       
   448     def disgorge_elem(self, elem):
       
   449         """
       
   450         remove the given element from the soup and replaces it with its own contents
       
   451         actually tricky, since you can't replace an element with an list of elements
       
   452         using replaceWith
       
   453         >>> disgorgeable_string = '<body>A <em>B</em> C</body>'
       
   454         >>> c = Cleaner()
       
   455         >>> c.string = disgorgeable_string
       
   456         >>> elem = c._soup.find('em')
       
   457         >>> c.disgorge_elem(elem)
       
   458         >>> c.string
       
   459         u'<body>A B C</body>'
       
   460         >>> c.string = disgorgeable_string
       
   461         >>> elem = c._soup.find('body')
       
   462         >>> c.disgorge_elem(elem)
       
   463         >>> c.string
       
   464         u'A <em>B</em> C'
       
   465         >>> c.string = '<div>A <div id="inner">B C</div></div>'
       
   466         >>> elem = c._soup.find(id="inner")
       
   467         >>> c.disgorge_elem(elem)
       
   468         >>> c.string
       
   469         u'<div>A B C</div>'
       
   470         """
       
   471         if elem == self.root :
       
   472             raise AttributeError, "Can't disgorge root"  
       
   473                       
       
   474         # With in-place modification, BeautifulSoup occasionally can return
       
   475         # elements that think they are orphans
       
   476         # this lib is full of workarounds, but it's worth checking
       
   477         parent = elem.parent
       
   478         if parent == None: 
       
   479             raise AttributeError, "AAAAAAAAGH! NO PARENTS! DEATH!"
       
   480         
       
   481         i = None
       
   482         for i in range(len(parent.contents)) :
       
   483             if parent.contents[i] == elem :
       
   484                 index = i
       
   485                 break
       
   486                 
       
   487         elem.extract()
       
   488         
       
   489         #the proceeding method breaks horribly, sporadically.
       
   490         # for i in range(len(elem.contents)) :
       
   491         #     elem.contents[i].extract()
       
   492         #     parent.contents.insert(index+i, elem.contents[i])
       
   493         # return
       
   494         self._safe_inject(parent, index, elem.contents)
       
   495         
       
   496     def _safe_inject(self, dest, dest_index, node_list):
       
   497         #BeautifulSoup result sets look like lists but don't behave right
       
   498         # i.e. empty ones are still True,
       
   499         if not len(node_list) : return
       
   500         node_list = [i for i in node_list]
       
   501         node_list.reverse()
       
   502         for i in node_list :
       
   503             dest.insert(dest_index, i)
       
   504 
       
   505         
       
   506 class Htmlator(object) :
       
   507     """
       
   508     converts a string into a series of html paragraphs
       
   509     """
       
   510     settings = {
       
   511         "encode_xml_specials" : True,
       
   512         "is_plaintext" : True,
       
   513         "convert_newlines" : False,
       
   514         "make_links" : True,
       
   515         "auto_convert" : False,
       
   516         "valid_schemes" : valid_schemes,
       
   517     }
       
   518     def __init__(self, string = "",  **kwargs):
       
   519         self.settings.update(kwargs)
       
   520         super(Htmlator, self).__init__(string, **kwargs)
       
   521         self.string = string
       
   522     
       
   523     def _set_string(self, string):
       
   524         self.string = string
       
   525         if self.settings['auto_convert'] : self.convert()
       
   526         
       
   527     def _get_string(self):
       
   528         return unicode(self._soup)
       
   529     
       
   530     string = property(_get_string, _set_string)
       
   531     
       
   532     def __call__(self, string):
       
   533         """
       
   534         convenience method supporting one-step calling of an instance
       
   535         as a string cleaning function
       
   536         """
       
   537         self.string = string
       
   538         self.convert()
       
   539         return self.string
       
   540         
       
   541     def convert(self):
       
   542         for method in ["encode_xml_specials", "convert_newlines",
       
   543           "make_links"] :
       
   544             if self.settings(method) :
       
   545                 getattr(self, method)()
       
   546     
       
   547     def encode_xml_specials(self) :
       
   548         for char in XML_ENTITIES.keys() :
       
   549             self.string.replace(char, XML_ENTITIES[char])
       
   550         
       
   551     def make_links(self):
       
   552         raise NotImplementedError
       
   553         
       
   554     def convert_newlines(self) :
       
   555         self.string = ''.join([
       
   556             '<p>' + line + '</p>' for line in LINE_EXTRACTION_RE.findall(self.string)
       
   557         ])
       
   558         
       
   559 def _test():
       
   560     import doctest
       
   561     doctest.testmod()
       
   562 
       
   563 if __name__ == "__main__":
       
   564     _test()
       
   565 
       
   566 
       
   567 # def cast_input_to_soup(fn):
       
   568 #     """
       
   569 #     Decorate function to handle strings as BeautifulSoups transparently
       
   570 #     """
       
   571 #     def stringy_version(input, *args, **kwargs) :
       
   572 #         if not isinstance(input,BeautifulSoup) :
       
   573 #             input=BeautifulSoup(input)
       
   574 #         return fn(input, *args, **kwargs)
       
   575 #     return stringy_version