# -*- coding: UTF-8 -*-

"""
some input filters, for regularising the html fragments from screen scraping and
browser-based editors into some semblance of sanity

TODO: turn the messy setting[method_name]=True filter syntax into a list of
cleaning methods to invoke, so that they can be invoked in a specific order and
multiple times.

AUTHORS:
Dan MacKinlay - https://launchpad.net/~dan-possumpalace
Collin Grady - http://launchpad.net/~collin-collingrady
Andreas Gustafsson - https://bugs.launchpad.net/~gson
Håkan W - https://launchpad.net/~hwaara-gmail
"""
|

import BeautifulSoup
import re
import sys

# Python 2.4 compatibility
try: any
except NameError:
    def any(iterable):
        for element in iterable:
            if element:
                return True
        return False

|
28 """ |
|
29 html5lib compatibility. Basically, we need to know that this still works whether html5lib |
|
30 is imported or not. Should run complete suites of tests for both possible configs - |
|
31 or test in virtual environments, but for now a basic sanity check will do. |
|
32 >>> if html5: |
|
33 >>> c=Cleaner(html5=False) |
|
34 >>> c(u'<p>foo</p>) |
|
35 u'<p>foo</p>' |
|
36 """ |
|
37 try: |
|
38 import html5lib |
|
39 from html5lib import sanitizer, treebuilders |
|
40 parser = html5lib.HTMLParser( |
|
41 tree=treebuilders.getTreeBuilder("beautifulsoup"), |
|
42 tokenizer=sanitizer.HTMLSanitizer |
|
43 ) |
|
44 html5 = True |
|
45 except ImportError: |
|
46 html5 = False |
|
47 |
|
#nb: compiled here but not currently used by any of the filters below
ANTI_JS_RE = re.compile(r'j\s*a\s*v\s*a\s*s\s*c\s*r\s*i\s*p\s*t\s*:', re.IGNORECASE)
#These tags and attrs are sufficiently liberal to let microformats through...
#it ruthlessly culls all the rdf, dublin core metadata and so on.
valid_tags = dict.fromkeys('p i em strong b u a h1 h2 h3 pre abbr br img dd dt ol ul li span sub sup ins del blockquote table tr td th address cite'.split()) #div?
valid_attrs = dict.fromkeys('href src rel title'.split())
valid_schemes = dict.fromkeys('http https'.split())
elem_map = {'b' : 'strong', 'i': 'em'}
attrs_considered_links = dict.fromkeys("src href".split()) #should include
#courtesy http://developer.mozilla.org/en/docs/HTML:Block-level_elements
block_elements = dict.fromkeys(["p", "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "pre", "address", "blockquote", "dl", "div", "fieldset", "form", "hr", "noscript", "table"])
|

#convenient default filter lists.
#nb: "strip_empty_tags" appears twice; presumably deliberate, since a single
#pass only removes one level of nested empty tags (see strip_empty_tags's TODO).
paranoid_filters = ["strip_comments", "strip_tags", "strip_attrs",
    "strip_schemes", "rename_tags", "wrap_string", "strip_empty_tags", "strip_empty_tags", ]
complete_filters = ["strip_comments", "rename_tags", "strip_tags", "strip_attrs",
    "strip_cdata", "strip_schemes", "wrap_string", "strip_empty_tags", "rebase_links", "reparse"]
|

#set some conservative default string processings
default_settings = {
    "filters" : paranoid_filters,
    "block_elements" : block_elements,
    "convert_entities" : "html", #"xml" or None for a more liberal version
    "valid_tags" : valid_tags,
    "valid_attrs" : valid_attrs,
    "valid_schemes" : valid_schemes,
    "attrs_considered_links" : attrs_considered_links,
    "elem_map" : elem_map,
    "wrapping_element" : "p",
    "auto_clean" : False,
    "original_url" : "",
    "new_url" : "",
    "html5" : html5
}
#processes I'd like but haven't implemented
#"encode_xml_specials", "ensure complete xhtml doc", "ensure_xhtml_fragment_only"
# and some handling of permitted namespaces for tags. for RDF, say. maybe.
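
# A quick illustration of the settings machinery (the literals below are
# illustrative assumptions, not fixtures shipped with this module):
# positional args replace the default filter list, keyword args override
# individual settings, and filters run in the order given.
#
#   c = Cleaner("", "strip_comments", "strip_tags",
#               valid_tags=dict.fromkeys(['p']))
#   c('<p>A<!-- B --><em>C</em></p>')   # -> u'<p>AC</p>'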
|

XML_ENTITIES = { u"'" : u"&apos;",
                 u'"' : u"&quot;",
                 u"&" : u"&amp;",
                 u"<" : u"&lt;",
                 u">" : u"&gt;"
                 }
LINE_EXTRACTION_RE = re.compile(".+", re.MULTILINE)
BR_EXTRACTION_RE = re.compile("</?br ?/?>", re.MULTILINE)
|

class Stop:
    """
    handy class that we use as a stop input for our state machine in lieu of falling
    off the end of lists
    """
    pass


class Cleaner(object):
    r"""
    powerful and slow arbitrary HTML sanitisation. can deal (i hope) with most XSS
    vectors and layout-breaking badness.
    Probably overkill for content from trusted sources; defaults are accordingly
    set to be paranoid.
    >>> bad_html = '<p style="forbidden markup"><!-- XSS attack -->content</p'
    >>> good_html = u'<p>content</p>'
    >>> c = Cleaner()
    >>> c.string = bad_html
    >>> c.clean()
    >>> c.string == good_html
    True

    Also supports shorthand syntax:
    >>> c = Cleaner()
    >>> c(bad_html) == c(good_html)
    True
    """
|

    def __init__(self, string_or_soup="", *args, **kwargs):
        self.settings = default_settings.copy()
        self.settings.update(kwargs)
        if args :
            self.settings['filters'] = args
        # object.__init__ takes no extra arguments; forwarding them breaks
        # under newer Python 2.x
        super(Cleaner, self).__init__()
        self.string = string_or_soup
|

    def __call__(self, string=None, **kwargs):
        """
        convenience method allowing one-step calling of an instance and returning
        a cleaned string.

        TODO: make this method preserve internal state - perhaps by creating a new
        instance.

        >>> s = 'input string'
        >>> c1 = Cleaner(s, auto_clean=True)
        >>> c2 = Cleaner("")
        >>> c1.string == c2(s)
        True

        """
        self.settings.update(kwargs)
        if string is not None :
            self.string = string
        self.clean()
        return self.string
|

    def _set_contents(self, string_or_soup):
        if isinstance(string_or_soup, BeautifulSoup.BeautifulSoup) :
            self._set_soup(string_or_soup)
        else :
            self._set_string(string_or_soup)

    def _set_string(self, html_fragment_string):
        if self.settings['html5']:
            s = parser.parse(html_fragment_string).body
        else:
            s = BeautifulSoup.BeautifulSoup(
                html_fragment_string,
                convertEntities=self.settings['convert_entities'])
        self._set_soup(s)

|
    def _set_soup(self, soup):
        """
        Does all the work of set_string, but bypasses a potential autoclean to avoid
        loops upon internal string setting ops.
        """
        self._soup = BeautifulSoup.BeautifulSoup(
            '<rootrootroot></rootrootroot>'
        )
        self.root = self._soup.contents[0]

        if len(soup.contents) :
            backwards_soup = [i for i in soup.contents]
            backwards_soup.reverse()
        else :
            backwards_soup = []
        # move the children across in reverse, so that insert(0, i) leaves
        # them in their original order
        for i in backwards_soup :
            i.extract()
            self.root.insert(0, i)

|
    def set_string(self, string) :
        ur"""
        sets the string to process and does the necessary input encoding too.
        really intended to be invoked as a property.
        note the godawful rootrootroot element which we need because the
        BeautifulSoup object has all the same methods as a Tag, but
        behaves differently, silently failing on some inserts and appends

        >>> c = Cleaner(convert_entities="html")
        >>> c.string = '&eacute;'
        >>> c.string
        u'\xe9'
        >>> c = Cleaner(convert_entities="xml")
        >>> c.string = u'&eacute;'
        >>> c.string
        u'&eacute;'
        """
        self._set_string(string)
        if len(string) and self.settings['auto_clean'] : self.clean()

    def get_string(self):
        return unicode(self.root.renderContents())

    string = property(get_string, set_string)
|

    def clean(self):
        """
        invoke all cleaning processes stipulated in the settings
        """
        for method in self.settings['filters'] :
            try :
                getattr(self, method)()
            except NotImplementedError :
                sys.stderr.write('Warning: called unimplemented method %s\n' % method)
|

    def strip_comments(self):
        r"""
        XHTML comments are used as an XSS attack vector. they must die.

        >>> c = Cleaner("", "strip_comments")
        >>> c('<p>text<!-- comment --> More text</p>')
        u'<p>text More text</p>'
        """
        for comment in self.root.findAll(
            text=lambda text: isinstance(text, BeautifulSoup.Comment)):
            comment.extract()

    def strip_cdata(self):
        for cdata in self.root.findAll(
            text=lambda text: isinstance(text, BeautifulSoup.CData)):
            cdata.extract()

|
    def strip_tags(self):
        r"""
        ill-considered tags break our layout. they must die.
        >>> c = Cleaner("", "strip_tags", auto_clean=True)
        >>> c.string = '<div>A <strong>B C</strong></div>'
        >>> c.string
        u'A <strong>B C</strong>'
        >>> c.string = '<div>A <div>B C</div></div>'
        >>> c.string
        u'A B C'
        >>> c.string = '<div>A <br /><div>B C</div></div>'
        >>> c.string
        u'A <br />B C'
        >>> c.string = '<p>A <div>B C</div></p>'
        >>> c.string
        u'<p>A B C</p>'
        >>> c.string = 'A<div>B<div>C<div>D</div>E</div>F</div>G'
        >>> c.string
        u'ABCDEFG'
        >>> c.string = '<div>B<div>C<div>D</div>E</div>F</div>'
        >>> c.string
        u'BCDEF'
        """
        # Beautiful Soup doesn't support dynamic .findAll results when the tree is
        # modified in place.
        # going backwards doesn't seem to help.
        # so find one at a time
        while True :
            next_bad_tag = self.root.find(
                lambda tag : tag.name not in self.settings['valid_tags']
            )
            if next_bad_tag :
                self.disgorge_elem(next_bad_tag)
            else:
                break

|
    def strip_attrs(self):
        """
        preserve only those attributes we need in the soup
        >>> c = Cleaner("", "strip_attrs")
        >>> c('<div title="v" bad="v">A <strong title="v" bad="v">B C</strong></div>')
        u'<div title="v">A <strong title="v">B C</strong></div>'
        """
        for tag in self.root.findAll(True):
            tag.attrs = [(attr, val) for attr, val in tag.attrs
                         if attr in self.settings['valid_attrs']]

|
    def _all_links(self):
        """
        finds all tags with link attributes sequentially. safe against modification
        of said attributes in-place.
        """
        start = self.root
        while True:
            # nb: the lambda must pass a plain generator to any(); wrapping it
            # in a one-element list would make the test unconditionally true
            tag = start.findNext(
                lambda tag : any(
                    tag.get(i) for i in self.settings['attrs_considered_links']
                ))
            if tag:
                start = tag
                yield tag
            else :
                break

|
    def strip_schemes(self):
        """
        >>> c = Cleaner("", "strip_schemes")
        >>> c('<img src="javascript:alert();" />')
        u'<img />'
        >>> c('<a href="javascript:alert();">foo</a>')
        u'<a>foo</a>'
        """
        for tag in self._all_links() :
            for key in self.settings['attrs_considered_links'] :
                scheme_bits = tag.get(key, u"").split(u':', 1)
                if len(scheme_bits) == 1 :
                    pass #relative link
                else:
                    if not scheme_bits[0] in self.settings['valid_schemes'] :
                        del(tag[key])

|
    def br_to_p(self):
        """
        >>> c = Cleaner("", "br_to_p")
        >>> c('<p>A<br />B</p>')
        u'<p>A</p><p>B</p>'
        >>> c('A<br />B')
        u'<p>A</p><p>B</p>'
        """
        # work on a copy, so we don't permanently add br/p to the shared
        # block_elements setting
        block_elems = self.settings['block_elements'].copy()
        block_elems['br'] = None
        block_elems['p'] = None

        while True :
            next_br = self.root.find('br')
            if not next_br: break
            parent = next_br.parent
            self.wrap_string('p', start_at=parent, block_elems=block_elems)
            while True:
                useless_br = parent.find('br', recursive=False)
                if not useless_br: break
                useless_br.extract()
            if parent.name == 'p':
                self.disgorge_elem(parent)

|
    def rename_tags(self):
        """
        >>> c = Cleaner("", "rename_tags", elem_map={'i': 'em'})
        >>> c('<b>A<i>B</i></b>')
        u'<b>A<em>B</em></b>'
        """
        for tag in self.root.findAll(self.settings['elem_map']) :
            tag.name = self.settings['elem_map'][tag.name]

|
    def wrap_string(self, wrapping_element=None, start_at=None, block_elems=None):
        """
        takes an html fragment, which may or may not have a single containing element,
        and guarantees the tag name of the topmost elements.
        TODO: is there some simpler way than a state machine to do this simple thing?
        >>> c = Cleaner("", "wrap_string")
        >>> c('A <strong>B C</strong>D')
        u'<p>A <strong>B C</strong>D</p>'
        >>> c('A <p>B C</p>D')
        u'<p>A </p><p>B C</p><p>D</p>'
        """
        if not start_at : start_at = self.root
        if not block_elems : block_elems = self.settings['block_elements']
        e = (wrapping_element or self.settings['wrapping_element'])
        paragraph_list = []
        children = [elem for elem in start_at.contents]
        children.append(Stop())

        last_state = 'block'
        paragraph = BeautifulSoup.Tag(self._soup, e)

        for node in children :
            if isinstance(node, Stop) :
                state = 'end'
            elif hasattr(node, 'name') and node.name in block_elems:
                state = 'block'
            else:
                state = 'inline'

            if last_state == 'block' and state == 'inline':
                #collate inline elements
                paragraph = BeautifulSoup.Tag(self._soup, e)

            if state == 'inline' :
                paragraph.append(node)

            if state != 'inline' and last_state == 'inline' :
                paragraph_list.append(paragraph)

            if state == 'block' :
                paragraph_list.append(node)

            last_state = state

        #can't use append since it doesn't work on empty elements...
        paragraph_list.reverse()
        for paragraph in paragraph_list:
            start_at.insert(0, paragraph)

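    # Walk-through of the state machine above (illustrative, tracing the
    # second doctest): the children of 'A <p>B C</p>D' produce states
    # inline -> block -> inline -> end. Each run of inline nodes is collected
    # into a fresh wrapper Tag, block elements pass through unchanged, and
    # the accumulated list is re-inserted in order, giving
    # u'<p>A </p><p>B C</p><p>D</p>'.
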
|
    def strip_empty_tags(self):
        """
        strip out all empty tags
        TODO: depth-first search
        >>> c = Cleaner("", "strip_empty_tags")
        >>> c('<p>A</p><p></p><p>B</p><p></p>')
        u'<p>A</p><p>B</p>'
        >>> c('<p><a></a></p>')
        u'<p></p>'
        """
        tag = self.root
        while True:
            next_tag = tag.findNext(True)
            if not next_tag: break
            if next_tag.contents or next_tag.attrs:
                tag = next_tag
                continue
            next_tag.extract()

|
    def rebase_links(self, original_url="", new_url="") :
        if not original_url : original_url = self.settings.get('original_url', '')
        if not new_url : new_url = self.settings.get('new_url', '')
        raise NotImplementedError

    # Because of its internal character set handling,
    # the following will not work in Beautiful Soup and is hopefully redundant.
    # def encode_xml_specials(self, original_url="", new_url="") :
    #     """
    #     BeautifulSoup will let some dangerous xml entities hang around
    #     in the navigable strings. destroy all monsters.
    #     >>> c = Cleaner(auto_clean=True, encode_xml_specials=True)
    #     >>> c('<<<<<')
    #     u'&lt;&lt;&lt;&lt;&lt;'
    #     """
    #     for string in self.root.findAll(text=True) :
    #         sys.stderr.write("root" + "\n")
    #         sys.stderr.write(str(self.root) + "\n")
    #         sys.stderr.write("parent" + "\n")
    #         sys.stderr.write(str(string.parent) + "\n")
    #         new_string = unicode(string)
    #         sys.stderr.write(string + "\n")
    #         for special_char in XML_ENTITIES.keys() :
    #             sys.stderr.write(special_char + "\n")
    #             string.replaceWith(
    #                 new_string.replace(special_char, XML_ENTITIES[special_char])
    #             )


|
    def disgorge_elem(self, elem):
        """
        removes the given element from the soup and replaces it with its own contents.
        actually tricky, since you can't replace an element with a list of elements
        using replaceWith
        >>> disgorgeable_string = '<body>A <em>B</em> C</body>'
        >>> c = Cleaner()
        >>> c.string = disgorgeable_string
        >>> elem = c._soup.find('em')
        >>> c.disgorge_elem(elem)
        >>> c.string
        u'<body>A B C</body>'
        >>> c.string = disgorgeable_string
        >>> elem = c._soup.find('body')
        >>> c.disgorge_elem(elem)
        >>> c.string
        u'A <em>B</em> C'
        >>> c.string = '<div>A <div id="inner">B C</div></div>'
        >>> elem = c._soup.find(id="inner")
        >>> c.disgorge_elem(elem)
        >>> c.string
        u'<div>A B C</div>'
        """
        if elem == self.root :
            raise AttributeError("Can't disgorge root")

        # With in-place modification, BeautifulSoup occasionally can return
        # elements that think they are orphans.
        # this lib is full of workarounds, but it's worth checking
        parent = elem.parent
        if parent is None:
            raise AttributeError("AAAAAAAAGH! NO PARENTS! DEATH!")

        index = None
        for i in range(len(parent.contents)) :
            if parent.contents[i] == elem :
                index = i
                break

        elem.extract()

        # the following direct approach breaks horribly, sporadically:
        # for i in range(len(elem.contents)) :
        #     elem.contents[i].extract()
        #     parent.contents.insert(index+i, elem.contents[i])
        # return
        self._safe_inject(parent, index, elem.contents)

    def _safe_inject(self, dest, dest_index, node_list):
        # BeautifulSoup result sets look like lists but don't behave right,
        # i.e. empty ones are still True
        if not len(node_list) : return
        # insert in reverse so that a fixed dest_index preserves order
        node_list = [i for i in node_list]
        node_list.reverse()
        for i in node_list :
            dest.insert(dest_index, i)

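    # Worked example for _safe_inject (hypothetical values): injecting
    # [a, b, c] at dest_index 1 of [x, y] inserts c, then b, then a, all at
    # position 1, leaving [x, a, b, c, y] - the reversal keeps order intact.
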
|

class Htmlator(object) :
    """
    converts a string into a series of html paragraphs
    """
    settings = {
        "encode_xml_specials" : True,
        "is_plaintext" : True,
        "convert_newlines" : False,
        "make_links" : True,
        "auto_convert" : False,
        "valid_schemes" : valid_schemes,
    }
    def __init__(self, string="", **kwargs):
        # copy the class-level defaults so per-instance overrides don't
        # leak into other instances
        self.settings = self.settings.copy()
        self.settings.update(kwargs)
        super(Htmlator, self).__init__()
        self.string = string

|
    def _set_string(self, string):
        # store on a private attribute; assigning to self.string here
        # would recurse through this very property setter
        self._string = string
        if self.settings['auto_convert'] : self.convert()

    def _get_string(self):
        return unicode(self._string)

    string = property(_get_string, _set_string)

|
    def __call__(self, string):
        """
        convenience method supporting one-step calling of an instance
        as a string cleaning function
        """
        self.string = string
        self.convert()
        return self.string

    def convert(self):
        for method in ["encode_xml_specials", "convert_newlines",
                       "make_links"] :
            # settings is a dict and must be indexed, not called
            if self.settings[method] :
                getattr(self, method)()

|
    def encode_xml_specials(self) :
        # escape '&' first, so we don't mangle the entities introduced for
        # the other characters; str.replace returns a new string, so the
        # result must be assigned back
        self._string = self._string.replace(u"&", XML_ENTITIES[u"&"])
        for char in (u"'", u'"', u"<", u">") :
            self._string = self._string.replace(char, XML_ENTITIES[char])

    def make_links(self):
        raise NotImplementedError

    def convert_newlines(self) :
        self._string = ''.join([
            '<p>' + line + '</p>' for line in LINE_EXTRACTION_RE.findall(self._string)
        ])
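
    # Example usage (illustrative; the input literal is an assumption, not a
    # fixture from this module). make_links is unimplemented, so it must be
    # switched off before a full convert():
    #
    #   h = Htmlator("one\ntwo", make_links=False, convert_newlines=True)
    #   h.convert()
    #   h.string   # -> u'<p>one</p><p>two</p>'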
|

def _test():
    import doctest
    doctest.testmod()

if __name__ == "__main__":
    _test()


|
# def cast_input_to_soup(fn):
#     """
#     Decorate function to handle strings as BeautifulSoups transparently
#     """
#     def stringy_version(input, *args, **kwargs) :
#         if not isinstance(input, BeautifulSoup.BeautifulSoup) :
#             input = BeautifulSoup.BeautifulSoup(input)
#         return fn(input, *args, **kwargs)
#     return stringy_version