Use feedparser to sanitize HTML content for documents
author Sverre Rabbelier <srabbelier@gmail.com>
Sat, 21 Feb 2009 21:56:55 +0000
changeset 1446 bcbbcb72429d
parent 1445 c2e09f7d62d9
child 1447 5e729070dc60
Use feedparser to sanitize HTML content for documents. Patch by: Sverre Rabbelier
app/soc/logic/cleaning.py
app/soc/views/models/document.py
--- a/app/soc/logic/cleaning.py	Sat Feb 21 21:49:46 2009 +0000
+++ b/app/soc/logic/cleaning.py	Sat Feb 21 21:56:55 2009 +0000
@@ -24,6 +24,8 @@
     ]
 
 
+import feedparser
+
 from google.appengine.api import users
 
 from django import forms
@@ -46,7 +48,6 @@
 DEF_ORGANZIATION_NOT_ACTIVE_MSG = ugettext(
     'This organization is not active/existent')
 
-
 def check_field_is_empty(field_name):
   """Returns decorator that bypasses cleaning for empty fields.
   """
@@ -265,6 +266,16 @@
   return feed_url
 
 
+def clean_document_content(self):
+  content = self.cleaned_data.get('content')
+
+  sanitizer = feedparser._HTMLSanitizer('utf-8')
+  sanitizer.feed(content)
+  content = sanitizer.output()
+  content = content.strip().replace('\r\n', '\n')
+
+  return content
+
 def clean_url(field_name):
   """Clean method for cleaning a field belonging to a LinkProperty.
   """
--- a/app/soc/views/models/document.py	Sat Feb 21 21:49:46 2009 +0000
+++ b/app/soc/views/models/document.py	Sat Feb 21 21:56:55 2009 +0000
@@ -92,6 +92,7 @@
         'prefix': forms.fields.CharField(widget=helper.widgets.ReadOnlyInput(),
                                         required=True),
 
+        'clean_content': cleaning.clean_document_content,
         'clean_link_id': cleaning.clean_link_id('link_id'),
         'clean_scope_path': cleaning.clean_scope_path('scope_path'),
         'clean': cleaning.validate_document_acl(self),