Use feedparser to sanitize HTML content for documents
Patch by: Sverre Rabbelier
--- a/app/soc/logic/cleaning.py Sat Feb 21 21:49:46 2009 +0000
+++ b/app/soc/logic/cleaning.py Sat Feb 21 21:56:55 2009 +0000
@@ -24,6 +24,8 @@
]
+import feedparser
+
from google.appengine.api import users
from django import forms
@@ -46,7 +48,6 @@
DEF_ORGANZIATION_NOT_ACTIVE_MSG = ugettext(
'This organization is not active/existent')
-
def check_field_is_empty(field_name):
"""Returns decorator that bypasses cleaning for empty fields.
"""
@@ -265,6 +266,31 @@
return feed_url
+def clean_document_content(self):
+  """Clean method for sanitizing the HTML content of a document.
+
+  Runs the submitted 'content' value through feedparser's HTML sanitizer,
+  strips surrounding whitespace and normalizes CRLF line endings to LF.
+
+  Returns:
+    the sanitized content, or the value unchanged when it is empty
+  """
+
+  content = self.cleaned_data.get('content')
+
+  # nothing to sanitize; this also avoids feeding None to the sanitizer
+  if not content:
+    return content
+
+  # NOTE: _HTMLSanitizer is a private feedparser class, so it may change
+  # between feedparser releases.
+  sanitizer = feedparser._HTMLSanitizer('utf-8')
+  sanitizer.feed(content)
+  content = sanitizer.output()
+  content = content.strip().replace('\r\n', '\n')
+
+  return content
+
def clean_url(field_name):
"""Clean method for cleaning a field belonging to a LinkProperty.
"""
--- a/app/soc/views/models/document.py Sat Feb 21 21:49:46 2009 +0000
+++ b/app/soc/views/models/document.py Sat Feb 21 21:56:55 2009 +0000
@@ -92,6 +92,7 @@
'prefix': forms.fields.CharField(widget=helper.widgets.ReadOnlyInput(),
required=True),
+ 'clean_content': cleaning.clean_document_content,
'clean_link_id': cleaning.clean_link_id('link_id'),
'clean_scope_path': cleaning.clean_scope_path('scope_path'),
'clean': cleaning.validate_document_acl(self),