# HG changeset patch
# User Sverre Rabbelier
# Date 1235253415 0
# Node ID bcbbcb72429dcd4f79940c36b447e64357b5760e
# Parent  c2e09f7d62d9509d4e1925da5ae7ee29828e34f1
Use feedparser to sanitize HTML content for documents

Patch by: Sverre Rabbelier

diff -r c2e09f7d62d9 -r bcbbcb72429d app/soc/logic/cleaning.py
--- a/app/soc/logic/cleaning.py	Sat Feb 21 21:49:46 2009 +0000
+++ b/app/soc/logic/cleaning.py	Sat Feb 21 21:56:55 2009 +0000
@@ -24,6 +24,8 @@
   ]
 
 
+import feedparser
+
 from google.appengine.api import users
 
 from django import forms
@@ -46,7 +48,6 @@
 DEF_ORGANZIATION_NOT_ACTIVE_MSG = ugettext(
     'This organization is not active/existent')
 
-
 def check_field_is_empty(field_name):
   """Returns decorator that bypasses cleaning for empty fields.
   """
@@ -265,6 +266,16 @@
   return feed_url
 
 
+def clean_document_content(self):
+  content = self.cleaned_data.get('content')
+
+  sanitizer = feedparser._HTMLSanitizer('utf-8')
+  sanitizer.feed(content)
+  content = sanitizer.output()
+  content = content.strip().replace('\r\n', '\n')
+
+  return content
+
 def clean_url(field_name):
   """Clean method for cleaning a field belonging to a LinkProperty.
   """
diff -r c2e09f7d62d9 -r bcbbcb72429d app/soc/views/models/document.py
--- a/app/soc/views/models/document.py	Sat Feb 21 21:49:46 2009 +0000
+++ b/app/soc/views/models/document.py	Sat Feb 21 21:56:55 2009 +0000
@@ -92,6 +92,7 @@
 
         'prefix': forms.fields.CharField(widget=helper.widgets.ReadOnlyInput(),
                                          required=True),
+        'clean_content': cleaning.clean_document_content,
         'clean_link_id': cleaning.clean_link_id('link_id'),
         'clean_scope_path': cleaning.clean_scope_path('scope_path'),
         'clean': cleaning.validate_document_acl(self),