# Library for DocBook conversion, taken from the docutils sandbox, with my own patches applied.
from lxml import etree
from lxml import html
from lxml.cssselect import CSSSelector
import md5
import sys
# Command-line arguments, without the script name itself.
args = sys.argv[1:]

# django stuff
# The settings module is handed to setup_environ() before the model import
# below; keep this order, since the model module is imported only after the
# environment has been configured.
from django.core.management import setup_environ
import settings # Assumed to be in the same directory.
setup_environ(settings) # ugly django collateral effects :(
from comments.models import Element
# Document identifier; not referenced anywhere else in the visible script.
doc_id = 'MMSC'

# `sel` matches every element that gets an id and a DB record;
# `chapter_sel` matches the chapter container itself.
sel = CSSSelector('div.chapter p, pre, h1, table.equation')
chapter_sel = CSSSelector('div.chapter')

# The HTML file to process is the first command-line argument.
try:
    filename = args[0]
except IndexError:
    raise IndexError("Usage: %s <path-to-html-file>" % __file__)

tree = etree.parse(filename, html.HTMLParser(remove_blank_text=True))
root = tree.getroot()

# Only the first div.chapter is processed. Its original id must contain a
# colon: the second colon-separated field is used as the chapter title.
chapter = chapter_sel(root)[0]
chapter_title = chapter.get('id').split(':')[1]

# The chapter id is replaced by the MD5 hex digest of the original id.
chapter_hash = md5.new(chapter.get('id').encode('utf8')).hexdigest()
chapter.set('id', chapter_hash)

for element in sel(root):
    # Hash source, in order of preference: the element's leading text, its
    # 'alt' attribute, or the serialized element.
    hsh_source = element.text or element.get('alt') or etree.tostring(element)
    if hsh_source:
        hsh_source_encoded = hsh_source.encode('utf8')
        hsh = md5.new(hsh_source_encoded).hexdigest()
        # The same "<chapter hash>-<content hash>" value is used both as the
        # HTML id and as the DB record id, so build it once.
        element_id = '%s-%s' % (chapter_hash, hsh)
        element.set('id', element_id)

        # create the commentable element in the DB
        e = Element()
        e.id = element_id
        e.chapter = chapter_hash
        e.title = chapter_title
        # NOTE(review): save() is what persists a Django model instance;
        # it is added here because the comment above says the element is
        # created in the DB -- confirm against the Element model.
        e.save()

print(etree.tostring(root)) # pipe to a file if you wish