diff -r 000000000000 -r 8083d21c0020 web/hgbook/converter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/web/hgbook/converter.py Mon Jan 25 18:56:45 2010 +0530 @@ -0,0 +1,52 @@ +from lxml import etree +from lxml import html +from lxml.cssselect import CSSSelector +import md5 +import sys + + +args = sys.argv[1:] + +# django stuff +from django.core.management import setup_environ +import settings # Assumed to be in the same directory. +setup_environ(settings) # ugly django collateral effects :( +from comments.models import Element + +doc_id = 'MMSC' +sel = CSSSelector('div.chapter p, pre, h1, table.equation') +chapter_sel = CSSSelector('div.chapter') + +try: + filename = args[0] +except IndexError: + raise IndexError("Usage: %s " % __file__) + +tree = etree.parse(filename, html.HTMLParser(remove_blank_text=True)) +root = tree.getroot() + +chapter = chapter_sel(root)[0] +chapter_title = chapter.get('id').split(':')[1] +chapter_hash = md5.new(chapter.get('id').encode('utf8')).hexdigest() + +chapter.set('id', chapter_hash) + +for element in sel(root): + hsh_source = element.text or element.get('alt') or etree.tostring(element) + + if hsh_source: + hsh_source_encoded = hsh_source.encode('utf8') + hsh = md5.new(hsh_source_encoded).hexdigest() + element.set('id', '%s-%s' % (chapter_hash, hsh)) + + # create the commentable element in the DB + e = Element() + e.id = '%s-%s' % (chapter_hash, hsh) + e.chapter = chapter_hash + e.title = chapter_title + e.save() + + + +print etree.tostring(root) # pipe to a file if you wish +