SEESenv/web/hgbook/converter.py
author amit@thunder
Mon, 01 Mar 2010 15:23:42 +0530
changeset 40 ef147a79b098
parent 2 52d12eb31c30
permissions -rwxr-xr-x
Added the name changes required so that chapter names beyond 10 appear properly in the list.

from lxml import etree
from lxml import html
from lxml.cssselect import CSSSelector
import md5
import sys


# Command-line arguments without the script name; args[0] is read further
# down as the path of the HTML file to convert.
args = sys.argv[1:]

# django stuff
# The order of the next four statements matters: setup_environ(settings) is
# called before the model import.  Presumably importing comments.models needs
# the environment that call configures -- keep this order.
from django.core.management import setup_environ
import settings # Assumed to be in the same directory.
setup_environ(settings)       # ugly django collateral effects :(
from comments.models import Element

# Document identifier.  Not referenced again in the part of this script
# shown here.
doc_id = 'MMSC'
# Selects every element that is given a comment id further down.
# NOTE(review): only the `p` term is scoped to div.chapter; `pre`, `h1` and
# `table.equation` are matched anywhere in the document -- confirm intended.
sel = CSSSelector('div.chapter p, pre, h1, table.equation')
# Selects chapter containers; only the first match is used below.
chapter_sel = CSSSelector('div.chapter')

# The only positional argument is the path of the HTML file to convert.
# A missing argument is a usage error rather than a programming error, so
# exit with the usage text on stderr (non-zero status) instead of surfacing
# an IndexError traceback.
try:
    filename = args[0]
except IndexError:
    sys.exit("Usage: %s <path-to-html-file>" % __file__)

# Parse the input with lxml's HTML parser (blank text nodes removed).
tree = etree.parse(filename, html.HTMLParser(remove_blank_text=True))
root = tree.getroot()

# Only the first div.chapter is treated as the chapter of this file.  Fail
# with a readable message when there is none, rather than a bare IndexError.
chapters = chapter_sel(root)
if not chapters:
    sys.exit("%s: no div.chapter element found" % filename)
chapter = chapters[0]

# The title is taken from the part of the chapter's original id that follows
# the first ':'.  A missing id or an id without ':' cannot be converted, so
# report it explicitly instead of failing on None.split / list indexing.
chapter_id = chapter.get('id')
if chapter_id is None or ':' not in chapter_id:
    sys.exit("%s: chapter id %r is not of the form '<prefix>:<title>'"
             % (filename, chapter_id))
chapter_title = chapter_id.split(':')[1]
# The MD5 hex digest of the original id replaces it, both on the element and
# as the prefix of every per-element id generated later.
chapter_hash = md5.new(chapter_id.encode('utf8')).hexdigest()

chapter.set('id', chapter_hash)

# Give every selected element an id of the form
# "<chapter hash>-<content hash>" and save an Element row carrying the same
# id, so that comments can be attached to it.
# NOTE(review): two elements with identical hash input receive the same id.
for node in sel(root):
    # Hash input, in order of preference: the element's text, its 'alt'
    # attribute, then its serialised markup.
    hash_input = node.text
    if not hash_input:
        hash_input = node.get('alt')
    if not hash_input:
        hash_input = etree.tostring(node)
    if not hash_input:
        continue

    digest = md5.new(hash_input.encode('utf8')).hexdigest()
    element_id = '%s-%s' % (chapter_hash, digest)
    node.set('id', element_id)

    # create the commentable element in the DB
    record = Element()
    record.id = element_id
    record.chapter = chapter_hash
    record.title = chapter_title
    record.save()



# Emit the document, now carrying the generated ids, on standard output.
print etree.tostring(root)      # pipe to a file if you wish