web/hgbook/converter.py
changeset 0 8083d21c0020
equal deleted inserted replaced
-1:000000000000 0:8083d21c0020
       
     1 from lxml import etree
       
     2 from lxml import html
       
     3 from lxml.cssselect import CSSSelector
       
     4 import md5
       
     5 import sys
       
     6 
       
     7 
       
     8 args = sys.argv[1:]
       
     9 
       
    10 # django stuff
       
    11 from django.core.management import setup_environ
       
    12 import settings # Assumed to be in the same directory.
       
    13 setup_environ(settings)       # ugly django collateral effects :(
       
    14 from comments.models import Element
       
    15 
       
    16 doc_id = 'MMSC'
       
    17 sel = CSSSelector('div.chapter p, pre, h1, table.equation')
       
    18 chapter_sel = CSSSelector('div.chapter')
       
    19 
       
    20 try:
       
    21     filename = args[0]
       
    22 except IndexError:
       
    23     raise IndexError("Usage: %s <path-to-html-file>" % __file__)
       
    24 
       
    25 tree = etree.parse(filename, html.HTMLParser(remove_blank_text=True))
       
    26 root = tree.getroot()
       
    27 
       
    28 chapter = chapter_sel(root)[0]
       
    29 chapter_title = chapter.get('id').split(':')[1]
       
    30 chapter_hash = md5.new(chapter.get('id').encode('utf8')).hexdigest()
       
    31 
       
    32 chapter.set('id', chapter_hash)
       
    33 
       
    34 for element in sel(root):
       
    35     hsh_source = element.text or element.get('alt') or etree.tostring(element)
       
    36 
       
    37     if hsh_source:
       
    38         hsh_source_encoded = hsh_source.encode('utf8')
       
    39         hsh = md5.new(hsh_source_encoded).hexdigest()
       
    40         element.set('id', '%s-%s' % (chapter_hash, hsh))
       
    41     
       
    42         # create the commentable element in the DB
       
    43         e = Element()
       
    44         e.id = '%s-%s' % (chapter_hash, hsh)
       
    45         e.chapter = chapter_hash
       
    46         e.title = chapter_title
       
    47         e.save()
       
    48 
       
    49 
       
    50 
       
    51 print etree.tostring(root)      # pipe to a file if you wish
       
    52