web/hgbook/converter.py
author amit@thunder
Mon, 25 Jan 2010 18:56:45 +0530
changeset 0 8083d21c0020
permissions -rwxr-xr-x
The first commit of all the required files for the review app
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
0
8083d21c0020 The first commit of all the required files for the review app
amit@thunder
parents:
diff changeset
     1
import hashlib
import md5
import sys

from lxml import etree
from lxml import html
from lxml.cssselect import CSSSelector
8083d21c0020 The first commit of all the required files for the review app
amit@thunder
parents:
diff changeset
     6
8083d21c0020 The first commit of all the required files for the review app
amit@thunder
parents:
diff changeset
     7
8083d21c0020 The first commit of all the required files for the review app
amit@thunder
parents:
diff changeset
     8
# Command-line arguments with the script name (argv[0]) dropped.
args = sys.argv[1:]

# Django bootstrap. Keep this order: the environment is configured from
# ``settings`` first, and only then is the model module imported
# (presumably the model import depends on a configured environment).
from django.core.management import setup_environ
import settings  # expected to live in the same directory as this script
setup_environ(settings)
from comments.models import Element

# Document identifier; not referenced anywhere else in this script.
doc_id = 'MMSC'

# Elements that receive a generated id. Per CSS grouping rules only ``p`` is
# restricted to descendants of ``div.chapter``; ``pre``, ``h1`` and
# ``table.equation`` match anywhere in the document.
sel = CSSSelector('div.chapter p, pre, h1, table.equation')
# Locates the chapter container(s); only the first match is used later on.
chapter_sel = CSSSelector('div.chapter')
8083d21c0020 The first commit of all the required files for the review app
amit@thunder
parents:
diff changeset
    19
8083d21c0020 The first commit of all the required files for the review app
amit@thunder
parents:
diff changeset
    20
# The HTML file to convert is the first positional argument; without it the
# script aborts with an IndexError carrying the usage message.
if not args:
    raise IndexError("Usage: %s <path-to-html-file>" % __file__)
filename = args[0]
8083d21c0020 The first commit of all the required files for the review app
amit@thunder
parents:
diff changeset
    24
8083d21c0020 The first commit of all the required files for the review app
amit@thunder
parents:
diff changeset
    25
# Parse the input with lxml's HTML parser, dropping whitespace-only text nodes.
tree = etree.parse(filename, html.HTMLParser(remove_blank_text=True))
root = tree.getroot()

# Only the first div.chapter is processed. An IndexError is raised here if the
# document contains none.
chapter = chapter_sel(root)[0]

# The original id is read once and is expected to contain a ':'; the part
# after the first ':' becomes the chapter title. A missing id raises
# AttributeError, an id without ':' raises IndexError.
chapter_source_id = chapter.get('id')
chapter_title = chapter_source_id.split(':')[1]

# hashlib.md5 replaces the deprecated ``md5`` module and yields the same
# hex digest for the same input.
chapter_hash = hashlib.md5(chapter_source_id.encode('utf8')).hexdigest()

# Replace the chapter's id with the MD5 hex digest of its original id.
chapter.set('id', chapter_hash)
8083d21c0020 The first commit of all the required files for the review app
amit@thunder
parents:
diff changeset
    33
8083d21c0020 The first commit of all the required files for the review app
amit@thunder
parents:
diff changeset
    34
# Give every selected element an id of the form <chapter hash>-<content hash>
# and store a matching Element row so that it can be commented on.
for element in sel(root):
    # Hash the element's leading text, falling back to its ``alt`` attribute
    # and finally to its serialized markup. The last fallback is never empty,
    # so every selected element gets an id.
    hsh_source = element.text or element.get('alt') or etree.tostring(element)
    hsh_source_encoded = hsh_source.encode('utf8')

    # hashlib.md5 replaces the deprecated ``md5`` module and yields the same
    # hex digest for the same input.
    hsh = hashlib.md5(hsh_source_encoded).hexdigest()

    # NOTE(review): two elements with the same hash source get the same id,
    # and therefore the same Element primary key -- confirm this is intended.
    element_id = '%s-%s' % (chapter_hash, hsh)
    element.set('id', element_id)

    # create the commentable element in the DB
    e = Element()
    e.id = element_id
    e.chapter = chapter_hash
    e.title = chapter_title
    e.save()
8083d21c0020 The first commit of all the required files for the review app
amit@thunder
parents:
diff changeset
    48
8083d21c0020 The first commit of all the required files for the review app
amit@thunder
parents:
diff changeset
    49
8083d21c0020 The first commit of all the required files for the review app
amit@thunder
parents:
diff changeset
    50
8083d21c0020 The first commit of all the required files for the review app
amit@thunder
parents:
diff changeset
    51
# Write the rewritten document to stdout; redirect it to a file if desired.
print(etree.tostring(root))
8083d21c0020 The first commit of all the required files for the review app
amit@thunder
parents:
diff changeset
    52