author | amit@thunder |
Thu, 25 Feb 2010 18:53:51 +0530 | |
changeset 30 | f66b0a5ebf40 |
parent 2 | 52d12eb31c30 |
permissions | -rwxr-xr-x |
0
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
1 |
from lxml import etree |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
2 |
from lxml import html |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
3 |
from lxml.cssselect import CSSSelector |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
4 |
import md5 |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
5 |
import sys |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
6 |
|
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
7 |
|
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
8 |
args = sys.argv[1:] |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
9 |
|
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
10 |
# django stuff |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
11 |
from django.core.management import setup_environ |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
12 |
import settings # Assumed to be in the same directory. |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
13 |
setup_environ(settings) # ugly django collateral effects :( |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
14 |
from comments.models import Element |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
15 |
|
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
16 |
doc_id = 'MMSC' |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
17 |
sel = CSSSelector('div.chapter p, pre, h1, table.equation') |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
18 |
chapter_sel = CSSSelector('div.chapter') |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
19 |
|
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
20 |
try: |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
21 |
filename = args[0] |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
22 |
except IndexError: |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
23 |
raise IndexError("Usage: %s <path-to-html-file>" % __file__) |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
24 |
|
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
25 |
tree = etree.parse(filename, html.HTMLParser(remove_blank_text=True)) |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
26 |
root = tree.getroot() |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
27 |
|
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
28 |
chapter = chapter_sel(root)[0] |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
29 |
chapter_title = chapter.get('id').split(':')[1] |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
30 |
chapter_hash = md5.new(chapter.get('id').encode('utf8')).hexdigest() |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
31 |
|
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
32 |
chapter.set('id', chapter_hash) |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
33 |
|
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
34 |
for element in sel(root): |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
35 |
hsh_source = element.text or element.get('alt') or etree.tostring(element) |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
36 |
|
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
37 |
if hsh_source: |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
38 |
hsh_source_encoded = hsh_source.encode('utf8') |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
39 |
hsh = md5.new(hsh_source_encoded).hexdigest() |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
40 |
element.set('id', '%s-%s' % (chapter_hash, hsh)) |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
41 |
|
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
42 |
# create the commentable element in the DB |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
43 |
e = Element() |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
44 |
e.id = '%s-%s' % (chapter_hash, hsh) |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
45 |
e.chapter = chapter_hash |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
46 |
e.title = chapter_title |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
47 |
e.save() |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
48 |
|
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
49 |
|
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
50 |
|
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
51 |
print etree.tostring(root) # pipe to a file if you wish |
8083d21c0020
The first commit of all the required files for the review app
amit@thunder
parents:
diff
changeset
|
52 |