|
import hashlib
import md5
import sys

from lxml import etree
from lxml import html
from lxml.cssselect import CSSSelector
|
6 |
|
7 |
|
# Command-line arguments without the script name itself.
args = sys.argv[1:]

# Document identifier (not referenced in the visible part of this script).
doc_id = 'MMSC'

# Selectors are plain lxml objects and do not depend on Django, so they are
# built before the Django bootstrap below.
# chapter_sel: the chapter container(s).
# sel: the elements that receive a generated id.
# NOTE(review): commas separate independent selectors, so only `p` is limited
# to descendants of div.chapter; `pre`, `h1` and `table.equation` match
# anywhere in the document -- confirm this is intended.
chapter_sel = CSSSelector('div.chapter')
sel = CSSSelector('div.chapter p, pre, h1, table.equation')

# Django bootstrap. The statement order is kept exactly as it was: the
# settings module is handed to setup_environ() before the model is imported
# (presumably Django must be configured first -- the original comment calls
# this "ugly django collateral effects").
from django.core.management import setup_environ
import settings  # Assumed to be in the same directory.
setup_environ(settings)
from comments.models import Element
|
19 |
|
# The first positional argument is the path of the HTML file to process.
# Without it there is nothing to do: sys.exit() with a string prints the
# usage line to stderr and exits with status 1, instead of showing the user
# an IndexError traceback.
if not args:
    sys.exit("Usage: %s <path-to-html-file>" % __file__)
filename = args[0]
|
24 |
|
# Parse the input with lxml's HTML parser, dropping whitespace-only text
# nodes, and keep the root element for selection and final serialization.
tree = etree.parse(filename, html.HTMLParser(remove_blank_text=True))
root = tree.getroot()

# Only the first div.chapter of the document is used.
chapters = chapter_sel(root)
if not chapters:
    sys.exit("%s: no div.chapter element found" % filename)
chapter = chapters[0]

# The title is the second ':'-separated field of the chapter's id attribute.
# A missing id, or one without ':' (for example a file whose id was already
# replaced by a hash on a previous run), is reported explicitly instead of
# failing with an AttributeError/IndexError.
chapter_id = chapter.get('id')
if chapter_id is None or ':' not in chapter_id:
    sys.exit("%s: chapter id %r is not of the form '<prefix>:<title>'"
             % (filename, chapter_id))
chapter_title = chapter_id.split(':')[1]

# hashlib.md5 yields the same hex digest as the deprecated md5.new().
chapter_hash = hashlib.md5(chapter_id.encode('utf8')).hexdigest()

# The original id is replaced by its hash in the output document.
chapter.set('id', chapter_hash)
|
33 |
|
# Give every selected element an id of the form "<chapter_hash>-<md5>" and
# store a matching Element row for it.
for element in sel(root):
    # Hash source, in order of preference: the element's leading text, its
    # 'alt' attribute, and finally its serialized markup.
    hsh_source = element.text or element.get('alt') or etree.tostring(element)

    if not hsh_source:
        continue

    # hashlib.md5 yields the same hex digest as the deprecated md5.new().
    hsh = hashlib.md5(hsh_source.encode('utf8')).hexdigest()

    # Build the id once so the document and the database cannot diverge.
    # NOTE(review): the id depends only on the chapter hash and the hash
    # source, so two elements with the same source get the same id --
    # confirm that duplicates are acceptable here.
    element_id = '%s-%s' % (chapter_hash, hsh)
    element.set('id', element_id)

    # create the commentable element in the DB
    e = Element()
    e.id = element_id
    e.chapter = chapter_hash
    e.title = chapter_title
    e.save()
|
48 |
|
49 |
|
50 |
|
# Write the rewritten document to stdout; redirect it to a file if you wish.
# The parenthesised single-argument form prints exactly what the Python 2
# print statement did.
document_markup = etree.tostring(root)
print(document_markup)
|
52 |