|
1 #!/usr/bin/env python |
|
2 # |
|
3 # Add unique ID attributes to para tags. This script should only be |
|
4 # run by one person, since otherwise it introduces the possibility of |
|
5 # chaotic conflicts among tags. |
|
6 |
|
7 import glob, os, re, sys |
|
8 |
|
# Matches <para ...> tags that already carry a legacy ID of the form
# id="x_<lowercase hex>"; group 1 is the hex part.
tagged = re.compile('<para[^>]* id="x_([0-9a-f]+)"[^>]*>', re.M)
# Matches bare <para> tags, i.e. ones with no attributes at all.
untagged = re.compile('<para>')

names = glob.glob('ch*.docbook')

# Module-level state shared with retag() and the second pass below.
chapter = None          # chapter prefix used by retag() when building IDs
seen = set()            # numeric values of every legacy x_ ID met so far
errs = 0                # number of duplicate legacy IDs; used as exit status
beginning = "p_list= "  # prefix of the text written to p_list.py

# Opened (and truncated) up front; the second pass writes the ID table
# into it at the very end of the script.
id_file = open('p_list.py', 'w')
dictionary = {}         # chapter name -> list of IDs added to that chapter
id_list = []            # IDs generated by retag() for the current chapter

# First pass: find the highest-numbered paragraph ID and report
# duplicated legacy IDs.
#
# biggest_id must exist before the comparison below; without this
# initialisation the first legacy ID found raised NameError.
biggest_id = 0
for name in names:
    # Read through a context manager so the handle is closed promptly.
    with open(name) as src:
        text = src.read()
    for m in tagged.finditer(text):
        i = int(m.group(1), 16)
        if i in seen:
            # sys.stderr.write behaves the same on Python 2 and 3
            # (the old "print >>" form only works on Python 2).
            sys.stderr.write('%s: duplication of ID %s\n' % (name, i))
            errs += 1
        seen.add(i)
        if i > biggest_id:
            biggest_id = i
|
33 |
|
34 |
|
def retag(s):
    """Return a replacement <para> tag carrying a freshly allocated ID.

    Intended as the replacement callback for ``untagged.sub``; ``s`` is
    the match object handed over by ``re.sub`` and is not inspected.

    Each call advances the module-level ``biggest_id`` counter by one,
    builds an ID of the form ``<chapter>_<counter in lowercase hex>``
    from the module-level ``chapter``, and appends that ID to the
    module-level ``id_list`` before returning the new tag text.
    """
    global biggest_id

    # Allocate the next counter value and publish it back to the module.
    next_value = biggest_id + 1
    biggest_id = next_value

    fresh_id = "%s_%x" % (chapter, next_value)
    id_list.append(fresh_id)

    return '<para id="%s">' % fresh_id
|
44 |
|
# Second pass: add IDs to paragraphs that currently lack them.
#
# For every chapter file, each bare <para> tag is rewritten by retag()
# into <para id="<chapter>_<hex>">.  The IDs added to each chapter are
# collected in `dictionary`, which is finally dumped to p_list.py.

for name in names:
    # The chapter prefix is the file name up to its first dot.
    chapter = name.split('.')[0]
    id_list = []

    # Read through a context manager so the handle is always closed.
    with open(name) as src:
        original_text = src.read()

    # Start numbering above any <chapter>_<hex> para ID already present
    # in this file.  Restarting from 0 on every run would hand out IDs
    # that collide with the ones written by an earlier run.
    existing_ids = re.findall(
        '<para[^>]* id="%s_([0-9a-f]+)"' % re.escape(chapter),
        original_text)
    biggest_id = max([int(h, 16) for h in existing_ids] or [0])

    retagged_text = untagged.sub(retag, original_text)
    dictionary[chapter] = id_list

    if retagged_text != original_text:
        # Write to a temporary file first and rename it over the
        # original, so the chapter is never left half-written.
        tmpname = name + '.tmp'
        with open(tmpname, 'w') as fp:
            fp.write(retagged_text)
        os.rename(tmpname, name)

# Emit the table of newly added IDs as "p_list= {...}".
p_lists_string = beginning + str(dictionary)
id_file.write(p_lists_string)
# Close explicitly so the data is flushed before the interpreter exits.
id_file.close()

# The exit status is the number of duplicate IDs found in the first pass.
sys.exit(errs)