SEESenv/scripts/autoid.py
author amit@thunder
Sun, 28 Feb 2010 16:22:19 +0530
changeset 33 bc535262231d
parent 31 06a02dd3966f
permissions -rw-r--r--
Solving some issues, especially with p_list.py

#!/usr/bin/env python
#
# Add unique ID attributes to para tags.  This script should only be
# run by one person, since otherwise it introduces the possibility of
# chaotic conflicts among tags.

import glob, os, re, sys

# Matches <para ...> tags that already carry an id of the form x_<hex digits>.
tagged = re.compile('<para[^>]* id="x_([0-9a-f]+)"[^>]*>', re.M)
# Matches bare <para> tags with no attributes at all.
untagged = re.compile('<para>')

script_folder = '/home/hg/repos/SEES-hacks/temp/'
names = glob.glob(script_folder + 'ch*.docbook')

# Module-level state shared with retag() and the second pass.
chapter = None          # rebound per file by the second pass, read by retag()
seen = set()            # numeric IDs already encountered in the first pass
errs = 0                # duplicate-ID count; used as the exit status
beginning = "p_list= "  # prefix of the text written to p_list.py
# Must exist before the first comparison below; previously it was never
# initialised, so the first tagged paragraph raised NameError.
biggest_id = 0

id_file = open(script_folder + 'p_list.py', 'w')
dictionary = {}
id_list = []

# First pass: find the highest-numbered paragraph ID and report duplicates.
for name in names:
    # Read through a context manager so the handle is closed promptly.
    with open(name) as docbook:
        text = docbook.read()
    for m in tagged.finditer(text):
        i = int(m.group(1), 16)
        if i in seen:
            # sys.stderr.write emits the same line as the old print statement
            # without depending on Python 2 print syntax.
            sys.stderr.write('%s: duplication of ID %s\n' % (name, i))
            errs += 1
        seen.add(i)
        if i > biggest_id:
            biggest_id = i


def retag(s):
    """Return a <para> tag carrying a freshly allocated ID.

    Serves as the replacement callback for untagged.sub(); the match
    object ``s`` is not inspected.  Every call increments the module-level
    counter ``biggest_id``, builds an ID from the last '/'-separated
    component of the module-level ``chapter`` plus the counter in hex,
    appends that ID to the module-level ``id_list`` and returns the tag.
    """
    global biggest_id
    biggest_id = biggest_id + 1

    # Keep only what follows the last '/' of the chapter path.
    base = chapter.rpartition('/')[2]
    id_name = "%s_%x" % (base, biggest_id)
    id_list.append(id_name)

    return '<para id="' + id_name + '">'

# Second pass: add IDs to paragraphs that currently lack them, and collect
# the IDs generated for each chapter.

for name in names:
    # Numbering restarts for every file; retag() increments this counter.
    biggest_id = 0
    # Everything before the first '.' in the path; retag() keeps only the
    # part after the last '/' when it builds an ID.
    chapter = name.split('.')[0]
    # Fresh list for this chapter; retag() appends each new ID to it.
    id_list = []
    with open(name) as source:
        f = source.read()
    f1 = untagged.sub(retag, f)
    dictionary[chapter.split('/')[-1]] = id_list
    if f1 != f:
        # Write to a temporary file and rename it over the original so the
        # chapter file is replaced in a single step.
        tmpname = name + '.tmp'
        with open(tmpname, 'w') as fp:
            fp.write(f1)
        os.rename(tmpname, name)

# Emit the chapter -> ID-list mapping as a Python assignment.
p_lists_string = beginning + str(dictionary)
id_file.write(p_lists_string)
# Close explicitly so the data is flushed before the process exits.
id_file.close()
# Exit status is the number of duplicate IDs found in the first pass.
sys.exit(errs)