#!/usr/bin/perl # # add_ids.pl # # Eugene Eric Kim # http://www.eekim.com/software/purple/ # # $Id: add_ids.pl,v 1.10 2001/08/10 00:45:40 eekim Exp $ # # Copyright (c) Eugene Eric Kim 2000-2001. All rights reserved. # See COPYING for licensing terms. =head1 NAME add_ids.pl - Add statement IDs and hierarchical addresses to an XML file. =head1 SYNOPSIS Usage: add_ids.pl [-s] [-r rules.purple] file.xml =head1 DESCRIPTION Parses the XML file, adds NIDs to nodes that do not already have one, and computes and adds hierarchical addresses to all nodes using a configurable set of rules. Saves the original XML file to file.xml~. If a rules file is not specified, defaults to rules for purple.dtd. This script is not very robust. It does not validate the XML file, and it does not handle errors well. So be careful. =cut use lib '/home/eekim/perl'; use strict; use IO::File; use File::Copy; use Getopt::Long; use Purple::HID; use XML::DOM; my $rules_file; my $on_nid_only = 0; GetOptions('s!'=>\$on_nid_only, 'r=s'=>\$rules_file); my $xml_file = $ARGV[0]; if (!$xml_file) { print <parsefile("$xml_file~"); ### element rules # default values for purple.dtd my $tag_contentroot = 'purple'; my $tag_lastnid = 'lastnid'; my %tags_with_nids = ('h'=>1,'p'=>1,'item'=>1,'example'=>1,'figure'=>1); my %start_tags_recurse = ($tag_contentroot=>1,'section'=>1, 'list'=>1); my %start_tags_spawn = ('section'=>1, 'list'=>1); my %start_tags_mark_spawn; my %start_tags_kill_mark_spawn = ('h'=>1); my %start_tags_mark_incr = ('p'=>1,'item'=>1,'example'=>1,'figure'=>1); my %end_tags_kill_incr = ('section'=>1, 'list'=>1); # override default rules with values in rules file, if it exists if (-e $rules_file) { my $fh = new IO::File $rules_file; if (defined $fh) { undef $tag_contentroot; undef $tag_lastnid; undef %tags_with_nids; undef %start_tags_recurse; undef %start_tags_spawn; undef %start_tags_mark_spawn; undef %start_tags_kill_mark_spawn; undef %start_tags_mark_incr; undef %end_tags_kill_incr; while (my $line = <$fh>) { chomp $line; if ($line =~ /^([A-Z_]+)=([A-Za-z,]+)$/) { my $var_name = $1; my $var_value = $2; if ($var_name eq 'CONTENT_ROOT') { $tag_contentroot = $var_value; } elsif ($var_name eq 'LAST_NID') { $tag_lastnid = $var_value; } elsif ($var_name eq 'TAGS_WITH_NIDS') { %tags_with_nids = map { $_ => 1 } split(',', $var_value); } elsif ($var_name eq 'ON_START_TAGS_RECURSE') { %start_tags_recurse = map { $_ => 1 } split(',', $var_value); } elsif ($var_name eq 'ON_START_TAGS_SPAWN') { %start_tags_spawn = map { $_ => 1 } split(',', $var_value); } elsif ($var_name eq 'ON_START_TAGS_MARK_SPAWN') { %start_tags_mark_spawn = map { $_ => 1 } split(',', $var_value); } elsif ($var_name eq 'ON_START_TAGS_KILL_MARK_SPAWN') { %start_tags_kill_mark_spawn = map { $_ => 1 } split(',', $var_value); } elsif ($var_name eq 'ON_START_TAGS_MARK_INCREMENT') { %start_tags_mark_incr = map { $_ => 1 } split(',', $var_value); } elsif ($var_name eq 'ON_END_TAGS_KILL_INCREMENT') { %end_tags_kill_incr = map { $_ => 1 } split(',', $var_value); } } } $start_tags_recurse{$tag_contentroot} = 1; } $fh->close; } ### find largest NID my $current_nid = 0; my $last_nid = 0; my @nodes; my $node; foreach my $tag (keys %tags_with_nids) { $current_nid = &find_largest_nid($tag, $current_nid); } $current_nid++; if ($tag_lastnid) { @nodes = ($doc->getElementsByTagName($tag_lastnid))[0]->getChildNodes; $last_nid = $nodes[0]->getData if (@nodes); if ($last_nid > $current_nid) { $current_nid = $last_nid + 1; } } ### walk tree and add NIDs and HIDs (hierarchical addresss) my $hid = new Purple::HID unless ($on_nid_only); &traverse(($doc->getElementsByTagName($tag_contentroot))[0]); ### update lastnid element if ($tag_lastnid) { $node = ($doc->getElementsByTagName($tag_lastnid))[0]; @nodes = $node->getChildNodes; if (@nodes) { $nodes[0]->setData($current_nid - 1); } else { # add lastnid element $node->insertBefore($doc->createTextNode($current_nid - 1), undef); } } ### print to file $doc->printToFile("$xml_file"); # fini ### subroutines sub find_largest_nid { my $element_name = shift; my ($current_nid) = @_; my @nodes = $doc->getElementsByTagName($element_name); foreach my $node (@nodes) { my $nid = $node->getAttribute('nid'); if ($nid > $current_nid) { $current_nid = $nid; } } return $current_nid; } sub traverse { my $node = shift; my $nodename = $node->getNodeName; if ($start_tags_recurse{$nodename}) { &update_ids_start_tags($node); foreach my $child ($node->getChildNodes) { &traverse($child); } &update_ids_endtags($nodename); } elsif ($tags_with_nids{$nodename}) { &update_ids_start_tags($node); } } sub update_ids_start_tags { my $node = shift; my $nodename = $node->getNodeName; if ($start_tags_spawn{$nodename}) { $hid->spawn unless ($on_nid_only); } elsif ($start_tags_mark_spawn{$nodename}) { &add_ids($node); $hid->spawn unless ($on_nid_only); } elsif ($start_tags_kill_mark_spawn{$nodename}) { $hid->kill unless ($on_nid_only); &add_ids($node); $hid->spawn unless ($on_nid_only); } elsif ($start_tags_mark_incr{$nodename}) { &add_ids($node); $hid->incr unless ($on_nid_only); } } sub update_ids_endtags { my $nodename = shift; if ($end_tags_kill_incr{$nodename}) { $hid->kill unless ($on_nid_only); $hid->incr unless ($on_nid_only); } } sub add_ids { my $node = shift; my $nid = $node->getAttribute('nid'); if (!$nid) { $node->setAttribute('nid', "0$current_nid"); $current_nid++; } $node->setAttribute('hid', $hid->string) unless ($on_nid_only); } sub my_tag_compression { my ($tag, $elem) = @_; return 1 if ($tag =~ /^p$/); return 0; } =head1 RULES-BASED ID GENERATION add_ids.pl uses a depth-first style algorithm for traversing the DOM tree and computing and adding the appropriate IDs. You can define files containing rules that add_ids.pl will use to determine when and how to increment the hierarchical ID (HID). The rules are defined as follows: RULE_NAME=tag1,tag2,tag3 where RULE_NAME is the name of the rule, and tag1, tag2, tag3... are the elements to which the rule applies. There are nine rules: =head2 CONTENT_ROOT The top-level element. add_ids.pl starts parsing with this element. Normally, this is the document root element, although in some cases, you can specify a lower-level element to reduce the amount of parsing. For example, if you were using add_ids.pl on XHTML, you could specify as the CONTENT_ROOT, but it is better to specify , because all of the elements that will obtain IDs are contained within the elements. =head2 LAST_NID If the XML vocabulary reserves an element to store the last NID used in the lifespan of the document, list that here. This is an optional rule. If it is not defined, add_ids.pl uses the largest available NID currently in the document. =head2 TAGS_WITH_NIDS List every tag that will contain an ID in this rule. =head2 ON_START_TAGS_RECURSE When add_ids.pl sees tags from this rule, it will recurse one level and examine the children elements as wel. =head2 ON_START_TAGS_SPAWN On seeing tags defined here, spawn a new level of the HID. For instance, if: ON_START_TAGS_SPAWN=list and if the HID is 2C when add_ids.pl comes across a element, add_ids.pl will change the HID to 2C1. =head2 ON_START_TAGS_MARK_SPAWN On seeing tags defined here, sets the NID and HID attributes, and spawns a new level of the HID. =head2 ON_START_TAGS_KILL_MARK_SPAWN Kills the current level of the HID, sets the NID and HID attributes, and spawns that level of the HID again. =head2 ON_START_TAGS_MARK_INCREMENT Sets the NID and HID attributes of the tags defined here, and increments the current HID. =head2 ON_END_TAGS_KILL_INCREMENT On seeing the end tag of the elements listed here, kills the current level of the HID, and then increments it. For instance, if: ON_END_TAGS_KILL_INCREMENT=list and if the HID is 2C1 when the program sees , add_ids.pl will change the HID to 2D. =head1 SAMPLE RULES FILE Here is the default rules file used for purple.dtd: # rules.purple CONTENT_ROOT=purple LAST_NID=lastnid TAGS_WITH_NIDS=h,p,item,example,figure ON_START_TAGS_RECURSE=section,list ON_START_TAGS_SPAWN=section,list ON_START_TAGS_MARK_SPAWN= ON_START_TAGS_KILL_MARK_SPAWN=h ON_START_TAGS_MARK_INCREMENT=p,item,example,figure ON_END_TAGS_KILL_INCREMENT=section,list =head1 AUTHOR Eugene Eric Kim =cut