#! /usr/local/bin/perl
# BASED ON:
# $Header: /srv/home/sbauman/bin/RCS/wwp-store_sgml2xml.perl,v 1.6 2003-04-18 08:12:27-04 sbauman Exp $
#
# Copyright © 1999, 2003 Syd Bauman and Brown University
# Women Writers Project. Some rights reserved. For complete
# copyleft notice, see block comment at the end of this file.
#
# wwp-store_sgml2xml_for_real.perl
#
# usage
# -----
# $PROGRAM_NAME < input.sgml > output.xml
#
# input  - plain text read from STDIN; should be a single SGML entity
#          conforming to wwp-store.dtd (i.e., should start with a DOCTYPE
#          declaration and end with the end-tag of the root element).
# output - a copy of the input, written to STDOUT, that (hopefully) is
#          valid as XML.
#
# This routine is intended to work only on WWP textbase files. It may
# work on some other SGML files, but it's not likely.
#
# known bugs
# ----- ----
# Will mess up the internal subset of files that have one (everything
# from the DOCTYPE keyword up to the first ">" is discarded).
#
# known limitations
# ----- -----------
# The input file must be valid with NAMECASE GENERAL NO (i.e., valid
# in a completely case-sensitive environment) in order to get valid XML.
# A ">" or "<" inside an attribute value will confuse the tag matching.
#
# An RCS-maintained change log is at the end of this file (just
# before copyright info).
#

use strict;
use warnings;

use File::Basename;
use English qw( -no_match_vars );    # $ARG, $PROGRAM_NAME; match vars not needed
use HTTP::Date qw( time2iso time2isoz );

#
# set flags, constants, etc.
#
my ( $myname, $mypath, $mysuffix ) = fileparse( $PROGRAM_NAME, '\.[^.]*' );

# pattern to match TEI syntax name
my $teiName = "[A-Za-z][A-Za-z0-9.-]{0,31}";

# A complete start-tag on one record: "<", a name, optional attribute
# text, and the closing ">".  $1 is the GI, $2 the rest of the tag.
my $tag_re = qr/<($teiName)(\s+[^>]*)?>/;

# flag: are we inside an empty-element tag that began on an earlier
# record and whose ">" (which needs a "/" inserted before it) has not
# been seen yet?
my $do_initial_tagc = 0;

# Time stamp; nothing below uses it, but it is kept because the text of
# the declarations may have referred to it.
my $time = time2isoz(time);

# NOTE(review): the original text of these two literals was lost (every
# "<...>" span had been stripped from this file).  The XML declaration is
# restored in its minimal form; add an encoding pseudo-attribute if the
# textbase files are not UTF-8/ASCII.
my $xml_decl = "<?xml version=\"1.0\"?>\n";

# TODO(review): restore the DOCTYPE declaration (with its SYSTEM
# identifier) that should replace the one found in the input.  While this
# is empty, the input's DOCTYPE declaration is simply dropped.
my $xml_doctype_decl = "";

# GIs of the elements that are declared EMPTY, and so need "/>" in XML.
my %is_empty = (
    "milestone" => 1,
    "pb"        => 1,
    "lb"        => 1,
    "cb"        => 1,
    "anchor"    => 1,
    "ptr"       => 1,
    "addSpan"   => 1,
    "delSpan"   => 1,
    "hand"      => 1,
    "gap"       => 1,
    "step"      => 1,
    "state"     => 1,
    "move"      => 1,
    "link"      => 1,
    "xptr"      => 1,
    "join"      => 1,
    "alt"       => 1,
    "handShift" => 1,
);

# Rebuild one start-tag from its GI and the remainder of the tag (the
# attribute text, possibly undefined), using "/>" iff the element is an
# empty one.  Returns the new tag as a string.
sub rebuild_tag {
    my ( $gi, $rest ) = @_;
    $rest = "" unless defined $rest;    # tag had no attributes
    my $tagc = $is_empty{$gi} ? "/>" : ">";
    return "<$gi$rest$tagc";
}

print $xml_decl;

#
# Now read through STDIN, writing to STDOUT
#
while ( <STDIN> ) {

    #
    # when we hit the doctype declaration, skip it and
    # stick our own in instead
    #
    if ( m/<!DOCTYPE/ ) {
        print $xml_doctype_decl;
        # Discard records up to and including the one holding the first
        # ">"; stop at end of input rather than looping forever on a
        # truncated file.
        while ( defined $ARG and $ARG !~ m/>/ ) {
            $ARG = <STDIN>;
        }
        next;
    }

    # not the doctype declaration

    #
    # Check to see if an earlier record ended with the beginning of
    # an empty tag that is continued on this record.
    #
    if ( $do_initial_tagc ) {
        #
        # Yup, insert a slash before 1st TAGC (">").  If this record
        # holds neither ">" nor "<" the tag is still open, so leave the
        # flag set and try again on the next record (this is what lets
        # tags that span three or more records work).
        #
        my $closed = s{^([^<>]*)>}{$1/>};
        if ( $closed or m/</ ) {
            $do_initial_tagc = 0;    # done (unless set again, below)
        }
    }

    #
    # foreach string that looks like a tag (i.e. starts with a "<"
    # followed by a name, and ends with a ">"), rewrite it in place,
    # using a "/" before the ">" iff it is an empty element ...
    #
    s/$tag_re/rebuild_tag( $1, $2 )/ge;

    #
    # ... and print out the rebuilt record.
    #
    print $ARG;

    #
    # If this record ends with what looks like the start of a tag
    # (i.e., a "<" followed by a name optionally followed by stuff
    # that is *not* ">"), set a flag iff it's an empty tag, so when
    # we hit the next record we'll remember to insert a slash before
    # the ">".  (The match cannot reach back across a ">", so only
    # the text after the last complete tag is ever considered.)
    #
    if ( m/<($teiName)(\s+[^>]*)?$/ and $is_empty{$1} ) {
        $do_initial_tagc = 1;
    }
}

exit();

# -----------------------------------------------------
#
# Changed to wwp-store_sgml2xml_for_real.perl in order to
# actually migrate instances to XML, rather than generate
# temporary XML versions of SGML sources.
#
# $Log: wwp-store_sgml2xml.perl,v $
# Revision 1.6 2003-04-18 08:12:27-04 sbauman
# Ported from mama to cushing.
#
# Revision 1.5 2002-02-07 17:55:04-05 syd
# Add code to catch an empty tag that starts on one record and
# ends on the next (we still don't catch one that spans 3 or more
# input lines).
# Improve commenting.
#
# Revision 1.4 1999-08-18 15:00:37-04 syd
# Changed SYSTEM idientifier on DOCTYPE declaration
# to match name change of system file.
#
# Revision 1.3 1999-05-23 18:21:53-04 syd
# Added XML Declaration to output
#
# Revision 1.2 1999-05-17 11:30:58-04 syd
# Code written, seems to work
#
# -----------------------------------------------------
# Copyright © 1999, 2003 Syd Bauman and Brown
# University Women Writers Project.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.
# 675 Mass Ave
# Cambridge, MA 02139
# USA
# gnu@prep.ai.mit.edu
#
# Syd Bauman, sgml textbase programmer/analyst
# Brown University Women Writers Project
# Box 1841
# Providence, RI 02912-1841
# 401-863-3835
# Syd_Bauman@Brown.edu
#