afterTag

# afterTag.prl
# cajoles output from treeTagger into proper TEI XML
#
# Counter initialised here; it is not referenced anywhere else in the
# visible part of this script.
$sno = 1;

# Main loop: read the tagger output from the files named on the command
# line (or from standard input) and accumulate the converted text in $buffer.
# A line containing "<" is treated as markup and copied through without a
# trailing newline; any other line is treated as a token line and converted
# by _token_element().
while (<>) {
    # chomp rather than chop: chop would eat a real character when the last
    # line of the input has no trailing newline.
    chomp;

    # NOTE(review): the pattern of this substitution was stripped from the
    # published listing (only "s//XXXX/" survived).  "<unknown>", the
    # placeholder TreeTagger prints for an unknown lemma, is the presumed
    # original -- it has to be neutralised before the "<" test below, or
    # such token lines would be mistaken for markup.  Confirm against the
    # original script.
    s/<unknown>/XXXX/;

    if (/</) {    # tag found
        $buffer .= $_;
    }
    else {        # token found
        $buffer .= _token_element($_);
    }
}    # end of file

# Convert one tab-separated token line into a single element, returned as a
# string ending in a newline.
#
#   $line  - the line, already stripped of its newline; it is split on tabs
#            into the token text, the POS tag and the lemma (missing fields
#            are treated as empty strings)
#
# The element is <w> for words and <pc> for punctuation.  The POS tag goes
# into the "type" attribute; a non-empty lemma goes into "lemma".
sub _token_element {
    my ($line) = @_;

    my ($str, $pos, $hw) = split /\t/, $line;
    for my $field ($str, $pos, $hw) {
        $field = '' unless defined $field;
    }

    my $tag = 'w';

    # A literal "$" in the tag (as in PP$) is replaced by "s".
    $pos =~ s/\$/s/;

    # A tag without two consecutive capitals is taken to be a punctuation
    # tag: drop the lemma and map the tag onto a short class name.
    unless ($pos =~ /[A-Z][A-Z]/) {
        $hw  = '';
        $tag = 'pc';
        $pos = 'sub' if $pos =~ /[\,\:\;]/;
        $pos = 'op'  if $pos =~ /[\(\[\{]/;
        $pos = 'cp'  if $pos =~ /[\)\]\}]/;
        $pos = 'oq'  if $pos =~ /[\`]/;
        $pos = 'cq'  if $pos =~ /[\"\']/;
    }

    # Sentence-final punctuation, and anything whose lemma contains a square
    # bracket, is punctuation too and carries no lemma.
    if ($pos =~ /SENT/ or $hw =~ /[\[\]]/) {
        $tag = 'pc';
        $hw  = '';
    }

    # Escape ampersands in the token text for XML; in the lemma an ampersand
    # is spelled out as "and".
    $str =~ s/&/&amp;/g;
    $hw  =~ s/&/and/g;

    # Compare against the empty string so that a lemma of "0" is kept.
    my $lemma = $hw ne '' ? qq{ lemma="$hw"} : '';

    return qq{<$tag type="$pos"$lemma>$str</$tag>\n};
}

# Post-processing: wrap the content of every <head>, <p> and <ab> element in
# an <s> element, then print the result.
#
# NOTE(review): the published listing lost everything that looked like an
# HTML tag, so the opening-tag patterns and the "<s>" replacements in
# _mark_sentences() are reconstructed from what survived (the tails of the
# patterns and the matching closing-tag substitutions).  Confirm against the
# original script.
$buffer = '' unless defined $buffer;    # no input at all: print nothing
$buffer = _mark_sentences($buffer);
print $buffer;

# Return a copy of $text in which each <head>, <p> and <ab> start tag is
# followed by <s> and each corresponding end tag is preceded by </s>.
#
#   $text  - the accumulated XML text
sub _mark_sentences {
    my ($text) = @_;

    # \b after the element name keeps these patterns from matching longer
    # names that merely start the same way -- in particular the <pc ...>
    # elements this script itself generates, which "<p([^>]+)>" would match.
    $text =~ s{<head\b([^>]*)>}{<head$1><s>}g;
    $text =~ s{</head>}{</s></head>}g;
    $text =~ s{<p\b([^>]+)>}{<p$1><s>}g;
    $text =~ s{<p>}{<p><s>}g;

    # A further substitution was commented out at this point in the
    # original; its pattern and replacement did not survive extraction.

    $text =~ s{</p>}{</s></p>}g;
    $text =~ s{<ab\b([^>]+)>}{<ab$1><s>}g;
    $text =~ s{</ab>}{</s></ab>}g;

    # FIXME: the last substitution of the original is truncated.  All that
    # survives is the start of its pattern,
    #     s/(type="SENT"[^\/]+\/pc>\n)<\/s>\n
    # with no replacement and no closing delimiter, so it cannot be restored
    # without the original script.  It is left out rather than guessed at.

    return $text;
}

# Mark a sentence as open by setting the global $sentenceStarted flag to 1.
# Nothing is written to the output: the only print statement is disabled.
# Returns the value assigned to the flag.
sub sentenceStart {
    # print "";    (disabled in the original as well)
    return ($sentenceStarted = 1);
}

# Close the current sentence unless %tagStash still records an open element.
#
# Every key of the global %tagStash whose value is greater than zero is
# reported with an XML comment on standard output, and the sentence state is
# then left untouched.  When no such key exists, a newline is printed and the
# globals $sentenceStarted and $sentenceEnded are both reset to 0.
sub sentenceStop {
    my $notYet;
    foreach my $openTag (keys %tagStash) {
        # Numeric comparison.  The published listing had the HTML entity
        # "&gt;" in place of ">", which is not valid Perl.
        if ($tagStash{$openTag} > 0) {
            print"<!-- .. still got an open $openTag -->";
            $notYet = "1";
        }
    }

    unless ($notYet) {
        #    print "<\/s>\n";
        print "\n";
        $sentenceStarted = $sentenceEnded = 0;
    }
}
Posted in | 1 Comment

One Response to “afterTag”

  1. [...] in a nice clean TEI conformant version, but somehow it’s always quicker to just run an after-the-event perl script to tidy up its output. Which gave me a bunch of files that contained lines like this <div [...]