- #!/usr/local/bin/perl
- #
- # comp-mark -- compares two tagger output files
- # and then marks the lines where smalltag got it wrong
- #
- # Alex Krotov (alexk@dcs.shef.ac.uk)
- #
- #
- # usage: comp-mark file.stag file.bnc file.out
- # file.stag is the smalltag output,
- # and file.bnc is the correct tagging from the BNC (prepared by make-bnc)
- # file.out is the output file; diagnostics are printed to stdout
- #
- #
# ---------------------------------------------------------------------
# Main program.
#
# Reads the smalltag output (first argument) and the reference BNC
# tagging (second argument) in step, compares the tags of lines whose
# words agree, and writes every processed line of the first file to the
# output file (third argument).  A line whose tag is judged wrong gets
# the reference tag appended; a line whose word cannot be found in the
# reference file is marked as having no word match.  Summary counts are
# printed to stdout at the end.
#
# All variables are package globals, as everywhere else in this script,
# so that the helper subroutines keep seeing the same names.
# ---------------------------------------------------------------------

# Push-back buffers: lines already read from a file that must be served
# again before anything new is read from it.
@lines1 = ();
@lines2 = ();

# Maximum number of extra lines of the second file that are scanned when
# the current words of the two files disagree.
$maxsearchdepth = 5;

# Counters.
$words     = 0;    # lines of the first file that were processed
$noword    = 0;    # ... of which no matching word was found in file 2
$t_wrong   = 0;    # tag pairs judged different
$t_equal   = 0;    # comparetags returned "equal"
$t_eq      = 0;    # comparetags returned "eq"
$t_eq_verb = 0;    # comparetags returned "eq-verb"
$t_eq_aux  = 0;    # comparetags returned "eq-aux"
$t_eq_x    = 0;    # comparetags returned "eq-x"

die "usage: comp-mark file.stag file.bnc file.out\n" unless @ARGV == 3;
($file_stag, $file_bnc, $file_out) = @ARGV;

open(FILE1, '<', $file_stag)
    or die "comp-mark: cannot read $file_stag: $!\n";
open(FILE2, '<', $file_bnc)
    or die "comp-mark: cannot read $file_bnc: $!\n";
open(OUTFILE, '>', $file_out)
    or die "comp-mark: cannot write $file_out: $!\n";

# 'defined' rather than plain truth, so that a final line consisting of
# just "0" does not end the loop early.
while (defined($line1 = readline1())) {
    # Pushed-back lines of the second file are drained right here, so the
    # look-ahead below works regardless of how readline2 treats @lines2.
    $line2 = @lines2 ? shift(@lines2) : readline2();
    last unless defined $line2;

    $words++;
    chomp($line1);
    $out_line1 = $line1;    # what will be written for this line

    $word1 = getword($line1);
    $word2 = getword($line2);

    $found = ($word1 eq $word2);
    if (!$found) {
        # The words disagree: scan a few lines ahead in the second file
        # for the current word of the first file, remembering everything
        # that is read so it can be put back if the search fails.
        @tmplines2 = ($line2);
        for $i (1 .. $maxsearchdepth) {
            $ahead = @lines2 ? shift(@lines2) : readline2();
            last unless defined $ahead;         # second file exhausted
            push(@tmplines2, $ahead);
            if (getword($ahead) eq $word1) {
                # Resynchronised.  The lines skipped on the way are extra
                # material in the second file and are dropped.
                $line2 = $ahead;
                $found = 1;
                last;
            }
        }
    }

    if ($found) {
        $tag1  = gettag($line1);
        $tag2  = gettag($line2);
        $match = comparetags($tag1, $tag2);
        if ($match eq "equal") {
            $t_equal++;
        } elsif ($match eq "eq") {
            $t_eq++;
        } elsif ($match eq "eq-verb") {
            $t_eq_verb++;
        } elsif ($match eq "eq-aux") {
            $t_eq_aux++;
        } elsif ($match eq "eq-x") {
            $t_eq_x++;
        } else {
            $out_line1 = $out_line1 . " ------ $tag2 ------";
            $t_wrong++;
        }
    } else {
        # The word of the first file does not occur nearby in the second
        # file: put back everything that was read ahead (including the
        # line we started with) and skip just this word.
        unshift(@lines2, @tmplines2);
        $noword++;
        $out_line1 = $line1 . " .... no word match .... ";
    }

    print OUTFILE "$out_line1\n";
}

# Then print out the results.  Note that "eq-x" pairs are counted neither
# as good nor as wrong.
$good_tags = $t_equal + $t_eq + $t_eq_verb + $t_eq_aux;
$all_tags  = $good_tags + $t_wrong;

print "words:          $words\n";
print "no word match:  $noword\n";
print "tags equal:     $t_equal\n";
print "tags eq:        $t_eq\n";
print "tags eq-verb:   $t_eq_verb\n";
print "tags eq-aux:    $t_eq_aux\n";
print "tags eq-x:      $t_eq_x\n";
print "tags wrong:     $t_wrong\n";
print "good tags:      $good_tags of $all_tags\n";
printf("accuracy:       %.2f%%\n", 100 * $good_tags / $all_tags) if $all_tags;

close(FILE1);
close(FILE2);
# Buffered write errors only surface when the handle is closed.
close(OUTFILE) or die "comp-mark: error writing $file_out: $!\n";
# getword($line) -- extract the canonical form of the word on one
# tagger output line.
#
# The line is expected to contain "<w TAG>" followed by the word, which
# runs to the end of the line.  Every character other than an ASCII
# letter, a digit or an apostrophe is removed, the result is lower-cased,
# and the clitic "n't" is replaced by "not".
#
# Returns the canonical word, or "----" when $line is undefined or does
# not contain a "<w TAG>" marker.
sub getword {
    # Work on the argument: the caller passes lines from two different
    # files, so a shared global would silently compare a line with itself.
    my ($line) = @_;

    return "----" unless defined $line && $line =~ /<w \w+>(.+)$/;

    my $word = $1;
    $word =~ s/[^A-Za-z0-9']//g;    # drop punctuation, blanks, markup
    $word =~ tr/A-Z/a-z/;           # canonicalize to lower case
    $word = "not" if $word eq "n't";

    return $word;
}
# gettag($line) -- extract the tag from one tagger output line.
#
# Returns the TAG part of the first "<w TAG>" marker on the line, or
# "----" when $line is undefined or contains no such marker.
sub gettag {
    # Use the argument rather than a global, so that the tags of the two
    # input files are really taken from their own lines.
    my ($line) = @_;

    return $1 if defined $line && $line =~ /<w (\w+)>/;
    return "----";
}
# comparetags($tag1, $tag2) -- classify how well two tags agree.
#
# "AUX" is treated as a synonym of "VERB_MODAL" before comparing.
# Returns one of:
#   "equal"    the tags are identical
#   "eq"       one tag is contained in the other (e.g. VERB / VERB_PAST)
#   "eq-verb"  both tags contain "VERB"
#   "eq-x"     one of the tags is "X"
#   ""         none of the above, i.e. the tags disagree
#
# "eq-aux" is never returned: once AUX has been folded into VERB_MODAL,
# every AUX/VERB pair is already classified as "equal", "eq" or
# "eq-verb".
sub comparetags {
    # Private copies: the caller's variables must keep the tags exactly
    # as they appear in the input files.
    my ($tag1, $tag2) = @_;

    for my $tag ($tag1, $tag2) {
        $tag = "" unless defined $tag;
        $tag = "VERB_MODAL" if $tag eq "AUX";
    }

    return "equal" if $tag1 eq $tag2;

    # Literal containment test.  index() is used instead of interpolating
    # a tag into a pattern, so that a tag is never read as a regular
    # expression and an empty tag cannot trigger Perl's "empty pattern
    # repeats the last successful match" rule.
    if (length($tag1) && length($tag2)
        && (index($tag1, $tag2) >= 0 || index($tag2, $tag1) >= 0)) {
        return "eq";
    }

    return "eq-verb" if $tag1 =~ /VERB/ && $tag2 =~ /VERB/;
    return "eq-x"    if $tag1 eq "X" || $tag2 eq "X";
    return "";
}
# readline1() -- return the next line of the first input file.
#
# Lines waiting in the push-back buffer @lines1 are served first, oldest
# first, and removed from the buffer; only when the buffer is empty is a
# fresh line read from FILE1.  Returns undef at end of file.
sub readline1 {
    # Actually consume the buffered line; otherwise the buffer would never
    # empty and a stale line would be returned forever.
    return shift(@lines1) if @lines1;

    # 'scalar' guarantees a single line even if called in list context.
    return scalar(<FILE1>);
}
# readline2() -- return the next line of the second input file.
#
# Lines waiting in the push-back buffer @lines2 are served first, oldest
# first, and removed from the buffer; only when the buffer is empty is a
# fresh line read from FILE2.  Returns undef at end of file.
sub readline2 {
    # Actually consume the buffered line; otherwise the buffer would never
    # empty and a stale line would be returned forever.
    return shift(@lines2) if @lines2;

    # 'scalar' guarantees a single line even if called in list context.
    return scalar(<FILE2>);
}
Raw Paste