PERL   62
comp-mark
Guest on 25th August 2022 01:49:21 PM


  1. #!/usr/local/bin/perl
  2.  
  3. #
  4. #  comp-mark -- compares two tagger output files
  5. #    and then marks the lines where smalltag got it wrong
  6. #
  7. #    Alex Krotov (alexk@dcs.shef.ac.uk)
  8. #
  9. #
  10. #   usage:   comp-mark file.stag file.bnc file.out
  11. #     file.stag is the smalltag  output,
  12. #       and file.bnc is the correct tagging from the BNC (prepared by make-bnc)
  13. #    file.out is the output file, diagnostics is printed to the stdout
  14. #
  15. #
  16.  
  17. open (FILE1, $ARGV[0]) || die "error";
  18.  
  19. open (FILE2, $ARGV[1]) || die "error";
  20.  
  21. open (OUTFILE, ">".$ARGV[2]) || die "error";
  22.  
  23.  
  24. # stores the list of lines already read
  25.  
  26. @lines1 = ();
  27. @lines2 = ();
  28.  
  29. # maximum search depth
  30.  
  31. $maxsearchdepth = 5;
  32.  
  33. # and, finally the counters
  34.  
  35. $words = 0;
  36. $noword = 0;
  37. $t_wrong = 0;
  38. $t_equal = 0;
  39. $t_eq = 0;
  40. $t_eq_verb = 0;
  41. $t_eq_aux = 0;
  42. $t_eq_x = 0;
  43.  
  44. while (($line1 = &readline1)&&
  45.        ($line2 = &readline2)) {
  46.  
  47.     $words++;
  48.  
  49.  
  50.    # $line2 = &readline2;
  51.  
  52.     $out_line1 = $line1;  
  53.    #   this is $line1 ready for output, and it is already
  54.    #   chopped
  55.      
  56.  
  57.    $word1 = &getword($line1);
  58.    $word2 = &getword($line2);
  59.  
  60.    if ($word1 eq $word2) {
  61.       # the simplest case -- word matched
  62.  
  63.        $tag1 = &gettag($line1);
  64.        $tag2 = &gettag($line2);
  65.  
  66.        $match = &comparetags($tag1, $tag2);
  67.        if ($match eq "equal") {
  68.            $t_equal++;
  69.            #print "match\n";
  70.        } elsif ($match eq "eq") {
  71.            $t_eq++;
  72.            #print "match\n";
  73.        } elsif ($match eq "eq-verb") {
  74.            $t_eq_verb++;
  75.        } elsif ($match eq "eq-aux") {
  76.            $t_eq_aux++;
  77.        } elsif ($match eq "eq-x") {
  78.            $t_eq_x++;
  79.        } else {
  80.            $out_line1 = $out_line1 . "    ------ $tag2 ------";
  81.            $t_wrong++;
  82.        }
  83.  
  84.        print OUTFILE $out_line1, "\n";
  85.  
  86.        next;
  87.        # and continue on with the loop
  88.    }        
  89.  
  90.     #print "no word match: $word1   $word2\n";
  91.  
  92.  
  93.    # if the word didn't match, find a match
  94.  
  95.    # first, search in the second file
  96.    #
  97.     @tmplines2 = ($line2);
  98.     for ($i = 0; $i < $maxsearchdepth; $i++) {
  99.         $line2 = &readline2;
  100.         $word2 = &getword($line2);
  101.         if ($word1 eq $word2) {
  102.             # print "matched $word1\n";
  103.             last;
  104.         }
  105.         push(@tmplines2, $line2);
  106.     }
  107.     if ($word1 eq $word2) {
  108.         unshift(@lines1, $line1);
  109.         unshift(@lines2, $line2);
  110.         $words--;
  111.         # and restart the loop
  112.         # print "restarting the loop\n";
  113.  
  114.         #print "@lines1 \n @lines2 \n";
  115.  
  116.         next;
  117.  
  118.     }
  119.  
  120.    # otherwise, search in the first file
  121.    #   first, update @lines2
  122.    #   and skip the current word incrementing the counter
  123.  
  124.     unshift(@lines2, @tmplines2);
  125.     #print "@lines2 \n";
  126.     $noword++;
  127.     #print "skipping $word1\n";
  128.     #  instead, change $out_line1
  129.  
  130.     $out_line1 = $line1 . " .... no word match .... ";
  131.         print OUTFILE $out_line1, "\n";
  132.  
  133.  
  134. }
  135.    
  136.  
  137. # then print out the results
  138.  
  139.  
  140. print "words: $words  \n";
  141. print "no word match: $noword \n";
  142. print "good tag: $t_equal + $t_eq + $t_eq_verb + $t_eq_aux = ",
  143.      $t_equal+$t_eq+$t_eq_verb+$t_eq_aux, "\n";
  144. print "wrong tag: $t_wrong \n";
  145. print "X-tags: $t_eq_x\n";
  146.  
  147. $good_tags = $t_equal + $t_eq + $t_eq_verb + $t_eq_aux;
  148. $all_tags = $good_tags + $t_wrong;
  149.  
  150. $percentage = int ( 100 * $good_tags / $all_tags);
  151.  
  152. print "percentage of good tags: $percentage% \n";  
  153.  
  154.  
  155. close FILE1;
  156. close FILE2;
  157. close OUTFILE;
  158.  
  159. sub getword {
  160.    local ($line) = @_;
  161.  
  162.    if ($line =~ /<w \w+>(.+)$/) {
  163.      
  164.        $word = $1;
  165.        #print $word;
  166.        $word =~ s/[^A-Za-z0-9']//g;  
  167.        # replace all non-letters, digits etc. with nil
  168.  
  169.        # and canonicalize to the lower case
  170.        $word =~ tr/A-Z/a-z/;
  171.  
  172.        # replace n't with not
  173.  
  174.        if ($word eq "n't") {
  175.            $word = "not";
  176.        }
  177.  
  178.        #print $word;                
  179.  
  180.    } else {
  181.        $word = "----";
  182.      
  183.    }
  184.    $retval = $word;
  185.    
  186. }
  187.  
  188. sub gettag {
  189.    local ($line) = @_;
  190.  
  191.    if ($line =~ /<w (\w+)>/) {
  192.        $retval = $1;
  193.    } else {
  194.        $retval = "----";
  195.    }
  196.    
  197. }
  198.  
  199. sub comparetags {
  200.    local ($tag1, $tag2) = @_;
  201.  
  202.    $tag1 = "VERB_MODAL" if ($tag1 eq "AUX");
  203.    $tag2 = "VERB_MODAL" if ($tag2 eq "AUX");
  204.  
  205.    if ($tag1 eq $tag2) {
  206.        $retval = "equal";
  207.    }
  208.    elsif (($tag1 =~ /$tag2/)||($tag2 =~ /$tag1/)) {
  209.        $retval = "eq";
  210.    }
  211.    elsif (($tag1 =~ /VERB/)&&($tag2 =~ /VERB/)) {
  212.        $retval = "eq-verb";
  213.    }
  214.    elsif ((($tag1 eq "AUX") || ($tag1 =~ /VERB/))
  215.          && (($tag2 eq "AUX") || ($tag2 =~ /VERB/))) {
  216.        $retval = "eq-aux";
  217.    }
  218.    elsif (($tag1 eq "X")||($tag2 eq "X")) {
  219.        $retval = "eq-x";
  220.    }
  221.    else {
  222.        $retval = "";
  223.    }
  224.  
  225. }
  226.  
  227.  
  228. sub readline1 {
  229.  
  230.     local ($line);
  231.  
  232.     if (@lines1) {
  233.  
  234.         $line = shift(@lines1);
  235.  
  236.     } else {
  237.         $line = <FILE1>;
  238.         chop $line;
  239.     }
  240.    
  241.     $retval = $line;
  242.  
  243. }
  244.  
  245. sub readline2 {
  246.  
  247.     local ($line);
  248.  
  249.     if (@lines2) {
  250.        
  251.         $line = shift(@lines2);
  252.  
  253.     } else {
  254.         $line = <FILE2>;
  255.         chop $line;
  256.     }
  257.     $retval = $line;
  258.  
  259. }

Raw Paste

Login or Register to edit or fork this paste. It's free.