# No.51 Perl Sample (code key - 67)
# Programs: confusion matrix (checks how closely two annotators' tag
#           assignments for the same text agree)
# Tomonori Nagano
# Last Update: January 18, 2008
#
# This file is encoded in Unicode (UTF-8). If you see gibberish characters,
# please re-encode the file in utf-8.
#
# Usage: perl <this-script> <first-tagged-file> <second-tagged-file>
#
# Both inputs are whitespace-separated "word/tag" tokens.  The first file
# supplies the rows of the matrix (human annotation) and the second file
# supplies the columns (tagger output).  Two tables are printed to STDOUT:
# the raw tag frequencies, and each cell as a percentage of how often the
# row tag occurs in the first file.
#
use strict;      # restrict global variables
use warnings;

# Read one tagged file and return its whitespace-separated tokens in order.
# Dies with "Cannot open ..." when the file cannot be opened.
sub _read_tokens {
    my ($path) = @_;

    open(my $fh, '<', $path) or die "Cannot open $path: $!\n";

    my @tokens;
    while (my $line = <$fh>) {
        # Split line by line: gluing chomped lines together would fuse the
        # last token of one line onto the first token of the next.
        push @tokens, split(' ', $line);
    }
    close($fh);

    return @tokens;
}

# Split a "word/tag" token and return (word, tag).
# The LAST "/" is the delimiter, so the word itself may contain slashes.
# $position (1-based) and $label are only used in the error message.
# Dies when the token has no word/tag shape.
sub _split_token {
    my ($token, $position, $label) = @_;

    if ($token =~ m{\A (.+) / (.+) \z}xs) {
        return ($1, $2);
    }
    die "Malformed token #$position in $label: '$token' (expected word/tag)\n";
}

# Compare two parallel token lists and build the confusion data.
# Takes two array references (first corpus, second corpus).
# Returns a hash reference with:
#   tags      - sorted list of every tag seen in either corpus
#   row_total - how often each tag occurs in the first corpus
#   confusion - {first tag}{second tag} => number of tokens tagged that way
# Dies when the token counts differ, when a token is malformed, or when the
# words at the same position differ (the corpora are then not parallel).
sub _tally_tags {
    my ($first, $second) = @_;

    # the source corpora may not be the same
    if (@{$first} != @{$second}) {
        die "Token counts for the two files don't match!\n";
    }

    my (%seen, %row_total, %confusion);
    for my $i (0 .. $#{$first}) {
        my ($first_word, $first_tag)
            = _split_token($first->[$i], $i + 1, 'the first file');
        my ($second_word, $second_tag)
            = _split_token($second->[$i], $i + 1, 'the second file');

        # probably, it failed to chunk words
        if ($first_word ne $second_word) {
            die "Token mismatch: $first_word != $second_word\n";
        }

        $seen{$first_tag}++;
        $seen{$second_tag}++;
        $row_total{$first_tag}++;

        # (first tag/human - row), (second tag/tagger - column)
        $confusion{$first_tag}{$second_tag}++;
    }

    return {
        tags      => [ sort keys %seen ],
        row_total => \%row_total,
        confusion => \%confusion,
    };
}

# Return $count as a whole percentage of $total, truncated toward zero.
# Multiplying before dividing keeps exact ratios exact: 29/100*100 is
# 28.999... in floating point and would be truncated to 28.
sub _percentage {
    my ($count, $total) = @_;
    return int($count * 100 / $total);
}

# Render one table as a string.
#   $title      - text shown inside the banner line
#   $tags       - array reference of tags, used for both rows and columns
#   $cell_value - code reference called as ($row_tag, $col_tag); returns the
#                 number to show, or undef for an empty cell (shown as a dot)
# Every column is five characters wide so labels and cells line up.
sub _format_table {
    my ($title, $tags, $cell_value) = @_;

    my $out = "\n\n********************* $title ***********************\n\n";

    $out .= ' ' x 4;                                # blank at (0,0)
    $out .= sprintf(' %4s', $_) for @{$tags};       # column labels
    $out .= "\n";

    for my $row_tag (@{$tags}) {
        $out .= sprintf('%4s ', $row_tag);          # row label
        for my $col_tag (@{$tags}) {
            my $value = $cell_value->($row_tag, $col_tag);
            $out .= defined $value
                ? sprintf('%4d ', $value)
                : sprintf('%4s ', '.');             # a dot for an empty cell
        }
        $out .= "\n\n";                             # 2 line breaks per row
    }

    return $out;
}

# Entry point: expects exactly two file paths, prints both tables to STDOUT.
# Prints a usage line to STDERR and exits with status 1 on a wrong argument
# count; any other problem is reported through die.
sub main {
    my @files = @_;

    if (@files != 2) {
        print {*STDERR} "usage: $0 <first-tagged-file> <second-tagged-file>\n";
        exit 1;
    }

    my @first  = _read_tokens($files[0]);
    my @second = _read_tokens($files[1]);

    my $tally = _tally_tags(\@first, \@second);
    my ($tags, $row_total, $confusion)
        = @{$tally}{qw(tags row_total confusion)};

    # Look the cell up without autovivifying rows that were never seen.
    my $count_of = sub {
        my ($row_tag, $col_tag) = @_;
        my $cells = $confusion->{$row_tag} or return;
        return $cells->{$col_tag};
    };

    print _format_table('Tag Frequency Table', $tags, $count_of);

    # The table title calls this precision; each cell is its frequency
    # divided by the total frequency of the row (first-file) tag.
    print _format_table('Tag Precision Table', $tags, sub {
        my ($row_tag, $col_tag) = @_;
        my $count = $count_of->($row_tag, $col_tag);
        return defined $count
            ? _percentage($count, $row_total->{$row_tag})
            : undef;
    });

    return;
}

# Run only when executed as a script, so the helpers can be loaded and
# exercised on their own.
main(@ARGV) unless caller;