# No.52 Perl Sample (code key - 48)
# Programs: count words with using JACET8000
#           (count the number of words at each level using JACET8000)
# Tomonori Nagano
# Last Update: January 18, 2008
#
# This file is encoded in Unicode (UTF-8). If you see gibberish characters,
# please re-encode the file in utf-8.
#
# This script takes English files and computes the frequencies of each level
# of JACET 8000 entries
#
# Require: Prof. Yasumasa Someya's "e_lemma.txt" file
#        : JACET8000.txt
#
# Usage: perl jacetSeenSort.pl jacetFile.txt e_lemma.txt targetFile(s)
#
# Output (STDOUT): for every level, the word types seen in the target files
# with their token counts, followed by a per-level summary table.  Word types
# that have no entry in the JACET hash are reported under level 9.

use strict ;
use warnings ;

# define global variables
my ($nTokens,$nTypes,$nLemmanized) = (0,0,0) ;   # start at 0 so the summary
                                                 # prints numbers on empty input
my (%jacet,       # word type       -> JACET level
    %lemma,       # inflected form  -> lemma
    %jacetSeen,   # level           -> array ref of word types seen
    %seen) ;      # word type       -> number of tokens seen

# Error if fewer than three arguments are given
# (JACET file, lemma file, and at least one target file)
if (@ARGV < 3) {
    die "usage: perl jacetSeenSort.pl jacetFile.txt e_lemma.txt targetFile(s)\n" ;
}

my $jacetFile  = shift @ARGV ;   # the first argument is JACET8000.txt
my $eLemmaFile = shift @ARGV ;   # the second argument is e_lemma.txt
my @files      = @ARGV ;         # everything else is a target file

# then go on to process JACET8000 file.
# each line is expected to be: type <TAB> pos <TAB> rank
open (my $jacetFh, '<', $jacetFile) or die "cannot open the file: $!" ;
while (my $line = <$jacetFh>) {
    $line =~ s/\r?\n\z// ;       # strip the line ending (LF or CRLF)
    my ($type,$pos,$rank) = split (/\t/,$line) ;
    # skip blank/malformed lines instead of silently filing them under level 1
    next unless (defined $rank && $rank =~ /^\s*(\d+)/) ;
    # the level of JACET8000 is computed from mere ranks
    # NOTE(review): a rank that is an exact multiple of 1000 falls into the
    # NEXT level (1000 -> 2, 8000 -> 9, which is also the "not in JACET"
    # bucket below).  If the ranks in the file are 1-based this should
    # probably be ($rank-1)/1000 + 1 -- confirm against the data file.
    my $level = sprintf("%d",$1/1000 +1) ;
    $jacet{$type} = $level ;     # make a JACET hash
}
close ($jacetFh) ;

# create a lemma hash
# each line is expected to be: lemma -> form1,form2,...
open (my $lemmaFh, '<', $eLemmaFile) or die "cannot open the file: $!" ;
while (my $line = <$lemmaFh>) {
    $line =~ s/\r?\n\z// ;
    next if ($line =~ /^\[/) ;   # lines starting with "[" are not entries
    my ($lemmaTemp,$lexemesTemp) = split (/ -> /,$line) ;
    next unless (defined $lexemesTemp) ;   # no " -> " on this line
    foreach my $key (split (/,/,$lexemesTemp)) {
        $lemma{$key} = $lemmaTemp ;
    }
}
close ($lemmaFh) ;

# process the input file(s)
foreach my $file (@files) {
    next if ($file eq "." || $file eq ".." || $file eq ".DS_Store") ;
    open (my $inFh, '<', $file) or die "Couldn't open the file: $!\n" ;
    while (my $line = <$inFh>) {
        $line =~ tr/A-Z/a-z/ ;                       # normalize to lower case
        # a negative definition of punctuation: anything that is not a hyphen,
        # space, letter or digit is set off by spaces and so becomes a token
        # of its own
        $line =~ s/([^\- a-zA-Z0-9])/ $1 /g ;
        foreach my $rawWord (split (' ',$line)) {
            $nTokens++ ;                             # increase total num
            my $word = $rawWord ;
            if (exists $lemma{$rawWord}) {           # lemmanizing
                $word = $lemma{$rawWord} ;
                $nLemmanized++ ;
            }
            # first sighting of a type bumps the type count; every sighting
            # bumps that type's token count
            $nTypes++ unless ($seen{$word}++) ;
        }
    }
    close ($inFh) ;
}

# sort the input file(s) by JACET levels
# (types without a JACET entry go to level 9)
foreach my $key (keys %seen) {
    my $level = defined $jacet{$key} ? $jacet{$key} : '9' ;
    push (@{$jacetSeen{$level}},$key) ;
}

# print each JACET level entry and summary
my (@levelTokens,@levelType) ;
foreach my $key (sort { $a <=> $b } keys %jacetSeen) {
    print "Level $key\n" ;
    foreach my $subKey (sort @{$jacetSeen{$key}}) {
        print "\t$subKey\t$seen{$subKey}\n" ;
        $levelTokens[$key] += $seen{$subKey} ;
        $levelType[$key]++ ;
    }
    print "\n" ;
}

print <<"EOF" ;
********************************************************************
SUMMARY
********************************************************************
EOF

print "total tokens: $nTokens\n" ;
print "total types: $nTypes\n" ;
print "total lemmanized words: $nLemmanized\n\n" ;

printf "%5s %8s %8s %12s %8s %8s",
    qw(level type token type/token type% token%) ;
print "\n" ;
for my $key (1..9) {
    next unless ($levelTokens[$key]) ;   # nothing seen at this level
    # NOTE(review): the "type%" and "token%" columns are printed as fractions
    # of the totals (0.25), not multiplied by 100.
    printf "%5s %8.0f %8.0f %12.2f %8.2f %8.2f",
        $key,
        $levelType[$key],
        $levelTokens[$key],
        $levelType[$key]/$levelTokens[$key],
        $levelType[$key]/$nTypes,
        $levelTokens[$key]/$nTokens ;
    print "\n" ;
}