# No.49 Perl Sample (code key - 50)
# Programs: compare words in two files
#           (compare the frequency of occurrence of words between two files)
# Tomonori Nagano
# Last Update: January 18, 2008
#
# This file is encoded in Unicode (UTF-8). If you see gibberish characters,
# please re-encode the file in utf-8.
#
# Usage: perl <this script> FIRST_FILE SECOND_FILE
#
# Prints the token and type counts of both files, followed by a table that
# lists every word seen in either file together with its count in each file.

use strict;
use warnings;

# count_words($fh)
#
# Reads lines from the open filehandle $fh until end of file and tallies
# its words.
#
# Returns a two-element list:
#   - the number of tokens (words, numbers and punctuation marks alike);
#   - a reference to a hash mapping each lower-cased word to its count.
#     Only tokens made up solely of ASCII letters and hyphens are words.
sub count_words {
    my ($fh) = @_;

    my $n_tokens = 0;
    my %freq_of;

    while ( my $line = <$fh> ) {
        chomp $line;
        $line =~ tr/A-Z/a-z/;    # normalize to lower case

        # A negative definition of punctuation: every character that is not
        # a hyphen, a space, a letter or a digit is padded with spaces so
        # that it becomes a token of its own.
        $line =~ s/([^\- a-zA-Z0-9])/ $1 /g;

        for my $token ( split ' ', $line ) {
            $n_tokens++;

            # Numbers and punctuation count as tokens but not as words.
            next if $token =~ /[^\- a-zA-Z]/;

            $freq_of{$token}++;
        }
    }

    return ( $n_tokens, \%freq_of );
}

# count_words_in_file($path)
#
# Opens $path for reading, tallies its words with count_words() and closes
# the file again. Returns the same list as count_words().
# Dies with "cannot open $path: ..." when the file cannot be opened.
sub count_words_in_file {
    my ($path) = @_;

    # Three-argument open: the file name can never be taken for a mode.
    open( my $fh, '<', $path ) or die "cannot open $path: $!";
    my ( $n_tokens, $freq_of ) = count_words($fh);
    close $fh;

    return ( $n_tokens, $freq_of );
}

# build_report($n_tokens1, $freq1, $n_tokens2, $freq2)
#
# Formats the comparison of two files as a single string: four summary
# lines (tokens and types per file), a table header, and one row per word
# found in either file, sorted by word. $freq1 and $freq2 are hash
# references as returned by count_words(). A word that is missing from one
# of the files is shown with a count of 0 for that file.
sub build_report {
    my ( $n_tokens1, $freq1, $n_tokens2, $freq2 ) = @_;

    my $n_types1 = scalar keys %{$freq1};
    my $n_types2 = scalar keys %{$freq2};

    my $report = '';
    $report .= "num of words in the 1st file: $n_tokens1\n";
    $report .= "num of words in the 2nd file: $n_tokens2\n";
    $report .= "num of types in the 1st file: $n_types1\n";
    $report .= "num of types in the 2nd file: $n_types2\n\n";

    $report .= sprintf( "%4s: %15s %10s %10s", 'ID', 'word', '1st File', '2nd File' );
    $report .= "\n";
    $report .= "-------------------------------------------------------\n";

    # Union of the words of both files; only the keys matter here.
    my %seen = ( %{$freq1}, %{$freq2} );

    my $id = 0;
    for my $word ( sort keys %seen ) {
        my $count1 = exists $freq1->{$word} ? $freq1->{$word} : 0;
        my $count2 = exists $freq2->{$word} ? $freq2->{$word} : 0;

        $report .= sprintf( "%4s: %15s %10s %10s", $id, $word, $count1, $count2 );
        $report .= "\n";
        $id++;
    }

    return $report;
}

# main($first_file, $second_file)
#
# Entry point: counts the words of both files and prints the report to
# standard output. Dies with a usage message when fewer than two file
# names are given.
sub main {
    my ( $first_file, $second_file ) = @_;

    die "usage: $0 FIRST_FILE SECOND_FILE\n"
        unless defined $first_file && defined $second_file;

    my ( $n_tokens1, $freq1 ) = count_words_in_file($first_file);
    my ( $n_tokens2, $freq2 ) = count_words_in_file($second_file);

    print build_report( $n_tokens1, $freq1, $n_tokens2, $freq2 );

    return;
}

# Run only when executed as a script, so that the subroutines above can be
# loaded from another file without side effects.
main(@ARGV) unless caller;

1;