# No.20 Perl Sample (code key - 68)
# Learning Perl: count bigram types 1
# Tomonori Nagano
# Last Update: January 18, 2008
#
# This file is encoded in Unicode (UTF-8). If you see gibberish characters,
# please re-encode the file in utf-8.
#
# this sample script is simplified for pedagogical purposes. Those who are
# interested in advanced Perl programming are encouraged to consult with
# relevant sections of "Perl Cookbook" by Christiansen & Torkington
#
# Usage: perl SCRIPT [FILE ...]      (reads STDIN when no FILE is given)
#
# Output: the number of bigram tokens, the number of bigram types, a blank
# line, and then one "bigram<TAB>frequency" line per type, most frequent
# first.

use strict;
use warnings;

# Split one line of text into lower-cased tokens.
#
# Every character that is not a hyphen, a space, an ASCII letter or a digit
# is treated as punctuation and padded with spaces so that it becomes a
# token of its own.  Returns the list of tokens (possibly empty).
sub tokenize_line {
    my ($line) = @_;

    chomp $line;                             # delete the end-of-line character
    $line = lc $line;                        # change to lower case
    $line =~ s/([^\- a-zA-Z0-9])/ $1 /g;     # a negative definition of punctuation

    # split ' ' splits on runs of whitespace and ignores leading whitespace,
    # so it never produces an empty token.
    return split ' ', $line;
}

# Count the bigrams formed by each pair of adjacent words in @words.
#
# A bigram is skipped when it contains anything other than ASCII letters and
# spaces (i.e. when either word holds punctuation, a digit or a hyphen).
# Returns ($num_tokens, \%frequency_of): the number of bigrams counted and a
# hash reference mapping "word1 word2" to its frequency.
sub count_bigrams {
    my @words = @_;

    my $num_tokens = 0;
    my %frequency_of;

    for my $i (0 .. $#words - 1) {
        my $bigram = "$words[$i] $words[$i + 1]";
        next if $bigram =~ /[^ a-zA-Z]/;     # skip if bigram contains non-alphabet
        $num_tokens++;
        $frequency_of{$bigram}++;            # a missing key autovivifies to 1
    }

    return ($num_tokens, \%frequency_of);
}

# Return the keys of the given frequency hash, most frequent first.  Ties are
# broken alphabetically so that the output does not depend on hash order.
sub sorted_bigrams {
    my ($frequency_of) = @_;

    return sort {
        $frequency_of->{$b} <=> $frequency_of->{$a}
            or $a cmp $b
    } keys %{$frequency_of};
}

# Read the input line by line, count the bigrams, and print the report.
sub main {
    my @words;
    while (my $line = <>) {                  # this reads the default file line by line
        push @words, tokenize_line($line);
    }

    # Append an empty end-of-text marker.  The last real word is therefore
    # counted once more, paired with the empty string ("word ").
    push @words, '';

    my ($num_tokens, $frequency_of) = count_bigrams(@words);
    my $num_types = keys %{$frequency_of};   # each distinct bigram is one type

    print "numBigrams\t$num_tokens\n";
    print "numBigramTypes\t$num_types\n\n";
    for my $bigram (sorted_bigrams($frequency_of)) {
        print "$bigram\t$frequency_of->{$bigram}\n";
    }

    return;
}

# Run the report only when executed as a script, so that the helper
# subroutines can be loaded without consuming any input.
main() unless caller;