# No.22 Perl Sample (code key - 57)
# Learning Perl: count bigrams 2 (counting the number of bigrams)
# Tomonori Nagano
# Last Update: January 18, 2008
#
# This file is encoded in Unicode (UTF-8). If you see gibberish characters,
# please re-encode the file in utf-8.
#
# This sample script is simplified for pedagogical purposes. Those who are
# interested in advanced Perl programming are encouraged to consult the
# relevant sections of "Perl Cookbook" by Christiansen & Torkington.
#
# Usage: the script reads the files named on the command line (or standard
# input when none are given), splits the text into lower-cased tokens, and
# prints the number of tokens, which must equal the number of bigrams once an
# end-of-document marker has been appended.

use strict;
use warnings;

# Lower-case one line of text and split it into tokens.
#
# Parameter: $line - one line of input (a trailing newline is fine).
# Returns:   the list of tokens found on the line; empty for a blank line.
#
# Every character other than a hyphen, a space, an ASCII letter or an ASCII
# digit is treated as punctuation and becomes a token of its own. This is a
# negative definition of punctuation, so it also covers the newline and any
# non-ASCII byte.
sub tokenize_line {
    my ($line) = @_;

    $line =~ tr/A-Z/a-z/;                      # normalize to lower case
    $line =~ s/([^\- a-zA-Z0-9])/ $1 /g;       # pad punctuation with spaces

    # split ' ' discards leading whitespace and splits on runs of whitespace,
    # so the padding added above never produces empty tokens.
    return split ' ', $line;
}

# Build the bigrams of a token sequence.
#
# Parameter: @tokens - the tokens in document order.
# Returns:   one "first second" string per pair of consecutive tokens, i.e.
#            one fewer than the number of tokens (none for 0 or 1 tokens).
sub make_bigrams {
    my @tokens = @_;

    return map { "$tokens[$_] $tokens[$_ + 1]" } 0 .. $#tokens - 1;
}

my @words;                                     # every token of the input

while ( my $line = <> ) {                      # read the input line by line
    push @words, tokenize_line($line);
}

# Count the words before the boundary marker below is appended.
my $numWords = scalar @words;

# Append an empty end-of-document marker so that the last word also starts a
# bigram; with it, the number of bigrams equals the number of words.
push @words, '';

my @bigrams    = make_bigrams(@words);
my $numBigrams = scalar @bigrams;

if ( $numWords != $numBigrams ) {              # the two counts must agree
    print "something is wrong!\n";
}
else {                                         # report the shared frequency
    print "numWords/numBigrams\t$numWords\n";
}