# No.33 Perl Sample (code key - 64)
# Learning Perl: Count Word Types 5 (count word types and compute Zipf's constant)
# Tomonori Nagano
# Last Update: January 18, 2008
#
# This file is encoded in Unicode (UTF-8). If you see gibberish characters,
# please re-encode the file in utf-8.
#
# This sample script is simplified for pedagogical purposes. Those who are
# interested in advanced Perl programming are encouraged to consult the
# relevant sections of "Perl Cookbook" by Christiansen & Torkington.
#
# Input : the files named on the command line, or STDIN when none are given.
# Output: the Zipf constant followed by a table with one row per word type,
#         ordered by descending frequency (ties broken alphabetically).

use strict;
use warnings;

# extractWords($line)
# Returns the list of purely alphabetic (ASCII a-z) words found in $line.
# The line is lower-cased first; punctuation is split off into its own tokens
# and then discarded together with any token containing a digit or a hyphen.
sub extractWords {
    my ($line) = @_;

    $line =~ tr/A-Z/a-z/;                 # normalize to lower case (ASCII only)
    $line =~ s/([^\- a-z0-9])/ $1 /g;     # pad every "punctuation" char with spaces

    # split ' ' discards leading whitespace and splits on whitespace runs;
    # keep only the tokens made up entirely of letters.
    return grep { !/[^a-z]/ } split ' ', $line;
}

# rankedTypes(\%freq)
# Returns the keys of %freq ordered by descending frequency. Equal
# frequencies are ordered alphabetically so the output is reproducible.
sub rankedTypes {
    my ($freqOf) = @_;

    return sort { $freqOf->{$b} <=> $freqOf->{$a} or $a cmp $b }
           keys %{$freqOf};
}

# zipfConstant(\%freq, $total)
# Returns the sum of rank*frequency over all types divided by $total, where
# rank 1 is the most frequent type. Returns 0 when $total is 0 (no words
# were counted) instead of dividing by zero.
# NOTE(review): the divisor is the token count, as in the original script;
# a mean rank*freq per type would divide by the number of types instead --
# confirm which one is intended.
sub zipfConstant {
    my ($freqOf, $total) = @_;

    return 0 unless $total;

    my $rank = 0;
    my $sum  = 0;
    foreach my $type (rankedTypes($freqOf)) {
        $rank++;
        $sum += $rank * $freqOf->{$type};   # rank*freq, accumulated
    }
    return $sum / $total;
}

my $numWords = 0;   # number of word tokens counted
my %seen;           # word type => frequency

while (my $line = <>) {                     # read the input line by line
    foreach my $word (extractWords($line)) {
        $seen{$word}++;                     # increase the type frequency
        $numWords++;                        # increase the count every time
    }
}

# compute Zipf's constant
my $zipfConst = zipfConstant(\%seen, $numWords);
print "Zipf's constant: $zipfConst\n";

# print the label for a formatted output
printf "%15s %15s %20s %20s %20s",
    'Type', 'Type Frequency', 'Type Probability', 'rank*freq', 'difference';
print "\n-------------------------------------------------------------------------------------------------\n";

my $rank = 0;
foreach my $key (rankedTypes(\%seen)) {
    $rank++;
    my $rankFreq = $rank * $seen{$key};     # rank*freq for this type

    # formatted printout; $numWords is non-zero whenever %seen has a key
    printf "%15s %15s %20.5f %20s %20.5f",
        $key, $seen{$key}, $seen{$key} / $numWords,
        $rankFreq, $rankFreq - $zipfConst;
    print "\n";
}