# No.53 Perl Sample (code key - 47)
# Program: hyphenate words using a dictionary
# Tomonori Nagano
# Last Update: January 18, 2008
#
# This file is encoded in Unicode (UTF-8). If you see gibberish characters,
# please re-encode the file in utf-8.
#
# Usage: perl morphe.pl mhyph.txt corpusDIR
#
# The first argument is a hyphenation dictionary with one entry per line,
# where "-" separates the parts of a word (e.g. "un-happy").  Every
# remaining argument is opened as a corpus file and read line by line.
# NOTE(review): the usage text says "corpusDIR", but each argument is passed
# straight to open(); presumably callers pass a file list (e.g. corpusDIR/*).
#
# Output (STDOUT):
#   1. each corpus file echoed in lower case on a single line, with every
#      word found in the dictionary replaced by its hyphenated form;
#   2. a table "part<TAB>count" of the word parts that are not themselves
#      dictionary entries, most frequent first;
#   3. the totals numTotalWords / numConvWord / numMorphs.

use strict;
use warnings;

my (%morphe, %mhyph, %seen);
my $numWords  = 0;    # whitespace-separated words read from the corpus
my $numMorphs = 0;    # parts obtained by splitting the converted words
my $numConv   = 0;    # words replaced by their hyphenated form

if ($#ARGV < 1) {
    die "usage: perl morphe.pl mhyph.txt corpusDIR\n" ;
}

# --- Load the hyphenation dictionary ------------------------------------
# %seen  : lower-cased entry exactly as written (hyphens kept) => line count
# %mhyph : lower-cased entry without hyphens => lower-cased hyphenated entry
my $mhyphFile = shift @ARGV;
open(my $dictFh, '<', $mhyphFile) or die "cannot open the file: $!" ;
while (my $entry = <$dictFh>) {
    # Strip LF and any CR so that CRLF dictionaries produce usable keys.
    $entry =~ s/[\r\n]+\z//;
    $entry = lc $entry;
    $seen{$entry}++;
    (my $plainWord = $entry) =~ s/\-//g;
    $mhyph{$plainWord} = $entry;
}
close($dictFh);

# --- Hyphenate every corpus file ----------------------------------------
while (defined(my $corpusFile = shift @ARGV)) {
    open(my $inFh, '<', $corpusFile) or die "cannot open the file: $!" ;
    while (my $line = <$inFh>) {
        my @words = split(" ", lc($line));
        $numWords += scalar @words;

        foreach my $word (@words) {
            my $hyphenated = $mhyph{$word};

            # Words missing from the dictionary are echoed unchanged.
            # defined() rather than truthiness, so that an entry such as
            # "0" is still recognised.
            if (!defined $hyphenated) {
                print "$word " ;
                next;
            }

            print "$hyphenated " ;
            $numConv++;

            my @parts = split("-", $hyphenated);
            $numMorphs += scalar @parts;
            foreach my $part (@parts) {
                # Parts that are dictionary entries in their own right are
                # not counted in the frequency table.
                next if defined $seen{$part};
                $morphe{$part}++;
            }
        }
    }
    close($inFh);
    print "\n" ;    # one output line per corpus file
}

# --- Report --------------------------------------------------------------
# Most frequent parts first; ties are broken alphabetically so that the
# output is reproducible from run to run.
foreach my $key (sort { $morphe{$b} <=> $morphe{$a} or $a cmp $b } keys %morphe) {
    print "$key\t$morphe{$key}\n" ;
}
print "\n" ;
print "numTotalWords:\t$numWords\n" ;
print "numConvWord:\t$numConv\n" ;
print "numMorphs:\t$numMorphs\n" ;