# No.38 Perl Sample (code key - 53)
# Learning Perl: Lemmatize words with a dictionary
# Tomonori Nagano
# Last Update: January 18, 2008
#
# This file is encoded in Unicode (UTF-8). If you see gibberish characters,
# please re-encode the file in utf-8.
#
# Require: Prof. Yasumasa Someya's "e_lemma.txt" file
# Usage: perl lemma.pl e_lemma.txt yourSourceFile(s).txt
#
# Output: every token of the source file(s), lower-cased and replaced by its
# lemma when the dictionary knows it, printed space-separated on one line,
# followed by the token count and the number of replacements.

use strict;
use warnings;

# Build the lookup table (inflected form => lemma) from the dictionary file.
#
# Each dictionary entry has the shape "lemma -> form1,form2,...".  Lines that
# start with '[' are comments.  Returns a reference to the lookup hash; dies
# if the file cannot be opened.
sub load_lemma_map {
    my ($path) = @_;

    my %lemma_of;
    open my $dict_fh, '<', $path or die "cannot open the file: $!";
    while ( my $entry = <$dict_fh> ) {
        chomp $entry;
        $entry =~ s/\r\z//;          # tolerate CRLF line endings
        next if $entry =~ /^\[/;     # comment line

        my ( $lemma, $forms ) = split / -> /, $entry;
        next unless defined $forms;  # blank or malformed line: nothing to map

        # Multiple inflected forms are separated by commas.
        for my $form ( split /,/, $forms ) {
            $lemma_of{$form} = $lemma;
        }
    }
    close $dict_fh;

    return \%lemma_of;
}

# Lemmatize one source file and print the resulting tokens to STDOUT.
#
# Takes the file path and the lookup hash reference from load_lemma_map().
# Returns the list (tokens seen, tokens replaced by a lemma); dies if the
# file cannot be opened.
sub lemmatize_file {
    my ( $path, $lemma_of ) = @_;

    my ( $tokens, $replaced ) = ( 0, 0 );
    open my $in_fh, '<', $path or die "Couldn't open the file: $!\n";
    while ( my $line = <$in_fh> ) {
        $line =~ tr/A-Z/a-z/;        # normalize to lower case

        # Negative definition of punctuation: anything that is not a hyphen,
        # a space or an ASCII alphanumeric becomes its own token.
        # NOTE(review): the handle is read as raw bytes, so a multi-byte
        # UTF-8 character is split into separate byte tokens here -- confirm
        # whether non-ASCII input needs to be supported.
        $line =~ s/([^\- a-zA-Z0-9])/ $1 /g;

        for my $word ( split ' ', $line ) {
            $tokens++;
            my $output = $word;
            if ( exists $lemma_of->{$word} ) {
                $output = $lemma_of->{$word};
                $replaced++;
            }
            print "$output ";
        }
    }
    close $in_fh;

    return ( $tokens, $replaced );
}

# The first argument is e_lemma.txt; all remaining arguments are source files.
my $eLemmaFile = shift @ARGV;
die "Usage: perl lemma.pl e_lemma.txt yourSourceFile(s).txt\n"
    unless defined $eLemmaFile;
my @files = @ARGV;

my $lemma_of = load_lemma_map($eLemmaFile);

# Counters start at 0 so the summary prints a number even with no input.
my $nTokens     = 0;
my $nLemmanized = 0;
for my $file (@files) {
    my ( $tokens, $replaced ) = lemmatize_file( $file, $lemma_of );
    $nTokens     += $tokens;
    $nLemmanized += $replaced;
}

print "\n";
print "The number of words:\t$nTokens\n";                  # total tokens
print "The number of lemmanizations:\t$nLemmanized\n";     # tokens replaced