# No.39 Perl Sample (code key - 54)
# Learning Perl: Lemmatize words with an FSA
# (convert words to their lemmas using an FSA, without a dictionary file)
# Tomonori Nagano
# Last Update: January 18, 2008
#
# This file is encoded in Unicode (UTF-8). If you see gibberish characters,
# please re-encode the file in utf-8.
#
# This is an extremely naive FSA lemmatization model.
# See Porter (1980) for a more sophisticated program of rule-based suffix
# stripping.
#
# Usage: reads lines from the files named on the command line (or from
# STDIN), and prints every whitespace-separated word of each line after
# lemmatization, each word followed by a single space, one output line per
# input line.

use strict;
use warnings;

# Irregular inflected forms and the lemma each one maps to.
my %LEMMA_OF_IRREGULAR = (
    caught => 'catch',    # catch becomes caught
    geese  => 'goose',    # goose becomes geese
    ran    => 'run',
    wrote  => 'write',
    has    => 'have',
    had    => 'have',
);

# Matches any irregular form as a whole word.  The leading \b keeps the rule
# from rewriting the tail of unrelated words (e.g. "bran" must not become
# "brun").
my $IRREGULAR_RE = do {
    my $alternatives = join '|', map { quotemeta } sort keys %LEMMA_OF_IRREGULAR;
    qr/\b($alternatives)\b/i;
};

# lemmatize_word($word)
#
# Returns a copy of $word with its inflection rewritten/stripped by a fixed
# sequence of suffix rules.  The rules are applied in order and each one sees
# the output of the previous one: irregular spellings are first normalized to
# a "regular" spelling, and then the bare -ed / -s / -ing suffixes are removed.
# The argument itself is not modified.
sub lemmatize_word {
    my ($word) = @_;

    # Take care of some of the irregular inflections.  The replacement is
    # always the lower-case lemma, whatever the case of the input.
    $word =~ s/$IRREGULAR_RE/$LEMMA_OF_IRREGULAR{ lc $1 }/ge;

    # Change the irregular spelling to the regular spelling:
    # noun pluralization and 3rd person singular -s.

    # s/z/x/ch/sh take -es.  This must be an alternation, not a character
    # class: a class would also match a lone "c" or "h" ("faces" -> "facs").
    # Known problem: words like "loses" are still mangled.
    $word =~ s/(.*(?:s|z|x|ch|sh))es\b/$1s/ig;
    $word =~ s/(.*[^aiueo])ies\b/$1ys/ig;    # -y not following a vowel takes -ies
    $word =~ s/(.*o)es\b/$1os/ig;            # -o takes -oes
    $word =~ s/(.*)ves\b/$1fs/ig;            # -f takes -ves

    # FIXME(review): this appends "ses" to words that already end in "sis"
    # ("analysis" -> "analysisses", which the -s rule below turns into
    # "analysisse").  The original comment ("-ses takes -sis") suggests the
    # opposite direction was meant; behavior is kept as-is because the
    # intended mapping is unclear.
    $word =~ s/(.*sis)\b/$1ses/ig;

    # Change the irregular spelling to the regular spelling:
    # verb tense inflection.
    $word =~ s/(.*[^aiueo])ied\b/$1yed/ig;   # -y not following a vowel takes -ied
    $word =~ s/(.*)[ie]ed\b/$1ed/ig;         # -ie and -ee take -ied/-eed
    $word =~ s/(.*c)king\b/$1ing/ig;         # -c takes -cking
    # A doubled final consonant before -ing is reduced to a single one
    # (\2 is the backreference to the consonant captured inside group 1).
    $word =~ s/(.*([qrtpsdfgklbnm]))\2ing\b/$1ing/ig;

    # Take out the inflection morpheme (a tag could be emitted instead).
    $word =~ s/(.*)ed\b/$1/ig;               # -ed is deleted
    # -s is deleted (may be tagged with <3ps_pl>).  This causes a lot of
    # problems with many words (e.g., this, kiss, pass etc.).
    $word =~ s/(.*)s\b/$1/ig;
    $word =~ s/(.*)ing\b/$1/ig;              # -ing is deleted

    return $word;
}

# lemmatize_line($line)
#
# Splits $line on runs of whitespace (leading whitespace is ignored, as with
# a bare `split`) and returns the list of lemmatized words, in order.
sub lemmatize_line {
    my ($line) = @_;
    return map { lemmatize_word($_) } split ' ', $line;
}

# main()
#
# Filters the input named by @ARGV (or STDIN): for every input line, prints
# each lemmatized word followed by one space, then a newline.
sub main {
    while ( my $line = <> ) {
        chomp $line;
        print "$_ " for lemmatize_line($line);
        print "\n";
    }
    return;
}

# Run the filter only when executed as a script, so the subs above can be
# loaded and exercised without consuming input.
unless (caller) {
    main();
}

1;