# No.54 Perl Sample (code key - 66) # Programs: minimum edit distance (MEDで2つのセンテンスの近似値を調べる) # Tomonori Nagano # Last Update: January 18, 2008 # # This file is encoded in Unicode (UTF-8). If you see gibberish characters, # please re-encode the file in utf-8. # use strict ; # use strict to restrict global variables my ($targetWord,$sourceWord,$n,$m,@targetChars,@sourceChars,@mat) ; # output an error if the number of command-line arguments is not 2 # if ($#ARGV != 1) { print "usage \n" ; exit ; } # the first word is the "target word" and the second word is the "source word". # my $word1 = $ARGV[0] ; my $word2 = $ARGV[1] ; &distance($word1,$word2) ; # running the distance subroutine. ######################################################### # SUBROUTINE # ######################################################### # precondition: the subroutine takes two words (whitespace is not allowed) # as arguments. # postcondition: the subroutine outputs a minimum edit distance table, which is # created by a milti-dimentional array, @mat. sub distance{ my ($targetWord,$sourceWord) = @_ ; # the 1st word is a target and # the second is a source word my ($n, $m) = (length $targetWord, length $sourceWord) ; # save the lengthes of each word # Initializing a multi-dimentional array @mat # $i is the column number (target) and $j is the row number (source) for (my $i=0; $i<=$n; $i++) { $mat[$i][0] = $i ; } for (my $j=0; $j<=$m; $j++) { $mat[0][$j] = $j ; } # split both the target word and source word @targetChars = split(//, $targetWord) ; @sourceChars = split(//, $sourceWord) ; for (my $i=1; $i<=$n; $i++) { for (my $j=1; $j<=$m; $j++) { # the insertion cost and deletion cost is always 1. # (a bit confusing; what if [$i-1] and [$j] are identical??) my $insertCost = 1 ; my $deleteCost = 1 ; # the substitution cost incurs if [$i-1] and [$j-1] are different # letters; if they are identical, the cost is zero my $subCost = ($targetChars[$i-1] eq $sourceChars[$j-1])?0:2 ; # the distance of the cell [$i][$j] is the minimal value of the # three possible distances $mat[$i][$j] = &minimal([$mat[$i-1][$j] + $insertCost, $mat[$i][$j-1] + $deleteCost, $mat[$i-1][$j-1] + $subCost]) ; } } # output the multi-diemntional array in a table format # unshift a null character to each word # unshift (@targetChars,"#") ; unshift (@sourceChars,"#") ; print "\n" ; # output the distance table. Begin the maximum value (the last word of the # source word) so that the table looks exactly like one in the textbook. # for(my $j=$m; $j>=0; $j--) { printf("%4s ",@sourceChars[$j]) ; # output the source word label for (my $i=0; $i<=$n; $i++) { printf("%4d ",$mat[$i][$j]) ; } print "\n" ; } # output the label for the target word # printf("%4s ",".") ; for(my $i=0; $i<=$n; $i++) { printf("%4s ",@targetChars[$i]) ; } print "\n\n"; # new line at end of row } sub minimal { my @list = @{$_[0]}; # save the arguements in an array # the arguements need to be calculated first my $min = $list[0]; # tentatively save the first argument as MED foreach my $i (@list) { # if the next argument is smaller than the $min = $i if ($i < $min); # current MED, change the MED } return $min; }