#!/apps/bin/perl -w
use strict;
use warnings;

use Measures;

# Computes the similarity between the 'canonical' bigrams of two texts.
#
# A text "a b c d e" yields the adjacent-token bigrams ab, bc, cd, de.
# Each bigram is put in 'canonical form': its two tokens are ordered by
# string comparison and joined with '#', so "b a" and "a b" both become
# "a#b".
#
# Usage: <script> TEXT1 TEXT2
#
# Output: the sorted distinct bigrams of TEXT1, then those of TEXT2, then
#   OVERLAP:    the value returned by set_overlap() for the two bigram sets
#   NORMALIZED: OVERLAP / (|set1| + |set2| - OVERLAP)
#
# set_overlap() is not defined in this file; presumably it is exported by
# Measures (see the commented-out reference implementation at the bottom)
# -- confirm against that module.

my $text1 = shift;
my $text2 = shift;

# A missing argument is treated as an empty text instead of feeding undef
# to split().
$text1 = '' unless defined $text1;
$text2 = '' unless defined $text2;

# Bigram tables: key is the canonical bigram, value is the number of times
# it appears in the text.
my $set1 = _count_canonical_bigrams($text1);
show_bigrams(%$set1);

my $set2 = _count_canonical_bigrams($text2);
show_bigrams(%$set2);

# Number of distinct bigrams in each text.
my $set1_count = scalar keys %$set1;
my $set2_count = scalar keys %$set2;

my $overlap = set_overlap($set1, $set2);

# The denominator is zero when neither text contains a bigram (fewer than
# two tokens each); report 0 rather than dying on a division by zero.
my $union      = $set1_count + $set2_count - $overlap;
my $normalized = $union ? $overlap / $union : 0;

print "OVERLAP: $overlap\n";
print "NORMALIZED: $normalized\n";

# Build the canonical-bigram table for one text.
#   $text   - string of tokens separated by single spaces
#   returns - hash ref: canonical bigram => occurrence count
sub _count_canonical_bigrams {
    my ($text) = @_;

    # NOTE(review): splitting on a single space means consecutive spaces
    # produce empty tokens (and bigrams such as "#a"). Kept as-is so the
    # output stays identical to earlier runs; consider split(' ', $text).
    my @tokens = split(/ /, $text);

    my %count_of;
    for my $i (0 .. $#tokens - 1) {
        my ($left, $right) = @tokens[$i, $i + 1];
        my $bigram = $left le $right ? "$left#$right" : "$right#$left";
        $count_of{$bigram}++;
    }
    return \%count_of;
}

# Print the keys of a bigram table, one per line, in sorted order.
#   @_ - the table flattened to a key/value list, i.e. called as
#        show_bigrams(%table)
sub show_bigrams {
    my %table = @_;
    foreach my $key (sort keys %table) {
        print "$key\n";
    }
    return;
}

# Reference implementation of set_overlap, kept for documentation only:
#
#sub set_overlap {
#    my $s1 = shift;
#    my $s2 = shift;
#    my $overlap = 0;
#
#    foreach $key (keys %$s1) {
#        if(exists($$s2{$key})) {
#            $overlap++;
#        }
#    }
#    return $overlap;
#}