#!/apps/bin/perl -w
# This script computes the cosine between two texts, each treated as a
# bag of tokens.  Three variants are printed:
#   0/1) simple_cosine   - every distinct token has weight 1
#   a)   complete_cosine - every token is weighted by (count * idf)
#   b)   cosine_with_idf - implemented in an imported module, not here
#
# Usage: <script> (-l|-w) text1 text2
#   -l     use the lemma idf file ("HK-LEMMA-enidf")
#   -w     use the word idf file ("HK-WORD-enidf")
#   text1  a string
#   text2  a string
# Elements of each text are separated by one space, e.g.
#   text1 = x a x
#   text2 = b x
#   overlap is 1

use strict;
use warnings;

use IDF;
use COSINE;

my $eng_idf_word  = "HK-WORD-enidf";
my $eng_idf_lemma = "HK-LEMMA-enidf";

# The first argument selects lemmas or words; anything else (including a
# missing argument) is rejected.
my $option = shift;
die "Lemmas or Words???\n"
    unless defined $option && ($option eq "-l" || $option eq "-w");

my $idf_file = ($option eq "-l") ? $eng_idf_lemma : $eng_idf_word;
my $OPT      = ($option eq "-l") ? "LEX"          : "ORIGIN";

my $text1 = shift;
my $text2 = shift;
die "Usage: $0 (-l|-w) text1 text2\n"
    unless defined $text1 && defined $text2;

# open_nidf / get_nidf / cosine_with_idf are not defined in this file; they
# are expected to be exported by the IDF / COSINE modules loaded above.
open_nidf($idf_file);

# Token => (occurrence count * idf) for each text.
my $set1 = _weighted_token_counts($text1);
my $set2 = _weighted_token_counts($text2);

my $result1 = simple_cosine($set1, $set2);
my $result2 = complete_cosine($set1, $set2);
my $result3 = cosine_with_idf($text1, $text2, "ENG", $OPT);

# NOTE(review): the "Cosinr" label is misspelt; it is left untouched in case
# something parses this output - confirm before correcting it.
print "Cosinr 0/1) $result1\n";
print "Cosine a): $result2\n";
print "Cosine b): $result3\n";

# Build the weighted token table for one text.
#   $text - string whose tokens are separated by single spaces
# Returns a hash reference: the key is the token, the value is the number
# of times the token appears in the text multiplied by get_nidf(token).
# Prints one "KEY: ..., VALUE: ..." line per distinct token.
sub _weighted_token_counts {
    my ($text) = @_;

    my %weight_of;
    $weight_of{$_}++ for split(/ /, $text);

    foreach my $key (keys %weight_of) {
        # presumably a normalised idf weight; verify against the IDF module
        my $idf = get_nidf($key);
        print "KEY: $key, VALUE: $idf\n";
        $weight_of{$key} *= $idf;
    }

    return \%weight_of;
}

# Simple cosine: assumes weight 1 for the presence of a word, so the stored
# values are ignored and only the keys matter.
#   $s1, $s2 - hash references keyed by token
# Returns |shared keys| / sqrt(|keys of s1| * |keys of s2|), or 0 when
# either table is empty (which would otherwise divide by zero).
sub simple_cosine {
    my ($s1, $s2) = @_;

    my $size1 = scalar keys %$s1;
    my $size2 = scalar keys %$s2;
    return 0 if $size1 == 0 || $size2 == 0;

    # grep in scalar context yields the number of keys present in both.
    my $overlap = grep { exists $s2->{$_} } keys %$s1;

    return $overlap / sqrt($size1 * $size2);
}

# Debugging helper (not called by the script itself): prints the two counts
# followed by every key of the table.
#   $_[0]   - total number of tokens
#   $_[1]   - number of distinct tokens
#   rest    - flattened key/value list of the token table
sub show_tokens {
    my ($token_total, $distinct_total, %table) = @_;

    print "TOKENS: $token_total\n";
    print "DIFFERENTS: $distinct_total\n";

    foreach my $key (keys %table) {
        print "$key\n";
    }

    return;
}

# Complete cosine: dot product of the two weighted vectors divided by the
# product of their Euclidean norms.
#   $s1, $s2 - hash references mapping token => numeric weight
# Returns the cosine, or 0 when the vectors share no weight.
# Prints every entry of $s1 ("==> ...") and of $s2 ("++> ...") as it goes.
sub complete_cosine {
    my ($s1, $s2) = @_;

    my $dot   = 0;
    my $norm1 = 0;
    my $norm2 = 0;

    foreach my $key (keys %$s1) {
        my $val = $s1->{$key};
        print "==> $key , $val\n";
        $norm1 += $val * $val;
        $dot   += $val * $s2->{$key} if exists $s2->{$key};
    }

    foreach my $key (keys %$s2) {
        my $val = $s2->{$key};
        print "++> $key, $val\n";
        $norm2 += $val * $val;
    }

    return 0 if $dot == 0;

    # A zero denominator can only coexist with a non-zero dot product through
    # floating-point underflow; treat that as "no similarity".
    my $denominator = sqrt($norm1 * $norm2);
    return 0 if $denominator == 0;

    return $dot / $denominator;
}