#!/apps/bin/perl -w
use strict;
use warnings;

# This script computes the overlap between two text units.
# Each text unit is treated as a set of elements, so the measure is
# symmetric.
#
# Two positional parameters:
#   text1: a string
#   text2: a string
# Elements are separated by exactly one space (consecutive or leading
# spaces therefore produce empty-string tokens, which count as elements).
#
# Example:
#   text1 = x a x
#   text2 = b x
#   overlap is 1
#
# Output:
#   OVERLAP:    number of distinct tokens that occur in both texts
#   NORMALIZED: overlap / (distinct1 + distinct2 - overlap); reported as 0
#               when neither text contains any token.

my $text1 = shift;
my $text2 = shift;

# A missing argument is treated as an empty text instead of being passed
# to split as undef.
$text1 = '' unless defined $text1;
$text2 = '' unless defined $text2;

# Store the tokens in hash tables: the key is the token, the value is the
# number of times the token appears in the text.
my %set1 = %{ count_tokens($text1) };
my %set2 = %{ count_tokens($text2) };

my $set1_count = scalar keys %set1;
my $set2_count = scalar keys %set2;

my $overlap    = set_overlap(\%set1, \%set2);
my $normalized = normalized_overlap($overlap, $set1_count, $set2_count);

print "OVERLAP: $overlap\n";
print "NORMALIZED: $normalized\n";

# count_tokens($text)
# Splits $text on single spaces and returns a reference to a hash that maps
# each token to the number of times it appears in $text.
sub count_tokens {
    my ($text) = @_;

    my %count_of;
    foreach my $token (split(/ /, $text)) {
        $count_of{$token}++;
    }

    return \%count_of;
}

# normalized_overlap($overlap, $count1, $count2)
# Returns $overlap divided by the size of the union of the two sets
# ($count1 + $count2 - $overlap). Returns 0 when the union is empty, so
# that two empty texts do not cause a division by zero.
sub normalized_overlap {
    my ($shared, $count1, $count2) = @_;

    my $union = $count1 + $count2 - $shared;
    return 0 if $union <= 0;

    return $shared / $union;
}

# set_overlap uses references to the sets, so you must invoke it like
#   set_overlap(\%set1, \%set2)
# Returns the number of keys of the first hash that also exist in the
# second hash. The values stored in the hashes are ignored.
sub set_overlap {
    my ($s1, $s2) = @_;

    my $shared = 0;
    foreach my $key (keys %$s1) {
        $shared++ if exists $s2->{$key};
    }

    return $shared;
}

# show_tokens($tokens, $differents, %table)
# Debugging helper: prints the first argument after "TOKENS:", the second
# after "DIFFERENTS:", and then every key of the remaining key/value list,
# one per line, in hash order.
sub show_tokens {
    my ($tokens, $differents, %table) = @_;

    print "TOKENS: $tokens\n";
    print "DIFFERENTS: $differents\n";

    foreach my $key (keys %table) {
        print "$key\n";
    }

    return;
}