use strict;
my @related=();
my @word=();
my @count=();
my %index=();
my $totalcount=0;
my $fuzzy=2;
# --------------------- Read thesaurus
open(IN,"thesaurus.dat");
while (my $entry=<IN>) {
my ($tword,$tindex,$tcount,$trelated)=split(/\@/,$entry);
$index{$tword}=$tindex;
$word[$tindex]=$tword;
$count[$tindex]=$tcount;
$totalcount+=$tcount;
$related[$tindex]=$trelated;
}
close(IN);
my $avecount=$totalcount/$#count;
print "Read thesaurus\n";
print "Average count: $avecount\n";
# --------------------- Test it
print "Input word [,fuzzyness]: ";
while (my $input=<STDIN>) {
chomp($input);
my ($newword,$newfuzzy)=split(/\,/,$input);
if ($newfuzzy) { $fuzzy=$newfuzzy; }
my $threshold=0.1/$fuzzy;
$newword=~s/\W//g;
$newword=~tr/A-Z/a-z/;
print "\n\n\n--- $newword (Fuzzy: $fuzzy) ---\n";
my $tindex=$index{$newword};
if ($tindex) {
if ($count[$tindex]>$avecount) {
print "\nKEYWORD\n\n";
} else {
print "\nNot keyword\n\n";
}
my %found=();
print "Related:\n";
map {
# - Related word found
my ($ridx,$rcount)=split(/\:/,$_);
# - Direct relation index
my $directrel=$rcount/$count[$tindex];
if ($directrel>$threshold) {
map {
my ($rridx,$rrcount)=split(/\:/,$_);
if ($rridx==$tindex) {
# - Determine reverse relation index
my $revrel=$rrcount/$count[$ridx];
# - Calculate full index
$found{$ridx}=$directrel*$revrel;
if ($found{$ridx}>$threshold) {
map {
my ($rrridx,$rrrcount)=split(/\:/,$_);
unless ($found{$rrridx}) {
my $revrevrel=$rrrcount/$count[$ridx];
if (
$directrel*$revrel*$revrevrel>$threshold
) {
$found{$rrridx}=
$directrel*$revrel*$revrevrel;
}
}
} split(/\,/,$related[$ridx]);
}
}
} split(/\,/,$related[$ridx]);
}
} split(/\,/,$related[$tindex]);
# - Print results
map {
if ($found{$_}>$threshold) {
print ' '.$word[$_].' '.$found{$_}."\n";
}
} sort { $found{$b}<=>$found{$a} } keys %found;
} else {
print "\nNot found\n\n";
}
print "\nInput word: ";
}
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>