File:  [LON-CAPA] / loncom / publisher / Attic / testthesaurus.pl
Revision 1.1: download - view: text, annotated - select for diffs
Sat Oct 13 23:03:17 2001 UTC (22 years, 8 months ago) by www
Branches: MAIN
CVS tags: version_0_4, stable_2002_spring, stable_2002_july, stable_2002_april, stable_2001_fall, STABLE, HEAD
Helper programs to create thesaurus
Only thesaurus.dat should be installed.

    1: use strict;
    2: 
    3: 
    4: my @related=();
    5: my @word=();
    6: my @count=();
    7: my %index=();
    8: my $totalcount=0;
    9: my $fuzzy=2;
   10: 
   11: # --------------------- Read thesaurus
   12: 
   13: open(IN,"thesaurus.dat");
   14: 
   15: while (my $entry=<IN>) {
   16:     my ($tword,$tindex,$tcount,$trelated)=split(/\@/,$entry);
   17:     $index{$tword}=$tindex;
   18:     $word[$tindex]=$tword;
   19:     $count[$tindex]=$tcount;
   20:     $totalcount+=$tcount;
   21:     $related[$tindex]=$trelated;
   22: }
   23: 
   24: close(IN);
   25: 
   26: my $avecount=$totalcount/$#count;
   27: 
   28: print "Read thesaurus\n";
   29: print "Average count: $avecount\n";
   30: 
   31: # --------------------- Test it
   32: print "Input word [,fuzzyness]: ";
   33: while (my $input=<STDIN>) {
   34:     chomp($input);
   35:     my ($newword,$newfuzzy)=split(/\,/,$input);
   36:     if ($newfuzzy) { $fuzzy=$newfuzzy; }
   37:     my $threshold=0.1/$fuzzy;
   38:     $newword=~s/\W//g;
   39:     $newword=~tr/A-Z/a-z/;
   40:     print "\n\n\n--- $newword (Fuzzy: $fuzzy) ---\n";
   41:     my $tindex=$index{$newword};
   42:     if ($tindex) {
   43:         if ($count[$tindex]>$avecount) {
   44:            print "\nKEYWORD\n\n";
   45:         } else {
   46:            print "\nNot keyword\n\n";
   47:         }
   48:         my %found=();
   49:         print "Related:\n";
   50:         map {
   51: # - Related word found
   52:             my ($ridx,$rcount)=split(/\:/,$_);
   53: # - Direct relation index
   54:             my $directrel=$rcount/$count[$tindex];
   55:             if ($directrel>$threshold) {
   56:                map {
   57:                   my ($rridx,$rrcount)=split(/\:/,$_);
   58:                   if ($rridx==$tindex) {
   59: # - Determine reverse relation index
   60:                      my $revrel=$rrcount/$count[$ridx];
   61: # - Calculate full index
   62:                      $found{$ridx}=$directrel*$revrel;
   63:                      if ($found{$ridx}>$threshold) {
   64:                         map {
   65:                             my ($rrridx,$rrrcount)=split(/\:/,$_);
   66:                             unless ($found{$rrridx}) {
   67:                                my $revrevrel=$rrrcount/$count[$ridx];
   68:                                if (
   69:                           $directrel*$revrel*$revrevrel>$threshold
   70:                                ) {
   71:                                   $found{$rrridx}=
   72:                                        $directrel*$revrel*$revrevrel;
   73:                                }
   74:                             }
   75:                         } split(/\,/,$related[$ridx]);
   76:                      }
   77:                   }
   78:                } split(/\,/,$related[$ridx]);
   79:             }
   80:         } split(/\,/,$related[$tindex]);
   81: # - Print results
   82:         map {
   83:            if ($found{$_}>$threshold) {
   84:               print '  '.$word[$_].' '.$found{$_}."\n";
   85:            }
   86:         } sort { $found{$b}<=>$found{$a} } keys %found;
   87:     } else {
   88:         print "\nNot found\n\n";
   89:     }
   90:     print "\nInput word: ";
   91: }
   92: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>