File:  [LON-CAPA] / loncom / publisher / Attic / thesaurus.pl
Revision 1.1: download - view: text, annotated - select for diffs
Sat Oct 13 23:03:17 2001 UTC (22 years, 10 months ago) by www
Branches: MAIN
CVS tags: version_0_4, stable_2002_spring, stable_2002_july, stable_2002_april, stable_2001_fall, STABLE, HEAD
Helper programs to create thesaurus
Only thesaurus.dat should be installed.

    1: use strict;
    2: 
    3: # ---------------------- Read word file
    4: 
    5: open(IN,"rawkey.txt");
    6: print "Opened file\n";
    7: my $readdata='';
    8: my %wordcount=();
    9: my %wordindex=();
   10: my %related=();
   11: my $line;
   12: while ($line=<IN>) {
   13:     $readdata.=$line;
   14: }
   15: close(IN);
   16: undef $line;
   17: 
   18: # ---------------------- Get all words and index them
   19: 
   20: my $index=0;
   21: map {
   22:     map {
   23:        my $word=$_;
   24:        if ($word) {
   25:           $word=~tr/A-Z/a-z/;
   26:           $wordcount{$word}++;
   27:           unless (defined $wordindex{$word}) {
   28:               $index++;
   29:               $wordindex{$word}=$index;
   30:           }
   31:        }
   32:     } split(/\W+/,$_);
   33: } split(/(\n|\r)+/,$readdata);
   34: 
   35: print "Built word index\n";
   36: 
   37: # ---------------------- Find related words
   38: 
   39: map {
   40:    my $line=$_;
   41:    if ($line) {
   42:       my @words=split(/\W+/,$line);
   43:       map {
   44:          my $word=$_;
   45:          if ($word) {
   46:             $word=~tr/A-Z/a-z/;
   47:             my $twordidx=$wordindex{$word};
   48:             if ($twordidx) {
   49:                my %alreadyrelated=();
   50:                if (defined $related{$twordidx}) {
   51:                   map {
   52:                       my ($idx,$count)=split(/\:/,$_);
   53:                       $alreadyrelated{$idx}=$count;
   54:                   } split(/\,/,$related{$twordidx});
   55:                }
   56:                map {
   57:                    my $rword=$_;
   58:                    $rword=~tr/A-Z/a-z/;
   59:                    if (($rword) && ($rword ne $word)) {
   60:                       my $rwordidx=$wordindex{$rword};
   61:                       if (defined $alreadyrelated{$rwordidx}) {
   62:                          $alreadyrelated{$rwordidx}++;
   63:                       } else {
   64:                          $alreadyrelated{$rwordidx}=1;
   65:                       }
   66:                    }
   67:                } @words;
   68:                $related{$twordidx}='';
   69:                map {
   70:                    $related{$twordidx}.=$_.':'.$alreadyrelated{$_}.',';
   71:                } keys %alreadyrelated;
   72:                chop $related{$twordidx};            
   73:             } else {
   74:                print "Warning! Unknown word: ".$word;
   75:             }
   76:          }
   77:       } @words;
   78:    }
   79: } split(/(\n|\r)+/,$readdata);
   80: 
   81: print "Built hash of related words\n";
   82: 
   83: # ---------------------- Output
   84: 
   85: open(OUT,">thesaurus.dat");
   86: map {
   87:     my $wordidx=$wordindex{$_};
   88:     print OUT $_.'@'.$wordidx.'@'.$wordcount{$_}.'@'.$related{$wordidx}.
   89:           "\n";
   90: } sort keys %wordindex;
   91: close(OUT);
   92: 
   93: print "Wrote thesaurus file\n";

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>