Annotation of loncom/publisher/thesaurus.pl, revision 1.1
1.1 ! www 1: use strict;
! 2:
! 3: # ---------------------- Read word file
! 4:
! 5: open(IN,"rawkey.txt");
! 6: print "Opened file\n";
! 7: my $readdata='';
! 8: my %wordcount=();
! 9: my %wordindex=();
! 10: my %related=();
! 11: my $line;
! 12: while ($line=<IN>) {
! 13: $readdata.=$line;
! 14: }
! 15: close(IN);
! 16: undef $line;
! 17:
! 18: # ---------------------- Get all words and index them
! 19:
! 20: my $index=0;
! 21: map {
! 22: map {
! 23: my $word=$_;
! 24: if ($word) {
! 25: $word=~tr/A-Z/a-z/;
! 26: $wordcount{$word}++;
! 27: unless (defined $wordindex{$word}) {
! 28: $index++;
! 29: $wordindex{$word}=$index;
! 30: }
! 31: }
! 32: } split(/\W+/,$_);
! 33: } split(/(\n|\r)+/,$readdata);
! 34:
! 35: print "Built word index\n";
! 36:
! 37: # ---------------------- Find related words
! 38:
! 39: map {
! 40: my $line=$_;
! 41: if ($line) {
! 42: my @words=split(/\W+/,$line);
! 43: map {
! 44: my $word=$_;
! 45: if ($word) {
! 46: $word=~tr/A-Z/a-z/;
! 47: my $twordidx=$wordindex{$word};
! 48: if ($twordidx) {
! 49: my %alreadyrelated=();
! 50: if (defined $related{$twordidx}) {
! 51: map {
! 52: my ($idx,$count)=split(/\:/,$_);
! 53: $alreadyrelated{$idx}=$count;
! 54: } split(/\,/,$related{$twordidx});
! 55: }
! 56: map {
! 57: my $rword=$_;
! 58: $rword=~tr/A-Z/a-z/;
! 59: if (($rword) && ($rword ne $word)) {
! 60: my $rwordidx=$wordindex{$rword};
! 61: if (defined $alreadyrelated{$rwordidx}) {
! 62: $alreadyrelated{$rwordidx}++;
! 63: } else {
! 64: $alreadyrelated{$rwordidx}=1;
! 65: }
! 66: }
! 67: } @words;
! 68: $related{$twordidx}='';
! 69: map {
! 70: $related{$twordidx}.=$_.':'.$alreadyrelated{$_}.',';
! 71: } keys %alreadyrelated;
! 72: chop $related{$twordidx};
! 73: } else {
! 74: print "Warning! Unknown word: ".$word;
! 75: }
! 76: }
! 77: } @words;
! 78: }
! 79: } split(/(\n|\r)+/,$readdata);
! 80:
! 81: print "Built hash of related words\n";
! 82:
! 83: # ---------------------- Output
! 84:
! 85: open(OUT,">thesaurus.dat");
! 86: map {
! 87: my $wordidx=$wordindex{$_};
! 88: print OUT $_.'@'.$wordidx.'@'.$wordcount{$_}.'@'.$related{$wordidx}.
! 89: "\n";
! 90: } sort keys %wordindex;
! 91: close(OUT);
! 92:
! 93: print "Wrote thesaurus file\n";
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>