Annotation of loncom/thesaurus/build_thesaurus_db.pl, revision 1.2

1.1       matthew     1: #!/usr/bin/perl -w
                      2: #
1.2     ! matthew     3: # $Id: build_thesaurus_db.pl,v 1.1 2002/07/11 20:48:31 matthew Exp $
1.1       matthew     4: #
                      5: #
                      6: # build_thesaurus_db.pl creates the LON-CAPA thesaurus database.
                      7: #
                      8: # Copyright Michigan State University Board of Trustees
                      9: #
                     10: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
                     11: #
                     12: # LON-CAPA is free software; you can redistribute it and/or modify
                     13: # it under the terms of the GNU General Public License as published by
                     14: # the Free Software Foundation; either version 2 of the License, or
                     15: # (at your option) any later version.
                     16: #
                     17: # LON-CAPA is distributed in the hope that it will be useful,
                     18: # but WITHOUT ANY WARRANTY; without even the implied warranty of
                     19: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     20: # GNU General Public License for more details.
                     21: #
                     22: # You should have received a copy of the GNU General Public License
                     23: # along with LON-CAPA; if not, write to the Free Software
                     24: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
                     25: #
                     26: # /home/httpd/html/adm/gpl.txt
                     27: #
                     28: # http://www.lon-capa.org/
                     29: #
                     30: use strict;
                     31: use Getopt::Long;
                     32: use GDBM_File;
                     33: # POD required stuff:
                     34: 
                     35: =pod
                     36: 
                     37: =head1 NAME
                     38: 
                     39: build_thesaurus_db.pl - Build the LON-CAPA thesaurus database.
                     40: 
                     41: =head1 SYNOPSIS
                     42: 
                     43: build_thesaurus_db.pl creates the LON-CAPA thesaurus database.
                     44: 
                     45: =head1 DESCRIPTION
                     46: 
                     47: build_thesaurus_db.pl reads two input files.  The first is a list of words to
                     48: omit from the thesaurus.  The second is the raw keyword data for the thesaurus.
                     49: From this file a database is built.
                     50: 
                     51: =head1 DATABASE FORMAT DESCRIPTION
                     52: 
                     53: The structure of the database entries is described below.  
                     54: 
                     55: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
                     56: 
                     57: Allow me to repeat myself:
                     58: 
                     59: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
                     60: 
                     61: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
                     62: 
                     63: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
                     64: 
                     65: Got it?  While you are reading this, let me encourage you to document
                     66: any changes to the structure of the database.  It is not that hard and
                     67: you will save much time if you do.  
                     68: 
                     69: That said, you should make sure the description below actually matches
                     70: the code, just to be safe.
                     71: 
                     72: This concludes the lecture portion of the comments.
                     73: 
                     74: =head1 DATABASE FORMAT DESCRIPTION
                     75: 
                     76: An entry in the database for a given word is shown below:
                     77: 
                     78:  polymerase = 42:dna,32:rna,30:transcription,19:protein,16:...
                     79:               |   |  |
                     80:               |   |  The number of times dna appeared in a keywords list
                     81:               |   |  with the word polymerase. 
                     82:               |   The related keyword
                     83:               The number of times polymerase appeared in a keywords list.
                     84: 
                     85: Note: the related words list will be in descending order of occurrence with 
                     86: the keyword.
                     87: 
                     88: =head1 COMMAND LINE OPTIONS
                     89: 
                     90: =over 4
                     91: 
                     92: 
                     93: =item --badwordfile <filename>
                     94: 
                     95: filename must contain a list of words not to put in the thesaurus.  
                     96: Each word must appear on its own line.
                     97: Currently comments are not supported.
                     98: 
                     99: =item --keywordfile <filename>
                    100: 
                    101: File containing the raw word data for the thesaurus.  Each line must be a
                    102: comma separated list of related keywords.
                    103: 
                    104: =item --outputdb <filename>
                    105: 
                    106: file to write the LON-CAPA thesaurus database to.
                    107: 
                    108: =item --help
                    109: 
                    110: Display this help message and exit.
                    111: 
                    112: =item --test
                    113: 
                    114: Run a few test lookups after writing the database.
                    115: 
                    116: =back
                    117: 
                    118: The following example shows the default values for each parameter
                    119: 
                    120: build_thesaurus_db.pl --badwordfile ./un_keyword.tab --outputdb ./thesaurus.db --keywordfile rawkey.txt
                    121: 
                    122: =cut
                    123: 
                    124: ##
                    125: ## Get command line parameters.  Options that are not given stay undef;
                    126: ## the three filename options receive their default values further down.
1.2     ! matthew   127: my ($badwordfile,$outputdbfile,$keywordfile,$help,$checkdates,$test);
1.1       matthew   128: GetOptions( "badwordfile=s" => \$badwordfile,   # --badwordfile <filename>
                    129:             "outputdb=s"    => \$outputdbfile,  # --outputdb <filename>
                    130:             "keywordfile=s" => \$keywordfile,   # --keywordfile <filename>
                    131:             "help"          => \$help,          # --help (flag)
1.2     ! matthew   132:             "checkdates"    => \$checkdates,    # --checkdates (flag)
1.1       matthew   133:             "test"          => \$test);         # --test (flag)
                    134: 
1.2     ! matthew   135: 
1.1       matthew   136: ## --help: print the usage summary and exit without building anything.
                    137: ## The here-document interpolates, so the line-continuation backslash in
                    138: ## the example command is written as \\ in order to print a single \.
                    139: if ($help) {
                    140:     print <<ENDHELP;
                    141: build_thesaurus_db.pl     Build a LON-CAPA thesaurus database.
                    142: 
                    143: Command line arguments
                    144:    --badwordfile <filename>     filename must contain a list of words not to
                    145:                                 put in the thesaurus.  Each word must appear
                    146:                                 on its own line and currently comments are not
                    147:                                 supported.
1.2     ! matthew   148:    --checkdates                 Check the creation dates on the files involved
        !           149:                                 and only run if the outputdb file was created
        !           150:                                 prior to one of the badword or keyword files.
1.1       matthew   151:    --keywordfile <filename>     File containing the raw word data for the
                    152:                                 thesaurus.  Each line must be a comma
                    153:                                 separated list of related keywords.
                    154:    --outputdb <filename>        file to write the LON-CAPA thesaurus database
                    155:                                 to.
                    156:    --help                       Display this help message and exit.
                    157:    --test                       Run a few test lookups after writing the 
                    158:                                 database.
                    159: The following example shows the default values for each parameter
                    160: 
                    161: build_thesaurus_db.pl --badwordfile ./un_keyword.tab \\
                    162:      --outputdb ./thesaurus.db --keywordfile rawkey.txt
                    163: 
                    164: ENDHELP
                    165:     exit;
                    166: }
                    167: 
                    168: ##
                    169: ## Fall back to the default filename for any option left unset (or given a
                    170: ## false value), then make sure both input files are present.
                    171: $badwordfile  ||= "./un_keyword.tab";
                    172: $outputdbfile ||= "./thesaurus.db";
                    173: $keywordfile  ||= "./rawkey.txt";
                    174: 
                    175: foreach my $file ($badwordfile,$keywordfile) {
                    176:     die "$file does not exist." unless (-e $file);
1.2     ! matthew   177: }
        !           178: 
        !           179: # Check the dates on the input files to be sure we need to run.  Every input
        !           180: # file is stat()ed itself; element 10 of the stat() list is its ctime.
        !           181: if ($checkdates && -s $outputdbfile) {
        !           182:     my @Results = stat($badwordfile);
        !           183:     my $highest_dependency_ctime = $Results[10];
        !           184:     foreach my $ctime (map { (stat($_))[10] } ($keywordfile)) {
        !           185:         if ($ctime > $highest_dependency_ctime) {
        !           186:             $highest_dependency_ctime = $ctime;
        !           187:         }
        !           188:     }
        !           189:     #
        !           190:     # if the outputdbfile was made AFTER the last version of one of the
        !           191:     # dependencies, exit quietly.
        !           192:     @Results = stat($outputdbfile);
        !           193:     if ($highest_dependency_ctime < $Results[10]) { 
        !           194:         exit;
        !           195:     }
1.1       matthew   196: }
                    197: 
                    198: ##
                    199: ## Global hashes, filled while reading the input files.
                    200: ##
                    201: my %wordcount = ();    # Holds the number of times each word appears in the
                    202:                        # keyword input file (bad words are not counted).
                    203: my %related_words=();  # Holds the words related to a word.  The keys of this
                    204:                        # hash are words, and the values are references to
                    205:                        # hashes mapping each related word to its frequency.
                    206: my %isbad;             # Holds a true entry for each lowercased 'bad' keyword
                    207: 
                    208: ##
                    209: ## Initialize hash of bad words.  'bad' meaning their appearance in a keyword
                    210: ## list does not add information.  Not 'bad' meaning profane.
                    211: ## open() is parenthesized so that || tests its result, not the filename.
                    212: open(BAD,$badwordfile) || die "Unable to open ".$badwordfile;
                    213: while (<BAD>) {
                    214:     chomp;
                    215:     $isbad{lc($_)}++;
                    216: }
                    217: close BAD;
                    218: 
                    219: ## Read in the data file and construct the related words hash.  Skip bad
                    220: ## words, and drop the empty leading field split() returns when a line
                    221: ## starts with a non-word character (it must not become a keyword).
                    222: open(IN,$keywordfile) || die "Unable to open ".$keywordfile;
                    223: while (<IN>) {
                    224:     chomp;
                    225:     my @Words = grep { $_ ne '' } split(/\W+/,lc($_));
                    226:     foreach my $keyword (@Words) {
                    227:         next if ($isbad{$keyword});
                    228:         $wordcount{$keyword}++;
                    229:         foreach my $otherword (@Words) {
                    230:             next if (($otherword eq $keyword) || ($isbad{$otherword}));
                    231:             $related_words{$keyword}->{$otherword}++;
                    232:         }
                    233:     }
                    234: }
                    235: close(IN);
                    236: 
                    237: ## Determine average number of entries.  An input with no usable keywords is
                    238: ## reported explicitly instead of failing below with a division by zero.
                    239: die "No usable keywords found in ".$keywordfile if (! %wordcount);
                    240: my $totalcount = 0;
                    241: foreach (keys(%wordcount)) {
                    242:     $totalcount+=$wordcount{$_};
                    243: }
                    244: my $avecount = $totalcount /(scalar keys(%wordcount));
                    245: 
                    246: ##
                    247: ## Remove any previous database file, then create and open a fresh one.
                    248: ##
                    249: if (-e $outputdbfile) {
                    250:     unlink($outputdbfile) || die "Cannot remove ".$outputdbfile;
                    251: }
                    252: my %thesaurus_db;
                    253: unless (tie(%thesaurus_db,'GDBM_File',$outputdbfile,&GDBM_WRCREAT,0640)) {
                    254:     die "Error opening DB file.\n";
                    255: }
                    256: 
                    257: ## Write the database file: one record per keyword holding its own count,
                    258: ## a colon, and the related word list built by get_related().  Keywords
                    259: ## for which no list is returned are not stored.
                    260: foreach my $entry (keys(%related_words)) {
                    261:     next unless (defined($entry));
                    262:     my $related = get_related($entry);
                    263:     $thesaurus_db{$entry} = join(":",$wordcount{$entry},$related) if ($related);
                    264: }
                    265: 
                    266: ## Store away special values.  Their keys must contain characters not matched
                    267: ## by \w (here a '.') so they cannot collide with a keyword: keywords are
                    268: ## produced by splitting the input on \W+.
                    269: $thesaurus_db{'average.count'}=$avecount;
                    270: $thesaurus_db{'total.count'}=$totalcount;
                    271: untie %thesaurus_db;
                    272: 
                    273: ##
                    274: ## With --test, reopen the database read-only and print a few sample entries.
                    275: ##
                    276: if ($test) {
                    277:     unless (tie(%thesaurus_db,'GDBM_File',$outputdbfile,&GDBM_READER,0640)) {
                    278:         die "Error opening DB file.\n";
                    279:     }
                    280:     foreach my $word (qw(torque rna polymerase)) {
                    281:         my $result = $thesaurus_db{$word} or next;
                    282:         print "Results for $word = $result\n";
                    283:     }
                    284:     untie(%thesaurus_db);
                    285: }
                    286: 
                    287: 
                    288: ################################################################
                    289: ################################################################
                    290: # get_related($keyword) returns the related words of $keyword as a string:
                    291: #        keyword1,frequency1:keyword2,frequency2:.....
                    292: #     'frequency1' is the number of times keyword1 appears in a keywords
                    293: #     list with $keyword.  Pairs are in descending order of frequency;
                    294: #     equal frequencies are ordered alphabetically so that the output is
                    295: #     reproducible.  Reads the file-level %related_words hash.
                    296: #     Returns undef if $keyword is undef, empty, or has no related words.
                    297: #
                    298: sub get_related {
                    299:     my $keyword = shift;
                    300:     # Test for undef/empty explicitly so that the keyword "0" is accepted.
                    301:     return undef if ((! defined($keyword)) || ($keyword eq ''));
                    302:     return undef if (! exists($related_words{$keyword}));
                    303:     my $related_hash = $related_words{$keyword};
                    304:     # Highest frequency first, then alphabetical among equal frequencies.
                    305:     my @Related_words = sort {
                    306:         $related_hash->{$b} <=> $related_hash->{$a} || $a cmp $b
                    307:     } keys(%{$related_hash});
                    308:     return undef if (! @Related_words);
                    309:     my @Pairs = map { "$_,$related_hash->{$_}" } @Related_words;
                    310:     return join(':',@Pairs);
                    311: }
                    312: 
                    313: 
                    314: 
                    315: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>