Annotation of loncom/thesaurus/build_thesaurus_db.pl, revision 1.1
1.1 ! matthew 1: #!/usr/bin/perl -w
! 2: #
! 3: # $Id$
! 4: #
! 5: #
! 6: # build_thesaurus_db.pl creates the LON-CAPA thesaurus database.
! 7: #
! 8: # Copyright Michigan State University Board of Trustees
! 9: #
! 10: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
! 11: #
! 12: # LON-CAPA is free software; you can redistribute it and/or modify
! 13: # it under the terms of the GNU General Public License as published by
! 14: # the Free Software Foundation; either version 2 of the License, or
! 15: # (at your option) any later version.
! 16: #
! 17: # LON-CAPA is distributed in the hope that it will be useful,
! 18: # but WITHOUT ANY WARRANTY; without even the implied warranty of
! 19: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! 20: # GNU General Public License for more details.
! 21: #
! 22: # You should have received a copy of the GNU General Public License
! 23: # along with LON-CAPA; if not, write to the Free Software
! 24: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
! 25: #
! 26: # /home/httpd/html/adm/gpl.txt
! 27: #
! 28: # http://www.lon-capa.org/
! 29: #
! 30: use strict;
! 31: use Getopt::Long;
! 32: use GDBM_File;
! 33: # POD required stuff:
! 34:
! 35: =pod
! 36:
! 37: =head1 NAME
! 38:
! 39: build_thesaurus_db.pl - Build the LON-CAPA thesaurus database.
! 40:
! 41: =head1 SYNOPSIS
! 42:
! 43: build_thesaurus_db.pl creates the LON-CAPA thesaurus database.
! 44:
! 45: =head1 DESCRIPTION
! 46:
! 47: build_thesaurus_db.pl reads two input files. The first is a list of words to
! 48: omit from the thesaurus. The second is the raw keyword data for the thesaurus.
! 49: From this file a database is built.
! 50:
! 51: =head1 DATABASE FORMAT DESCRIPTION
! 52:
! 53: The structure of the database entries is described below.
! 54:
! 55: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
! 56:
! 57: Allow me to repeat myself:
! 58:
! 59: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
! 60:
! 61: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
! 62:
! 63: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
! 64:
! 65: Got it? While you are reading this, let me encourage you to document
! 66: any changes to the structure of the database. It is not that hard and
! 67: you will save much time if you do.
! 68:
! 69: That said, you should make sure the description below actually matches
! 70: the code, just to be safe.
! 71:
! 72: This concludes the lecture portion of the comments.
! 73:
! 74: =head1 DATABASE FORMAT DESCRIPTION
! 75:
! 76: An entry in the database for a given word is shown below:
! 77:
! 78: polymerase = 42:dna,32:rna,30:transcription,19:protein,16:...
! 79: | | |
! 80: | | The number of times dna appeared in a keywords list
! 81: | | with the word polymerase.
! 82: | The related keyword
! 83: The number of times polymerase appeared in a keywords list.
! 84:
! 85: Note: the related words list will be in descending order of occurance with
! 86: the keyword.
! 87:
! 88: =head1 COMMAND LINE OPTIONS
! 89:
! 90: =over 4
! 91:
! 92:
! 93: =item --badwordfile <filename>
! 94:
! 95: filename must contain a list of words not to put in the thesaurus.
! 96: Each word must appear on its own line.
! 97: Currently comments are not supported.
! 98:
! 99: =item --keywordfile <filename>
! 100:
! 101: File containing the raw word data for the thesaurus. Each line must be
! 102: comma seperated list of related keywords.
! 103:
! 104: =item --outputdb <filename>
! 105:
! 106: file to write the LON-CAPA thesaurus database to.
! 107:
! 108: =item --help
! 109:
! 110: Display this help message and exit.
! 111:
! 112: =item --test
! 113:
! 114: Run a few test lookups after writing the database.
! 115:
! 116: =back
! 117:
! 118: The following example shows the default values for each parameter
! 119:
! 120: build_thesaurus_db.pl --badwordfile ./un_keyword.tab --outputdb ./thesaurus.db --keywordfile rawkey.txt
! 121:
! 122: =cut
! 123:
! 124: ##
! 125: ## Get command line parameters
! 126: ##
! 127: my ($badwordfile,$outputdbfile,$keywordfile,$help,$test);
! 128: GetOptions( "badwordfile=s" => \$badwordfile, # --badwordfile
! 129: "outputdb=s" => \$outputdbfile, # --outputdb
! 130: "keywordfile=s" => \$keywordfile, # --keywordfile
! 131: "help" => \$help, # --help
! 132: "test" => \$test); # --test
! 133:
! 134: ##
! 135: ## Help! Help!
! 136: ##
! 137: if ($help) {
! 138: print <<ENDHELP;
! 139: build_thesaurus_db.pl Build a LON-CAPA thesaurus database.
! 140:
! 141: Command line arguements
! 142: --badwordfile <filename> filename must contain a list of words not to
! 143: put in the thesaurus. Each word must appear
! 144: on its own line and currently comments are not
! 145: supported.
! 146: --keywordfile <filename> File containing the raw word data for the
! 147: thesaurus. Each line must be comma seperated
! 148: list of related keywords.
! 149: --outputdb <filename> file to write the LON-CAPA thesaurus database
! 150: to.
! 151: --help Display this help message and exit.
! 152: --test Run a few test lookups after writing the
! 153: database.
! 154: The following example shows the default values for each parameter
! 155:
! 156: build_thesaurus_db.pl --badwordfile ./un_keyword.tab \
! 157: --outputdb ./thesaurus.db --keywordfile rawkey.txt
! 158:
! 159: ENDHELP
! 160: exit;
! 161: }
! 162:
! 163: ##
! 164: ## Set up defaults for parameters and check validity
! 165: ##
! 166: $badwordfile = $badwordfile || "./un_keyword.tab";
! 167: $outputdbfile = $outputdbfile || "./thesaurus.db";
! 168: $keywordfile = $keywordfile || "./rawkey.txt";
! 169:
! 170: foreach my $file ($badwordfile,$keywordfile) {
! 171: die "$file does not exist." if (! -e $file);
! 172: }
! 173:
! 174: ##
! 175: ## Global hashes.
! 176: ##
! 177: my %wordcount = (); # Holds the number of times each word appears in the
! 178: # input file.
! 179: my %related_words=(); # Holds the words related to a word. The keys of this
! 180: # has are words, and the values are pointers to hashes
! 181: # which hold the words and their frequencies.
! 182: my %isbad; # Holds an entry for each keyword that is 'bad'
! 183:
! 184: ##
! 185: ## Initialize hash of bad words. 'bad' meaning their appearance in a keyword
! 186: ## list does not add information. Not 'bad' meaning profane.
! 187: ##
! 188: open BAD,$badwordfile || die "Unable to open ".$badwordfile;
! 189: while (<BAD>) {
! 190: chomp;
! 191: $isbad{lc($_)}++;
! 192: }
! 193: close BAD;
! 194:
! 195: ##
! 196: ## Read in the data file and construction related words hash. Skip bad words.
! 197: ##
! 198: open(IN,$keywordfile) || die "Unable to open ".$keywordfile;
! 199: while (<IN>) {
! 200: chomp;
! 201: my @Words = split(/\W+/,lc($_));
! 202: foreach my $keyword (@Words) {
! 203: next if ($isbad{$keyword});
! 204: $wordcount{$keyword}++;
! 205: foreach my $otherword (@Words) {
! 206: next if (($otherword eq $keyword) || ($isbad{$otherword}));
! 207: $related_words{$keyword}->{$otherword}++;
! 208: }
! 209: }
! 210: }
! 211: close(IN);
! 212:
! 213: ##
! 214: ## Determine average number of entries
! 215: ##
! 216: my $totalcount;
! 217: foreach (keys(%wordcount)) {
! 218: $totalcount+=$wordcount{$_};
! 219: }
! 220: my $avecount = $totalcount /(scalar keys(%wordcount));
! 221:
! 222: ##
! 223: ## Make sure we can write the database.
! 224: ##
! 225: if (-e $outputdbfile) {
! 226: die "Cannot remove ".$outputdbfile if (!unlink $outputdbfile);
! 227: }
! 228: my %thesaurus_db;
! 229: if (! tie(%thesaurus_db,'GDBM_File',$outputdbfile,&GDBM_WRCREAT,0640)) {
! 230: die "Error opening DB file.\n";
! 231: }
! 232:
! 233: ##
! 234: ## Write the database file
! 235: ##
! 236: foreach my $word (keys(%related_words)) {
! 237: next if (! defined($word));
! 238: my $result = &get_related($word);
! 239: $thesaurus_db{$word}=$wordcount{$word}.":".$result if ($result);
! 240: }
! 241:
! 242: ##
! 243: ## Store away special values (must contain characters not matched by \w)
! 244: ##
! 245: $thesaurus_db{'average.count'}=$avecount;
! 246: $thesaurus_db{'total.count'}=$totalcount;
! 247: untie %thesaurus_db;
! 248:
! 249: ##
! 250: ## Perform test lookups
! 251: ##
! 252: if ($test) {
! 253: if (! tie(%thesaurus_db,'GDBM_File',$outputdbfile,&GDBM_READER,0640)) {
! 254: die "Error opening DB file.\n";
! 255: }
! 256: foreach my $word ('torque','rna','polymerase') {
! 257: my $result = $thesaurus_db{$word};
! 258: print "Results for $word = $result\n" if ($result);
! 259: }
! 260: untie %thesaurus_db;
! 261: }
! 262:
! 263:
! 264: ################################################################
! 265: ################################################################
! 266: #
! 267: # get_related($keyword) is a utility function which will return a string
! 268: # of the format:
! 269: # keyword1,frequency1:keyword2,frequency2:.....
! 270: #
! 271: # 'frequency1' is the number of times the keyword1 appears in a keywords
! 272: # list with $keyword.
! 273: #
! 274: sub get_related {
! 275: my $keyword = shift;
! 276: return undef if ((! $keyword) ||(! exists($related_words{$keyword})));
! 277: my %related_hash = %{$related_words{$keyword}};
! 278: my @Related_words = keys(%{$related_words{$keyword}});
! 279: @Related_words = sort {$related_hash{$b} <=> $related_hash{$a} }
! 280: @Related_words;
! 281: my $result;
! 282: foreach (@Related_words) {
! 283: $result .= "$_,$related_hash{$_}:";
! 284: }
! 285: chop $result;
! 286: return $result;
! 287: }
! 288:
! 289:
! 290:
! 291:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>