Annotation of loncom/thesaurus/build_thesaurus_db.pl, revision 1.2
1.1 matthew 1: #!/usr/bin/perl -w
2: #
1.2 ! matthew 3: # $Id: build_thesaurus_db.pl,v 1.1 2002/07/11 20:48:31 matthew Exp $
1.1 matthew 4: #
5: #
6: # build_thesaurus_db.pl creates the LON-CAPA thesaurus database.
7: #
8: # Copyright Michigan State University Board of Trustees
9: #
10: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
11: #
12: # LON-CAPA is free software; you can redistribute it and/or modify
13: # it under the terms of the GNU General Public License as published by
14: # the Free Software Foundation; either version 2 of the License, or
15: # (at your option) any later version.
16: #
17: # LON-CAPA is distributed in the hope that it will be useful,
18: # but WITHOUT ANY WARRANTY; without even the implied warranty of
19: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20: # GNU General Public License for more details.
21: #
22: # You should have received a copy of the GNU General Public License
23: # along with LON-CAPA; if not, write to the Free Software
24: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25: #
26: # /home/httpd/html/adm/gpl.txt
27: #
28: # http://www.lon-capa.org/
29: #
30: use strict;
31: use Getopt::Long;
32: use GDBM_File;
33: # POD required stuff:
34:
35: =pod
36:
37: =head1 NAME
38:
39: build_thesaurus_db.pl - Build the LON-CAPA thesaurus database.
40:
41: =head1 SYNOPSIS
42:
43: build_thesaurus_db.pl creates the LON-CAPA thesaurus database.
44:
45: =head1 DESCRIPTION
46:
47: build_thesaurus_db.pl reads two input files. The first is a list of words to
48: omit from the thesaurus. The second is the raw keyword data for the thesaurus.
49: From this file a database is built.
50:
51: =head1 DATABASE FORMAT DESCRIPTION
52:
53: The structure of the database entries is described below.
54:
55: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
56:
57: Allow me to repeat myself:
58:
59: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
60:
61: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
62:
63: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
64:
65: Got it? While you are reading this, let me encourage you to document
66: any changes to the structure of the database. It is not that hard and
67: you will save much time if you do.
68:
69: That said, you should make sure the description below actually matches
70: the code, just to be safe.
71:
72: This concludes the lecture portion of the comments.
73:
74: =head1 DATABASE FORMAT DESCRIPTION
75:
76: An entry in the database for a given word is shown below:
77:
78:  polymerase = 42:dna,32:rna,30:transcription,19:protein,16:...
79:               |  |   |
80:               |  |   The number of times dna appeared in a keywords list
81:               |  |   with the word polymerase.
82:               |  The related keyword
83:               The number of times polymerase appeared in a keywords list.
84:
85: Note: the related words list will be in descending order of occurrence with
86: the keyword.
87:
88: =head1 COMMAND LINE OPTIONS
89:
90: =over 4
91:
92:
93: =item --badwordfile <filename>
94:
95: filename must contain a list of words not to put in the thesaurus.
96: Each word must appear on its own line.
97: Currently comments are not supported.
98:
99: =item --keywordfile <filename>
100:
101: File containing the raw word data for the thesaurus. Each line must be a
102: comma-separated list of related keywords.
103:
104: =item --outputdb <filename>
105:
106: file to write the LON-CAPA thesaurus database to.
107:
108: =item --help
109:
110: Display this help message and exit.
111:
112: =item --test
113:
114: Run a few test lookups after writing the database.
115:
116: =back
117:
118: The following example shows the default values for each parameter
119:
120: build_thesaurus_db.pl --badwordfile ./un_keyword.tab --outputdb ./thesaurus.db --keywordfile rawkey.txt
121:
122: =cut
123:
##
## Get command line parameters
##
## All options are optional.  The three file name options receive their
## defaults further down; --help, --checkdates and --test are plain flags.
##
## GetOptions returns false when it meets an unknown or malformed option
## (it has already warned on STDERR by then), so stop instead of running
## with a command line that was only partly understood.
##
my ($badwordfile,$outputdbfile,$keywordfile,$help,$checkdates,$test);
GetOptions( "badwordfile=s" => \$badwordfile,   # --badwordfile <filename>
            "outputdb=s"    => \$outputdbfile,  # --outputdb <filename>
            "keywordfile=s" => \$keywordfile,   # --keywordfile <filename>
            "help"          => \$help,          # --help
            "checkdates"    => \$checkdates,    # --checkdates
            "test"          => \$test)          # --test
    or die "Invalid command line; try --help for usage.\n";

##
## Help! Help!
##
## The here-document is single quoted on purpose: it contains nothing to
## interpolate, and in an interpolating here-document the trailing backslash
## of the example command would be swallowed instead of printed.
##
if ($help) {
    print <<'ENDHELP';
build_thesaurus_db.pl     Build a LON-CAPA thesaurus database.

Command line arguments
   --badwordfile <filename>   filename must contain a list of words not to
                              put in the thesaurus.  Each word must appear
                              on its own line and currently comments are not
                              supported.
   --checkdates               Check the change times (ctime) of the files
                              involved and only run if the outputdb file is
                              not newer than the badword and keyword files.
   --keywordfile <filename>   File containing the raw word data for the
                              thesaurus.  Each line must be a comma-separated
                              list of related keywords.
   --outputdb <filename>      file to write the LON-CAPA thesaurus database
                              to.
   --help                     Display this help message and exit.
   --test                     Run a few test lookups after writing the
                              database.
The following example shows the default values for each parameter

build_thesaurus_db.pl --badwordfile ./un_keyword.tab \
    --outputdb ./thesaurus.db --keywordfile rawkey.txt

ENDHELP
    exit;
}
167:
##
## Set up defaults for parameters and check validity
##
$badwordfile  = $badwordfile  || "./un_keyword.tab";
$outputdbfile = $outputdbfile || "./thesaurus.db";
$keywordfile  = $keywordfile  || "./rawkey.txt";

# Both input files are mandatory; the output file is (re)created later.
foreach my $file ($badwordfile,$keywordfile) {
    die "$file does not exist." if (! -e $file);
}

#
# Check the dates on the input files to be sure we need to run.
#
# Only done when --checkdates was given and a non-empty output database
# already exists; otherwise the database is always rebuilt.
#
if ($checkdates && -s $outputdbfile) {
    #
    # Find the most recent ctime (stat field 10) among ALL dependencies.
    # Each dependency has to be stat()ed itself; comparing the result of a
    # single stat() against itself would ignore every file but the first.
    my $highest_dependency_ctime = 0;
    foreach my $dependency ($badwordfile,$keywordfile) {
        my @Results = stat($dependency)
            or die "Unable to stat ".$dependency.": $!";
        if ($Results[10] > $highest_dependency_ctime) {
            $highest_dependency_ctime = $Results[10];
        }
    }
    #
    # if the outputdbfile was made AFTER the last version of every one of
    # the dependencies, exit quietly.  If it cannot be stat()ed we fall
    # through and rebuild it.
    my @Results = stat($outputdbfile);
    if (@Results && $highest_dependency_ctime < $Results[10]) {
        exit;
    }
}
197:
##
## Global hashes.
##
my %wordcount = ();     # Holds the number of times each word appears in the
                        # input file.
my %related_words = (); # Holds the words related to a word.  The keys of this
                        # hash are words, and the values are references to
                        # hashes which hold the related words and their
                        # frequencies.
my %isbad;              # Holds an entry for each keyword that is 'bad'

##
## Initialize hash of bad words.  'bad' meaning their appearance in a keyword
## list does not add information.  Not 'bad' meaning profane.
##
## The file holds one word per line; words are stored lower-cased so the
## lookup agrees with the lower-cased keyword data.
##
## 'or' (not '||') is required here: '||' would bind to the file name, which
## is always true, and a failed open would go unnoticed.
##
open(my $bad_fh,'<',$badwordfile)
    or die "Unable to open ".$badwordfile.": $!";
while (my $line = <$bad_fh>) {
    chomp($line);
    $isbad{lc($line)}++;
}
close($bad_fh);
218:
##
## Read in the data file and construct the related words hash.  Skip bad words.
##
## Every line is one keyword list.  For each acceptable word on a line its
## occurrence count is incremented, and every other acceptable word on the
## same line is counted as related to it.
##
open(my $keyword_fh,'<',$keywordfile)
    or die "Unable to open ".$keywordfile.": $!";
while (my $line = <$keyword_fh>) {
    chomp($line);
    # split yields a leading empty field when the line starts with a
    # non-word character; drop it so '' never becomes a keyword.
    my @Words = grep { length($_) } split(/\W+/,lc($line));
    foreach my $keyword (@Words) {
        next if ($isbad{$keyword});
        $wordcount{$keyword}++;
        foreach my $otherword (@Words) {
            next if (($otherword eq $keyword) || ($isbad{$otherword}));
            $related_words{$keyword}->{$otherword}++;
        }
    }
}
close($keyword_fh);
236:
##
## Determine average number of entries
##
## $totalcount is the sum of all keyword occurrence counts and $avecount is
## that sum divided by the number of distinct keywords.  Both are 0 when the
## keyword file produced no usable words, rather than undef and a division
## by zero.
##
my $totalcount = 0;
foreach my $count (values(%wordcount)) {
    $totalcount += $count;
}
my $distinct_words = scalar(keys(%wordcount));
my $avecount = $distinct_words ? $totalcount / $distinct_words : 0;
245:
##
## Make sure we can write the database.
##
## Any existing database file is removed first so that the new one is built
## from scratch.
##
if (-e $outputdbfile && ! unlink($outputdbfile)) {
    die "Cannot remove ".$outputdbfile;
}
my %thesaurus_db;
tie(%thesaurus_db,'GDBM_File',$outputdbfile,GDBM_WRCREAT(),0640)
    or die "Error opening DB file.\n";

##
## Write the database file
##
## One record per word that has related words:
##     <count of word>:<related word list from get_related>
##
for my $entry (keys(%related_words)) {
    my $related = get_related($entry);
    next unless ($related);
    $thesaurus_db{$entry} = join(':',$wordcount{$entry},$related);
}

##
## Store away special values (must contain characters not matched by \w)
##
$thesaurus_db{'average.count'} = $avecount;
$thesaurus_db{'total.count'}   = $totalcount;
untie(%thesaurus_db);
272:
##
## Perform test lookups
##
## Only with --test: reopen the freshly written database read-only and print
## the stored record for a few sample words, silently skipping words that
## have no record.
##
if ($test) {
    tie(%thesaurus_db,'GDBM_File',$outputdbfile,GDBM_READER(),0640)
        or die "Error opening DB file.\n";
    for my $probe (qw(torque rna polymerase)) {
        my $entry = $thesaurus_db{$probe};
        next unless ($entry);
        print "Results for $probe = $entry\n";
    }
    untie(%thesaurus_db);
}
286:
287:
################################################################
################################################################
#
# get_related($keyword) is a utility function which will return a string
# of the format:
#     keyword1,frequency1:keyword2,frequency2:.....
# (no trailing ':').
#
# 'frequency1' is the number of times the keyword1 appears in a keywords
# list with $keyword.
#
# The list is in descending order of frequency; words with the same
# frequency are ordered alphabetically so that the output does not depend
# on hash ordering.
#
# Input:   $keyword - the word to look up in %related_words.
# Returns: the string described above, or nothing (undef in scalar context)
#          if $keyword is undefined or empty, or has no related words.
#
sub get_related {
    my $keyword = shift;
    # Test definedness and length rather than truth so that the keyword
    # "0" is not mistaken for a missing argument.
    return if (! defined($keyword) || $keyword eq '');
    # Work on the stored hash through its reference; no copy is needed.
    my $related_hash = $related_words{$keyword};
    return if (! $related_hash || ! %{$related_hash});
    my @Related_words =
        sort { $related_hash->{$b} <=> $related_hash->{$a} || $a cmp $b }
        keys(%{$related_hash});
    return join(':', map { "$_,$related_hash->{$_}" } @Related_words);
}
312:
313:
314:
315:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>