File:
[LON-CAPA] /
loncom /
thesaurus /
build_thesaurus_db.pl
Revision
1.2:
download - view:
text,
annotated -
select for diffs
Tue Aug 5 15:51:37 2003 UTC (21 years, 6 months ago) by
matthew
Branches:
MAIN
CVS tags:
version_2_9_X,
version_2_9_99_0,
version_2_9_1,
version_2_9_0,
version_2_8_X,
version_2_8_99_1,
version_2_8_99_0,
version_2_8_2,
version_2_8_1,
version_2_8_0,
version_2_7_X,
version_2_7_99_1,
version_2_7_99_0,
version_2_7_1,
version_2_7_0,
version_2_6_X,
version_2_6_99_1,
version_2_6_99_0,
version_2_6_3,
version_2_6_2,
version_2_6_1,
version_2_6_0,
version_2_5_X,
version_2_5_99_1,
version_2_5_99_0,
version_2_5_2,
version_2_5_1,
version_2_5_0,
version_2_4_X,
version_2_4_99_0,
version_2_4_2,
version_2_4_1,
version_2_4_0,
version_2_3_X,
version_2_3_99_0,
version_2_3_2,
version_2_3_1,
version_2_3_0,
version_2_2_X,
version_2_2_99_1,
version_2_2_99_0,
version_2_2_2,
version_2_2_1,
version_2_2_0,
version_2_1_X,
version_2_1_99_3,
version_2_1_99_2,
version_2_1_99_1,
version_2_1_99_0,
version_2_1_3,
version_2_1_2,
version_2_1_1,
version_2_1_0,
version_2_12_X,
version_2_11_X,
version_2_11_6_msu,
version_2_11_6,
version_2_11_5_msu,
version_2_11_5,
version_2_11_4_uiuc,
version_2_11_4_msu,
version_2_11_4,
version_2_11_3_uiuc,
version_2_11_3_msu,
version_2_11_3,
version_2_11_2_uiuc,
version_2_11_2_msu,
version_2_11_2_educog,
version_2_11_2,
version_2_11_1,
version_2_11_0_RC3,
version_2_11_0_RC2,
version_2_11_0_RC1,
version_2_11_0,
version_2_10_X,
version_2_10_1,
version_2_10_0_RC2,
version_2_10_0_RC1,
version_2_10_0,
version_2_0_X,
version_2_0_99_1,
version_2_0_2,
version_2_0_1,
version_2_0_0,
version_1_99_3,
version_1_99_2,
version_1_99_1_tmcc,
version_1_99_1,
version_1_99_0_tmcc,
version_1_99_0,
version_1_3_X,
version_1_3_3,
version_1_3_2,
version_1_3_1,
version_1_3_0,
version_1_2_X,
version_1_2_99_1,
version_1_2_99_0,
version_1_2_1,
version_1_2_0,
version_1_1_X,
version_1_1_99_5,
version_1_1_99_4,
version_1_1_99_3,
version_1_1_99_2,
version_1_1_99_1,
version_1_1_99_0,
version_1_1_3,
version_1_1_2,
version_1_1_1,
version_1_1_0,
version_1_0_99_3,
version_1_0_99_2,
version_1_0_99_1,
version_1_0_99,
loncapaMITrelate_1,
language_hyphenation_merge,
language_hyphenation,
bz6209-base,
bz6209,
bz5969,
bz2851,
PRINT_INCOMPLETE_base,
PRINT_INCOMPLETE,
HEAD,
GCI_3,
GCI_2,
GCI_1,
BZ5971-printing-apage,
BZ5434-fox,
BZ4492-merge,
BZ4492-feature_horizontal_radioresponse
Bug 1492: build_thesaurus_db.pl has new command line switch --checkdates
to only build the thesaurus if the dependencies are newer than the target.
1: #!/usr/bin/perl -w
2: #
3: # $Id: build_thesaurus_db.pl,v 1.2 2003/08/05 15:51:37 matthew Exp $
4: #
5: #
6: # build_thesaurus_db.pl creates the LON-CAPA thesaurus database.
7: #
8: # Copyright Michigan State University Board of Trustees
9: #
10: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
11: #
12: # LON-CAPA is free software; you can redistribute it and/or modify
13: # it under the terms of the GNU General Public License as published by
14: # the Free Software Foundation; either version 2 of the License, or
15: # (at your option) any later version.
16: #
17: # LON-CAPA is distributed in the hope that it will be useful,
18: # but WITHOUT ANY WARRANTY; without even the implied warranty of
19: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20: # GNU General Public License for more details.
21: #
22: # You should have received a copy of the GNU General Public License
23: # along with LON-CAPA; if not, write to the Free Software
24: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25: #
26: # /home/httpd/html/adm/gpl.txt
27: #
28: # http://www.lon-capa.org/
29: #
30: use strict;
31: use Getopt::Long;
32: use GDBM_File;
33: # POD required stuff:
34:
35: =pod
36:
37: =head1 NAME
38:
39: build_thesaurus_db.pl - Build the LON-CAPA thesaurus database.
40:
41: =head1 SYNOPSIS
42:
43: build_thesaurus_db.pl creates the LON-CAPA thesaurus database.
44:
45: =head1 DESCRIPTION
46:
47: build_thesaurus_db.pl reads two input files. The first is a list of words to
48: omit from the thesaurus. The second is the raw keyword data for the thesaurus.
49: From this file a database is built.
50:
51: =head1 DATABASE FORMAT DESCRIPTION
52:
53: The structure of the database entries is described below.
54:
55: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
56:
57: Allow me to repeat myself:
58:
59: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
60:
61: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
62:
63: =head2 DO NOT CHANGE THE STRUCTURE OF THE DATABASE WITHOUT CHANGING loncommon.pm!
64:
65: Got it? While you are reading this, let me encourage you to document
66: any changes to the structure of the database. It is not that hard and
67: you will save much time if you do.
68:
69: That said, you should make sure the description below actually matches
70: the code, just to be safe.
71:
72: This concludes the lecture portion of the comments.
73:
74: =head1 DATABASE FORMAT DESCRIPTION
75:
76: An entry in the database for a given word is shown below:
77:
78: polymerase = 42:dna,32:rna,30:transcription,19:protein,16:...
79: | | |
80: | | The number of times dna appeared in a keywords list
81: | | with the word polymerase.
82: | The related keyword
83: The number of times polymerase appeared in a keywords list.
84:
85: Note: the related words list will be in descending order of occurrence with
86: the keyword.
87:
88: =head1 COMMAND LINE OPTIONS
89:
90: =over 4
91:
92:
93: =item --badwordfile <filename>
94:
95: filename must contain a list of words not to put in the thesaurus.
96: Each word must appear on its own line.
97: Currently comments are not supported.
98:
99: =item --keywordfile <filename>
100:
101: File containing the raw word data for the thesaurus. Each line must be a
102: comma-separated list of related keywords.
103:
104: =item --outputdb <filename>
105:
106: file to write the LON-CAPA thesaurus database to.
107:
108: =item --help
109:
110: Display this help message and exit.
111:
112: =item --test
113:
114: Run a few test lookups after writing the database.
115:
116: =back
117:
118: The following example shows the default values for each parameter
119:
120: build_thesaurus_db.pl --badwordfile ./un_keyword.tab --outputdb ./thesaurus.db --keywordfile rawkey.txt
121:
122: =cut
123:
##
## Get command line parameters
##
## Each variable stays undefined when its switch is absent; the defaults
## are filled in further down.  GetOptions returns false when it meets an
## unknown or malformed switch, so stop instead of quietly running with
## the default file names (which could overwrite the wrong database).
##
my ($badwordfile,$outputdbfile,$keywordfile,$help,$checkdates,$test);
GetOptions( "badwordfile=s" => \$badwordfile,    # --badwordfile
            "outputdb=s"    => \$outputdbfile,   # --outputdb
            "keywordfile=s" => \$keywordfile,    # --keywordfile
            "help"          => \$help,           # --help
            "checkdates"    => \$checkdates,     # --checkdates
            "test"          => \$test)           # --test
    || die "Invalid command line arguments; run with --help for usage.\n";
134:
135:
##
## --help: show the usage summary and leave before any file is touched.
##
if ($help) {
    # The here-document is interpolating on purpose: the trailing
    # backslash in the example command line is consumed by the quoting.
    my $usage = <<ENDHELP;
build_thesaurus_db.pl Build a LON-CAPA thesaurus database.

Command line arguements
--badwordfile <filename> filename must contain a list of words not to
put in the thesaurus. Each word must appear
on its own line and currently comments are not
supported.
--checkdates Check the creation dates on the files involved
and only run if the outputdb file was created
prior to one of the badword or keyword files.
--keywordfile <filename> File containing the raw word data for the
thesaurus. Each line must be comma seperated
list of related keywords.
--outputdb <filename> file to write the LON-CAPA thesaurus database
to.
--help Display this help message and exit.
--test Run a few test lookups after writing the
database.
The following example shows the default values for each parameter

build_thesaurus_db.pl --badwordfile ./un_keyword.tab \
--outputdb ./thesaurus.db --keywordfile rawkey.txt

ENDHELP
    print $usage;
    exit;
}
167:
##
## Fall back to the default file names for any switch that was not given,
## then make sure both input files are present.
##
$badwordfile  ||= "./un_keyword.tab";
$outputdbfile ||= "./thesaurus.db";
$keywordfile  ||= "./rawkey.txt";

foreach my $required ($badwordfile,$keywordfile) {
    next if (-e $required);
    die "$required does not exist.";
}
178:
#
# --checkdates: skip the rebuild when a non-empty output database already
# exists and is newer than every input file.
#
# Every dependency is stat()ed individually; the newest change time among
# them is compared against the change time of the output database.
# NOTE(review): element 10 of stat() is the inode change time (ctime), as
# in the original code; a make-style check would normally use mtime
# (element 9) -- confirm which one is wanted.
if ($checkdates && -s $outputdbfile) {
    my $highest_dependency_ctime = 0;
    my $must_rebuild = 0;
    foreach my $dependency ($badwordfile,$keywordfile) {
        my @Results = stat($dependency);
        if (! @Results) {
            # Cannot tell how old this input is: rebuild to be safe.
            $must_rebuild = 1;
            last;
        }
        if ($Results[10] > $highest_dependency_ctime) {
            $highest_dependency_ctime = $Results[10];
        }
    }
    #
    # if the outputdbfile was made AFTER the last version of every one of
    # the dependencies, exit quietly.
    my @Results = stat($outputdbfile);
    if (! $must_rebuild && @Results &&
        $highest_dependency_ctime < $Results[10]) {
        exit;
    }
}
197:
##
## File-level hashes shared by the sections below and by get_related().
##

# Number of times each keyword appears in the keyword file.
my %wordcount;

# Keyword => reference to a hash of (related word => number of keyword
# lists in which the two words appeared together).
my %related_words;

# One entry per 'bad' keyword, i.e. a word that is left out of the
# thesaurus.
my %isbad;
207:
##
## Initialize hash of bad words.  'bad' meaning their appearance in a keyword
## list does not add information.  Not 'bad' meaning profane.
##
## The file holds one word per line; words are stored lower-cased.
##
# Parenthesised three-argument open: with the former
# "open BAD,$badwordfile || die" the || bound to the file name, so a
# failed open was never reported.
open(my $bad_fh,'<',$badwordfile)
    || die "Unable to open ".$badwordfile.": $!";
while (my $badword = <$bad_fh>) {
    chomp($badword);
    $isbad{lc($badword)}++;
}
close($bad_fh);
218:
##
## Read in the data file and construct the related words hash.  Skip bad
## words.
##
## Each line is lower-cased and split on runs of non-word characters.
## Every good word on the line has its count bumped once per appearance
## and is recorded as related to every other good word on the same line.
##
open(my $keyword_fh,'<',$keywordfile)
    || die "Unable to open ".$keywordfile.": $!";
while (my $line = <$keyword_fh>) {
    chomp($line);
    # split() returns a leading empty field when the line starts with a
    # non-word character; drop it so '' never becomes a keyword.
    my @Words = grep { $_ ne '' } split(/\W+/,lc($line));
    foreach my $keyword (@Words) {
        next if ($isbad{$keyword});
        $wordcount{$keyword}++;
        foreach my $otherword (@Words) {
            next if (($otherword eq $keyword) || ($isbad{$otherword}));
            $related_words{$keyword}->{$otherword}++;
        }
    }
}
close($keyword_fh);
236:
##
## Determine average number of entries
##
## $totalcount is the sum of all keyword counts; $avecount is that sum
## divided by the number of distinct keywords.  Both are 0 when the
## keyword file produced no usable words, which avoids a division by zero
## and an undefined total being written to the database.
##
my $totalcount = 0;
foreach my $count (values(%wordcount)) {
    $totalcount += $count;
}
my $distinct_words = scalar(keys(%wordcount));
my $avecount = $distinct_words ? $totalcount / $distinct_words : 0;
245:
##
## Start from a clean slate: remove any previous database file, then tie
## a hash to a freshly created GDBM file.
##
if (-e $outputdbfile && ! unlink($outputdbfile)) {
    die "Cannot remove ".$outputdbfile;
}
my %thesaurus_db;
tie(%thesaurus_db,'GDBM_File',$outputdbfile,&GDBM_WRCREAT,0640)
    || die "Error opening DB file.\n";
256:
##
## Fill the database: one entry per keyword that has related words, in
## the form "<count of keyword>:<string returned by get_related>".
##
foreach my $word (keys(%related_words)) {
    next unless (defined($word));
    my $related = &get_related($word);
    next unless ($related);
    $thesaurus_db{$word} = join(':',$wordcount{$word},$related);
}

##
## Bookkeeping entries.  Their keys contain a '.', which \w does not
## match, so they cannot collide with a real keyword.
##
$thesaurus_db{'average.count'} = $avecount;
$thesaurus_db{'total.count'}   = $totalcount;
untie(%thesaurus_db);
272:
##
## --test: reopen the finished database read-only and print the entries
## for a few sample words, skipping any that are missing.
##
if ($test) {
    tie(%thesaurus_db,'GDBM_File',$outputdbfile,&GDBM_READER,0640)
        || die "Error opening DB file.\n";
    foreach my $word (qw(torque rna polymerase)) {
        my $entry = $thesaurus_db{$word};
        next unless ($entry);
        print "Results for $word = $entry\n";
    }
    untie(%thesaurus_db);
}
286:
287:
################################################################
################################################################
#
# get_related($keyword [, $related_ref]) is a utility function which will
# return a string of the format:
#     keyword1,frequency1:keyword2,frequency2:.....
#
# 'frequency1' is the number of times the keyword1 appears in a keywords
# list with $keyword.
#
# Inputs:
#   $keyword      the word to look up.
#   $related_ref  optional reference to a hash of (related word =>
#                 frequency).  When omitted, the table stored for
#                 $keyword in %related_words is used.
#
# Returns: the string described above, ordered by descending frequency
# (words with equal frequency are ordered alphabetically so the output is
# reproducible), or nothing (undef in scalar context) when $keyword is
# undefined or empty, has no entry in %related_words, or has no related
# words.
#
sub get_related {
    my ($keyword,$related_ref) = @_;
    # Test for definedness/emptiness rather than truth so that the
    # keyword "0" is not rejected.
    return if (! defined($keyword) || $keyword eq '');
    if (! defined($related_ref)) {
        return if (! exists($related_words{$keyword}));
        $related_ref = $related_words{$keyword};
    }
    my @Related_words =
        sort { $related_ref->{$b} <=> $related_ref->{$a} || $a cmp $b }
        keys(%{$related_ref});
    return if (! @Related_words);
    return join(':', map { "$_,$related_ref->{$_}" } @Related_words);
}
312:
313:
314:
315:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>