File:  [LON-CAPA] / loncom / metadata_database / searchcat.pl
Revision 1.55: download - view: text, annotated - select for diffs
Thu Apr 8 15:57:32 2004 UTC (20 years, 3 months ago) by matthew
Branches: MAIN
CVS tags: HEAD
Complete refactoring.

Currently does not attempt to get dynamic metadata.
Now uses lonmetadata routines to manage MySQL tables and inserts.
Appears to work fine, insert correct values into mysql, and run without
warnings or errors.
Added counting of copyright types.

    1: #!/usr/bin/perl
    2: # The LearningOnline Network
    3: # searchcat.pl "Search Catalog" batch script
    4: #
    5: # $Id: searchcat.pl,v 1.55 2004/04/08 15:57:32 matthew Exp $
    6: #
    7: # Copyright Michigan State University Board of Trustees
    8: #
    9: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
   10: #
   11: # LON-CAPA is free software; you can redistribute it and/or modify
   12: # it under the terms of the GNU General Public License as published by
   13: # the Free Software Foundation; either version 2 of the License, or
   14: # (at your option) any later version.
   15: #
   16: # LON-CAPA is distributed in the hope that it will be useful,
   17: # but WITHOUT ANY WARRANTY; without even the implied warranty of
   18: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   19: # GNU General Public License for more details.
   20: #
   21: # You should have received a copy of the GNU General Public License
   22: # along with LON-CAPA; if not, write to the Free Software
   23: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   24: #
   25: # /home/httpd/html/adm/gpl.txt
   26: #
   27: # http://www.lon-capa.org/
   28: #
   29: ###
   30: 
   31: =pod
   32: 
   33: =head1 NAME
   34: 
   35: B<searchcat.pl> - put authoritative filesystem data into sql database.
   36: 
   37: =head1 SYNOPSIS
   38: 
   39: Ordinarily this script is to be called from a loncapa cron job
   40: (CVS source location: F<loncapa/loncom/cron/loncapa>; typical
   41: filesystem installation location: F</etc/cron.d/loncapa>).
   42: 
   43: Here is the cron job entry.
   44: 
   45: C<# Repopulate and refresh the metadata database used for the search catalog.>
   46: C<10 1 * * 7    www    /home/httpd/perl/searchcat.pl>
   47: 
   48: This script only allows itself to be run as the user C<www>.
   49: 
   50: =head1 DESCRIPTION
   51: 
   52: This script goes through a loncapa resource directory and gathers metadata.
   53: The metadata is entered into a SQL database.
   54: 
   55: This script also does general database maintenance such as reformatting
   56: the C<loncapa:metadata> table if it is deprecated.
   57: 
   58: This script evaluates dynamic metadata from the authors'
   59: F<nohist_resevaldata.db> database file in order to store it in MySQL.
   60: 
   61: This script is playing an increasingly important role for a loncapa
   62: library server.  The proper operation of this script is critical for a smooth
   63: and correct user experience.
   64: 
   65: =cut
   66: 
   67: use strict;
   68: 
   69: use DBI;
   70: use lib '/home/httpd/lib/perl/';
   71: use LONCAPA::Configuration;
   72: use LONCAPA::lonmetadata;
   73: 
   74: use IO::File;
   75: use HTML::TokeParser;
   76: use GDBM_File;
   77: use POSIX qw(strftime mktime);
   78: use File::Find;
   79: 
   80: ##
   81: ## Use variables for table names so we can test this routine a little easier
   82: my $oldname = 'metadata';
   83: my $newname = 'newmetadata';
   84: 
   85: #
   86: # Read loncapa_apache.conf and loncapa.conf
   87: my $perlvarref=LONCAPA::Configuration::read_conf('loncapa.conf');
   88: my %perlvar=%{$perlvarref};
   89: undef $perlvarref;
   90: delete $perlvar{'lonReceipt'}; # remove since sensitive (really?) & not needed
   91: #
   92: # Only run if machine is a library server
   93: exit if ($perlvar{'lonRole'} ne 'library');
   94: #
   95: #  Make sure this process is running from user=www
   96: my $wwwid=getpwnam('www');
   97: if ($wwwid!=$<) {
   98:     my $emailto="$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}";
   99:     my $subj="LON: $perlvar{'lonHostID'} User ID mismatch";
  100:     system("echo 'User ID mismatch. searchcat.pl must be run as user www.' |\
  101:  mailto $emailto -s '$subj' > /dev/null");
  102:     exit 1;
  103: }
  104: #
  105: # Let people know we are running
  106: open(LOG,'>'.$perlvar{'lonDaemons'}.'/logs/searchcat.log');
  107: print LOG '==== Searchcat Run '.localtime()."====\n";
  108: #
  109: # Connect to database
  110: my $dbh;
  111: if (! ($dbh = DBI->connect("DBI:mysql:loncapa","www",$perlvar{'lonSqlAccess'},
  112:                           { RaiseError =>0,PrintError=>0}))) {
  113:     print LOG "Cannot connect to database!\n";
  114:     die "MySQL Error: Cannot connect to database!\n";
  115: }
  116: # This can return an error and still be okay, so we do not bother checking.
  117: # (perhaps it should be more robust and check for specific errors)
  118: $dbh->do('DROP TABLE IF EXISTS '.$newname);
  119: #
  120: # Create the new table
  121: my $request = &LONCAPA::lonmetadata::create_metadata_storage($newname);
  122: $dbh->do($request);
  123: if ($dbh->err) {
  124:     $dbh->disconnect();
  125:     print LOG "\nMySQL Error Create: ".$dbh->errstr."\n";
  126:     die $dbh->errstr;
  127: }
  128: #
  129: # find out which users we need to examine
  130: opendir(RESOURCES,"$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}");
  131: my @homeusers = 
  132:     grep {
  133:         &ishome("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$_");
  134:     } grep { 
  135:         !/^\.\.?$/;
  136:     } readdir(RESOURCES);
  137: closedir RESOURCES;
  138: #
  139: # Loop through the users
  140: foreach my $user (@homeusers) {
  141:     print LOG "=== User: ".$user."\n";
  142:     my $prodir=&propath($perlvar{'lonDefDomain'},$user);
  143:     #
  144:     # Use File::Find to get the files we need to read/modify
  145:     find(
  146:          {preprocess => \&only_meta_files,
  147: #          wanted     => \&print_filename,
  148: #          wanted     => \&log_metadata,
  149:           wanted     => \&process_meta_file,
  150:           }, 
  151:          "$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$user");
  152: }
  153: #
  154: # Rename the table
  155: $dbh->do('DROP TABLE IF EXISTS '.$oldname);
  156: if (! $dbh->do('RENAME TABLE '.$newname.' TO '.$oldname)) {
  157:     print LOG "MySQL Error Rename: ".$dbh->errstr."\n";
  158:     die $dbh->errstr;
  159: }
  160: if (! $dbh->disconnect) {
  161:     print LOG "MySQL Error Disconnect: ".$dbh->errstr."\n";
  162:     die $dbh->errstr;
  163: }
  164: ##
  165: ## Finished!
  166: print LOG "==== Searchcat completed ".localtime()." ====\n";
  167: close(LOG);
  168: 
  169: &write_type_count();
  170: &write_copyright_count();
  171: 
  172: exit 0;
  173: 
  174: ########################################################
  175: ########################################################
  176: ###                                                  ###
  177: ###          File::Find support routines             ###
  178: ###                                                  ###
  179: ########################################################
  180: ########################################################
  181: ##
  182: ## &only_meta_files
  183: ##
  184: ## Called by File::Find.
  185: ## Takes a list of files/directories in and returns a list of files/directories
  186: ## to search.
  187: sub only_meta_files {
  188:     my @PossibleFiles = @_;
  189:     my @ChosenFiles;
  190:     foreach my $file (@PossibleFiles) {
  191:         if ( ($file =~ /\.meta$/ &&            # Ends in meta
  192:               $file !~ /\.\d+\.[^\.]+\.meta$/  # is not for a prior version
  193:              ) || (-d $file )) { # directories are okay
  194:                  # but we do not want /. or /..
  195:             push(@ChosenFiles,$file);
  196:         }
  197:     }
  198:     return @ChosenFiles;
  199: }
  200: 
  201: ##
  202: ##
  203: ## Debugging routines, use these for 'wanted' in the File::Find call
  204: ##
  205: sub print_filename {
  206:     my ($file) = $_;
  207:     my $fullfilename = $File::Find::name;
  208:     if (-d $file) {
  209:         print LOG " Got directory ".$fullfilename."\n";
  210:     } else {
  211:         print LOG " Got file ".$fullfilename."\n";
  212:     }
  213:     $_=$file;
  214: }
  215: 
  216: sub log_metadata {
  217:     my ($file) = $_;
  218:     my $fullfilename = $File::Find::name;
  219:     return if (-d $fullfilename); # No need to do anything here for directories
  220:     print LOG $fullfilename."\n";
  221:     my $ref=&metadata($fullfilename);
  222:     if (! defined($ref)) {
  223:         print LOG "    No data\n";
  224:         return;
  225:     }
  226:     while (my($key,$value) = each(%$ref)) {
  227:         print LOG "    ".$key." => ".$value."\n";
  228:     }
  229:     &count_copyright($ref->{'copyright'});
  230:     $_=$file;
  231: }
  232: 
  233: 
  234: ##
  235: ## process_meta_file
  236: ##   Called by File::Find. 
  237: ##   Only input is the filename in $_.  
  238: sub process_meta_file {
  239:     my ($file) = $_;
  240:     my $filename = $File::Find::name;
  241:     return if (-d $filename); # No need to do anything here for directories
  242:     #
  243:     print LOG $filename."\n";
  244:     #
  245:     my $ref=&metadata($filename);
  246:     #
  247:     # $url is the original file url, not the metadata file
  248:     my $url='/res/'.&declutter($filename);
  249:     $url=~s/\.meta$//;
  250:     print LOG "    ".$url."\n";
  251:     #
  252:     # Ignore some files based on their metadata
  253:     if ($ref->{'obsolete'}) { 
  254:         print LOG "obsolete\n"; 
  255:         return; 
  256:     }
  257:     &count_copyright($ref->{'copyright'});
  258:     if ($ref->{'copyright'} eq 'private') { 
  259:         print LOG "private\n"; 
  260:         return; 
  261:     }
  262:     #
  263:     # Find the dynamic metadata
  264:     my %dyn;
  265:     if ($url=~ m:/default$:) {
  266:         $url=~ s:/default$:/:;
  267:     } else {
  268:         # %dyn=&dynamicmeta($url);
  269:         &count_type($url);
  270:     }
  271:     #
  272:     $ref->{'creationdate'}     = &sqltime($ref->{'creationdate'});
  273:     $ref->{'lastrevisiondate'} = &sqltime($ref->{'lastrevisiondate'});
  274:     my %Data = (
  275:                 %$ref,
  276:                 %dyn,
  277:                 'url'=>$url,
  278:                 'version'=>'current');
  279:     my ($count,$err) = &LONCAPA::lonmetadata::store_metadata($dbh,$newname,
  280:                                                              \%Data);
  281:     if ($err) {
  282:         print LOG "\nMySQL Error Insert: ".$err."\n";
  283:         die $err;
  284:     }
  285:     if ($count < 1) {
  286:         print LOG "Unable to insert record into MySQL database for $url\n";
  287:         die "Unable to insert record into MySQl database for $url";
  288:     } else {
  289:         print LOG "Count = ".$count."\n";
  290:     }
  291:     #
  292:     # Reset $_ before leaving
  293:     $_ = $file;
  294: }
  295: 
  296: ########################################################
  297: ########################################################
  298: ###                                                  ###
  299: ###  &metadata($uri)                                 ###
  300: ###   Retrieve metadata for the given file           ###
  301: ###                                                  ###
  302: ########################################################
  303: ########################################################
  304: sub metadata {
  305:     my ($uri)=@_;
  306:     my %metacache=();
  307:     $uri=&declutter($uri);
  308:     my $filename=$uri;
  309:     $uri=~s/\.meta$//;
  310:     $uri='';
  311:     if ($filename !~ /\.meta$/) { 
  312:         $filename.='.meta';
  313:     }
  314:     my $metastring=&getfile($perlvar{'lonDocRoot'}.'/res/'.$filename);
  315:     return undef if (! defined($metastring));
  316:     my $parser=HTML::TokeParser->new(\$metastring);
  317:     my $token;
  318:     while ($token=$parser->get_token) {
  319:         if ($token->[0] eq 'S') {
  320:             my $entry=$token->[1];
  321:             my $unikey=$entry;
  322:             if (defined($token->[2]->{'part'})) { 
  323:                 $unikey.='_'.$token->[2]->{'part'}; 
  324:             }
  325:             if (defined($token->[2]->{'name'})) { 
  326:                 $unikey.='_'.$token->[2]->{'name'}; 
  327:             }
  328:             if ($metacache{$uri.'keys'}) {
  329:                 $metacache{$uri.'keys'}.=','.$unikey;
  330:             } else {
  331:                 $metacache{$uri.'keys'}=$unikey;
  332:             }
  333:             foreach ( @{$token->[3]}) {
  334:                 $metacache{$uri.''.$unikey.'.'.$_}=$token->[2]->{$_};
  335:             } 
  336:             if (! ($metacache{$uri.''.$unikey}=$parser->get_text('/'.$entry))){
  337:                 $metacache{$uri.''.$unikey} = 
  338:                     $metacache{$uri.''.$unikey.'.default'};
  339:             }
  340:         } # End of ($token->[0] eq 'S')
  341:     }
  342:     return \%metacache;
  343: }
  344: 
  345: ##
  346: ## &getfile($filename)
  347: ##   Slurps up an entire file into a scalar.  
  348: ##   Returns undef if the file does not exist
  349: sub getfile {
  350:     my $file = shift();
  351:     if (! -e $file ) { 
  352:         return undef; 
  353:     }
  354:     my $fh=IO::File->new($file);
  355:     my $contents = '';
  356:     while (<$fh>) { 
  357:         $contents .= $_;
  358:     }
  359:     return $contents;
  360: }
  361: 
  362: ########################################################
  363: ########################################################
  364: ###                                                  ###
  365: ###    Dynamic Metadata                              ###
  366: ###                                                  ###
  367: ########################################################
  368: ########################################################
  369: sub dynamicmeta {
  370:     my $url = &declutter(shift());
  371:     $url =~ s/\.meta$//;
  372:     my %data = ('count'         => 0,
  373:                 'course'        => 0,
  374:                 'course_list'   => '',
  375:                 'avetries'      => 'NULL',
  376:                 'avetries_list' => '',
  377:                 'stdno'         => 0,
  378:                 'stdno_list'    => '',
  379:                 'usage'         => 0,
  380:                 'usage_list'    => '',
  381:                 'goto'          => 0,
  382:                 'goto_list'     => '',
  383:                 'comefrom'      => 0,
  384:                 'comefrom_list' => '',
  385:                 'difficulty'    => 'NULL',
  386:                 'difficulty_list' => '',
  387:                 'sequsage'      => '0',
  388:                 'sequsage_list' => '',
  389:                 'clear'         => 'NULL',
  390:                 'technical'     => 'NULL',
  391:                 'correct'       => 'NULL',
  392:                 'helpful'       => 'NULL',
  393:                 'depth'         => 'NULL',
  394:                 'comments'      => '',                
  395:                 );
  396:     my ($dom,$auth)=($url=~/^(\w+)\/(\w+)\//);
  397:     my $prodir=&propath($dom,$auth);
  398:     #
  399:     # Get metadata except counts
  400:     my %evaldata;
  401:     if (! tie(%evaldata,'GDBM_File',
  402:               $prodir.'/nohist_resevaldata.db',&GDBM_READER(),0640)) {
  403:         return (undef);
  404:     }
  405:     my %sum=();
  406:     my %count=();
  407:     my %concat=();
  408:     my %listitems=(
  409:                    'course'       => 'add',
  410:                    'goto'         => 'add',
  411:                    'comefrom'     => 'add',
  412:                    'avetries'     => 'average',
  413:                    'stdno'        => 'add',
  414:                    'difficulty'   => 'average',
  415:                    'clear'        => 'average',
  416:                    'technical'    => 'average',
  417:                    'helpful'      => 'average',
  418:                    'correct'      => 'average',
  419:                    'depth'        => 'average',
  420:                    'comments'     => 'append',
  421:                    'usage'        => 'count'
  422:                    );
  423:     #
  424:     my $regexp=$url;
  425:     $regexp=~s/(\W)/\\$1/g;
  426:     $regexp='___'.$regexp.'___([a-z]+)$';
  427:     while (my ($esckey,$value)=each %evaldata) {
  428:         my $key=&unescape($esckey);
  429:         if ($key=~/$regexp/) {
  430:             my ($item,$purl,$cat)=split(/___/,$key);
  431:             $count{$cat}++;
  432:             if ($listitems{$cat} ne 'append') {
  433:                 if (defined($sum{$cat})) {
  434:                     $sum{$cat}+=&unescape($value);
  435:                     $concat{$cat}.=','.$item;
  436:                 } else {
  437:                     $sum{$cat}=&unescape($value);
  438:                     $concat{$cat}=$item;
  439:                 }
  440:             } else {
  441:                 if (defined($sum{$cat})) {
  442:                     if ($evaldata{$esckey}=~/\w/) {
  443:                         $sum{$cat}.='<hr />'.&unescape($evaldata{$esckey});
  444:                     }
  445:                 } else {
  446:                     $sum{$cat}=''.&unescape($evaldata{$esckey});
  447: 		    }
  448:             }
  449:         }
  450:     }
  451:     untie(%evaldata);
  452:     # transfer gathered data to returnhash, calculate averages where applicable
  453:     my %returnhash;
  454:     while (my $cat=each(%count)) {
  455:         if ($count{$cat} eq 'nan') { next; }
  456:         if ($sum{$cat} eq 'nan') { next; }
  457:         if ($listitems{$cat} eq 'average') {
  458:             if ($count{$cat}) {
  459:                 $returnhash{$cat}=int(($sum{$cat}/$count{$cat})*100.0+0.5)/100.0;
  460:             } else {
  461:                 $returnhash{$cat}='NULL';
  462:             }
  463:         } elsif ($listitems{$cat} eq 'count') {
  464:             $returnhash{$cat}=$count{$cat};
  465:         } else {
  466:             $returnhash{$cat}=$sum{$cat};
  467:         }
  468:         $returnhash{$cat.'_list'}=$concat{$cat};
  469:     }
  470:     #
  471:     # get count
  472:     if (tie(my %evaldata,'GDBM_File',
  473:             $prodir.'/nohist_accesscount.db',&GDBM_READER(),0640)) {
  474: 	my $escurl=&escape($url);
  475: 	if (! exists($evaldata{$escurl})) {
  476: 	    $returnhash{'count'}=0;
  477: 	} else {
  478: 	    $returnhash{'count'}=$evaldata{$escurl};
  479: 	}
  480: 	untie %evaldata;
  481:     }
  482:     return %returnhash;
  483: }
  484: 
  485: ########################################################
  486: ########################################################
  487: ###                                                  ###
  488: ###   Counts                                         ###
  489: ###                                                  ###
  490: ########################################################
  491: ########################################################
  492: {
  493: 
  494: my %countext;
  495: 
  496: sub count_type {
  497:     my $file=shift;
  498:     $file=~/\.(\w+)$/;
  499:     my $ext=lc($1);
  500:     $countext{$ext}++;
  501: }
  502: 
  503: sub write_type_count {
  504:     open(RESCOUNT,'>/home/httpd/html/lon-status/rescount.txt');
  505:     while (my ($extension,$count) = each(%countext)) {
  506: 	print RESCOUNT $extension.'='.$count.'&';
  507:     }
  508:     print RESCOUNT 'time='.time."\n";
  509:     close(RESCOUNT);
  510: }
  511: 
  512: } # end of scope for %countext
  513: 
  514: {
  515: 
  516: my %copyrights;
  517: 
  518: sub count_copyright {
  519:     $copyrights{@_[0]}++;
  520: }
  521: 
  522: sub write_copyright_count {
  523:     open(COPYCOUNT,'>/home/httpd/html/lon-status/copyrightcount.txt');
  524:     while (my ($copyright,$count) = each(%copyrights)) {
  525: 	print COPYCOUNT $copyright.'='.$count.'&';
  526:     }
  527:     print COPYCOUNT 'time='.time."\n";
  528:     close(COPYCOUNT);
  529: }
  530: 
  531: } # end of scope for %copyrights
  532: 
  533: ########################################################
  534: ########################################################
  535: ###                                                  ###
  536: ###   Miscellanous Utility Routines                  ###
  537: ###                                                  ###
  538: ########################################################
  539: ########################################################
  540: ##
  541: ## &ishome($username)
  542: ##   Returns 1 if $username is a LON-CAPA author, 0 otherwise
  543: ##   (copied from lond, modification of the return value)
  544: sub ishome {
  545:     my $author=shift;
  546:     $author=~s/\/home\/httpd\/html\/res\/([^\/]*)\/([^\/]*).*/$1\/$2/;
  547:     my ($udom,$uname)=split(/\//,$author);
  548:     my $proname=propath($udom,$uname);
  549:     if (-e $proname) {
  550: 	return 1;
  551:     } else {
  552:         return 0;
  553:     }
  554: }
  555: 
  556: ##
  557: ## &propath($udom,$uname)
  558: ##   Returns the path to the users LON-CAPA directory
  559: ##   (copied from lond)
  560: sub propath {
  561:     my ($udom,$uname)=@_;
  562:     $udom=~s/\W//g;
  563:     $uname=~s/\W//g;
  564:     my $subdir=$uname.'__';
  565:     $subdir =~ s/(.)(.)(.).*/$1\/$2\/$3/;
  566:     my $proname="$perlvar{'lonUsersDir'}/$udom/$subdir/$uname";
  567:     return $proname;
  568: } 
  569: 
  570: ##
  571: ## &sqltime($timestamp)
  572: ##
  573: ## Convert perl $timestamp to MySQL time.  MySQL expects YYYY-MM-DD HH:MM:SS
  574: ##
  575: sub sqltime {
  576:     my ($time) = @_;
  577:     my $mysqltime;
  578:     if ($time =~ 
  579:         /(\d+)-(\d+)-(\d+) # YYYY-MM-DD
  580:         \s                 # a space
  581:         (\d+):(\d+):(\d+)  # HH:MM::SS
  582:         /x ) { 
  583:         # Some of the .meta files have the time in mysql
  584:         # format already, so just make sure they are 0 padded and
  585:         # pass them back.
  586:         $mysqltime = sprintf('%04d-%02d-%02d %02d:%02d:%02d',
  587:                              $1,$2,$3,$4,$5,$6);
  588:     } elsif ($time =~ /^\d+$/) {
  589:         my @TimeData = gmtime($time);
  590:         # Alter the month to be 1-12 instead of 0-11
  591:         $TimeData[4]++;
  592:         # Alter the year to be from 0 instead of from 1900
  593:         $TimeData[5]+=1900;
  594:         $mysqltime = sprintf('%04d-%02d-%02d %02d:%02d:%02d',
  595:                              @TimeData[5,4,3,2,1,0]);
  596:     } else {
  597:         print LOG "    Unable to decode time ".$time."\n";
  598:         $mysqltime = 0;
  599:     }
  600:     return $mysqltime;
  601: }
  602: 
  603: ##
  604: ## &declutter($filename)
  605: ##   Given a filename, returns a url for the filename.
  606: sub declutter {
  607:     my $thisfn=shift;
  608:     $thisfn=~s/^$perlvar{'lonDocRoot'}//;
  609:     $thisfn=~s/^\///;
  610:     $thisfn=~s/^res\///;
  611:     return $thisfn;
  612: }
  613: 
  614: ##
  615: ## Escape / Unescape special characters
  616: sub unescape {
  617:     my $str=shift;
  618:     $str =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C",hex($1))/eg;
  619:     return $str;
  620: }
  621: 
  622: sub escape {
  623:     my $str=shift;
  624:     $str =~ s/(\W)/"%".unpack('H2',$1)/eg;
  625:     return $str;
  626: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>