Annotation of nsdl/nsdlloncapaorg/harvester.pl, revision 1.8
1.1 www 1: #!/usr/local/bin/perl
2:
3: #
4: # lon-capa.pl
5: # Parse the LON-CAPA metadata
6: #
7: # Andy Dong <adong@smete.org> 10/23/2002
8: #
9: # Contact Gerd Kortemeyer (korte@lite.msu.edu)
10:
11: use strict;
12: use LWP::UserAgent;
13: use Getopt::Std;
1.2 www 14: use Digest::MD5 qw(md5_hex);
1.4 www 15: use IO::File;
16:
17: my $basepath='/home/httpd/cgi-bin/OAI-XMLFile/XMLFile/nsdlexport/data';
1.1 www 18:
19: my $pub_month;
20: my $pub_year;
21: my @loncapa;
22:
23: # HTTP requests
24:
25: my $content;
26: my $content_regex = 'File Not Found';
27:
28: # Configuration
29:
30: my $debug = 0;
1.5 www 31:
1.8 ! www 32: # Stats
! 33: my %allstats=();
! 34: my %filterstats=();
! 35: my %knockout=();
! 36: my %knockoutlang=();
! 37:
1.1 www 38: # The list of servers is from the LON-CAPA CVS repository in /loncapa/loncom/production_hosts.tab
1.5 www 39: my @servers = (
1.8 ! www 40: 'newscience.westshore.edu',
1.5 www 41: 's10.lite.msu.edu',
42: 's12.lite.msu.edu',
43: 'schubert.tmcc.edu',
44: 'dalton.chem.sfu.ca',
45: 'capa2.phy.ohiou.edu',
46: 'pollux.physics.fsu.edu',
1.8 ! www 47: 'loncapa3.physics.sc.edu',
1.5 www 48: 'zappa.ags.udel.edu',
49: 'loncapa.gwu.edu',
50: 'neptune.physics.ndsu.nodak.edu',
1.6 www 51: 'capa1.uwsp.edu',
52: 'loncapa.Mines.EDU',
1.8 ! www 53: 'loncapa.chm.nau.edu',
! 54: 'library1.lon-capa.uiuc.edu',
! 55: 'lon-capa.bsu.edu',
! 56: 'psblnx03.bd.psu.edu',
! 57: 'lon-capa.acadiau.ca',
! 58: 'harvard.lon-capa.org',
! 59: 'capa1.cc.huji.ac.il',
! 60: 'lon-capa.phy.cmich.edu',
! 61: 'meitner.physics.hope.edu',
! 62: 'loncapa.vcu.edu',
! 63: 'lon-capa.ucsc.edu',
! 64: 'lon-capa.bsu.edu'
! 65: );
1.1 www 66:
1.5 www 67: foreach (@servers) {
68: my $url='http://'.$_.'/cgi-bin/metadata_harvest.pl';
1.1 www 69: # End Configuration
70:
1.2 www 71: my $ua = new LWP::UserAgent;
72: $ua->timeout(600);
1.1 www 73:
1.2 www 74: my $request = new HTTP::Request GET => $url;
75: $request->authorization_basic('reaper', 'cat4u');
1.1 www 76:
1.2 www 77: my $response = $ua->request( $request );
1.1 www 78:
1.2 www 79: if ( $response->is_success ) {
1.5 www 80: print 'SUCCESS: ' . $response->message.' for '.$url."\n\n";
1.2 www 81: $content = $response->content;
1.1 www 82: # Delete all blank lines
1.2 www 83: $content =~ s/(?<!.)\n//g;
1.1 www 84: # Replace all ^M with spaces
1.2 www 85: $content =~ s/
/\s/g;
1.1 www 86: # Push the content into an array
1.2 www 87: @loncapa = split /\n/, $content;
88: } else {
1.5 www 89: print 'LON-CAPA request failed: ' . $response->message.' for '.$url."\n\n";
90: next;
1.2 www 91: }
1.1 www 92:
1.2 www 93: #@loncapa=undef;
94: #open (LON_FILE, 'metadata_harvest.txt') || die;
1.1 www 95:
1.2 www 96: #while (<LON_FILE>) {
97: # chomp;
98: # push(@loncapa,$_);
99: #}
1.1 www 100:
101: my %records = ();;
1.3 www 102:
1.8 ! www 103: my %stats=();
! 104:
1.1 www 105: foreach my $metadata (@loncapa) {
106: chomp $metadata;
1.2 www 107: $metadata=~s/[^\w\d\s\.\;\:\,\|\/]/ /gs;
1.1 www 108: my @tkline = split('\|', $metadata);
1.8 ! www 109: my ($rawtype)=($tkline[3]=~/\.(\w+)$/);
! 110: $rawtype=~tr/A-Z/a-z/;
! 111: $allstats{$rawtype}++;
! 112:
! 113: my $title = $tkline[0];
! 114: if ( $title eq '' ) { $knockout{'no_title_'.$rawtype}++; next; }
1.1 www 115: my $author = $tkline[1];
1.8 ! www 116: if ( $author eq '' ) { $knockout{'no_author_'.$rawtype}++; next; }
1.1 www 117: my @authorname = split(' ', $author);
118: my $author_fname = $authorname[0];
119: my $author_lname = $authorname[1];
120: # We have to make an exception for Multimedia Physics which is an organization not a person
121: my $object_type;
122: if ( $author_lname eq 'Physics' ) {
123: $object_type = 'organization';
124: } else {
125: $object_type = 'person';
126: }
127: my $subject = $tkline[2];
128: next if ( ($subject eq 'Sample') || ($subject eq 'Something') );
1.2 www 129: my $resourceurl = 'http://nsdl.lon-capa.org' . $tkline[3];
130: my $baseid=$tkline[3];
1.4 www 131: my ($adom,$auname)=($baseid=~/^\/res\/(\w+)\/(\w+)\//);
1.2 www 132: $baseid=~s/\W/\_/g;
133: $baseid=~s/^\_res\_//g;
1.4 www 134: my $fileid=md5_hex($baseid);
1.2 www 135:
1.1 www 136: next if ( $resourceurl =~ /(.*)\/demo\/(.*)/ );
1.8 ! www 137: # too many fragments out there
! 138: next unless ($resourceurl=~/\.(html|htm|problem|assess|xhtm|xml|xhtml|gif|jpg|jpeg|png)$/i);
! 139:
1.1 www 140: my $keywords = $tkline[4];
141: my $version = $tkline[5];
142: my $notes = $tkline[6];
143: my $abstract = $tkline[7];
1.7 www 144: unless ($abstract) { $abstract=$subject; }
145: unless ($abstract) { $abstract=$title; }
146: unless ($abstract) { $abstract=$keywords; }
1.8 ! www 147: my $type = $rawtype;
! 148: if ($type=~/htm/) { $type='htm'; }
! 149:
1.1 www 150: my $learning_resource_type;
151: if ( $type eq 'problem' ) {
152: $learning_resource_type = 114;
153: } elsif ( $type eq 'exam' ) {
154: $learning_resource_type = 114;
155: } elsif ( $type eq 'quiz' ) {
156: $learning_resource_type = 114;
157: } elsif ( $type eq 'assess' ) {
158: $learning_resource_type = 114;
159: } elsif ( $type eq 'survey' ) {
160: $learning_resource_type = 114;
161: } elsif ( $type eq 'form' ) {
162: $learning_resource_type = 114;
163: } elsif ( $type eq 'library' ) {
164: $learning_resource_type = 107;
165: } elsif ( $type eq 'page' ) {
166: $learning_resource_type = 104;
167: } elsif ( $type eq 'sequence' ) {
168: $learning_resource_type = 104;
169: } elsif ( $type eq 'spreadsheet' ) {
170: $learning_resource_type = 114;
171: } else {
172: $learning_resource_type = 0;
173: }
174:
175: my $media_format;
176: if ( ($type eq 'htm') || ($type eq 'gif') || ($type eq 'mov') || ($type eq 'xml') ) {
177: $media_format = 70;
178: } else {
179: $media_format = 0;
180: }
181:
1.8 ! www 182: my $language = $tkline[9];
! 183: # likelihood is that the following is true (people would bother if it is not)
! 184: if (($language=~/(seniso|notset|English)/) || (!$language)) { $language='seniso'; }
! 185: # NSDL only does English
! 186: if ( $language ne 'seniso') { $knockout{'lang_'.$rawtype}++; $knockoutlang{$language}++; next; }
1.1 www 187: my $primary_language='en-US';
188: my $creation_date = $tkline[10];
1.3 www 189: my ($pub_year,$pub_month,$pub_day) = ( $creation_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ );
1.1 www 190: my $revision_date = $tkline[11];
1.3 www 191: my ($rev_year,$rev_month,$rev_day) = ( $revision_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ );
1.1 www 192: my $owner = $tkline[12];
193: my $rights_description;
194: my $copyright = $tkline[13]; # public,domain,default,private (skip if private and domain)
195: # Public means no login required
196:
197: if ( $copyright eq 'public' ) {
198: $rights_description = 'LON-CAPA Public Resource. No login required.';
199: } elsif ($copyright eq 'domain') {
200: $rights_description = 'Restricted to certain LON-CAPA domains.';
201: } else {
202: $rights_description = 'LON-CAPA Default Use Restriction. Login required.';
203: }
204: # Domain means restricted to a particular LON-CAPA domain
205: # Defaults mean access open to any registered LON-CAPA user
206: # Private means open only to author of material
1.8 ! www 207: if ( $copyright eq 'private') { $knockout{'private_'.$rawtype}++; next; }
! 208: if ( $copyright eq 'domain') { $knockout{'domain_'.$rawtype}++; next; }
! 209: if ( $copyright eq 'custom') { $knockout{'custom_'.$rawtype}++; next; }
1.1 www 210: my $platform = "5"; # HTML Browser (not specified but construed from metadata)
1.4 www 211: #
1.8 ! www 212: # We actually do this
! 213: #
! 214: $stats{$type}++;
! 215: $filterstats{$type}++;
! 216: #
1.4 www 217: # Create path
218: #
219: unless (-e $basepath.'/'.$adom) { mkdir($basepath.'/'.$adom); }
220: unless (-e $basepath.'/'.$adom.'/'.$auname) {
221: mkdir($basepath.'/'.$adom.'/'.$auname) || die 'Could not create '.$basepath.'/'.$adom.'/'.$auname;
222: }
223: open(XML,'>'.$basepath.'/'.$adom.'/'.$auname.'/'.$baseid.'.xml');
224: print XML (<<ENDMETA);
225: <?xml version="1.0" encoding="UTF-8"?>
226:
1.3 www 227: <oaidc:dc xmlns="http://purl.org/dc/elements/1.1/"
228: xmlns:oaidc="http://www.openarchives.org/OAI/2.0/oai_dc/"
229: xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
230: xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/
231: http://www.openarchives.org/OAI/2.0/oai_dc.xsd"
232: >
233: <title>$title</title>
234: <creator>$author_fname $author_lname</creator>
235: <identifier>$resourceurl</identifier>
236: <subject>$keywords</subject>
237: <subject>$subject</subject>
238: <language>$primary_language</language>
239: <description>$abstract</description>
240: <date>$rev_year-$rev_month-$rev_day</date>
241: </oaidc:dc>
1.2 www 242: ENDMETA
1.4 www 243: close (XML);
1.5 www 244: }
1.8 ! www 245: foreach my $thistype (sort keys %stats) {
! 246: print "\n$thistype: $stats{$thistype}";
! 247: }
! 248: print "\n----\n";
! 249: }
! 250: print "\nDone.\n";
! 251: foreach my $thistype (sort keys %allstats) {
! 252: print "\n$thistype: $allstats{$thistype} ($filterstats{$thistype}) title: $knockout{'no_title_'.$thistype} author: $knockout{'no_author_'.$thistype} lang: $knockout{'lang_'.$thistype} priv: $knockout{'private_'.$thistype} domain: $knockout{'domain_'.$thistype} custom: $knockout{'custom_'.$thistype}";
! 253: }
! 254: print "\n----\n";
! 255: foreach my $thislang (sort keys %knockoutlang) {
! 256: print "\n>$thislang<: $knockoutlang{$thislang}";
1.1 www 257: }
1.8 ! www 258: print "\n";
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>