Annotation of nsdl/nsdlloncapaorg/harvester.pl, revision 1.4
1.1 www 1: #!/usr/local/bin/perl
2:
3: #
4: # lon-capa.pl
5: # Parse the LON-CAPA metadata
6: #
7: # Andy Dong <adong@smete.org> 10/23/2002
8: #
9: # Contact Gerd Kortemeyer (korte@lite.msu.edu)
10:
11: use strict;
12: use LWP::UserAgent;
13: use Getopt::Std;
1.2 www 14: use Digest::MD5 qw(md5_hex);
1.4 ! www 15: use IO::File;
! 16:
! 17: my $basepath='/home/httpd/cgi-bin/OAI-XMLFile/XMLFile/nsdlexport/data';
1.1 www 18:
19: my $pub_month;
20: my $pub_year;
21: my @loncapa;
22:
23: # HTTP requests
24:
25: my $content;
26: my $content_regex = 'File Not Found';
27:
28: # Configuration
29:
30: my $debug = 0;
1.2 www 31: my $url = 'http://s10.lite.msu.edu/cgi-bin/metadata_harvest.pl';
1.1 www 32: # The list of servers is from the LON-CAPA CVS repository in /loncapa/loncom/production_hosts.tab
33: my @servers = ( 'newscience.westshore.cc.mi.us', 's10.lite.msu.edu', 's12.lite.msu.edu', 'lon-capa.chem.sunysb.edu', 'schubert.tmcc.edu', 'dalton.chem.sfu.ca', 'capa2.phy.ohiou.edu', 'pollux.physics.fsu.edu', 'loncapa.physics.sc.edu', 'loncapa.math.ucf.edu', 'zappa.ags.udel.edu', 'loncapa.gwu.edu');
34:
35: # End Configuration
36:
1.2 www 37: my $ua = new LWP::UserAgent;
38: $ua->timeout(600);
1.1 www 39:
1.2 www 40: my $request = new HTTP::Request GET => $url;
41: $request->authorization_basic('reaper', 'cat4u');
1.1 www 42:
1.2 www 43: my $response = $ua->request( $request );
1.1 www 44:
1.2 www 45: if ( $response->is_success ) {
46: $content = $response->content;
1.1 www 47: # Delete all blank lines
1.2 www 48: $content =~ s/(?<!.)\n//g;
1.1 www 49: # Replace all ^M with spaces
1.2 www 50: $content =~ s/
/\s/g;
1.1 www 51: # Push the content into an array
1.2 www 52: @loncapa = split /\n/, $content;
53: } else {
54: die 'LON-CAPA request failed: ' . $response->message;
55: }
1.1 www 56:
1.2 www 57: #@loncapa=undef;
58: #open (LON_FILE, 'metadata_harvest.txt') || die;
1.1 www 59:
1.2 www 60: #while (<LON_FILE>) {
61: # chomp;
62: # push(@loncapa,$_);
63: #}
1.1 www 64:
65: my %records = ();;
1.3 www 66:
1.1 www 67: foreach my $metadata (@loncapa) {
68: chomp $metadata;
1.2 www 69: $metadata=~s/[^\w\d\s\.\;\:\,\|\/]/ /gs;
1.1 www 70: my @tkline = split('\|', $metadata);
71: my $title = $tkline[0];
72: next if ( $title eq '' );
73: my $author = $tkline[1];
74: next if ( $author eq '' );
75: my @authorname = split(' ', $author);
76: my $author_fname = $authorname[0];
77: my $author_lname = $authorname[1];
78: # We have to make an exception for Multimedia Physics which is an organization not a person
79: my $object_type;
80: if ( $author_lname eq 'Physics' ) {
81: $object_type = 'organization';
82: } else {
83: $object_type = 'person';
84: }
85: my $subject = $tkline[2];
86: next if ( ($subject eq 'Sample') || ($subject eq 'Something') );
1.2 www 87: my $resourceurl = 'http://nsdl.lon-capa.org' . $tkline[3];
88: my $baseid=$tkline[3];
1.4 ! www 89: my ($adom,$auname)=($baseid=~/^\/res\/(\w+)\/(\w+)\//);
1.2 www 90: $baseid=~s/\W/\_/g;
91: $baseid=~s/^\_res\_//g;
1.4 ! www 92: my $fileid=md5_hex($baseid);
1.2 www 93:
1.1 www 94: next if ( $resourceurl =~ /(.*)\/demo\/(.*)/ );
95: my $keywords = $tkline[4];
96: my $version = $tkline[5];
97: my $notes = $tkline[6];
98: my $abstract = $tkline[7];
99: next if ($abstract eq '');
100: my $type = $tkline[8];
101: my $learning_resource_type;
102: if ( $type eq 'problem' ) {
103: $learning_resource_type = 114;
104: } elsif ( $type eq 'exam' ) {
105: $learning_resource_type = 114;
106: } elsif ( $type eq 'quiz' ) {
107: $learning_resource_type = 114;
108: } elsif ( $type eq 'assess' ) {
109: $learning_resource_type = 114;
110: } elsif ( $type eq 'survey' ) {
111: $learning_resource_type = 114;
112: } elsif ( $type eq 'form' ) {
113: $learning_resource_type = 114;
114: } elsif ( $type eq 'library' ) {
115: $learning_resource_type = 107;
116: } elsif ( $type eq 'page' ) {
117: $learning_resource_type = 104;
118: } elsif ( $type eq 'sequence' ) {
119: $learning_resource_type = 104;
120: } elsif ( $type eq 'spreadsheet' ) {
121: $learning_resource_type = 114;
122: } else {
123: $learning_resource_type = 0;
124: }
125:
126: my $media_format;
127: if ( ($type eq 'htm') || ($type eq 'gif') || ($type eq 'mov') || ($type eq 'xml') ) {
128: $media_format = 70;
129: } else {
130: $media_format = 0;
131: }
132:
133: my $language = $tkline[9]; # Look only for seniso
134: next if ( $language ne 'seniso');
135: my $primary_language='en-US';
136: my $creation_date = $tkline[10];
1.3 www 137: my ($pub_year,$pub_month,$pub_day) = ( $creation_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ );
1.1 www 138: my $revision_date = $tkline[11];
1.3 www 139: my ($rev_year,$rev_month,$rev_day) = ( $revision_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ );
1.1 www 140: my $owner = $tkline[12];
141: my $rights_description;
142: my $copyright = $tkline[13]; # public,domain,default,private (skip if private and domain)
143: # Public means no login required
144:
145: if ( $copyright eq 'public' ) {
146: $rights_description = 'LON-CAPA Public Resource. No login required.';
147: } elsif ($copyright eq 'domain') {
148: $rights_description = 'Restricted to certain LON-CAPA domains.';
149: } else {
150: $rights_description = 'LON-CAPA Default Use Restriction. Login required.';
151: }
152: # Domain means restricted to a particular LON-CAPA domain
153: # Defaults mean access open to any registered LON-CAPA user
154: # Private means open only to author of material
155: next if ( $copyright eq 'private');
156: my $platform = "5"; # HTML Browser (not specified but construed from metadata)
1.4 ! www 157: #
! 158: # Create path
! 159: #
! 160: unless (-e $basepath.'/'.$adom) { mkdir($basepath.'/'.$adom); }
! 161: unless (-e $basepath.'/'.$adom.'/'.$auname) {
! 162: mkdir($basepath.'/'.$adom.'/'.$auname) || die 'Could not create '.$basepath.'/'.$adom.'/'.$auname;
! 163: }
! 164: open(XML,'>'.$basepath.'/'.$adom.'/'.$auname.'/'.$baseid.'.xml');
! 165: print XML (<<ENDMETA);
! 166: <?xml version="1.0" encoding="UTF-8"?>
! 167:
1.3 www 168: <oaidc:dc xmlns="http://purl.org/dc/elements/1.1/"
169: xmlns:oaidc="http://www.openarchives.org/OAI/2.0/oai_dc/"
170: xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
171: xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/
172: http://www.openarchives.org/OAI/2.0/oai_dc.xsd"
173: >
174: <title>$title</title>
175: <creator>$author_fname $author_lname</creator>
176: <identifier>$resourceurl</identifier>
177: <subject>$keywords</subject>
178: <subject>$subject</subject>
179: <language>$primary_language</language>
180: <description>$abstract</description>
181: <date>$rev_year-$rev_month-$rev_day</date>
182: </oaidc:dc>
1.2 www 183: ENDMETA
1.4 ! www 184: close (XML);
1.1 www 185: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>