Annotation of nsdl/nsdlloncapaorg/harvester.pl, revision 1.2
1.1 www 1: #!/usr/local/bin/perl
2:
3: #
4: # lon-capa.pl
5: # Parse the LON-CAPA metadata
6: #
7: # Andy Dong <adong@smete.org> 10/23/2002
8: #
9: # Contact Gerd Kortemeyer (korte@lite.msu.edu)
10:
11: use strict;
12: use LWP::UserAgent;
13: use Getopt::Std;
1.2 ! www 14: use Digest::MD5 qw(md5_hex);
1.1 www 15:
16: my $pub_month;
17: my $pub_year;
18: my @loncapa;
19:
20: # HTTP requests
21:
22: my $content;
23: my $content_regex = 'File Not Found';
24:
25: # Configuration
26:
27: my $debug = 0;
1.2 ! www 28: my $url = 'http://s10.lite.msu.edu/cgi-bin/metadata_harvest.pl';
1.1 www 29: # The list of servers is from the LON-CAPA CVS repository in /loncapa/loncom/production_hosts.tab
30: my @servers = ( 'newscience.westshore.cc.mi.us', 's10.lite.msu.edu', 's12.lite.msu.edu', 'lon-capa.chem.sunysb.edu', 'schubert.tmcc.edu', 'dalton.chem.sfu.ca', 'capa2.phy.ohiou.edu', 'pollux.physics.fsu.edu', 'loncapa.physics.sc.edu', 'loncapa.math.ucf.edu', 'zappa.ags.udel.edu', 'loncapa.gwu.edu');
31:
32: # End Configuration
33:
1.2 ! www 34: my $ua = new LWP::UserAgent;
! 35: $ua->timeout(600);
1.1 www 36:
1.2 ! www 37: my $request = new HTTP::Request GET => $url;
! 38: $request->authorization_basic('reaper', 'cat4u');
1.1 www 39:
1.2 ! www 40: my $response = $ua->request( $request );
1.1 www 41:
1.2 ! www 42: if ( $response->is_success ) {
! 43: $content = $response->content;
1.1 www 44: # Delete all blank lines
1.2 ! www 45: $content =~ s/(?<!.)\n//g;
1.1 www 46: # Replace all ^M with spaces
1.2 ! www 47: $content =~ s/
/\s/g;
1.1 www 48: # Push the content into an array
1.2 ! www 49: @loncapa = split /\n/, $content;
! 50: } else {
! 51: die 'LON-CAPA request failed: ' . $response->message;
! 52: }
1.1 www 53:
1.2 ! www 54: #@loncapa=undef;
! 55: #open (LON_FILE, 'metadata_harvest.txt') || die;
1.1 www 56:
1.2 ! www 57: #while (<LON_FILE>) {
! 58: # chomp;
! 59: # push(@loncapa,$_);
! 60: #}
1.1 www 61:
62: my %records = ();;
63: foreach my $metadata (@loncapa) {
64: chomp $metadata;
1.2 ! www 65: $metadata=~s/[^\w\d\s\.\;\:\,\|\/]/ /gs;
1.1 www 66: my @tkline = split('\|', $metadata);
67: my $title = $tkline[0];
68: next if ( $title eq '' );
69: my $author = $tkline[1];
70: next if ( $author eq '' );
71: my @authorname = split(' ', $author);
72: my $author_fname = $authorname[0];
73: my $author_lname = $authorname[1];
74: # We have to make an exception for Multimedia Physics which is an organization not a person
75: my $object_type;
76: if ( $author_lname eq 'Physics' ) {
77: $object_type = 'organization';
78: } else {
79: $object_type = 'person';
80: }
81: my $subject = $tkline[2];
82: next if ( ($subject eq 'Sample') || ($subject eq 'Something') );
1.2 ! www 83: my $resourceurl = 'http://nsdl.lon-capa.org' . $tkline[3];
! 84: my $baseid=$tkline[3];
! 85: $baseid=~s/\W/\_/g;
! 86: $baseid=~s/^\_res\_//g;
! 87:
1.1 www 88: next if ( $resourceurl =~ /(.*)\/demo\/(.*)/ );
89: my $keywords = $tkline[4];
90: my $version = $tkline[5];
91: my $notes = $tkline[6];
92: my $abstract = $tkline[7];
93: next if ($abstract eq '');
94: my $type = $tkline[8];
95: my $learning_resource_type;
96: if ( $type eq 'problem' ) {
97: $learning_resource_type = 114;
98: } elsif ( $type eq 'exam' ) {
99: $learning_resource_type = 114;
100: } elsif ( $type eq 'quiz' ) {
101: $learning_resource_type = 114;
102: } elsif ( $type eq 'assess' ) {
103: $learning_resource_type = 114;
104: } elsif ( $type eq 'survey' ) {
105: $learning_resource_type = 114;
106: } elsif ( $type eq 'form' ) {
107: $learning_resource_type = 114;
108: } elsif ( $type eq 'library' ) {
109: $learning_resource_type = 107;
110: } elsif ( $type eq 'page' ) {
111: $learning_resource_type = 104;
112: } elsif ( $type eq 'sequence' ) {
113: $learning_resource_type = 104;
114: } elsif ( $type eq 'spreadsheet' ) {
115: $learning_resource_type = 114;
116: } else {
117: $learning_resource_type = 0;
118: }
119:
120: my $media_format;
121: if ( ($type eq 'htm') || ($type eq 'gif') || ($type eq 'mov') || ($type eq 'xml') ) {
122: $media_format = 70;
123: } else {
124: $media_format = 0;
125: }
126:
127: my $language = $tkline[9]; # Look only for seniso
128: next if ( $language ne 'seniso');
129: my $primary_language='en-US';
130: my $creation_date = $tkline[10];
131: my ($pub_year,$pub_month,$pub_day) = ( $creation_date =~ /^(\d{4})-(\d{2})-(\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ );
132: my $revision_date = $tkline[11];
133: my $owner = $tkline[12];
134: my $rights_description;
135: my $copyright = $tkline[13]; # public,domain,default,private (skip if private and domain)
136: # Public means no login required
137:
138: if ( $copyright eq 'public' ) {
139: $rights_description = 'LON-CAPA Public Resource. No login required.';
140: } elsif ($copyright eq 'domain') {
141: $rights_description = 'Restricted to certain LON-CAPA domains.';
142: } else {
143: $rights_description = 'LON-CAPA Default Use Restriction. Login required.';
144: }
145: # Domain means restricted to a particular LON-CAPA domain
146: # Defaults mean access open to any registered LON-CAPA user
147: # Private means open only to author of material
148: next if ( $copyright eq 'private');
149: my $platform = "5"; # HTML Browser (not specified but construed from metadata)
1.2 ! www 150: print (<<ENDMETA);
! 151: <rdf about="lon-capa.nsdl.collections/$baseid">
! 152: <dc:title>$title</dc:title>
! 153: <dc:creator>$author_fname $author_lname</dc:creator>
! 154: <dc:subject>$keywords</dc:subject>
! 155: <dc:subject>$subject</dc:subject>
! 156: <dc:identifier scheme="URI">$resourceurl</dc:identifier>
! 157: <dc:language>$primary_language</dc:language>
! 158: <dc:description>$abstract<dc:description>
! 159: <dc:date>$revision_date</dc:date>
! 160: </rdf>
1.1 www 161:
1.2 ! www 162: ENDMETA
1.1 www 163: }
164:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>