--- nsdl/nsdlloncapaorg/harvester.pl	2003/07/29 15:10:31	1.5
+++ nsdl/nsdlloncapaorg/harvester.pl	2006/05/10 16:28:56	1.9
@@ -29,22 +29,41 @@ my $content_regex = 'File Not Found';
 
 my $debug = 0;
 
+# Counters: records seen, records kept, rejections by reason and type, rejected languages
+my %allstats=();
+my %filterstats=();
+my %knockout=();
+my %knockoutlang=();
+
 # The list of servers is from the LON-CAPA CVS repository in /loncapa/loncom/production_hosts.tab
 my @servers = (
-'newscience.westshore.cc.mi.us',
+'newscience.westshore.edu',
 's10.lite.msu.edu',
 's12.lite.msu.edu',
-'lon-capa.chem.sunysb.edu',
 'schubert.tmcc.edu',
 'dalton.chem.sfu.ca',
 'capa2.phy.ohiou.edu',
 'pollux.physics.fsu.edu',
-'loncapa.physics.sc.edu',
-'loncapa.math.ucf.edu',
+'loncapa3.physics.sc.edu',
 'zappa.ags.udel.edu',
 'loncapa.gwu.edu',
 'neptune.physics.ndsu.nodak.edu',
-'capa1.uwsp.edu');
+'capa1.uwsp.edu',
+'loncapa.Mines.EDU',
+'loncapa.chm.nau.edu',
+'library1.lon-capa.uiuc.edu',
+'lon-capa.bsu.edu',
+'psblnx03.bd.psu.edu',
+'lon-capa.acadiau.ca',
+'harvard.lon-capa.org',
+'capa1.cc.huji.ac.il',
+'lon-capa.phy.cmich.edu',
+'meitner.physics.hope.edu',
+'loncapa.vcu.edu',
+'lon-capa.ucsc.edu'
+# Every host appears exactly once: lon-capa.bsu.edu and harvard.lon-capa.org
+# are already listed above, and a repeated entry would be harvested twice.
+);
 
 foreach (@servers) {
     my $url='http://'.$_.'/cgi-bin/metadata_harvest.pl';
@@ -82,14 +101,20 @@ if ( $response->is_success ) {
 
 my %records = ();;
 
+my %stats=();
+
 foreach my $metadata (@loncapa) {
 	chomp $metadata;
 	$metadata=~s/[^\w\d\s\.\;\:\,\|\/]/ /gs;
 	my @tkline = split('\|', $metadata);
-	my $title = $tkline[0];
-	next if ( $title eq '' );
+        my ($rawtype)=($tkline[3]=~/\.(\w+)$/);
+        $rawtype=~tr/A-Z/a-z/;
+        $allstats{$rawtype}++;
+        
+        my $title = $tkline[0];
+	if ( $title eq '' ) { $knockout{'no_title_'.$rawtype}++; next; }
 	my $author = $tkline[1];
-	next if ( $author eq '' );
+	if ( $author eq '' ) { $knockout{'no_author_'.$rawtype}++; next; }
 	my @authorname = split(' ', $author);
 	my $author_fname = $authorname[0];
 	my $author_lname = $authorname[1];
@@ -110,12 +135,26 @@ foreach my $metadata (@loncapa) {
 	my $fileid=md5_hex($baseid);
 
 	next if ( $resourceurl =~ /(.*)\/demo\/(.*)/ );
+# Keep only resources with the extensions below; there are too many fragments of other types out there
+        next unless ($resourceurl=~/\.(html|htm|problem|assess|xhtm|xml|xhtml|gif|jpg|jpeg|png)$/i);
+
 	my $keywords = $tkline[4];
 	my $version = $tkline[5];
 	my $notes = $tkline[6];
 	my $abstract = $tkline[7];
-	next if ($abstract eq '');
-	my $type = $tkline[8];
+        $abstract=~s/ s / /gs;
+        $abstract=~s/\s+/ /gs;
+        my $postsubject=$subject;
+        unless ($postsubject) {
+           $postsubject=$keywords;
+        } else {
+           $postsubject.=' ('.$keywords.')';
+        }
+        unless ($postsubject=~/\w/) { $knockout{'nosubject_'.$rawtype}++; next; }
+        unless ($abstract) { $knockout{'noabstract_'.$rawtype}++; next; }
+	my $type = $rawtype;
+        if ($type=~/htm/) { $type='htm'; }
+
 	my $learning_resource_type;
 	if ( $type eq 'problem' ) {
 		$learning_resource_type = 114;
@@ -148,8 +187,11 @@ foreach my $metadata (@loncapa) {
 		$media_format = 0;
 	}
 
-	my $language = $tkline[9]; # Look only for seniso
-	next if ( $language ne 'seniso');
+	my $language = $tkline[9];
+# Treat an unset or English language as 'seniso': authors of non-English material would likely have bothered to set it
+        if (($language=~/(seniso|notset|English)/) || (!$language)) { $language='seniso'; }
+# NSDL only does English
+        if ( $language ne 'seniso') { $knockout{'lang_'.$rawtype}++; $knockoutlang{$language}++; next; } 
 	my $primary_language='en-US';
 	my $creation_date = $tkline[10];
 	my ($pub_year,$pub_month,$pub_day) = ( $creation_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ );
@@ -170,9 +212,14 @@ foreach my $metadata (@loncapa) {
 	# Domain means restricted to a particular LON-CAPA domain
 	# Defaults mean access open to any registered LON-CAPA user
 	# Private means open only to author of material
-	next if ( $copyright eq 'private');
+        unless ($copyright eq 'public') { $knockout{'notpublic_'.$rawtype}++; next; }
 	my $platform = "5";     # HTML Browser (not specified but construed from metadata)
 #
+# The record passed every filter, so count it: %stats by normalized type,
+# %filterstats by raw extension so it matches the %allstats keys in the summary
+        $stats{$type}++;
+        $filterstats{$rawtype}++;
+#
 # Create path
 #
 	unless (-e $basepath.'/'.$adom) { mkdir($basepath.'/'.$adom); }
@@ -190,10 +237,9 @@ foreach my $metadata (@loncapa) {
                               http://www.openarchives.org/OAI/2.0/oai_dc.xsd"
 >
     <title>$title</title>
-    <creator>$author_fname $author_lname</creator>
+    <creator>$author</creator>
     <identifier>$resourceurl</identifier>
-    <subject>$keywords</subject>
-    <subject>$subject</subject>
+    <subject>$postsubject</subject>
     <language>$primary_language</language>
     <description>$abstract</description>
     <date>$rev_year-$rev_month-$rev_day</date>
@@ -201,4 +247,17 @@ foreach my $metadata (@loncapa) {
 ENDMETA
       close (XML);
 }
+foreach my $thistype (sort keys %stats) {
+   print "\n$thistype: $stats{$thistype}";
+}
+print "\n----\n";
+}
+print "\nDone.\n";
+foreach my $thistype (sort keys %allstats) { # per raw type: seen (%filterstats count), then one counter per rejection reason set in the record loop
+   print "\n$thistype: $allstats{$thistype} ($filterstats{$thistype}) title: $knockout{'no_title_'.$thistype} author: $knockout{'no_author_'.$thistype} subject: $knockout{'nosubject_'.$thistype} abstract: $knockout{'noabstract_'.$thistype} lang: $knockout{'lang_'.$thistype} notpublic: $knockout{'notpublic_'.$thistype}";
+}
+print "\n----\n";
+foreach my $thislang (sort keys %knockoutlang) {
+print "\n>$thislang<: $knockoutlang{$thislang}";
 }
+print "\n";