Annotation of nsdl/harvestsmete/dlese-parserecord.pl, revision 1.1
1.1 ! www 1: #!/usr/local/bin/perl -w
! 2:
! 3: #
! 4: # dlese-parserecord.pl - Parse DLESE Metadata
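! 5: # This program reads the record identifiers listed in dlese-identifiers.xml, parses the matching record file in the sub-directory called dlese, and either prints the parsed fields to STDOUT or (with -u) writes them to the database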
! 6: #
! 7: # Written by Andy Dong <adong@smete.org> 11/01/2001
! 8: #
! 9:
! 10: use strict;
! 11: use Getopt::Std;
! 12: use Time::HiRes qw(usleep ualarm gettimeofday tv_interval);
! 13:
! 14: use HTTP::Request;
! 15: use LWP::UserAgent;
! 16:
! 17: use XML::Element;
! 18: use XML::Parser;
! 19: use XML::TreeBuilder;
! 20:
! 21: require OAIvocabulary_v2;
! 22: require OAIcataloging_v2;
! 23:
! 24: # Option handling and database setup. The -u flag specifies [u]pdate database; otherwise output goes to STDOUT
! 25:
! 26: my $usage = << "EOT";
! 27: Usage: dlese-parserecord.pl -u
! 28:
! 29: -u (U)pdate the database
! 30:
! 31: Without -u it simply prints to STDOUT
! 32: EOT
! 33:
! 34: my %args;
! 35: getopts('u', \%args) || die $usage;
! 36:
! 37: my $inserted = 0; # count of newly inserted records, reported at exit
! 38: my $updated = 0; # count of updated records, reported at exit
! 39:
! 40: my $useDatabase = $args{'u'} ? 1 : 0; # plain assignment: "my ... if ..." has undefined behaviour (see perlsyn)
! 41: my $dbh;
! 42: my $t0 = [gettimeofday]; # start time for the elapsed-seconds report at exit
! 43:
! 44: # Database Configuration (only when -u was given)
! 45: if ( $useDatabase ) {
! 46: require DBI; # DBI->connect is called below but no visible line loads DBI; harmless if already loaded
! 47: print "Updating the database\n";
! 48: my $DBI_DSN='dbi:ODBC:mel.odbc';
! 49: my $DBI_USER='autocataloger';
! 50: my $DBI_PWD='regolatacotua'; # NOTE(review): credentials are hard-coded in the source; consider moving them out of the script
! 51: $dbh= DBI->connect($DBI_DSN, $DBI_USER, $DBI_PWD, { RaiseError => 1, AutoCommit => 0 }) || die "Unable to connect to database $DBI_DSN as $DBI_USER\n";
! 52: }
! 53:
! 54: # First parse the dlese-identifiers.xml file to get the list of available records
! 55: my $tree = XML::TreeBuilder->new();
! 56: $tree->parse_file('dlese-identifiers.xml');
! 57: my @identifiers = $tree->find_by_tag_name('identifier');
! 58:
! 59: # Parse each record file from the dlese directory, then store it in the database (-u) or print its fields
! 60: foreach my $identifier (@identifiers) {
! 61: my $record = $identifier->as_text(); # the identifier text is used as the file name under dlese/
! 62:
! 63: my $getrecord_tree = XML::TreeBuilder->new();
! 64: $getrecord_tree->parse_file("dlese/$record");
! 65:
! 66: my $metadata = $getrecord_tree->find_by_tag_name('metadata');
! 67:
! 68: # <general>: title, keywords (topics), description, language
! 69: my $general_element = $metadata->find_by_tag_name('general');
! 70: my $title_element = $general_element->find_by_tag_name('title');
! 71: my $title = $title_element->find_by_tag_name('langstring')->as_text();
! 72: # Note: DLESE appears to use general.extension.topic for keywords not general.keywords
! 73: my @keywords_list = ();
! 74: my @keywords_element = $general_element->find_by_tag_name('topic');
! 75: foreach my $topics (@keywords_element) {
! 76: push(@keywords_list,$topics->find_by_tag_name('langstring')->as_text());
! 77: }
! 78: my $keywords = join(',',@keywords_list);
! 79: my $description_element = $general_element->find_by_tag_name('description');
! 80: my $description = $description_element->find_by_tag_name('langstring')->as_text();
! 81: my $language = $general_element->find_by_tag_name('language')->as_text();
! 82:
! 83: # <metametadata>: the accession date supplies the publication year/month
! 84: my $metametadata_element = $metadata->find_by_tag_name('metametadata');
! 85: my $catalog_entry_element = $metametadata_element->find_by_tag_name('accession')->as_text();
! 86: my ($pub_year,$pub_month,$pub_day) = ( $catalog_entry_element =~ /^(\d{4})-(\d{2})-(\d{2})$/ ); # all undef unless the date is YYYY-MM-DD
! 87:
! 88: # <technical>: location and format (mapped to a numeric id)
! 89: my $technical_element = $metadata->find_by_tag_name('technical');
! 90: my $location = $technical_element->find_by_tag_name('location')->as_text;
! 91: my $format_element = $technical_element->find_by_tag_name('format');
! 92: my $format = $format_element->find_by_tag_name('langstring')->as_text;
! 93: if ( $format eq 'text/html' ) {
! 94: $format = 65; # text/html
! 95: } else {
! 96: $format = 0; # unknown
! 97: }
! 98: my $platform = "5"; # HTML Browser (not specified but construed from metadata)
! 99:
! 100: # <lifecycle>: version and the list of contributors
! 101: my $lifecycle_element=$metadata->find_by_tag_name('lifecycle');
! 102: my $version= $lifecycle_element->find_by_tag_name('version');
! 103: my $version_string = $version->find_by_tag_name('langstring')->as_text();
! 104: my @contributor_element = $lifecycle_element->find_by_tag_name('contribute');
! 105:
! 106: my @author_reg_key = ();
! 107: my @contact_reg_key = ();
! 108: my @publisher_reg_key = ();
! 109: my @other_reg_key = ();
! 110: my $reg_key;
! 111: foreach my $contributor_item (@contributor_element) {
! 112: my $role_element = $contributor_item->find_by_tag_name('role');
! 113: my $role = $role_element->find_by_tag_name('langstring')->as_text();
! 114: my $role_id;
! 115: if ( $role eq "Author" ) {
! 116: $role_id = 8;
! 117: } elsif ( $role eq "Publisher" ) {
! 118: $role_id = 9;
! 119: } elsif ( $role eq "Contact" ) {
! 120: $role_id = 11;
! 121: } else {
! 122: $role_id = 4;
! 123: }
! 124: my $centity = $contributor_item->find_by_tag_name('centity');
! 125: my $role_extension = $centity->find_by_tag_name('extension');
! 126: my $person_first_name = $role_extension->find_by_tag_name('firstname')->as_text();
! 127: my $person_last_name = $role_extension->find_by_tag_name('lastname')->as_text();
! 128: my $person_middle_name = $role_extension->find_by_tag_name('mi')->as_text();
! 129: my $person_title = $role_extension->find_by_tag_name('nametitle')->as_text();
! 130: my $person_company = $role_extension->find_by_tag_name('org')->as_text();
! 131: my $entity_email_address = $role_extension->find_by_tag_name('email')->as_text();
! 132: my $entity_address = $role_extension->find_by_tag_name('adr')->as_text();
! 133: my $entity_city = $role_extension->find_by_tag_name('city')->as_text();
! 134: my $entity_state = $role_extension->find_by_tag_name('state')->as_text();
! 135: my $entity_postal_code = $role_extension->find_by_tag_name('zip')->as_text();
! 136: my $entity_home_page_url = $role_extension->find_by_tag_name('url')->as_text();
! 137: my $entity_phone = $role_extension->find_by_tag_name('tel')->as_text();
! 138: my $entity_fax = $role_extension->find_by_tag_name('fax')->as_text();
! 139: my $entity_country = $role_extension->find_by_tag_name('country')->as_text();
! 140:
! 141: # if there is no $person_first_name, then this is an organization
! 142: my $object_type;
! 143: if ( not $person_first_name ) {
! 144: $object_type = 'organization';
! 145: } else {
! 146: $object_type = 'person';
! 147: }
! 148: if ( $useDatabase ) {
! 149: # DLESE
! 150: my $collection_reg_key = '{CA001C50-77CA-43DC-A761-95207D386EDD}';
! 151: # Andy Dong
! 152: my $submitter_key = '{710FE693-46E9-4002-BA94-1BE2E6218CD6}'; # Andy Dong
! 153: # Does this entity exist? Insert it when the lookup returns nothing, then look it up again for its key
! 154: if ( $object_type eq 'organization' ) {
! 155: if ( ! ($reg_key = OAIc_orgexists($dbh,$person_company)) ) {
! 156: my $success = OAIc_insert_org($dbh,$collection_reg_key,$submitter_key,$entity_email_address,$person_company,$entity_address,$entity_city,$entity_state,$entity_postal_code,$entity_home_page_url,$entity_phone,$entity_fax,$entity_country);
! 157: $reg_key = OAIc_orgexists($dbh,$person_company);
! 158: printf("Inserted new organization %s\n", $reg_key);
! 159: }
! 160: } else {
! 161: if ( ! ($reg_key = OAIc_personexists($dbh,$entity_email_address)) ) {
! 162: my $success = OAIc_insert_person_full($dbh,$collection_reg_key,$submitter_key,$person_last_name,$person_first_name,$entity_email_address,$person_company,$person_middle_name,$person_title,$entity_address,$entity_city,$entity_state,$entity_postal_code,$entity_home_page_url,$entity_phone,$entity_fax,$entity_country);
! 163: $reg_key = OAIc_personexists($dbh,$entity_email_address);
! 164: printf("Inserted new person %s\n", $reg_key);
! 165: }
! 166: }
! 167: # Push author, contact and publisher id into array for insert/update into the database
! 168: if ( $role_id == 8 ) {
! 169: push(@author_reg_key,$reg_key);
! 170: } elsif ( $role_id == 9 ) {
! 171: push(@publisher_reg_key,$reg_key);
! 172: } elsif ( $role_id == 11 ) {
! 173: push(@contact_reg_key,$reg_key);
! 174: } else {
! 175: push(@other_reg_key,$reg_key);
! 176: }
! 177: } else {
! 178: printf("Author/Publisher/Contact Information (%d): %s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", $role_id, $person_first_name, $person_last_name, $person_middle_name, $person_title, $person_company, $entity_email_address, $entity_address, $entity_city, $entity_state, $entity_postal_code, $entity_home_page_url, $entity_phone, $entity_fax, $entity_country);
! 179: printf("This is a(n) %s.\n", $object_type);
! 180: }
! 181: }
! 182:
! 183: # <educational>: learning contexts and intended end user role
! 184: my $educational_element = $metadata->find_by_tag_name('educational');
! 185: my @learning_context_element = $educational_element->find_by_tag_name('learningcontext');
! 186: my @learning_context_list = ();
! 187: foreach my $learning_context_item (@learning_context_element) {
! 188: push(@learning_context_list,OAIv_findLContext($learning_context_item->find_by_tag_name('langstring')->as_text()));
! 189: }
! 190: my $learning_context; # stays undef when the record lists no learning context
! 191: if (@learning_context_list) {
! 192: $learning_context = join(';',@learning_context_list);
! 193: }
! 194: my @intendedenduserrole_element = $educational_element->find_by_tag_name('intendedenduserrole');
! 195: my $intendedenduserrole_id = 0;
! 196: foreach my $intendedenduserrole_item (@intendedenduserrole_element) {
! 197: my $intendedenduserrole = $intendedenduserrole_item->find_by_tag_name('langstring')->as_text();
! 198: if ($intendedenduserrole eq 'Learner') {
! 199: $intendedenduserrole_id = 2;
! 200: } elsif ($intendedenduserrole eq 'Teacher') {
! 201: $intendedenduserrole_id = 1;
! 202: }
! 203: }
! 204:
! 205: # <rights>: cost (mapped to a numeric id) and rights description
! 206: my $rights_element = $metadata->find_by_tag_name('rights');
! 207: my $cost_element = $rights_element->find_by_tag_name('cost');
! 208: my $cost = $cost_element->find_by_tag_name('langstring')->as_text();
! 209: # This is version.purchase_license_type_id
! 210: if ( $cost eq 'No') {
! 211: $cost= 1;
! 212: } else {
! 213: $cost = 3;
! 214: }
! 215: my $rights_description_element = $rights_element->find_by_tag_name('description');
! 216: my $rights_description = $rights_description_element->find_by_tag_name('langstring')->as_text();
! 217:
! 218: if ( $useDatabase ) {
! 219: # General configuration information for DLESE
! 220: my $image = "http://www.smete.org/images/affiliation/dlese.gif";
! 221: my $submitter_key = '{710FE693-46E9-4002-BA94-1BE2E6218CD6}'; # Andy Dong
! 222: my $collection = 'Digital Library for Earth System Education';
! 223: my $collection_reg_key = '{CA001C50-77CA-43DC-A761-95207D386EDD}';
! 224: my $publisher = 'Digital Library for Earth System Education';
! 225: push(@publisher_reg_key,'{CA001C50-77CA-43DC-A761-95207D386EDD}');
! 226: if ( my $general_key = OAIc_loexists($dbh,$title) ) {
! 227: # my $success = OAIc_update_lo_dlese($dbh, $general_key, $title, $language, $description, $image, $pub_month, $pub_year, $keywords, $submitter_key, $publisher, $collection, $format, $platform, $location, $learning_context, $intendedenduserrole_id, $collection_reg_key, $rights_description, $cost);
! 228: my $success = OAIc_update_lo_dlese($dbh, $general_key, $learning_context, $intendedenduserrole_id, $rights_description, $cost);
! 229: $updated = $updated + 1;
! 230: } else {
! 231: printf("Inserting new record for %s\n",$title);
! 232: my $success = OAIc_insert_lo_dlese($dbh, $title, $language, $description, $image, $pub_month, $pub_year, $keywords, $submitter_key, $publisher, $collection, $format, $platform, $location, $learning_context, $intendedenduserrole_id, $collection_reg_key, $rights_description, $cost);
! 233: my $id = OAIc_loexists($dbh,$title);
! 234: # INSERT INTO [needs_3_1]..learning_object_contributor
! 235: # Add author contribution (ct_key=8)
! 236: foreach my $author (@author_reg_key) {
! 237: printf("Adding author contribution for %s\n", $author);
! 238: my $rc = $dbh->do(q{INSERT INTO learning_object_contributor (learning_object_id, entity_id, role_id) VALUES (?,?,8)}, undef, $id,$author);
! 239: }
! 240: # Add publisher contribution (ct_key=9)
! 241: foreach my $publisher_id (@publisher_reg_key) {
! 242: printf("Adding publisher contribution for %s\n", $publisher_id);
! 243: my $rc = $dbh->do(q{INSERT INTO learning_object_contributor (learning_object_id, entity_id, role_id) VALUES (?,?,9)}, undef, $id,$publisher_id);
! 244: }
! 245: # Add contact contribution (ct_key=11)
! 246: foreach my $contact (@contact_reg_key) {
! 247: printf("Adding contact contribution for %s\n", $contact);
! 248: my $rc = $dbh->do(q{INSERT INTO learning_object_contributor (learning_object_id, entity_id, role_id) VALUES (?,?,11)}, undef, $id, $contact);
! 249: }
! 250: # Add other contribution (ct_key=4)
! 251: foreach my $other (@other_reg_key) {
! 252: printf("Adding other contribution for %s\n", $other);
! 253: my $rc = $dbh->do(q{INSERT INTO learning_object_contributor (learning_object_id, entity_id, role_id) VALUES (?,?,4)}, undef, $id, $other);
! 254: }
! 255: # Add collection contribution (ct_key=12)
! 256: my $rc = $dbh->do(q{INSERT INTO learning_object_contributor (learning_object_id, entity_id, role_id) VALUES (?,?,12)}, undef, $id, $collection_reg_key);
! 257: if (!$rc) {
! 258: my $db_error = $dbh->errstr || 'unknown error'; # capture first: "$dbh->errstr" does not interpolate a method call, and later DBI calls reset it
! 259: $dbh->rollback;
! 260: $dbh->disconnect;
! 261: die "Unable to insert new record into learning_object_contributor: $db_error\n";
! 262: }
! 263: $inserted = $inserted + 1;
! 264: }
! 265: $dbh->commit; # AutoCommit is off, so each record is committed explicitly
! 266: } else {
! 267: # Print parsed data; undef values are defaulted so -w stays quiet while the output is unchanged
! 268: printf("Title: %s\tKeywords: %s\tDescription: %s\n", $title,$keywords,$description);
! 269: printf("Publication %d-%d\n", $pub_month || 0, $pub_year || 0);
! 270: printf("Format: %s\tPlatform: %s\tLocation: %s\n", $format, $platform, $location);
! 271: printf("Learning Context: %s\tIntended End User Role: %d\n", defined $learning_context ? $learning_context : '',$intendedenduserrole_id);
! 272: printf("Cost: %s\tCopyright: %s\n", $cost, substr($rights_description,0,1024));
! 273: }
! 274: # Release the per-record parse tree in both output modes
! 275: $getrecord_tree->delete;
! 276: }
! 277:
! 278: $tree->delete(); # every record is processed; drop the identifier tree
! 279:
! 280: # A database handle exists only when -u was given
! 281: $dbh->disconnect() if ( $useDatabase );
! 282:
! 283: # Summary of the run: counters plus seconds elapsed since $t0
! 284: my $elapsed_seconds = tv_interval($t0);
! 285: printf("Inserted %d records and Updated %d records in %f seconds.\n", $inserted, $updated, $elapsed_seconds);
! 286: exit(0);
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>