Fix overly optimistic JSTOR scraper

2006-06-20 17:06:41 +00:00 · 2006-06-20 17:06:41 +00:00 · 09d79d6dd7
commit 09d79d6dd7
parent 968348a5d1
3 changed files with 69 additions and 38 deletions
--- a/chrome/chromeFiles/content/scholar/ingester/browser.js
+++ b/chrome/chromeFiles/content/scholar/ingester/browser.js
@ -107,6 +107,7 @@ Scholar_Ingester_Interface.Listener.onStateChange = function() {}
 * appropriate status indicator for the current tab, and to free useless objects
 */
 Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject) {
+	Scholar.debug("onLocationChange called");
    var browsers = Scholar_Ingester_Interface.tabBrowser.browsers;

    // Remove document object of any browser that no longer exists
@ -213,12 +214,14 @@ Scholar_Ingester_Interface._deleteDocument = function(browser) {
 */
 Scholar_Ingester_Interface._finishScraping = function(obj) {
 	if(obj.items.length) {
+		try {		// Encased in a try block to work around an as-yet-unresolved issue
 			var item1 = obj.items[0];
 			
 			Scholar_Ingester_Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeComplete"));
 			
 			var fields = Scholar.ItemFields.getItemTypeFields(item1.getField("itemTypeID"));
 			
+			// Display title and creators
 			var titleLabel = Scholar.getString("itemFields.title") + ":"
 			Scholar_Ingester_Interface.scrapeProgress.addResult(titleLabel, item1.getField("title"));
 			var creators = item1.numCreators();
@ -231,6 +234,7 @@ Scholar_Ingester_Interface._finishScraping = function(obj) {
 				}
 			}
 			
+			// Add additional fields for display
 			for(i in fields) {
 				var data = item1.getField(fields[i]);
 				if(data) {
@ -241,6 +245,8 @@ Scholar_Ingester_Interface._finishScraping = function(obj) {
 					}
 				}
 			}
+		} catch(ex) {
+		}
 		
 		// Save items
 		for(i in obj.items) {
--- a/chrome/chromeFiles/content/scholar/xpcom/ingester.js
+++ b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
@ -333,15 +333,13 @@ Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, mo
 	model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN ');
 	// Extract creators
 	model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this.cleanAuthor);
-	model = this._MARCAssociateField(record, uri, model, '110', prefixDC + 'creator', this._MARCCleanString);
-	model = this._MARCAssociateField(record, uri, model, '111', prefixDC + 'creator', this._MARCCleanString);
-	model = this._MARCAssociateField(record, uri, model, '130', prefixDC + 'creator', this._MARCCleanString);
+	model = this._MARCAssociateField(record, uri, model, '110', prefixDummy + 'corporateCreator', this._MARCCleanString);
+	model = this._MARCAssociateField(record, uri, model, '111', prefixDummy + 'corporateCreator', this._MARCCleanString);
 	model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this.cleanAuthor);
-	model = this._MARCAssociateField(record, uri, model, '710', prefixDC + 'contributor', this._MARCCleanString);
-	model = this._MARCAssociateField(record, uri, model, '711', prefixDC + 'contributor', this._MARCCleanString);
-	model = this._MARCAssociateField(record, uri, model, '730', prefixDC + 'contributor', this._MARCCleanString);
-	if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'])) {	// some LOC entries have no listed author, but have the author
-													// in the person subject field as the first entry
+	model = this._MARCAssociateField(record, uri, model, '710', prefixDummy + 'corporateContributor', this._MARCCleanString);
+	model = this._MARCAssociateField(record, uri, model, '711', prefixDummy + 'corporateContributor', this._MARCCleanString);
+	if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'] && !model.data[uri][prefixDummy + 'corporateCreator'] && !model.data[uri][prefixDummy + 'corporateContributor'])) {
+		// some LOC entries have no listed author, but have the author in the person subject field as the first entry
 		var field = record.get_field_subfields('600');
 		if(field[0]) {
 			model.addStatement(uri, prefixDC + 'creator', this.cleanAuthor(field[0]['a']));	
@ -694,6 +692,18 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
 					creatorIndex++;
 				}
 			}
+			if(this.model.data[uri][prefixDummy + 'corporateCreator']) {
+				for(i in this.model.data[uri][prefixDummy + 'corporateCreator']) {
+					newItem.setCreator(creatorIndex, this.model.data[uri][prefixDummy + 'corporateCreator'][i], null, 1);
+					creatorIndex++;
+				}
+			}
+			if(this.model.data[uri][prefixDummy + 'corporateContributor']) {
+				for(i in this.model.data[uri][prefixDummy + 'corporateContributor']) {
+					newItem.setCreator(creatorIndex, this.model.data[uri][prefixDummy + 'corporateContributor'][i], null, 2);
+					creatorIndex++;
+				}
+			}
 			
 			// Handle years, extracting from date if necessary
 			if(Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("year"), typeID)) {
--- a/scrapers.sql
+++ b/scrapers.sql
@ -234,7 +234,24 @@ utilities.HTTPUtilities.doGet(newUri+''?''+postString, null, function(text) {
 })
 wait();');

-REPLACE INTO "scrapers" VALUES('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-18 11:02:00', 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+REPLACE INTO "scrapers" VALUES('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-18 11:02:00', 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse)', 
+'var namespace = doc.documentElement.namespaceURI;
+var nsResolver = namespace ? function(prefix) {
+	if (prefix == ''x'') return namespace; else return null;
+} : null;
+
+// If this is a view page, find the link to the citation
+var xpath = ''/html/body/div[@class="indent"]/center/font/p/a[@class="nav"]'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+if(!elmts.length) {
+	var xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]'';
+	var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+}
+utilities.debugPrint(elmts.length);
+if(elmts && elmts.length) {
+	return true;
+}
+return false;', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
 var prefixDC = ''http://purl.org/dc/elements/1.1/'';
 var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
 var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
@ -249,15 +266,13 @@ var uri = doc.location.href;
 // If this is a view page, find the link to the citation
 var xpath = ''/html/body/div[@class="indent"]/center/font/p/a[@class="nav"]'';
 var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
-if(!elmts) {
+if(!elmts.length) {
 	var xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]'';
 	var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
 }
-if(!elmts) {
-	exit;
-}
-var saveCitation = utilities.getNode(doc, elmts[0], ''.'', nsResolver).href;
-var viewSavedCitations = utilities.getNode(doc, elmts[1], ''.'', nsResolver).href;
+
+var saveCitation = elmts[0].href;
+var viewSavedCitations = elmts[1].href;
 saveCitation = saveCitation.replace(''citationAction=remove'', ''citationAction=save'');

 // Parse save citation link