Fix overly optimistic JSTOR scraper

This commit is contained in:
Simon Kornblith 2006-06-20 17:06:41 +00:00
parent 968348a5d1
commit 09d79d6dd7
3 changed files with 69 additions and 38 deletions

View file

@ -107,6 +107,7 @@ Scholar_Ingester_Interface.Listener.onStateChange = function() {}
* appropriate status indicator for the current tab, and to free useless objects * appropriate status indicator for the current tab, and to free useless objects
*/ */
Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject) { Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject) {
Scholar.debug("onLocationChange called");
var browsers = Scholar_Ingester_Interface.tabBrowser.browsers; var browsers = Scholar_Ingester_Interface.tabBrowser.browsers;
// Remove document object of any browser that no longer exists // Remove document object of any browser that no longer exists
@ -213,33 +214,38 @@ Scholar_Ingester_Interface._deleteDocument = function(browser) {
*/ */
Scholar_Ingester_Interface._finishScraping = function(obj) { Scholar_Ingester_Interface._finishScraping = function(obj) {
if(obj.items.length) { if(obj.items.length) {
var item1 = obj.items[0]; try { // Encased in a try block to work around an as-yet-unresolved issue
var item1 = obj.items[0];
Scholar_Ingester_Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeComplete")); Scholar_Ingester_Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeComplete"));
var fields = Scholar.ItemFields.getItemTypeFields(item1.getField("itemTypeID")); var fields = Scholar.ItemFields.getItemTypeFields(item1.getField("itemTypeID"));
var titleLabel = Scholar.getString("itemFields.title") + ":" // Display title and creators
Scholar_Ingester_Interface.scrapeProgress.addResult(titleLabel, item1.getField("title")); var titleLabel = Scholar.getString("itemFields.title") + ":"
var creators = item1.numCreators(); Scholar_Ingester_Interface.scrapeProgress.addResult(titleLabel, item1.getField("title"));
if(creators) { var creators = item1.numCreators();
for(var i=0; i<creators; i++) { if(creators) {
var creator = item1.getCreator(i); for(var i=0; i<creators; i++) {
var label = Scholar.getString("creatorTypes."+Scholar.CreatorTypes.getTypeName(creator.creatorTypeID)) + ":"; var creator = item1.getCreator(i);
var data = creator.firstName + ' ' + creator.lastName; var label = Scholar.getString("creatorTypes."+Scholar.CreatorTypes.getTypeName(creator.creatorTypeID)) + ":";
Scholar_Ingester_Interface.scrapeProgress.addResult(label, data); var data = creator.firstName + ' ' + creator.lastName;
}
}
for(i in fields) {
var data = item1.getField(fields[i]);
if(data) {
var name = Scholar.ItemFields.getName(fields[i]);
if(name != "source") {
var label = Scholar.getString("itemFields."+ name) + ":";
Scholar_Ingester_Interface.scrapeProgress.addResult(label, data); Scholar_Ingester_Interface.scrapeProgress.addResult(label, data);
} }
} }
// Add additional fields for display
for(i in fields) {
var data = item1.getField(fields[i]);
if(data) {
var name = Scholar.ItemFields.getName(fields[i]);
if(name != "source") {
var label = Scholar.getString("itemFields."+ name) + ":";
Scholar_Ingester_Interface.scrapeProgress.addResult(label, data);
}
}
}
} catch(ex) {
} }
// Save items // Save items

View file

@ -333,15 +333,13 @@ Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, mo
model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN '); model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN ');
// Extract creators // Extract creators
model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this.cleanAuthor); model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this.cleanAuthor);
model = this._MARCAssociateField(record, uri, model, '110', prefixDC + 'creator', this._MARCCleanString); model = this._MARCAssociateField(record, uri, model, '110', prefixDummy + 'corporateCreator', this._MARCCleanString);
model = this._MARCAssociateField(record, uri, model, '111', prefixDC + 'creator', this._MARCCleanString); model = this._MARCAssociateField(record, uri, model, '111', prefixDummy + 'corporateCreator', this._MARCCleanString);
model = this._MARCAssociateField(record, uri, model, '130', prefixDC + 'creator', this._MARCCleanString);
model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this.cleanAuthor); model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this.cleanAuthor);
model = this._MARCAssociateField(record, uri, model, '710', prefixDC + 'contributor', this._MARCCleanString); model = this._MARCAssociateField(record, uri, model, '710', prefixDummy + 'corporateContributor', this._MARCCleanString);
model = this._MARCAssociateField(record, uri, model, '711', prefixDC + 'contributor', this._MARCCleanString); model = this._MARCAssociateField(record, uri, model, '711', prefixDummy + 'corporateContributor', this._MARCCleanString);
model = this._MARCAssociateField(record, uri, model, '730', prefixDC + 'contributor', this._MARCCleanString); if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'] && !model.data[uri][prefixDummy + 'corporateCreator'] && !model.data[uri][prefixDummy + 'corporateContributor'])) {
if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'])) { // some LOC entries have no listed author, but have the author // some LOC entries have no listed author, but have the author in the person subject field as the first entry
// in the person subject field as the first entry
var field = record.get_field_subfields('600'); var field = record.get_field_subfields('600');
if(field[0]) { if(field[0]) {
model.addStatement(uri, prefixDC + 'creator', this.cleanAuthor(field[0]['a'])); model.addStatement(uri, prefixDC + 'creator', this.cleanAuthor(field[0]['a']));
@ -694,6 +692,18 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
creatorIndex++; creatorIndex++;
} }
} }
if(this.model.data[uri][prefixDummy + 'corporateCreator']) {
for(i in this.model.data[uri][prefixDummy + 'corporateCreator']) {
newItem.setCreator(creatorIndex, this.model.data[uri][prefixDummy + 'corporateCreator'][i], null, 1);
creatorIndex++;
}
}
if(this.model.data[uri][prefixDummy + 'corporateContributor']) {
for(i in this.model.data[uri][prefixDummy + 'corporateContributor']) {
newItem.setCreator(creatorIndex, this.model.data[uri][prefixDummy + 'corporateContributor'][i], null, 2);
creatorIndex++;
}
}
// Handle years, extracting from date if necessary // Handle years, extracting from date if necessary
if(Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("year"), typeID)) { if(Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("year"), typeID)) {

View file

@ -234,7 +234,24 @@ utilities.HTTPUtilities.doGet(newUri+''?''+postString, null, function(text) {
}) })
wait();'); wait();');
REPLACE INTO "scrapers" VALUES('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-18 11:02:00', 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; REPLACE INTO "scrapers" VALUES('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-18 11:02:00', 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse)',
'var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
if (prefix == ''x'') return namespace; else return null;
} : null;
// If this is a view page, find the link to the citation
var xpath = ''/html/body/div[@class="indent"]/center/font/p/a[@class="nav"]'';
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
if(!elmts.length) {
var xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]'';
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
}
utilities.debugPrint(elmts.length);
if(elmts && elmts.length) {
return true;
}
return false;', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDC = ''http://purl.org/dc/elements/1.1/'';
var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
@ -249,15 +266,13 @@ var uri = doc.location.href;
// If this is a view page, find the link to the citation // If this is a view page, find the link to the citation
var xpath = ''/html/body/div[@class="indent"]/center/font/p/a[@class="nav"]''; var xpath = ''/html/body/div[@class="indent"]/center/font/p/a[@class="nav"]'';
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
if(!elmts) { if(!elmts.length) {
var xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]''; var xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]'';
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
} }
if(!elmts) {
exit; var saveCitation = elmts[0].href;
} var viewSavedCitations = elmts[1].href;
var saveCitation = utilities.getNode(doc, elmts[0], ''.'', nsResolver).href;
var viewSavedCitations = utilities.getNode(doc, elmts[1], ''.'', nsResolver).href;
saveCitation = saveCitation.replace(''citationAction=remove'', ''citationAction=save''); saveCitation = saveCitation.replace(''citationAction=remove'', ''citationAction=save'');
// Parse save citation link // Parse save citation link