Fix overly optimistic JSTOR scraper
This commit is contained in:
parent
968348a5d1
commit
09d79d6dd7
3 changed files with 69 additions and 38 deletions
|
@ -107,6 +107,7 @@ Scholar_Ingester_Interface.Listener.onStateChange = function() {}
|
|||
* appropriate status indicator for the current tab, and to free useless objects
|
||||
*/
|
||||
Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject) {
|
||||
Scholar.debug("onLocationChange called");
|
||||
var browsers = Scholar_Ingester_Interface.tabBrowser.browsers;
|
||||
|
||||
// Remove document object of any browser that no longer exists
|
||||
|
@ -213,12 +214,14 @@ Scholar_Ingester_Interface._deleteDocument = function(browser) {
|
|||
*/
|
||||
Scholar_Ingester_Interface._finishScraping = function(obj) {
|
||||
if(obj.items.length) {
|
||||
try { // Encased in a try block to work around an as-yet unresolved issue
|
||||
var item1 = obj.items[0];
|
||||
|
||||
Scholar_Ingester_Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeComplete"));
|
||||
|
||||
var fields = Scholar.ItemFields.getItemTypeFields(item1.getField("itemTypeID"));
|
||||
|
||||
// Display title and creators
|
||||
var titleLabel = Scholar.getString("itemFields.title") + ":"
|
||||
Scholar_Ingester_Interface.scrapeProgress.addResult(titleLabel, item1.getField("title"));
|
||||
var creators = item1.numCreators();
|
||||
|
@ -231,6 +234,7 @@ Scholar_Ingester_Interface._finishScraping = function(obj) {
|
|||
}
|
||||
}
|
||||
|
||||
// Add additional fields for display
|
||||
for(i in fields) {
|
||||
var data = item1.getField(fields[i]);
|
||||
if(data) {
|
||||
|
@ -241,6 +245,8 @@ Scholar_Ingester_Interface._finishScraping = function(obj) {
|
|||
}
|
||||
}
|
||||
}
|
||||
} catch(ex) {
|
||||
}
|
||||
|
||||
// Save items
|
||||
for(i in obj.items) {
|
||||
|
|
|
@ -333,15 +333,13 @@ Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, mo
|
|||
model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN ');
|
||||
// Extract creators
|
||||
model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this.cleanAuthor);
|
||||
model = this._MARCAssociateField(record, uri, model, '110', prefixDC + 'creator', this._MARCCleanString);
|
||||
model = this._MARCAssociateField(record, uri, model, '111', prefixDC + 'creator', this._MARCCleanString);
|
||||
model = this._MARCAssociateField(record, uri, model, '130', prefixDC + 'creator', this._MARCCleanString);
|
||||
model = this._MARCAssociateField(record, uri, model, '110', prefixDummy + 'corporateCreator', this._MARCCleanString);
|
||||
model = this._MARCAssociateField(record, uri, model, '111', prefixDummy + 'corporateCreator', this._MARCCleanString);
|
||||
model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this.cleanAuthor);
|
||||
model = this._MARCAssociateField(record, uri, model, '710', prefixDC + 'contributor', this._MARCCleanString);
|
||||
model = this._MARCAssociateField(record, uri, model, '711', prefixDC + 'contributor', this._MARCCleanString);
|
||||
model = this._MARCAssociateField(record, uri, model, '730', prefixDC + 'contributor', this._MARCCleanString);
|
||||
if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'])) { // some LOC entries have no listed author, but have the author
|
||||
// in the person subject field as the first entry
|
||||
model = this._MARCAssociateField(record, uri, model, '710', prefixDummy + 'corporateContributor', this._MARCCleanString);
|
||||
model = this._MARCAssociateField(record, uri, model, '711', prefixDummy + 'corporateContributor', this._MARCCleanString);
|
||||
if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'] && !model.data[uri][prefixDummy + 'corporateCreator'] && !model.data[uri][prefixDummy + 'corporateContributor'])) {
|
||||
// some LOC entries have no listed author, but have the author in the person subject field as the first entry
|
||||
var field = record.get_field_subfields('600');
|
||||
if(field[0]) {
|
||||
model.addStatement(uri, prefixDC + 'creator', this.cleanAuthor(field[0]['a']));
|
||||
|
@ -694,6 +692,18 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
|
|||
creatorIndex++;
|
||||
}
|
||||
}
|
||||
if(this.model.data[uri][prefixDummy + 'corporateCreator']) {
|
||||
for(i in this.model.data[uri][prefixDummy + 'corporateCreator']) {
|
||||
newItem.setCreator(creatorIndex, this.model.data[uri][prefixDummy + 'corporateCreator'][i], null, 1);
|
||||
creatorIndex++;
|
||||
}
|
||||
}
|
||||
if(this.model.data[uri][prefixDummy + 'corporateContributor']) {
|
||||
for(i in this.model.data[uri][prefixDummy + 'corporateContributor']) {
|
||||
newItem.setCreator(creatorIndex, this.model.data[uri][prefixDummy + 'corporateContributor'][i], null, 2);
|
||||
creatorIndex++;
|
||||
}
|
||||
}
|
||||
|
||||
// Handle years, extracting from date if necessary
|
||||
if(Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("year"), typeID)) {
|
||||
|
|
29
scrapers.sql
29
scrapers.sql
|
@ -234,7 +234,24 @@ utilities.HTTPUtilities.doGet(newUri+''?''+postString, null, function(text) {
|
|||
})
|
||||
wait();');
|
||||
|
||||
REPLACE INTO "scrapers" VALUES('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-18 11:02:00', 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
||||
REPLACE INTO "scrapers" VALUES('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-18 11:02:00', 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse)',
|
||||
'var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
// If this is a view page, find the link to the citation
|
||||
var xpath = ''/html/body/div[@class="indent"]/center/font/p/a[@class="nav"]'';
|
||||
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||
if(!elmts.length) {
|
||||
var xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]'';
|
||||
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||
}
|
||||
utilities.debugPrint(elmts.length);
|
||||
if(elmts && elmts.length) {
|
||||
return true;
|
||||
}
|
||||
return false;', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
||||
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
||||
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
||||
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
||||
|
@ -249,15 +266,13 @@ var uri = doc.location.href;
|
|||
// If this is a view page, find the link to the citation
|
||||
var xpath = ''/html/body/div[@class="indent"]/center/font/p/a[@class="nav"]'';
|
||||
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||
if(!elmts) {
|
||||
if(!elmts.length) {
|
||||
var xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]'';
|
||||
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||
}
|
||||
if(!elmts) {
|
||||
exit;
|
||||
}
|
||||
var saveCitation = utilities.getNode(doc, elmts[0], ''.'', nsResolver).href;
|
||||
var viewSavedCitations = utilities.getNode(doc, elmts[1], ''.'', nsResolver).href;
|
||||
|
||||
var saveCitation = elmts[0].href;
|
||||
var viewSavedCitations = elmts[1].href;
|
||||
saveCitation = saveCitation.replace(''citationAction=remove'', ''citationAction=save'');
|
||||
|
||||
// Parse save citation link
|
||||
|
|
Loading…
Reference in a new issue