Fix overly optimistic JSTOR scraper
This commit is contained in:
parent
968348a5d1
commit
09d79d6dd7
3 changed files with 69 additions and 38 deletions
|
@ -107,6 +107,7 @@ Scholar_Ingester_Interface.Listener.onStateChange = function() {}
|
||||||
* appropriate status indicator for the current tab, and to free useless objects
|
* appropriate status indicator for the current tab, and to free useless objects
|
||||||
*/
|
*/
|
||||||
Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject) {
|
Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject) {
|
||||||
|
Scholar.debug("onLocationChange called");
|
||||||
var browsers = Scholar_Ingester_Interface.tabBrowser.browsers;
|
var browsers = Scholar_Ingester_Interface.tabBrowser.browsers;
|
||||||
|
|
||||||
// Remove document object of any browser that no longer exists
|
// Remove document object of any browser that no longer exists
|
||||||
|
@ -213,33 +214,38 @@ Scholar_Ingester_Interface._deleteDocument = function(browser) {
|
||||||
*/
|
*/
|
||||||
Scholar_Ingester_Interface._finishScraping = function(obj) {
|
Scholar_Ingester_Interface._finishScraping = function(obj) {
|
||||||
if(obj.items.length) {
|
if(obj.items.length) {
|
||||||
var item1 = obj.items[0];
|
try { // Encased in a try block to work around an as-yet-unresolved issue
|
||||||
|
var item1 = obj.items[0];
|
||||||
Scholar_Ingester_Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeComplete"));
|
|
||||||
|
|
||||||
var fields = Scholar.ItemFields.getItemTypeFields(item1.getField("itemTypeID"));
|
|
||||||
|
|
||||||
var titleLabel = Scholar.getString("itemFields.title") + ":"
|
Scholar_Ingester_Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeComplete"));
|
||||||
Scholar_Ingester_Interface.scrapeProgress.addResult(titleLabel, item1.getField("title"));
|
|
||||||
var creators = item1.numCreators();
|
var fields = Scholar.ItemFields.getItemTypeFields(item1.getField("itemTypeID"));
|
||||||
if(creators) {
|
|
||||||
for(var i=0; i<creators; i++) {
|
// Display title and creators
|
||||||
var creator = item1.getCreator(i);
|
var titleLabel = Scholar.getString("itemFields.title") + ":"
|
||||||
var label = Scholar.getString("creatorTypes."+Scholar.CreatorTypes.getTypeName(creator.creatorTypeID)) + ":";
|
Scholar_Ingester_Interface.scrapeProgress.addResult(titleLabel, item1.getField("title"));
|
||||||
var data = creator.firstName + ' ' + creator.lastName;
|
var creators = item1.numCreators();
|
||||||
Scholar_Ingester_Interface.scrapeProgress.addResult(label, data);
|
if(creators) {
|
||||||
}
|
for(var i=0; i<creators; i++) {
|
||||||
}
|
var creator = item1.getCreator(i);
|
||||||
|
var label = Scholar.getString("creatorTypes."+Scholar.CreatorTypes.getTypeName(creator.creatorTypeID)) + ":";
|
||||||
for(i in fields) {
|
var data = creator.firstName + ' ' + creator.lastName;
|
||||||
var data = item1.getField(fields[i]);
|
|
||||||
if(data) {
|
|
||||||
var name = Scholar.ItemFields.getName(fields[i]);
|
|
||||||
if(name != "source") {
|
|
||||||
var label = Scholar.getString("itemFields."+ name) + ":";
|
|
||||||
Scholar_Ingester_Interface.scrapeProgress.addResult(label, data);
|
Scholar_Ingester_Interface.scrapeProgress.addResult(label, data);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Add additional fields for display
|
||||||
|
for(i in fields) {
|
||||||
|
var data = item1.getField(fields[i]);
|
||||||
|
if(data) {
|
||||||
|
var name = Scholar.ItemFields.getName(fields[i]);
|
||||||
|
if(name != "source") {
|
||||||
|
var label = Scholar.getString("itemFields."+ name) + ":";
|
||||||
|
Scholar_Ingester_Interface.scrapeProgress.addResult(label, data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch(ex) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Save items
|
// Save items
|
||||||
|
|
|
@ -333,15 +333,13 @@ Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, mo
|
||||||
model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN ');
|
model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN ');
|
||||||
// Extract creators
|
// Extract creators
|
||||||
model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this.cleanAuthor);
|
model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this.cleanAuthor);
|
||||||
model = this._MARCAssociateField(record, uri, model, '110', prefixDC + 'creator', this._MARCCleanString);
|
model = this._MARCAssociateField(record, uri, model, '110', prefixDummy + 'corporateCreator', this._MARCCleanString);
|
||||||
model = this._MARCAssociateField(record, uri, model, '111', prefixDC + 'creator', this._MARCCleanString);
|
model = this._MARCAssociateField(record, uri, model, '111', prefixDummy + 'corporateCreator', this._MARCCleanString);
|
||||||
model = this._MARCAssociateField(record, uri, model, '130', prefixDC + 'creator', this._MARCCleanString);
|
|
||||||
model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this.cleanAuthor);
|
model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this.cleanAuthor);
|
||||||
model = this._MARCAssociateField(record, uri, model, '710', prefixDC + 'contributor', this._MARCCleanString);
|
model = this._MARCAssociateField(record, uri, model, '710', prefixDummy + 'corporateContributor', this._MARCCleanString);
|
||||||
model = this._MARCAssociateField(record, uri, model, '711', prefixDC + 'contributor', this._MARCCleanString);
|
model = this._MARCAssociateField(record, uri, model, '711', prefixDummy + 'corporateContributor', this._MARCCleanString);
|
||||||
model = this._MARCAssociateField(record, uri, model, '730', prefixDC + 'contributor', this._MARCCleanString);
|
if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'] && !model.data[uri][prefixDummy + 'corporateCreator'] && !model.data[uri][prefixDummy + 'corporateContributor'])) {
|
||||||
if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'])) { // some LOC entries have no listed author, but have the author
|
// some LOC entries have no listed author, but have the author in the person subject field as the first entry
|
||||||
// in the person subject field as the first entry
|
|
||||||
var field = record.get_field_subfields('600');
|
var field = record.get_field_subfields('600');
|
||||||
if(field[0]) {
|
if(field[0]) {
|
||||||
model.addStatement(uri, prefixDC + 'creator', this.cleanAuthor(field[0]['a']));
|
model.addStatement(uri, prefixDC + 'creator', this.cleanAuthor(field[0]['a']));
|
||||||
|
@ -694,6 +692,18 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
|
||||||
creatorIndex++;
|
creatorIndex++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if(this.model.data[uri][prefixDummy + 'corporateCreator']) {
|
||||||
|
for(i in this.model.data[uri][prefixDummy + 'corporateCreator']) {
|
||||||
|
newItem.setCreator(creatorIndex, this.model.data[uri][prefixDummy + 'corporateCreator'][i], null, 1);
|
||||||
|
creatorIndex++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(this.model.data[uri][prefixDummy + 'corporateContributor']) {
|
||||||
|
for(i in this.model.data[uri][prefixDummy + 'corporateContributor']) {
|
||||||
|
newItem.setCreator(creatorIndex, this.model.data[uri][prefixDummy + 'corporateContributor'][i], null, 2);
|
||||||
|
creatorIndex++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Handle years, extracting from date if necessary
|
// Handle years, extracting from date if necessary
|
||||||
if(Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("year"), typeID)) {
|
if(Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("year"), typeID)) {
|
||||||
|
|
29
scrapers.sql
29
scrapers.sql
|
@ -234,7 +234,24 @@ utilities.HTTPUtilities.doGet(newUri+''?''+postString, null, function(text) {
|
||||||
})
|
})
|
||||||
wait();');
|
wait();');
|
||||||
|
|
||||||
REPLACE INTO "scrapers" VALUES('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-18 11:02:00', 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
REPLACE INTO "scrapers" VALUES('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-18 11:02:00', 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse)',
|
||||||
|
'var namespace = doc.documentElement.namespaceURI;
|
||||||
|
var nsResolver = namespace ? function(prefix) {
|
||||||
|
if (prefix == ''x'') return namespace; else return null;
|
||||||
|
} : null;
|
||||||
|
|
||||||
|
// If this is a view page, find the link to the citation
|
||||||
|
var xpath = ''/html/body/div[@class="indent"]/center/font/p/a[@class="nav"]'';
|
||||||
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||||
|
if(!elmts.length) {
|
||||||
|
var xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]'';
|
||||||
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||||
|
}
|
||||||
|
utilities.debugPrint(elmts.length);
|
||||||
|
if(elmts && elmts.length) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
||||||
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
||||||
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
||||||
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
||||||
|
@ -249,15 +266,13 @@ var uri = doc.location.href;
|
||||||
// If this is a view page, find the link to the citation
|
// If this is a view page, find the link to the citation
|
||||||
var xpath = ''/html/body/div[@class="indent"]/center/font/p/a[@class="nav"]'';
|
var xpath = ''/html/body/div[@class="indent"]/center/font/p/a[@class="nav"]'';
|
||||||
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||||
if(!elmts) {
|
if(!elmts.length) {
|
||||||
var xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]'';
|
var xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]'';
|
||||||
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||||
}
|
}
|
||||||
if(!elmts) {
|
|
||||||
exit;
|
var saveCitation = elmts[0].href;
|
||||||
}
|
var viewSavedCitations = elmts[1].href;
|
||||||
var saveCitation = utilities.getNode(doc, elmts[0], ''.'', nsResolver).href;
|
|
||||||
var viewSavedCitations = utilities.getNode(doc, elmts[1], ''.'', nsResolver).href;
|
|
||||||
saveCitation = saveCitation.replace(''citationAction=remove'', ''citationAction=save'');
|
saveCitation = saveCitation.replace(''citationAction=remove'', ''citationAction=save'');
|
||||||
|
|
||||||
// Parse save citation link
|
// Parse save citation link
|
||||||
|
|
Loading…
Reference in a new issue