- Make scrapers return standard ISO-style YYYY-MM-DD dates. Still need to work on journal article scrapers.
- Ingester lets callback function save items, rather than saving them itself.
- Better handling of multiple items in API, although no scrapers currently implement this.
This commit is contained in:
parent
953b1f9d20
commit
3d881eec13
3 changed files with 148 additions and 113 deletions
|
@ -211,18 +211,20 @@ Scholar.Ingester.Interface._deleteDocument = function(browser) {
|
||||||
/*
|
/*
|
||||||
* Callback to be executed when scraping is complete
|
* Callback to be executed when scraping is complete
|
||||||
*/
|
*/
|
||||||
Scholar.Ingester.Interface._finishScraping = function(documentObject) {
|
Scholar.Ingester.Interface._finishScraping = function(obj) {
|
||||||
if(documentObject.item) {
|
if(obj.items.length) {
|
||||||
|
var item1 = obj.items[0];
|
||||||
|
|
||||||
Scholar.Ingester.Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeComplete"));
|
Scholar.Ingester.Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeComplete"));
|
||||||
|
|
||||||
var fields = Scholar.ItemFields.getItemTypeFields(documentObject.item.getField("itemTypeID"));
|
var fields = Scholar.ItemFields.getItemTypeFields(item1.getField("itemTypeID"));
|
||||||
|
|
||||||
var titleLabel = Scholar.getString("itemFields.title") + ":"
|
var titleLabel = Scholar.getString("itemFields.title") + ":"
|
||||||
Scholar.Ingester.Interface.scrapeProgress.addResult(titleLabel, this.item.getField("title"));
|
Scholar.Ingester.Interface.scrapeProgress.addResult(titleLabel, item1.getField("title"));
|
||||||
var creators = documentObject.item.numCreators();
|
var creators = item1.numCreators();
|
||||||
if(creators) {
|
if(creators) {
|
||||||
for(var i=0; i<creators; i++) {
|
for(var i=0; i<creators; i++) {
|
||||||
var creator = documentObject.item.getCreator(i);
|
var creator = item1.getCreator(i);
|
||||||
var label = Scholar.getString("creatorTypes."+Scholar.CreatorTypes.getTypeName(creator.creatorTypeID)) + ":";
|
var label = Scholar.getString("creatorTypes."+Scholar.CreatorTypes.getTypeName(creator.creatorTypeID)) + ":";
|
||||||
var data = creator.firstName + ' ' + creator.lastName;
|
var data = creator.firstName + ' ' + creator.lastName;
|
||||||
Scholar.Ingester.Interface.scrapeProgress.addResult(label, data);
|
Scholar.Ingester.Interface.scrapeProgress.addResult(label, data);
|
||||||
|
@ -230,7 +232,7 @@ Scholar.Ingester.Interface._finishScraping = function(documentObject) {
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i in fields) {
|
for(i in fields) {
|
||||||
var data = documentObject.item.getField(fields[i]);
|
var data = item1.getField(fields[i]);
|
||||||
if(data) {
|
if(data) {
|
||||||
var name = Scholar.ItemFields.getName(fields[i]);
|
var name = Scholar.ItemFields.getName(fields[i]);
|
||||||
if(name != "source") {
|
if(name != "source") {
|
||||||
|
@ -239,6 +241,11 @@ Scholar.Ingester.Interface._finishScraping = function(documentObject) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Save items
|
||||||
|
for(i in obj.items) {
|
||||||
|
obj.items[i].save();
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
Scholar.Ingester.Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeError"));
|
Scholar.Ingester.Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeError"));
|
||||||
Scholar.Ingester.Interface.scrapeProgress.addDescription(Scholar.getString("ingester.scrapeErrorDescription"));
|
Scholar.Ingester.Interface.scrapeProgress.addDescription(Scholar.getString("ingester.scrapeErrorDescription"));
|
||||||
|
|
|
@ -49,7 +49,7 @@ Scholar.Ingester.Model.prototype.detachRepository = function() {}
|
||||||
// Scholar.Ingester.Utilities class, a set of methods to assist in data
|
// Scholar.Ingester.Utilities class, a set of methods to assist in data
|
||||||
// extraction. Most code here was stolen directly from the Piggy Bank project.
|
// extraction. Most code here was stolen directly from the Piggy Bank project.
|
||||||
Scholar.Ingester.Utilities = function(hiddenBrowser) {
|
Scholar.Ingester.Utilities = function(hiddenBrowser) {
|
||||||
this.hiddenBrowser = hiddenBrowser;
|
this._hiddenBrowser = hiddenBrowser;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Adapter for Piggy Bank function to print debug messages; log level is
|
// Adapter for Piggy Bank function to print debug messages; log level is
|
||||||
|
@ -115,7 +115,7 @@ Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succe
|
||||||
// exception - a function to execute if an exception occurs (exceptions are
|
// exception - a function to execute if an exception occurs (exceptions are
|
||||||
// also logged in the Firefox Scholar log)
|
// also logged in the Firefox Scholar log)
|
||||||
Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
|
Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
|
||||||
var hiddenBrowser = this.hiddenBrowser;
|
var hiddenBrowser = this._hiddenBrowser;
|
||||||
Scholar.debug("processDocuments called");
|
Scholar.debug("processDocuments called");
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
@ -301,11 +301,13 @@ Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, mo
|
||||||
// Extract title
|
// Extract title
|
||||||
model = this._MARCAssociateField(record, uri, model, '245', prefixDC + 'title', this._MARCCleanString);
|
model = this._MARCAssociateField(record, uri, model, '245', prefixDC + 'title', this._MARCCleanString);
|
||||||
// Extract edition
|
// Extract edition
|
||||||
model = this._MARCAssociateField(record, uri, model, '250', prefixDC + 'edition', this._MARCCleanString);
|
model = this._MARCAssociateField(record, uri, model, '250', prefixDC + 'hasVersion', this._MARCCleanString);
|
||||||
// Extract place info
|
// Extract place info
|
||||||
model = this._MARCAssociateField(record, uri, model, '260', prefixDummy + 'place', this._MARCCleanString, '', 'a');
|
model = this._MARCAssociateField(record, uri, model, '260', prefixDummy + 'place', this._MARCCleanString, '', 'a');
|
||||||
// Extract publisher info
|
// Extract publisher info
|
||||||
model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'publisher', this._MARCCleanString, '', 'b');
|
model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'publisher', this._MARCCleanString, '', 'b');
|
||||||
|
// Extract year
|
||||||
|
model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'year', this._MARCCleanString, '', 'c');
|
||||||
// Extract series
|
// Extract series
|
||||||
model = this._MARCAssociateField(record, uri, model, '440', prefixDummy + 'series', this._MARCCleanString);
|
model = this._MARCAssociateField(record, uri, model, '440', prefixDummy + 'series', this._MARCCleanString);
|
||||||
}
|
}
|
||||||
|
@ -411,9 +413,13 @@ Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatu
|
||||||
* browser - browser window object of document
|
* browser - browser window object of document
|
||||||
* model - data model for semantic scrapers
|
* model - data model for semantic scrapers
|
||||||
* scraper - best scraper to use to scrape page
|
* scraper - best scraper to use to scrape page
|
||||||
|
* items - items returned after page is scraped
|
||||||
*
|
*
|
||||||
* Private properties:
|
* Private properties:
|
||||||
* _sandbox - sandbox for code execution
|
* _sandbox - sandbox for code execution
|
||||||
|
* _appSvc - AppShellService instance
|
||||||
|
* _hiddenBrowser - hidden browser object
|
||||||
|
* _scrapeCallback - callback function to be executed when scraping is complete
|
||||||
*/
|
*/
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
@ -426,12 +432,13 @@ Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatu
|
||||||
* Constructor for Document object
|
* Constructor for Document object
|
||||||
*/
|
*/
|
||||||
Scholar.Ingester.Document = function(browserWindow, hiddenBrowser){
|
Scholar.Ingester.Document = function(browserWindow, hiddenBrowser){
|
||||||
|
this.scraper = null;
|
||||||
this.browser = browserWindow;
|
this.browser = browserWindow;
|
||||||
this.model = new Scholar.Ingester.Model();
|
this.model = new Scholar.Ingester.Model();
|
||||||
this.appSvc = Cc["@mozilla.org/appshell/appShellService;1"]
|
this.items = new Array();
|
||||||
|
this._appSvc = Cc["@mozilla.org/appshell/appShellService;1"]
|
||||||
.getService(Ci.nsIAppShellService);
|
.getService(Ci.nsIAppShellService);
|
||||||
this.scraper = null;
|
this._hiddenBrowser = hiddenBrowser;
|
||||||
this.hiddenBrowser = hiddenBrowser;
|
|
||||||
this._generateSandbox();
|
this._generateSandbox();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -474,7 +481,7 @@ Scholar.Ingester.Document.prototype.canScrape = function(currentScraper) {
|
||||||
if((!currentScraper.urlPattern || canScrape)
|
if((!currentScraper.urlPattern || canScrape)
|
||||||
&& currentScraper.scraperDetectCode) {
|
&& currentScraper.scraperDetectCode) {
|
||||||
Scholar.debug("Checking scraperDetectCode");
|
Scholar.debug("Checking scraperDetectCode");
|
||||||
var scraperSandbox = this.sandbox;
|
var scraperSandbox = this._sandbox;
|
||||||
try {
|
try {
|
||||||
canScrape = Components.utils.evalInSandbox("(function(){\n" +
|
canScrape = Components.utils.evalInSandbox("(function(){\n" +
|
||||||
currentScraper.scraperDetectCode +
|
currentScraper.scraperDetectCode +
|
||||||
|
@ -498,7 +505,7 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
|
||||||
|
|
||||||
Scholar.debug("Scraping "+this.browser.contentDocument.location.href);
|
Scholar.debug("Scraping "+this.browser.contentDocument.location.href);
|
||||||
|
|
||||||
var scraperSandbox = this.sandbox;
|
var scraperSandbox = this._sandbox;
|
||||||
try {
|
try {
|
||||||
Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox);
|
Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox);
|
||||||
} catch(e) {
|
} catch(e) {
|
||||||
|
@ -550,20 +557,20 @@ Scholar.Ingester.Document.prototype._scrapePageComplete = function() {
|
||||||
* Generates a sandbox for scraping/scraper detection
|
* Generates a sandbox for scraping/scraper detection
|
||||||
*/
|
*/
|
||||||
Scholar.Ingester.Document.prototype._generateSandbox = function() {
|
Scholar.Ingester.Document.prototype._generateSandbox = function() {
|
||||||
this.sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
|
this._sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
|
||||||
this.sandbox.browser = this.browser;
|
this._sandbox.browser = this.browser;
|
||||||
this.sandbox.doc = this.sandbox.browser.contentDocument;
|
this._sandbox.doc = this._sandbox.browser.contentDocument;
|
||||||
this.sandbox.utilities = new Scholar.Ingester.Utilities(this.hiddenBrowser);
|
this._sandbox.utilities = new Scholar.Ingester.Utilities(this._hiddenBrowser);
|
||||||
this.sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this.appSvc.hiddenDOMWindow);
|
this._sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this._appSvc.hiddenDOMWindow);
|
||||||
this.sandbox.window = this.window;
|
this._sandbox.window = this.window;
|
||||||
this.sandbox.model = this.model;
|
this._sandbox.model = this.model;
|
||||||
this.sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;
|
this._sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;
|
||||||
this.sandbox.MARC_Record = Scholar.Ingester.MARC_Record;
|
this._sandbox.MARC_Record = Scholar.Ingester.MARC_Record;
|
||||||
this.sandbox.MARC_Record.prototype = new Scholar.Ingester.MARC_Record();
|
this._sandbox.MARC_Record.prototype = new Scholar.Ingester.MARC_Record();
|
||||||
|
|
||||||
var me = this;
|
var me = this;
|
||||||
this.sandbox.wait = function(){ me._waitForCompletion = true; };
|
this._sandbox.wait = function(){ me._waitForCompletion = true; };
|
||||||
this.sandbox.done = function(){ me._scrapePageComplete(); };
|
this._sandbox.done = function(){ me._scrapePageComplete(); };
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -571,11 +578,14 @@ Scholar.Ingester.Document.prototype._generateSandbox = function() {
|
||||||
* (Ontologies are hard-coded until we have a real way of dealing with them)
|
* (Ontologies are hard-coded until we have a real way of dealing with them)
|
||||||
*/
|
*/
|
||||||
Scholar.Ingester.Document.prototype._updateDatabase = function() {
|
Scholar.Ingester.Document.prototype._updateDatabase = function() {
|
||||||
|
Scholar.debug("doing updating");
|
||||||
|
|
||||||
var prefixRDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
|
var prefixRDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
|
||||||
var prefixDC = 'http://purl.org/dc/elements/1.1/';
|
var prefixDC = 'http://purl.org/dc/elements/1.1/';
|
||||||
var prefixDCMI = 'http://purl.org/dc/dcmitype/';
|
var prefixDCMI = 'http://purl.org/dc/dcmitype/';
|
||||||
var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
|
var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
|
||||||
|
|
||||||
|
try {
|
||||||
for(var uri in this.model.data) {
|
for(var uri in this.model.data) {
|
||||||
if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) {
|
if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) {
|
||||||
var newItem = Scholar.Items.getNewItemByType(2);
|
var newItem = Scholar.Items.getNewItemByType(2);
|
||||||
|
@ -635,18 +645,12 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
|
||||||
newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher'][0]);
|
newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher'][0]);
|
||||||
}
|
}
|
||||||
if(this.model.data[uri][prefixDC + 'year']) {
|
if(this.model.data[uri][prefixDC + 'year']) {
|
||||||
if(this.model.data[uri][prefixDC + 'year'].length == 4) {
|
|
||||||
newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]);
|
newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]);
|
||||||
} else {
|
} else if(this.model.data[uri][prefixDC + 'date'] && this.model.data[uri][prefixDC + 'date'][0].length >= 4) {
|
||||||
try {
|
newItem.setField("year", this.model.data[uri][prefixDC + 'date'][0].substr(0, 4));
|
||||||
newItem.setField(this.model.data[uri][prefixDC + 'year'][0].substring(
|
|
||||||
this.model.data[uri][prefixDC + 'year'][0].lastIndexOf(" ")+1,
|
|
||||||
this.model.data[uri][prefixDC + 'year'][0].length));
|
|
||||||
} catch(e) {}
|
|
||||||
}
|
}
|
||||||
}
|
if(this.model.data[uri][prefixDC + 'hasVersion']) {
|
||||||
if(this.model.data[uri][prefixDC + 'edition']) {
|
newItem.setField("edition", this.model.data[uri][prefixDC + 'hasVersion'][0]);
|
||||||
newItem.setField("edition", this.model.data[uri][prefixDC + 'edition'][0]);
|
|
||||||
}
|
}
|
||||||
if(this.model.data[uri][prefixDummy + 'series']) {
|
if(this.model.data[uri][prefixDummy + 'series']) {
|
||||||
newItem.setField("series", this.model.data[uri][prefixDummy + 'series'][0]);
|
newItem.setField("series", this.model.data[uri][prefixDummy + 'series'][0]);
|
||||||
|
@ -663,11 +667,9 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
newItem.save();
|
this.items.push(newItem);
|
||||||
|
}
|
||||||
// First one is stored so as to be accessible
|
} catch(ex) {
|
||||||
if(!this.item) {
|
Scholar.debug('Error in Scholar.Ingester.Document._updateDatabase: '+ex);
|
||||||
this.item = newItem;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
30
scrapers.sql
30
scrapers.sql
|
@ -22,6 +22,30 @@ var cleanString = function(s) {
|
||||||
return s.replace(/ +/g, " ");
|
return s.replace(/ +/g, " ");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Formats a JavaScript Date as an ISO-style YYYY-MM-DD string, using the
// local-time year, month and day of the Date.
//
// jsDate - Date object to format
//
// Returns the zero-padded date string. If jsDate holds no valid time (for
// example, a Date built from text that could not be parsed), an empty string
// is returned instead of the meaningless "0NaN-NaN-NaN".
//
// NOTE: this code is stored inside a SQL string literal, so it deliberately
// contains no single-quote characters.
var dateToISO = function(jsDate) {
	// An unparseable date has a NaN time value, and every getter returns NaN
	if(isNaN(jsDate.getTime())) {
		return "";
	}
	
	// Left-pads the decimal form of a number with zeros up to the given width
	var zeroPad = function(number, width) {
		var text = number.toString();
		while(text.length < width) {
			text = "0" + text;
		}
		return text;
	};
	
	// getMonth() is zero-based, hence the +1
	return zeroPad(jsDate.getFullYear(), 4) + "-"
		+ zeroPad(jsDate.getMonth()+1, 2) + "-"
		+ zeroPad(jsDate.getDate(), 2);
};
|
||||||
|
|
||||||
var uri = doc.location.href;
|
var uri = doc.location.href;
|
||||||
|
|
||||||
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
|
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
|
||||||
|
@ -43,10 +67,12 @@ for (var i = 0; i < elmts.length; i++) {
|
||||||
var attribute = cleanString(getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue);
|
var attribute = cleanString(getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue);
|
||||||
if(getNode(doc, elmt, ''./text()[1]'', nsResolver)) {
|
if(getNode(doc, elmt, ''./text()[1]'', nsResolver)) {
|
||||||
var value = cleanString(getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue);
|
var value = cleanString(getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue);
|
||||||
|
|
||||||
if(attribute == "Publisher:") {
|
if(attribute == "Publisher:") {
|
||||||
if(value.lastIndexOf("(") != -1) {
|
if(value.lastIndexOf("(") != -1) {
|
||||||
var date = value.substring(value.lastIndexOf("(")+1, value.length-1);
|
var jsDate = value.substring(value.lastIndexOf("(")+1, value.length-1);
|
||||||
|
jsDate = new Date(jsDate);
|
||||||
|
var date = dateToISO(jsDate);
|
||||||
|
|
||||||
value = value.substring(0, value.lastIndexOf("(")-1);
|
value = value.substring(0, value.lastIndexOf("(")-1);
|
||||||
}
|
}
|
||||||
if(value.lastIndexOf(";") != -1) {
|
if(value.lastIndexOf(";") != -1) {
|
||||||
|
|
Loading…
Reference in a new issue