Fix issues with asynchronous scraping and XMLHttpRequest
This commit is contained in:
parent
c42991a5bf
commit
93652a137c
3 changed files with 106 additions and 62 deletions
|
@ -211,31 +211,36 @@ Scholar.Ingester.Interface._deleteDocument = function(browser) {
|
||||||
* Callback to be executed when scraping is complete
|
* Callback to be executed when scraping is complete
|
||||||
*/
|
*/
|
||||||
Scholar.Ingester.Interface._finishScraping = function(documentObject) {
|
Scholar.Ingester.Interface._finishScraping = function(documentObject) {
|
||||||
Scholar.Ingester.Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeComplete"));
|
if(documentObject.item) {
|
||||||
|
Scholar.Ingester.Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeComplete"));
|
||||||
|
|
||||||
var fields = Scholar.ItemFields.getItemTypeFields(documentObject.item.getField("itemTypeID"));
|
var fields = Scholar.ItemFields.getItemTypeFields(documentObject.item.getField("itemTypeID"));
|
||||||
|
|
||||||
var titleLabel = Scholar.getString("itemFields.title") + ":"
|
var titleLabel = Scholar.getString("itemFields.title") + ":"
|
||||||
Scholar.Ingester.Interface.scrapeProgress.addResult(titleLabel, this.item.getField("title"));
|
// Read the title from documentObject.item, as the rest of this function does, rather than from this.item.
Scholar.Ingester.Interface.scrapeProgress.addResult(titleLabel, documentObject.item.getField("title"));
|
||||||
var creators = documentObject.item.numCreators();
|
var creators = documentObject.item.numCreators();
|
||||||
if(creators) {
|
if(creators) {
|
||||||
for(var i=0; i<creators; i++) {
|
for(var i=0; i<creators; i++) {
|
||||||
var creator = documentObject.item.getCreator(i);
|
var creator = documentObject.item.getCreator(i);
|
||||||
var label = Scholar.getString("creatorTypes."+Scholar.CreatorTypes.getTypeName(creator.creatorTypeID)) + ":";
|
var label = Scholar.getString("creatorTypes."+Scholar.CreatorTypes.getTypeName(creator.creatorTypeID)) + ":";
|
||||||
var data = creator.firstName + ' ' + creator.lastName;
|
var data = creator.firstName + ' ' + creator.lastName;
|
||||||
Scholar.Ingester.Interface.scrapeProgress.addResult(label, data);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for(i in fields) {
|
|
||||||
var data = documentObject.item.getField(fields[i]);
|
|
||||||
if(data) {
|
|
||||||
var name = Scholar.ItemFields.getName(fields[i]);
|
|
||||||
if(name != "source") {
|
|
||||||
var label = Scholar.getString("itemFields."+ name) + ":";
|
|
||||||
Scholar.Ingester.Interface.scrapeProgress.addResult(label, data);
|
Scholar.Ingester.Interface.scrapeProgress.addResult(label, data);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for(i in fields) {
|
||||||
|
var data = documentObject.item.getField(fields[i]);
|
||||||
|
if(data) {
|
||||||
|
var name = Scholar.ItemFields.getName(fields[i]);
|
||||||
|
if(name != "source") {
|
||||||
|
var label = Scholar.getString("itemFields."+ name) + ":";
|
||||||
|
Scholar.Ingester.Interface.scrapeProgress.addResult(label, data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
Scholar.Ingester.Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeError"));
|
||||||
|
Scholar.Ingester.Interface.scrapeProgress.addDescription(Scholar.getString("ingester.scrapeErrorDescription"));
|
||||||
}
|
}
|
||||||
|
|
||||||
setTimeout(function() { Scholar.Ingester.Interface.scrapeProgress.fade() }, 2000);
|
setTimeout(function() { Scholar.Ingester.Interface.scrapeProgress.fade() }, 2000);
|
||||||
|
@ -311,6 +316,19 @@ Scholar.Ingester.Interface.Progress.prototype.addResult = function(label, data)
|
||||||
this.table.appendChild(tr);
|
this.table.appendChild(tr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Appends a row holding a single block of descriptive text to the progress
 * table.
 *
 * description - the text to display
 *
 * Reads this.document (used to create the nodes) and this.table (the row is
 * appended to it).
 */
Scholar.Ingester.Interface.Progress.prototype.addDescription = function(description) {
	var descriptionNode = this.document.createTextNode(description);
	var tr = this.document.createElement("tr");
	var descriptionTd = this.document.createElement("td");
	descriptionTd.style.fontSize = '10px';
	// Column span is an element attribute, not a CSS property, so assigning
	// style.colspan has no effect.
	// NOTE(review): 2 presumably matches the label/data columns written by
	// addResult - confirm against that function.
	descriptionTd.setAttribute("colspan", "2");
	
	descriptionTd.appendChild(descriptionNode);
	tr.appendChild(descriptionTd);
	this.table.appendChild(tr);
}
|
||||||
|
|
||||||
|
|
||||||
Scholar.Ingester.Interface.Progress.prototype.fade = function() {
|
Scholar.Ingester.Interface.Progress.prototype.fade = function() {
|
||||||
// Icky, icky hack to keep objects
|
// Icky, icky hack to keep objects
|
||||||
var me = this;
|
var me = this;
|
||||||
|
|
|
@ -200,46 +200,55 @@ Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, su
|
||||||
// essential components for Scholar and would take a great deal of effort to
|
// essential components for Scholar and would take a great deal of effort to
|
||||||
// implement. We can, however, always implement them later.
|
// implement. We can, however, always implement them later.
|
||||||
|
|
||||||
// It looks like these are simple front-ends for XMLHttpRequest. They're a
|
// These are front ends for XMLHttpRequest. XMLHttpRequest can't actually be
|
||||||
// component of the Piggy Bank API, so they're implemented here.
|
// accessed outside the sandbox, and even if it could, it wouldn't let scripts
|
||||||
Scholar.Ingester.Utilities.HTTPUtilities = function() {}
|
// access across domains, so everything's replicated here.
|
||||||
|
Scholar.Ingester.HTTPUtilities = function(contentWindow) {
|
||||||
Scholar.Ingester.Utilities.HTTPUtilities.prototype.doGet = function(url, onStatus, onDone) {
|
this.window = contentWindow;
|
||||||
var xmlhttp = new XMLHttpRequest();
|
|
||||||
|
|
||||||
xmlhttp.open('GET', url, true);
|
|
||||||
xmlhttp.overrideMimeType("text/xml");
|
|
||||||
xmlhttp.onreadystatechange = function() {
|
|
||||||
Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone);
|
|
||||||
};
|
|
||||||
xmlhttp.send(null);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Scholar.Ingester.Utilities.HTTPUtilities.prototype.doPost = function(url, body, onStatus, onDone) {
|
Scholar.Ingester.HTTPUtilities.prototype.doGet = function(url, onStatus, onDone) {
|
||||||
var xmlhttp = new XMLHttpRequest();
|
var xmlhttp = new this.window.XMLHttpRequest();
|
||||||
|
|
||||||
xmlhttp.open('POST', url, true);
|
xmlhttp.open('GET', url, true);
|
||||||
xmlhttp.overrideMimeType("text/xml");
|
xmlhttp.overrideMimeType("text/xml");
|
||||||
xmlhttp.onreadystatechange = function() {
|
|
||||||
Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone);
|
var me = this;
|
||||||
};
|
xmlhttp.onreadystatechange = function() {
|
||||||
xmlhttp.send(body);
|
me.stateChange(xmlhttp, onStatus, onDone);
|
||||||
|
};
|
||||||
|
xmlhttp.send(null);
|
||||||
}
|
}
|
||||||
|
|
||||||
Scholar.Ingester.Utilities.HTTPUtilities.prototype.doOptions = function(url, body, onStatus, onDone) {
|
Scholar.Ingester.HTTPUtilities.prototype.doPost = function(url, body, onStatus, onDone) {
|
||||||
var xmlhttp = new XMLHttpRequest();
|
var xmlhttp = new this.window.XMLHttpRequest();
|
||||||
|
|
||||||
xmlhttp.open('OPTIONS', url, true);
|
xmlhttp.open('POST', url, true);
|
||||||
xmlhttp.overrideMimeType("text/xml");
|
xmlhttp.overrideMimeType("text/xml");
|
||||||
xmlhttp.onreadystatechange = function() {
|
|
||||||
Scholar.Ingester.Utilities.HTTPUtilities.stateChange(xmlhttp, onStatus, onDone);
|
var me = this;
|
||||||
};
|
xmlhttp.onreadystatechange = function() {
|
||||||
xmlhttp.send(body);
|
me.stateChange(xmlhttp, onStatus, onDone);
|
||||||
|
};
|
||||||
|
xmlhttp.send(body);
|
||||||
|
}
|
||||||
|
|
||||||
|
Scholar.Ingester.HTTPUtilities.prototype.doOptions = function(url, body, onStatus, onDone) {
|
||||||
|
var xmlhttp = new this.window.XMLHttpRequest();
|
||||||
|
|
||||||
|
xmlhttp.open('OPTIONS', url, true);
|
||||||
|
xmlhttp.overrideMimeType("text/xml");
|
||||||
|
|
||||||
|
var me = this;
|
||||||
|
xmlhttp.onreadystatechange = function() {
|
||||||
|
me.stateChange(xmlhttp, onStatus, onDone);
|
||||||
|
};
|
||||||
|
xmlhttp.send(body);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Possible point of failure; for some reason, this used to be a separate
|
// Possible point of failure; for some reason, this used to be a separate
|
||||||
// class, so make sure it works
|
// class, so make sure it works
|
||||||
Scholar.Ingester.Utilities.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatus, onDone) {
|
Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatus, onDone) {
|
||||||
switch (xmlhttp.readyState) {
|
switch (xmlhttp.readyState) {
|
||||||
|
|
||||||
// Request not yet made
|
// Request not yet made
|
||||||
|
@ -307,6 +316,8 @@ Scholar.Ingester.Utilities.HTTPUtilities.prototype.stateChange = function(xmlhtt
|
||||||
*/
|
*/
|
||||||
Scholar.Ingester.Document = function(browserWindow){
|
Scholar.Ingester.Document = function(browserWindow){
|
||||||
this.browser = browserWindow;
|
this.browser = browserWindow;
|
||||||
|
this.appSvc = Cc["@mozilla.org/appshell/appShellService;1"]
|
||||||
|
.getService(Ci.nsIAppShellService);
|
||||||
this.scraper = null
|
this.scraper = null
|
||||||
this.model = new Scholar.Ingester.Model();
|
this.model = new Scholar.Ingester.Model();
|
||||||
this._generateSandbox();
|
this._generateSandbox();
|
||||||
|
@ -379,10 +390,11 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
|
||||||
Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox);
|
Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox);
|
||||||
} catch(e) {
|
} catch(e) {
|
||||||
throw e+' in scraperJavaScript for '+this.scraper.label;
|
// Signal completion before rethrowing: a statement placed after the throw
// can never run, so the completion call has to come first.
// NOTE(review): assumes _scrapePageComplete is safe to call after a failed
// scraper run - confirm against its body.
this._scrapePageComplete();
throw e+' in scraperJavaScript for '+this.scraper.label;
|
||||||
}
|
}
|
||||||
|
|
||||||
// If synchronous, call _scrapePageComplete();
|
// If synchronous, call _scrapePageComplete();
|
||||||
if(!scraperSandbox._waitForCompletion) {
|
if(!this._waitForCompletion) {
|
||||||
this._scrapePageComplete();
|
this._scrapePageComplete();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -411,7 +423,7 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
|
||||||
* function before returning
|
* function before returning
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/*`
|
/*
|
||||||
* Called when scraping (synchronous or asynchronous) is complete
|
* Called when scraping (synchronous or asynchronous) is complete
|
||||||
*/
|
*/
|
||||||
Scholar.Ingester.Document.prototype._scrapePageComplete = function() {
|
Scholar.Ingester.Document.prototype._scrapePageComplete = function() {
|
||||||
|
@ -421,16 +433,22 @@ Scholar.Ingester.Document.prototype._scrapePageComplete = function() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Generates a sandbox for scraping/scraper detection
|
||||||
|
*/
|
||||||
Scholar.Ingester.Document.prototype._generateSandbox = function() {
|
Scholar.Ingester.Document.prototype._generateSandbox = function() {
|
||||||
this.sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
|
this.sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
|
||||||
this.sandbox.browser = this.browser;
|
this.sandbox.browser = this.browser;
|
||||||
this.sandbox.doc = this.sandbox.browser.contentDocument;
|
this.sandbox.doc = this.sandbox.browser.contentDocument;
|
||||||
this.sandbox.utilities = new Scholar.Ingester.Utilities;
|
this.sandbox.utilities = new Scholar.Ingester.Utilities;
|
||||||
|
this.sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this.appSvc.hiddenDOMWindow);
|
||||||
|
this.sandbox.window = this.window;
|
||||||
this.sandbox.model = this.model;
|
this.sandbox.model = this.model;
|
||||||
this.sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;
|
this.sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;
|
||||||
|
|
||||||
this.sandbox.wait = function(){ this._waitForCompletion = true; };
|
var me = this;
|
||||||
this.sandbox.done = function(){ this._scrapePageComplete(); };
|
this.sandbox.wait = function(){ me._waitForCompletion = true; };
|
||||||
|
this.sandbox.done = function(){ me._scrapePageComplete(); };
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -453,9 +471,15 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
|
||||||
newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher']);
|
newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher']);
|
||||||
}
|
}
|
||||||
if(this.model.data[uri][prefixDC + 'year']) {
|
if(this.model.data[uri][prefixDC + 'year']) {
|
||||||
data.date = this.model.data[uri][prefixDC + 'year'].substring(
|
if(this.model.data[uri][prefixDC + 'year'].length == 4) {
|
||||||
this.model.data[uri][prefixDC + 'year'].lastIndexOf(" ")+1,
|
newItem.setField("year", this.model.data[uri][prefixDC + 'year']);
|
||||||
this.model.data[uri][prefixDC + 'year'].length);
|
} else {
|
||||||
|
try {
	// Value is not a bare four-character year: keep only the text after
	// the last space and store that in the "year" field.
	var yearText = this.model.data[uri][prefixDC + 'year'];
	newItem.setField("year", yearText.substring(yearText.lastIndexOf(" ")+1, yearText.length));
} catch(e) {
	// Best-effort, as in the original: errors raised while setting the
	// year are ignored.
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if(this.model.data[uri][prefixDC + 'edition']) {
|
if(this.model.data[uri][prefixDC + 'edition']) {
|
||||||
newItem.setField("edition", this.model.data[uri][prefixDC + 'edition']);
|
newItem.setField("edition", this.model.data[uri][prefixDC + 'edition']);
|
||||||
|
|
|
@ -25,3 +25,5 @@ creatorTypes.editor = Editor
|
||||||
|
|
||||||
ingester.scraping = Scraping Page...
|
ingester.scraping = Scraping Page...
|
||||||
ingester.scrapeComplete = Scraping Complete
|
ingester.scrapeComplete = Scraping Complete
|
||||||
|
ingester.scrapeError = Could Not Scrape
|
||||||
|
ingester.scrapeErrorDescription = An error occurred while scraping this page. Please try again. If this error persists, contact the scraper author.
|
Loading…
Reference in a new issue