Provide visual feedback for scraping
This commit is contained in:
parent
8f34487205
commit
bb57e6ba7d
4 changed files with 218 additions and 93 deletions
|
@ -1,9 +1,23 @@
|
||||||
// Firefox Scholar Ingester Browser Functions
|
// Firefox Scholar Ingester Browser Functions
|
||||||
// Utilities based on code taken from Greasemonkey
|
// Based on code taken from Greasemonkey and PiggyBank
|
||||||
// This code is licensed according to the GPL
|
// This code is licensed according to the GPL
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//
|
||||||
|
// Scholar.Ingester.Interface
|
||||||
|
//
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
// Class to interface with the browser when ingesting data
|
||||||
|
|
||||||
Scholar.Ingester.Interface = function() {}
|
Scholar.Ingester.Interface = function() {}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//
|
||||||
|
// Public Scholar.Ingester.Interface methods
|
||||||
|
//
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Initialize some variables and prepare event listeners for when chrome is done
|
* Initialize some variables and prepare event listeners for when chrome is done
|
||||||
* loading
|
* loading
|
||||||
|
@ -40,63 +54,14 @@ Scholar.Ingester.Interface.chromeUnload = function() {
|
||||||
this.tabBrowser.removeProgressListener(this);
|
this.tabBrowser.removeProgressListener(this);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Gets a document object given a browser window object
|
|
||||||
*
|
|
||||||
* NOTE: Browser objects are associated with document objects via keys generated
|
|
||||||
* from the time the browser object is opened. I'm not sure if this is the
|
|
||||||
* appropriate mechanism for handling this, but it's what PiggyBank used and it
|
|
||||||
* appears to work.
|
|
||||||
*/
|
|
||||||
Scholar.Ingester.Interface.getDocument = function(browser) {
|
|
||||||
try {
|
|
||||||
var key = browser.getAttribute("scholar-key");
|
|
||||||
if(Scholar.Ingester.Interface.browserDocuments[key]) {
|
|
||||||
return Scholar.Ingester.Interface.browserDocuments[key];
|
|
||||||
}
|
|
||||||
} finally {}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Creates a new document object for a browser window object, attempts to
|
|
||||||
* retrieve appropriate scraper
|
|
||||||
*/
|
|
||||||
Scholar.Ingester.Interface.setDocument = function(browser) {
|
|
||||||
try {
|
|
||||||
var key = browser.getAttribute("scholar-key");
|
|
||||||
} finally {
|
|
||||||
if(!key) {
|
|
||||||
var key = (new Date()).getTime();
|
|
||||||
browser.setAttribute("scholar-key", key);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Scholar.Ingester.Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser);
|
|
||||||
Scholar.Ingester.Interface.browserDocuments[key].retrieveScraper();
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Deletes the document object associated with a given browser window object
|
|
||||||
*/
|
|
||||||
Scholar.Ingester.Interface.deleteDocument = function(browser) {
|
|
||||||
try {
|
|
||||||
var key = browser.getAttribute("scholar-key");
|
|
||||||
if(Scholar.Ingester.Interface.browserDocuments[key]) {
|
|
||||||
delete Scholar.Ingester.Interface.browserDocuments[key];
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
} finally {}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Scrapes a page (called when the capture icon is clicked)
|
* Scrapes a page (called when the capture icon is clicked)
|
||||||
*/
|
*/
|
||||||
Scholar.Ingester.Interface.scrapeThisPage = function() {
|
Scholar.Ingester.Interface.scrapeThisPage = function() {
|
||||||
var document = Scholar.Ingester.Interface.getDocument(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
|
var documentObject = Scholar.Ingester.Interface._getDocument(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
|
||||||
if(document.scraper) {
|
if(documentObject.scraper) {
|
||||||
document.scrapePage();
|
Scholar.Ingester.Interface.scrapeProgress = new Scholar.Ingester.Interface.Progress(window, Scholar.Ingester.Interface.tabBrowser.selectedBrowser.contentDocument, Scholar.getString("ingester.scraping"));
|
||||||
|
documentObject.scrapePage(Scholar.Ingester.Interface._finishScraping);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -105,11 +70,11 @@ Scholar.Ingester.Interface.scrapeThisPage = function() {
|
||||||
* thereof of the current page
|
* thereof of the current page
|
||||||
*/
|
*/
|
||||||
Scholar.Ingester.Interface.updateStatus = function(browser) {
|
Scholar.Ingester.Interface.updateStatus = function(browser) {
|
||||||
var document = Scholar.Ingester.Interface.getDocument(browser);
|
var documentObject = Scholar.Ingester.Interface._getDocument(browser);
|
||||||
if(document && document.scraper) {
|
if(documentObject && documentObject.scraper) {
|
||||||
this.statusImage.src = "chrome://scholar/skin/capture_colored.png";
|
Scholar.Ingester.Interface.statusImage.src = "chrome://scholar/skin/capture_colored.png";
|
||||||
} else {
|
} else {
|
||||||
this.statusImage.src = "chrome://scholar/skin/capture_gray.png";
|
Scholar.Ingester.Interface.statusImage.src = "chrome://scholar/skin/capture_gray.png";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -123,7 +88,7 @@ Scholar.Ingester.Interface.updateStatus = function(browser) {
|
||||||
* create a new object for it.
|
* create a new object for it.
|
||||||
*/
|
*/
|
||||||
Scholar.Ingester.Interface.contentLoad = function() {
|
Scholar.Ingester.Interface.contentLoad = function() {
|
||||||
Scholar.Ingester.Interface.setDocument(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
|
Scholar.Ingester.Interface._setDocument(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
|
||||||
Scholar.Ingester.Interface.updateStatus(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
|
Scholar.Ingester.Interface.updateStatus(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -159,7 +124,7 @@ Scholar.Ingester.Interface.Listener.onLocationChange = function() {
|
||||||
Scholar.Ingester.Interface.browsers.splice(i,1);
|
Scholar.Ingester.Interface.browsers.splice(i,1);
|
||||||
|
|
||||||
// To execute if document object does not exist
|
// To execute if document object does not exist
|
||||||
Scholar.Ingester.Interface.deleteDocument(browser);
|
Scholar.Ingester.Interface._deleteDocument(browser);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -186,3 +151,178 @@ Scholar.Ingester.Interface.Listener.onLocationChange = function() {
|
||||||
Scholar.Ingester.Interface.tabBrowser.selectedBrowser
|
Scholar.Ingester.Interface.tabBrowser.selectedBrowser
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//
|
||||||
|
// Private Scholar.Ingester.Interface methods
|
||||||
|
//
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
/*
 * Gets the document object associated with a browser window object
 *
 * browser - the browser element to look up (e.g. tabBrowser.selectedBrowser)
 *
 * Returns the object stored in Scholar.Ingester.Interface.browserDocuments
 * under the browser's "scholar-key" attribute, or false if there is none
 *
 * NOTE: Browser objects are associated with document objects via keys generated
 * from the time the browser object is opened. I'm not sure if this is the
 * appropriate mechanism for handling this, but it's what PiggyBank used and it
 * appears to work.
 */
Scholar.Ingester.Interface._getDocument = function(browser) {
	// A try/finally with an empty finally block does not swallow anything, so
	// guard explicitly: a missing browser is reported as "no document" (false),
	// the same value callers already test for, instead of raising a TypeError
	if(!browser || !browser.getAttribute) {
		return false;
	}

	var key = browser.getAttribute("scholar-key");
	var documentObject = Scholar.Ingester.Interface.browserDocuments[key];
	return documentObject ? documentObject : false;
}
|
||||||
|
|
||||||
|
/*
 * Creates a new document object for a browser window object, attempts to
 * retrieve appropriate scraper
 *
 * browser - the browser element to register; it is tagged with a
 *           "scholar-key" attribute if it does not already carry one
 *
 * Any existing entry stored under the same key is replaced.
 */
Scholar.Ingester.Interface._setDocument = function(browser) {
	// No try/finally here: a finally block runs even when getAttribute throws,
	// which would then call setAttribute on the same unusable object and could
	// hide the original error. Letting the first failure propagate is clearer.
	var key = browser.getAttribute("scholar-key");
	if(!key) {
		// Keys are derived from the current time in milliseconds
		// NOTE(review): two browsers tagged within the same millisecond would
		// share a key - presumably not reachable in practice; confirm
		key = (new Date()).getTime();
		browser.setAttribute("scholar-key", key);
	}

	Scholar.Ingester.Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser);
	Scholar.Ingester.Interface.browserDocuments[key].retrieveScraper();
}
|
||||||
|
|
||||||
|
/*
 * Deletes the document object associated with a given browser window object
 *
 * browser - the browser element whose document object should be dropped
 *
 * Returns true if an entry was found and deleted, false otherwise
 */
Scholar.Ingester.Interface._deleteDocument = function(browser) {
	// An empty finally block catches nothing, so check for a usable browser
	// up front and report "nothing deleted" rather than raising a TypeError
	if(!browser || !browser.getAttribute) {
		return false;
	}

	var key = browser.getAttribute("scholar-key");
	if(!Scholar.Ingester.Interface.browserDocuments[key]) {
		return false;
	}

	delete Scholar.Ingester.Interface.browserDocuments[key];
	return true;
}
|
||||||
|
|
||||||
|
/*
 * Callback to be executed when scraping is complete
 *
 * documentObject - the document object that finished scraping; its "item"
 *                  property, when set, is summarized in the progress box
 *
 * Changes the progress headline to the "scrape complete" string, lists the
 * item's title, creators and non-empty fields (other than "source"), and
 * starts fading the box out two seconds later.
 */
Scholar.Ingester.Interface._finishScraping = function(documentObject) {
	// Keep a reference to the box belonging to this scrape. The shared
	// scrapeProgress property is overwritten when another scrape starts, and
	// the delayed fade below must still act on the box that was filled in here.
	var progress = Scholar.Ingester.Interface.scrapeProgress;
	progress.changeHeadline(Scholar.getString("ingester.scrapeComplete"));

	// Read the item from the argument rather than from "this", so the callback
	// does not depend on how it happens to be invoked. The item may be absent
	// if the scraper saved nothing; in that case only the headline is shown.
	var item = documentObject ? documentObject.item : null;
	if(item) {
		var fields = Scholar.ItemFields.getItemTypeFields(item.getField("itemTypeID"));

		var titleLabel = Scholar.getString("itemFields.title") + ":";
		progress.addResult(titleLabel, item.getField("title"));

		var creators = item.numCreators();
		for(var i=0; i<creators; i++) {
			var creator = item.getCreator(i);
			var creatorLabel = Scholar.getString("creatorTypes."+Scholar.CreatorTypes.getTypeName(creator.creatorTypeID)) + ":";
			var creatorName = creator.firstName + ' ' + creator.lastName;
			progress.addResult(creatorLabel, creatorName);
		}

		for(var j in fields) {
			var fieldValue = item.getField(fields[j]);
			if(fieldValue) {
				var name = Scholar.ItemFields.getName(fields[j]);
				// The "source" field is deliberately left out of the summary
				if(name != "source") {
					var fieldLabel = Scholar.getString("itemFields."+ name) + ":";
					progress.addResult(fieldLabel, fieldValue);
				}
			}
		}
	}

	setTimeout(function() { progress.fade() }, 2000);
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//
|
||||||
|
// Scholar.Ingester.Interface.Progress
|
||||||
|
//
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
// Builds and displays a fixed-position box in the top right corner of a page
// that reports on scraping progress
//
// myWindow   - stored as this.window
// myDocument - document in which the box is created; the box is appended to
//              its body
// headline   - initial headline text; a single space is used when it is empty

Scholar.Ingester.Interface.Progress = function(myWindow, myDocument, headline) {
	this.window = myWindow;
	this.document = myDocument;

	// Outer container
	var box = this.document.createElement('div');
	var boxStyle = box.style;
	boxStyle.MozOpacity = '.9';
	boxStyle.position = 'fixed';
	boxStyle.right = '20px';
	boxStyle.top = '20px';
	boxStyle.width = '200px';
	boxStyle.height = '120px';
	boxStyle.backgroundColor = '#7eadd9';
	boxStyle.color = '#000';
	boxStyle.padding = '5px';
	boxStyle.fontFamily = 'Arial, Geneva, Helvetica';
	boxStyle.overflow = 'hidden';
	box.id = 'firefoxScholarProgressDiv';
	this.div = box;

	// Headline; always holds exactly one text node so that it can be swapped
	// out later
	var headlineBox = this.document.createElement("div");
	headlineBox.style.textAlign = 'center';
	headlineBox.style.fontSize = '22px';
	headlineBox.style.marginBottom = '5px';
	var headlineText = headline ? headline : ' ';
	headlineBox.appendChild(this.document.createTextNode(headlineText));
	this.headlineP = headlineBox;
	box.appendChild(headlineBox);

	// Body, holding the table that result rows are added to
	var bodyBox = this.document.createElement("div");
	var resultTable = this.document.createElement("table");
	resultTable.style.borderCollapse = 'collapse';
	bodyBox.appendChild(resultTable);
	this.bodyP = bodyBox;
	this.table = resultTable;
	box.appendChild(bodyBox);

	this.document.body.appendChild(box);
}
|
||||||
|
|
||||||
|
// Replaces the text shown in the headline of the progress box
//
// headline - the new headline text
Scholar.Ingester.Interface.Progress.prototype.changeHeadline = function(headline) {
	var headlineBox = this.headlineP;

	// Drop the current text node, then put a fresh one in its place
	headlineBox.removeChild(headlineBox.firstChild);
	headlineBox.appendChild(this.document.createTextNode(headline));
}
|
||||||
|
|
||||||
|
// Appends a two-column row (label, data) to the table in the progress box
//
// label - text for the first (60px wide) cell
// data  - text for the second cell
Scholar.Ingester.Interface.Progress.prototype.addResult = function(label, data) {
	var doc = this.document;
	var labelText = doc.createTextNode(label);
	var dataText = doc.createTextNode(data);

	var row = doc.createElement("tr");

	var labelCell = doc.createElement("td");
	labelCell.style.fontSize = '10px';
	labelCell.style.width = '60px';
	labelCell.appendChild(labelText);

	var dataCell = doc.createElement("td");
	dataCell.style.fontSize = '10px';
	dataCell.appendChild(dataText);

	row.appendChild(labelCell);
	row.appendChild(dataCell);
	this.table.appendChild(row);
}
|
||||||
|
|
||||||
|
// Fades the progress box out in steps of 0.1 opacity every 100ms, then hides
// it by setting display to 'none'
Scholar.Ingester.Interface.Progress.prototype.fade = function() {
	// Keep a reference to this object for use inside the timer callback
	var me = this;

	// Count the remaining steps as an integer instead of repeatedly reading
	// the opacity back from the style and subtracting .1 from it. Repeated
	// float subtraction drifts (the last step computes roughly 1.4e-16 rather
	// than 0), and a value the style does not accept would leave the stored
	// opacity unchanged so that the fade never finished.
	var stepsLeft = Math.round(Number(me.div.style.MozOpacity) * 10);

	this._fader = function() {
		// Also true when the starting opacity was not a number
		if(!(stepsLeft > 0)) {
			me.div.style.display = 'none';
		} else {
			stepsLeft--;
			me.div.style.MozOpacity = String(stepsLeft / 10);
			setTimeout(me._fader, 100);
		}
	}

	// Begin fade
	this._fader();
}
|
||||||
|
|
|
@ -1,27 +0,0 @@
|
||||||
<?xml version="1.0" ?>
|
|
||||||
<?xml-stylesheet href="chrome://global/skin/" type="text/css"?>
|
|
||||||
<!DOCTYPE overlay SYSTEM "chrome://piggy-bank/locale/load-dom-dialog.dtd">
|
|
||||||
|
|
||||||
<window
|
|
||||||
xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul"
|
|
||||||
xmlns:xul="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul"
|
|
||||||
xmlns:html="http://www.w3.org/1999/xhtml"
|
|
||||||
id="scrape-progress"
|
|
||||||
windowtype="Options"
|
|
||||||
orient="vertical"
|
|
||||||
screenX="10" screenY="10"
|
|
||||||
persist="width height screenX screenY sizeMode"
|
|
||||||
title="Scraping Page…"
|
|
||||||
>
|
|
||||||
|
|
||||||
<hbox flex="1">
|
|
||||||
<vbox flex="1" style="padding: 10px">
|
|
||||||
<label value="Scraping Page…" />
|
|
||||||
<progressmeter id="progress" mode="undetermined" />
|
|
||||||
</vbox>
|
|
||||||
<resizer id="window-resizer" dir="bottomright"/>
|
|
||||||
<box style="visibility: collapse">
|
|
||||||
<tabbrowser id="hidden-browser" />
|
|
||||||
</box>
|
|
||||||
</hbox>
|
|
||||||
</window>
|
|
|
@ -356,7 +356,7 @@ Scholar.Ingester.Document.prototype.canScrape = function(currentScraper) {
|
||||||
currentScraper.scraperDetectCode +
|
currentScraper.scraperDetectCode +
|
||||||
"\n})()", scraperSandbox);
|
"\n})()", scraperSandbox);
|
||||||
} catch(e) {
|
} catch(e) {
|
||||||
throw e+' in scraper '+currentScraper.label;
|
throw e+' in scraperDetectCode for '+currentScraper.label;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return canScrape;
|
return canScrape;
|
||||||
|
@ -375,7 +375,11 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
|
||||||
|
|
||||||
var scraperSandbox = this.sandbox;
|
var scraperSandbox = this.sandbox;
|
||||||
|
|
||||||
|
try {
|
||||||
Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox);
|
Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox);
|
||||||
|
} catch(e) {
|
||||||
|
throw e+' in scraperJavaScript for '+this.scraper.label;
|
||||||
|
}
|
||||||
|
|
||||||
// If synchronous, call _scrapePageComplete();
|
// If synchronous, call _scrapePageComplete();
|
||||||
if(!scraperSandbox._waitForCompletion) {
|
if(!scraperSandbox._waitForCompletion) {
|
||||||
|
@ -413,7 +417,7 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
|
||||||
Scholar.Ingester.Document.prototype._scrapePageComplete = function() {
|
Scholar.Ingester.Document.prototype._scrapePageComplete = function() {
|
||||||
this._updateDatabase();
|
this._updateDatabase();
|
||||||
if(this._scrapeCallback) {
|
if(this._scrapeCallback) {
|
||||||
this._scrapeCallback();
|
this._scrapeCallback(this);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -469,5 +473,10 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
|
||||||
newItem.setCreator(0, firstName, lastName);
|
newItem.setCreator(0, firstName, lastName);
|
||||||
}
|
}
|
||||||
newItem.save();
|
newItem.save();
|
||||||
|
|
||||||
|
// First one is stored so as to be accessible
|
||||||
|
if(!this.item) {
|
||||||
|
this.item = newItem;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -22,3 +22,6 @@ itemTypes.journalArticle = Journal Article
|
||||||
creatorTypes.author = Author
|
creatorTypes.author = Author
|
||||||
creatorTypes.contributor = Contributor
|
creatorTypes.contributor = Contributor
|
||||||
creatorTypes.editor = Editor
|
creatorTypes.editor = Editor
|
||||||
|
|
||||||
|
ingester.scraping = Scraping Page...
|
||||||
|
ingester.scrapeComplete = Scraping Complete
|
Loading…
Add table
Reference in a new issue