XPCOM-ize ingester, fix swapped first and last name in ingested info, stop ingesting pages field (this should be for pages of the source used, not the total number of pages, right?)
This commit is contained in:
parent
864fa537b5
commit
639a006efb
4 changed files with 74 additions and 23 deletions
|
@ -2,19 +2,23 @@
|
|||
// Utilities based on code taken from Greasemonkey
|
||||
// This code is licensed according to the GPL
|
||||
|
||||
// Prepare the browser and collector instrumentation caches --------------------
|
||||
Scholar.Ingester.Interface = function() {}
|
||||
|
||||
/*
|
||||
* Initialize some variables and prepare event listeners for when chrome is done
|
||||
* loading
|
||||
*/
|
||||
Scholar.Ingester.Interface.init = function() {
|
||||
Scholar.Ingester.Interface.browsers = new Array();
|
||||
Scholar.Ingester.Interface.browserDocuments = new Object();
|
||||
|
||||
window.addEventListener("load", Scholar.Ingester.Interface.chromeLoad, false);
|
||||
window.addEventListener("unload", Scholar.Ingester.Interface.chromeUnload, false);
|
||||
|
||||
Scholar.Ingester.Interface.browsers = new Array();
|
||||
Scholar.Ingester.Interface.browserDocuments = new Object();
|
||||
}
|
||||
|
||||
/*
|
||||
* When chrome loads, register our event handlers with the appropriate interfaces
|
||||
*/
|
||||
Scholar.Ingester.Interface.chromeLoad = function() {
|
||||
Scholar.Ingester.Interface.tabBrowser = document.getElementById("content");
|
||||
Scholar.Ingester.Interface.appContent = document.getElementById("appcontent");
|
||||
|
@ -28,10 +32,23 @@ Scholar.Ingester.Interface.chromeLoad = function() {
|
|||
Scholar.Ingester.Interface.contentLoad, true);
|
||||
}
|
||||
|
||||
/*
|
||||
* When chrome unloads, delete our document objects and remove our listeners
|
||||
*/
|
||||
Scholar.Ingester.Interface.chromeUnload = function() {
|
||||
this.tabBrowser.removeProgressListener(this);
|
||||
delete Scholar.Ingester.Interface.browserDocuments;
|
||||
this.tabBrowser.removeProgressListener(this);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Gets a document object given a browser window object
|
||||
*
|
||||
* NOTE: Browser objects are associated with document objects via keys generated
|
||||
* from the time the browser object is opened. I'm not sure if this is the
|
||||
* appropriate mechanism for handling this, but it's what PiggyBank used and it
|
||||
* appears to work.
|
||||
*/
|
||||
Scholar.Ingester.Interface.getDocument = function(browser) {
|
||||
try {
|
||||
var key = browser.getAttribute("scholar-key");
|
||||
|
@ -42,6 +59,10 @@ Scholar.Ingester.Interface.getDocument = function(browser) {
|
|||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Creates a new document object for a browser window object, attempts to
|
||||
* retrieve appropriate scraper
|
||||
*/
|
||||
Scholar.Ingester.Interface.setDocument = function(browser) {
|
||||
try {
|
||||
var key = browser.getAttribute("scholar-key");
|
||||
|
@ -55,6 +76,9 @@ Scholar.Ingester.Interface.setDocument = function(browser) {
|
|||
Scholar.Ingester.Interface.browserDocuments[key].retrieveScraper();
|
||||
}
|
||||
|
||||
/*
|
||||
* Deletes the document object associated with a given browser window object
|
||||
*/
|
||||
Scholar.Ingester.Interface.deleteDocument = function(browser) {
|
||||
try {
|
||||
var key = browser.getAttribute("scholar-key");
|
||||
|
@ -66,6 +90,9 @@ Scholar.Ingester.Interface.deleteDocument = function(browser) {
|
|||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Scrapes a page (called when the capture icon is clicked)
|
||||
*/
|
||||
Scholar.Ingester.Interface.scrapeThisPage = function() {
|
||||
var document = Scholar.Ingester.Interface.getDocument(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
|
||||
if(document.scraper) {
|
||||
|
@ -73,6 +100,10 @@ Scholar.Ingester.Interface.scrapeThisPage = function() {
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Updates the status of the capture icon to reflect the scrapability or lack
|
||||
* thereof of the current page
|
||||
*/
|
||||
Scholar.Ingester.Interface.updateStatus = function(browser) {
|
||||
var document = Scholar.Ingester.Interface.getDocument(browser);
|
||||
if(document && document.scraper) {
|
||||
|
@ -82,16 +113,33 @@ Scholar.Ingester.Interface.updateStatus = function(browser) {
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* An event handler called when a new document is loaded. Creates a new document
|
||||
* object, and updates the status of the capture icon
|
||||
*
|
||||
* FIXME: This approach, again borrowed from PiggyBank, does not work properly
|
||||
* when the newly loaded page is not the currently selected page. For example,
|
||||
* if a tab is loaded behind the currently selected page, the ingester will not
|
||||
* create a new object for it.
|
||||
*/
|
||||
Scholar.Ingester.Interface.contentLoad = function() {
|
||||
Scholar.Ingester.Interface.setDocument(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
|
||||
Scholar.Ingester.Interface.updateStatus(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
|
||||
}
|
||||
|
||||
/*
|
||||
* Dummy event handlers for all the events we don't care about
|
||||
*/
|
||||
Scholar.Ingester.Interface.Listener = function() {}
|
||||
Scholar.Ingester.Interface.Listener.onStatusChange = function() {}
|
||||
Scholar.Ingester.Interface.Listener.onSecurityChange = function() {}
|
||||
Scholar.Ingester.Interface.Listener.onProgressChange = function() {}
|
||||
Scholar.Ingester.Interface.Listener.onStateChange = function() {}
|
||||
|
||||
/*
|
||||
* onLocationChange is called when tabs are switched. Use it to retrieve the
|
||||
* appropriate status indicator for the current tab, and to free useless objects
|
||||
*/
|
||||
Scholar.Ingester.Interface.Listener.onLocationChange = function() {
|
||||
var browsers = Scholar.Ingester.Interface.tabBrowser.browsers;
|
||||
|
||||
|
@ -114,7 +162,7 @@ Scholar.Ingester.Interface.Listener.onLocationChange = function() {
|
|||
Scholar.Ingester.Interface.deleteDocument(browser);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*// Add a collector to any new browser
|
||||
for (var i = 0; i < browsers.length; i++) {
|
||||
var browser = browsers[i];
|
||||
|
|
|
@ -6,8 +6,7 @@
|
|||
<overlay id="scholar-ingester-overlay" xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul">
|
||||
|
||||
<script src="../include.js"/>
|
||||
|
||||
<script src="ingester.js"/>
|
||||
|
||||
<script src="browser.js"/>
|
||||
|
||||
<script type="application/x-javascript">
|
||||
|
|
|
@ -83,7 +83,7 @@ Scholar.Ingester.Utilities.prototype.trimString = function(s) {
|
|||
Scholar.Ingester.Utilities.prototype.gatherElementsOnXPath = function(doc, parentNode, xpath, nsResolver) {
|
||||
var elmts = [];
|
||||
|
||||
var iterator = doc.evaluate(xpath, parentNode, nsResolver, XPathResult.ANY_TYPE,null);
|
||||
var iterator = doc.evaluate(xpath, parentNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null);
|
||||
var elmt = iterator.iterateNext();
|
||||
var i = 0;
|
||||
while (elmt) {
|
||||
|
@ -180,7 +180,7 @@ Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, su
|
|||
var urls = [];
|
||||
var addedURLs = [];
|
||||
|
||||
var aElements = doc.evaluate("//a", doc, null, XPathResult.ANY_TYPE,null);
|
||||
var aElements = doc.evaluate("//a", doc, null, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null);
|
||||
var aElement = aElements.iterateNext();
|
||||
while (aElement) {
|
||||
var href = aElement.href;
|
||||
|
@ -294,7 +294,6 @@ Scholar.Ingester.Utilities.HTTPUtilities.prototype.stateChange = function(xmlhtt
|
|||
*
|
||||
* Private properties:
|
||||
* _sandbox - sandbox for code execution
|
||||
* _progressDialog - dialog showing scrape progress
|
||||
*/
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -365,15 +364,17 @@ Scholar.Ingester.Document.prototype.canScrape = function(currentScraper) {
|
|||
|
||||
/*
|
||||
* Populate model with semantic data regarding this page using _scraper_
|
||||
* Callback will be executed once scraping is complete
|
||||
*/
|
||||
Scholar.Ingester.Document.prototype.scrapePage = function() {
|
||||
Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
|
||||
if(callback) {
|
||||
this._scrapeCallback = callback;
|
||||
}
|
||||
|
||||
Scholar.debug("Scraping "+this.browser.contentDocument.location.href);
|
||||
|
||||
var scraperSandbox = this.sandbox;
|
||||
|
||||
this._progressDialog = openDialog("chrome://scholar/content/ingester/scrape-progress.xul",
|
||||
"_blank", "chrome,all,dialog=no", null, null, null);
|
||||
|
||||
Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox);
|
||||
|
||||
// If synchronous, call _scrapePageComplete();
|
||||
|
@ -406,12 +407,14 @@ Scholar.Ingester.Document.prototype.scrapePage = function() {
|
|||
* function before returning
|
||||
*/
|
||||
|
||||
/*
|
||||
/*
|
||||
* Called when scraping (synchronous or asynchronous) is complete
|
||||
*/
|
||||
Scholar.Ingester.Document.prototype._scrapePageComplete = function() {
|
||||
this._updateDatabase();
|
||||
this._progressDialog.close();
|
||||
if(this._scrapeCallback) {
|
||||
this._scrapeCallback();
|
||||
}
|
||||
}
|
||||
|
||||
Scholar.Ingester.Document.prototype._generateSandbox = function() {
|
||||
|
@ -420,7 +423,7 @@ Scholar.Ingester.Document.prototype._generateSandbox = function() {
|
|||
this.sandbox.doc = this.sandbox.browser.contentDocument;
|
||||
this.sandbox.utilities = new Scholar.Ingester.Utilities;
|
||||
this.sandbox.model = this.model;
|
||||
this.sandbox.XPathResult = XPathResult;
|
||||
this.sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;
|
||||
|
||||
this.sandbox.wait = function(){ this._waitForCompletion = true; };
|
||||
this.sandbox.done = function(){ this._scrapePageComplete(); };
|
||||
|
@ -456,15 +459,12 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
|
|||
if(this.model.data[uri][prefixDC + 'identifier']) {
|
||||
newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'].substring(5));
|
||||
}
|
||||
if(this.model.data[uri][prefixDummy + 'pages']) {
|
||||
newItem.setField("pages", this.model.data[uri][prefixDummy + 'pages']);
|
||||
}
|
||||
if(this.model.data[uri][prefixDC + 'creator']) {
|
||||
var creator = this.model.data[uri][prefixDC + 'creator'];
|
||||
|
||||
var spaceIndex = creator.lastIndexOf(" ");
|
||||
var firstName = creator.substring(spaceIndex+1, creator.length);
|
||||
var lastName = creator.substring(0, spaceIndex);
|
||||
var lastName = creator.substring(spaceIndex+1, creator.length);
|
||||
var firstName = creator.substring(0, spaceIndex);
|
||||
|
||||
newItem.setCreator(0, firstName, lastName);
|
||||
}
|
|
@ -31,6 +31,10 @@ Cc["@mozilla.org/moz/jssubscript-loader;1"]
|
|||
.getService(Ci.mozIJSSubScriptLoader)
|
||||
.loadSubScript("chrome://scholar/content/xpcom/notifier.js");
|
||||
|
||||
Cc["@mozilla.org/moz/jssubscript-loader;1"]
|
||||
.getService(Ci.mozIJSSubScriptLoader)
|
||||
.loadSubScript("chrome://scholar/content/xpcom/ingester.js");
|
||||
|
||||
/********************************************************************/
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue