XPCOM-ize the ingester; fix swapped first and last names in ingested info; stop ingesting the pages field (that field should hold the pages of the source that were used, not the total number of pages, right?)

This commit is contained in:
Simon Kornblith 2006-06-02 03:19:12 +00:00
parent 864fa537b5
commit 639a006efb
4 changed files with 74 additions and 23 deletions

View file

@ -2,19 +2,23 @@
// Utilities based on code taken from Greasemonkey // Utilities based on code taken from Greasemonkey
// This code is licensed according to the GPL // This code is licensed according to the GPL
// Prepare the browser and collector instrumentation caches --------------------
Scholar.Ingester.Interface = function() {} Scholar.Ingester.Interface = function() {}
/*
* Initialize some variables and prepare event listeners for when chrome is done
* loading
*/
Scholar.Ingester.Interface.init = function() { Scholar.Ingester.Interface.init = function() {
Scholar.Ingester.Interface.browsers = new Array(); Scholar.Ingester.Interface.browsers = new Array();
Scholar.Ingester.Interface.browserDocuments = new Object();
window.addEventListener("load", Scholar.Ingester.Interface.chromeLoad, false); window.addEventListener("load", Scholar.Ingester.Interface.chromeLoad, false);
window.addEventListener("unload", Scholar.Ingester.Interface.chromeUnload, false); window.addEventListener("unload", Scholar.Ingester.Interface.chromeUnload, false);
Scholar.Ingester.Interface.browsers = new Array();
Scholar.Ingester.Interface.browserDocuments = new Object();
} }
/*
* When chrome loads, register our event handlers with the appropriate interfaces
*/
Scholar.Ingester.Interface.chromeLoad = function() { Scholar.Ingester.Interface.chromeLoad = function() {
Scholar.Ingester.Interface.tabBrowser = document.getElementById("content"); Scholar.Ingester.Interface.tabBrowser = document.getElementById("content");
Scholar.Ingester.Interface.appContent = document.getElementById("appcontent"); Scholar.Ingester.Interface.appContent = document.getElementById("appcontent");
@ -28,10 +32,23 @@ Scholar.Ingester.Interface.chromeLoad = function() {
Scholar.Ingester.Interface.contentLoad, true); Scholar.Ingester.Interface.contentLoad, true);
} }
/*
* When chrome unloads, delete our document objects and remove our listeners
*/
Scholar.Ingester.Interface.chromeUnload = function() { Scholar.Ingester.Interface.chromeUnload = function() {
this.tabBrowser.removeProgressListener(this); delete Scholar.Ingester.Interface.browserDocuments;
this.tabBrowser.removeProgressListener(this);
} }
/*
* Gets a document object given a browser window object
*
* NOTE: Browser objects are associated with document objects via keys generated
* from the time the browser object is opened. I'm not sure if this is the
* appropriate mechanism for handling this, but it's what PiggyBank used and it
* appears to work.
*/
Scholar.Ingester.Interface.getDocument = function(browser) { Scholar.Ingester.Interface.getDocument = function(browser) {
try { try {
var key = browser.getAttribute("scholar-key"); var key = browser.getAttribute("scholar-key");
@ -42,6 +59,10 @@ Scholar.Ingester.Interface.getDocument = function(browser) {
return false; return false;
} }
/*
* Creates a new document object for a browser window object, attempts to
* retrieve appropriate scraper
*/
Scholar.Ingester.Interface.setDocument = function(browser) { Scholar.Ingester.Interface.setDocument = function(browser) {
try { try {
var key = browser.getAttribute("scholar-key"); var key = browser.getAttribute("scholar-key");
@ -55,6 +76,9 @@ Scholar.Ingester.Interface.setDocument = function(browser) {
Scholar.Ingester.Interface.browserDocuments[key].retrieveScraper(); Scholar.Ingester.Interface.browserDocuments[key].retrieveScraper();
} }
/*
* Deletes the document object associated with a given browser window object
*/
Scholar.Ingester.Interface.deleteDocument = function(browser) { Scholar.Ingester.Interface.deleteDocument = function(browser) {
try { try {
var key = browser.getAttribute("scholar-key"); var key = browser.getAttribute("scholar-key");
@ -66,6 +90,9 @@ Scholar.Ingester.Interface.deleteDocument = function(browser) {
return false; return false;
} }
/*
* Scrapes a page (called when the capture icon is clicked)
*/
Scholar.Ingester.Interface.scrapeThisPage = function() { Scholar.Ingester.Interface.scrapeThisPage = function() {
var document = Scholar.Ingester.Interface.getDocument(Scholar.Ingester.Interface.tabBrowser.selectedBrowser); var document = Scholar.Ingester.Interface.getDocument(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
if(document.scraper) { if(document.scraper) {
@ -73,6 +100,10 @@ Scholar.Ingester.Interface.scrapeThisPage = function() {
} }
} }
/*
* Updates the status of the capture icon to reflect the scrapability or lack
* thereof of the current page
*/
Scholar.Ingester.Interface.updateStatus = function(browser) { Scholar.Ingester.Interface.updateStatus = function(browser) {
var document = Scholar.Ingester.Interface.getDocument(browser); var document = Scholar.Ingester.Interface.getDocument(browser);
if(document && document.scraper) { if(document && document.scraper) {
@ -82,16 +113,33 @@ Scholar.Ingester.Interface.updateStatus = function(browser) {
} }
} }
/*
* An event handler called when a new document is loaded. Creates a new document
* object, and updates the status of the capture icon
*
* FIXME: This approach, again borrowed from PiggyBank, does not work properly
* when the newly loaded page is not the currently selected page. For example,
* if a tab is loaded behind the currently selected page, the ingester will not
* create a new object for it.
*/
Scholar.Ingester.Interface.contentLoad = function() { Scholar.Ingester.Interface.contentLoad = function() {
Scholar.Ingester.Interface.setDocument(Scholar.Ingester.Interface.tabBrowser.selectedBrowser); Scholar.Ingester.Interface.setDocument(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
Scholar.Ingester.Interface.updateStatus(Scholar.Ingester.Interface.tabBrowser.selectedBrowser); Scholar.Ingester.Interface.updateStatus(Scholar.Ingester.Interface.tabBrowser.selectedBrowser);
} }
/*
* Dummy event handlers for all the events we don't care about
*/
Scholar.Ingester.Interface.Listener = function() {} Scholar.Ingester.Interface.Listener = function() {}
Scholar.Ingester.Interface.Listener.onStatusChange = function() {} Scholar.Ingester.Interface.Listener.onStatusChange = function() {}
Scholar.Ingester.Interface.Listener.onSecurityChange = function() {} Scholar.Ingester.Interface.Listener.onSecurityChange = function() {}
Scholar.Ingester.Interface.Listener.onProgressChange = function() {} Scholar.Ingester.Interface.Listener.onProgressChange = function() {}
Scholar.Ingester.Interface.Listener.onStateChange = function() {} Scholar.Ingester.Interface.Listener.onStateChange = function() {}
/*
* onLocationChange is called when tabs are switched. Use it to retrieve the
* appropriate status indicator for the current tab, and to free useless objects
*/
Scholar.Ingester.Interface.Listener.onLocationChange = function() { Scholar.Ingester.Interface.Listener.onLocationChange = function() {
var browsers = Scholar.Ingester.Interface.tabBrowser.browsers; var browsers = Scholar.Ingester.Interface.tabBrowser.browsers;
@ -114,7 +162,7 @@ Scholar.Ingester.Interface.Listener.onLocationChange = function() {
Scholar.Ingester.Interface.deleteDocument(browser); Scholar.Ingester.Interface.deleteDocument(browser);
} }
} }
/*// Add a collector to any new browser /*// Add a collector to any new browser
for (var i = 0; i < browsers.length; i++) { for (var i = 0; i < browsers.length; i++) {
var browser = browsers[i]; var browser = browsers[i];

View file

@ -6,8 +6,7 @@
<overlay id="scholar-ingester-overlay" xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul"> <overlay id="scholar-ingester-overlay" xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul">
<script src="../include.js"/> <script src="../include.js"/>
<script src="ingester.js"/>
<script src="browser.js"/> <script src="browser.js"/>
<script type="application/x-javascript"> <script type="application/x-javascript">

View file

@ -83,7 +83,7 @@ Scholar.Ingester.Utilities.prototype.trimString = function(s) {
Scholar.Ingester.Utilities.prototype.gatherElementsOnXPath = function(doc, parentNode, xpath, nsResolver) { Scholar.Ingester.Utilities.prototype.gatherElementsOnXPath = function(doc, parentNode, xpath, nsResolver) {
var elmts = []; var elmts = [];
var iterator = doc.evaluate(xpath, parentNode, nsResolver, XPathResult.ANY_TYPE,null); var iterator = doc.evaluate(xpath, parentNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null);
var elmt = iterator.iterateNext(); var elmt = iterator.iterateNext();
var i = 0; var i = 0;
while (elmt) { while (elmt) {
@ -180,7 +180,7 @@ Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, su
var urls = []; var urls = [];
var addedURLs = []; var addedURLs = [];
var aElements = doc.evaluate("//a", doc, null, XPathResult.ANY_TYPE,null); var aElements = doc.evaluate("//a", doc, null, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null);
var aElement = aElements.iterateNext(); var aElement = aElements.iterateNext();
while (aElement) { while (aElement) {
var href = aElement.href; var href = aElement.href;
@ -294,7 +294,6 @@ Scholar.Ingester.Utilities.HTTPUtilities.prototype.stateChange = function(xmlhtt
* *
* Private properties: * Private properties:
* _sandbox - sandbox for code execution * _sandbox - sandbox for code execution
* _progressDialog - dialog showing scrape progress
*/ */
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
@ -365,15 +364,17 @@ Scholar.Ingester.Document.prototype.canScrape = function(currentScraper) {
/* /*
* Populate model with semantic data regarding this page using _scraper_ * Populate model with semantic data regarding this page using _scraper_
* Callback will be executed once scraping is complete
*/ */
Scholar.Ingester.Document.prototype.scrapePage = function() { Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
if(callback) {
this._scrapeCallback = callback;
}
Scholar.debug("Scraping "+this.browser.contentDocument.location.href); Scholar.debug("Scraping "+this.browser.contentDocument.location.href);
var scraperSandbox = this.sandbox; var scraperSandbox = this.sandbox;
this._progressDialog = openDialog("chrome://scholar/content/ingester/scrape-progress.xul",
"_blank", "chrome,all,dialog=no", null, null, null);
Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox); Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox);
// If synchronous, call _scrapePageComplete(); // If synchronous, call _scrapePageComplete();
@ -406,12 +407,14 @@ Scholar.Ingester.Document.prototype.scrapePage = function() {
* function before returning * function before returning
*/ */
/* /*
* Called when scraping (synchronous or asynchronous) is complete * Called when scraping (synchronous or asynchronous) is complete
*/ */
Scholar.Ingester.Document.prototype._scrapePageComplete = function() { Scholar.Ingester.Document.prototype._scrapePageComplete = function() {
this._updateDatabase(); this._updateDatabase();
this._progressDialog.close(); if(this._scrapeCallback) {
this._scrapeCallback();
}
} }
Scholar.Ingester.Document.prototype._generateSandbox = function() { Scholar.Ingester.Document.prototype._generateSandbox = function() {
@ -420,7 +423,7 @@ Scholar.Ingester.Document.prototype._generateSandbox = function() {
this.sandbox.doc = this.sandbox.browser.contentDocument; this.sandbox.doc = this.sandbox.browser.contentDocument;
this.sandbox.utilities = new Scholar.Ingester.Utilities; this.sandbox.utilities = new Scholar.Ingester.Utilities;
this.sandbox.model = this.model; this.sandbox.model = this.model;
this.sandbox.XPathResult = XPathResult; this.sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;
this.sandbox.wait = function(){ this._waitForCompletion = true; }; this.sandbox.wait = function(){ this._waitForCompletion = true; };
this.sandbox.done = function(){ this._scrapePageComplete(); }; this.sandbox.done = function(){ this._scrapePageComplete(); };
@ -456,15 +459,12 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
if(this.model.data[uri][prefixDC + 'identifier']) { if(this.model.data[uri][prefixDC + 'identifier']) {
newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'].substring(5)); newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'].substring(5));
} }
if(this.model.data[uri][prefixDummy + 'pages']) {
newItem.setField("pages", this.model.data[uri][prefixDummy + 'pages']);
}
if(this.model.data[uri][prefixDC + 'creator']) { if(this.model.data[uri][prefixDC + 'creator']) {
var creator = this.model.data[uri][prefixDC + 'creator']; var creator = this.model.data[uri][prefixDC + 'creator'];
var spaceIndex = creator.lastIndexOf(" "); var spaceIndex = creator.lastIndexOf(" ");
var firstName = creator.substring(spaceIndex+1, creator.length); var lastName = creator.substring(spaceIndex+1, creator.length);
var lastName = creator.substring(0, spaceIndex); var firstName = creator.substring(0, spaceIndex);
newItem.setCreator(0, firstName, lastName); newItem.setCreator(0, firstName, lastName);
} }

View file

@ -31,6 +31,10 @@ Cc["@mozilla.org/moz/jssubscript-loader;1"]
.getService(Ci.mozIJSSubScriptLoader) .getService(Ci.mozIJSSubScriptLoader)
.loadSubScript("chrome://scholar/content/xpcom/notifier.js"); .loadSubScript("chrome://scholar/content/xpcom/notifier.js");
Cc["@mozilla.org/moz/jssubscript-loader;1"]
.getService(Ci.mozIJSSubScriptLoader)
.loadSubScript("chrome://scholar/content/xpcom/ingester.js");
/********************************************************************/ /********************************************************************/