- Made ingester automatically create hidden browser objects, given a window object. This should make things much easier for both David and me.

- Multiple item detection code is now a part of the scraperJavaScript, rather than the scrapeDetectCode, and code to choose which items to add is part of Scholar.Ingester.Utilities, accessible from inside scrapers. The alternative approach would result in one request (or, in the case of JSTOR, three requests) per new item, while in some cases (e.g. Voyager) only one request is necessary to get all of the items.
This commit is contained in:
Simon Kornblith 2006-06-22 15:50:46 +00:00
parent 726364d091
commit 3890e5f122
5 changed files with 109 additions and 95 deletions

View file

@ -35,7 +35,6 @@ Scholar_Ingester_Interface.init = function() {
*/
Scholar_Ingester_Interface.chromeLoad = function() {
Scholar_Ingester_Interface.tabBrowser = document.getElementById("content");
Scholar_Ingester_Interface.hiddenBrowser = document.getElementById("scholar-hidden-browser");
Scholar_Ingester_Interface.appContent = document.getElementById("appcontent");
Scholar_Ingester_Interface.statusImage = document.getElementById("scholar-status-image");
@ -61,21 +60,11 @@ Scholar_Ingester_Interface.chromeUnload = function() {
Scholar_Ingester_Interface.scrapeThisPage = function() {
var documentObject = Scholar_Ingester_Interface._getDocument(Scholar_Ingester_Interface.tabBrowser.selectedBrowser);
if(documentObject.scraper) {
if(documentObject.scrapeURLList) {
// In the case that there are multiple scrapable URLs, make the user choose
Scholar_Ingester_Interface.chooseURL(documentObject);
}
Scholar_Ingester_Interface.scrapeProgress = new Scholar_Ingester_Interface.Progress(window, Scholar_Ingester_Interface.tabBrowser.selectedBrowser.contentDocument, Scholar.getString("ingester.scraping"));
documentObject.scrapePage(Scholar_Ingester_Interface._finishScraping);
}
}
Scholar_Ingester_Interface.chooseURL = function(documentObject) {
Scholar.debug("chooseURL called");
var newDialog = window.openDialog("chrome://scholar/content/ingester/selectitems.xul",
"_blank","chrome,modal,centerscreen,resizable=yes", documentObject);
}
/*
* Updates the status of the capture icon to reflect the scrapability or lack
* thereof of the current page
@ -182,7 +171,7 @@ Scholar_Ingester_Interface._setDocument = function(browser) {
browser.setAttribute("scholar-key", key);
}
}
Scholar_Ingester_Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser, Scholar_Ingester_Interface.hiddenBrowser);
Scholar_Ingester_Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser, window);
Scholar_Ingester_Interface.browserDocuments[key].retrieveScraper();
}
@ -203,7 +192,7 @@ Scholar_Ingester_Interface._deleteDocument = function(browser) {
/*
* Callback to be executed when scraping is complete
*/
Scholar_Ingester_Interface._finishScraping = function(obj) {
Scholar_Ingester_Interface._finishScraping = function(obj, returnValue) {
if(obj.items.length) {
try { // Encased in a try block to fix a as-of-yet unresolved issue
var item1 = obj.items[0];
@ -243,12 +232,14 @@ Scholar_Ingester_Interface._finishScraping = function(obj) {
for(i in obj.items) {
obj.items[i].save();
}
setTimeout(function() { Scholar_Ingester_Interface.scrapeProgress.fade() }, 2000);
} else if(returnValue) {
Scholar_Ingester_Interface.scrapeProgress.kill();
} else {
Scholar_Ingester_Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeError"));
Scholar_Ingester_Interface.scrapeProgress.addDescription(Scholar.getString("ingester.scrapeErrorDescription"));
setTimeout(function() { Scholar_Ingester_Interface.scrapeProgress.fade() }, 2000);
}
setTimeout(function() { Scholar_Ingester_Interface.scrapeProgress.fade() }, 2000);
}
//////////////////////////////////////////////////////////////////////////////
@ -333,7 +324,6 @@ Scholar_Ingester_Interface.Progress.prototype.addDescription = function(descript
this.table.appendChild(tr);
}
Scholar_Ingester_Interface.Progress.prototype.fade = function() {
// Icky, icky hack to keep objects
var me = this;
@ -349,3 +339,8 @@ Scholar_Ingester_Interface.Progress.prototype.fade = function() {
// Begin fade
this._fader();
}
/*
 * Hides the progress display immediately by setting the CSS display of
 * this.div to 'none'.
 */
Scholar_Ingester_Interface.Progress.prototype.kill = function() {
	var progressStyle = this.div.style;
	progressStyle.display = 'none';
}

View file

@ -12,10 +12,4 @@
<hbox id="urlbar-icons">
<image src="chrome://scholar/skin/treeitem-book.png" id="scholar-status-image" onclick="Scholar_Ingester_Interface.scrapeThisPage()" position="1" hidden="true"/>
</hbox>
<window id="main-window">
<box style="visibility: collapse">
<browser id="scholar-hidden-browser" />
</box>
</window>
</overlay>

View file

@ -19,26 +19,26 @@ Scholar_Ingester_Interface_SelectItems = function() {}
* loading
*/
Scholar_Ingester_Interface_SelectItems.init = function() {
this.documentObject = window.arguments[0];
this.io = window.arguments[0];
this.Scholar_Ingester_Interface = window.arguments[1];
this.listbox = document.getElementById("scholar-selectitems-links");
for(i in this.documentObject.scrapeURLList) { // we could use a tree for this if we wanted to
for(i in this.io.dataIn) { // we could use a tree for this if we wanted to
var itemNode = document.createElement("listitem");
itemNode.setAttribute("type", "checkbox");
itemNode.setAttribute("value", i);
itemNode.setAttribute("label", this.documentObject.scrapeURLList[i]);
itemNode.setAttribute("label", this.io.dataIn[i]);
itemNode.setAttribute("checked", false);
this.listbox.appendChild(itemNode);
}
}
Scholar_Ingester_Interface_SelectItems.acceptSelection = function() {
// clear scrapeURLList
this.documentObject.scrapeURLList = new Object();
this.io.dataOut = new Object();
// collect scrapeURLList from listbox
for(var i=0; i<this.listbox.length; i++) {
var itemNode = this.listbox[i];
this.documentObject.scrapeURLList[itemNode.getAttribute("value")] = itemNode.getAttribute("label");
this.io.dataOut[itemNode.getAttribute("value")] = itemNode.getAttribute("label");
}
}

View file

@ -4,6 +4,21 @@
Scholar.Ingester = new function() {}
/*
 * Creates a new <browser> element inside the given chrome window and
 * returns it.
 *
 * myWindow - window whose document is used to create the element; the
 *            element is appended to the first <window> element found in
 *            that document
 *
 * NOTE(review): nothing here sets an attribute or style that hides the
 * element - confirm it is not visible in the UI.
 */
Scholar.Ingester.createHiddenBrowser = function(myWindow) {
	var ownerDocument = myWindow.document;
	var browserElement = ownerDocument.createElement("browser");
	// Attach the element to the first <window> element of the document
	ownerDocument.getElementsByTagName("window")[0].appendChild(browserElement);
	Scholar.debug("created hidden browser");
	return browserElement;
}
/*
 * Removes a browser element created by createHiddenBrowser from its
 * document.
 *
 * myBrowser - the browser element to remove; a missing or already detached
 *             element is ignored
 */
Scholar.Ingester.deleteHiddenBrowser = function(myBrowser) {
	// The "delete" operator has no effect on a local variable, so the
	// element has to be detached from its parent to actually go away
	if(myBrowser && myBrowser.parentNode) {
		myBrowser.parentNode.removeChild(myBrowser);
	}
	Scholar.debug("deleted hidden browser");
}
/////////////////////////////////////////////////////////////////
//
// Scholar.Ingester.Model
@ -48,8 +63,8 @@ Scholar.Ingester.Model.prototype.detachRepository = function() {}
/////////////////////////////////////////////////////////////////
// Scholar.Ingester.Utilities class, a set of methods to assist in data
// extraction. Most code here was stolen directly from the Piggy Bank project.
Scholar.Ingester.Utilities = function(hiddenBrowser) {
this._hiddenBrowser = hiddenBrowser;
Scholar.Ingester.Utilities = function(myWindow) {
this.window = myWindow;
}
// Adapter for Piggy Bank function to print debug messages; log level is
@ -115,7 +130,7 @@ Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succe
// exception - a function to execute if an exception occurs (exceptions are
// also logged in the Firefox Scholar log)
Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
var hiddenBrowser = this._hiddenBrowser;
var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(this.window);
Scholar.debug("processDocuments called");
try {
@ -141,26 +156,23 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD
exception(e);
}
} else {
Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser);
hiddenBrowser.setTimeout(done, 10);
}
};
var onLoad = function() {
Scholar.debug("onLoad called");
if(hiddenBrowser.id == "scholar-hidden-browser") {
hiddenBrowser.removeEventListener("load", onLoad, true);
try {
var newHiddenBrowser = new Object();
Scholar.debug("new hidden browser");
newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
Scholar.debug("added attributes");
processor(newHiddenBrowser);
Scholar.debug("called processor");
} catch (e) {
Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
exception(e);
}
hiddenBrowser.removeEventListener("load", onLoad, true);
try {
var newHiddenBrowser = new Object();
newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
processor(newHiddenBrowser);
} catch (e) {
Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
exception(e);
}
doLoad();
};
var init = function() {
Scholar.debug("init called");
@ -302,6 +314,50 @@ Scholar.Ingester.Utilities.prototype.cleanTags = function(x) {
return x.replace(/<[^>]+>/g, "");
}
/*
 * Allows a user to select which items to scrape
 *
 * itemList - handed to the selection dialog as io.dataIn
 *
 * Returns io.dataOut, which is null unless the dialog assigns something
 * to it
 */
Scholar.Ingester.Utilities.prototype.selectItems = function(itemList) {
	// The dialog gets this object as its window argument: it reads the
	// choices from dataIn and is expected to leave the selection in dataOut.
	// The "modal" feature is requested, so openDialog should not return
	// until the dialog has been closed.
	var io = new Object();
	io.dataIn = itemList;
	io.dataOut = null;
	this.window.openDialog("chrome://scholar/content/ingester/selectitems.xul",
		"_blank", "chrome,modal,centerscreen,resizable=yes", io);
	return io.dataOut;
}
/*
 * Grabs items based on URLs
 *
 * doc - document passed through to getNodeString
 * inHere - node whose descendant <a> elements are examined
 * urlRe - pattern that a link's href must match for the link to be included
 * rejectRe - pattern that the link text must NOT match; if omitted or empty,
 *            no link is rejected because of its text
 *
 * Returns an object mapping each matching href to its cleaned link text.
 * When several links share an href, their texts are joined with a space.
 */
Scholar.Ingester.Utilities.prototype.getItemArray = function(doc, inHere, urlRe, rejectRe) {
	var availableItems = new Object();	// Technically, associative arrays are objects
	
	// Require link to match this
	var tagRegexp = new RegExp(urlRe);
	// Do not allow text to match this. A missing pattern would otherwise
	// become an empty regexp, which matches (and so rejects) every link.
	var rejectRegexp = rejectRe ? new RegExp(rejectRe) : null;
	
	var links = inHere.getElementsByTagName("a");
	for(var i=0; i<links.length; i++) {
		var href = links[i].href;
		if(!tagRegexp.test(href)) {
			continue;
		}
		
		var text = this.getNodeString(doc, links[i], './/text()', null);
		if(!text) {
			continue;
		}
		
		text = this.cleanString(text);
		if(rejectRegexp && rejectRegexp.test(text)) {
			continue;
		}
		
		if(availableItems[href]) {
			// Same target linked more than once: keep all of the text
			availableItems[href] += " "+text;
		} else {
			availableItems[href] = text;
		}
	}
	
	return availableItems;
}
// These functions are for use by importMARCRecord. They're private, because,
// while they are useful, it's also nice if as many of our scrapers as possible
// are PiggyBank compatible, and if our scrapers used functions, that would
@ -512,14 +568,14 @@ Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatu
/*
* Constructor for Document object
*/
Scholar.Ingester.Document = function(browserWindow, hiddenBrowser){
Scholar.Ingester.Document = function(browserWindow, myWindow){
this.scraper = null;
this.browser = browserWindow;
this.window = myWindow;
this.model = new Scholar.Ingester.Model();
this.items = new Array();
this._appSvc = Cc["@mozilla.org/appshell/appShellService;1"]
.getService(Ci.nsIAppShellService);
this._hiddenBrowser = hiddenBrowser;
this._generateSandbox();
}
@ -596,17 +652,19 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
var scraperSandbox = this._sandbox;
try {
Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox);
var returnValue = Components.utils.evalInSandbox("(function(){\n" +
this.scraper.scraperJavaScript +
"\n})()", scraperSandbox);
} catch(e) {
Scholar.debug(e+' in scraperJavaScript for '+this.scraper.label);
this._scrapePageComplete();
this._scrapePageComplete(false);
return;
}
// If synchronous, call _scrapePageComplete();
if(!this._waitForCompletion) {
Scholar.debug("is asynch");
this._scrapePageComplete();
this._scrapePageComplete(returnValue);
}
}
@ -637,10 +695,10 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
/*
* Called when scraping (synchronous or asynchronous) is complete
*/
Scholar.Ingester.Document.prototype._scrapePageComplete = function() {
Scholar.Ingester.Document.prototype._scrapePageComplete = function(returnValue) {
this._updateDatabase();
if(this._scrapeCallback) {
this._scrapeCallback(this);
this._scrapeCallback(this, returnValue);
}
}
@ -651,7 +709,7 @@ Scholar.Ingester.Document.prototype._generateSandbox = function() {
this._sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
this._sandbox.browser = this.browser;
this._sandbox.doc = this._sandbox.browser.contentDocument;
this._sandbox.utilities = new Scholar.Ingester.Utilities(this._hiddenBrowser);
this._sandbox.utilities = new Scholar.Ingester.Utilities(this.window);
this._sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this._appSvc.hiddenDOMWindow);
this._sandbox.window = this.window;
this._sandbox.model = this.model;

View file

@ -175,48 +175,7 @@ utilities.HTTPUtilities.doPost(newUri, ''exportselect=record&exporttype=plaintex
wait();');
REPLACE INTO "scrapers" VALUES('88915634-1af6-c134-0171-56fd198235ed', '2006-06-21 22:44:00', 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi',
'if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) {
// We have search results
var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
if (prefix == ''x'') return namespace; else return null;
} : null;
var availableItems = new Object(); // Technically, associative arrays are objects
// Require link to match this
var tagRegexp = new RegExp();
tagRegexp.compile(''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti='');
// Do not allow text to match this
var rejectRegexp = new RegExp();
rejectRegexp.compile(''\[ [0-9]+ \]'');
var links = doc.getElementsByTagName("a");
for(var i=0; i<links.length; i++) {
if(tagRegexp.test(links[i].href)) {
var text = utilities.getNodeString(doc, links[i], ''.//text()'', nsResolver);
if(text) {
text = utilities.cleanString(text);
if(!rejectRegexp.test(text)) {
if(availableItems[links[i].href]) {
availableItems[links[i].href] += " "+text;
} else {
availableItems[links[i].href] = text;
}
}
}
}
}
if(availableItems) {
return availableItems;
} else {
return false;
}
}
var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options;
'var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options;
for(i in export_options) {
if(export_options[i].text == ''Latin1 MARC''
|| export_options[i].text == ''Raw MARC''
@ -233,6 +192,14 @@ var prefixDC = ''http://purl.org/dc/elements/1.1/'';
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) {
var items = utilities.getItemArray(doc, doc, ''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti='', ''\[ [0-9]+ \]'');
var items = utilities.selectItems(items);
if(!items) {
return true;
}
}
var uri = doc.location.href;
var raw, unicode, latin1;