- Made ingester automatically create hidden browser objects, given a window object. This should make things much easier for both David and me.
- Multiple item detection code is now a part of the scraperJavaScript, rather than the scrapeDetectCode, and code to choose which items to add is part of Scholar.Ingester.Utilities, accessible from inside scrapers. The alternative approach would result in one request (or, in the case of JSTOR, three requests) per new item, while in some cases (e.g. Voyager) only one request is necessary to get all of the items.
This commit is contained in:
parent
726364d091
commit
3890e5f122
5 changed files with 109 additions and 95 deletions
|
@ -35,7 +35,6 @@ Scholar_Ingester_Interface.init = function() {
|
|||
*/
|
||||
Scholar_Ingester_Interface.chromeLoad = function() {
|
||||
Scholar_Ingester_Interface.tabBrowser = document.getElementById("content");
|
||||
Scholar_Ingester_Interface.hiddenBrowser = document.getElementById("scholar-hidden-browser");
|
||||
Scholar_Ingester_Interface.appContent = document.getElementById("appcontent");
|
||||
Scholar_Ingester_Interface.statusImage = document.getElementById("scholar-status-image");
|
||||
|
||||
|
@ -61,21 +60,11 @@ Scholar_Ingester_Interface.chromeUnload = function() {
|
|||
Scholar_Ingester_Interface.scrapeThisPage = function() {
	// Look up the document object kept for the currently selected tab
	var selectedBrowser = Scholar_Ingester_Interface.tabBrowser.selectedBrowser;
	var documentObject = Scholar_Ingester_Interface._getDocument(selectedBrowser);
	
	// Without a scraper there is nothing to do
	if(!documentObject.scraper) {
		return;
	}
	
	if(documentObject.scrapeURLList) {
		// Several scrapable URLs are available, so the user has to choose
		Scholar_Ingester_Interface.chooseURL(documentObject);
	}
	
	// Create the Progress object for the selected tab's document, then start
	// scraping; _finishScraping is handed to scrapePage as its callback
	var progressDocument = Scholar_Ingester_Interface.tabBrowser.selectedBrowser.contentDocument;
	Scholar_Ingester_Interface.scrapeProgress = new Scholar_Ingester_Interface.Progress(window,
		progressDocument, Scholar.getString("ingester.scraping"));
	documentObject.scrapePage(Scholar_Ingester_Interface._finishScraping);
}
|
||||
|
||||
/*
 * Opens the modal item selection dialog, passing it the given document object
 */
Scholar_Ingester_Interface.chooseURL = function(documentObject) {
	Scholar.debug("chooseURL called");
	
	var dialogURL = "chrome://scholar/content/ingester/selectitems.xul";
	var dialogFeatures = "chrome,modal,centerscreen,resizable=yes";
	window.openDialog(dialogURL, "_blank", dialogFeatures, documentObject);
}
|
||||
|
||||
/*
|
||||
* Updates the status of the capture icon to reflect the scrapability or lack
|
||||
* thereof of the current page
|
||||
|
@ -182,7 +171,7 @@ Scholar_Ingester_Interface._setDocument = function(browser) {
|
|||
browser.setAttribute("scholar-key", key);
|
||||
}
|
||||
}
|
||||
Scholar_Ingester_Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser, Scholar_Ingester_Interface.hiddenBrowser);
|
||||
Scholar_Ingester_Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser, window);
|
||||
Scholar_Ingester_Interface.browserDocuments[key].retrieveScraper();
|
||||
}
|
||||
|
||||
|
@ -203,7 +192,7 @@ Scholar_Ingester_Interface._deleteDocument = function(browser) {
|
|||
/*
|
||||
* Callback to be executed when scraping is complete
|
||||
*/
|
||||
Scholar_Ingester_Interface._finishScraping = function(obj) {
|
||||
Scholar_Ingester_Interface._finishScraping = function(obj, returnValue) {
|
||||
if(obj.items.length) {
|
||||
try { // Encased in a try block to fix a as-of-yet unresolved issue
|
||||
var item1 = obj.items[0];
|
||||
|
@ -243,12 +232,14 @@ Scholar_Ingester_Interface._finishScraping = function(obj) {
|
|||
for(i in obj.items) {
|
||||
obj.items[i].save();
|
||||
}
|
||||
setTimeout(function() { Scholar_Ingester_Interface.scrapeProgress.fade() }, 2000);
|
||||
} else if(returnValue) {
|
||||
Scholar_Ingester_Interface.scrapeProgress.kill();
|
||||
} else {
|
||||
Scholar_Ingester_Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeError"));
|
||||
Scholar_Ingester_Interface.scrapeProgress.addDescription(Scholar.getString("ingester.scrapeErrorDescription"));
|
||||
setTimeout(function() { Scholar_Ingester_Interface.scrapeProgress.fade() }, 2000);
|
||||
}
|
||||
|
||||
setTimeout(function() { Scholar_Ingester_Interface.scrapeProgress.fade() }, 2000);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -333,7 +324,6 @@ Scholar_Ingester_Interface.Progress.prototype.addDescription = function(descript
|
|||
this.table.appendChild(tr);
|
||||
}
|
||||
|
||||
|
||||
Scholar_Ingester_Interface.Progress.prototype.fade = function() {
|
||||
// Icky, icky hack to keep objects
|
||||
var me = this;
|
||||
|
@ -349,3 +339,8 @@ Scholar_Ingester_Interface.Progress.prototype.fade = function() {
|
|||
// Begin fade
|
||||
this._fader();
|
||||
}
|
||||
|
||||
/*
 * Hides this object's div immediately, without fading it out
 */
Scholar_Ingester_Interface.Progress.prototype.kill = function() {
	var divStyle = this.div.style;
	divStyle.display = 'none';
}
|
||||
|
||||
|
|
|
@ -12,10 +12,4 @@
|
|||
<hbox id="urlbar-icons">
|
||||
<image src="chrome://scholar/skin/treeitem-book.png" id="scholar-status-image" onclick="Scholar_Ingester_Interface.scrapeThisPage()" position="1" hidden="true"/>
|
||||
</hbox>
|
||||
|
||||
<window id="main-window">
|
||||
<box style="visibility: collapse">
|
||||
<browser id="scholar-hidden-browser" />
|
||||
</box>
|
||||
</window>
|
||||
</overlay>
|
||||
|
|
|
@ -19,26 +19,26 @@ Scholar_Ingester_Interface_SelectItems = function() {}
|
|||
* loading
|
||||
*/
|
||||
Scholar_Ingester_Interface_SelectItems.init = function() {
|
||||
this.documentObject = window.arguments[0];
|
||||
this.io = window.arguments[0];
|
||||
this.Scholar_Ingester_Interface = window.arguments[1];
|
||||
this.listbox = document.getElementById("scholar-selectitems-links");
|
||||
|
||||
for(i in this.documentObject.scrapeURLList) { // we could use a tree for this if we wanted to
|
||||
for(i in this.io.dataIn) { // we could use a tree for this if we wanted to
|
||||
var itemNode = document.createElement("listitem");
|
||||
itemNode.setAttribute("type", "checkbox");
|
||||
itemNode.setAttribute("value", i);
|
||||
itemNode.setAttribute("label", this.documentObject.scrapeURLList[i]);
|
||||
itemNode.setAttribute("label", this.io.dataIn[i]);
|
||||
itemNode.setAttribute("checked", false);
|
||||
this.listbox.appendChild(itemNode);
|
||||
}
|
||||
}
|
||||
|
||||
Scholar_Ingester_Interface_SelectItems.acceptSelection = function() {
|
||||
// clear scrapeURLList
|
||||
this.documentObject.scrapeURLList = new Object();
|
||||
this.io.dataOut = new Object();
|
||||
|
||||
// collect scrapeURLList from listbox
|
||||
for(var i=0; i<this.listbox.length; i++) {
|
||||
var itemNode = this.listbox[i];
|
||||
this.documentObject.scrapeURLList[itemNode.getAttribute("value")] = itemNode.getAttribute("label");
|
||||
this.io.dataOut[itemNode.getAttribute("value")] = itemNode.getAttribute("label");
|
||||
}
|
||||
}
|
|
@ -4,6 +4,21 @@
|
|||
|
||||
Scholar.Ingester = new function() {}
|
||||
|
||||
/*
 * Creates a new <browser> element, appends it to the first <window> element
 * of myWindow's document, and returns it
 */
Scholar.Ingester.createHiddenBrowser = function(myWindow) {
	var chromeDocument = myWindow.document;
	
	var hiddenBrowser = chromeDocument.createElement("browser");
	var rootWindow = chromeDocument.getElementsByTagName("window")[0];
	rootWindow.appendChild(hiddenBrowser);
	
	Scholar.debug("created hidden browser");
	return hiddenBrowser;
}
|
||||
|
||||
/*
 * Disposes of a hidden browser created by createHiddenBrowser
 *
 * myBrowser - the <browser> element to dispose of
 *
 * The element is detached from its parent node so that it no longer stays in
 * the window's DOM. A bare "delete myBrowser" cannot do this: delete has no
 * effect on a function argument, so the element would remain attached.
 */
Scholar.Ingester.deleteHiddenBrowser = function(myBrowser) {
	// Tolerate a missing or already-detached browser
	if(myBrowser && myBrowser.parentNode) {
		myBrowser.parentNode.removeChild(myBrowser);
	}
	Scholar.debug("deleted hidden browser");
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Scholar.Ingester.Model
|
||||
|
@ -48,8 +63,8 @@ Scholar.Ingester.Model.prototype.detachRepository = function() {}
|
|||
/////////////////////////////////////////////////////////////////
|
||||
// Scholar.Ingester.Utilities class, a set of methods to assist in data
|
||||
// extraction. Most code here was stolen directly from the Piggy Bank project.
|
||||
Scholar.Ingester.Utilities = function(hiddenBrowser) {
|
||||
this._hiddenBrowser = hiddenBrowser;
|
||||
Scholar.Ingester.Utilities = function(myWindow) {
|
||||
this.window = myWindow;
|
||||
}
|
||||
|
||||
// Adapter for Piggy Bank function to print debug messages; log level is
|
||||
|
@ -115,7 +130,7 @@ Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succe
|
|||
// exception - a function to execute if an exception occurs (exceptions are
|
||||
// also logged in the Firefox Scholar log)
|
||||
Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
|
||||
var hiddenBrowser = this._hiddenBrowser;
|
||||
var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(this.window);
|
||||
Scholar.debug("processDocuments called");
|
||||
|
||||
try {
|
||||
|
@ -141,26 +156,23 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD
|
|||
exception(e);
|
||||
}
|
||||
} else {
|
||||
Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser);
|
||||
hiddenBrowser.setTimeout(done, 10);
|
||||
}
|
||||
};
|
||||
var onLoad = function() {
|
||||
Scholar.debug("onLoad called");
|
||||
if(hiddenBrowser.id == "scholar-hidden-browser") {
|
||||
hiddenBrowser.removeEventListener("load", onLoad, true);
|
||||
try {
|
||||
var newHiddenBrowser = new Object();
|
||||
Scholar.debug("new hidden browser");
|
||||
newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
|
||||
newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
|
||||
Scholar.debug("added attributes");
|
||||
processor(newHiddenBrowser);
|
||||
Scholar.debug("called processor");
|
||||
} catch (e) {
|
||||
Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
|
||||
exception(e);
|
||||
}
|
||||
hiddenBrowser.removeEventListener("load", onLoad, true);
|
||||
try {
|
||||
var newHiddenBrowser = new Object();
|
||||
newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
|
||||
newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
|
||||
processor(newHiddenBrowser);
|
||||
} catch (e) {
|
||||
Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
|
||||
exception(e);
|
||||
}
|
||||
doLoad();
|
||||
};
|
||||
var init = function() {
|
||||
Scholar.debug("init called");
|
||||
|
@ -302,6 +314,50 @@ Scholar.Ingester.Utilities.prototype.cleanTags = function(x) {
|
|||
return x.replace(/<[^>]+>/g, "");
|
||||
}
|
||||
|
||||
/*
 * Lets the user pick which items to scrape
 *
 * itemList - the candidate items; handed to the selection dialog as io.dataIn
 *
 * Returns whatever the dialog left in io.dataOut, which is still null if the
 * dialog never assigned it
 */
Scholar.Ingester.Utilities.prototype.selectItems = function(itemList) {
	// The modal dialog receives this object as its argument; the result is
	// read back from dataOut once openDialog returns
	var io = new Object();
	io.dataIn = itemList;
	io.dataOut = null;
	
	this.window.openDialog("chrome://scholar/content/ingester/selectitems.xul",
		"_blank", "chrome,modal,centerscreen,resizable=yes", io);
	
	return io.dataOut;
}
|
||||
|
||||
/*
 * Grabs items based on URLs
 *
 * doc - the document, passed through to getNodeString
 * inHere - the node whose descendant <a> elements are examined
 * urlRe - regular expression that a link's href must match
 * rejectRe - regular expression that the link text must NOT match; when it is
 *            omitted (null, undefined or empty), no link text is rejected
 *
 * Returns an object mapping each accepted href to its cleaned link text. When
 * several accepted links share an href, their texts are joined with a space.
 */
Scholar.Ingester.Utilities.prototype.getItemArray = function(doc, inHere, urlRe, rejectRe) {
	var availableItems = new Object();	// Technically, associative arrays are objects
	
	// Require link to match this. Built with the constructor rather than the
	// deprecated RegExp.prototype.compile.
	var tagRegexp = new RegExp(urlRe);
	// Do not allow text to match this. An omitted pattern would otherwise
	// compile to an empty regexp, which matches (and so rejects) every text.
	var rejectRegexp = rejectRe ? new RegExp(rejectRe) : null;
	
	var links = inHere.getElementsByTagName("a");
	for(var i=0; i<links.length; i++) {
		var href = links[i].href;
		if(!tagRegexp.test(href)) {
			continue;
		}
		
		var text = this.getNodeString(doc, links[i], './/text()', null);
		if(!text) {
			continue;
		}
		
		text = this.cleanString(text);
		if(rejectRegexp && rejectRegexp.test(text)) {
			continue;
		}
		
		if(availableItems[href]) {
			// Another link to the same URL was already accepted; append
			availableItems[href] += " "+text;
		} else {
			availableItems[href] = text;
		}
	}
	
	return availableItems;
}
|
||||
|
||||
// These functions are for use by importMARCRecord. They're private, because,
|
||||
// while they are useful, it's also nice if as many of our scrapers as possible
|
||||
// are PiggyBank compatible, and if our scrapers used functions, that would
|
||||
|
@ -512,14 +568,14 @@ Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatu
|
|||
/*
|
||||
* Constructor for Document object
|
||||
*/
|
||||
Scholar.Ingester.Document = function(browserWindow, hiddenBrowser){
|
||||
Scholar.Ingester.Document = function(browserWindow, myWindow){
|
||||
this.scraper = null;
|
||||
this.browser = browserWindow;
|
||||
this.window = myWindow;
|
||||
this.model = new Scholar.Ingester.Model();
|
||||
this.items = new Array();
|
||||
this._appSvc = Cc["@mozilla.org/appshell/appShellService;1"]
|
||||
.getService(Ci.nsIAppShellService);
|
||||
this._hiddenBrowser = hiddenBrowser;
|
||||
this._generateSandbox();
|
||||
}
|
||||
|
||||
|
@ -596,17 +652,19 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
|
|||
|
||||
var scraperSandbox = this._sandbox;
|
||||
try {
|
||||
Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox);
|
||||
var returnValue = Components.utils.evalInSandbox("(function(){\n" +
|
||||
this.scraper.scraperJavaScript +
|
||||
"\n})()", scraperSandbox);
|
||||
} catch(e) {
|
||||
Scholar.debug(e+' in scraperJavaScript for '+this.scraper.label);
|
||||
this._scrapePageComplete();
|
||||
this._scrapePageComplete(false);
|
||||
return;
|
||||
}
|
||||
|
||||
// If synchronous, call _scrapePageComplete();
|
||||
if(!this._waitForCompletion) {
|
||||
Scholar.debug("is asynch");
|
||||
this._scrapePageComplete();
|
||||
this._scrapePageComplete(returnValue);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -637,10 +695,10 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
|
|||
/*
|
||||
* Called when scraping (synchronous or asynchronous) is complete
|
||||
*/
|
||||
Scholar.Ingester.Document.prototype._scrapePageComplete = function() {
|
||||
Scholar.Ingester.Document.prototype._scrapePageComplete = function(returnValue) {
|
||||
this._updateDatabase();
|
||||
if(this._scrapeCallback) {
|
||||
this._scrapeCallback(this);
|
||||
this._scrapeCallback(this, returnValue);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -651,7 +709,7 @@ Scholar.Ingester.Document.prototype._generateSandbox = function() {
|
|||
this._sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
|
||||
this._sandbox.browser = this.browser;
|
||||
this._sandbox.doc = this._sandbox.browser.contentDocument;
|
||||
this._sandbox.utilities = new Scholar.Ingester.Utilities(this._hiddenBrowser);
|
||||
this._sandbox.utilities = new Scholar.Ingester.Utilities(this.window);
|
||||
this._sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this._appSvc.hiddenDOMWindow);
|
||||
this._sandbox.window = this.window;
|
||||
this._sandbox.model = this.model;
|
||||
|
|
51
scrapers.sql
51
scrapers.sql
|
@ -175,48 +175,7 @@ utilities.HTTPUtilities.doPost(newUri, ''exportselect=record&exporttype=plaintex
|
|||
wait();');
|
||||
|
||||
REPLACE INTO "scrapers" VALUES('88915634-1af6-c134-0171-56fd198235ed', '2006-06-21 22:44:00', 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi',
|
||||
'if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) {
|
||||
// We have search results
|
||||
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
var availableItems = new Object(); // Technically, associative arrays are objects
|
||||
|
||||
// Require link to match this
|
||||
var tagRegexp = new RegExp();
|
||||
tagRegexp.compile(''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti='');
|
||||
// Do not allow text to match this
|
||||
var rejectRegexp = new RegExp();
|
||||
rejectRegexp.compile(''\[ [0-9]+ \]'');
|
||||
|
||||
var links = doc.getElementsByTagName("a");
|
||||
for(var i=0; i<links.length; i++) {
|
||||
if(tagRegexp.test(links[i].href)) {
|
||||
var text = utilities.getNodeString(doc, links[i], ''.//text()'', nsResolver);
|
||||
if(text) {
|
||||
text = utilities.cleanString(text);
|
||||
if(!rejectRegexp.test(text)) {
|
||||
if(availableItems[links[i].href]) {
|
||||
availableItems[links[i].href] += " "+text;
|
||||
} else {
|
||||
availableItems[links[i].href] = text;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(availableItems) {
|
||||
return availableItems;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options;
|
||||
'var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options;
|
||||
for(i in export_options) {
|
||||
if(export_options[i].text == ''Latin1 MARC''
|
||||
|| export_options[i].text == ''Raw MARC''
|
||||
|
@ -233,6 +192,14 @@ var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
|||
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
||||
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
||||
|
||||
if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) {
|
||||
var items = utilities.getItemArray(doc, doc, ''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti='', ''\[ [0-9]+ \]'');
|
||||
var items = utilities.selectItems(items);
|
||||
if(!items) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
var uri = doc.location.href;
|
||||
|
||||
var raw, unicode, latin1;
|
||||
|
|
Loading…
Add table
Reference in a new issue