closes #187, make Berkeley's library work
closes #186, stop translators from hanging when a document loads inside a frameset: we now check whether we can scrape each individual frame. All functions involving tabs have been vastly simplified because, in the process of figuring this out, I discovered Firefox 2's new tab events. If a translator throws an exception inside loadDocument(), doGet(), doPost(), or processDocuments(), a translate error message will appear and the translator will not hang.
This commit is contained in:
parent
009a4ad520
commit
51108446e3
4 changed files with 262 additions and 170 deletions
|
@ -23,7 +23,6 @@ var Scholar_Ingester_Interface = function() {}
|
|||
* loading
|
||||
*/
|
||||
Scholar_Ingester_Interface.init = function() {
|
||||
Scholar_Ingester_Interface.browsers = new Array();
|
||||
Scholar_Ingester_Interface.browserData = new Object();
|
||||
Scholar_Ingester_Interface._scrapePopupShowing = false;
|
||||
Scholar.Ingester.ProxyMonitor.init();
|
||||
|
@ -42,8 +41,10 @@ Scholar_Ingester_Interface.chromeLoad = function() {
|
|||
Scholar_Ingester_Interface.statusImage = document.getElementById("scholar-status-image");
|
||||
|
||||
// this gives us onLocationChange, for updating when tabs are switched/created
|
||||
Scholar_Ingester_Interface.tabBrowser.addProgressListener(Scholar_Ingester_Interface.Listener,
|
||||
Components.interfaces.nsIWebProgress.NOTIFY_LOCATION);
|
||||
Scholar_Ingester_Interface.tabBrowser.addEventListener("TabClose",
|
||||
Scholar_Ingester_Interface.tabClose, false);
|
||||
Scholar_Ingester_Interface.tabBrowser.addEventListener("TabSelect",
|
||||
Scholar_Ingester_Interface.tabSelect, false);
|
||||
// this is for pageshow, for updating the status of the book icon
|
||||
Scholar_Ingester_Interface.appContent.addEventListener("pageshow",
|
||||
Scholar_Ingester_Interface.contentLoad, true);
|
||||
|
@ -53,8 +54,7 @@ Scholar_Ingester_Interface.chromeLoad = function() {
|
|||
* When chrome unloads, delete our document objects and remove our listeners
|
||||
*/
|
||||
Scholar_Ingester_Interface.chromeUnload = function() {
|
||||
delete Scholar_Ingester_Interface.browserData, Scholar_Ingester_Interface.browsers;
|
||||
this.tabBrowser.removeProgressListener(this);
|
||||
delete Scholar_Ingester_Interface.browserData;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -77,7 +77,7 @@ Scholar_Ingester_Interface.scrapeThisPage = function(saveLocation) {
|
|||
}
|
||||
|
||||
var translate = new Scholar.Translate("web");
|
||||
translate.setBrowser(browser);
|
||||
translate.setDocument(data.document);
|
||||
// use first translator available
|
||||
translate.setTranslator(data.translators[0]);
|
||||
translate.setHandler("select", Scholar_Ingester_Interface._selectItems);
|
||||
|
@ -90,86 +90,69 @@ Scholar_Ingester_Interface.scrapeThisPage = function(saveLocation) {
|
|||
/*
|
||||
* An event handler called when a new document is loaded. Creates a new document
|
||||
* object, and updates the status of the capture icon
|
||||
|
||||
*/
|
||||
Scholar_Ingester_Interface.contentLoad = function(event) {
|
||||
if (event.originalTarget instanceof HTMLDocument) {
|
||||
// Stolen off the Mozilla extension developer's website, a routine to
|
||||
// determine the root document loaded from a frameset
|
||||
if (event.originalTarget.defaultView.frameElement) {
|
||||
var doc = event.originalTarget;
|
||||
while (doc.defaultView.frameElement) {
|
||||
doc=doc.defaultView.frameElement.ownerDocument;
|
||||
}
|
||||
// Frame within a tab was loaded. doc is the root document of the frameset
|
||||
} else {
|
||||
var doc = event.originalTarget;
|
||||
// Page was loaded. doc is the document that loaded.
|
||||
if(event.originalTarget instanceof HTMLDocument) {
|
||||
var doc = event.originalTarget;
|
||||
var rootDoc = doc;
|
||||
|
||||
// get the appropriate root document to check which browser we're on
|
||||
Scholar.debug("getting root document");
|
||||
while(rootDoc.defaultView.frameElement) {
|
||||
rootDoc = rootDoc.defaultView.frameElement.ownerDocument;
|
||||
}
|
||||
|
||||
// Figure out what browser this contentDocument is associated with
|
||||
var browser;
|
||||
Scholar.debug("getting browser");
|
||||
for(var i=0; i<Scholar_Ingester_Interface.tabBrowser.browsers.length; i++) {
|
||||
if(doc == Scholar_Ingester_Interface.tabBrowser.browsers[i].contentDocument) {
|
||||
if(rootDoc == Scholar_Ingester_Interface.tabBrowser.browsers[i].contentDocument) {
|
||||
browser = Scholar_Ingester_Interface.tabBrowser.browsers[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(!browser) {
|
||||
Scholar.debug("Could not find browser!");
|
||||
return;
|
||||
}
|
||||
|
||||
Scholar.debug("getting data");
|
||||
// get data object
|
||||
var data = Scholar_Ingester_Interface._getData(browser);
|
||||
|
||||
// if there's already a scrapable page in the browser window, and it's
|
||||
// still there, return
|
||||
if(data.translators && data.translators.length && data.document.location) {
|
||||
return;
|
||||
}
|
||||
|
||||
Scholar.debug("translating");
|
||||
// get translators
|
||||
var translate = new Scholar.Translate("web");
|
||||
translate.setBrowser(browser);
|
||||
translate.setDocument(doc);
|
||||
data.translators = translate.getTranslators();
|
||||
// update status
|
||||
Scholar_Ingester_Interface._updateStatus(data);
|
||||
// add document
|
||||
if(data.translators && data.translators.length) {
|
||||
data.document = doc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Dummy event handlers for all the events we don't care about
|
||||
* called when a tab is closed
|
||||
*/
|
||||
Scholar_Ingester_Interface.Listener = function() {}
|
||||
Scholar_Ingester_Interface.Listener.onStatusChange = function() {}
|
||||
Scholar_Ingester_Interface.Listener.onSecurityChange = function() {}
|
||||
Scholar_Ingester_Interface.Listener.onProgressChange = function() {}
|
||||
Scholar_Ingester_Interface.Listener.onStateChange = function() {}
|
||||
Scholar_Ingester_Interface.tabClose = function(event) {
|
||||
// To execute if document object does not exist
|
||||
Scholar_Ingester_Interface._deleteData(event.target.linkedBrowser);
|
||||
}
|
||||
|
||||
/*
|
||||
* onLocationChange is called when tabs are switched. Use it to retrieve the
|
||||
* appropriate status indicator for the current tab, and to free useless objects
|
||||
* called when a tab is switched
|
||||
*/
|
||||
Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject) {
|
||||
var browsers = Scholar_Ingester_Interface.tabBrowser.browsers;
|
||||
|
||||
// Remove document object of any browser that no longer exists
|
||||
for (var i = 0; i < Scholar_Ingester_Interface.browsers.length; i++) {
|
||||
var browser = Scholar_Ingester_Interface.browsers[i];
|
||||
var exists = false;
|
||||
|
||||
for (var j = 0; j < browsers.length; j++) {
|
||||
if (browser == browsers[j]) {
|
||||
exists = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!exists) {
|
||||
Scholar_Ingester_Interface.browsers.splice(i,1);
|
||||
|
||||
// To execute if document object does not exist
|
||||
Scholar_Ingester_Interface._deleteDocument(browser);
|
||||
}
|
||||
}
|
||||
|
||||
Scholar_Ingester_Interface.tabSelect = function(event) {
|
||||
var data = Scholar_Ingester_Interface._getData(Scholar_Ingester_Interface.tabBrowser.selectedBrowser);
|
||||
Scholar_Ingester_Interface._updateStatus(data);
|
||||
|
||||
// Make sure scrape progress is gone
|
||||
Scholar_Ingester_Interface.Progress.kill();
|
||||
}
|
||||
|
|
|
@ -29,8 +29,8 @@
|
|||
* PUBLIC PROPERTIES:
|
||||
*
|
||||
* type - the text type of translator (set by constructor, should be read only)
|
||||
* browser - the browser object to be used for web scraping (read-only; set
|
||||
* with setBrowser)
|
||||
* document - the document object to be used for web scraping (read-only; set
|
||||
* with setDocument)
|
||||
* translator - the translator currently in use (read-only; set with
|
||||
* setTranslator)
|
||||
* location - the location of the target (read-only; set with setLocation)
|
||||
|
@ -115,9 +115,9 @@ Scholar.Translate = function(type, saveItem) {
|
|||
/*
|
||||
* sets the document to be used for web translation; also sets the location
|
||||
*/
|
||||
Scholar.Translate.prototype.setBrowser = function(browser) {
|
||||
this.browser = browser;
|
||||
this.setLocation(browser.contentDocument.location.href);
|
||||
Scholar.Translate.prototype.setDocument = function(doc) {
|
||||
this.document = doc;
|
||||
this.setLocation(doc.location.href);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -428,7 +428,7 @@ Scholar.Translate.prototype._generateSandbox = function() {
|
|||
var sandboxURL = "";
|
||||
if(this.type == "web") {
|
||||
// use real URL, not proxied version, to create sandbox
|
||||
sandboxURL = this.browser.contentDocument.location.href;
|
||||
sandboxURL = this.document.location.href;
|
||||
} else {
|
||||
// generate sandbox for search by extracting domain from translator
|
||||
// target, if one exists
|
||||
|
@ -446,8 +446,8 @@ Scholar.Translate.prototype._generateSandbox = function() {
|
|||
this._sandbox.Scholar = new Object();
|
||||
|
||||
// add ingester utilities
|
||||
this._sandbox.Scholar.Utilities = new Scholar.Utilities.Ingester(this.locationIsProxied);
|
||||
this._sandbox.Scholar.Utilities.HTTP = new Scholar.Utilities.Ingester.HTTP(this.locationIsProxied);
|
||||
this._sandbox.Scholar.Utilities = new Scholar.Utilities.Ingester(this);
|
||||
this._sandbox.Scholar.Utilities.HTTP = new Scholar.Utilities.Ingester.HTTP(this);
|
||||
|
||||
// set up selectItems handler
|
||||
this._sandbox.Scholar.selectItems = function(options) { return me._selectItems(options) };
|
||||
|
@ -584,7 +584,7 @@ Scholar.Translate.prototype._canTranslate = function(translator, ignoreExtension
|
|||
|
||||
try {
|
||||
if(this.type == "web") {
|
||||
returnValue = this._sandbox.detectWeb(this.browser.contentDocument, this.location);
|
||||
returnValue = this._sandbox.detectWeb(this.document, this.location);
|
||||
} else if(this.type == "search") {
|
||||
returnValue = this._sandbox.detectSearch(this.search);
|
||||
} else if(this.type == "import") {
|
||||
|
@ -954,7 +954,7 @@ Scholar.Translate.prototype._runHandler = function(type, argument) {
|
|||
*/
|
||||
Scholar.Translate.prototype._web = function() {
|
||||
try {
|
||||
this._sandbox.doWeb(this.browser.contentDocument, this.location);
|
||||
this._sandbox.doWeb(this.document, this.location);
|
||||
} catch(e) {
|
||||
Scholar.debug(e+' in executing code for '+this.translator[0].label);
|
||||
return false;
|
||||
|
|
|
@ -164,8 +164,8 @@ Scholar.Utilities.prototype.itemTypeExists = function(type) {
|
|||
// Scholar.Utilities.Ingester extends Scholar.Utilities, offering additional
|
||||
// classes relating to data extraction specifically from HTML documents.
|
||||
|
||||
Scholar.Utilities.Ingester = function(proxiedURL) {
|
||||
this.proxiedURL = proxiedURL;
|
||||
Scholar.Utilities.Ingester = function(translate, proxiedURL) {
|
||||
this.translate = translate;
|
||||
}
|
||||
|
||||
Scholar.Utilities.Ingester.prototype = new Scholar.Utilities();
|
||||
|
@ -252,43 +252,62 @@ Scholar.Utilities.Ingester.prototype.parseContextObject = function(co, item) {
|
|||
// Ingester adapters for Scholar.Utilities.HTTP to handle proxies
|
||||
|
||||
Scholar.Utilities.Ingester.prototype.loadDocument = function(url, succeeded, failed) {
|
||||
if(this.proxiedURL) {
|
||||
url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
|
||||
}
|
||||
Scholar.Utilities.HTTP.processDocuments(null, [ url ], succeeded, function() {}, failed);
|
||||
this.processDocuments([ url ], succeeded, null, failed);
|
||||
}
|
||||
Scholar.Utilities.Ingester.prototype.processDocuments = function(urls, processor, done, exception) {
|
||||
if(this.proxiedURL) {
|
||||
if(this.translate.locationIsProxied) {
|
||||
for(i in urls) {
|
||||
urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// unless the translator has proposed some way to handle an error, handle it
|
||||
// by throwing a "scraping error" message
|
||||
if(!exception) {
|
||||
var translate = this.translate;
|
||||
exception = function(e) {
|
||||
Scholar.debug("an error occurred in code called by processDocuments: "+e);
|
||||
translate._translationComplete(false);
|
||||
}
|
||||
}
|
||||
|
||||
Scholar.Utilities.HTTP.processDocuments(null, urls, processor, done, exception);
|
||||
}
|
||||
|
||||
Scholar.Utilities.Ingester.HTTP = function(proxiedURL) {
|
||||
this.proxiedURL = proxiedURL;
|
||||
Scholar.Utilities.Ingester.HTTP = function(translate) {
|
||||
this.translate = translate;
|
||||
}
|
||||
|
||||
Scholar.Utilities.Ingester.HTTP.prototype.doGet = function(url, onDone) {
|
||||
if(this.proxiedURL) {
|
||||
if(this.translate.locationIsProxied) {
|
||||
url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
|
||||
}
|
||||
Scholar.Utilities.HTTP.doGet(url, function(xmlhttp) { onDone(xmlhttp.responseText, xmlhttp) })
|
||||
|
||||
var translate = this.translate;
|
||||
Scholar.Utilities.HTTP.doGet(url, function(xmlhttp) {
|
||||
try {
|
||||
onDone(xmlhttp.responseText, xmlhttp);
|
||||
} catch(e) {
|
||||
Scholar.debug("an error occurred in code called by doGet: "+e);
|
||||
translate._translationComplete(false);
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
Scholar.Utilities.Ingester.HTTP.prototype.doPost = function(url, body, onDone) {
|
||||
if(this.proxiedURL) {
|
||||
if(this.translate.locationIsProxied) {
|
||||
url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
|
||||
}
|
||||
Scholar.Utilities.HTTP.doPost(url, body, function(xmlhttp) { onDone(xmlhttp.responseText, xmlhttp) })
|
||||
}
|
||||
|
||||
Scholar.Utilities.Ingester.HTTP.prototype.doOptions = function(url, onDone) {
|
||||
if(this.proxiedURL) {
|
||||
url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
|
||||
}
|
||||
Scholar.Utilities.HTTP.doOptions(url, function(xmlhttp) { onDone(xmlhttp.responseText, xmlhttp) })
|
||||
|
||||
var translate = this.translate;
|
||||
Scholar.Utilities.HTTP.doPost(url, body, function(xmlhttp) {
|
||||
try {
|
||||
onDone(xmlhttp.responseText, xmlhttp);
|
||||
} catch(e) {
|
||||
Scholar.debug("an error occurred in code called by doPost: "+e);
|
||||
translate._translationComplete(false);
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// These are front ends for XMLHttpRequest. XMLHttpRequest can't actually be
|
||||
|
@ -310,7 +329,7 @@ Scholar.Utilities.HTTP = new function() {
|
|||
* doGet can be called as:
|
||||
* Scholar.Utilities.HTTP.doGet(url, onDone)
|
||||
**/
|
||||
function doGet(url, onDone) {
|
||||
function doGet(url, onDone, onError) {
|
||||
Scholar.debug("HTTP GET "+url);
|
||||
if (this.browserIsOffline()){
|
||||
return false;
|
||||
|
@ -429,17 +448,14 @@ Scholar.Utilities.HTTP = new function() {
|
|||
|
||||
// Download complete
|
||||
case 4:
|
||||
try {
|
||||
if (onDone){
|
||||
onDone(xmlhttp);
|
||||
}
|
||||
}
|
||||
catch (e){
|
||||
Scholar.debug(e, 2);
|
||||
if(onDone){
|
||||
onDone(xmlhttp);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
// Downloads and processes documents with processor()
|
||||
|
@ -455,63 +471,71 @@ Scholar.Utilities.HTTP = new function() {
|
|||
Scholar.Utilities.HTTP.processDocuments = function(firstDoc, urls, processor, done, exception, saveBrowser) {
|
||||
var hiddenBrowser = Scholar.Browser.createHiddenBrowser();
|
||||
var prevUrl, url;
|
||||
|
||||
try {
|
||||
if (urls.length == 0) {
|
||||
if(firstDoc) {
|
||||
processor(firstDoc, done);
|
||||
} else {
|
||||
done();
|
||||
}
|
||||
return;
|
||||
|
||||
if (urls.length == 0) {
|
||||
if(firstDoc) {
|
||||
processor(firstDoc, done);
|
||||
} else {
|
||||
done();
|
||||
}
|
||||
|
||||
var urlIndex = -1;
|
||||
var doLoad = function() {
|
||||
urlIndex++;
|
||||
if (urlIndex < urls.length) {
|
||||
url = urls[urlIndex];
|
||||
try {
|
||||
Scholar.debug("loading "+url);
|
||||
hiddenBrowser.loadURI(url);
|
||||
} catch (e) {
|
||||
Scholar.debug("Scholar.Utilities.Ingester.processDocuments doLoad: " + e, 2);
|
||||
exception(e);
|
||||
}
|
||||
} else {
|
||||
hiddenBrowser.removeEventListener("load", onLoad, true);
|
||||
if(!saveBrowser) {
|
||||
Scholar.Browser.deleteHiddenBrowser(hiddenBrowser);
|
||||
}
|
||||
done();
|
||||
}
|
||||
};
|
||||
var onLoad = function() {
|
||||
Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded");
|
||||
if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times
|
||||
prevUrl = hiddenBrowser.contentDocument.location.href;
|
||||
try {
|
||||
processor(hiddenBrowser.contentDocument);
|
||||
} catch (e) {
|
||||
Scholar.debug("Scholar.Utilities.Ingester.processDocuments onLoad: " + e, 2);
|
||||
exception(e);
|
||||
}
|
||||
doLoad();
|
||||
}
|
||||
};
|
||||
var init = function() {
|
||||
hiddenBrowser.addEventListener("load", onLoad, true);
|
||||
|
||||
if (firstDoc) {
|
||||
processor(firstDoc, doLoad);
|
||||
} else {
|
||||
doLoad();
|
||||
}
|
||||
}
|
||||
|
||||
init();
|
||||
} catch (e) {
|
||||
Scholar.debug("processDocuments: " + e);
|
||||
exception(e);
|
||||
return;
|
||||
}
|
||||
var urlIndex = -1;
|
||||
|
||||
var removeListeners = function() {
|
||||
hiddenBrowser.removeEventListener("load", onLoad, true);
|
||||
if(!saveBrowser) {
|
||||
Scholar.Browser.deleteHiddenBrowser(hiddenBrowser);
|
||||
}
|
||||
}
|
||||
var doLoad = function() {
|
||||
urlIndex++;
|
||||
if (urlIndex < urls.length) {
|
||||
url = urls[urlIndex];
|
||||
try {
|
||||
Scholar.debug("loading "+url);
|
||||
hiddenBrowser.loadURI(url);
|
||||
} catch (e) {
|
||||
removeListeners();
|
||||
if(exception) {
|
||||
exception(e);
|
||||
return;
|
||||
} else {
|
||||
throw(e);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
removeListeners();
|
||||
done();
|
||||
}
|
||||
};
|
||||
var onLoad = function() {
|
||||
Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded");
|
||||
if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times
|
||||
prevUrl = hiddenBrowser.contentDocument.location.href;
|
||||
try {
|
||||
processor(hiddenBrowser.contentDocument);
|
||||
} catch (e) {
|
||||
removeListeners();
|
||||
if(exception) {
|
||||
exception(e);
|
||||
return;
|
||||
} else {
|
||||
throw(e);
|
||||
}
|
||||
}
|
||||
doLoad();
|
||||
}
|
||||
};
|
||||
var init = function() {
|
||||
hiddenBrowser.addEventListener("load", onLoad, true);
|
||||
|
||||
if (firstDoc) {
|
||||
processor(firstDoc, doLoad);
|
||||
} else {
|
||||
doLoad();
|
||||
}
|
||||
}
|
||||
|
||||
init();
|
||||
}
|
127
scrapers.sql
127
scrapers.sql
|
@ -1,7 +1,7 @@
|
|||
-- 48
|
||||
-- 49
|
||||
|
||||
-- Set the following timestamp to the most recent scraper update date
|
||||
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-11 11:18:00'));
|
||||
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-15 15:42:00'));
|
||||
|
||||
REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-08-11 11:18:00', 4, 'Amazon.com', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/|s/)',
|
||||
'function detectWeb(doc, url) {
|
||||
|
@ -112,7 +112,7 @@ function doWeb(doc, url) {
|
|||
}
|
||||
|
||||
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
|
||||
function() { Scholar.done(); }, function() {});
|
||||
function() { Scholar.done(); }, null);
|
||||
|
||||
Scholar.wait();
|
||||
} else {
|
||||
|
@ -646,7 +646,7 @@ function doWeb(doc, url) {
|
|||
}
|
||||
|
||||
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
|
||||
function() { Scholar.done(); }, function() {});
|
||||
function() { Scholar.done(); }, null);
|
||||
|
||||
Scholar.wait();
|
||||
} else {
|
||||
|
@ -763,7 +763,7 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006
|
|||
newItem.complete();
|
||||
|
||||
Scholar.done();
|
||||
}, function() {});
|
||||
}, null);
|
||||
} else { // Search results page
|
||||
// Require link to match this
|
||||
var tagRegexp = new RegExp();
|
||||
|
@ -952,7 +952,7 @@ function doWeb(doc, url) {
|
|||
}
|
||||
|
||||
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
|
||||
function() { Scholar.done() }, function() {});
|
||||
function() { Scholar.done() }, null);
|
||||
|
||||
Scholar.wait();
|
||||
}
|
||||
|
@ -1127,7 +1127,7 @@ function doWeb(doc, url) {
|
|||
}
|
||||
|
||||
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
|
||||
function() { Scholar.done(); }, function() {});
|
||||
function() { Scholar.done(); }, null);
|
||||
|
||||
Scholar.wait();
|
||||
} else {
|
||||
|
@ -1136,7 +1136,7 @@ function doWeb(doc, url) {
|
|||
if(m && (m[1] == "1" || m[1] == "2")) {
|
||||
scrape(doc);
|
||||
} else if(m) {
|
||||
Scholar.Utilities.loadDocument(doc.location.href.replace("Fmt="+m[1], "Fmt=1"), function(doc) { scrape(doc); Scholar.done(); }, function() {});
|
||||
Scholar.Utilities.loadDocument(doc.location.href.replace("Fmt="+m[1], "Fmt=1"), function(doc) { scrape(doc); Scholar.done(); }, null);
|
||||
Scholar.wait();
|
||||
}
|
||||
}
|
||||
|
@ -1366,7 +1366,7 @@ function doWeb(doc, url) {
|
|||
}
|
||||
|
||||
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
|
||||
function() { Scholar.done(); }, function() {});
|
||||
function() { Scholar.done(); }, null);
|
||||
|
||||
Scholar.wait();
|
||||
}
|
||||
|
@ -1457,7 +1457,7 @@ REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006
|
|||
newItem.source = uri;
|
||||
record.translate(newItem);
|
||||
newItem.complete();
|
||||
}, function() { Scholar.done(); }, function() {});
|
||||
}, function() { Scholar.done(); }, null);
|
||||
|
||||
Scholar.wait();
|
||||
}');
|
||||
|
@ -1544,7 +1544,7 @@ REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006
|
|||
newItem.source = uri;
|
||||
record.translate(newItem);
|
||||
newItem.complete();
|
||||
}, function() { Scholar.done() }, function() {});
|
||||
}, function() { Scholar.done() }, null);
|
||||
|
||||
Scholar.wait();
|
||||
}');
|
||||
|
@ -1647,7 +1647,7 @@ REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006
|
|||
newItem.source = uri;
|
||||
record.translate(newItem);
|
||||
newItem.complete();
|
||||
}, function(){ Scholar.done(); }, function() {});
|
||||
}, function(){ Scholar.done(); }, null);
|
||||
|
||||
Scholar.wait();
|
||||
}');
|
||||
|
@ -1721,8 +1721,7 @@ REPLACE INTO "translators" VALUES ('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006
|
|||
Scholar.wait();
|
||||
}');
|
||||
|
||||
|
||||
REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 4, 'GEAC', 'Simon Kornblith', '/(?:GeacQUERY|(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))',
|
||||
REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 4, 'GEAC', 'Simon Kornblith', '/(?:GeacQUERY|GeacFETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))',
|
||||
'function detectWeb(doc, url) {
|
||||
if(doc.location.href.indexOf("/GeacQUERY") > 0) {
|
||||
return "multiple";
|
||||
|
@ -1804,7 +1803,7 @@ REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006
|
|||
newItem.source = uri;
|
||||
record.translate(newItem);
|
||||
newItem.complete();
|
||||
}, function() { Scholar.done(); }, function() {});
|
||||
}, function() { Scholar.done(); }, null);
|
||||
|
||||
Scholar.wait();
|
||||
}');
|
||||
|
@ -2037,7 +2036,7 @@ REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006
|
|||
newItem.source = uri;
|
||||
record.translate(newItem);
|
||||
newItem.complete();
|
||||
}, function() {Scholar.done(); }, function() {});
|
||||
}, function() {Scholar.done(); }, null);
|
||||
|
||||
Scholar.wait();
|
||||
}');
|
||||
|
@ -2568,7 +2567,79 @@ REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006
|
|||
}
|
||||
}
|
||||
newItem.complete();
|
||||
}, function() { Scholar.done(); }, function() {});
|
||||
}, function() { Scholar.done(); }, null);
|
||||
|
||||
Scholar.wait();
|
||||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('9c335444-a562-4f88-b291-607e8f46a9bb', '2006-08-15 15:42:00', 4, 'Berkeley Library', 'Simon Kornblith', '^http://[^/]*berkeley.edu[^/]*/WebZ/(?:html/results.html|FETCH)\?.*sessionid=',
|
||||
'function detectWeb(doc, url) {
|
||||
var resultsRegexp = /\/WebZ\/html\/results.html/i
|
||||
if(resultsRegexp.test(url)) {
|
||||
return "multiple";
|
||||
} else {
|
||||
return "book";
|
||||
}
|
||||
}',
|
||||
'function reformURL(url) {
|
||||
return url.replace(/fmtclass=[^&]*/, "")+":fmtclass=marc";
|
||||
}
|
||||
|
||||
function doWeb(doc, url) {
|
||||
var resultsRegexp = /\/WebZ\/html\/results.html/i
|
||||
|
||||
if(resultsRegexp.test(url)) {
|
||||
var items = Scholar.Utilities.getItemArray(doc, doc, "/WebZ/FETCH", "^[0-9]*$");
|
||||
items = Scholar.selectItems(items);
|
||||
|
||||
if(!items) {
|
||||
return true;
|
||||
}
|
||||
|
||||
var urls = new Array();
|
||||
for(var i in items) {
|
||||
urls.push(reformURL(i));
|
||||
}
|
||||
} else {
|
||||
var urls = [reformURL(url)];
|
||||
}
|
||||
|
||||
var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973");
|
||||
|
||||
Scholar.Utilities.processDocuments(urls, function(newDoc) {
|
||||
Scholar.Utilities.debug(newDoc.getElementsByTagName("body")[0].innerHTML);
|
||||
var uri = newDoc.location.href;
|
||||
|
||||
var namespace = newDoc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
var elmts = newDoc.evaluate(''//table/tbody/tr[@valign="top"]'',
|
||||
newDoc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||
|
||||
var record = new marc.MARC_Record();
|
||||
while(elmt = elmts.iterateNext()) {
|
||||
var field = Scholar.Utilities.superCleanString(doc.evaluate(''./TD[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue);
|
||||
var value = doc.evaluate(''./TD[2]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
|
||||
var ind1 = value[4];
|
||||
var ind2 = value[6];
|
||||
value = Scholar.Utilities.cleanString(value.substr(6)).
|
||||
replace(/\$([a-z0-9]) /g, record.subfield_delimiter+"$1");
|
||||
if(value[0] != record.subfield_delimiter) {
|
||||
value = record.subfield_delimiter+"a"+value;
|
||||
}
|
||||
|
||||
if(field != 0) {
|
||||
record.add_field(field, ind1, ind2, value);
|
||||
}
|
||||
}
|
||||
|
||||
var newItem = new Scholar.Item();
|
||||
newItem.source = uri;
|
||||
record.translate(newItem);
|
||||
newItem.complete();
|
||||
}, function() { Scholar.done(); }, null);
|
||||
|
||||
Scholar.wait();
|
||||
}');
|
||||
|
@ -2644,9 +2715,7 @@ function doSearch(item) {
|
|||
Scholar.done(false);
|
||||
});
|
||||
}
|
||||
}, function() {
|
||||
error();
|
||||
});
|
||||
}, null);
|
||||
|
||||
Scholar.wait();
|
||||
}');
|
||||
|
@ -4604,7 +4673,16 @@ MARC_Record.prototype.get_field_subfields = function(tag) { // returns a two-dim
|
|||
}
|
||||
|
||||
MARC_Record.prototype.add_field = function(tag,ind1,ind2,value) { // adds a field to the record
|
||||
if (tag.length != 3) { return false; }
|
||||
/*if(tag.length != 3) {
|
||||
return false;
|
||||
}*/
|
||||
|
||||
if (tag.length < 3) {
|
||||
tag = Scholar.Utilities.lpad(tag.toString(),"0",3);
|
||||
} else if(tag.length > 3) {
|
||||
return false;
|
||||
}
|
||||
|
||||
var F = new this.MARC_field(this,tag,ind1,ind2,value);
|
||||
// adds pointer to list of fields
|
||||
this.variable_fields[this.variable_fields.length] = F;
|
||||
|
@ -4666,9 +4744,11 @@ MARC_Record.prototype._clean = function(value) {
|
|||
}
|
||||
|
||||
MARC_Record.prototype._associateDBField = function(item, fieldNo, part, fieldName, execMe, arg1, arg2) {
|
||||
|
||||
if(!part) {
|
||||
part = ''a'';
|
||||
}
|
||||
|
||||
var field = this.get_field_subfields(fieldNo);
|
||||
Scholar.Utilities.debug(''Found ''+field.length+'' matches for ''+fieldNo+part);
|
||||
if(field) {
|
||||
|
@ -4685,6 +4765,7 @@ MARC_Record.prototype._associateDBField = function(item, fieldNo, part, fieldNam
|
|||
}
|
||||
}
|
||||
if(value) {
|
||||
this._gotField = true;
|
||||
value = this._clean(value);
|
||||
|
||||
if(execMe) {
|
||||
|
@ -4807,6 +4888,10 @@ MARC_Record.prototype.translate = function(item) {
|
|||
|
||||
// Set type
|
||||
item.itemType = "book";
|
||||
|
||||
if(!this._gotField) {
|
||||
throw("tried to create a marc record with no fields!");
|
||||
}
|
||||
}
|
||||
|
||||
MARC_Record.prototype._trim = function(s) { // eliminates blanks from both sides
|
||||
|
|
Loading…
Reference in a new issue