- Make event listeners that were registered for DOMContentLoaded listen for load instead, because DOMContentLoaded does not seem ready for prime time (hey, it's undocumented, what can you expect?)
- Make Amazon scraper work with multiple documents
- Fix bugs in processDocuments
- Make Scholar.Ingester.Utilities.getItemArray() accept an array of DOM nodes to search for links, and finally take advantage of the fact that objects have no length
This commit is contained in:
parent
b4d65420f3
commit
098078627c
3 changed files with 145 additions and 82 deletions
|
@ -41,8 +41,8 @@ Scholar_Ingester_Interface.chromeLoad = function() {
|
|||
// this gives us onLocationChange
|
||||
Scholar_Ingester_Interface.tabBrowser.addProgressListener(Scholar_Ingester_Interface.Listener,
|
||||
Components.interfaces.nsIWebProgress.NOTIFY_LOCATION);
|
||||
// this gives us DOMContentLoaded
|
||||
Scholar_Ingester_Interface.appContent.addEventListener("DOMContentLoaded",
|
||||
// let's use load instead of DOMContentLoaded
|
||||
Scholar_Ingester_Interface.appContent.addEventListener("load",
|
||||
Scholar_Ingester_Interface.contentLoad, true);
|
||||
}
|
||||
|
||||
|
|
|
@ -131,6 +131,8 @@ Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succe
|
|||
// also logged in the Firefox Scholar log)
|
||||
Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
|
||||
var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(this.window);
|
||||
var myWindow = this.window;
|
||||
var prevUrl, url;
|
||||
Scholar.debug("processDocuments called");
|
||||
|
||||
try {
|
||||
|
@ -148,7 +150,7 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD
|
|||
urlIndex++;
|
||||
if (urlIndex < urls.length) {
|
||||
try {
|
||||
var url = urls[urlIndex];
|
||||
url = urls[urlIndex];
|
||||
Scholar.debug("loading "+url);
|
||||
hiddenBrowser.loadURI(url);
|
||||
} catch (e) {
|
||||
|
@ -156,13 +158,15 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD
|
|||
exception(e);
|
||||
}
|
||||
} else {
|
||||
hiddenBrowser.removeEventListener("load", onLoad, true);
|
||||
Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser);
|
||||
hiddenBrowser.setTimeout(done, 10);
|
||||
done();
|
||||
}
|
||||
};
|
||||
var onLoad = function() {
|
||||
Scholar.debug("onLoad called");
|
||||
hiddenBrowser.removeEventListener("load", onLoad, true);
|
||||
Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded");
|
||||
if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times
|
||||
prevUrl = hiddenBrowser.contentDocument.location.href;
|
||||
try {
|
||||
var newHiddenBrowser = new Object();
|
||||
newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
|
||||
|
@ -173,6 +177,7 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD
|
|||
exception(e);
|
||||
}
|
||||
doLoad();
|
||||
}
|
||||
};
|
||||
var init = function() {
|
||||
Scholar.debug("init called");
|
||||
|
@ -332,19 +337,28 @@ Scholar.Ingester.Utilities.prototype.getItemArray = function(doc, inHere, urlRe,
|
|||
var availableItems = new Object(); // Technically, associative arrays are objects
|
||||
|
||||
// Require link to match this
|
||||
var tagRegexp = new RegExp();
|
||||
tagRegexp.compile(urlRe);
|
||||
if(urlRe) {
|
||||
var urlRegexp = new RegExp();
|
||||
urlRegexp.compile(urlRe);
|
||||
}
|
||||
// Do not allow text to match this
|
||||
if(rejectRe) {
|
||||
var rejectRegexp = new RegExp();
|
||||
rejectRegexp.compile(rejectRe);
|
||||
}
|
||||
|
||||
var links = inHere.getElementsByTagName("a");
|
||||
if(!inHere.length) {
|
||||
inHere = new Array(inHere);
|
||||
}
|
||||
|
||||
for(var j=0; j<inHere.length; j++) {
|
||||
var links = inHere[j].getElementsByTagName("a");
|
||||
for(var i=0; i<links.length; i++) {
|
||||
if(tagRegexp.test(links[i].href)) {
|
||||
if(!urlRe || urlRegexp.test(links[i].href)) {
|
||||
var text = this.getNodeString(doc, links[i], './/text()', null);
|
||||
if(text) {
|
||||
text = this.cleanString(text);
|
||||
if(!rejectRegexp.test(text)) {
|
||||
if(!rejectRe || !rejectRegexp.test(text)) {
|
||||
if(availableItems[links[i].href]) {
|
||||
availableItems[links[i].href] += " "+text;
|
||||
} else {
|
||||
|
@ -354,6 +368,7 @@ Scholar.Ingester.Utilities.prototype.getItemArray = function(doc, inHere, urlRe,
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return availableItems;
|
||||
}
|
||||
|
@ -822,7 +837,16 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
|
|||
if(this.model.data[uri][prefixDC + 'year']) {
|
||||
newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]);
|
||||
} else if(this.model.data[uri][prefixDC + 'date'] && this.model.data[uri][prefixDC + 'date'][0].length >= 4) {
|
||||
var ISORe = /^[0-9]{4}-[0-9]{2}-[0-9]{2}$/
|
||||
if(ISORe.test(this.model.data[uri][prefixDC + 'date'][0])) {
|
||||
newItem.setField("year", this.model.data[uri][prefixDC + 'date'][0].substr(0, 4));
|
||||
} else {
|
||||
var m;
|
||||
var yearRe = /[0-9]{4}$/;
|
||||
if(m = yearRe.exec(this.model.data[uri][prefixDC + 'date'][0])) {
|
||||
newItem.setField("year", m[0]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
55
scrapers.sql
55
scrapers.sql
|
@ -1,9 +1,9 @@
|
|||
-- 10
|
||||
-- 11
|
||||
|
||||
-- Set the following timestamp to the most recent scraper update date
|
||||
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-22 16:51:00'));
|
||||
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-22 22:58:00'));
|
||||
|
||||
REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-18 10:15:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/gp/product/', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
||||
REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
||||
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
||||
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
||||
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
||||
|
@ -13,7 +13,8 @@ var nsResolver = namespace ? function(prefix) {
|
|||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
var uri = doc.location.href;
|
||||
function scrape(doc) {
|
||||
uri = doc.location.href;
|
||||
|
||||
// Retrieve authors
|
||||
var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a'';
|
||||
|
@ -34,9 +35,11 @@ for (var i = 0; i < elmts.length; i++) {
|
|||
var value = utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue);
|
||||
if(attribute == "Publisher:") {
|
||||
if(value.lastIndexOf("(") != -1) {
|
||||
var jsDate = value.substring(value.lastIndexOf("(")+1, value.length-1);
|
||||
jsDate = new Date(jsDate);
|
||||
var date = utilities.dateToISO(jsDate);
|
||||
var date = value.substring(value.lastIndexOf("(")+1, value.length-1);
|
||||
jsDate = new Date(date);
|
||||
if(!isNaN(jsDate.valueOf())) {
|
||||
date = utilities.dateToISO(jsDate);
|
||||
}
|
||||
|
||||
value = value.substring(0, value.lastIndexOf("(")-1);
|
||||
}
|
||||
|
@ -65,7 +68,43 @@ if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) {
|
|||
title = title.substring(0, title.lastIndexOf("(")-1);
|
||||
}
|
||||
model.addStatement(uri, prefixDC + ''title'', title);
|
||||
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);');
|
||||
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
|
||||
}
|
||||
|
||||
var searchRe = new RegExp(''http://www\.amazon\.com/(gp/search/|exec/obidos/search-handle-url/)'');
|
||||
var m = searchRe.exec(doc.location.href)
|
||||
if(m) {
|
||||
// Why can''t amazon use standard stylesheets
|
||||
var xpath;
|
||||
if(m == "gp/search/") {
|
||||
xpath = ''//table[@class="searchresults"]'';
|
||||
} else {
|
||||
xpath = ''//table[@cellpadding="3"]'';
|
||||
}
|
||||
|
||||
var searchresults = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||
var items = utilities.getItemArray(doc, searchresults, ''http://www\.amazon\.com/(gp/product/|exec/obidos/tg/detail/)'', ''^(Buy new|Hardcover|Paperback|Digital)$'');
|
||||
items = utilities.selectItems(items);
|
||||
|
||||
if(!items) {
|
||||
return true;
|
||||
}
|
||||
|
||||
var uris = new Array();
|
||||
for(i in items) {
|
||||
uris.push(i);
|
||||
}
|
||||
|
||||
utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) },
|
||||
function() {
|
||||
utilities.debugPrint("look, done");
|
||||
done();
|
||||
}, function() {});
|
||||
|
||||
wait();
|
||||
} else {
|
||||
scrape(doc);
|
||||
}');
|
||||
|
||||
REPLACE INTO "scrapers" VALUES('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-18 11:02:00', 'WorldCat Scraper', 'Simon Kornblith', '^http://newfirstsearch\.oclc\.org/WebZ/',
|
||||
'if(doc.title == ''FirstSearch: WorldCat Detailed Record'') {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue