- Make the event listeners that listened for DOMContentLoaded listen for load instead, because DOMContentLoaded does not seem ready for prime time (hey, it's undocumented, what can you expect)

- Make Amazon scraper work with multiple documents
- Fix bugs in processDocuments
- Make Scholar.Ingester.Utilities.getItemArray() accept either a single DOM node or an array of DOM nodes to search for links, finally taking advantage of the fact that a single node object has no length property (so it can be told apart from an array and wrapped in one)
This commit is contained in:
Simon Kornblith 2006-06-23 03:02:30 +00:00
parent b4d65420f3
commit 098078627c
3 changed files with 145 additions and 82 deletions

View file

@ -41,8 +41,8 @@ Scholar_Ingester_Interface.chromeLoad = function() {
// this gives us onLocationChange // this gives us onLocationChange
Scholar_Ingester_Interface.tabBrowser.addProgressListener(Scholar_Ingester_Interface.Listener, Scholar_Ingester_Interface.tabBrowser.addProgressListener(Scholar_Ingester_Interface.Listener,
Components.interfaces.nsIWebProgress.NOTIFY_LOCATION); Components.interfaces.nsIWebProgress.NOTIFY_LOCATION);
// this gives us DOMContentLoaded // let's use load instead of DOMContentLoaded
Scholar_Ingester_Interface.appContent.addEventListener("DOMContentLoaded", Scholar_Ingester_Interface.appContent.addEventListener("load",
Scholar_Ingester_Interface.contentLoad, true); Scholar_Ingester_Interface.contentLoad, true);
} }

View file

@ -131,11 +131,13 @@ Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succe
// also logged in the Firefox Scholar log) // also logged in the Firefox Scholar log)
Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) { Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(this.window); var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(this.window);
var myWindow = this.window;
var prevUrl, url;
Scholar.debug("processDocuments called"); Scholar.debug("processDocuments called");
try { try {
if (urls.length == 0) { if (urls.length == 0) {
if (firstDoc) { if(firstDoc) {
processor(firstDoc, done); processor(firstDoc, done);
} else { } else {
done(); done();
@ -148,7 +150,7 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD
urlIndex++; urlIndex++;
if (urlIndex < urls.length) { if (urlIndex < urls.length) {
try { try {
var url = urls[urlIndex]; url = urls[urlIndex];
Scholar.debug("loading "+url); Scholar.debug("loading "+url);
hiddenBrowser.loadURI(url); hiddenBrowser.loadURI(url);
} catch (e) { } catch (e) {
@ -156,23 +158,26 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD
exception(e); exception(e);
} }
} else { } else {
hiddenBrowser.removeEventListener("load", onLoad, true);
Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser); Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser);
hiddenBrowser.setTimeout(done, 10); done();
} }
}; };
var onLoad = function() { var onLoad = function() {
Scholar.debug("onLoad called"); Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded");
hiddenBrowser.removeEventListener("load", onLoad, true); if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times
try { prevUrl = hiddenBrowser.contentDocument.location.href;
var newHiddenBrowser = new Object(); try {
newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument; var newHiddenBrowser = new Object();
newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow; newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
processor(newHiddenBrowser); newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
} catch (e) { processor(newHiddenBrowser);
Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2); } catch (e) {
exception(e); Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
exception(e);
}
doLoad();
} }
doLoad();
}; };
var init = function() { var init = function() {
Scholar.debug("init called"); Scholar.debug("init called");
@ -332,23 +337,33 @@ Scholar.Ingester.Utilities.prototype.getItemArray = function(doc, inHere, urlRe,
var availableItems = new Object(); // Technically, associative arrays are objects var availableItems = new Object(); // Technically, associative arrays are objects
// Require link to match this // Require link to match this
var tagRegexp = new RegExp(); if(urlRe) {
tagRegexp.compile(urlRe); var urlRegexp = new RegExp();
urlRegexp.compile(urlRe);
}
// Do not allow text to match this // Do not allow text to match this
var rejectRegexp = new RegExp(); if(rejectRe) {
rejectRegexp.compile(rejectRe); var rejectRegexp = new RegExp();
rejectRegexp.compile(rejectRe);
}
var links = inHere.getElementsByTagName("a"); if(!inHere.length) {
for(var i=0; i<links.length; i++) { inHere = new Array(inHere);
if(tagRegexp.test(links[i].href)) { }
var text = this.getNodeString(doc, links[i], './/text()', null);
if(text) { for(var j=0; j<inHere.length; j++) {
text = this.cleanString(text); var links = inHere[j].getElementsByTagName("a");
if(!rejectRegexp.test(text)) { for(var i=0; i<links.length; i++) {
if(availableItems[links[i].href]) { if(!urlRe || urlRegexp.test(links[i].href)) {
availableItems[links[i].href] += " "+text; var text = this.getNodeString(doc, links[i], './/text()', null);
} else { if(text) {
availableItems[links[i].href] = text; text = this.cleanString(text);
if(!rejectRe || !rejectRegexp.test(text)) {
if(availableItems[links[i].href]) {
availableItems[links[i].href] += " "+text;
} else {
availableItems[links[i].href] = text;
}
} }
} }
} }
@ -822,7 +837,16 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
if(this.model.data[uri][prefixDC + 'year']) { if(this.model.data[uri][prefixDC + 'year']) {
newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]); newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]);
} else if(this.model.data[uri][prefixDC + 'date'] && this.model.data[uri][prefixDC + 'date'][0].length >= 4) { } else if(this.model.data[uri][prefixDC + 'date'] && this.model.data[uri][prefixDC + 'date'][0].length >= 4) {
newItem.setField("year", this.model.data[uri][prefixDC + 'date'][0].substr(0, 4)); var ISORe = /^[0-9]{4}-[0-9]{2}-[0-9]{2}$/
if(ISORe.test(this.model.data[uri][prefixDC + 'date'][0])) {
newItem.setField("year", this.model.data[uri][prefixDC + 'date'][0].substr(0, 4));
} else {
var m;
var yearRe = /[0-9]{4}$/;
if(m = yearRe.exec(this.model.data[uri][prefixDC + 'date'][0])) {
newItem.setField("year", m[0]);
}
}
} }
} }

View file

@ -1,9 +1,9 @@
-- 10 -- 11
-- Set the following timestamp to the most recent scraper update date -- Set the following timestamp to the most recent scraper update date
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-22 16:51:00')); REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-22 22:58:00'));
REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-18 10:15:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/gp/product/', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDC = ''http://purl.org/dc/elements/1.1/'';
var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
@ -13,59 +13,98 @@ var nsResolver = namespace ? function(prefix) {
if (prefix == ''x'') return namespace; else return null; if (prefix == ''x'') return namespace; else return null;
} : null; } : null;
var uri = doc.location.href; function scrape(doc) {
uri = doc.location.href;
// Retrieve authors
var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a'';
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
for (var i = 0; i < elmts.length; i++) {
var elmt = elmts[i];
model.addStatement(uri, prefixDC + ''creator'', utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue), false); // Use your own type here // Retrieve authors
} var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a'';
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
// Retrieve data from "Product Details" box for (var i = 0; i < elmts.length; i++) {
var xpath = ''/html/body/table/tbody/tr/td[2]/table/tbody/tr/td[@class="bucket"]/div[@class="content"]/ul/li''; var elmt = elmts[i];
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
for (var i = 0; i < elmts.length; i++) { model.addStatement(uri, prefixDC + ''creator'', utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue), false); // Use your own type here
var elmt = elmts[i]; }
var attribute = utilities.cleanString(utilities.getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue);
if(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver)) { // Retrieve data from "Product Details" box
var value = utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue); var xpath = ''/html/body/table/tbody/tr/td[2]/table/tbody/tr/td[@class="bucket"]/div[@class="content"]/ul/li'';
if(attribute == "Publisher:") { var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
if(value.lastIndexOf("(") != -1) { for (var i = 0; i < elmts.length; i++) {
var jsDate = value.substring(value.lastIndexOf("(")+1, value.length-1); var elmt = elmts[i];
jsDate = new Date(jsDate); var attribute = utilities.cleanString(utilities.getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue);
var date = utilities.dateToISO(jsDate); if(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver)) {
var value = utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue);
value = value.substring(0, value.lastIndexOf("(")-1); if(attribute == "Publisher:") {
if(value.lastIndexOf("(") != -1) {
var date = value.substring(value.lastIndexOf("(")+1, value.length-1);
jsDate = new Date(date);
if(!isNaN(jsDate.valueOf())) {
date = utilities.dateToISO(jsDate);
}
value = value.substring(0, value.lastIndexOf("(")-1);
}
if(value.lastIndexOf(";") != -1) {
var edition = value.substring(value.lastIndexOf(";")+2, value.length);
value = value.substring(0, value.lastIndexOf(";"));
}
model.addStatement(uri, prefixDC + ''publisher'', value);
model.addStatement(uri, prefixDC + ''date'', date);
model.addStatement(uri, prefixDC + ''hasVersion'', edition);
} else if(attribute == "Language:") {
model.addStatement(uri, prefixDC + ''language'', value);
} else if(attribute == "ISBN:") {
model.addStatement(uri, prefixDC + ''identifier'', ''ISBN ''+value);
} else if(value.substring(value.indexOf(" ")+1, value.length) == "pages") {
model.addStatement(uri, prefixDummy + ''pages'', value.substring(0, value.indexOf(" ")));
model.addStatement(uri, prefixDC + ''medium'', attribute.substring(0, attribute.indexOf(":")));
} }
if(value.lastIndexOf(";") != -1) {
var edition = value.substring(value.lastIndexOf(";")+2, value.length);
value = value.substring(0, value.lastIndexOf(";"));
}
model.addStatement(uri, prefixDC + ''publisher'', value);
model.addStatement(uri, prefixDC + ''date'', date);
model.addStatement(uri, prefixDC + ''hasVersion'', edition);
} else if(attribute == "Language:") {
model.addStatement(uri, prefixDC + ''language'', value);
} else if(attribute == "ISBN:") {
model.addStatement(uri, prefixDC + ''identifier'', ''ISBN ''+value);
} else if(value.substring(value.indexOf(" ")+1, value.length) == "pages") {
model.addStatement(uri, prefixDummy + ''pages'', value.substring(0, value.indexOf(" ")));
model.addStatement(uri, prefixDC + ''medium'', attribute.substring(0, attribute.indexOf(":")));
} }
} }
var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]'';
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
var title = utilities.cleanString(utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue);
if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) {
title = title.substring(0, title.lastIndexOf("(")-1);
}
model.addStatement(uri, prefixDC + ''title'', title);
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
} }
var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]''; var searchRe = new RegExp(''http://www\.amazon\.com/(gp/search/|exec/obidos/search-handle-url/)'');
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); var m = searchRe.exec(doc.location.href)
var title = utilities.cleanString(utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue); if(m) {
if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) { // Why can''t amazon use standard stylesheets
title = title.substring(0, title.lastIndexOf("(")-1); var xpath;
} if(m == "gp/search/") {
model.addStatement(uri, prefixDC + ''title'', title); xpath = ''//table[@class="searchresults"]'';
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);'); } else {
xpath = ''//table[@cellpadding="3"]'';
}
var searchresults = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
var items = utilities.getItemArray(doc, searchresults, ''http://www\.amazon\.com/(gp/product/|exec/obidos/tg/detail/)'', ''^(Buy new|Hardcover|Paperback|Digital)$'');
items = utilities.selectItems(items);
if(!items) {
return true;
}
var uris = new Array();
for(i in items) {
uris.push(i);
}
utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) },
function() {
utilities.debugPrint("look, done");
done();
}, function() {});
wait();
} else {
scrape(doc);
}');
REPLACE INTO "scrapers" VALUES('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-18 11:02:00', 'WorldCat Scraper', 'Simon Kornblith', '^http://newfirstsearch\.oclc\.org/WebZ/', REPLACE INTO "scrapers" VALUES('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-18 11:02:00', 'WorldCat Scraper', 'Simon Kornblith', '^http://newfirstsearch\.oclc\.org/WebZ/',
'if(doc.title == ''FirstSearch: WorldCat Detailed Record'') { 'if(doc.title == ''FirstSearch: WorldCat Detailed Record'') {