- Make the event listeners that waited for DOMContentLoaded listen for load instead, because DOMContentLoaded does not seem ready for prime time (hey, it's undocumented, what can you expect).
- Make Amazon scraper work with multiple documents
- Fix bugs in processDocuments
- Make Scholar.Ingester.Utilities.getItemArray() willing to take an array of DOM nodes to search for links, and finally take advantage of the fact that a single node, unlike an array, has no length
This commit is contained in:
parent
b4d65420f3
commit
098078627c
3 changed files with 145 additions and 82 deletions
|
@ -41,8 +41,8 @@ Scholar_Ingester_Interface.chromeLoad = function() {
|
|||
// this gives us onLocationChange
|
||||
Scholar_Ingester_Interface.tabBrowser.addProgressListener(Scholar_Ingester_Interface.Listener,
|
||||
Components.interfaces.nsIWebProgress.NOTIFY_LOCATION);
|
||||
// this gives us DOMContentLoaded
|
||||
Scholar_Ingester_Interface.appContent.addEventListener("DOMContentLoaded",
|
||||
// let's use load instead of DOMContentLoaded
|
||||
Scholar_Ingester_Interface.appContent.addEventListener("load",
|
||||
Scholar_Ingester_Interface.contentLoad, true);
|
||||
}
|
||||
|
||||
|
|
|
@ -131,11 +131,13 @@ Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succe
|
|||
// also logged in the Firefox Scholar log)
|
||||
Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
|
||||
var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(this.window);
|
||||
var myWindow = this.window;
|
||||
var prevUrl, url;
|
||||
Scholar.debug("processDocuments called");
|
||||
|
||||
try {
|
||||
if (urls.length == 0) {
|
||||
if (firstDoc) {
|
||||
if(firstDoc) {
|
||||
processor(firstDoc, done);
|
||||
} else {
|
||||
done();
|
||||
|
@ -148,7 +150,7 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD
|
|||
urlIndex++;
|
||||
if (urlIndex < urls.length) {
|
||||
try {
|
||||
var url = urls[urlIndex];
|
||||
url = urls[urlIndex];
|
||||
Scholar.debug("loading "+url);
|
||||
hiddenBrowser.loadURI(url);
|
||||
} catch (e) {
|
||||
|
@ -156,23 +158,26 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD
|
|||
exception(e);
|
||||
}
|
||||
} else {
|
||||
hiddenBrowser.removeEventListener("load", onLoad, true);
|
||||
Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser);
|
||||
hiddenBrowser.setTimeout(done, 10);
|
||||
done();
|
||||
}
|
||||
};
|
||||
var onLoad = function() {
|
||||
Scholar.debug("onLoad called");
|
||||
hiddenBrowser.removeEventListener("load", onLoad, true);
|
||||
try {
|
||||
var newHiddenBrowser = new Object();
|
||||
newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
|
||||
newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
|
||||
processor(newHiddenBrowser);
|
||||
} catch (e) {
|
||||
Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
|
||||
exception(e);
|
||||
Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded");
|
||||
if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times
|
||||
prevUrl = hiddenBrowser.contentDocument.location.href;
|
||||
try {
|
||||
var newHiddenBrowser = new Object();
|
||||
newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
|
||||
newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
|
||||
processor(newHiddenBrowser);
|
||||
} catch (e) {
|
||||
Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
|
||||
exception(e);
|
||||
}
|
||||
doLoad();
|
||||
}
|
||||
doLoad();
|
||||
};
|
||||
var init = function() {
|
||||
Scholar.debug("init called");
|
||||
|
@ -332,23 +337,33 @@ Scholar.Ingester.Utilities.prototype.getItemArray = function(doc, inHere, urlRe,
|
|||
var availableItems = new Object(); // Technically, associative arrays are objects
|
||||
|
||||
// Require link to match this
|
||||
var tagRegexp = new RegExp();
|
||||
tagRegexp.compile(urlRe);
|
||||
if(urlRe) {
|
||||
var urlRegexp = new RegExp();
|
||||
urlRegexp.compile(urlRe);
|
||||
}
|
||||
// Do not allow text to match this
|
||||
var rejectRegexp = new RegExp();
|
||||
rejectRegexp.compile(rejectRe);
|
||||
if(rejectRe) {
|
||||
var rejectRegexp = new RegExp();
|
||||
rejectRegexp.compile(rejectRe);
|
||||
}
|
||||
|
||||
var links = inHere.getElementsByTagName("a");
|
||||
for(var i=0; i<links.length; i++) {
|
||||
if(tagRegexp.test(links[i].href)) {
|
||||
var text = this.getNodeString(doc, links[i], './/text()', null);
|
||||
if(text) {
|
||||
text = this.cleanString(text);
|
||||
if(!rejectRegexp.test(text)) {
|
||||
if(availableItems[links[i].href]) {
|
||||
availableItems[links[i].href] += " "+text;
|
||||
} else {
|
||||
availableItems[links[i].href] = text;
|
||||
if(!inHere.length) {
|
||||
inHere = new Array(inHere);
|
||||
}
|
||||
|
||||
for(var j=0; j<inHere.length; j++) {
|
||||
var links = inHere[j].getElementsByTagName("a");
|
||||
for(var i=0; i<links.length; i++) {
|
||||
if(!urlRe || urlRegexp.test(links[i].href)) {
|
||||
var text = this.getNodeString(doc, links[i], './/text()', null);
|
||||
if(text) {
|
||||
text = this.cleanString(text);
|
||||
if(!rejectRe || !rejectRegexp.test(text)) {
|
||||
if(availableItems[links[i].href]) {
|
||||
availableItems[links[i].href] += " "+text;
|
||||
} else {
|
||||
availableItems[links[i].href] = text;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -822,7 +837,16 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
|
|||
if(this.model.data[uri][prefixDC + 'year']) {
|
||||
newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]);
|
||||
} else if(this.model.data[uri][prefixDC + 'date'] && this.model.data[uri][prefixDC + 'date'][0].length >= 4) {
|
||||
newItem.setField("year", this.model.data[uri][prefixDC + 'date'][0].substr(0, 4));
|
||||
var ISORe = /^[0-9]{4}-[0-9]{2}-[0-9]{2}$/
|
||||
if(ISORe.test(this.model.data[uri][prefixDC + 'date'][0])) {
|
||||
newItem.setField("year", this.model.data[uri][prefixDC + 'date'][0].substr(0, 4));
|
||||
} else {
|
||||
var m;
|
||||
var yearRe = /[0-9]{4}$/;
|
||||
if(m = yearRe.exec(this.model.data[uri][prefixDC + 'date'][0])) {
|
||||
newItem.setField("year", m[0]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
139
scrapers.sql
139
scrapers.sql
|
@ -1,9 +1,9 @@
|
|||
-- 10
|
||||
-- 11
|
||||
|
||||
-- Set the following timestamp to the most recent scraper update date
|
||||
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-22 16:51:00'));
|
||||
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-22 22:58:00'));
|
||||
|
||||
REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-18 10:15:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/gp/product/', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
||||
REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
||||
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
||||
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
||||
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
||||
|
@ -13,59 +13,98 @@ var nsResolver = namespace ? function(prefix) {
|
|||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
var uri = doc.location.href;
|
||||
|
||||
// Retrieve authors
|
||||
var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a'';
|
||||
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||
for (var i = 0; i < elmts.length; i++) {
|
||||
var elmt = elmts[i];
|
||||
function scrape(doc) {
|
||||
uri = doc.location.href;
|
||||
|
||||
model.addStatement(uri, prefixDC + ''creator'', utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue), false); // Use your own type here
|
||||
}
|
||||
|
||||
// Retrieve data from "Product Details" box
|
||||
var xpath = ''/html/body/table/tbody/tr/td[2]/table/tbody/tr/td[@class="bucket"]/div[@class="content"]/ul/li'';
|
||||
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||
for (var i = 0; i < elmts.length; i++) {
|
||||
var elmt = elmts[i];
|
||||
var attribute = utilities.cleanString(utilities.getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue);
|
||||
if(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver)) {
|
||||
var value = utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue);
|
||||
if(attribute == "Publisher:") {
|
||||
if(value.lastIndexOf("(") != -1) {
|
||||
var jsDate = value.substring(value.lastIndexOf("(")+1, value.length-1);
|
||||
jsDate = new Date(jsDate);
|
||||
var date = utilities.dateToISO(jsDate);
|
||||
|
||||
value = value.substring(0, value.lastIndexOf("(")-1);
|
||||
// Retrieve authors
|
||||
var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a'';
|
||||
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||
for (var i = 0; i < elmts.length; i++) {
|
||||
var elmt = elmts[i];
|
||||
|
||||
model.addStatement(uri, prefixDC + ''creator'', utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue), false); // Use your own type here
|
||||
}
|
||||
|
||||
// Retrieve data from "Product Details" box
|
||||
var xpath = ''/html/body/table/tbody/tr/td[2]/table/tbody/tr/td[@class="bucket"]/div[@class="content"]/ul/li'';
|
||||
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||
for (var i = 0; i < elmts.length; i++) {
|
||||
var elmt = elmts[i];
|
||||
var attribute = utilities.cleanString(utilities.getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue);
|
||||
if(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver)) {
|
||||
var value = utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue);
|
||||
if(attribute == "Publisher:") {
|
||||
if(value.lastIndexOf("(") != -1) {
|
||||
var date = value.substring(value.lastIndexOf("(")+1, value.length-1);
|
||||
jsDate = new Date(date);
|
||||
if(!isNaN(jsDate.valueOf())) {
|
||||
date = utilities.dateToISO(jsDate);
|
||||
}
|
||||
|
||||
value = value.substring(0, value.lastIndexOf("(")-1);
|
||||
}
|
||||
if(value.lastIndexOf(";") != -1) {
|
||||
var edition = value.substring(value.lastIndexOf(";")+2, value.length);
|
||||
value = value.substring(0, value.lastIndexOf(";"));
|
||||
}
|
||||
model.addStatement(uri, prefixDC + ''publisher'', value);
|
||||
model.addStatement(uri, prefixDC + ''date'', date);
|
||||
model.addStatement(uri, prefixDC + ''hasVersion'', edition);
|
||||
} else if(attribute == "Language:") {
|
||||
model.addStatement(uri, prefixDC + ''language'', value);
|
||||
} else if(attribute == "ISBN:") {
|
||||
model.addStatement(uri, prefixDC + ''identifier'', ''ISBN ''+value);
|
||||
} else if(value.substring(value.indexOf(" ")+1, value.length) == "pages") {
|
||||
model.addStatement(uri, prefixDummy + ''pages'', value.substring(0, value.indexOf(" ")));
|
||||
model.addStatement(uri, prefixDC + ''medium'', attribute.substring(0, attribute.indexOf(":")));
|
||||
}
|
||||
if(value.lastIndexOf(";") != -1) {
|
||||
var edition = value.substring(value.lastIndexOf(";")+2, value.length);
|
||||
value = value.substring(0, value.lastIndexOf(";"));
|
||||
}
|
||||
model.addStatement(uri, prefixDC + ''publisher'', value);
|
||||
model.addStatement(uri, prefixDC + ''date'', date);
|
||||
model.addStatement(uri, prefixDC + ''hasVersion'', edition);
|
||||
} else if(attribute == "Language:") {
|
||||
model.addStatement(uri, prefixDC + ''language'', value);
|
||||
} else if(attribute == "ISBN:") {
|
||||
model.addStatement(uri, prefixDC + ''identifier'', ''ISBN ''+value);
|
||||
} else if(value.substring(value.indexOf(" ")+1, value.length) == "pages") {
|
||||
model.addStatement(uri, prefixDummy + ''pages'', value.substring(0, value.indexOf(" ")));
|
||||
model.addStatement(uri, prefixDC + ''medium'', attribute.substring(0, attribute.indexOf(":")));
|
||||
}
|
||||
}
|
||||
|
||||
var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]'';
|
||||
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||
var title = utilities.cleanString(utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue);
|
||||
if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) {
|
||||
title = title.substring(0, title.lastIndexOf("(")-1);
|
||||
}
|
||||
model.addStatement(uri, prefixDC + ''title'', title);
|
||||
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
|
||||
}
|
||||
|
||||
var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]'';
|
||||
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||
var title = utilities.cleanString(utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue);
|
||||
if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) {
|
||||
title = title.substring(0, title.lastIndexOf("(")-1);
|
||||
}
|
||||
model.addStatement(uri, prefixDC + ''title'', title);
|
||||
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);');
|
||||
var searchRe = new RegExp(''http://www\.amazon\.com/(gp/search/|exec/obidos/search-handle-url/)'');
|
||||
var m = searchRe.exec(doc.location.href)
|
||||
if(m) {
|
||||
// Why can''t amazon use standard stylesheets
|
||||
var xpath;
|
||||
if(m == "gp/search/") {
|
||||
xpath = ''//table[@class="searchresults"]'';
|
||||
} else {
|
||||
xpath = ''//table[@cellpadding="3"]'';
|
||||
}
|
||||
|
||||
var searchresults = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||
var items = utilities.getItemArray(doc, searchresults, ''http://www\.amazon\.com/(gp/product/|exec/obidos/tg/detail/)'', ''^(Buy new|Hardcover|Paperback|Digital)$'');
|
||||
items = utilities.selectItems(items);
|
||||
|
||||
if(!items) {
|
||||
return true;
|
||||
}
|
||||
|
||||
var uris = new Array();
|
||||
for(i in items) {
|
||||
uris.push(i);
|
||||
}
|
||||
|
||||
utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) },
|
||||
function() {
|
||||
utilities.debugPrint("look, done");
|
||||
done();
|
||||
}, function() {});
|
||||
|
||||
wait();
|
||||
} else {
|
||||
scrape(doc);
|
||||
}');
|
||||
|
||||
REPLACE INTO "scrapers" VALUES('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-18 11:02:00', 'WorldCat Scraper', 'Simon Kornblith', '^http://newfirstsearch\.oclc\.org/WebZ/',
|
||||
'if(doc.title == ''FirstSearch: WorldCat Detailed Record'') {
|
||||
|
|
Loading…
Add table
Reference in a new issue