- Make the event listeners that were listening for DOMContentLoaded listen for load instead, because DOMContentLoaded does not seem ready for prime time (hey, it's undocumented, what can you expect)
- Make Amazon scraper work with multiple documents
- Fix bugs in processDocuments
- Make Scholar.Ingester.Utilities.getItemArray() willing to take an array of DOM nodes to search for links, and finally take advantage of the fact that a single node object has no length property (which is how it is told apart from an array of nodes)
This commit is contained in:
parent
b4d65420f3
commit
098078627c
3 changed files with 145 additions and 82 deletions
|
@ -41,8 +41,8 @@ Scholar_Ingester_Interface.chromeLoad = function() {
|
||||||
// this gives us onLocationChange
|
// this gives us onLocationChange
|
||||||
Scholar_Ingester_Interface.tabBrowser.addProgressListener(Scholar_Ingester_Interface.Listener,
|
Scholar_Ingester_Interface.tabBrowser.addProgressListener(Scholar_Ingester_Interface.Listener,
|
||||||
Components.interfaces.nsIWebProgress.NOTIFY_LOCATION);
|
Components.interfaces.nsIWebProgress.NOTIFY_LOCATION);
|
||||||
// this gives us DOMContentLoaded
|
// let's use load instead of DOMContentLoaded
|
||||||
Scholar_Ingester_Interface.appContent.addEventListener("DOMContentLoaded",
|
Scholar_Ingester_Interface.appContent.addEventListener("load",
|
||||||
Scholar_Ingester_Interface.contentLoad, true);
|
Scholar_Ingester_Interface.contentLoad, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -131,11 +131,13 @@ Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succe
|
||||||
// also logged in the Firefox Scholar log)
|
// also logged in the Firefox Scholar log)
|
||||||
Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
|
Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
|
||||||
var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(this.window);
|
var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(this.window);
|
||||||
|
var myWindow = this.window;
|
||||||
|
var prevUrl, url;
|
||||||
Scholar.debug("processDocuments called");
|
Scholar.debug("processDocuments called");
|
||||||
|
|
||||||
try {
|
try {
|
||||||
if (urls.length == 0) {
|
if (urls.length == 0) {
|
||||||
if (firstDoc) {
|
if(firstDoc) {
|
||||||
processor(firstDoc, done);
|
processor(firstDoc, done);
|
||||||
} else {
|
} else {
|
||||||
done();
|
done();
|
||||||
|
@ -148,7 +150,7 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD
|
||||||
urlIndex++;
|
urlIndex++;
|
||||||
if (urlIndex < urls.length) {
|
if (urlIndex < urls.length) {
|
||||||
try {
|
try {
|
||||||
var url = urls[urlIndex];
|
url = urls[urlIndex];
|
||||||
Scholar.debug("loading "+url);
|
Scholar.debug("loading "+url);
|
||||||
hiddenBrowser.loadURI(url);
|
hiddenBrowser.loadURI(url);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
|
@ -156,23 +158,26 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD
|
||||||
exception(e);
|
exception(e);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
hiddenBrowser.removeEventListener("load", onLoad, true);
|
||||||
Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser);
|
Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser);
|
||||||
hiddenBrowser.setTimeout(done, 10);
|
done();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
var onLoad = function() {
|
var onLoad = function() {
|
||||||
Scholar.debug("onLoad called");
|
Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded");
|
||||||
hiddenBrowser.removeEventListener("load", onLoad, true);
|
if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times
|
||||||
try {
|
prevUrl = hiddenBrowser.contentDocument.location.href;
|
||||||
var newHiddenBrowser = new Object();
|
try {
|
||||||
newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
|
var newHiddenBrowser = new Object();
|
||||||
newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
|
newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
|
||||||
processor(newHiddenBrowser);
|
newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
|
||||||
} catch (e) {
|
processor(newHiddenBrowser);
|
||||||
Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
|
} catch (e) {
|
||||||
exception(e);
|
Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
|
||||||
|
exception(e);
|
||||||
|
}
|
||||||
|
doLoad();
|
||||||
}
|
}
|
||||||
doLoad();
|
|
||||||
};
|
};
|
||||||
var init = function() {
|
var init = function() {
|
||||||
Scholar.debug("init called");
|
Scholar.debug("init called");
|
||||||
|
@ -332,23 +337,33 @@ Scholar.Ingester.Utilities.prototype.getItemArray = function(doc, inHere, urlRe,
|
||||||
var availableItems = new Object(); // Technically, associative arrays are objects
|
var availableItems = new Object(); // Technically, associative arrays are objects
|
||||||
|
|
||||||
// Require link to match this
|
// Require link to match this
|
||||||
var tagRegexp = new RegExp();
|
if(urlRe) {
|
||||||
tagRegexp.compile(urlRe);
|
var urlRegexp = new RegExp();
|
||||||
|
urlRegexp.compile(urlRe);
|
||||||
|
}
|
||||||
// Do not allow text to match this
|
// Do not allow text to match this
|
||||||
var rejectRegexp = new RegExp();
|
if(rejectRe) {
|
||||||
rejectRegexp.compile(rejectRe);
|
var rejectRegexp = new RegExp();
|
||||||
|
rejectRegexp.compile(rejectRe);
|
||||||
|
}
|
||||||
|
|
||||||
var links = inHere.getElementsByTagName("a");
|
if(!inHere.length) {
|
||||||
for(var i=0; i<links.length; i++) {
|
inHere = new Array(inHere);
|
||||||
if(tagRegexp.test(links[i].href)) {
|
}
|
||||||
var text = this.getNodeString(doc, links[i], './/text()', null);
|
|
||||||
if(text) {
|
for(var j=0; j<inHere.length; j++) {
|
||||||
text = this.cleanString(text);
|
var links = inHere[j].getElementsByTagName("a");
|
||||||
if(!rejectRegexp.test(text)) {
|
for(var i=0; i<links.length; i++) {
|
||||||
if(availableItems[links[i].href]) {
|
if(!urlRe || urlRegexp.test(links[i].href)) {
|
||||||
availableItems[links[i].href] += " "+text;
|
var text = this.getNodeString(doc, links[i], './/text()', null);
|
||||||
} else {
|
if(text) {
|
||||||
availableItems[links[i].href] = text;
|
text = this.cleanString(text);
|
||||||
|
if(!rejectRe || !rejectRegexp.test(text)) {
|
||||||
|
if(availableItems[links[i].href]) {
|
||||||
|
availableItems[links[i].href] += " "+text;
|
||||||
|
} else {
|
||||||
|
availableItems[links[i].href] = text;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -822,7 +837,16 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
|
||||||
if(this.model.data[uri][prefixDC + 'year']) {
|
if(this.model.data[uri][prefixDC + 'year']) {
|
||||||
newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]);
|
newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]);
|
||||||
} else if(this.model.data[uri][prefixDC + 'date'] && this.model.data[uri][prefixDC + 'date'][0].length >= 4) {
|
} else if(this.model.data[uri][prefixDC + 'date'] && this.model.data[uri][prefixDC + 'date'][0].length >= 4) {
|
||||||
newItem.setField("year", this.model.data[uri][prefixDC + 'date'][0].substr(0, 4));
|
var ISORe = /^[0-9]{4}-[0-9]{2}-[0-9]{2}$/
|
||||||
|
if(ISORe.test(this.model.data[uri][prefixDC + 'date'][0])) {
|
||||||
|
newItem.setField("year", this.model.data[uri][prefixDC + 'date'][0].substr(0, 4));
|
||||||
|
} else {
|
||||||
|
var m;
|
||||||
|
var yearRe = /[0-9]{4}$/;
|
||||||
|
if(m = yearRe.exec(this.model.data[uri][prefixDC + 'date'][0])) {
|
||||||
|
newItem.setField("year", m[0]);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
133
scrapers.sql
133
scrapers.sql
|
@ -1,9 +1,9 @@
|
||||||
-- 10
|
-- 11
|
||||||
|
|
||||||
-- Set the following timestamp to the most recent scraper update date
|
-- Set the following timestamp to the most recent scraper update date
|
||||||
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-22 16:51:00'));
|
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-22 22:58:00'));
|
||||||
|
|
||||||
REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-18 10:15:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/gp/product/', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
||||||
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
||||||
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
||||||
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
||||||
|
@ -13,59 +13,98 @@ var nsResolver = namespace ? function(prefix) {
|
||||||
if (prefix == ''x'') return namespace; else return null;
|
if (prefix == ''x'') return namespace; else return null;
|
||||||
} : null;
|
} : null;
|
||||||
|
|
||||||
var uri = doc.location.href;
|
function scrape(doc) {
|
||||||
|
uri = doc.location.href;
|
||||||
|
|
||||||
// Retrieve authors
|
// Retrieve authors
|
||||||
var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a'';
|
var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a'';
|
||||||
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||||
for (var i = 0; i < elmts.length; i++) {
|
for (var i = 0; i < elmts.length; i++) {
|
||||||
var elmt = elmts[i];
|
var elmt = elmts[i];
|
||||||
|
|
||||||
model.addStatement(uri, prefixDC + ''creator'', utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue), false); // Use your own type here
|
model.addStatement(uri, prefixDC + ''creator'', utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue), false); // Use your own type here
|
||||||
}
|
}
|
||||||
|
|
||||||
// Retrieve data from "Product Details" box
|
// Retrieve data from "Product Details" box
|
||||||
var xpath = ''/html/body/table/tbody/tr/td[2]/table/tbody/tr/td[@class="bucket"]/div[@class="content"]/ul/li'';
|
var xpath = ''/html/body/table/tbody/tr/td[2]/table/tbody/tr/td[@class="bucket"]/div[@class="content"]/ul/li'';
|
||||||
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||||
for (var i = 0; i < elmts.length; i++) {
|
for (var i = 0; i < elmts.length; i++) {
|
||||||
var elmt = elmts[i];
|
var elmt = elmts[i];
|
||||||
var attribute = utilities.cleanString(utilities.getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue);
|
var attribute = utilities.cleanString(utilities.getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue);
|
||||||
if(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver)) {
|
if(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver)) {
|
||||||
var value = utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue);
|
var value = utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue);
|
||||||
if(attribute == "Publisher:") {
|
if(attribute == "Publisher:") {
|
||||||
if(value.lastIndexOf("(") != -1) {
|
if(value.lastIndexOf("(") != -1) {
|
||||||
var jsDate = value.substring(value.lastIndexOf("(")+1, value.length-1);
|
var date = value.substring(value.lastIndexOf("(")+1, value.length-1);
|
||||||
jsDate = new Date(jsDate);
|
jsDate = new Date(date);
|
||||||
var date = utilities.dateToISO(jsDate);
|
if(!isNaN(jsDate.valueOf())) {
|
||||||
|
date = utilities.dateToISO(jsDate);
|
||||||
|
}
|
||||||
|
|
||||||
value = value.substring(0, value.lastIndexOf("(")-1);
|
value = value.substring(0, value.lastIndexOf("(")-1);
|
||||||
|
}
|
||||||
|
if(value.lastIndexOf(";") != -1) {
|
||||||
|
var edition = value.substring(value.lastIndexOf(";")+2, value.length);
|
||||||
|
value = value.substring(0, value.lastIndexOf(";"));
|
||||||
|
}
|
||||||
|
model.addStatement(uri, prefixDC + ''publisher'', value);
|
||||||
|
model.addStatement(uri, prefixDC + ''date'', date);
|
||||||
|
model.addStatement(uri, prefixDC + ''hasVersion'', edition);
|
||||||
|
} else if(attribute == "Language:") {
|
||||||
|
model.addStatement(uri, prefixDC + ''language'', value);
|
||||||
|
} else if(attribute == "ISBN:") {
|
||||||
|
model.addStatement(uri, prefixDC + ''identifier'', ''ISBN ''+value);
|
||||||
|
} else if(value.substring(value.indexOf(" ")+1, value.length) == "pages") {
|
||||||
|
model.addStatement(uri, prefixDummy + ''pages'', value.substring(0, value.indexOf(" ")));
|
||||||
|
model.addStatement(uri, prefixDC + ''medium'', attribute.substring(0, attribute.indexOf(":")));
|
||||||
}
|
}
|
||||||
if(value.lastIndexOf(";") != -1) {
|
|
||||||
var edition = value.substring(value.lastIndexOf(";")+2, value.length);
|
|
||||||
value = value.substring(0, value.lastIndexOf(";"));
|
|
||||||
}
|
|
||||||
model.addStatement(uri, prefixDC + ''publisher'', value);
|
|
||||||
model.addStatement(uri, prefixDC + ''date'', date);
|
|
||||||
model.addStatement(uri, prefixDC + ''hasVersion'', edition);
|
|
||||||
} else if(attribute == "Language:") {
|
|
||||||
model.addStatement(uri, prefixDC + ''language'', value);
|
|
||||||
} else if(attribute == "ISBN:") {
|
|
||||||
model.addStatement(uri, prefixDC + ''identifier'', ''ISBN ''+value);
|
|
||||||
} else if(value.substring(value.indexOf(" ")+1, value.length) == "pages") {
|
|
||||||
model.addStatement(uri, prefixDummy + ''pages'', value.substring(0, value.indexOf(" ")));
|
|
||||||
model.addStatement(uri, prefixDC + ''medium'', attribute.substring(0, attribute.indexOf(":")));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]'';
|
||||||
|
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||||
|
var title = utilities.cleanString(utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue);
|
||||||
|
if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) {
|
||||||
|
title = title.substring(0, title.lastIndexOf("(")-1);
|
||||||
|
}
|
||||||
|
model.addStatement(uri, prefixDC + ''title'', title);
|
||||||
|
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
|
||||||
}
|
}
|
||||||
|
|
||||||
var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]'';
|
var searchRe = new RegExp(''http://www\.amazon\.com/(gp/search/|exec/obidos/search-handle-url/)'');
|
||||||
var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
var m = searchRe.exec(doc.location.href)
|
||||||
var title = utilities.cleanString(utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue);
|
if(m) {
|
||||||
if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) {
|
// Why can''t amazon use standard stylesheets
|
||||||
title = title.substring(0, title.lastIndexOf("(")-1);
|
var xpath;
|
||||||
}
|
if(m == "gp/search/") {
|
||||||
model.addStatement(uri, prefixDC + ''title'', title);
|
xpath = ''//table[@class="searchresults"]'';
|
||||||
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);');
|
} else {
|
||||||
|
xpath = ''//table[@cellpadding="3"]'';
|
||||||
|
}
|
||||||
|
|
||||||
|
var searchresults = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||||
|
var items = utilities.getItemArray(doc, searchresults, ''http://www\.amazon\.com/(gp/product/|exec/obidos/tg/detail/)'', ''^(Buy new|Hardcover|Paperback|Digital)$'');
|
||||||
|
items = utilities.selectItems(items);
|
||||||
|
|
||||||
|
if(!items) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
var uris = new Array();
|
||||||
|
for(i in items) {
|
||||||
|
uris.push(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) },
|
||||||
|
function() {
|
||||||
|
utilities.debugPrint("look, done");
|
||||||
|
done();
|
||||||
|
}, function() {});
|
||||||
|
|
||||||
|
wait();
|
||||||
|
} else {
|
||||||
|
scrape(doc);
|
||||||
|
}');
|
||||||
|
|
||||||
REPLACE INTO "scrapers" VALUES('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-18 11:02:00', 'WorldCat Scraper', 'Simon Kornblith', '^http://newfirstsearch\.oclc\.org/WebZ/',
|
REPLACE INTO "scrapers" VALUES('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-18 11:02:00', 'WorldCat Scraper', 'Simon Kornblith', '^http://newfirstsearch\.oclc\.org/WebZ/',
|
||||||
'if(doc.title == ''FirstSearch: WorldCat Detailed Record'') {
|
'if(doc.title == ''FirstSearch: WorldCat Detailed Record'') {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue