- Make the generalized function for finding search results case-insensitive
- Scrape DRA search results
This commit is contained in:
parent
8fe72b3e3c
commit
2a74e88416
2 changed files with 53 additions and 25 deletions
|
@@ -339,12 +339,12 @@ Scholar.Ingester.Utilities.prototype.getItemArray = function(doc, inHere, urlRe,
|
||||||
// Require link to match this
|
// Require link to match this
|
||||||
if(urlRe) {
|
if(urlRe) {
|
||||||
var urlRegexp = new RegExp();
|
var urlRegexp = new RegExp();
|
||||||
urlRegexp.compile(urlRe);
|
urlRegexp.compile(urlRe, "i");
|
||||||
}
|
}
|
||||||
// Do not allow text to match this
|
// Do not allow text to match this
|
||||||
if(rejectRe) {
|
if(rejectRe) {
|
||||||
var rejectRegexp = new RegExp();
|
var rejectRegexp = new RegExp();
|
||||||
rejectRegexp.compile(rejectRe);
|
rejectRegexp.compile(rejectRe, "i");
|
||||||
}
|
}
|
||||||
|
|
||||||
if(!inHere.length) {
|
if(!inHere.length) {
|
||||||
|
|
74
scrapers.sql
74
scrapers.sql
|
@@ -1,7 +1,7 @@
|
||||||
-- 13
|
-- 14
|
||||||
|
|
||||||
-- Set the following timestamp to the most recent scraper update date
|
-- Set the following timestamp to the most recent scraper update date
|
||||||
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-23 15:21:00'));
|
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-23 16:09:00'));
|
||||||
|
|
||||||
REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
||||||
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
||||||
|
@@ -1299,34 +1299,62 @@ utilities.processDocuments(browser, null, newUris, function(newBrowser) {
|
||||||
|
|
||||||
wait();');
|
wait();');
|
||||||
|
|
||||||
REPLACE INTO "scrapers" VALUES('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006-06-18 11:19:00', 'DRA Scraper', 'Simon Kornblith', '/web2/tramp2\.exe/(?:see\_record/|authority\_hits/|goto/.*\?.*screen=Record\.html)',
|
REPLACE INTO "scrapers" VALUES('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006-06-23 16:09:00', 'DRA Scraper', 'Simon Kornblith', '/web2/tramp2\.exe/(?:see\_record/|authority\_hits/|goto/.*\?.*screen=Record\.html)', NULL,
|
||||||
'if(doc.location.href.indexOf("authority_hits") > 0) {
|
|
||||||
var body = doc.getElementsByTagName("body");
|
|
||||||
if(body[0].innerHTML.indexOf("ISBN") < 0) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;',
|
|
||||||
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
|
||||||
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
|
||||||
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
|
||||||
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
|
||||||
|
|
||||||
var uri = doc.location.href;
|
var checkItems = false;
|
||||||
var uriRegexp = /^(https?:\/\/.*\/web2\/tramp2\.exe\/)(?:goto|see\_record|authority\_hits)(\/.*)\?(?:screen=Record\.html\&)?(.*)$/i;
|
|
||||||
var m = uriRegexp.exec(uri);
|
if(doc.location.href.indexOf("/authority_hits") > 0) {
|
||||||
if(uri.indexOf("authority_hits") < 0) {
|
var namespace = doc.documentElement.namespaceURI;
|
||||||
var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc&"+m[3];
|
var nsResolver = namespace ? function(prefix) {
|
||||||
} else {
|
if (prefix == ''x'') return namespace; else return null;
|
||||||
var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc";
|
} : null;
|
||||||
|
|
||||||
|
checkItems = utilities.gatherElementsOnXPath(doc, doc, "/html/body//ol/li", nsResolver);
|
||||||
}
|
}
|
||||||
|
|
||||||
utilities.HTTPUtilities.doGet(newUri, null, function(text) {
|
if(checkItems && checkItems.length) {
|
||||||
var record = new MARC_Record();
|
var items = utilities.getItemArray(doc, checkItems, ''https?://.*/web2/tramp2\.exe/see_record'');
|
||||||
record.load(text, "binary");
|
items = utilities.selectItems(items);
|
||||||
utilities.importMARCRecord(record, uri, model);
|
|
||||||
done();
|
if(!items) {
|
||||||
})
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
var uris = new Array();
|
||||||
|
for(i in items) {
|
||||||
|
uris.push(i);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
var uris = new Array(doc.location.href);
|
||||||
|
}
|
||||||
|
|
||||||
|
for(i in uris) {
|
||||||
|
var uri = uris[i];
|
||||||
|
var uriRegexp = /^(https?:\/\/.*\/web2\/tramp2\.exe\/)(?:goto|see\_record|authority\_hits)(\/.*)\?(?:screen=Record\.html\&)?(.*)$/i;
|
||||||
|
var m = uriRegexp.exec(uri);
|
||||||
|
if(uri.indexOf("/authority_hits") < 0) {
|
||||||
|
var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc&"+m[3];
|
||||||
|
} else {
|
||||||
|
var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc";
|
||||||
|
}
|
||||||
|
|
||||||
|
// Keep track of how many requests have been completed
|
||||||
|
var j = 0;
|
||||||
|
|
||||||
|
utilities.HTTPUtilities.doGet(newUri, null, function(text) {
|
||||||
|
var record = new MARC_Record();
|
||||||
|
record.load(text, "binary");
|
||||||
|
utilities.importMARCRecord(record, uris[j], model);
|
||||||
|
j++;
|
||||||
|
if(j == uris.length) {
|
||||||
|
done();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
wait();');
|
wait();');
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue