The Voyager scraper now actually works on the search results page.
parent 3890e5f122
commit 470f7c463f
4 changed files with 104 additions and 24 deletions
@@ -21,7 +21,7 @@ Scholar_Ingester_Interface_SelectItems = function() {}
Scholar_Ingester_Interface_SelectItems.init = function() {
	this.io = window.arguments[0];
	this.Scholar_Ingester_Interface = window.arguments[1];
	this.listbox = document.getElementById("scholar-selectitems-links");
	var listbox = document.getElementById("scholar-selectitems-links");
	
	for(i in this.io.dataIn) {	// we could use a tree for this if we wanted to
		var itemNode = document.createElement("listitem");
@@ -29,16 +29,29 @@ Scholar_Ingester_Interface_SelectItems.init = function() {
		itemNode.setAttribute("value", i);
		itemNode.setAttribute("label", this.io.dataIn[i]);
		itemNode.setAttribute("checked", false);
		this.listbox.appendChild(itemNode);
		listbox.appendChild(itemNode);
	}
}

Scholar_Ingester_Interface_SelectItems.acceptSelection = function() {
	var listbox = document.getElementById("scholar-selectitems-links");
	
	var returnObject = false;
	this.io.dataOut = new Object();
	
	// collect scrapeURLList from listbox
	for(var i=0; i<this.listbox.length; i++) {
		var itemNode = this.listbox[i];
		this.io.dataOut[itemNode.getAttribute("value")] = itemNode.getAttribute("label");
	for(var i=0; i<listbox.childNodes.length; i++) {
		var itemNode = listbox.childNodes[i];
		if(itemNode.getAttribute("checked") == "true") {
			this.io.dataOut[itemNode.getAttribute("value")] = itemNode.getAttribute("label");
			returnObject = true;
		}
	}
	
	// What a hack! this makes code down the road much easier because otherwise
	// an empty array is true but empty and we can't figure that out, because
	// there's no length
	if(!returnObject) {
		this.io.dataOut = null;
	}
}
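The "What a hack" comment refers to a JavaScript quirk: an object used as an associative array is truthy even when empty and has no length to test, so the dialog signals "nothing selected" by nulling out dataOut. A standalone sketch of the idea (plain JavaScript, not part of the commit):

// A plain object has no .length, and it is truthy whether or not anything was added.
var dataOut = new Object();
console.log(Boolean(dataOut));   // true, even though it is empty
console.log(dataOut.length);     // undefined

// To let callers write a simple if(io.dataOut) test, signal "nothing selected"
// explicitly with null instead of handing back an empty object.
var anythingSelected = false;
for (var key in dataOut) {
	anythingSelected = true;
	break;
}
if (!anythingSelected) {
	dataOut = null;
}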
@@ -10,7 +10,6 @@ Borrowed from Linky, originally MPL/GPL/LGPL (now GPL, and modified into oblivio
	persist="width height screenX screenY"
	buttons="cancel,accept"
	ondialogaccept="Scholar_Ingester_Interface_SelectItems.acceptSelection()"
	ondialogcancel="self.close()"
	id="scholar-selectitems"
	onload="Scholar_Ingester_Interface_SelectItems.init()">
@@ -700,6 +700,13 @@ Scholar.Ingester.Document.prototype._scrapePageComplete = function(returnValue)
	if(this._scrapeCallback) {
		this._scrapeCallback(this, returnValue);
	}
	// Get us ready for another scrape
	delete this.model;
	delete this.items;
	this.model = new Scholar.Ingester.Model();
	this.items = new Array();
	// This is perhaps a bit paranoid, but we need to get the model redone anyway
	this._generateSandbox();
}

/*
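A standalone sketch of the reset pattern used in _scrapePageComplete above (illustration only; the hypothetical resetForNextScrape stands in for the real model and sandbox rebuild):

// Drop whatever the last scrape accumulated and start from fresh containers,
// so a follow-up scrape cannot see stale data.
function resetForNextScrape(doc) {
	delete doc.model;
	delete doc.items;
	doc.model = {};   // stands in for new Scholar.Ingester.Model()
	doc.items = [];   // fresh item list
}

var doc = { model: { stale: true }, items: ["left over"] };
resetForNextScrape(doc);
console.log(doc.model, doc.items);   // {} []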
97	scrapers.sql
@@ -192,21 +192,72 @@ var prefixDC = ''http://purl.org/dc/elements/1.1/'';
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';

var uri = doc.location.href;
var postString = '''';
var form = doc.forms.namedItem(''frm'');
var newUri = form.action;
var multiple = false;

if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) {
	var items = utilities.getItemArray(doc, doc, ''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti='', ''\[ [0-9]+ \]'');
	var items = utilities.selectItems(items);
	multiple = true;
	
	var availableItems = new Object();	// Technically, associative arrays are objects
	
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == ''x'') return namespace; else return null;
	} : null;
	
	// Require link to match this
	var tagRegexp = new RegExp();
	tagRegexp.compile(''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti='');
	// Do not allow text to match this
	var rejectRegexp = new RegExp();
	rejectRegexp.compile(''\[ [0-9]+ \]'');
	
	var checkboxes = new Array();
	var urls = new Array();
	
	var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/table/tbody/tr[td/input[@type="checkbox"]]'', nsResolver);
	// Go through table rows
	for(var i=0; i<tableRows.length; i++) {
		// CHK is what we need to get it all as one file
		var input = utilities.getNode(doc, tableRows[i], ''./td/input[@name="CHK"]'', nsResolver);
		checkboxes[i] = input.value;
		var links = utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver);
		urls[i] = links[0].href;
		utilities.debugPrint(urls[i]+" = "+links[0].href);
		// Go through links
		for(var j=0; j<links.length; j++) {
			if(tagRegexp.test(links[j].href)) {
				var text = utilities.getNodeString(doc, links[j], ''.//text()'', null);
				if(text) {
					text = utilities.cleanString(text);
					if(!rejectRegexp.test(text)) {
						if(availableItems[i]) {
							availableItems[i] += " "+text;
						} else {
							availableItems[i] = text;
						}
					}
				}
			}
		}
	}
	
	var items = utilities.selectItems(availableItems);
	if(!items) {
		return true;
	}
	
	// add arguments for items we need to grab
	for(i in items) {
		postString += "CHK="+checkboxes[i]+"&";
	}
}

var uri = doc.location.href;

var raw, unicode, latin1;

var form = doc.forms.namedItem(''frm'');
var newUri = form.action;
var postString = '''';
for(i in form.elements) {
	if(form.elements[i].type == ''HIDDEN'' || form.elements[i].type == ''hidden'') {
		postString += escape(form.elements[i].name)+''=''+escape(form.elements[i].value)+''&'';
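The new multiple-results branch above keeps only links whose href looks like a record link, drops the "[ n ]" paging links, offers the remaining titles to the user, and turns the chosen rows into CHK arguments. A standalone sketch of that selection logic with invented sample data (the real code uses the utilities XPath helpers):

// Plain-JavaScript sketch; sample rows and CHK values are made up for illustration.
var tagRegexp = /Pwebrecon\.cgi\?.*v1=[0-9]+&.*ti=/;   // href must look like a record link
var rejectRegexp = /\[ [0-9]+ \]/;                     // skip "[ 12 ]" style paging links

// One entry per result row: the CHK checkbox value plus that row's links.
var rows = [
	{ chk: "1001", links: [{ href: "Pwebrecon.cgi?v1=1&ti=1,1", text: "First title" }] },
	{ chk: "1002", links: [{ href: "Pwebrecon.cgi?v1=2&ti=1,2", text: "[ 2 ]" }] }
];

var availableItems = {};
var checkboxes = {};
for (var i = 0; i < rows.length; i++) {
	checkboxes[i] = rows[i].chk;
	for (var j = 0; j < rows[i].links.length; j++) {
		var link = rows[i].links[j];
		if (tagRegexp.test(link.href) && !rejectRegexp.test(link.text)) {
			availableItems[i] = link.text;   // offered to the user for selection
		}
	}
}

// Pretend the user checked everything that was offered; build the CHK arguments.
var postString = "";
for (var i in availableItems) {
	postString += "CHK=" + checkboxes[i] + "&";
}
// postString === "CHK=1001&" (the second row's only link was a paging link, so it is skipped)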
@@ -227,11 +278,21 @@ for(i in export_options) {
}
postString += ''RD=''+i+''&MAILADDY=&SAVE=Press+to+SAVE+or+PRINT'';

utilities.debugPrint(postString);

// No idea why this doesn''t work as post
utilities.HTTPUtilities.doGet(newUri+''?''+postString, null, function(text) {
	var record = new MARC_Record();
	record.load(text, "binary");
	model = utilities.importMARCRecord(record, uri, model);
	var records = text.split("\x1D");
	for(var i=0; i<(records.length-1); i++) {
		if(multiple) {
			utilities.debugPrint("uri = urls["+i+"]");
			uri = urls[i];
			utilities.debugPrint("my uri = "+uri);
		}
		var record = new MARC_Record();
		record.load(records[i], "binary");
		utilities.importMARCRecord(record, uri, model);
	}
	done();
})
wait();');
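The split on "\x1D" works because that byte is the MARC record terminator, so a batch download is simply the individual records concatenated. A standalone sketch with made-up sample text:

// "\x1D" is the MARC record terminator; splitting on it recovers individual records.
var RECORD_TERMINATOR = "\x1D";
var batch = "first record" + RECORD_TERMINATOR + "second record" + RECORD_TERMINATOR;

var records = batch.split(RECORD_TERMINATOR);
// split() leaves a trailing empty string after the final terminator,
// which is why the loop stops at records.length - 1.
for (var i = 0; i < records.length - 1; i++) {
	console.log("record " + i + ": " + records[i]);
}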
@@ -466,7 +527,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
	
	var record = new MARC_Record();
	record.load(text, "MARC_PAC");
	model = utilities.importMARCRecord(record, uri, model);
	utilities.importMARCRecord(record, uri, model);
	done();
}, function() {});
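The same one-line change, dropping the model = reassignment around utilities.importMARCRecord, recurs in every hunk below. A standalone sketch of the pattern, assuming (this diff does not show it) that the helper fills in the model object it is handed, which makes capturing the return value unnecessary:

// Hypothetical stand-in for a helper that populates the model it receives.
// Because the object is modified in place, "model = importInto(...)" and a
// bare "importInto(...)" leave the caller's model in the same state.
function importInto(record, uri, model) {
	model[uri] = record;   // mutation happens on the caller's object
	return model;          // return value kept only for convenience
}

var model = {};
importInto({ title: "Sample record" }, "http://example.org/item/1", model);
console.log(model["http://example.org/item/1"].title);   // "Sample record"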
@@ -867,7 +928,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
	}
	
	model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
	model = utilities.importMARCRecord(record, uri, model);
	utilities.importMARCRecord(record, uri, model);
	done();
}, function() {});
@@ -915,7 +976,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
		}
	}
	
	model = utilities.importMARCRecord(record, uri, model);
	utilities.importMARCRecord(record, uri, model);
	done();
}, function() {})
@@ -952,7 +1013,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
		record.add_field(field, ind1, ind2, value);
	}
	
	model = utilities.importMARCRecord(record, uri, model);
	utilities.importMARCRecord(record, uri, model);
	done();
}, function() {})
@@ -983,7 +1044,7 @@ if(uri.indexOf("authority_hits") < 0) {
utilities.HTTPUtilities.doGet(newUri, null, function(text) {
	var record = new MARC_Record();
	record.load(text, "binary");
	model = utilities.importMARCRecord(record, uri, model);
	utilities.importMARCRecord(record, uri, model);
	done();
})
wait();');
@@ -1042,7 +1103,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
	
	}
	
	model = utilities.importMARCRecord(record, uri, model);
	utilities.importMARCRecord(record, uri, model);
	done();
}, function() {});
@@ -1120,7 +1181,7 @@ utilities.HTTPUtilities.doPost(newUri, ''marks=''+recNumber+''&shadow=NO&format=
		}
	}
	
	model = utilities.importMARCRecord(record, uri, model);
	utilities.importMARCRecord(record, uri, model);
	done();
})
wait();');
@@ -1191,7 +1252,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
		record.add_field(tag, ind1, ind2, content);
	}
	
	model = utilities.importMARCRecord(record, uri, model);
	utilities.importMARCRecord(record, uri, model);
	done();
}, function() {});