The Voyager scraper now actually works on the search results page.

This commit is contained in:
Simon Kornblith 2006-06-22 20:50:57 +00:00
parent 3890e5f122
commit 470f7c463f
4 changed files with 104 additions and 24 deletions

View file

@ -21,7 +21,7 @@ Scholar_Ingester_Interface_SelectItems = function() {}
// Dialog onload handler: stores the two window arguments on this object and
// adds one "listitem" element per key of this.io.dataIn to the
// "scholar-selectitems-links" element.
// NOTE(review): a diff hunk marker sits in the middle of this span and two
// statements appear in two variants (this.listbox vs. a local listbox), so
// this looks like old and new lines of a change shown together - confirm
// which variant is current before relying on it.
Scholar_Ingester_Interface_SelectItems.init = function() {
// first window argument: the object whose dataIn is read below
this.io = window.arguments[0];
// second window argument, kept on this object; not used in this function
this.Scholar_Ingester_Interface = window.arguments[1];
this.listbox = document.getElementById("scholar-selectitems-links");
var listbox = document.getElementById("scholar-selectitems-links");
// NOTE(review): i is not declared with var in this function, so unless it is
// declared elsewhere it is created as a global
for(i in this.io.dataIn) { // we could use a tree for this if we wanted to
var itemNode = document.createElement("listitem");
@ -29,16 +29,29 @@ Scholar_Ingester_Interface_SelectItems.init = function() {
// value carries the dataIn key, label carries the dataIn entry for that key
itemNode.setAttribute("value", i);
itemNode.setAttribute("label", this.io.dataIn[i]);
// each row is created with its checked attribute set from false
itemNode.setAttribute("checked", false);
this.listbox.appendChild(itemNode);
listbox.appendChild(itemNode);
}
}
/**
 * Dialog-accept handler: copies the checked rows of the
 * "scholar-selectitems-links" element into this.io.dataOut.
 *
 * Each child whose "checked" attribute is "true" contributes one entry,
 * keyed by the child's "value" attribute and holding its "label" attribute.
 * When no child is checked, this.io.dataOut is set to null rather than to an
 * empty object.
 *
 * The superseded loop over this.listbox has been dropped: it shared a single
 * closing brace with the loop below (so the function did not parse) and it
 * copied every row without looking at the checked state.
 */
Scholar_Ingester_Interface_SelectItems.acceptSelection = function() {
	var listbox = document.getElementById("scholar-selectitems-links");
	var rows = listbox.childNodes;
	var chosen = {};
	var anyChosen = false;

	// collect only the rows the user ticked
	for(var i=0; i<rows.length; i++) {
		var itemNode = rows[i];
		// getAttribute yields a string, so compare against "true" strictly
		if(itemNode.getAttribute("checked") === "true") {
			chosen[itemNode.getAttribute("value")] = itemNode.getAttribute("label");
			anyChosen = true;
		}
	}

	// An empty object is still truthy and has no length to inspect, so hand
	// back null when nothing was picked; code down the road can then use a
	// plain truthiness check.
	this.io.dataOut = anyChosen ? chosen : null;
};

View file

@ -10,7 +10,6 @@ Borrowed from Linky, originally MPL/GPL/LGPL (now GPL, and modified into oblivio
persist="width height screenX screenY"
buttons="cancel,accept"
ondialogaccept="Scholar_Ingester_Interface_SelectItems.acceptSelection()"
ondialogcancel="self.close()"
id="scholar-selectitems"
onload="Scholar_Ingester_Interface_SelectItems.init()">

View file

@ -700,6 +700,13 @@ Scholar.Ingester.Document.prototype._scrapePageComplete = function(returnValue)
if(this._scrapeCallback) {
this._scrapeCallback(this, returnValue);
}
// Get us ready for another scrape
delete this.model;
delete this.items;
this.model = new Scholar.Ingester.Model();
this.items = new Array();
// This is perhaps a bit paranoid, but we need to get the model redone anyway
this._generateSandbox();
}
/*

View file

@ -192,21 +192,72 @@ var prefixDC = ''http://purl.org/dc/elements/1.1/'';
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
var uri = doc.location.href;
var postString = '''';
var form = doc.forms.namedItem(''frm'');
var newUri = form.action;
var multiple = false;
if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) {
var items = utilities.getItemArray(doc, doc, ''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti='', ''\[ [0-9]+ \]'');
var items = utilities.selectItems(items);
multiple = true;
var availableItems = new Object(); // Technically, associative arrays are objects
var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
if (prefix == ''x'') return namespace; else return null;
} : null;
// Require link to match this
var tagRegexp = new RegExp();
tagRegexp.compile(''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti='');
// Do not allow text to match this
var rejectRegexp = new RegExp();
rejectRegexp.compile(''\[ [0-9]+ \]'');
var checkboxes = new Array();
var urls = new Array();
var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/table/tbody/tr[td/input[@type="checkbox"]]'', nsResolver);
// Go through table rows
for(var i=0; i<tableRows.length; i++) {
// CHK is what we need to get it all as one file
var input = utilities.getNode(doc, tableRows[i], ''./td/input[@name="CHK"]'', nsResolver);
checkboxes[i] = input.value;
var links = utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver);
urls[i] = links[0].href;
utilities.debugPrint(urls[i]+" = "+links[0].href);
// Go through links
for(var j=0; j<links.length; j++) {
if(tagRegexp.test(links[j].href)) {
var text = utilities.getNodeString(doc, links[j], ''.//text()'', null);
if(text) {
text = utilities.cleanString(text);
if(!rejectRegexp.test(text)) {
if(availableItems[i]) {
availableItems[i] += " "+text;
} else {
availableItems[i] = text;
}
}
}
}
}
}
var items = utilities.selectItems(availableItems);
if(!items) {
return true;
}
// add arguments for items we need to grab
for(i in items) {
postString += "CHK="+checkboxes[i]+"&";
}
}
var uri = doc.location.href;
var raw, unicode, latin1;
var form = doc.forms.namedItem(''frm'');
var newUri = form.action;
var postString = '''';
for(i in form.elements) {
if(form.elements[i].type == ''HIDDEN'' || form.elements[i].type == ''hidden'') {
postString += escape(form.elements[i].name)+''=''+escape(form.elements[i].value)+''&'';
@ -227,11 +278,21 @@ for(i in export_options) {
}
postString += ''RD=''+i+''&MAILADDY=&SAVE=Press+to+SAVE+or+PRINT'';
utilities.debugPrint(postString);
// No idea why this doesn''t work as post
utilities.HTTPUtilities.doGet(newUri+''?''+postString, null, function(text) {
var record = new MARC_Record();
record.load(text, "binary");
model = utilities.importMARCRecord(record, uri, model);
var records = text.split("\x1D");
for(var i=0; i<(records.length-1); i++) {
if(multiple) {
utilities.debugPrint("uri = urls["+i+"]");
uri = urls[i];
utilities.debugPrint("my uri = "+uri);
}
var record = new MARC_Record();
record.load(records[i], "binary");
utilities.importMARCRecord(record, uri, model);
}
done();
})
wait();');
@ -466,7 +527,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
var record = new MARC_Record();
record.load(text, "MARC_PAC");
model = utilities.importMARCRecord(record, uri, model);
utilities.importMARCRecord(record, uri, model);
done();
}, function() {});
@ -867,7 +928,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
}
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
model = utilities.importMARCRecord(record, uri, model);
utilities.importMARCRecord(record, uri, model);
done();
}, function() {});
@ -915,7 +976,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
}
}
model = utilities.importMARCRecord(record, uri, model);
utilities.importMARCRecord(record, uri, model);
done();
}, function() {})
@ -952,7 +1013,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
record.add_field(field, ind1, ind2, value);
}
model = utilities.importMARCRecord(record, uri, model);
utilities.importMARCRecord(record, uri, model);
done();
}, function() {})
@ -983,7 +1044,7 @@ if(uri.indexOf("authority_hits") < 0) {
utilities.HTTPUtilities.doGet(newUri, null, function(text) {
var record = new MARC_Record();
record.load(text, "binary");
model = utilities.importMARCRecord(record, uri, model);
utilities.importMARCRecord(record, uri, model);
done();
})
wait();');
@ -1042,7 +1103,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
}
model = utilities.importMARCRecord(record, uri, model);
utilities.importMARCRecord(record, uri, model);
done();
}, function() {});
@ -1120,7 +1181,7 @@ utilities.HTTPUtilities.doPost(newUri, ''marks=''+recNumber+''&shadow=NO&format=
}
}
model = utilities.importMARCRecord(record, uri, model);
utilities.importMARCRecord(record, uri, model);
done();
})
wait();');
@ -1191,7 +1252,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
record.add_field(tag, ind1, ind2, content);
}
model = utilities.importMARCRecord(record, uri, model);
utilities.importMARCRecord(record, uri, model);
done();
}, function() {});