From 470f7c463f7f1563216d385f7f67757fdffb48fd Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Thu, 22 Jun 2006 20:50:57 +0000 Subject: [PATCH] The Voyager scraper now actually works on the search results page. --- .../content/scholar/ingester/selectitems.js | 23 ++++- .../content/scholar/ingester/selectitems.xul | 1 - .../content/scholar/xpcom/ingester.js | 7 ++ scrapers.sql | 97 +++++++++++++++---- 4 files changed, 104 insertions(+), 24 deletions(-) diff --git a/chrome/chromeFiles/content/scholar/ingester/selectitems.js b/chrome/chromeFiles/content/scholar/ingester/selectitems.js index e09f30b3e8..ea63b4f99e 100644 --- a/chrome/chromeFiles/content/scholar/ingester/selectitems.js +++ b/chrome/chromeFiles/content/scholar/ingester/selectitems.js @@ -21,7 +21,7 @@ Scholar_Ingester_Interface_SelectItems = function() {} Scholar_Ingester_Interface_SelectItems.init = function() { this.io = window.arguments[0]; this.Scholar_Ingester_Interface = window.arguments[1]; - this.listbox = document.getElementById("scholar-selectitems-links"); + var listbox = document.getElementById("scholar-selectitems-links"); for(i in this.io.dataIn) { // we could use a tree for this if we wanted to var itemNode = document.createElement("listitem"); @@ -29,16 +29,29 @@ Scholar_Ingester_Interface_SelectItems.init = function() { itemNode.setAttribute("value", i); itemNode.setAttribute("label", this.io.dataIn[i]); itemNode.setAttribute("checked", false); - this.listbox.appendChild(itemNode); + listbox.appendChild(itemNode); } } Scholar_Ingester_Interface_SelectItems.acceptSelection = function() { + var listbox = document.getElementById("scholar-selectitems-links"); + + var returnObject = false; this.io.dataOut = new Object(); // collect scrapeURLList from listbox - for(var i=0; i diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js index d7e92b6c49..1a48b586ff 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/ingester.js +++ b/chrome/chromeFiles/content/scholar/xpcom/ingester.js @@ -700,6 +700,13 @@ Scholar.Ingester.Document.prototype._scrapePageComplete = function(returnValue) if(this._scrapeCallback) { this._scrapeCallback(this, returnValue); } + // Get us ready for another scrape + delete this.model; + delete this.items; + this.model = new Scholar.Ingester.Model(); + this.items = new Array(); + // This is perhaps a bit paranoid, but we need to get the model redone anyway + this._generateSandbox(); } /* diff --git a/scrapers.sql b/scrapers.sql index b0f96609eb..fc7b76a3a4 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -192,21 +192,72 @@ var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; +var uri = doc.location.href; +var postString = ''''; +var form = doc.forms.namedItem(''frm''); +var newUri = form.action; +var multiple = false; + if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) { - var items = utilities.getItemArray(doc, doc, ''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti='', ''\[ [0-9]+ \]''); - var items = utilities.selectItems(items); + multiple = true; + + var availableItems = new Object(); // Technically, associative arrays are objects + + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + // Require link to match this + var tagRegexp = new RegExp(); + tagRegexp.compile(''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti=''); + // Do not allow text to match this + var rejectRegexp = new RegExp(); + rejectRegexp.compile(''\[ [0-9]+ \]''); + + var checkboxes = new Array(); + var urls = new Array(); + + var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/table/tbody/tr[td/input[@type="checkbox"]]'', nsResolver); + // Go through table rows + for(var i=0; i