From 438ff82955f96b8ede7908ce69927ee53bd405e1 Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Thu, 31 Aug 2006 07:45:03 +0000 Subject: [PATCH] - replace storage streams with plain old strings for translate IO. there's not much of a reason to use storage streams now, and it was screwing up non-ASCII characters. - make EBSCO scraper work better through a proxy - shorten Accession Number -> Accession No, Journal Abbreviation -> Journal Abbr, Publication Title -> Publication. it does look a bit stranger, but it also makes the interface more functional (especially for those of us without giant widescreen LCDs ;-) --- .../content/scholar/xpcom/scholar.js | 2 +- .../content/scholar/xpcom/translate.js | 132 +++++++----------- scrapers.sql | 41 +++--- 3 files changed, 75 insertions(+), 100 deletions(-) diff --git a/chrome/chromeFiles/content/scholar/xpcom/scholar.js b/chrome/chromeFiles/content/scholar/xpcom/scholar.js index 5880d9ecfb..02b2618a34 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/scholar.js +++ b/chrome/chromeFiles/content/scholar/xpcom/scholar.js @@ -737,7 +737,7 @@ Scholar.Date = new function(){ var months = CSL.getMonthStrings("long"); string += months[date.month]; if(date.day) { - string += ", "+date.day; + string += " "+parseInt(date.day, 10).toString()+", "; } else { string += " "; } diff --git a/chrome/chromeFiles/content/scholar/xpcom/translate.js b/chrome/chromeFiles/content/scholar/xpcom/translate.js index 186e7fd3e0..493a40acc0 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/translate.js +++ b/chrome/chromeFiles/content/scholar/xpcom/translate.js @@ -241,15 +241,9 @@ Scholar.Translate.prototype.setLocation = function(location) { * sets the string to be used as a file */ Scholar.Translate.prototype.setString = function(string) { - this.string = string; - this._createStorageStream(); - - this._storageStreamLength = string.length; - - // write string - var fStream = this._storageStream.getOutputStream(0); - fStream.write(string, this._storageStreamLength); - fStream.close(); + this._storage = string; + this._storageLength = string.length; + this._storagePointer = 0; } /* @@ -467,7 +461,7 @@ Scholar.Translate.prototype.translate = function() { throw("cannot translate: no translator specified"); } - if(!this.location && this.type != "search" && !this._storageStream) { + if(!this.location && this.type != "search" && !this._storage) { // searches operate differently, because we could have an array of // translators and have to go through each throw("cannot translate: no location specified"); @@ -477,6 +471,12 @@ Scholar.Translate.prototype.translate = function() { return; } + if(this._storage) { + // enable reading from storage, which we can't do until the translator + // is loaded + this._storageFunctions(true); + } + // hack to see if there are any options, bc length does not work on objects if(this.type == "export") { for(var i in this._displayOptions) { @@ -1296,17 +1296,11 @@ Scholar.Translate.prototype._import = function() { * sets up import for IO */ Scholar.Translate.prototype._importConfigureIO = function() { - if(this._storageStream) { + if(this._storage) { if(this._configOptions.dataMode == "rdf") { this._rdf = new Object(); // read string out of storage stream - var sStream = Components.classes["@mozilla.org/scriptableinputstream;1"]. - createInstance(Components.interfaces.nsIScriptableInputStream); - sStream.init(this._storageStream.newInputStream(0)); - var str = sStream.read(this._storageStreamLength); - sStream.close(); - var IOService = Components.classes['@mozilla.org/network/io-service;1'] .getService(Components.interfaces.nsIIOService); this._rdf.dataSource = Components.classes["@mozilla.org/rdf/datasource;1?name=in-memory-datasource"]. @@ -1316,19 +1310,13 @@ Scholar.Translate.prototype._importConfigureIO = function() { // get URI and parse var baseURI = (this.location ? IOService.newURI(this.location, "utf-8", null) : null); - parser.parseString(this._rdf.dataSource, baseURI, str); + parser.parseString(this._rdf.dataSource, baseURI, this._storage); // make an instance of the RDF handler this._sandbox.Scholar.RDF = new Scholar.Translate.RDF(this._rdf.dataSource); } else { - this._storageStreamFunctions(true); - - if(this._scriptableStream) { - // close scriptable stream so functions will be forced to get a - // new one - this._scriptableStream.close(); - this._scriptableStream = undefined; - } + this._storageFunctions(true); + this._storagePointer = 0; } } else { if(this._configOptions.dataMode == "rdf") { @@ -1619,37 +1607,25 @@ Scholar.Translate.prototype._initializeInternalIO = function() { // make an instance of the RDF handler this._sandbox.Scholar.RDF = new Scholar.Translate.RDF(this._rdf.dataSource); } else { - this._createStorageStream(); - this._storageStreamFunctions(true, true); + this._storage = ""; + this._storageLength = 0; + this._storagePointer = 0; + this._storageFunctions(true, true); } } } -/* - * creates and returns storage stream - */ -Scholar.Translate.prototype._createStorageStream = function() { - // create a storage stream - this._storageStream = Components.classes["@mozilla.org/storagestream;1"]. - createInstance(Components.interfaces.nsIStorageStream); - this._storageStream.init(4096, 4294967295, null); // virtually no size limit -} - /* * sets up functions for reading/writing to a storage stream */ -Scholar.Translate.prototype._storageStreamFunctions = function(read, write) { +Scholar.Translate.prototype._storageFunctions = function(read, write) { var me = this; if(write) { // set up write() method - var fStream = this._storageStream.getOutputStream(0); - this._sandbox.Scholar.write = function(data) { fStream.write(data, data.length) }; - - // set Scholar.eof() to close the storage stream - this._sandbox.Scholar.eof = function() { - fStream.QueryInterface(Components.interfaces.nsIOutputStream); - fStream.close(); - } + this._sandbox.Scholar.write = function(data) { + me._storage += data; + me._storageLength += data.length; + }; } if(read) { @@ -1658,51 +1634,45 @@ Scholar.Translate.prototype._storageStreamFunctions = function(read, write) { var lastCharacter; this._sandbox.Scholar.read = function() { - if(!me._scriptableStream) { // allocate an fStream and sStream on the fly - // otherwise with no data we get an error - me._scriptableStream = Components.classes["@mozilla.org/scriptableinputstream;1"]. - createInstance(Components.interfaces.nsIScriptableInputStream); - me._scriptableStream.init(me._storageStream.newInputStream(0)); - - // attach sStream to stack of streams to close - me._streams.push(me._scriptableStream); - } - - var character = me._scriptableStream.read(1); - if(!character) { + if(me._storagePointer >= me._storageLength) { return false; } - var string = ""; - if(lastCharacter == "\r" && character == "\n") { - // if the last read got a cr, and this first char was - // an lf, ignore the lf - character = ""; + var oldPointer = me._storagePointer; + var lfIndex = me._storage.indexOf("\n", me._storagePointer); + + if(lfIndex != -1) { + // in case we have a CRLF + me._storagePointer = lfIndex+1; + if(me._storageLength > lfIndex && me._storage[lfIndex-1] == "\r") { + lfIndex--; + } + return me._storage.substr(oldPointer, lfIndex-oldPointer); } - while(character != "\n" && character != "\r" && character) { - string += character; - character = me._scriptableStream.read(1); + var crIndex = me._storage.indexOf("\r", me._storagePointer); + if(crIndex != -1) { + me._storagePointer = crIndex+1; + return me._storage.substr(oldPointer, crIndex-oldPointer-1); } - lastCharacter = character; - - return string; + me._storagePointer = me._storageLength; + return me._storage; } - } else { // block reading + } else { // block reading this._sandbox.Scholar.read = function(amount) { - if(!me._scriptableStream) { // allocate an fStream and - // sStream on the fly; otherwise - // with no data we get an error - me._scriptableStream = Components.classes["@mozilla.org/scriptableinputstream;1"]. - createInstance(Components.interfaces.nsIScriptableInputStream); - me._scriptableStream.init(me._storageStream.newInputStream(0)); - - // attach sStream to stack of streams to close - me._streams.push(me._scriptableStream); + if(me._storagePointer >= me._storageLength) { + return false; } - return me._scriptableStream.read(amount); + if((me._storagePointer+amount) <= me._storageLength) { + me._storagePointer = me._storageLength; + return me._storage; + } + + var oldPointer = me._storagePointer; + me._storagePointer += amount; + return me._storage.substr(oldPointer, amount); } } } diff --git a/scrapers.sql b/scrapers.sql index 39ce57ba5e..a7d555b72e 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -1,4 +1,4 @@ --- 68 +-- 69 -- Set the following timestamp to the most recent scraper update date REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-15 15:42:00')); @@ -3038,7 +3038,7 @@ function doWeb(doc, url) { var urls = new Array(); for(var i in items) { var m = relatedMatch.exec(relatedLinks[i]); - urls.push("http://scholar.google.com/scholar.ris?hl=en&lr=&q=info:"+m[1]+"&output=citation&oi=citation"); + urls.push("http://scholar.google.com/scholar.ris?hl=en&lr=&q=info:"+m[1]+"&oe=UTF-8&output=citation&oi=citation"); if(links[i]) { attachments.push([{title:"Google Scholar Linked Page", type:"text/html", url:links[i]}]); @@ -3150,24 +3150,24 @@ function doWeb(doc, url) { Scholar.wait(); }'); -REPLACE INTO "translators" VALUES ('d0b1914a-11f1-4dd7-8557-b32fe8a3dd47', '2006-08-18 18:03:00', 4, 'EBSCOhost', 'Simon Kornblith', '^http://web\.ebscohost\.com/ehost/(?:results|detail)', +REPLACE INTO "translators" VALUES ('d0b1914a-11f1-4dd7-8557-b32fe8a3dd47', '2006-08-18 18:03:00', 4, 'EBSCOhost', 'Simon Kornblith', '^http://[^/]+/ehost/(?:results|detail)', 'function detectWeb(doc, url) { var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; - var searchRe = new RegExp("^http://web\\.ebscohost\\.com/ehost/results", "i"); - // See if this is a seach results page - if(searchRe.test(url)) { + var searchResult = doc.evaluate(''//table[@class="result-list-inner"]'', doc, nsResolver, + XPathResult.ANY_TYPE, null).iterateNext(); + if(searchResult) { return "multiple"; - } else { - var persistentLink = doc.evaluate(''//tr[td[@class="left-content-ft"]/text() = "Persistent link to this record:"]/td[@class="right-content-ft"]'', - doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); - if(persistentLink) { - return "journalArticle"; - } + } + + var persistentLink = doc.evaluate(''//tr[td[@class="left-content-ft"]/text() = "Persistent link to this record:"]/td[@class="right-content-ft"]'', + doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if(persistentLink) { + return "journalArticle"; } }', 'function fullEscape(text) { @@ -3180,6 +3180,10 @@ function doWeb(doc, url) { if (prefix == ''x'') return namespace; else return null; } : null; + var hostRe = new RegExp("^http://([^/]+)/"); + var m = hostRe.exec(url); + var host = m[1]; + var queryRe = /\?(.*)$/; var m = queryRe.exec(url); var queryString = m[1]; @@ -3191,8 +3195,9 @@ function doWeb(doc, url) { XPathResult.ANY_TYPE, null).iterateNext(); viewState = fullEscape(viewState.value); - var searchRe = new RegExp("^http://web\\.ebscohost\\.com/ehost/results", "i"); - if(searchRe.test(url)) { + var searchResult = doc.evaluate(''//table[@class="result-list-inner"]'', doc, nsResolver, + XPathResult.ANY_TYPE, null).iterateNext(); + if(searchResult) { var items = new Object(); var tableRows = doc.evaluate(''//table[@class="cluster-result-record-table"]/tbody/tr'', @@ -3253,7 +3258,7 @@ function doWeb(doc, url) { folderBase += "&__EVENTVALIDATION="+fullEscape(folderEventValidation); var deliverString = "__EVENTTARGET=ctl00%24ctl00%24MainContentArea%24MainContentArea%24btnDelivery%24lnkSave&"+folderBase - Scholar.Utilities.HTTP.doPost("http://web.ebscohost.com/ehost/"+folderURL, + Scholar.Utilities.HTTP.doPost("http://"+host+"/ehost/"+folderURL, deliverString, function(text) { var postLocation = /