diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js index 1ce0cdd022..cb702defd9 100644 --- a/chrome/chromeFiles/content/scholar/ingester/browser.js +++ b/chrome/chromeFiles/content/scholar/ingester/browser.js @@ -27,6 +27,7 @@ Scholar_Ingester_Interface.init = function() { Scholar_Ingester_Interface._scrapePopupShowing = false; Scholar.Ingester.ProxyMonitor.init(); Scholar.Ingester.MIMEHandler.init(); + Scholar.Translate.init(); window.addEventListener("load", Scholar_Ingester_Interface.chromeLoad, false); window.addEventListener("unload", Scholar_Ingester_Interface.chromeUnload, false); diff --git a/chrome/chromeFiles/content/scholar/xpcom/translate.js b/chrome/chromeFiles/content/scholar/xpcom/translate.js index 7904608cff..ba945d26be 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/translate.js +++ b/chrome/chromeFiles/content/scholar/xpcom/translate.js @@ -115,6 +115,81 @@ Scholar.Translate = function(type, saveItem) { this._streams = new Array(); } +/* + * (singleton) initializes scrapers, loading from the database and separating + * into types + */ +Scholar.Translate.init = function() { + if(!Scholar.Translate.cache) { + var cachePref = Scholar.Prefs.get("cacheTranslatorData"); + + if(cachePref) { + // fetch translator list + var translators = Scholar.DB.query("SELECT translatorID, type, label, target, detectCode IS NULL as noDetectCode FROM translators ORDER BY target IS NULL"); + var detectCodes = Scholar.DB.query("SELECT translatorID, detectCode FROM translators WHERE target IS NULL"); + + Scholar.Translate.cache = new Object(); + Scholar.Translate.cache["import"] = new Array(); + Scholar.Translate.cache["export"] = new Array(); + Scholar.Translate.cache["web"] = new Array(); + Scholar.Translate.cache["search"] = new Array(); + + for each(translator in translators) { + var type = translator.type; + + // not sure why this is necessary + var wrappedTranslator = {translatorID:translator.translatorID, + label:translator.label, + target:translator.target} + + if(translator.noDetectCode) { + wrappedTranslator.noDetectCode = true; + } + + // import translator + var mod = type % 2; + if(mod) { + var regexp = new RegExp(); + regexp.compile("\."+translator.target+"$", "i"); + wrappedTranslator.importRegexp = regexp; + Scholar.Translate.cache["import"].push(wrappedTranslator); + type -= mod; + } + // search translator + var mod = type % 4; + if(mod) { + Scholar.Translate.cache["export"].push(wrappedTranslator); + type -= mod; + } + // web translator + var mod = type % 8; + if(mod) { + var regexp = new RegExp(); + regexp.compile(translator.target, "i"); + wrappedTranslator.webRegexp = regexp; + Scholar.Translate.cache["web"].push(wrappedTranslator); + + if(!translator.target) { + for each(var detectCode in detectCodes) { + if(detectCode.translatorID == translator.translatorID) { + wrappedTranslator.detectCode = detectCode.detectCode; + } + } + } + type -= mod; + } + // search translator + var mod = type % 16; + if(mod) { + Scholar.Translate.cache["search"].push(wrappedTranslator); + type -= mod; + } + } + } + + } +} + /* * sets the browser to be used for web translation; also sets the location */ @@ -288,8 +363,12 @@ Scholar.Translate.prototype.setHandler = function(type, handler) { * itemType - the type of item this scraper says it will scrape */ Scholar.Translate.prototype.getTranslators = function() { - var sql = "SELECT translatorID, label, target, detectCode FROM translators WHERE type IN ("+this._numericTypes+") ORDER BY target IS NULL"; - var translators = Scholar.DB.query(sql); + if(Scholar.Translate.cache) { + var translators = Scholar.Translate.cache[this.type]; + } else { + var sql = "SELECT translatorID, label, target, detectCode IS NULL as noDetectCode FROM translators WHERE type IN ("+this._numericTypes+") ORDER BY target IS NULL"; + var translators = Scholar.DB.query(sql); + } if(!this.location && !this.search) { return translators; // no need to see which can translate, because @@ -369,6 +448,8 @@ Scholar.Translate.prototype._loadTranslator = function() { * does the actual translation */ Scholar.Translate.prototype.translate = function() { + Scholar.debug("translate called"); + this.newItems = new Array(); this.newCollections = new Array(); this._IDMap = new Array(); @@ -378,7 +459,7 @@ Scholar.Translate.prototype.translate = function() { throw("cannot translate: no translator specified"); } - if(!this.location && this.type != "search") { + if(!this.location && this.type != "search" && !this._storageStream) { // searches operate differently, because we could have an array of // translators and have to go through each throw("cannot translate: no location specified"); @@ -388,15 +469,16 @@ Scholar.Translate.prototype.translate = function() { return; } - // hack to see if there are any options, bc length does not work on objects - for(var i in this._displayOptions) { - // run handler for options if there are any - if(!(this._displayOptions = this._runHandler("options", this._displayOptions))) { - this._translationComplete(true); - return false; + if(this.type == "export") { + for(var i in this._displayOptions) { + // run handler for options if there are any + if(!(this._displayOptions = this._runHandler("options", this._displayOptions))) { + this._translationComplete(true); + return false; + } + break; } - break; } var returnValue; @@ -495,45 +577,65 @@ Scholar.Translate.prototype._generateSandbox = function() { this._sandbox.Scholar.getOption = function(option) { return me._getOption(option) }; // for loading other translators and accessing their methods - this._sandbox.Scholar.loadTranslator = function(type, translatorID) { - var translation = new Scholar.Translate(type, (translatorID ? true : false)); + this._sandbox.Scholar.loadTranslator = function(type) { + var translation = new Scholar.Translate(type, false); translation._parentTranslator = me; - if(translatorID) { - // assign same handlers as for parent, because the done handler won't - // get called anyway, and the itemDone/selectItems handlers should be - // the same - translation._handlers = me._handlers; - // set the translator - translation.setTranslator(translatorID); - // load the translator into our sandbox - translation._loadTranslator(); - // use internal io - translation._initializeInternalIO(); - // when a new item is added, we should be notified - translation.newItems = me.newItems; - translation.newCollections = me.newCollections; - - return translation._sandbox; - } else { - // create a safe translator object, so that scrapers can't get - // access to potentially harmful methods. - if(type == "import" || type == "export") { - throw("you must specify a translatorID for "+type+" translation"); + if(type == "export" && (this.type == "web" || this.type == "search")) { + throw("for security reasons, web and search translators may not call export translators"); + } + + // for security reasons, safeTranslator wraps the translator object. + // note that setLocation() is not allowed + var safeTranslator = new Object(); + safeTranslator.setSearch = function(arg) { return translation.setSearch(arg) }; + safeTranslator.setBrowser = function(arg) { return translation.setBrowser(arg) }; + safeTranslator.setHandler = function(arg1, arg2) { translation.setHandler(arg1, arg2) }; + safeTranslator.setString = function(arg) { translation.setString(arg) }; + safeTranslator.setTranslator = function(arg) { return translation.setTranslator(arg) }; + safeTranslator.getTranslators = function() { return translation.getTranslators() }; + safeTranslator.translate = function() { + var noHandlers = true; + for(var i in translation._handlers) { + noHandlers = false; + break; + } + if(noHandlers) { + if(type != "export") { + translation.setHandler("itemDone", function(obj, item) { item.complete() }); + } + if(type == "web") { + translation.setHandler("selectItems", me._handlers["selectItems"]); + } } - var safeTranslator = new Object(); - safeTranslator.setSearch = function(arg) { return translation.setSearch(arg) }; - safeTranslator.setBrowser = function(arg) { return translation.setBrowser(arg) }; - safeTranslator.setHandler = function(arg1, arg2) { translation.setHandler(arg1, arg2) }; - safeTranslator.setString = function(arg) { translation.setString(arg) }; - safeTranslator.setTranslator = function(arg) { return translation.setTranslator(arg) }; - safeTranslator.getTranslators = function() { return translation.getTranslators() }; - safeTranslator.translate = function() { return translation.translate() }; - translation._parentTranslator = me; + return translation.translate() + }; + safeTranslator.getTranslatorObject = function() { + // load the translator into our sandbox + translation._loadTranslator(); + // initialize internal IO + translation._initializeInternalIO(); - return safeTranslator; - } + var noHandlers = true; + for(var i in translation._handlers) { + noHandlers = false; + break; + } + if(noHandlers) { + if(type != "export") { + translation.setHandler("itemDone", function(obj, item) { item.complete() }); + } + if(type == "web") { + translation.setHandler("selectItems", me._handlers["selectItems"]); + } + } + + // return sandbox + return translation._sandbox; + }; + + return safeTranslator; } } @@ -542,14 +644,21 @@ Scholar.Translate.prototype._generateSandbox = function() { */ Scholar.Translate.prototype._canTranslate = function(translator, ignoreExtensions) { // Test location with regular expression - // If this is slow, we could preload all scrapers and compile regular - // expressions, so each check will be faster - if(translator.target && this.type != "search") { + if(translator.target && (this.type == "import" || this.type == "web")) { var canTranslate = false; + if(this.type == "web") { - var regularExpression = new RegExp(translator.target, "i"); + if(translator.webRegexp) { + var regularExpression = translator.webRegexp; + } else { + var regularExpression = new RegExp(translator.target, "i"); + } } else { - var regularExpression = new RegExp("\."+translator.target+"$", "i"); + if(translator.importRegexp) { + var regularExpression = translator.importRegexp; + } else { + var regularExpression = new RegExp("\."+translator.target+"$", "i"); + } } if(regularExpression.test(this.path)) { @@ -562,7 +671,7 @@ Scholar.Translate.prototype._canTranslate = function(translator, ignoreExtension canTranslate = !canTranslate; // if a translator has no detectCode, don't offer it as an option - if(!translator.detectCode) { + if(translator.noDetectCode) { return false; } } @@ -572,8 +681,7 @@ Scholar.Translate.prototype._canTranslate = function(translator, ignoreExtension // Test with JavaScript if available and didn't have a regular expression or // passed regular expression test - if((!translator.target || canTranslate) - && translator.detectCode) { + if(!translator.target || canTranslate) { // parse the detect code and execute this._parseDetectCode(translator); @@ -629,8 +737,16 @@ Scholar.Translate.prototype._parseDetectCode = function(translator) { this._displayOptions = new Array(); if(translator.detectCode) { + var detectCode = translator.detectCode; + } else if(!translator.noDetectCode) { + // get detect code from database + var detectCode = Scholar.DB.valueQuery("SELECT detectCode FROM translators WHERE translatorID = ?", + [translator.translatorID]); + } + + if(detectCode) { try { - Components.utils.evalInSandbox(translator.detectCode, this._sandbox); + Components.utils.evalInSandbox(detectCode, this._sandbox); } catch(e) { Scholar.debug(e+' in parsing detectCode for '+translator.label); return; @@ -850,10 +966,6 @@ Scholar.Translate.prototype._itemDone = function(item) { } this._runHandler("itemDone", item); return; - } else if(this._parentTranslator) { - // run done on parent - this._parentTranslator._itemDone(item); - return; } if(!item.title) { @@ -968,11 +1080,13 @@ Scholar.Translate.prototype._itemDone = function(item) { attachmentItem.setField("title", attachment.title); } } else { - Scholar.Attachments.importFromURL(attachment.url, myID); + Scholar.Attachments.importFromURL(attachment.url, myID, + (attachment.mimeType ? attachment.mimeType : attachment.document.contentType), + (attachment.title ? attachment.title : attachment.document.title)); } } else { if(attachment.document) { - Scholar.Attachments.linkFromURL(attachment.document.location.href, myID, + var attachmentID = Scholar.Attachments.linkFromURL(attachment.document.location.href, myID, (attachment.mimeType ? attachment.mimeType : attachment.document.contentType), (attachment.title ? attachment.title : attachment.document.title)); } else { @@ -980,7 +1094,7 @@ Scholar.Translate.prototype._itemDone = function(item) { Scholar.debug("notice: either mimeType or title is missing; attaching file will be slower"); } - Scholar.Attachments.linkFromURL(attachment.url, myID, + var attachmentID = Scholar.Attachments.linkFromURL(attachment.url, myID, (attachment.mimeType ? attachment.mimeType : undefined), (attachment.title ? attachment.title : undefined)); } diff --git a/chrome/chromeFiles/content/scholar/xpcom/utilities.js b/chrome/chromeFiles/content/scholar/xpcom/utilities.js index e5476a496b..e99f03869e 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/utilities.js +++ b/chrome/chromeFiles/content/scholar/xpcom/utilities.js @@ -383,6 +383,7 @@ Scholar.Utilities.HTTP = new function() { .createInstance(); xmlhttp.open('POST', url, true); + xmlhttp.setRequestHeader("Content-Type", "application/x-www-form-urlencoded"); xmlhttp.onreadystatechange = function(){ _stateChange(xmlhttp, onDone); diff --git a/defaults/preferences/scholar.js b/defaults/preferences/scholar.js index 91c43d5d54..5b4628a9ba 100644 --- a/defaults/preferences/scholar.js +++ b/defaults/preferences/scholar.js @@ -2,6 +2,7 @@ // Display internal shortcut pref("extensions.scholar.automaticScraperUpdates",true); +pref("extensions.scholar.cacheTranslatorData",true); pref("extensions.scholar.scholarPaneOnTop",false); pref("extensions.scholar.openURL.resolver","http://athene.gmu.edu:8888/lfp/LinkFinderPlus/Display"); pref("extensions.scholar.openURL.version","0.1"); diff --git a/scrapers.sql b/scrapers.sql index e450e99de5..f917f795a1 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -1,4 +1,4 @@ --- 51 +-- 52 -- Set the following timestamp to the most recent scraper update date REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-15 15:42:00')); @@ -66,7 +66,6 @@ REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006 } else if(value.substring(value.indexOf(" ")+1, value.length) == "pages") { newItem.pages = value.substring(0, value.indexOf(" ")); } else if(attribute != "Average Customer Review:") { - Scholar.Utilities.debug(''"''+attribute+''"''); if(attribute == "In-Print Editions:") { value = value.replace(" | All Editions", ""); } else { @@ -79,7 +78,6 @@ REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006 } catch(ex) {} } - Scholar.Utilities.debug(newItem.extra); if(newItem.extra) { newItem.extra = newItem.extra.substr(0, newItem.extra.length-1); } @@ -154,7 +152,6 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006 Scholar.Utilities.HTTP.doPost(newUrl, ''exportselect=record&exporttype=plaintext'', function(text) { - Scholar.Utilities.debug(text); var lineRegexp = new RegExp(); lineRegexp.compile("^([\\w() ]+): *(.*)$"); @@ -437,10 +434,10 @@ REPLACE INTO "translators" VALUES ('88915634-1af6-c134-0171-56fd198235ed', '2006 // No idea why this doesn''t work as post Scholar.Utilities.HTTP.doGet(newUri+''?''+postString, function(text) { // load translator for MARC - var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); - marc.Scholar.write(text); - marc.Scholar.eof(); - marc.doImport(url); + var marc = Scholar.loadTranslator("import"); + marc.setTranslator("a6ee60df-1ddc-4aae-bb25-45e0537be973"); + marc.setString(text); + marc.translate(); Scholar.done(); }) @@ -583,7 +580,6 @@ function doWeb(doc, url) { getList(saveCitations, null, function() { // mark this Scholar.Utilities.HTTP.doGet(''http://www.jstor.org/browse/citations.txt?exportAction=Save+as+Text+File&exportFormat=cm&viewCitations=1'', function(text) { // get marked - Scholar.Utilities.debug(text); var k = 0; var lines = text.split("\n"); var haveStarted = false; @@ -765,44 +761,51 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006 var xpath = ''//pre/text()[1]''; var text = newDoc.evaluate(xpath, newDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue; - var newItem = new Scholar.Item(); - var record = new marc.MARC_Record(); + var newItem = new Scholar.Item(); + var record = new marc.record(); var linee = text.split("\n"); for (var i=0; i ''008'' && tag < ''899'') { // jumps low and high tags - if (tag != ''040'') record.add_field(tag,ind1,ind2,value); + + linee[i] = linee[i].replace(/[\xA0_\t]/g, " "); + var value = linee[i].substr(7); + + if(linee[i].substr(0, 6) == " ") { + // add this onto previous value + tagValue += value; + } else { + if(linee[i].substr(0, 6) == "LEADER") { + // trap leader + record.leader = value; + } else { + if(tagValue) { // finish last tag + tagValue = tagValue.replace(/\|(.)/g, marc.subfieldDelimiter+"$1"); + if(tagValue[0] != marc.subfieldDelimiter) { + tagValue = marc.subfieldDelimiter+"a"+tagValue; + } + + // add previous tag + record.addField(tag, ind, tagValue); + } + + var tag = linee[i].substr(0, 3); + var ind = linee[i].substr(4, 2); + var tagValue = value; + } } + } + if(tagValue) { + tagValue = tagValue.replace(/\|(.)/g, marc.subfieldDelimiter+"$1"); + if(tagValue[0] != marc.subfieldDelimiter) { + tagValue = marc.subfieldDelimiter+"a"+tagValue; + } + + // add previous tag + record.addField(tag, ind, tagValue); } record.translate(newItem); @@ -811,16 +814,16 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006 function pageByPage(marc, urls) { Scholar.Utilities.processDocuments(urls, function(newDoc) { - scrape(marc, newDoc); + scrape(marc.getTranslatorObject(), newDoc); }, function() { Scholar.done() }); } function doWeb(doc, url) { var uri = doc.location.href; var newUri; - // load translator for MARC - var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); + var marc = Scholar.loadTranslator("import"); + marc.setTranslator("a6ee60df-1ddc-4aae-bb25-45e0537be973"); var matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$''); var m = matchRegexp.exec(uri); @@ -840,7 +843,7 @@ function doWeb(doc, url) { var xpath = ''//a[img[@src="/screens/regdisp.gif" or @alt="REGULAR RECORD DISPLAY"]]''; var aTag = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); if(aTag) { - scrape(marc, doc); + scrape(marc.getTranslatorObject(), doc); return; } } @@ -862,7 +865,6 @@ function doWeb(doc, url) { // Go through table rows var i = 0; while(tableRow = tableRows.iterateNext()) { - Scholar.Utilities.debug("row"); // CHK is what we need to get it all as one file var input = doc.evaluate(''./td/input[@type="checkbox"]'', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); @@ -885,7 +887,6 @@ function doWeb(doc, url) { // Go through links while(link) { if(tagRegexp.test(link.href)) { - Scholar.Utilities.debug("hello"); var text = Scholar.Utilities.getNodeString(doc, link, ".//text()", null); if(text) { @@ -909,9 +910,12 @@ function doWeb(doc, url) { if(!items) { return true; } - - var urlRe = new RegExp("^(http://[^/]+(/search/[^/]+/))"); + var urlRe = new RegExp("^(https?://[^/]+(/search/[^/]+(?:/|$)))"); var m = urlRe.exec(urls[0]); + if(!m) { + throw("urlRe choked on "+urls[0]); + } + var clearUrl = m[0]+"?clear_saves=1"; var postUrl = m[0]; var exportUrl = m[1]+"++export/1,-1,-1,B/export"; @@ -925,6 +929,9 @@ function doWeb(doc, url) { number++; } var m = matchRegexp.exec(urls[i]); + if(!m) { + throw("matchRegexp choked on "+urls[i]); + } newUrls.push(m[1]+"marc"+m[2]); } @@ -937,9 +944,8 @@ function doWeb(doc, url) { Scholar.Utilities.HTTP.doPost(exportUrl, "ex_format=50&ex_device=45&SUBMIT=Submit", function(text) { var notSpace = /[^\s]/ if(notSpace.test(text)) { - marc.Scholar.write(text); - marc.Scholar.eof(); - marc.doImport(); + marc.setString(text); + marc.translate(); Scholar.done(); } else { @@ -1081,7 +1087,6 @@ function doWeb(doc, url) { var m = hostRe.exec(doc.location.href); var hitlist = doc.forms.namedItem("hitlist"); var baseUrl = m[0]+hitlist.getAttribute("action")+"?first_hit="+hitlist.elements.namedItem("first_hit").value+"&last_hit="+hitlist.elements.namedItem("last_hit").value; - Scholar.Utilities.debug(baseUrl); var uris = new Array(); for(var i in items) { @@ -1104,8 +1109,6 @@ REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006 } }', 'function scrape(doc) { - Scholar.Utilities.debug("hello"); - var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; @@ -1186,7 +1189,6 @@ REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006 var value = doc.evaluate(''./TD[2]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); if(value.nodeValue) { value = Scholar.Utilities.superCleanString(value.nodeValue).toLowerCase(); - Scholar.Utilities.debug(value); if(value.indexOf("periodical") >= 0) { newItem.itemType = "magazineArticle"; @@ -1223,7 +1225,6 @@ REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006 while(currentSubject = subjects.iterateNext()) { var subjectValue = Scholar.Utilities.getNodeString(doc, currentSubject, ".//text()", nsResolver); subjectValue = Scholar.Utilities.superCleanString(subjectValue); - Scholar.Utilities.debug("tag: "+subjectValue); if(subjectValue) { newItem.tags.push(subjectValue); } @@ -1247,7 +1248,6 @@ REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006 var item = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); if(item) { var title = attachArray[xpath]; - Scholar.Utilities.debug(title); if(item.parentNode.tagName.toLowerCase() == "a") { // item is not this page @@ -1618,7 +1618,9 @@ REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006 } } - var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); + var translator = Scholar.loadTranslator("import"); + translator.setTranslator("a6ee60df-1ddc-4aae-bb25-45e0537be973"); + var marc = translator.getTranslatorObject(); Scholar.Utilities.processDocuments(newUris, function(newDoc) { var uri = newDoc.location.href; @@ -1628,26 +1630,29 @@ REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006 } : null; var xpath = ''/html/body/table/tbody/tr[td[1][@id="bold"]][td[2]]''; - var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); + var elmts = newDoc.evaluate(xpath, newDoc, nsResolver, XPathResult.ANY_TYPE, null); + var elmt; - var record = new marc.MARC_Record(); - for(var i=0; i 3) { - var ind1 = field.charAt(3); + ind = field[3]; if(field.length > 4) { - var ind2 = field.charAt(4); + ind += field[4]; } } - record.add_field(code, ind1, ind2, value); + + record.addField(code, ind, value); } } @@ -1668,7 +1673,7 @@ REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006 return "multiple"; } }', -'function scrape(doc, url) { +'function doWeb(doc, url) { var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; @@ -1701,7 +1706,9 @@ REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006 } } - var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); + var translator = Scholar.loadTranslator("import"); + translator.setTranslator("a6ee60df-1ddc-4aae-bb25-45e0537be973"); + var marc = translator.getTranslatorObject(); Scholar.Utilities.processDocuments(uris, function(newDoc) { var uri = newDoc.location.href; @@ -1712,28 +1719,29 @@ REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006 } : null; var xpath = ''//form/table[@class="tableBackground"]/tbody/tr/td/table[@class="tableBackground"]/tbody/tr[td[1]/a[@class="normalBlackFont1"]]''; - var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); + var elmts = newDoc.evaluate(xpath, newDoc, nsResolver, XPathResult.ANY_TYPE, null); + var elmt; - var record = new marc.MARC_Record(); - for(var i=0; i 3) { + ind = field[3]; + if(field.length > 4) { + ind += field[4]; } - value = m[3]; } - marc.add_field(field, ind1, ind2, value); + + record.addField(code, ind, value); } } @@ -1765,10 +1773,12 @@ REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006 var uri = doc.location.href; var newUris = new Array(); - var marcs = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//a[text()="marc"]'', nsResolver); + var marcs = doc.evaluate(''//a[text()="marc"]'', doc, nsResolver, + XPathResult.ANY_TYPE, null); + var record = marcs.iterateNext(); - if(marcs.length == 1) { - newUris.push(marcs[0].href) + if(record && !marcs.iterateNext()) { + newUris.push(record.href); } else { // Require link to match this var tagRegexp = new RegExp(); @@ -1776,10 +1786,12 @@ REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006 var items = new Array(); - var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//tr[@class="intrRow"]'', nsResolver); + var tableRows = doc.evaluate(''//tr[@class="intrRow"]'', doc, nsResolver, + XPathResult.ANY_TYPE, null); + var tableRow // Go through table rows - for(var i=0; i 10) { - ind1 = line.substring(4, 5); - ind2 = line.substring(5, 6); - content = line.substring(7); - content = content.replace(/\$([a-z])(?: |$)/g, record.subfield_delimiter+"$1"); + tag = line.substr(0, 3); + if(tag[0] != "0" || tag[1] != "0") { + ind = line.substr(4, 2); + content = line.substr(7).replace(/\$([a-z])(?: |$)/g, marc.subfieldDelimiter+"$1"); } else { - ind1 = ""; - ind2 = ""; - content = line.substring(4); + if(tag == "000") { + tag = undefined; + record.leader = "00000"+line.substr(4); + } else { + content = line.substr(4); + } } } @@ -2008,15 +2027,18 @@ REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006 if (prefix == ''x'') return namespace; else return null; } : null; - var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p/text()[1]'', nsResolver); - for(var i=0; i"); - texts = texts[1].split(""); - text = unescapeHTML(texts[0]); + Scholar.Utilities.loadDocument(newUri+''?marks=''+recNumbers.join(",")+''&shadow=NO&format=FLAT+ASCII&sort=TITLE&vopt_elst=ALL&library=ALL&display_rule=ASCENDING&duedate_code=l&holdcount_code=t&DOWNLOAD_x=22&DOWNLOAD_y=12&address=&form_type='', function(doc) { + var pre = doc.getElementsByTagName("pre"); + var text = pre[0].textContent; + var documents = text.split("*** DOCUMENT BOUNDARY ***"); for(var j=1; j 10) { - ind1 = line.substr(6, 1); - ind2 = line.substr(7, 1); + if(tag[0] != "0" || tag[1] != "0") { + ind = line.substr(6, 2); content = line.substr(8); } else { - ind1 = ""; - ind2 = ""; - content = line.substring(6); + content = line.substr(7); + if(tag == "000") { + tag = undefined; + record.leader = "00000"+content; + Scholar.Utilities.debug("the leader is: "+record.leader); + } } } @@ -2170,7 +2191,9 @@ REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006 } } - var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); + var translator = Scholar.loadTranslator("import"); + translator.setTranslator("a6ee60df-1ddc-4aae-bb25-45e0537be973"); + var marc = translator.getTranslatorObject(); Scholar.Utilities.processDocuments(newUris, function(newDoc) { var uri = newDoc.location.href; @@ -2180,48 +2203,40 @@ REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006 if (prefix == ''x'') return namespace; else return null; } : null; - var record = new marc.MARC_Record(); + var record = new marc.record(); - var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, ''/html/body/table/tbody/tr[td[4]]'', nsResolver); - var tag, ind1, ind2, content; + var elmts = newDoc.evaluate(''/html/body/table/tbody/tr[td[4]]'', newDoc, nsResolver, + XPathResult.ANY_TYPE, null); + var tag, ind, content, elmt; - for(var i=0; i/ + var eventValidationMatch = // + + Scholar.Utilities.HTTP.doPost(url, saveString, function() { // mark records + Scholar.Utilities.HTTP.doPost(url, folderString, function(text) { + var postLocation = /
0) return true; - return false; -} - -MARC_Record.prototype.MARC_field = function(rec,tag,ind1,ind2,value) { // new MARC field - this.tag = tag; - this.occ = rec.count_occ(tag)+1; // occurrence order no. - this.ind1 = ind1; if (this.ind1 == '''') this.ind1 = '' ''; - this.ind2 = ind2; if (this.ind2 == '''') this.ind2 = '' ''; - if (tag.substr(0,2) == ''00'') { - this.ind1 = ''''; this.ind2 = ''''; - } - this.value = value; - return this; -} - -MARC_Record.prototype.display = function(type) { // displays record in format type - type = type.toLowerCase(); - if (type == ''binary'') return this.show_leader() + - this.directory + - this.field_terminator + - this.show_fields() + - this.record_terminator; - if (type == ''xml'') { - s = ''''; - s += ''''; - s += ''''+this.show_leader()+''''; - // var i; - for (i=0; i''+this.variable_fields[i].value+''''; - else { - var subfields = this.variable_fields[i].value.split(this.subfield_delimiter); - // alert(this.variable_fields[i].value+'' ''+subfields.length); // test - if (subfields.length == 1) subfields[1] = ''?''+this.variable_fields[i].value; - var sf = ''''; - for (var j=1; j''+subfields[j].substr(1)+''''; - } - s += '''' + sf + ''''; - } - } - s += ''''; - return s; - } - return false; -} - -MARC_Record.prototype.get_field = function(tag) { // returns an array of values, one for each occurrence - var v = new Array(); var i; - for (i=0; i 3) { - return false; - } - - var F = new this.MARC_field(this,tag,ind1,ind2,value); - // adds pointer to list of fields - this.variable_fields[this.variable_fields.length] = F; - // adds the entry to the directory - this.directory += F.tag+this._zero_fill(F.ind1.length+F.ind2.length+F.value.length+1,4)+''00000''; - // sorts the directory - this.sort_directory(); - // updates lengths - this.update_base_address_of_data(); - this.update_displacements(); - this.update_record_length(); - return F; -} - -MARC_Record.prototype.delete_field = function(tag,occurrence) { - // lookup and delete the occurrence from array variable_fields - var i; - for (i=0; i= this.directory.length) alert(''Internal error!''); - this.directory = this.directory.substr(0,i) + this.directory.substr(i+12); - // updates lengths - this.update_base_address_of_data(); - this.update_displacements(); - this.update_record_length(); - return true; -} - -MARC_Record.prototype._clean = function(value) { +// general purpose cleaning +function clean(value) { value = value.replace(/^[\s\.\,\/\:]+/, ''''); value = value.replace(/[\s\.\,\/\:]+$/, ''''); value = value.replace(/ +/g, '' ''); - var char1 = value[1]; + var char1 = value[0]; var char2 = value[value.length-1]; if((char1 == "[" && char2 == "]") || (char1 == "(" && char2 == ")")) { // chop of extraneous characters @@ -5145,14 +5040,147 @@ MARC_Record.prototype._clean = function(value) { return value; } -MARC_Record.prototype._associateDBField = function(item, fieldNo, part, fieldName, execMe, arg1, arg2) { +// number extraction +function pullNumber(text) { + var pullRe = /[0-9]+/; + var m = pullRe.exec(text); + if(m) { + return m[0]; + } +} + +// ISBN extraction +function pullISBN(text) { + var pullRe = /[0-9X\-]+/; + var m = pullRe.exec(text); + if(m) { + return m[0]; + } +} + +// corporate author extraction +function corpAuthor(author) { + return {lastName:author}; +} + +// regular author extraction +function author(author, type, useComma) { + return Scholar.Utilities.cleanAuthor(author, type, useComma); +} + +/* + * END CLEANING FUNCTIONS + */ + +var record = function() { + this.directory = new Object(); + this.leader = ""; + this.content = ""; - if(!part) { - part = ''a''; + // defaults + this.indicatorLength = 2; + this.subfieldCodeLength = 2; +} + +// import a binary MARC record into this record +record.prototype.importBinary = function(record) { + // get directory and leader + var directory = record.substr(0, record.indexOf(fieldTerminator)); + this.leader = directory.substr(0, 24); + var directory = directory.substr(24); + + // get various data + this.indicatorLength = parseInt(this.leader[10], 10); + this.subfieldCodeLength = parseInt(this.leader[11], 10); + var baseAddress = parseInt(this.leader.substr(12, 5), 10); + + // get record data + this.content = record.substr(baseAddress); + + // read directory + for(var i=0; i this.indicatorLength) { + indicator = indicator.substr(0, this.indicatorLength); + } else if(indicator.length != this.indicatorLength) { + indicator = Scholar.Utilities.lpad(indicator, " ", this.indicatorLength); } - var field = this.get_field_subfields(fieldNo); - Scholar.Utilities.debug(''Found ''+field.length+'' matches for ''+fieldNo+part); + // add terminator + value = indicator+value+fieldTerminator; + + // add field to directory + if(!this.directory[field]) { + this.directory[field] = new Array(); + } + this.directory[field].push([this.content.length, value.length]); + + // add field to record + this.content += value; +} + +// get all fields with a certain field number +record.prototype.getField = function(field) { + var fields = new Array(); + + // make sure fields exist + if(!this.directory[field]) { + return fields; + } + + // get fields + for(var i in this.directory[field]) { + var location = this.directory[field][i]; + + // add to array + fields.push([this.content.substr(location[0], this.indicatorLength), + this.content.substr(location[0]+this.indicatorLength, + location[1]-this.indicatorLength-1)]); + } + + return fields; +} + +// get subfields from a field +record.prototype.getFieldSubfields = function(tag) { // returns a two-dimensional array of values + var fields = this.getField(tag); + var returnFields = new Array(); + + for(var i in fields) { + returnFields[i] = new Object(); + + var subfields = fields[i][1].split(subfieldDelimiter); + if (subfields.length == 1) { + returnFields[i]["?"] = fields[i][1]; + } else { + for(var j in subfields) { + if(subfields[j]) { + returnFields[i][subfields[j].substr(0, this.subfieldCodeLength-1)] = subfields[j].substr(this.subfieldCodeLength-1); + } + } + } + } + + return returnFields; +} + +// add field to DB +record.prototype._associateDBField = function(item, fieldNo, part, fieldName, execMe, arg1, arg2) { + var field = this.getFieldSubfields(fieldNo); + Scholar.Utilities.debug(''found ''+field.length+'' matches for ''+fieldNo+part); if(field) { for(var i in field) { var value = false; @@ -5166,9 +5194,8 @@ MARC_Record.prototype._associateDBField = function(item, fieldNo, part, fieldNam } } } - if(value) { - this._gotField = true; - value = this._clean(value); + if(value) { + value = clean(value); if(execMe) { value = execMe(value, arg1, arg2); @@ -5184,57 +5211,54 @@ MARC_Record.prototype._associateDBField = function(item, fieldNo, part, fieldNam } } -MARC_Record.prototype._associateTags = function(item, fieldNo, part) { - var field = this.get_field_subfields(fieldNo); +// add field to DB as tags +record.prototype._associateTags = function(item, fieldNo, part) { + var field = this.getFieldSubfields(fieldNo); for(var i in field) { for(var j=0; j