closes #78, figure out import/export architecture
closes #100, migrate ingester to Scholar.Translate; closes #88, migrate scrapers away from RDF; closes #9, pull out LC subject heading tags; references #87, add fromArray() and toArray() methods to item objects. API changes: all translation (import/export/web) now goes through Scholar.Translate; all Scholar-specific functions in scrapers start with "Scholar." rather than the jumbled up piggy bank un-namespaced confusion; scrapers no longer specify items through RDF (the beginning of an item.fromArray()-like function exists in Scholar.Translate.prototype._itemDone()); scrapers can be any combination of import, export, and web (type is the sum of 1/2/4 respectively); scrapers now contain functions (doImport, doExport, doWeb) rather than loose code; scrapers can call functions in other scrapers or just call the function to translate itself; export accesses items item-by-item, rather than accepting a huge array of items; MARC functions are now in the MARC import translator, and accessed by the web translators. new features: import now works; rudimentary RDF (unqualified dublin core only), RIS, and MARC import translators are implemented (although they are a little picky with respect to file extensions at the moment); items appear as they are scraped; MARC import translator pulls out tags, although this seems to slow things down; no icon appears next to the URL when Scholar hasn't detected metadata, since this seemed somewhat confusing. apologies for the size of this diff. i figured if i was going to re-write the API, i might as well do it all at once and get everything working right.
This commit is contained in:
8 changed files with 4058 additions and 3725 deletions
@ -1,5 +1,6 @@
Scholar_File_Interface = new function() {
this.exportFile = exportFile;
this.importFile = importFile;
* Creates Scholar.Translate instance and shows file picker for file export
@ -23,4 +24,41 @@ Scholar_File_Interface = new function() {
* Creates Scholar.Translate instance and shows file picker for file import
function importFile() {
var translation = new Scholar.Translate("import");
var translators = translation.getTranslators();
const nsIFilePicker = Components.interfaces.nsIFilePicker;
var fp = Components.classes[";1"]
fp.init(window, "Import", nsIFilePicker.modeOpen);
for(var i in translators) {
fp.appendFilter(translators[i].label, "*."+translators[i].target);
var rv =;
if (rv == nsIFilePicker.returnOK || rv == nsIFilePicker.returnReplace) {
// get translators again, bc now we can check against the file
translators = translation.getTranslators();
if(translators.length) {
// TODO: display a list of available translators
translation.setHandler("itemDone", _importItemDone);
* Saves items after they've been imported. We could have a nice little
* "items imported" indicator, too.
function _importItemDone(obj, item) {
@ -25,8 +25,7 @@ Scholar_Ingester_Interface._scrapeProgress = new Array();
Scholar_Ingester_Interface.init = function() {
Scholar_Ingester_Interface.browsers = new Array();
Scholar_Ingester_Interface.browserDocuments = new Object();
Scholar_Ingester_Interface.browserUris = new Array();
Scholar_Ingester_Interface.browserData = new Object();
Scholar_Ingester_Interface._scrapePopupShowing = false;
@ -54,7 +53,7 @@ Scholar_Ingester_Interface.chromeLoad = function() {
* When chrome unloads, delete our document objects and remove our listeners
Scholar_Ingester_Interface.chromeUnload = function() {
delete Scholar_Ingester_Interface.browserDocuments;
delete Scholar_Ingester_Interface.browserData, Scholar_Ingester_Interface.browsers;
@ -62,30 +61,20 @@ Scholar_Ingester_Interface.chromeUnload = function() {
* Scrapes a page (called when the capture icon is clicked)
Scholar_Ingester_Interface.scrapeThisPage = function(saveLocation) {
var documentObject = Scholar_Ingester_Interface._getDocument(Scholar_Ingester_Interface.tabBrowser.selectedBrowser);
if(documentObject.scraper) {
var scrapeProgress = new Scholar_Ingester_Interface.Progress(window);
documentObject.scrapePage(function(obj, returnValue) { Scholar_Ingester_Interface._finishScraping(obj, returnValue, scrapeProgress, saveLocation) });
* Updates the status of the capture icon to reflect the scrapability or lack
* thereof of the current page
Scholar_Ingester_Interface.updateStatus = function() {
var documentObject = Scholar_Ingester_Interface._getDocument(Scholar_Ingester_Interface.tabBrowser.selectedBrowser);
if(documentObject && documentObject.scraper) {
if(documentObject.type == "multiple") {
// Use folder icon for multiple types, for now
Scholar_Ingester_Interface.statusImage.src = "chrome://scholar/skin/treesource-collection.png";
} else {
Scholar_Ingester_Interface.statusImage.src = "chrome://scholar/skin/treeitem-"+documentObject.type+".png";
Scholar_Ingester_Interface.statusImage.hidden = false;
} else {
Scholar_Ingester_Interface.statusImage.hidden = true;
var browser = Scholar_Ingester_Interface.tabBrowser.selectedBrowser;
var data = Scholar_Ingester_Interface._getData(browser);
if(data.translators && data.translators.length) {
var translate = new Scholar.Translate("web");
// use first translator available
translate.setHandler("select", Scholar_Ingester_Interface._selectItems);
translate.setHandler("itemDone", Scholar_Ingester_Interface._itemDone);
translate.setHandler("done", Scholar_Ingester_Interface._finishScraping);
@ -122,8 +111,14 @@ Scholar_Ingester_Interface.contentLoad = function(event) {
// get data object
var data = Scholar_Ingester_Interface._getData(browser);
// get translators
var translate = new Scholar.Translate("web");
data.translators = translate.getTranslators();
// update status
@ -162,13 +157,12 @@ Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject)
var data = Scholar_Ingester_Interface._getData(Scholar_Ingester_Interface.tabBrowser.selectedBrowser);
// Make sure scrape progress is gone
var scrapeProgress;
while(scrapeProgress = Scholar_Ingester_Interface._scrapeProgress.pop()) {
Scholar_Ingester_Interface.hidePopup = function(collectionID) {
@ -219,95 +213,101 @@ Scholar_Ingester_Interface.showPopup = function(collectionID, parentElement) {
* Gets a document object given a browser window object
* Gets a data object given a browser window object
* NOTE: Browser objects are associated with document objects via keys generated
* from the time the browser object is opened. I'm not sure if this is the
* appropriate mechanism for handling this, but it's what PiggyBank used and it
* appears to work.
* Currently, the data object contains only one property: "translators," which
* is an array of translators that should work with the given page as returned
* from Scholar.Translate.getTranslators()
Scholar_Ingester_Interface._getDocument = function(browser) {
Scholar_Ingester_Interface._getData = function(browser) {
try {
var key = browser.getAttribute("scholar-key");
if(Scholar_Ingester_Interface.browserDocuments[key]) {
return Scholar_Ingester_Interface.browserDocuments[key];
if(Scholar_Ingester_Interface.browserData[key]) {
return Scholar_Ingester_Interface.browserData[key];
} finally {}
return false;
* Creates a new document object for a browser window object, attempts to
* retrieve appropriate scraper
Scholar_Ingester_Interface._setDocument = function(browser) {
try {
var key = browser.getAttribute("scholar-key");
} finally {
if(!key) {
var key = (new Date()).getTime();
browser.setAttribute("scholar-key", key);
Scholar_Ingester_Interface.browserData[key] = new Array();
return Scholar_Ingester_Interface.browserData[key];
// Only re-load the scraper if it's a new document
//if(Scholar_Ingester_Interface.browserUris[key] != browser.contentDocument.location.href) {
Scholar_Ingester_Interface.browserUris[key] = browser.contentDocument.location.href;
Scholar_Ingester_Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser, window);
* Deletes the document object associated with a given browser window object
Scholar_Ingester_Interface._deleteDocument = function(browser) {
Scholar_Ingester_Interface._deleteData = function(browser) {
try {
var key = browser.getAttribute("scholar-key");
if(Scholar_Ingester_Interface.browserDocuments[key]) {
delete Scholar_Ingester_Interface.browserDocuments[key];
if(Scholar_Ingester_Interface.browserData[key]) {
delete Scholar_Ingester_Interface.browserData[key];
return true;
} finally {}
return false;
* Updates the status of the capture icon to reflect the scrapability or lack
* thereof of the current page
Scholar_Ingester_Interface._updateStatus = function(data) {
if(data.translators && data.translators.length) {
var itemType = data.translators[0].itemType;
if(itemType == "multiple") {
// Use folder icon for multiple types, for now
Scholar_Ingester_Interface.statusImage.src = "chrome://scholar/skin/treesource-collection.png";
} else {
Scholar_Ingester_Interface.statusImage.src = "chrome://scholar/skin/treeitem-"+itemType+".png";
Scholar_Ingester_Interface.statusImage.hidden = false;
} else {
Scholar_Ingester_Interface.statusImage.hidden = true;
* Callback to be executed when an item has been finished
Scholar_Ingester_Interface._itemDone = function(obj, item) {
var title = item.getField("title");
var icon = "chrome://scholar/skin/treeitem-"+Scholar.ItemTypes.getName(item.getField("itemTypeID"))+".png"
Scholar_Ingester_Interface.Progress.addLines([title], [icon]);
* called when a user is supposed to select items
Scholar_Ingester_Interface._selectItems = function(obj, itemList) {
// this is kinda ugly, mozillazine made me do it! honest!
var io = { dataIn:itemList, dataOut:null }
var newDialog = window.openDialog("chrome://scholar/content/ingester/selectitems.xul",
"_blank","chrome,modal,centerscreen,resizable=yes", io);
if(!io.dataOut) { // user selected no items, so kill the progress indicatior
return io.dataOut;
* Callback to be executed when scraping is complete
Scholar_Ingester_Interface._finishScraping = function(obj, returnValue, scrapeProgress, saveLocation) {
if(obj.items.length) {
// Display title and creators
var labels = new Array();
var icons = new Array();
for(var i in obj.items) {
scrapeProgress.addLines(labels, icons);
// Get collection if the user used the drop-down menu
if(saveLocation) {
var saveCollection = Scholar.Collections.get(saveLocation);
// Save items
for(i in obj.items) {
if(saveLocation) {
setTimeout(function() { scrapeProgress.fade() }, 2500);
} else if(returnValue) {
} else {
setTimeout(function() { scrapeProgress.fade() }, 2500);
Scholar_Ingester_Interface._finishScraping = function(obj, returnValue) {
if(!returnValue) {
@ -317,99 +317,126 @@ Scholar_Ingester_Interface._finishScraping = function(obj, returnValue, scrapePr
// Handles the display of a div showing progress in scraping
Scholar_Ingester_Interface.Progress = function(myWindow) {
this.openerWindow = myWindow;
this.progressWindow = myWindow.openDialog("chrome://scholar/chrome/ingester/progress.xul", "", "chrome,dialog=no,titlebar=no,popup=yes");
var me = this;
this.progressWindow.addEventListener("load", function() { me.windowLoaded() }, false);
Scholar_Ingester_Interface.Progress = new function() {
var _windowLoaded = false;
var _windowLoading = false;
// keep track of all of these things in case they're called before we're
// done loading the progress window
var _loadDescription = null;
var _loadLines = new Array();
var _loadIcons = new Array();
var _loadHeadline = Scholar.getString("ingester.scraping");
this._loadDescription = null;
this._loadLines = new Array();
this._loadIcons = new Array();
this._loadHeadline = Scholar.getString("ingester.scraping");
Scholar_Ingester_Interface.Progress.prototype.windowLoaded = function() {
this._windowLoaded = true;
|||| = show;
this.changeHeadline = changeHeadline;
this.addLines = addLines;
this.addDescription = addDescription;
this.fade = fade;
this.kill = kill;
this.addLines(this._loadLines, this._loadIcons);
if(this._loadDescription) {
function show() {
if(_windowLoading || _windowLoaded) { // already loading or loaded
return false;
_progressWindow = window.openDialog("chrome://scholar/chrome/ingester/progress.xul", "", "chrome,dialog=no,titlebar=no,popup=yes");
_progressWindow.addEventListener("load", _onWindowLoaded, false);
_windowLoading = true;
Scholar_Ingester_Interface.Progress.prototype.changeHeadline = function(headline) {
if(this._windowLoaded) {
this.progressWindow.document.getElementById("scholar-progress-text-headline").value = headline;
} else {
this._loadHeadline = headline;
function changeHeadline(headline) {
if(_windowLoaded) {
_progressWindow.document.getElementById("scholar-progress-text-headline").value = headline;
} else {
_loadHeadline = headline;
Scholar_Ingester_Interface.Progress.prototype.addLines = function(label, icon) {
if(this._windowLoaded) {
for(i in label) {
var newLabel = this.progressWindow.document.createElement("label");
newLabel.setAttribute("class", "scholar-progress-item-label");
newLabel.setAttribute("crop", "end");
newLabel.setAttribute("value", label[i]);
function addLines(label, icon) {
if(_windowLoaded) {
for(i in label) {
var newLabel = _progressWindow.document.createElement("label");
newLabel.setAttribute("class", "scholar-progress-item-label");
newLabel.setAttribute("crop", "end");
newLabel.setAttribute("value", label[i]);
var newImage = _progressWindow.document.createElement("image");
newImage.setAttribute("class", "scholar-progress-item-icon");
newImage.setAttribute("src", icon[i]);
var newHB = _progressWindow.document.createElement("hbox");
newHB.setAttribute("class", "scholar-progress-item-hbox");
newHB.setAttribute("valign", "center");
var newImage = this.progressWindow.document.createElement("image");
newImage.setAttribute("class", "scholar-progress-item-icon");
newImage.setAttribute("src", icon[i]);
var newHB = this.progressWindow.document.createElement("hbox");
} else {
_loadLines = _loadLines.concat(label);
_loadIcons = _loadIcons.concat(icon);
function addDescription(text) {
if(_windowLoaded) {
var newHB = _progressWindow.document.createElement("hbox");
newHB.setAttribute("class", "scholar-progress-item-hbox");
newHB.setAttribute("valign", "center");
var newDescription = _progressWindow.document.createElement("description");
newDescription.setAttribute("class", "scholar-progress-description");
var newText = _progressWindow.document.createTextNode(text);
} else {
_loadDescription = text;
function fade() {
setTimeout(_timeout, 2500);
function kill() {
_windowLoaded = false;
try {
} catch(ex) {}
function _onWindowLoaded() {
_windowLoading = false;
_windowLoaded = true;
// do things we delayed because the window was loading
addLines(_loadLines, _loadIcons);
if(_loadDescription) {
} else {
this._loadLines = this._loadLines.concat(label);
this._loadIcons = this._loadIcons.concat(icon);
// reset parameters
_loadDescription = null;
_loadLines = new Array();
_loadIcons = new Array();
_loadHeadline = Scholar.getString("ingester.scraping")
function _move() {
window.screenX + window.outerWidth - _progressWindow.outerWidth - 30,
window.screenY + window.outerHeight - _progressWindow.outerHeight
function _timeout() {
kill(); // could check to see if we're really supposed to fade yet
// (in case multiple scrapers are operating at once)
Scholar_Ingester_Interface.Progress.prototype.addDescription = function(text) {
if(this._windowLoaded) {
var newHB = this.progressWindow.document.createElement("hbox");
newHB.setAttribute("class", "scholar-progress-item-hbox");
var newDescription = this.progressWindow.document.createElement("description");
newDescription.setAttribute("class", "scholar-progress-description");
var newText = this.progressWindow.document.createTextNode(text);
} else {
this._loadDescription = text;
Scholar_Ingester_Interface.Progress.prototype._move = function() {
this.openerWindow.screenX + this.openerWindow.outerWidth - this.progressWindow.outerWidth - 30,
this.openerWindow.screenY + this.openerWindow.outerHeight - this.progressWindow.outerHeight
Scholar_Ingester_Interface.Progress.prototype.fade = function() {
Scholar_Ingester_Interface.Progress.prototype.kill = function() {
try {
} catch(ex) {}
@ -19,47 +19,6 @@ Scholar.Ingester.deleteHiddenBrowser = function(myBrowser) {
Scholar.debug("deleted hidden browser");
* Operates the ingester given only a URL
* url - URL to scrape
* complete - callback function to be executed if page grab completes
* (will be passed document object; obj.items contains array of
* *unsaved* items scraped; empty array indicates unscrapable page)
* error - callback function to be executed if an error occurred loading page
* myWindow - optional argument indicating window to attach a dialog to. if no
* window is given, Firefox Scholar uses the hidden DOM window and
* will simply avoid scraping multiple pages
Scholar.Ingester.ingestURL = function(url, complete, error, myWindow) {
var isHidden = false;
if(!myWindow) {
var myWindow = Components.classes[";1"]
var isHidden = true;
var succeeded = function(browser) {
var myDoc = new Scholar.Ingester.Document(browser, myWindow, isHidden);
if(myDoc.retrieveTranslator()) {
myDoc.scrapePage(function(myDoc) {
} else {
var failed = function() {
Scholar.debug("Scholar.Ingester.ingestURL: could not ingest "+url);
Scholar.Utilities.HTTP.processDocuments(null, [ url ], succeeded, function() {}, failed, true);
// Scholar.Ingester.ProxyMonitor
@ -101,54 +60,56 @@ Scholar.Ingester.ProxyMonitor = new function() {
function observe(channel) {
if(channel.getResponseHeader("Server") == "EZproxy") {
// We're connected to an EZproxy
if(channel.responseStatus != "302") {
// We should be able to scrape the URL out of this
var m = _ezProxyRe.exec(channel.URI.spec);
if(!m) {
// Found URL
var variable = m[1];
var properURL = m[2];
if(variable.toLowerCase() == "qurl") {
properURL = unescape(properURL);
var properURI = _parseURL(properURL);
if(!properURI) {
// Get the new URL
var newURL = channel.getResponseHeader("Location");
if(!newURL) {
var newURI = _parseURL(newURL);
if(!newURI) {
if( == && channel.URI.port != newURI.port) {
// Different ports but the same server means EZproxy active
Scholar.debug("EZProxy: host "+newURI.hostPort+" is really "+properURI.hostPort);
// Initialize variables here so people who never use EZProxies
// don't get the (very very minor) speed hit
if(!_mapFromProxy) {
_mapFromProxy = new Object();
_mapToProxy = new Object();
try {
if(channel.getResponseHeader("Server") == "EZproxy") {
// We're connected to an EZproxy
if(channel.responseStatus != "302") {
// We should be able to scrape the URL out of this
var m = _ezProxyRe.exec(channel.URI.spec);
if(!m) {
// Found URL
var variable = m[1];
var properURL = m[2];
if(variable.toLowerCase() == "qurl") {
properURL = unescape(properURL);
var properURI = _parseURL(properURL);
if(!properURI) {
// Get the new URL
var newURL = channel.getResponseHeader("Location");
if(!newURL) {
var newURI = _parseURL(newURL);
if(!newURI) {
if( == && channel.URI.port != newURI.port) {
// Different ports but the same server means EZproxy active
Scholar.debug("EZProxy: host "+newURI.hostPort+" is really "+properURI.hostPort);
// Initialize variables here so people who never use EZProxies
// don't get the (very very minor) speed hit
if(!_mapFromProxy) {
_mapFromProxy = new Object();
_mapToProxy = new Object();
_mapFromProxy[newURI.hostPort] = properURI.hostPort;
_mapToProxy[properURI.hostPort] = newURI.hostPort;
_mapFromProxy[newURI.hostPort] = properURI.hostPort;
_mapToProxy[properURI.hostPort] = newURI.hostPort;
} catch(e) {}
@ -195,394 +156,4 @@ Scholar.Ingester.ProxyMonitor = new function() {
var uri = ioService.newURI(url, null, null);
return uri;
// Scholar.Ingester.Model
// Scholar.Ingester.Model, an object representing an RDF data model with
// methods to add to that model. In Piggy Bank, this was implemented in Java,
// but seeing as we don't really want an enormous web server running with FS,
// but we don't actually need that, so it's much simpler.
// The Java version of this class can be viewed at
Scholar.Ingester.Model = function() {
|||| = new Object();
// Piggy Bank provides a fourth argument, one that determines if the third
// argument is a literal or an RDF URI. Since our ontologies are
// sufficiently restricted, we have no chance of confusing a literal and an
// RDF URI and thus this is unnecessary.
Scholar.Ingester.Model.prototype.addStatement = function(uri, rdfUri, literal) {
if(![uri])[uri] = new Object();
if(![uri][rdfUri]) {
||||[uri][rdfUri] = new Array();
Scholar.debug(rdfUri+" for "+uri+" is "+literal);
// Additional functions added for compatibility purposes only
// No idea if any scraper actually uses these, but just in case, they're
// implemented so as not to throw an exception
Scholar.Ingester.Model.prototype.addTag = function() {}
Scholar.Ingester.Model.prototype.getRepository = function() {}
Scholar.Ingester.Model.prototype.detachRepository = function() {}
// Scholar.Ingester.Document
* eventually, all ingesting will be part of a unified API in Scholar.Translate.
* until then, Scholar.Ingester.Document reigns supreme.
* Public properties:
* browser - browser window object of document
* model - data model for semantic scrapers
* scraper - best scraper to use to scrape page
* items - items returned after page is scraped
* window - window, for creating new hidden browsers
* url - url, as passed through proxy system
* type - type of item that will be scraped (set after retrieveScraper() is
* called)
* Private properties:
* _sandbox - sandbox for code execution
* _scrapeCallback - callback function to be executed when scraping is complete
// Public Scholar.Ingester.Document methods
* Constructor for Document object
Scholar.Ingester.Document = function(myBrowser, myWindow, isHidden) {
this.browser = myBrowser;
this.window = myWindow;
this.isHidden = isHidden;
this.scraper = this.type = null;
this.model = new Scholar.Ingester.Model();
// Create separate URL to account for proxies
this.url = Scholar.Ingester.ProxyMonitor.proxyToProper(this.browser.contentDocument.location.href);
if(this.url != this.browser.contentDocument.location.href) {
this.proxiedURL = true;
this.items = new Array();
* Retrieves the best scraper to scrape a given page
Scholar.Ingester.Document.prototype.retrieveScraper = function() {
Scholar.debug("Retrieving scrapers for "+this.url);
var sql = 'SELECT * FROM translators WHERE type = 3 ORDER BY target IS NULL ASC';
var scrapers = Scholar.DB.query(sql);
for(var i=0; i<scrapers.length; i++) {
var currentScraper = scrapers[i];
if(this.canScrape(currentScraper)) {
this.scraper = currentScraper;
Scholar.debug("Found scraper "+this.scraper.label);
return true;
return false;
* Check to see if _scraper_ can scrape this document
Scholar.Ingester.Document.prototype.canScrape = function(currentScraper) {
var canScrape = false;
// Test with regular expression
// If this is slow, we could preload all scrapers and compile regular
// expressions, so each check will be faster
if( {
var regularExpression = new RegExp(, "i");
if(regularExpression.test(this.url)) {
canScrape = true;
// Test with JavaScript if available and didn't have a regular expression or
// passed regular expression test
if((! || canScrape)
&& currentScraper.detectCode) {
Scholar.debug("Checking detectCode");
var scraperSandbox = this._sandbox;
try {
canScrape = Components.utils.evalInSandbox("(function(){\n" +
currentScraper.detectCode +
"\n})()", scraperSandbox);
} catch(e) {
Scholar.debug(e+' in detectCode for '+currentScraper.label);
return false;
// detectCode returns text type
if(canScrape.toString() != "") {
this.type = canScrape;
} else {
this.type = "website";
return canScrape;
* Populate model with semantic data regarding this page using _scraper_
* Callback will be executed once scraping is complete
Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
if(callback) {
this._scrapeCallback = callback;
Scholar.debug("Scraping "+this.url);
var scraperSandbox = this._sandbox;
try {
var returnValue = Components.utils.evalInSandbox("(function(){\n" +
this.scraper.code +
"\n})()", scraperSandbox);
} catch(e) {
Scholar.debug(e+' in code for '+this.scraper.label);
// If synchronous, call _scrapePageComplete();
if(!this._waitForCompletion) {
Scholar.debug("is asynch");
// Private Scholar.Ingester.Document methods
* Piggy Bank/FS offers four objects to JavaScript scrapers
* browser - the object representing the open browser window containing the
* document to be processed
* doc - the DOM (basically just browser.contentDocument)
* model - the object representing the RDF model of data to be returned
* (see Scholar.Ingester.Model)
* utilities - a set of utilities for making certain tasks easier
* (see Scholar.Utilities);
* Piggy Bank/FS also offers two functions to simplify asynchronous requests
* (these will only be available for scraping, and not for scrape detection)
* wait() - called on asynchronous requests so that Piggy Bank/FS will not
* automatically return at the end of code execution
* done() - when wait() is called, Piggy Bank/FS will wait for this
* function before returning
* Called when scraping (synchronous or asynchronous) is complete
Scholar.Ingester.Document.prototype._scrapePageComplete = function(returnValue) {
if(this._scrapeCallback) {
this._scrapeCallback(this, returnValue);
// Get us ready for another scrape
delete this.model;
delete this.items;
this.model = new Scholar.Ingester.Model();
this.items = new Array();
this._waitForCompletion = false;
// This is perhaps a bit paranoid, but we need to get the model redone anyway
* Generates a sandbox for scraping/scraper detection
Scholar.Ingester.Document.prototype._generateSandbox = function() {
this._sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
this._sandbox.browser = this.browser;
this._sandbox.doc = this.browser.contentDocument;
this._sandbox.url = this.url;
this._sandbox.utilities = new Scholar.Utilities.Ingester(this.window, this.proxiedURL, this.isHidden);
this._sandbox.utilities.HTTPUtilities = new Scholar.Utilities.Ingester.HTTPUtilities(this.proxiedURL);
this._sandbox.window = this.window;
this._sandbox.model = this.model;
this._sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;
this._sandbox.MARC_Record = Scholar.Ingester.MARC_Record;
this._sandbox.MARC_Record.prototype = new Scholar.Ingester.MARC_Record();
var me = this;
this._sandbox.wait = function(){ me._waitForCompletion = true; };
this._sandbox.done = function(){ me._scrapePageComplete(); };
Scholar.Ingester.Document.prototype._associateRDF = function(rdfUri, field, uri, item, typeID) {
var fieldID;
if(fieldID = Scholar.ItemFields.getID(field)) {
if([uri][rdfUri] && Scholar.ItemFields.isValidForType(fieldID, typeID)) {
} else {
Scholar.debug("discarded scraper " + field + " data: not valid for item type "+typeID);
} else {
Scholar.debug("discarded scraper " + field + " data: no field in database");
* Add data ingested using RDF to database
* (Ontologies are hard-coded until we have a real way of dealing with them)
Scholar.Ingester.Document.prototype._updateDatabase = function() {
Scholar.debug("doing updating");
var prefixRDF = '';
var prefixDC = '';
var prefixDCMI = '';
var prefixDummy = '';
// Call number fields, in order of preference
var callNumbers = new Array("LCC", "DDC", "UDC", "NLM", "NAL", "CN");
try {
for(var uri in {
// Get typeID, defaulting to "website"
try {
var type =[uri][prefixRDF + 'type'][0].substr(prefixDummy.length);
var typeID = Scholar.ItemTypes.getID(type);
} catch(ex) {
var typeID = Scholar.ItemTypes.getID("website")
var newItem = Scholar.Items.getNewItemByType(typeID);
// Handle source and title
newItem.setField("source", uri);
if([uri][prefixDC + 'title']) {
newItem.setField("title",[uri][prefixDC + 'title'][0]);
// Handle creators and contributors
var creatorIndex = 0;
if([uri][prefixDC + 'creator']) {
for(i in[uri][prefixDC + 'creator']) {
var creator =[uri][prefixDC + 'creator'][i];
var spaceIndex = creator.lastIndexOf(" ");
var lastName = creator.substring(spaceIndex+1, creator.length);
var firstName = creator.substring(0, spaceIndex);
newItem.setCreator(creatorIndex, firstName, lastName, 1);
if([uri][prefixDC + 'contributor']) {
for(i in[uri][prefixDC + 'contributor']) {
var creator =[uri][prefixDC + 'contributor'][i];
var spaceIndex = creator.lastIndexOf(" ");
var lastName = creator.substring(spaceIndex+1, creator.length);
var firstName = creator.substring(0, spaceIndex);
newItem.setCreator(creatorIndex, firstName, lastName, 2);
if([uri][prefixDummy + 'corporateCreator']) {
for(i in[uri][prefixDummy + 'corporateCreator']) {
newItem.setCreator(creatorIndex, null,[uri][prefixDummy + 'corporateCreator'][i], 1);
if([uri][prefixDummy + 'corporateContributor']) {
for(i in[uri][prefixDummy + 'corporateContributor']) {
newItem.setCreator(creatorIndex, null,[uri][prefixDummy + 'corporateContributor'][i], 2);
if([uri][prefixDummy + 'editor']) {
for(i in[uri][prefixDummy + 'editor']) {
newItem.setCreator(creatorIndex, null,[uri][prefixDummy + 'editor'][i], 3);
// Handle years, extracting from date if necessary
if(Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("year"), typeID)) {
if([uri][prefixDC + 'year']) {
newItem.setField("year",[uri][prefixDC + 'year'][0]);
} else if([uri][prefixDC + 'date'] &&[uri][prefixDC + 'date'][0].length >= 4) {
var ISORe = /^[0-9]{4}-[0-9]{2}-[0-9]{2}$/
if(ISORe.test([uri][prefixDC + 'date'][0])) {
newItem.setField("year",[uri][prefixDC + 'date'][0].substr(0, 4));
} else {
var m;
var yearRe = /[0-9]{4}$/;
if(m = yearRe.exec([uri][prefixDC + 'date'][0])) {
newItem.setField("year", m[0]);
// Handle ISBNs/ISSNs/Call Numbers
if([uri][prefixDC + 'identifier']) {
var oldIndex = -1;
var needISSN = Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("ISSN"), typeID);
var needISBN = Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("ISBN"), typeID);
for(i in[uri][prefixDC + 'identifier']) {
prefix =[uri][prefixDC + 'identifier'][i].substr(0,[uri][prefixDC + 'identifier'][i].indexOf(" "));
if(needISSN && prefix == 'ISSN') {
newItem.setField("ISSN",[uri][prefixDC + 'identifier'][i].substring(5));
needISSN = false;
if(needISBN && prefix == 'ISBN') {
newItem.setField("ISBN",[uri][prefixDC + 'identifier'][i].substring(5));
needISBN = false;
var newIndex = Scholar.arraySearch(prefix, callNumbers);
if(newIndex && newIndex > oldIndex) {
oldIndex = newIndex;
var callNumber =[uri][prefixDC + 'identifier'][i].substring(prefix.length+1);
if(callNumber) {
newItem.setField("callNumber", callNumber);
this._associateRDF(prefixDummy + 'publication', "publication", uri, newItem, typeID);
this._associateRDF(prefixDummy + 'volume', "volume", uri, newItem, typeID);
this._associateRDF(prefixDummy + 'number', "number", uri, newItem, typeID);
this._associateRDF(prefixDummy + 'pages', "pages", uri, newItem, typeID);
this._associateRDF(prefixDC + 'publisher', "publisher", uri, newItem, typeID);
this._associateRDF(prefixDC + 'date', "date", uri, newItem, typeID);
this._associateRDF(prefixDC + 'hasVersion', "edition", uri, newItem, typeID);
this._associateRDF(prefixDummy + 'series', "series", uri, newItem, typeID);
this._associateRDF(prefixDummy + 'place', "place", uri, newItem, typeID);
} catch(ex) {
Scholar.debug('Error in Scholar.Ingester.Document._updateDatabase: '+ex);
@ -1,532 +0,0 @@
* Scholar.Ingester.MARC_Record.js
* Stefano Bargioni, Pontificia Università della Santa Croce - Biblioteca
* Trattamento di record MARC in JavaScript
* Original version copyright (C) 2005 Stefano Bargioni, licensed under the LGPL
* (Available at
* This library is free software; you can redistribute it or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
Scholar.Ingester.MARC_Record = function() { // new MARC record
this.VERSIONE = '2.6.6b';
this.VERSIONE_data ='2005-05-10';
this.leader = {
record_status:'n', // acdnp
type_of_record:' ',
bibliographic_level:' ',
type_of_control:' ',
character_coding_scheme:' ',
encoding_level:' ',
descriptive_cataloging_form:' ',
linked_record_requirement:' ',
}; // 24 chars
this.field_terminator = '\x1E';
this.record_terminator = '\x1D';
this.subfield_delimiter = '\x1F';
|||| = '';
this.directory_terminator = this.field_terminator;
this.variable_fields = new Array();
return this;
Scholar.Ingester.MARC_Record.prototype.load = function(s,f) { // loads record s passed in format f
if (f == 'binary') {
this.leader.record_length = '00000';
this.leader.record_status = s.substr(5,1);
this.leader.type_of_record = s.substr(6,1);
this.leader.bibliographic_level = s.substr(7,1);
this.leader.type_of_control = s.substr(8,1);
this.leader.character_coding_scheme = s.substr(9,1);
this.leader.indicator_count = '2';
this.leader.subfield_code_length = '2';
this.leader.base_address_of_data = '00000';
this.leader.encoding_level = s.substr(17,1);
this.leader.descriptive_cataloging_form = s.substr(18,1);
this.leader.linked_record_requirement = s.substr(19,1);
this.leader.entry_map = '4500';
|||| = '';
this.directory_terminator = this.field_terminator;
this.variable_fields = new Array();
// loads fields
var campi = s.split(this.field_terminator);
var k;
for (k=1; k<-1+campi.length; k++) { // the first and the last are unuseful
// the first is the header + directory, the last is the this.record_terminator
var tag = campi[0].substr(24+(k-1)*12,3);
var ind1 = ''; var ind2 = ''; var value = campi[k];
if (tag.substr(0,2) != '00') {
ind1 = campi[k].substr(0,1);
ind2 = campi[k].substr(1,1);
value = campi[k].substr(2);
} else if (f == 'MARC_Harvard') {
var linee = s.split('\n');
for (var i=0; i<linee.length; i++) {
linee[i] = this._trim(linee[i]);
if (linee[i] == '') continue; // jumps empty lines
// linee[i] = linee[i].replace(/\t/g,' ');
linee[i] = linee[i].replace(/ \t/g,'\t');
linee[i] = linee[i].replace(/\xA0/g,' '); // in some browsers, nbsp is copied as xA0
var tranche = linee[i].split('|a ');
var tag, ind1, ind2, value;
if (tranche.length == 1) {
tag = linee[i].substr(0,3);
value = linee[i].substr(4);
else {
tag = tranche[0].substr(0,3);
ind1 = tranche[0].substr(3,1);
ind2 = tranche[0].substr(4,1);
value = tranche[1];
value = this._trim(value);
var replacer = this.subfield_delimiter+'$1';
value = value.replace(/\|(.) /g,replacer);
if (tag == 'LDR') {
this.leader.record_length = '00000';
this.leader.record_status = value.substr(5,1);
this.leader.type_of_record = value.substr(6,1);
this.leader.bibliographic_level = value.substr(7,1);
this.leader.type_of_control = value.substr(8,1);
this.leader.character_coding_scheme = value.substr(9,1);
this.leader.indicator_count = '2';
this.leader.subfield_code_length = '2';
this.leader.base_address_of_data = '00000';
this.leader.encoding_level = value.substr(17,1);
this.leader.descriptive_cataloging_form = value.substr(18,1);
this.leader.linked_record_requirement = value.substr(19,1);
this.leader.entry_map = '4500';
|||| = '';
this.directory_terminator = this.field_terminator;
this.variable_fields = new Array();
else if (tag > '008' && tag < '899') { // jumps low and high tags, also H03 and similia
if (tag != '040') this.add_field(tag,ind1,ind2,value);
} else if (f == 'MARC_BNI') {
var linee = s.split('\n');
for (var i=0; i<linee.length; i++) {
linee[i] = this._trim(linee[i]);
if (linee[i] == '') continue; // jumps empty lines
linee[i] = linee[i].replace(/\xA0/g,' '); // in some browsers, nbsp is copied as xA0
linee[i] = linee[i].replace(/\|/g,' ');
linee[i] = linee[i].replace(/_/g,' ');
linee[i] = linee[i].replace(/\$/g,this.subfield_delimiter);
var tranche = linee[i].split('\t');
var tag = tranche[0];
var ind1 = tranche[1].substr(0,1);
var ind2 = tranche[1].substr(1,1);
var value = this._trim(tranche[2]);
if (tag == 'LEA') {
this.leader.record_length = '00000';
this.leader.record_status = value.substr(5,1);
this.leader.type_of_record = value.substr(6,1);
this.leader.bibliographic_level = value.substr(7,1);
this.leader.type_of_control = value.substr(8,1);
this.leader.character_coding_scheme = value.substr(9,1);
this.leader.indicator_count = '2';
this.leader.subfield_code_length = '2';
this.leader.base_address_of_data = '00000';
this.leader.encoding_level = value.substr(17,1);
this.leader.descriptive_cataloging_form = value.substr(18,1);
this.leader.linked_record_requirement = value.substr(19,1);
this.leader.entry_map = '4500';
|||| = '';
this.directory_terminator = this.field_terminator;
this.variable_fields = new Array();
else if (tag > '008' && tag < '899') { // jumps low and high tags
if (tag != '040') this.add_field(tag,ind1,ind2,value);
} else if (f == 'MARC_Loc') { // MARC copiato dal browser dal sito
var linee = s.split('\n');
for (var i=0; i<linee.length; i++) {
linee[i] = this._trim(linee[i]);
if (linee[i] == '') continue; // jumps empty lines
linee[i] = linee[i].replace(/\xA0/g,' '); // in some browsers, nbsp is copied as xA0
linee[i] = linee[i].replace(/_/g,' ');
linee[i] = linee[i].replace(/\t/g,'');
var replacer = this.subfield_delimiter+'$1';
linee[i] = linee[i].replace(/\|(.) /g,replacer);
linee[i] = linee[i].replace(/\|/g,this.subfield_delimiter);
var tag = linee[i].substr(0,3);
var ind1 = linee[i].substr(4,1);
var ind2 = linee[i].substr(5,1);
var value = linee[i].substr(7);
if (tag == '000') {
linee[i] = linee[i].replace(/ /,' ');
value = linee[i].substr(4);
this.leader.record_length = '00000';
this.leader.record_status = value.substr(5,1);
this.leader.type_of_record = value.substr(6,1);
this.leader.bibliographic_level = value.substr(7,1);
this.leader.type_of_control = value.substr(8,1);
this.leader.character_coding_scheme = value.substr(9,1);
this.leader.indicator_count = '2';
this.leader.subfield_code_length = '2';
this.leader.base_address_of_data = '00000';
this.leader.encoding_level = value.substr(17,1);
this.leader.descriptive_cataloging_form = value.substr(18,1);
this.leader.linked_record_requirement = value.substr(19,1);
this.leader.entry_map = '4500';
|||| = '';
this.directory_terminator = this.field_terminator;
this.variable_fields = new Array();
else if (tag > '008' && tag < '899') { // jumps low and high tags
if (tag != '040') this.add_field(tag,ind1,ind2,value);
} else if (f == 'MARC_PAC') {
var linee = s.split('\n');
for (var i=0; i<linee.length; i++) {
linee[i] = linee[i].replace(/\xA0/g,' '); // in some browsers, nbsp is copied as xA0
linee[i] = linee[i].replace(/_/g,' ');
linee[i] = linee[i].replace(/\t/g,'');
linee[i] = this._trim(linee[i]);
if (linee[i] == '') continue; // jumps empty lines
var replacer = this.subfield_delimiter+'$1';
linee[i] = linee[i].replace(/\|(.)/g,replacer);
linee[i] = linee[i].replace(/\|/g,this.subfield_delimiter);
var tag = linee[i].substr(0,3);
var ind1 = linee[i].substr(4,1);
var ind2 = linee[i].substr(5,1);
var value = this.subfield_delimiter+'a'+linee[i].substr(7);
if(linee[i].substr(0, 6) == "LEADER") {
value = linee[i].substr(7);
this.leader.record_length = '00000';
this.leader.record_status = value.substr(5,1);
this.leader.type_of_record = value.substr(6,1);
this.leader.bibliographic_level = value.substr(7,1);
this.leader.type_of_control = value.substr(8,1);
this.leader.character_coding_scheme = value.substr(9,1);
this.leader.indicator_count = '2';
this.leader.subfield_code_length = '2';
this.leader.base_address_of_data = '00000';
this.leader.encoding_level = value.substr(17,1);
this.leader.descriptive_cataloging_form = value.substr(18,1);
this.leader.linked_record_requirement = value.substr(19,1);
this.leader.entry_map = '4500';
|||| = '';
this.directory_terminator = this.field_terminator;
this.variable_fields = new Array();
else if (tag > '008' && tag < '899') { // jumps low and high tags
if (tag != '040') this.add_field(tag,ind1,ind2,value);
return this;
Scholar.Ingester.MARC_Record.prototype.update_base_address_of_data = function() { // updates the base_address
this.leader.base_address_of_data = this._zero_fill(24+this.variable_fields.length*12+1,5);
return this.leader.base_address_of_data;
Scholar.Ingester.MARC_Record.prototype.update_displacements = function() { // rebuilds the directory
var displ = 0;
|||| = '';
for (var i=0; i<this.variable_fields.length; i++) {
var len = this.variable_fields[i].value.length + 1 +
this.variable_fields[i].ind1.length +
|||| += this.variable_fields[i].tag +
this._zero_fill(len,4) + this._zero_fill(displ,5);
displ += len;
return true;
Scholar.Ingester.MARC_Record.prototype.update_record_length = function() { // updates total record length
var fields_total_length = 0; var f;
for (f=0; f<this.variable_fields.length;f++) {
fields_total_length += this.variable_fields[f].ind1.length+this.variable_fields[f].ind2.length+this.variable_fields[f].value.length + 1;
var rl =;
this.leader.record_length = this._zero_fill(rl,5);
Scholar.Ingester.MARC_Record.prototype.sort_directory = function() { // sorts directory and array variable_fields by tag and occ
// ordinamento della directory
if ( <= 12) { return true; } // already sorted
var directory_entries = new Array();
var i;
for (i=0; i<; i=i+12) {
directory_entries[directory_entries.length] =,12);
|||| = directory_entries.join('');
// sorts array variable_fields
this.variable_fields.sort(function(a,b) { return a.tag - b.tag + a.occ - b.occ; });
return true;
Scholar.Ingester.MARC_Record.prototype.show_leader = function() {
var leader = ''; var f;
for (f in this.leader) { leader += this.leader[f]; }
return leader;
Scholar.Ingester.MARC_Record.prototype.show_fields = function() {
var fields = ''; var f;
for (f=0; f<this.variable_fields.length;f++) {
fields += this.variable_fields[f].ind1 +
this.variable_fields[f].ind2 +
this.variable_fields[f].value +
return fields;
Scholar.Ingester.MARC_Record.prototype.show_directory = function() {
var d = '';
for (var i = 0; i<; i+=12) {
d +=,3) + ' ' +
||||,4) + ' ' +
||||,5) + '\n';
return d;
Scholar.Ingester.MARC_Record.prototype.add_field_005 = function() {
var now = new Date();
now = now.getFullYear() +
this._zero_fill(now.getMonth()+1,2) +
this._zero_fill(now.getDate(),2) +
this._zero_fill(now.getHours(),2) +
this._zero_fill(now.getMinutes(),2) +
this._zero_fill(now.getSeconds(),2) + '.0';
return now;
// Counts how many entries in this.variable_fields carry the given tag.
// The comparison uses loose equality (==) against each entry's .tag.
// Returns the number of matching fields (0 when there are none).
Scholar.Ingester.MARC_Record.prototype.count_occ = function(tag) { // counts occurrences of tag
var n = 0;
for (var i=0; i<this.variable_fields.length; i++) {
if (this.variable_fields[i].tag == tag) { n++; }
return n;
// Reports whether the record holds at least one field with the given tag.
// Delegates to count_occ(tag); returns true when the count is positive, false otherwise.
Scholar.Ingester.MARC_Record.prototype.exists = function(tag) { // field existence
if (this.count_occ(tag) > 0) return true;
return false;
// Constructor for a single MARC field object (tag, occ, ind1, ind2, value).
//   rec   - the record the field is being built for; only rec.count_occ(tag) is used
//   tag   - field tag string (tag.substr is called on it)
//   ind1  - first indicator; an empty string is stored as a single space
//   ind2  - second indicator; an empty string is stored as a single space
//   value - field content, stored unchanged
// Returns the populated object (this).
Scholar.Ingester.MARC_Record.prototype.MARC_field = function(rec,tag,ind1,ind2,value) { // new MARC field
this.tag = tag;
// 1-based occurrence number: fields with this tag already in rec, plus this one
this.occ = rec.count_occ(tag)+1; // occurrence order no.
this.ind1 = ind1; if (this.ind1 == '') this.ind1 = ' ';
this.ind2 = ind2; if (this.ind2 == '') this.ind2 = ' ';
// tags beginning with '00' get empty indicators, overriding the space defaults above
if (tag.substr(0,2) == '00') {
this.ind1 = ''; this.ind2 = '';
this.value = value;
return this;
Scholar.Ingester.MARC_Record.prototype.display = function(type) { // displays record in format type
type = type.toLowerCase();
if (type == 'binary') return this.show_leader() +
|||| +
this.field_terminator +
this.show_fields() +
if (type == 'html') {
var s = '<table class="record_table">';
var l = R.show_leader();
s += '<tr><td class="tag">000</td><td class="ind"></td><td class="ind"></td><td class="record_value">'+l+'</td></tr>';
var i;
for (i=0; i<this.variable_fields.length; i++) {
var ind1 = this.variable_fields[i].ind1; if (ind1 == ' ') { ind1 = ' '; }
var ind2 = this.variable_fields[i].ind2; if (ind2 == ' ') { ind2 = ' '; }
s += '<tr>';
s += '<td class="tag">'+this.variable_fields[i].tag+'</td>';
s += '<td class="ind">'+ind1+'</td>';
s += '<td class="ind">'+ind2+'</td>';
var v = this.variable_fields[i].value;
if (this.variable_fields[i].tag == '008') v = v.replace(/ /g,' ');
s += '<td class="record_value">'+this._ddagger(v)+'</td>';
s += '</tr>';
s += '</table>';
return s;
if (type == 'xml') {
s = '';
s += '<?xml version="1.0" encoding="iso-8859-1"?><collection xmlns=""><record>';
s += '<leader>'+this.show_leader()+'</leader>';
// var i;
for (i=0; i<this.variable_fields.length; i++) {
ind1 = this.variable_fields[i].ind1; if (ind1 != '') ind1 = ' ind1="'+ind1+'"';
ind2 = this.variable_fields[i].ind2; if (ind2 != '') ind2 = ' ind2="'+ind2+'"';
if (this.variable_fields[i].tag.substr(0,2) == '00') s += '<controlfield tag="'+this.variable_fields[i].tag+'">'+this.variable_fields[i].value+'</controlfield>';
else {
var subfields = this.variable_fields[i].value.split(this.subfield_delimiter);
// alert(this.variable_fields[i].value+' '+subfields.length); // test
if (subfields.length == 1) subfields[1] = '?'+this.variable_fields[i].value;
var sf = '';
for (var j=1; j<subfields.length; j++) {
sf += '<subfield code="'+subfields[j].substr(0,1)+'">'+subfields[j].substr(1)+'</subfield>';
s += '<datafield tag="' + this.variable_fields[i].tag + '"' + ind1 + ind2 + '>' + sf + '</datafield>';
s += '</record></collection>';
return s;
if (type == 'xml-html') {
s = this.display('xml');
// abbellimenti
s = s.replace(/\<leader\>/,'\n <leader>');
s = s.replace(/\<controlfield/g,'\n <controlfield');
s = s.replace(/\<datafield/g,'\n <datafield');
s = s.replace(/\<collection/g,'\n<collection');
s = s.replace(/\<record/g,'\n<record');
s = s.replace(/\<\/datafield/g,'\n </datafield');
s = s.replace(/\<\/collection/g,'\n</collection');
s = s.replace(/\<\/record/g,'\n</record');
s = s.replace(/\<subfield/g,'\n <subfield');
s = s.replace(/\x1F/g,'%1F'); s = this._ddagger(s);
// escape chars < e >
s = s.replace(/\</g,'<');
s = s.replace(/\>/g,'>');
// colore alle keyword
s = s.replace(/(controlfield|datafield|collection|record|leader|subfield)/g,'<span class="cdfield">$1</span>');
s = s.replace(/(tag|code|ind1|ind2)=/g,'<span class="attrib">$1=</span>');
return s;
return false;
Scholar.Ingester.MARC_Record.prototype.get_field = function(tag) { // returns an array of values, one for each occurrence
var v = new Array(); var i;
for (i=0; i<this.variable_fields.length; i++) {
if (this.variable_fields[i].tag == tag) {
v[v.length] = this.variable_fields[i].ind1 +
this.variable_fields[i].ind2 +
return v;
// This function added by Simon Kornblith
Scholar.Ingester.MARC_Record.prototype.get_field_subfields = function(tag) { // returns a two-dimensional array of values
var field = this.get_field(tag);
var return_me = new Array();
for(var i in field) {
return_me[i] = new Object();
var subfields = field[i].split(this.subfield_delimiter);
if (subfields.length == 1) {
return_me[i]['?'] = field[i];
} else {
for (var j=1; j<subfields.length; j++) {
return_me[i][subfields[j].substr(0,1)] = subfields[j].substr(1);
return return_me;
Scholar.Ingester.MARC_Record.prototype.add_field = function(tag,ind1,ind2,value) { // adds a field to the record
if (tag.length != 3) { return false; }
var F = new this.MARC_field(this,tag,ind1,ind2,value);
// adds pointer to list of fields
this.variable_fields[this.variable_fields.length] = F;
// adds the entry to the directory
|||| += F.tag+this._zero_fill(F.ind1.length+F.ind2.length+F.value.length+1,4)+'00000';
// sorts the directory
// updates lengths
return F;
Scholar.Ingester.MARC_Record.prototype.delete_field = function(tag,occurrence) {
// lookup and delete the occurrence from array variable_fields
var i;
for (i=0; i<this.variable_fields.length; i++) {
if (this.variable_fields[i].tag == tag && this.variable_fields[i].occ == occurrence) break;
if (i==this.variable_fields.length) return false; // campo non trovato
// deletes the occ. i from array variable_fields scaling next values
var j;
for (j=i+1; j<this.variable_fields.length; j++) {
this.variable_fields.length--; // deletes last element
// lookup and delete the occurrence from directory (must exist; no sort is needed)
var nocc = 0;
// var i;
for (i=0; i<;i=i+12) {
if (,3) == tag) nocc++;
if (occurrence == nocc) { // occ found
if (i >= alert('Internal error!');
|||| =,i) +;
// updates lengths
return true;
Scholar.Ingester.MARC_Record.prototype._ddagger = function(s) { // display doubledagger in html code
s = s.replace(/\%1F(.)/g, "<span class=\"this._ddagger\">‡$1</span>");
s = s.replace(/\x1F(.)/g, "<span class=\"this._ddagger\">‡$1</span>");
return s;
// Strips whitespace (regex class \s) from both ends of s and returns the result.
// s must be a string, since .replace is called on it directly.
Scholar.Ingester.MARC_Record.prototype._trim = function(s) { // eliminates blanks from both sides
// trailing whitespace first ...
s = s.replace(/\s+$/,'');
// ... then leading whitespace
return s.replace(/^\s+/,'');
// Left-pads s with '0' characters and returns the rightmost l characters as a string.
//   s - value to pad; it is string-concatenated, so numbers are accepted
//   l - desired width
// Only 15 zeros are prepended, hence the l<=15 limit noted on the signature.
// If s is longer than l, only its last l characters are returned.
Scholar.Ingester.MARC_Record.prototype._zero_fill = function(s,l) { // left '0' padding of s, up to l (l<=15)
var t = '000000000000000';
t = t+s;
// keep the last l characters of the zero-prefixed string
return t.substr(t.length-l,l);
// Builds a human-readable version string from this.VERSIONE and this.VERSIONE_data,
// in the form 'MARC Editor Lite <version> (<date>)'.
Scholar.Ingester.MARC_Record.prototype.version = function() { // returns version and date
return 'MARC Editor Lite '+this.VERSIONE+' ('+this.VERSIONE_data+')';
File diff suppressed because it is too large
Load diff
@ -82,19 +82,29 @@ Scholar.Utilities.prototype.dateToISO = function(jsDate) {
* Cleans extraneous punctuation off an author name
Scholar.Utilities.prototype.cleanAuthor = function(author) {
Scholar.Utilities.prototype.cleanAuthor = function(author, type, useComma) {
author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
author = author.replace(/[\s\,\/\[\]\:\.]+$/, '');
author = author.replace(/ +/, ' ');
// Add period for initials
if(author.substring(author.length-2, author.length-1) == " ") {
author += ".";
if(useComma) {
// Add period for initials
if(author.substr(author.length-2, 1) == " ") {
author += ".";
var splitNames = author.split(', ');
if(splitNames.length > 1) {
var lastName = splitNames[0];
var firstName = splitNames[1];
} else {
var lastName = author;
} else {
var spaceIndex = author.lastIndexOf(" ");
var lastName = author.substring(spaceIndex+1);
var firstName = author.substring(0, spaceIndex);
var splitNames = author.split(', ');
if(splitNames.length > 1) {
author = splitNames[1]+' '+splitNames[0];
return author;
// TODO: take type into account
return {firstName:firstName, lastName:lastName, creatorType:type};
@ -141,7 +151,7 @@ Scholar.Utilities.prototype.getVersion = function() {
* Get a page range, given a user-entered set of pages
Scholar.Utilities.prototype._pageRangeRegexp = /^\s*([0-9]+)-([0-9]+)\s*$/
Scholar.Utilities.prototype._pageRangeRegexp = /^\s*([0-9]+)-([0-9]+)\s*$/;
Scholar.Utilities.prototype.getPageRange = function(pages) {
var pageNumbers;
var m = this._pageRangeRegexp.exec(pages);
@ -155,8 +165,21 @@ Scholar.Utilities.prototype.getPageRange = function(pages) {
return pageNumbers;
* provide inArray function
Scholar.Utilities.prototype.inArray = Scholar.inArray;
* pads a number or other string with a given string on the left
Scholar.Utilities.prototype.lpad = function(string, pad, length) {
// Prepends `pad` to `string` repeatedly until string.length reaches `length`,
// then returns the result. With a multi-character pad the result may exceed `length`.
// NOTE(review): a numeric `string` has no .length, so the comparison below is
// false and the number is returned unpadded -- callers should pass a string.
// NOTE(review): an empty `pad` never increases the length, so the loop would
// not terminate when string.length < length.
while(string.length < length) {
string = pad + string;
return string;
@ -169,10 +192,8 @@ Scholar.Utilities.prototype.inArray = Scholar.inArray;
// Scholar.Utilities.Ingester extends Scholar.Utilities, offering additional
// classes relating to data extraction specifically from HTML documents.
Scholar.Utilities.Ingester = function(myWindow, proxiedURL, isHidden) {
this.window = myWindow;
Scholar.Utilities.Ingester = function(proxiedURL) {
this.proxiedURL = proxiedURL;
this.isHidden = isHidden;
Scholar.Utilities.Ingester.prototype = new Scholar.Utilities();
@ -240,21 +261,6 @@ Scholar.Utilities.Ingester.prototype.getNodeString = function(doc, contextNode,
return returnVar;
* Allows a user to select which items to scrape
Scholar.Utilities.Ingester.prototype.selectItems = function(itemList) {
if(this.isHidden != true) {
// this is kinda ugly, mozillazine made me do it! honest!
var io = { dataIn:itemList, dataOut:null }
var newDialog = this.window.openDialog("chrome://scholar/content/ingester/selectitems.xul",
"_blank","chrome,modal,centerscreen,resizable=yes", io);
return io.dataOut;
} else {
return null;
* Grabs items based on URLs
@ -300,129 +306,19 @@ Scholar.Utilities.Ingester.prototype.getItemArray = function(doc, inHere, urlRe,
return availableItems;
// These functions are for use by importMARCRecord. They're private, because,
// while they are useful, it's also nice if as many of our scrapers as possible
// are PiggyBank compatible, and if our scrapers used functions, that would
// break compatibility
Scholar.Utilities.Ingester.prototype._MARCCleanString = function(author) {
author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
return author.replace(/ +/, ' ');
Scholar.Utilities.Ingester.prototype._MARCCleanNumber = function(author) {
author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
var regexp = /^[^ ]*/;
var m = regexp.exec(author);
if(m) {
return m[0];
Scholar.Utilities.Ingester.prototype._MARCPullYear = function(text) {
var pullRe = /[0-9]+/;
var m = pullRe.exec(text);
if(m) {
return m[0];
Scholar.Utilities.Ingester.prototype._MARCAssociateField = function(record, uri, model, fieldNo, rdfUri, execMe, prefix, part) {
if(!part) {
part = 'a';
var field = record.get_field_subfields(fieldNo);
Scholar.debug('Found '+field.length+' matches for '+fieldNo+part);
if(field) {
for(i in field) {
var value;
for(var j=0; j<part.length; j++) {
var myPart = part.substr(j, 1);
if(field[i][myPart]) {
if(value) {
value += " "+field[i][myPart];
} else {
value = field[i][myPart];
if(value) {
if(execMe) {
value = execMe(value);
if(prefix) {
value = prefix + value;
model.addStatement(uri, rdfUri, value);
return model;
// This is an extension to PiggyBank's architecture. It's here so that we don't
// need an enormous library for each scraper that wants to use MARC records
Scholar.Utilities.Ingester.prototype.importMARCRecord = function(record, uri, model) {
var prefixDC = '';
var prefixDCMI = '';
var prefixDummy = '';
var prefixRDF = '';
// Extract ISBNs
model = this._MARCAssociateField(record, uri, model, '020', prefixDC + 'identifier', this._MARCCleanNumber, 'ISBN ');
// Extract ISSNs
model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN ');
// Extract creators
model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this.cleanAuthor);
model = this._MARCAssociateField(record, uri, model, '110', prefixDummy + 'corporateCreator', this._MARCCleanString);
model = this._MARCAssociateField(record, uri, model, '111', prefixDummy + 'corporateCreator', this._MARCCleanString);
model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this.cleanAuthor);
model = this._MARCAssociateField(record, uri, model, '710', prefixDummy + 'corporateContributor', this._MARCCleanString);
model = this._MARCAssociateField(record, uri, model, '711', prefixDummy + 'corporateContributor', this._MARCCleanString);
if(![uri] || (![uri][prefixDC + 'creator'] && ![uri][prefixDC + 'contributor'] && ![uri][prefixDummy + 'corporateCreator'] && ![uri][prefixDummy + 'corporateContributor'])) {
// some LOC entries have no listed author, but have the author in the person subject field as the first entry
var field = record.get_field_subfields('600');
if(field[0]) {
model.addStatement(uri, prefixDC + 'creator', this.cleanAuthor(field[0]['a']));
// Extract title
model = this._MARCAssociateField(record, uri, model, '245', prefixDC + 'title', this._MARCCleanString, '', 'ab');
// Extract edition
model = this._MARCAssociateField(record, uri, model, '250', prefixDC + 'hasVersion', this._MARCCleanString);
// Extract place info
model = this._MARCAssociateField(record, uri, model, '260', prefixDummy + 'place', this._MARCCleanString, '', 'a');
// Extract publisher info
model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'publisher', this._MARCCleanString, '', 'b');
// Extract year
model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'year', this._MARCPullYear, '', 'c');
// Extract series
model = this._MARCAssociateField(record, uri, model, '440', prefixDummy + 'series', this._MARCCleanString);
// Extract call number
model = this._MARCAssociateField(record, uri, model, '050', prefixDC + 'identifier', this._MARCCleanString, 'LCC ', 'ab');
model = this._MARCAssociateField(record, uri, model, '060', prefixDC + 'identifier', this._MARCCleanString, 'NLM ', 'ab');
model = this._MARCAssociateField(record, uri, model, '070', prefixDC + 'identifier', this._MARCCleanString, 'NAL ', 'ab');
model = this._MARCAssociateField(record, uri, model, '080', prefixDC + 'identifier', this._MARCCleanString, 'UDC ', 'ab');
model = this._MARCAssociateField(record, uri, model, '082', prefixDC + 'identifier', this._MARCCleanString, 'DDC ', 'a');
model = this._MARCAssociateField(record, uri, model, '084', prefixDC + 'identifier', this._MARCCleanString, 'CN ', 'ab');
// Set type
model = model.addStatement(uri, prefixRDF + 'type', prefixDummy + "book", true);
// Ingester adapters for Scholar.Utilities.HTTP to handle proxies
Scholar.Utilities.Ingester.prototype.loadDocument = function(url, browser, succeeded, failed) {
Scholar.Utilities.Ingester.prototype.loadDocument = function(url, succeeded, failed) {
if(this.proxiedURL) {
url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
Scholar.Utilities.HTTP.processDocuments(null, [ url ], succeeded, function() {}, failed);
Scholar.Utilities.Ingester.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
Scholar.Utilities.Ingester.prototype.processDocuments = function(firstDoc, urls, processor, done, exception) {
for(i in urls) {
urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]);
@ -476,6 +372,7 @@ Scholar.Utilities.HTTP = new function() {
* in our code, is required for compatibility with the Piggy Bank project
function doGet(url, callback1, callback2) {
Scholar.debug("HTTP GET "+url);
if (this.browserIsOffline()){
return false;
@ -508,6 +405,7 @@ Scholar.Utilities.HTTP = new function() {
* in our code, is required for compatibility with the Piggy Bank project
function doPost(url, body, callback1, callback2) {
Scholar.debug("HTTP POST "+body+" to "+url);
if (this.browserIsOffline()){
return false;
@ -538,6 +436,7 @@ Scholar.Utilities.HTTP = new function() {
* in our code, is required for compatibility with the Piggy Bank project
function doOptions(url, body, callback1, callback2) {
Scholar.debug("HTTP OPTIONS "+url);
if (this.browserIsOffline()){
return false;
@ -641,7 +540,6 @@ Scholar.Utilities.HTTP.processDocuments = function(firstDoc, urls, processor, do
var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(myWindow);
var prevUrl, url;
Scholar.debug("processDocuments called");
try {
if (urls.length == 0) {
@ -690,14 +588,11 @@ Scholar.Utilities.HTTP.processDocuments = function(firstDoc, urls, processor, do
var init = function() {
Scholar.debug("init called");
hiddenBrowser.addEventListener("load", onLoad, true);
if (firstDoc) {
processor(firstDoc, doLoad);
} else {
Scholar.debug("doing load");
@ -45,10 +45,6 @@ Cc[";1"]
File diff suppressed because it is too large
Load diff
Add table
Reference in a new issue