closes #78, figure out import/export architecture
closes #100, migrate ingester to Scholar.Translate
closes #88, migrate scrapers away from RDF
closes #9, pull out LC subject heading tags
references #87, add fromArray() and toArray() methods to item objects

API changes:
- all translation (import/export/web) now goes through Scholar.Translate
- all Scholar-specific functions in scrapers start with "Scholar." rather than the jumbled up piggy bank un-namespaced confusion
- scrapers no longer specify items through RDF (the beginning of an item.fromArray()-like function exists in Scholar.Translate.prototype._itemDone())
- scrapers can be any combination of import, export, and web (type is the sum of 1/2/4 respectively)
- scrapers now contain functions (doImport, doExport, doWeb) rather than loose code
- scrapers can call functions in other scrapers or just call the function to translate itself
- export accesses items item-by-item, rather than accepting a huge array of items
- MARC functions are now in the MARC import translator, and accessed by the web translators

new features:
- import now works
- rudimentary RDF (unqualified dublin core only), RIS, and MARC import translators are implemented (although they are a little picky with respect to file extensions at the moment)
- items appear as they are scraped
- MARC import translator pulls out tags, although this seems to slow things down
- no icon appears next to the URL when Scholar hasn't detected metadata, since this seemed somewhat confusing

apologies for the size of this diff. i figured if i was going to re-write the API, i might as well do it all at once and get everything working right.
This commit is contained in:
parent
d65328c830
commit
c64e5c841f
8 changed files with 4058 additions and 3725 deletions
|
@ -1,5 +1,6 @@
|
|||
Scholar_File_Interface = new function() {
|
||||
this.exportFile = exportFile;
|
||||
this.importFile = importFile;
|
||||
|
||||
/*
|
||||
* Creates Scholar.Translate instance and shows file picker for file export
|
||||
|
@ -23,4 +24,41 @@ Scholar_File_Interface = new function() {
|
|||
translation.translate();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Creates Scholar.Translate instance and shows file picker for file import
|
||||
*/
|
||||
function importFile() {
	/*
	 * Flow: ask a Scholar.Translate("import") instance for its translators,
	 * offer one file picker filter per translator, and, once the user has
	 * picked a file, ask for the translators again (they can now be checked
	 * against the file) and run the first one returned. Each finished item is
	 * handed to _importItemDone. Nothing happens if the picker is cancelled or
	 * no translator is returned for the file.
	 */
	var translation = new Scholar.Translate("import");
	var translators = translation.getTranslators();
	
	const nsIFilePicker = Components.interfaces.nsIFilePicker;
	var fp = Components.classes["@mozilla.org/filepicker;1"]
			.createInstance(nsIFilePicker);
	fp.init(window, "Import", nsIFilePicker.modeOpen);
	// Walk the list by index: for...in would also visit any enumerable
	// property added to Array.prototype and produce bogus filters
	for(var i=0; i<translators.length; i++) {
		fp.appendFilter(translators[i].label, "*."+translators[i].target);
	}
	
	var rv = fp.show();
	if (rv == nsIFilePicker.returnOK || rv == nsIFilePicker.returnReplace) {
		translation.setLocation(fp.file);
		// get translators again, bc now we can check against the file
		translators = translation.getTranslators();
		if(translators.length) {
			// TODO: display a list of available translators
			translation.setTranslator(translators[0]);
			translation.setHandler("itemDone", _importItemDone);
			translation.translate();
		}
	}
}
|
||||
|
||||
/*
|
||||
* Saves items after they've been imported. We could have a nice little
|
||||
* "items imported" indicator, too.
|
||||
*/
|
||||
function _importItemDone(obj, item) {
	// "itemDone" handler used by importFile(): the only thing done with a
	// finished item is to save it; the first argument is not used
	item.save();
}
|
||||
}
|
|
@ -25,8 +25,7 @@ Scholar_Ingester_Interface._scrapeProgress = new Array();
|
|||
*/
|
||||
Scholar_Ingester_Interface.init = function() {
|
||||
Scholar_Ingester_Interface.browsers = new Array();
|
||||
Scholar_Ingester_Interface.browserDocuments = new Object();
|
||||
Scholar_Ingester_Interface.browserUris = new Array();
|
||||
Scholar_Ingester_Interface.browserData = new Object();
|
||||
Scholar_Ingester_Interface._scrapePopupShowing = false;
|
||||
Scholar.Ingester.ProxyMonitor.init();
|
||||
|
||||
|
@ -54,7 +53,7 @@ Scholar_Ingester_Interface.chromeLoad = function() {
|
|||
* When chrome unloads, delete our document objects and remove our listeners
|
||||
*/
|
||||
Scholar_Ingester_Interface.chromeUnload = function() {
	// Drop the per-browser bookkeeping objects, then stop listening for
	// progress events on the tab browser.
	// Each property needs its own delete statement: in "delete a, b" the comma
	// operator applies delete to "a" only and merely evaluates "b".
	delete Scholar_Ingester_Interface.browserDocuments;
	delete Scholar_Ingester_Interface.browserData;
	delete Scholar_Ingester_Interface.browsers;
	this.tabBrowser.removeProgressListener(this);
}
|
||||
|
||||
|
@ -62,30 +61,20 @@ Scholar_Ingester_Interface.chromeUnload = function() {
|
|||
* Scrapes a page (called when the capture icon is clicked)
|
||||
*/
|
||||
Scholar_Ingester_Interface.scrapeThisPage = function(saveLocation) {
|
||||
var documentObject = Scholar_Ingester_Interface._getDocument(Scholar_Ingester_Interface.tabBrowser.selectedBrowser);
|
||||
if(documentObject.scraper) {
|
||||
var scrapeProgress = new Scholar_Ingester_Interface.Progress(window);
|
||||
Scholar_Ingester_Interface._scrapeProgress.push(scrapeProgress);
|
||||
documentObject.scrapePage(function(obj, returnValue) { Scholar_Ingester_Interface._finishScraping(obj, returnValue, scrapeProgress, saveLocation) });
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Updates the status of the capture icon to reflect the scrapability or lack
|
||||
* thereof of the current page
|
||||
*/
|
||||
Scholar_Ingester_Interface.updateStatus = function() {
|
||||
var documentObject = Scholar_Ingester_Interface._getDocument(Scholar_Ingester_Interface.tabBrowser.selectedBrowser);
|
||||
if(documentObject && documentObject.scraper) {
|
||||
if(documentObject.type == "multiple") {
|
||||
// Use folder icon for multiple types, for now
|
||||
Scholar_Ingester_Interface.statusImage.src = "chrome://scholar/skin/treesource-collection.png";
|
||||
} else {
|
||||
Scholar_Ingester_Interface.statusImage.src = "chrome://scholar/skin/treeitem-"+documentObject.type+".png";
|
||||
}
|
||||
Scholar_Ingester_Interface.statusImage.hidden = false;
|
||||
} else {
|
||||
Scholar_Ingester_Interface.statusImage.hidden = true;
|
||||
var browser = Scholar_Ingester_Interface.tabBrowser.selectedBrowser;
|
||||
var data = Scholar_Ingester_Interface._getData(browser);
|
||||
|
||||
if(data.translators && data.translators.length) {
|
||||
Scholar_Ingester_Interface.Progress.show();
|
||||
|
||||
var translate = new Scholar.Translate("web");
|
||||
translate.setBrowser(browser);
|
||||
// use first translator available
|
||||
translate.setTranslator(data.translators[0]);
|
||||
translate.setHandler("select", Scholar_Ingester_Interface._selectItems);
|
||||
translate.setHandler("itemDone", Scholar_Ingester_Interface._itemDone);
|
||||
translate.setHandler("done", Scholar_Ingester_Interface._finishScraping);
|
||||
translate.translate();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -122,8 +111,14 @@ Scholar_Ingester_Interface.contentLoad = function(event) {
|
|||
return;
|
||||
}
|
||||
|
||||
Scholar_Ingester_Interface._setDocument(browser);
|
||||
Scholar_Ingester_Interface.updateStatus();
|
||||
// get data object
|
||||
var data = Scholar_Ingester_Interface._getData(browser);
|
||||
// get translators
|
||||
var translate = new Scholar.Translate("web");
|
||||
translate.setBrowser(browser);
|
||||
data.translators = translate.getTranslators();
|
||||
// update status
|
||||
Scholar_Ingester_Interface._updateStatus(data);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -162,13 +157,12 @@ Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject)
|
|||
Scholar_Ingester_Interface._deleteDocument(browser);
|
||||
}
|
||||
}
|
||||
Scholar_Ingester_Interface.updateStatus();
|
||||
|
||||
var data = Scholar_Ingester_Interface._getData(Scholar_Ingester_Interface.tabBrowser.selectedBrowser);
|
||||
Scholar_Ingester_Interface._updateStatus(data);
|
||||
|
||||
// Make sure scrape progress is gone
|
||||
var scrapeProgress;
|
||||
while(scrapeProgress = Scholar_Ingester_Interface._scrapeProgress.pop()) {
|
||||
scrapeProgress.kill();
|
||||
}
|
||||
Scholar_Ingester_Interface.Progress.kill();
|
||||
}
|
||||
|
||||
Scholar_Ingester_Interface.hidePopup = function(collectionID) {
|
||||
|
@ -219,95 +213,101 @@ Scholar_Ingester_Interface.showPopup = function(collectionID, parentElement) {
|
|||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/*
|
||||
* Gets a document object given a browser window object
|
||||
* Gets a data object given a browser window object
|
||||
*
|
||||
* NOTE: Browser objects are associated with document objects via keys generated
|
||||
* from the time the browser object is opened. I'm not sure if this is the
|
||||
* appropriate mechanism for handling this, but it's what PiggyBank used and it
|
||||
* appears to work.
|
||||
*
|
||||
* Currently, the data object contains only one property: "translators," which
|
||||
* is an array of translators that should work with the given page as returned
|
||||
* from Scholar.Translate.getTranslator()
|
||||
*/
|
||||
Scholar_Ingester_Interface._getDocument = function(browser) {
|
||||
Scholar_Ingester_Interface._getData = function(browser) {
|
||||
try {
|
||||
var key = browser.getAttribute("scholar-key");
|
||||
if(Scholar_Ingester_Interface.browserDocuments[key]) {
|
||||
return Scholar_Ingester_Interface.browserDocuments[key];
|
||||
if(Scholar_Ingester_Interface.browserData[key]) {
|
||||
return Scholar_Ingester_Interface.browserData[key];
|
||||
}
|
||||
} finally {}
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Creates a new document object for a browser window object, attempts to
|
||||
* retrieve appropriate scraper
|
||||
*/
|
||||
Scholar_Ingester_Interface._setDocument = function(browser) {
|
||||
try {
|
||||
var key = browser.getAttribute("scholar-key");
|
||||
} finally {
|
||||
if(!key) {
|
||||
var key = (new Date()).getTime();
|
||||
browser.setAttribute("scholar-key", key);
|
||||
Scholar_Ingester_Interface.browserData[key] = new Array();
|
||||
return Scholar_Ingester_Interface.browserData[key];
|
||||
}
|
||||
}
|
||||
|
||||
// Only re-load the scraper if it's a new document
|
||||
//if(Scholar_Ingester_Interface.browserUris[key] != browser.contentDocument.location.href) {
|
||||
Scholar_Ingester_Interface.browserUris[key] = browser.contentDocument.location.href;
|
||||
Scholar_Ingester_Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser, window);
|
||||
Scholar_Ingester_Interface.browserDocuments[key].retrieveScraper();
|
||||
//}
|
||||
}
|
||||
|
||||
/*
|
||||
* Deletes the document object associated with a given browser window object
|
||||
*/
|
||||
Scholar_Ingester_Interface._deleteDocument = function(browser) {
|
||||
Scholar_Ingester_Interface._deleteData = function(browser) {
|
||||
try {
|
||||
var key = browser.getAttribute("scholar-key");
|
||||
if(Scholar_Ingester_Interface.browserDocuments[key]) {
|
||||
delete Scholar_Ingester_Interface.browserDocuments[key];
|
||||
if(Scholar_Ingester_Interface.browserData[key]) {
|
||||
delete Scholar_Ingester_Interface.browserData[key];
|
||||
return true;
|
||||
}
|
||||
} finally {}
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
 * Updates the status of the capture icon to reflect the scrapability or lack
 * thereof of the current page
 *
 * data - data object for a browser; when data.translators has a non-zero
 *        length, the icon is picked from the first translator's itemType and
 *        shown, otherwise the icon is hidden
 */
Scholar_Ingester_Interface._updateStatus = function(data) {
	if(!data.translators || !data.translators.length) {
		// nothing can translate this page
		Scholar_Ingester_Interface.statusImage.hidden = true;
		return;
	}
	
	var itemType = data.translators[0].itemType;
	// Use folder icon for multiple types, for now
	var iconURL = (itemType == "multiple")
		? "chrome://scholar/skin/treesource-collection.png"
		: "chrome://scholar/skin/treeitem-"+itemType+".png";
	
	Scholar_Ingester_Interface.statusImage.src = iconURL;
	Scholar_Ingester_Interface.statusImage.hidden = false;
}
|
||||
|
||||
/*
 * Callback to be executed when an item has been finished: adds a line for the
 * item (its title plus the icon for its item type) to the progress display,
 * then saves the item
 */
Scholar_Ingester_Interface._itemDone = function(obj, item) {
	var label = item.getField("title");
	var typeName = Scholar.ItemTypes.getName(item.getField("itemTypeID"));
	var iconURL = "chrome://scholar/skin/treeitem-"+typeName+".png";
	
	// addLines takes parallel arrays of labels and icons
	Scholar_Ingester_Interface.Progress.addLines([label], [iconURL]);
	item.save();
}
|
||||
|
||||
/*
 * called when a user is supposed to select items
 *
 * Opens the selectitems.xul dialog with itemList passed in as io.dataIn and
 * returns io.dataOut as it stands once openDialog() returns (presumably filled
 * in by the dialog; it stays null if nothing sets it). When io.dataOut is
 * empty, the progress indicator is killed.
 */
Scholar_Ingester_Interface._selectItems = function(obj, itemList) {
	// this is kinda ugly, mozillazine made me do it! honest!
	var io = { dataIn:itemList, dataOut:null };
	// the window handle returned by openDialog is not needed
	window.openDialog("chrome://scholar/content/ingester/selectitems.xul",
		"_blank","chrome,modal,centerscreen,resizable=yes", io);
	
	if(!io.dataOut) {	// user selected no items, so kill the progress indicator
		Scholar_Ingester_Interface.Progress.kill();
	}
	
	return io.dataOut;
}
|
||||
|
||||
/*
|
||||
* Callback to be executed when scraping is complete
|
||||
*/
|
||||
Scholar_Ingester_Interface._finishScraping = function(obj, returnValue, scrapeProgress, saveLocation) {
|
||||
if(obj.items.length) {
|
||||
scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeComplete"));
|
||||
|
||||
// Display title and creators
|
||||
var labels = new Array();
|
||||
var icons = new Array();
|
||||
for(var i in obj.items) {
|
||||
labels.push(obj.items[i].getField("title"));
|
||||
icons.push("chrome://scholar/skin/treeitem-"+Scholar.ItemTypes.getName(obj.items[i].getField("itemTypeID"))+".png");
|
||||
}
|
||||
scrapeProgress.addLines(labels, icons);
|
||||
|
||||
// Get collection if the user used the drop-down menu
|
||||
if(saveLocation) {
|
||||
var saveCollection = Scholar.Collections.get(saveLocation);
|
||||
}
|
||||
// Save items
|
||||
for(i in obj.items) {
|
||||
obj.items[i].save();
|
||||
if(saveLocation) {
|
||||
saveCollection.addItem(obj.items[i].getID());
|
||||
}
|
||||
}
|
||||
|
||||
setTimeout(function() { scrapeProgress.fade() }, 2500);
|
||||
} else if(returnValue) {
|
||||
scrapeProgress.kill();
|
||||
} else {
|
||||
scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeError"));
|
||||
scrapeProgress.addDescription(Scholar.getString("ingester.scrapeErrorDescription"));
|
||||
setTimeout(function() { scrapeProgress.fade() }, 2500);
|
||||
Scholar_Ingester_Interface._finishScraping = function(obj, returnValue) {
|
||||
if(!returnValue) {
|
||||
Scholar_Ingester_Interface.Progress.changeHeadline(Scholar.getString("ingester.scrapeError"));
|
||||
Scholar_Ingester_Interface.Progress.addDescription(Scholar.getString("ingester.scrapeErrorDescription"));
|
||||
}
|
||||
Scholar_Ingester_Interface.Progress.fade();
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -317,99 +317,126 @@ Scholar_Ingester_Interface._finishScraping = function(obj, returnValue, scrapePr
|
|||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Handles the display of a div showing progress in scraping
|
||||
|
||||
Scholar_Ingester_Interface.Progress = function(myWindow) {
|
||||
this.openerWindow = myWindow;
|
||||
this.progressWindow = myWindow.openDialog("chrome://scholar/chrome/ingester/progress.xul", "", "chrome,dialog=no,titlebar=no,popup=yes");
|
||||
var me = this;
|
||||
this.progressWindow.addEventListener("load", function() { me.windowLoaded() }, false);
|
||||
Scholar_Ingester_Interface.Progress = new function() {
|
||||
var _windowLoaded = false;
|
||||
var _windowLoading = false;
|
||||
// keep track of all of these things in case they're called before we're
|
||||
// done loading the progress window
|
||||
var _loadDescription = null;
|
||||
var _loadLines = new Array();
|
||||
var _loadIcons = new Array();
|
||||
var _loadHeadline = Scholar.getString("ingester.scraping");
|
||||
|
||||
this._loadDescription = null;
|
||||
this._loadLines = new Array();
|
||||
this._loadIcons = new Array();
|
||||
this._loadHeadline = Scholar.getString("ingester.scraping");
|
||||
}
|
||||
|
||||
Scholar_Ingester_Interface.Progress.prototype.windowLoaded = function() {
|
||||
this._windowLoaded = true;
|
||||
this._move();
|
||||
this.show = show;
|
||||
this.changeHeadline = changeHeadline;
|
||||
this.addLines = addLines;
|
||||
this.addDescription = addDescription;
|
||||
this.fade = fade;
|
||||
this.kill = kill;
|
||||
|
||||
this.changeHeadline(this._loadHeadline);
|
||||
this.addLines(this._loadLines, this._loadIcons);
|
||||
if(this._loadDescription) {
|
||||
this.addDescription(this._loadDescription);
|
||||
function show() {
|
||||
if(_windowLoading || _windowLoaded) { // already loading or loaded
|
||||
return false;
|
||||
}
|
||||
_progressWindow = window.openDialog("chrome://scholar/chrome/ingester/progress.xul", "", "chrome,dialog=no,titlebar=no,popup=yes");
|
||||
_progressWindow.addEventListener("load", _onWindowLoaded, false);
|
||||
_windowLoading = true;
|
||||
}
|
||||
}
|
||||
|
||||
Scholar_Ingester_Interface.Progress.prototype.changeHeadline = function(headline) {
|
||||
if(this._windowLoaded) {
|
||||
this.progressWindow.document.getElementById("scholar-progress-text-headline").value = headline;
|
||||
} else {
|
||||
this._loadHeadline = headline;
|
||||
|
||||
function changeHeadline(headline) {
|
||||
if(_windowLoaded) {
|
||||
_progressWindow.document.getElementById("scholar-progress-text-headline").value = headline;
|
||||
} else {
|
||||
_loadHeadline = headline;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Scholar_Ingester_Interface.Progress.prototype.addLines = function(label, icon) {
|
||||
if(this._windowLoaded) {
|
||||
for(i in label) {
|
||||
var newLabel = this.progressWindow.document.createElement("label");
|
||||
newLabel.setAttribute("class", "scholar-progress-item-label");
|
||||
newLabel.setAttribute("crop", "end");
|
||||
newLabel.setAttribute("value", label[i]);
|
||||
|
||||
function addLines(label, icon) {
|
||||
if(_windowLoaded) {
|
||||
for(i in label) {
|
||||
var newLabel = _progressWindow.document.createElement("label");
|
||||
newLabel.setAttribute("class", "scholar-progress-item-label");
|
||||
newLabel.setAttribute("crop", "end");
|
||||
newLabel.setAttribute("value", label[i]);
|
||||
|
||||
var newImage = _progressWindow.document.createElement("image");
|
||||
newImage.setAttribute("class", "scholar-progress-item-icon");
|
||||
newImage.setAttribute("src", icon[i]);
|
||||
|
||||
var newHB = _progressWindow.document.createElement("hbox");
|
||||
newHB.setAttribute("class", "scholar-progress-item-hbox");
|
||||
newHB.setAttribute("valign", "center");
|
||||
newHB.appendChild(newImage);
|
||||
newHB.appendChild(newLabel);
|
||||
|
||||
_progressWindow.document.getElementById("scholar-progress-text-box").appendChild(newHB);
|
||||
}
|
||||
|
||||
var newImage = this.progressWindow.document.createElement("image");
|
||||
newImage.setAttribute("class", "scholar-progress-item-icon");
|
||||
newImage.setAttribute("src", icon[i]);
|
||||
|
||||
var newHB = this.progressWindow.document.createElement("hbox");
|
||||
_move();
|
||||
} else {
|
||||
_loadLines = _loadLines.concat(label);
|
||||
_loadIcons = _loadIcons.concat(icon);
|
||||
}
|
||||
}
|
||||
|
||||
function addDescription(text) {
|
||||
if(_windowLoaded) {
|
||||
var newHB = _progressWindow.document.createElement("hbox");
|
||||
newHB.setAttribute("class", "scholar-progress-item-hbox");
|
||||
newHB.setAttribute("valign", "center");
|
||||
newHB.appendChild(newImage);
|
||||
newHB.appendChild(newLabel);
|
||||
var newDescription = _progressWindow.document.createElement("description");
|
||||
newDescription.setAttribute("class", "scholar-progress-description");
|
||||
var newText = _progressWindow.document.createTextNode(text);
|
||||
|
||||
this.progressWindow.document.getElementById("scholar-progress-text-box").appendChild(newHB);
|
||||
newDescription.appendChild(newText);
|
||||
newHB.appendChild(newDescription);
|
||||
_progressWindow.document.getElementById("scholar-progress-text-box").appendChild(newHB);
|
||||
|
||||
_move();
|
||||
} else {
|
||||
_loadDescription = text;
|
||||
}
|
||||
}
|
||||
|
||||
function fade() {
|
||||
setTimeout(_timeout, 2500);
|
||||
}
|
||||
|
||||
function kill() {
|
||||
_windowLoaded = false;
|
||||
try {
|
||||
_progressWindow.close();
|
||||
} catch(ex) {}
|
||||
}
|
||||
|
||||
function _onWindowLoaded() {
|
||||
_windowLoading = false;
|
||||
_windowLoaded = true;
|
||||
|
||||
_move();
|
||||
// do things we delayed because the window was loading
|
||||
changeHeadline(_loadHeadline);
|
||||
addLines(_loadLines, _loadIcons);
|
||||
if(_loadDescription) {
|
||||
addDescription(_loadDescription);
|
||||
}
|
||||
|
||||
this._move();
|
||||
} else {
|
||||
this._loadLines = this._loadLines.concat(label);
|
||||
this._loadIcons = this._loadIcons.concat(icon);
|
||||
// reset parameters
|
||||
_loadDescription = null;
|
||||
_loadLines = new Array();
|
||||
_loadIcons = new Array();
|
||||
_loadHeadline = Scholar.getString("ingester.scraping")
|
||||
}
|
||||
|
||||
function _move() {
|
||||
_progressWindow.sizeToContent();
|
||||
_progressWindow.moveTo(
|
||||
window.screenX + window.outerWidth - _progressWindow.outerWidth - 30,
|
||||
window.screenY + window.outerHeight - _progressWindow.outerHeight
|
||||
);
|
||||
}
|
||||
|
||||
function _timeout() {
|
||||
kill(); // could check to see if we're really supposed to fade yet
|
||||
// (in case multiple scrapers are operating at once)
|
||||
}
|
||||
}
|
||||
|
||||
Scholar_Ingester_Interface.Progress.prototype.addDescription = function(text) {
|
||||
if(this._windowLoaded) {
|
||||
var newHB = this.progressWindow.document.createElement("hbox");
|
||||
newHB.setAttribute("class", "scholar-progress-item-hbox");
|
||||
var newDescription = this.progressWindow.document.createElement("description");
|
||||
newDescription.setAttribute("class", "scholar-progress-description");
|
||||
var newText = this.progressWindow.document.createTextNode(text);
|
||||
|
||||
newDescription.appendChild(newText);
|
||||
newHB.appendChild(newDescription);
|
||||
this.progressWindow.document.getElementById("scholar-progress-text-box").appendChild(newHB);
|
||||
|
||||
this._move();
|
||||
} else {
|
||||
this._loadDescription = text;
|
||||
}
|
||||
}
|
||||
|
||||
Scholar_Ingester_Interface.Progress.prototype._move = function() {
|
||||
this.progressWindow.sizeToContent();
|
||||
this.progressWindow.moveTo(
|
||||
this.openerWindow.screenX + this.openerWindow.outerWidth - this.progressWindow.outerWidth - 30,
|
||||
this.openerWindow.screenY + this.openerWindow.outerHeight - this.progressWindow.outerHeight
|
||||
);
|
||||
}
|
||||
|
||||
Scholar_Ingester_Interface.Progress.prototype.fade = function() {
|
||||
this.kill();
|
||||
}
|
||||
|
||||
Scholar_Ingester_Interface.Progress.prototype.kill = function() {
|
||||
try {
|
||||
this.progressWindow.close();
|
||||
} catch(ex) {}
|
||||
}
|
||||
|
||||
|
|
|
@ -19,47 +19,6 @@ Scholar.Ingester.deleteHiddenBrowser = function(myBrowser) {
|
|||
Scholar.debug("deleted hidden browser");
|
||||
}
|
||||
|
||||
/*
 * Operates the ingester given only a URL
 * url - URL to scrape
 * complete - callback function to be executed if page grab completes
 *            (will be passed document object; obj.items contains array of
 *            *unsaved* items scraped; empty array indicates unscrapable page)
 * error - callback function to be executed if an error occurred loading page
 * myWindow - optional argument indicating window to attach a dialog to. if no
 *            window is given, Firefox Scholar uses the hidden DOM window and
 *            will simply avoid scraping multiple pages
 */
Scholar.Ingester.ingestURL = function(url, complete, error, myWindow) {
	// without a caller-supplied window, fall back on the hidden DOM window and
	// remember that we did so
	var isHidden = !myWindow;
	if(isHidden) {
		myWindow = Components.classes["@mozilla.org/appshell/appShellService;1"]
				.getService(Components.interfaces.nsIAppShellService)
				.hiddenDOMWindow;
	}
	
	// page loaded: build a document object and scrape it if possible; in either
	// case the hidden browser is deleted before complete() is called
	var onLoaded = function(browser) {
		var myDoc = new Scholar.Ingester.Document(browser, myWindow, isHidden);
		// NOTE(review): the Document methods visible alongside this function
		// define retrieveScraper(), not retrieveTranslator() -- confirm that
		// this method exists
		if(!myDoc.retrieveTranslator()) {
			Scholar.Ingester.deleteHiddenBrowser(browser);
			complete(myDoc);
			return;
		}
		
		myDoc.scrapePage(function(myDoc) {
			Scholar.Ingester.deleteHiddenBrowser(browser);
			complete(myDoc);
		});
	}
	
	// page could not be loaded: log it and notify the caller
	var onFailed = function() {
		Scholar.debug("Scholar.Ingester.ingestURL: could not ingest "+url);
		error();
	}
	
	Scholar.Utilities.HTTP.processDocuments(null, [ url ], onLoaded, function() {}, onFailed, true);
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Scholar.Ingester.ProxyMonitor
|
||||
|
@ -101,54 +60,56 @@ Scholar.Ingester.ProxyMonitor = new function() {
|
|||
|
||||
function observe(channel) {
|
||||
channel.QueryInterface(Components.interfaces.nsIHttpChannel);
|
||||
if(channel.getResponseHeader("Server") == "EZproxy") {
|
||||
// We're connected to an EZproxy
|
||||
if(channel.responseStatus != "302") {
|
||||
return;
|
||||
}
|
||||
|
||||
Scholar.debug(channel.URI.spec);
|
||||
// We should be able to scrape the URL out of this
|
||||
var m = _ezProxyRe.exec(channel.URI.spec);
|
||||
if(!m) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Found URL
|
||||
var variable = m[1];
|
||||
var properURL = m[2];
|
||||
if(variable.toLowerCase() == "qurl") {
|
||||
properURL = unescape(properURL);
|
||||
}
|
||||
var properURI = _parseURL(properURL);
|
||||
if(!properURI) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Get the new URL
|
||||
var newURL = channel.getResponseHeader("Location");
|
||||
if(!newURL) {
|
||||
return;
|
||||
}
|
||||
var newURI = _parseURL(newURL);
|
||||
if(!newURI) {
|
||||
return;
|
||||
}
|
||||
|
||||
if(channel.URI.host == newURI.host && channel.URI.port != newURI.port) {
|
||||
// Different ports but the same server means EZproxy active
|
||||
|
||||
Scholar.debug("EZProxy: host "+newURI.hostPort+" is really "+properURI.hostPort);
|
||||
// Initialize variables here so people who never use EZProxies
|
||||
// don't get the (very very minor) speed hit
|
||||
if(!_mapFromProxy) {
|
||||
_mapFromProxy = new Object();
|
||||
_mapToProxy = new Object();
|
||||
try {
|
||||
if(channel.getResponseHeader("Server") == "EZproxy") {
|
||||
// We're connected to an EZproxy
|
||||
if(channel.responseStatus != "302") {
|
||||
return;
|
||||
}
|
||||
|
||||
Scholar.debug(channel.URI.spec);
|
||||
// We should be able to scrape the URL out of this
|
||||
var m = _ezProxyRe.exec(channel.URI.spec);
|
||||
if(!m) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Found URL
|
||||
var variable = m[1];
|
||||
var properURL = m[2];
|
||||
if(variable.toLowerCase() == "qurl") {
|
||||
properURL = unescape(properURL);
|
||||
}
|
||||
var properURI = _parseURL(properURL);
|
||||
if(!properURI) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Get the new URL
|
||||
var newURL = channel.getResponseHeader("Location");
|
||||
if(!newURL) {
|
||||
return;
|
||||
}
|
||||
var newURI = _parseURL(newURL);
|
||||
if(!newURI) {
|
||||
return;
|
||||
}
|
||||
|
||||
if(channel.URI.host == newURI.host && channel.URI.port != newURI.port) {
|
||||
// Different ports but the same server means EZproxy active
|
||||
|
||||
Scholar.debug("EZProxy: host "+newURI.hostPort+" is really "+properURI.hostPort);
|
||||
// Initialize variables here so people who never use EZProxies
|
||||
// don't get the (very very minor) speed hit
|
||||
if(!_mapFromProxy) {
|
||||
_mapFromProxy = new Object();
|
||||
_mapToProxy = new Object();
|
||||
}
|
||||
_mapFromProxy[newURI.hostPort] = properURI.hostPort;
|
||||
_mapToProxy[properURI.hostPort] = newURI.hostPort;
|
||||
}
|
||||
_mapFromProxy[newURI.hostPort] = properURI.hostPort;
|
||||
_mapToProxy[properURI.hostPort] = newURI.hostPort;
|
||||
}
|
||||
}
|
||||
} catch(e) {}
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -195,394 +156,4 @@ Scholar.Ingester.ProxyMonitor = new function() {
|
|||
var uri = ioService.newURI(url, null, null);
|
||||
return uri;
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Scholar.Ingester.Model
|
||||
//
|
||||
/////////////////////////////////////////////////////////////////
|
||||
|
||||
// Scholar.Ingester.Model, an object representing an RDF data model with
|
||||
// methods to add to that model. In Piggy Bank, this was implemented in Java,
|
||||
// but seeing as we don't really want an enormous web server running with FS,
|
||||
// and don't actually need one, this implementation is much simpler.
|
||||
//
|
||||
// The Java version of this class can be viewed at
|
||||
// http://simile.mit.edu/repository/piggy-bank/trunk/src/java/edu/mit/simile/piggyBank/WorkingModel.java
|
||||
Scholar.Ingester.Model = function() {
	// container for everything recorded on this model; starts out empty
	this.data = {};
}
|
||||
|
||||
// Records a statement on the model: literal is appended to the list kept in
// this.data[uri][rdfUri], creating the intermediate object/array on first use,
// and the statement is written to the debug log.
//
// Piggy Bank provides a fourth argument, one that determines if the third
// argument is a literal or an RDF URI. Since our ontologies are
// sufficiently restricted, we have no chance of confusing a literal and an
// RDF URI and thus this is unnecessary.
Scholar.Ingester.Model.prototype.addStatement = function(uri, rdfUri, literal) {
	var statements = this.data[uri];
	if(!statements) {
		statements = this.data[uri] = {};
	}
	
	var values = statements[rdfUri];
	if(!values) {
		values = statements[rdfUri] = [];
	}
	
	values.push(literal);
	Scholar.debug(rdfUri+" for "+uri+" is "+literal);
}
|
||||
|
||||
// Additional functions added for compatibility purposes only
// No idea if any scraper actually uses these, but just in case, they're
// implemented so as not to throw an exception. All three accept anything,
// do nothing and return undefined.
Scholar.Ingester.Model.prototype.addTag = function() {
	// intentionally empty
}
Scholar.Ingester.Model.prototype.getRepository = function() {
	// intentionally empty
}
Scholar.Ingester.Model.prototype.detachRepository = function() {
	// intentionally empty
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Scholar.Ingester.Document
|
||||
//
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/* THIS CODE IS GOING AWAY
|
||||
* eventually, all ingesting will be part of a unified API in Scholar.Translate.
|
||||
* until then, Scholar.Ingester.Document reigns supreme.
|
||||
*
|
||||
* Public properties:
|
||||
* browser - browser window object of document
|
||||
* model - data model for semantic scrapers
|
||||
* scraper - best scraper to use to scrape page
|
||||
* items - items returned after page is scraped
|
||||
* window - window, for creating new hidden browsers
|
||||
* url - url, as passed through proxy system
|
||||
* type - type of item that will be scraped (set after retrieveScraper() is
|
||||
* called)
|
||||
*
|
||||
* Private properties:
|
||||
* _sandbox - sandbox for code execution
|
||||
* _scrapeCallback - callback function to be executed when scraping is complete
|
||||
*/
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Public Scholar.Ingester.Document methods
|
||||
//
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/*
 * Constructor for Document object
 *
 * myBrowser - stored as this.browser; its contentDocument supplies the URL
 * myWindow - stored as this.window
 * isHidden - stored as this.isHidden
 */
Scholar.Ingester.Document = function(myBrowser, myWindow, isHidden) {
	this.browser = myBrowser;
	this.window = myWindow;
	this.isHidden = isHidden;
	
	// neither is known until a scraper has been retrieved
	this.type = null;
	this.scraper = null;
	
	this.model = new Scholar.Ingester.Model();
	
	// Create separate URL to account for proxies
	var rawURL = this.browser.contentDocument.location.href;
	this.url = Scholar.Ingester.ProxyMonitor.proxyToProper(rawURL);
	if(this.url != rawURL) {
		// proxyToProper gave back something other than the raw URL
		this.proxiedURL = true;
	}
	
	this.items = [];
	this._generateSandbox();
}
|
||||
|
||||
/*
 * Retrieves the best scraper to scrape a given page
 *
 * Tries the type-3 translators in the order the query returns them (rows
 * with a target first, rows with a NULL target last) and keeps the first
 * one that canScrape() accepts.
 *
 * Returns true (and sets this.scraper) when a scraper was found, false
 * otherwise
 */
Scholar.Ingester.Document.prototype.retrieveScraper = function() {
	Scholar.debug("Retrieving scrapers for "+this.url);
	
	var sql = 'SELECT * FROM translators WHERE type = 3 ORDER BY target IS NULL ASC';
	var scrapers = Scholar.DB.query(sql);
	// Don't assume the query produced a result set; without one there is
	// simply no scraper to use
	if(!scrapers) {
		return false;
	}
	
	for(var i=0; i<scrapers.length; i++) {
		var currentScraper = scrapers[i];
		if(this.canScrape(currentScraper)) {
			this.scraper = currentScraper;
			Scholar.debug("Found scraper "+this.scraper.label);
			return true;
		}
	}
	return false;
}
|
||||
|
||||
/*
 * Check to see if _scraper_ can scrape this document
 *
 * currentScraper - translator row; its target, detectCode and label
 *                  properties are used
 *
 * The scraper's target (if any) is matched case-insensitively against
 * this.url. If there is no target, or the target matched, and the scraper
 * has detectCode, that code is evaluated in the sandbox and its return
 * value becomes the result; this.type is set from it.
 *
 * Returns a true value if the scraper applies, a false value otherwise
 */
Scholar.Ingester.Document.prototype.canScrape = function(currentScraper) {
	var canScrape = false;
	
	// Test with regular expression
	// If this is slow, we could preload all scrapers and compile regular
	// expressions, so each check will be faster
	if(currentScraper.target) {
		try {
			var regularExpression = new RegExp(currentScraper.target, "i");
		} catch(e) {
			// A malformed target only disqualifies this scraper; it must not
			// abort the search through the remaining ones
			Scholar.debug(e+' in target for '+currentScraper.label);
			return false;
		}
		if(regularExpression.test(this.url)) {
			canScrape = true;
		}
	}
	
	// Test with JavaScript if available and didn't have a regular expression or
	// passed regular expression test
	if((!currentScraper.target || canScrape)
	  && currentScraper.detectCode) {
		Scholar.debug("Checking detectCode");
		var scraperSandbox = this._sandbox;
		try {
			canScrape = Components.utils.evalInSandbox("(function(){\n" +
			                   currentScraper.detectCode +
			                   "\n})()", scraperSandbox);
		} catch(e) {
			Scholar.debug(e+' in detectCode for '+currentScraper.label);
			return false;
		}
		
		// detectCode that ends without returning a value yields undefined;
		// treat that (and null) as "cannot scrape" instead of calling
		// toString() on it
		if(canScrape === undefined || canScrape === null) {
			return false;
		}
		
		// detectCode returns text type
		if(canScrape.toString() != "") {
			this.type = canScrape;
		} else {
			this.type = "website";
		}
	}
	return canScrape;
}
|
||||
|
||||
/*
 * Populate model with semantic data regarding this page using _scraper_
 * Callback will be executed once scraping is complete
 *
 * callback - optional; stored as this._scrapeCallback and later called
 *            with (document, returnValue)
 *
 * The scraper code runs inside the sandbox. Unless the code called wait(),
 * completion is signalled immediately with the code's return value; on an
 * exception completion is signalled with false.
 */
Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
	if(callback) {
		this._scrapeCallback = callback;
	}
	
	Scholar.debug("Scraping "+this.url);
	
	// Without a scraper there is nothing to run; report failure rather than
	// failing again inside the exception handler below
	if(!this.scraper) {
		Scholar.debug("No scraper available for "+this.url);
		this._scrapePageComplete(false);
		return;
	}
	
	var scraperSandbox = this._sandbox;
	try {
		var returnValue = Components.utils.evalInSandbox("(function(){\n" +
		                   this.scraper.code +
		                   "\n})()", scraperSandbox);
	} catch(e) {
		Scholar.debug(e+' in code for '+this.scraper.label);
		this._scrapePageComplete(false);
		return;
	}
	
	// If synchronous, call _scrapePageComplete();
	if(!this._waitForCompletion) {
		Scholar.debug("is synchronous");
		this._scrapePageComplete(returnValue);
	}
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Private Scholar.Ingester.Document methods
|
||||
//
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/*
|
||||
* Piggy Bank/FS offers four objects to JavaScript scrapers
|
||||
* browser - the object representing the open browser window containing the
|
||||
* document to be processed
|
||||
* doc - the DOM (basically just browser.contentDocument)
|
||||
* model - the object representing the RDF model of data to be returned
|
||||
* (see Scholar.Ingester.Model)
|
||||
* utilities - a set of utilities for making certain tasks easier
|
||||
* (see Scholar.Utilities);
|
||||
*
|
||||
* Piggy Bank/FS also offers two functions to simplify asynchronous requests
|
||||
* (these will only be available for scraping, and not for scrape detection)
|
||||
* wait() - called on asynchronous requests so that Piggy Bank/FS will not
|
||||
* automatically return at the end of code execution
|
||||
* done() - when wait() is called, Piggy Bank/FS will wait for this
|
||||
* function before returning
|
||||
*/
|
||||
|
||||
/*
 * Called when scraping (synchronous or asynchronous) is complete
 *
 * returnValue - passed through to the scrape callback as second argument
 *
 * Converts the collected model into items, notifies the callback (if one
 * was registered) and then resets the document for the next scrape.
 */
Scholar.Ingester.Document.prototype._scrapePageComplete = function(returnValue) {
	this._updateDatabase();
	
	if(this._scrapeCallback) {
		this._scrapeCallback(this, returnValue);
	}
	
	// Get us ready for another scrape: fresh model, fresh item list
	delete this.model;
	this.model = new Scholar.Ingester.Model();
	delete this.items;
	this.items = new Array();
	this._waitForCompletion = false;
	
	// This is perhaps a bit paranoid, but the sandbox has to point at the
	// new model anyway
	this._generateSandbox();
}
|
||||
|
||||
/*
 * Generates a sandbox for scraping/scraper detection
 *
 * The sandbox is created for the address of the loaded document and stored
 * in this._sandbox; the objects and functions offered to scraper code are
 * then attached to it.
 */
Scholar.Ingester.Document.prototype._generateSandbox = function() {
	var contentDocument = this.browser.contentDocument;
	var sandbox = this._sandbox = new Components.utils.Sandbox(contentDocument.location.href);
	
	sandbox.browser = this.browser;
	sandbox.doc = contentDocument;
	sandbox.url = this.url;
	sandbox.utilities = new Scholar.Utilities.Ingester(this.window, this.proxiedURL, this.isHidden);
	sandbox.utilities.HTTPUtilities = new Scholar.Utilities.Ingester.HTTPUtilities(this.proxiedURL);
	sandbox.window = this.window;
	sandbox.model = this.model;
	sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;
	sandbox.MARC_Record = Scholar.Ingester.MARC_Record;
	// NOTE(review): sandbox.MARC_Record is the same function object as
	// Scholar.Ingester.MARC_Record, so this replaces the shared constructor's
	// prototype on every call - confirm that this is intended
	sandbox.MARC_Record.prototype = new Scholar.Ingester.MARC_Record();
	
	// wait()/done() let scraper code defer and then signal completion
	var ingesterDocument = this;
	sandbox.wait = function(){ ingesterDocument._waitForCompletion = true; };
	sandbox.done = function(){ ingesterDocument._scrapePageComplete(); };
}
|
||||
|
||||
/*
 * Copies one RDF property of a scraped resource into an item field
 *
 * rdfUri - full URI of the RDF property to read from the model
 * field  - name of the item field to set
 * uri    - URI of the resource in this.model.data
 * item   - item to set the field on
 * typeID - item type ID, used to check that the field is allowed
 *
 * Only the first value stored for the property is used. Nothing is set if
 * the field is unknown, not valid for the item type, or if the model holds
 * no data for the property.
 */
Scholar.Ingester.Document.prototype._associateRDF = function(rdfUri, field, uri, item, typeID) {
	var fieldID = Scholar.ItemFields.getID(field);
	if(!fieldID) {
		Scholar.debug("discarded scraper " + field + " data: no field in database");
		return;
	}
	
	// Nothing was scraped for this property, so nothing is being discarded
	// and there is nothing to report
	var values = this.model.data[uri][rdfUri];
	if(!values) {
		return;
	}
	
	if(Scholar.ItemFields.isValidForType(fieldID, typeID)) {
		item.setField(field, values[0]);
	} else {
		Scholar.debug("discarded scraper " + field + " data: not valid for item type "+typeID);
	}
}
|
||||
|
||||
/*
 * Turns data ingested using RDF into items
 * (Ontologies are hard-coded until we have a real way of dealing with them)
 *
 * For every resource URI in this.model.data a new item is created, filled
 * from the hard-coded properties below and appended to this.items. The
 * property values are treated as arrays of strings.
 *
 * An exception while processing is logged with Scholar.debug and ends the
 * processing of the remaining resources; it is not passed to the caller.
 */
Scholar.Ingester.Document.prototype._updateDatabase = function() {
	Scholar.debug("doing updating");
	
	var prefixRDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
	var prefixDC = 'http://purl.org/dc/elements/1.1/';
	var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
	
	// Call number fields, in order of preference
	var callNumbers = new Array("LCC", "DDC", "UDC", "NLM", "NAL", "CN");
	
	// Neither expression uses the g flag, so they can safely be shared
	// between iterations
	var ISORe = /^[0-9]{4}-[0-9]{2}-[0-9]{2}$/;
	var yearRe = /[0-9]{4}$/;
	
	// Adds every name in _names_ (may be undefined) to _item_ as a creator of
	// type _creatorTypeID_, starting at position _index_. With _splitName_
	// the name is split at its last space into first and last name;
	// otherwise the whole string is stored as last name. Returns the next
	// free creator position.
	function addCreators(item, names, creatorTypeID, index, splitName) {
		if(!names) {
			return index;
		}
		for(var i=0; i<names.length; i++) {
			var firstName = null;
			var lastName = names[i];
			if(splitName) {
				var spaceIndex = names[i].lastIndexOf(" ");
				lastName = names[i].substring(spaceIndex+1, names[i].length);
				firstName = names[i].substring(0, spaceIndex);
			}
			item.setCreator(index, firstName, lastName, creatorTypeID);
			index++;
		}
		return index;
	}
	
	try {
		for(var uri in this.model.data) {
			var data = this.model.data[uri];
			
			// Get typeID, defaulting to "website" both when no usable type
			// statement exists and when the type is not a known item type
			var typeID = false;
			try {
				var type = data[prefixRDF + 'type'][0].substr(prefixDummy.length);
				typeID = Scholar.ItemTypes.getID(type);
			} catch(ex) {
				// no type statement; fall through to the default
			}
			if(!typeID) {
				typeID = Scholar.ItemTypes.getID("website");
			}
			
			var newItem = Scholar.Items.getNewItemByType(typeID);
			
			// Handle source and title
			newItem.setField("source", uri);
			if(data[prefixDC + 'title']) {
				newItem.setField("title", data[prefixDC + 'title'][0]);
			}
			
			// Handle creators and contributors
			var creatorIndex = 0;
			creatorIndex = addCreators(newItem, data[prefixDC + 'creator'], 1, creatorIndex, true);
			creatorIndex = addCreators(newItem, data[prefixDC + 'contributor'], 2, creatorIndex, true);
			creatorIndex = addCreators(newItem, data[prefixDummy + 'corporateCreator'], 1, creatorIndex, false);
			creatorIndex = addCreators(newItem, data[prefixDummy + 'corporateContributor'], 2, creatorIndex, false);
			creatorIndex = addCreators(newItem, data[prefixDummy + 'editor'], 3, creatorIndex, false);
			
			// Handle years, extracting from date if necessary
			if(Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("year"), typeID)) {
				var years = data[prefixDC + 'year'];
				var dates = data[prefixDC + 'date'];
				if(years) {
					newItem.setField("year", years[0]);
				} else if(dates && dates[0].length >= 4) {
					if(ISORe.test(dates[0])) {
						newItem.setField("year", dates[0].substr(0, 4));
					} else {
						// otherwise accept four digits at the very end
						var m = yearRe.exec(dates[0]);
						if(m) {
							newItem.setField("year", m[0]);
						}
					}
				}
			}
			
			// Handle ISBNs/ISSNs/Call Numbers
			var identifiers = data[prefixDC + 'identifier'];
			if(identifiers) {
				// Both are reset for every item so that a call number found
				// for one resource cannot end up on the next one
				var callNumber = null;
				var bestRank = callNumbers.length;
				
				var needISSN = Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("ISSN"), typeID);
				var needISBN = Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("ISBN"), typeID);
				for(var i=0; i<identifiers.length; i++) {
					var identifier = identifiers[i];
					// the scheme is whatever precedes the first space
					var prefix = identifier.substr(0, identifier.indexOf(" "));
					if(needISSN && prefix == 'ISSN') {
						newItem.setField("ISSN", identifier.substring(5));
						needISSN = false;
					}
					if(needISBN && prefix == 'ISBN') {
						newItem.setField("ISBN", identifier.substring(5));
						needISBN = false;
					}
					// Keep the call number whose scheme comes earliest in
					// callNumbers; only better-ranked schemes are looked at
					for(var rank=0; rank<bestRank; rank++) {
						if(callNumbers[rank] == prefix) {
							bestRank = rank;
							callNumber = identifier.substring(prefix.length+1);
							break;
						}
					}
				}
				if(callNumber) {
					newItem.setField("callNumber", callNumber);
				}
			}
			
			this._associateRDF(prefixDummy + 'publication', "publication", uri, newItem, typeID);
			this._associateRDF(prefixDummy + 'volume', "volume", uri, newItem, typeID);
			this._associateRDF(prefixDummy + 'number', "number", uri, newItem, typeID);
			this._associateRDF(prefixDummy + 'pages', "pages", uri, newItem, typeID);
			this._associateRDF(prefixDC + 'publisher', "publisher", uri, newItem, typeID);
			this._associateRDF(prefixDC + 'date', "date", uri, newItem, typeID);
			this._associateRDF(prefixDC + 'hasVersion', "edition", uri, newItem, typeID);
			this._associateRDF(prefixDummy + 'series', "series", uri, newItem, typeID);
			this._associateRDF(prefixDummy + 'place', "place", uri, newItem, typeID);
			
			this.items.push(newItem);
		}
	} catch(ex) {
		Scholar.debug('Error in Scholar.Ingester.Document._updateDatabase: '+ex);
	}
}
|
|
@ -1,532 +0,0 @@
|
|||
/*
|
||||
* Scholar.Ingester.MARC_Record.js
|
||||
* Stefano Bargioni, Pontificia Università della Santa Croce - Biblioteca
|
||||
* Trattamento di record MARC in JavaScript
|
||||
*
|
||||
* Original version copyright (C) 2005 Stefano Bargioni, licensed under the LGPL
|
||||
* (Available at http://www.pusc.it/bib/mel/Scholar.Ingester.MARC_Record.js)
|
||||
*
|
||||
* This library is free software; you can redistribute it or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*/
|
||||
|
||||
/*
 * Creates a new, empty MARC record: library version information, a leader
 * with default values, the control characters used as terminators and
 * delimiter, an empty directory and an empty list of variable fields.
 */
Scholar.Ingester.MARC_Record = function() {
	this.VERSIONE = '2.6.6b';
	this.VERSIONE_data ='2005-05-10';
	
	// The leader parts are kept in record order; together they make up
	// 24 characters
	var defaultLeader = {
		record_length:'00000',
		record_status:'n', // acdnp
		type_of_record:' ',
		bibliographic_level:' ',
		type_of_control:' ',
		character_coding_scheme:' ',
		indicator_count:'2',
		subfield_code_length:'2',
		base_address_of_data:'00000',
		encoding_level:' ',
		descriptive_cataloging_form:' ',
		linked_record_requirement:' ',
		entry_map:'4500'
	};
	this.leader = defaultLeader;
	
	this.field_terminator = '\x1E';
	this.record_terminator = '\x1D';
	this.subfield_delimiter = '\x1F';
	this.directory = '';
	// the directory is closed with the same character as a field
	this.directory_terminator = this.field_terminator;
	this.variable_fields = [];
	return this;
}
|
||||
|
||||
/*
 * Loads record _s_, passed in format _f_, into this record
 *
 * s - record text
 * f - 'binary', 'MARC_Harvard', 'MARC_BNI', 'MARC_Loc' or 'MARC_PAC'; for
 *     any other value no fields are read
 *
 * The text formats are read line by line. A leader line resets the record;
 * of the other lines only tags between '008' and '899' (exclusive, and not
 * '040') are kept, and a fresh 005 field is added at the end. The binary
 * format keeps every field listed in its directory and adds no 005 field.
 *
 * Returns this record
 */
Scholar.Ingester.MARC_Record.prototype.load = function(s,f) {
	var record = this;
	var lines, line, parts, tag, ind1, ind2, value, i;
	
	// Copies the leader positions this library keeps from the leader text,
	// puts the fixed positions back to their defaults and empties the
	// directory and the list of fields
	function resetFromLeader(leaderText) {
		var leader = record.leader;
		leader.record_length = '00000';
		leader.record_status = leaderText.substr(5,1);
		leader.type_of_record = leaderText.substr(6,1);
		leader.bibliographic_level = leaderText.substr(7,1);
		leader.type_of_control = leaderText.substr(8,1);
		leader.character_coding_scheme = leaderText.substr(9,1);
		leader.indicator_count = '2';
		leader.subfield_code_length = '2';
		leader.base_address_of_data = '00000';
		leader.encoding_level = leaderText.substr(17,1);
		leader.descriptive_cataloging_form = leaderText.substr(18,1);
		leader.linked_record_requirement = leaderText.substr(19,1);
		leader.entry_map = '4500';
		
		record.directory = '';
		record.directory_terminator = record.field_terminator;
		record.variable_fields = new Array();
	}
	
	// Text formats skip low and high tags (also H03 and the like) and 040
	function isWantedTag(candidate) {
		return candidate > '008' && candidate < '899' && candidate != '040';
	}
	
	if (f == 'binary') {
		resetFromLeader(s);
		
		// loads fields
		var campi = s.split(this.field_terminator);
		// the first part is the leader + directory, the last one is the
		// record terminator; neither is a field
		for (var k=1; k<campi.length-1; k++) {
			// every directory entry is 12 characters and starts with the tag
			tag = campi[0].substr(24+(k-1)*12,3);
			ind1 = ''; ind2 = ''; value = campi[k];
			if (tag.substr(0,2) != '00') {
				ind1 = campi[k].substr(0,1);
				ind2 = campi[k].substr(1,1);
				value = campi[k].substr(2);
			}
			this.add_field(tag,ind1,ind2,value);
		}
	} else if (f == 'MARC_Harvard') {
		lines = s.split('\n');
		for (i=0; i<lines.length; i++) {
			line = this._trim(lines[i]);
			if (line == '') continue; // skips empty lines
			line = line.replace(/ \t/g,'\t');
			line = line.replace(/\xA0/g,' '); // in some browsers, nbsp is copied as xA0
			parts = line.split('|a ');
			if (parts.length == 1) {
				// No "|a " on the line, so there are no indicators to read.
				// They are set to empty explicitly; otherwise the values of
				// the previous line (or undefined) would be passed on.
				tag = line.substr(0,3);
				ind1 = '';
				ind2 = '';
				value = line.substr(4);
			}
			else {
				tag = parts[0].substr(0,3);
				ind1 = parts[0].substr(3,1);
				ind2 = parts[0].substr(4,1);
				value = this._trim(parts[1]);
				value = value.replace(/\|(.) /g,this.subfield_delimiter+'$1');
			}
			if (tag == 'LDR') {
				resetFromLeader(value);
			}
			else if (isWantedTag(tag)) {
				this.add_field(tag,ind1,ind2,value);
			}
		}
		this.add_field_005();
	} else if (f == 'MARC_BNI') {
		lines = s.split('\n');
		for (i=0; i<lines.length; i++) {
			line = this._trim(lines[i]);
			if (line == '') continue; // skips empty lines
			line = line.replace(/\xA0/g,' '); // in some browsers, nbsp is copied as xA0
			line = line.replace(/\|/g,' ');
			line = line.replace(/_/g,' ');
			line = line.replace(/\$/g,this.subfield_delimiter);
			// columns: tag, indicators, value
			parts = line.split('\t');
			tag = parts[0];
			ind1 = parts[1].substr(0,1);
			ind2 = parts[1].substr(1,1);
			value = this._trim(parts[2]);
			if (tag == 'LEA') {
				resetFromLeader(value);
			}
			else if (isWantedTag(tag)) {
				this.add_field(tag,ind1,ind2,value);
			}
		}
		this.add_field_005();
	} else if (f == 'MARC_Loc') { // MARC copied from the browser from the catalog.loc.gov site
		lines = s.split('\n');
		for (i=0; i<lines.length; i++) {
			line = this._trim(lines[i]);
			if (line == '') continue; // skips empty lines
			line = line.replace(/\xA0/g,' '); // in some browsers, nbsp is copied as xA0
			line = line.replace(/_/g,' ');
			line = line.replace(/\t/g,'');
			line = line.replace(/\|(.) /g,this.subfield_delimiter+'$1');
			line = line.replace(/\|/g,this.subfield_delimiter);
			tag = line.substr(0,3);
			ind1 = line.substr(4,1);
			ind2 = line.substr(5,1);
			value = line.substr(7);
			if (tag == '000') {
				// NOTE(review): as written this replaces a space with a
				// space; the pattern may have been damaged - confirm
				line = line.replace(/ /,' ');
				resetFromLeader(line.substr(4));
			}
			else if (isWantedTag(tag)) {
				this.add_field(tag,ind1,ind2,value);
			}
		}
		this.add_field_005();
	} else if (f == 'MARC_PAC') {
		lines = s.split('\n');
		for (i=0; i<lines.length; i++) {
			line = lines[i].replace(/\xA0/g,' '); // in some browsers, nbsp is copied as xA0
			line = line.replace(/_/g,' ');
			line = line.replace(/\t/g,'');
			line = this._trim(line);
			if (line == '') continue; // skips empty lines
			line = line.replace(/\|(.)/g,this.subfield_delimiter+'$1');
			line = line.replace(/\|/g,this.subfield_delimiter);
			tag = line.substr(0,3);
			ind1 = line.substr(4,1);
			ind2 = line.substr(5,1);
			// the text is stored as subfield a
			value = this.subfield_delimiter+'a'+line.substr(7);
			if(line.substr(0, 6) == "LEADER") {
				resetFromLeader(line.substr(7));
			}
			else if (isWantedTag(tag)) {
				this.add_field(tag,ind1,ind2,value);
			}
		}
		this.add_field_005();
	}
	
	this.update_record_length();
	this.update_base_address_of_data();
	return this;
}
|
||||
|
||||
/*
 * Recomputes leader.base_address_of_data from the number of fields and
 * returns the new value (zero-filled to 5 characters)
 */
Scholar.Ingester.MARC_Record.prototype.update_base_address_of_data = function() {
	// 24 for the leader, 12 for each field's directory entry, plus 1
	var baseAddress = 24+this.variable_fields.length*12+1;
	var padded = this._zero_fill(baseAddress,5);
	this.leader.base_address_of_data = padded;
	return this.leader.base_address_of_data;
}
|
||||
|
||||
/*
 * Rebuilds the directory from the list of fields: one entry per field made
 * of the tag, the field length (4 characters) and the offset of the field
 * (5 characters). Always returns true.
 */
Scholar.Ingester.MARC_Record.prototype.update_displacements = function() {
	var entries = [];
	var offset = 0;
	for (var n=0; n<this.variable_fields.length; n++) {
		var field = this.variable_fields[n];
		// value and both indicators, plus 1
		var fieldLength = field.value.length + 1 + field.ind1.length + field.ind2.length;
		entries.push(field.tag + this._zero_fill(fieldLength,4) + this._zero_fill(offset,5));
		offset += fieldLength;
	}
	this.directory = entries.join('');
	return true;
}
|
||||
/*
 * Recomputes leader.record_length (zero-filled to 5 characters) from the
 * current directory and fields
 */
Scholar.Ingester.MARC_Record.prototype.update_record_length = function() {
	var fieldsLength = 0;
	for (var n=0; n<this.variable_fields.length; n++) {
		var field = this.variable_fields[n];
		fieldsLength += field.ind1.length+field.ind2.length+field.value.length + 1;
	}
	// 24 for the leader, the directory plus 1, the fields plus 1
	var recordLength = 24+this.directory.length+1+fieldsLength+1;
	this.leader.record_length = this._zero_fill(recordLength,5);
}
|
||||
|
||||
/*
 * Sorts the directory entries, and sorts the list of fields by tag and,
 * within the same tag, by occurrence number. Always returns true.
 */
Scholar.Ingester.MARC_Record.prototype.sort_directory = function() {
	// sorting of the directory
	if (this.directory.length <= 12) { return true; } // already sorted
	var directory_entries = new Array();
	for (var i=0; i<this.directory.length; i=i+12) {
		directory_entries[directory_entries.length] = this.directory.substr(i,12);
	}
	directory_entries.sort();
	this.directory = directory_entries.join('');
	
	// sorts array variable_fields
	// The tag decides first and the occurrence only breaks ties; adding the
	// two differences together would not give a consistent ordering. Tags
	// are compared as strings, like the directory entries above, so tags
	// that are not numeric are ordered as well.
	this.variable_fields.sort(function(a,b) {
		if (a.tag != b.tag) {
			return (a.tag < b.tag) ? -1 : 1;
		}
		return a.occ - b.occ;
	});
	return true;
}
|
||||
|
||||
/*
 * Returns the leader as one string: the values of this.leader joined in
 * enumeration order
 */
Scholar.Ingester.MARC_Record.prototype.show_leader = function() {
	var parts = [];
	for (var key in this.leader) {
		parts.push(String(this.leader[key]));
	}
	return parts.join('');
}
|
||||
|
||||
/*
 * Returns all fields as one string: for each field its two indicators and
 * its value, followed by the field terminator
 */
Scholar.Ingester.MARC_Record.prototype.show_fields = function() {
	var output = '';
	var count = this.variable_fields.length;
	var n = 0;
	while (n < count) {
		var field = this.variable_fields[n];
		output = output + (field.ind1 + field.ind2 + field.value + this.field_terminator);
		n++;
		// re-read the length, as the loop condition is evaluated each round
		count = this.variable_fields.length;
	}
	return output;
}
|
||||
|
||||
/*
 * Returns the directory in readable form: one line per 12-character entry,
 * split into its 3, 4 and 5 character parts separated by spaces
 */
Scholar.Ingester.MARC_Record.prototype.show_directory = function() {
	var rows = [];
	for (var pos = 0; pos<this.directory.length; pos+=12) {
		var tagPart = this.directory.substr(pos,3);
		var lengthPart = this.directory.substr(pos+3,4);
		var offsetPart = this.directory.substr(pos+7,5);
		rows.push(tagPart + ' ' + lengthPart + ' ' + offsetPart + '\n');
	}
	return rows.join('');
}
|
||||
|
||||
/*
 * Adds a 005 field holding the current local date and time (year, month,
 * day, hours, minutes, seconds, followed by '.0') and returns that value
 */
Scholar.Ingester.MARC_Record.prototype.add_field_005 = function() {
	var date = new Date();
	var stamp = date.getFullYear() + this._zero_fill(date.getMonth()+1,2);
	stamp = stamp + this._zero_fill(date.getDate(),2);
	stamp = stamp + this._zero_fill(date.getHours(),2);
	stamp = stamp + this._zero_fill(date.getMinutes(),2);
	stamp = stamp + this._zero_fill(date.getSeconds(),2);
	stamp = stamp + '.0';
	this.add_field('005','','',stamp);
	return stamp;
}
|
||||
|
||||
/*
 * Returns the number of fields in this record whose tag equals _tag_
 */
Scholar.Ingester.MARC_Record.prototype.count_occ = function(tag) {
	var occurrences = 0;
	var fields = this.variable_fields;
	for (var n=0; n<fields.length; n++) {
		if (fields[n].tag != tag) continue;
		occurrences++;
	}
	return occurrences;
}
|
||||
|
||||
/*
 * Returns true if this record has at least one field with tag _tag_,
 * false otherwise
 */
Scholar.Ingester.MARC_Record.prototype.exists = function(tag) {
	return this.count_occ(tag) > 0;
}
|
||||
|
||||
/*
 * Constructor for a new MARC field
 *
 * rec   - record the field is meant for; only used to work out the
 *         occurrence number of the field
 * tag   - field tag
 * ind1  - first indicator; an empty one is stored as a blank
 * ind2  - second indicator; an empty one is stored as a blank
 * value - field content
 *
 * Fields whose tag starts with '00' carry no indicators at all.
 */
Scholar.Ingester.MARC_Record.prototype.MARC_field = function(rec,tag,ind1,ind2,value) {
	this.tag = tag;
	this.occ = rec.count_occ(tag)+1; // occurrence order no.
	
	var hasIndicators = (tag.substr(0,2) != '00');
	if (!hasIndicators) {
		this.ind1 = '';
		this.ind2 = '';
	} else {
		this.ind1 = (ind1 == '') ? ' ' : ind1;
		this.ind2 = (ind2 == '') ? ' ' : ind2;
	}
	
	this.value = value;
	return this;
}
|
||||
|
||||
/*
 * Returns this record as a string in format _type_ (case-insensitive):
 *   'binary'   - leader, directory and fields with their terminators
 *   'html'     - an HTML table with one row per field
 *   'xml'      - a MARC21 slim XML collection holding one record
 *   'xml-html' - the 'xml' output broken into lines, escaped and marked up
 *                with span elements so it can be shown inside a page
 * Returns false for any other type.
 */
Scholar.Ingester.MARC_Record.prototype.display = function(type) {
	var s, i, ind1, ind2;
	type = type.toLowerCase();
	if (type == 'binary') return this.show_leader() +
	                             this.directory +
	                             this.field_terminator +
	                             this.show_fields() +
	                             this.record_terminator;
	if (type == 'html') {
		s = '<table class="record_table">';
		// the leader belongs to this record; there is no global record
		// object to take it from
		var l = this.show_leader();
		s += '<tr><td class="tag">000</td><td class="ind"></td><td class="ind"></td><td class="record_value">'+l+'</td></tr>';
		for (i=0; i<this.variable_fields.length; i++) {
			// a blank indicator would collapse in HTML, so it is shown as a
			// non-breaking space
			ind1 = this.variable_fields[i].ind1; if (ind1 == ' ') { ind1 = '&nbsp;'; }
			ind2 = this.variable_fields[i].ind2; if (ind2 == ' ') { ind2 = '&nbsp;'; }
			s += '<tr>';
			s += '<td class="tag">'+this.variable_fields[i].tag+'</td>';
			s += '<td class="ind">'+ind1+'</td>';
			s += '<td class="ind">'+ind2+'</td>';
			var v = this.variable_fields[i].value;
			// keep the blanks of the 008 field visible as well
			if (this.variable_fields[i].tag == '008') v = v.replace(/ /g,'&nbsp;');
			s += '<td class="record_value">'+this._ddagger(v)+'</td>';
			s += '</tr>';
		}
		s += '</table>';
		return s;
	}
	if (type == 'xml') {
		s = '';
		s += '<?xml version="1.0" encoding="iso-8859-1"?><collection xmlns="http://www.loc.gov/MARC21/slim"><record>';
		s += '<leader>'+this.show_leader()+'</leader>';
		for (i=0; i<this.variable_fields.length; i++) {
			ind1 = this.variable_fields[i].ind1; if (ind1 != '') ind1 = ' ind1="'+ind1+'"';
			ind2 = this.variable_fields[i].ind2; if (ind2 != '') ind2 = ' ind2="'+ind2+'"';
			if (this.variable_fields[i].tag.substr(0,2) == '00') s += '<controlfield tag="'+this.variable_fields[i].tag+'">'+this.variable_fields[i].value+'</controlfield>';
			else {
				var subfields = this.variable_fields[i].value.split(this.subfield_delimiter);
				// a value without any delimiter is written as one subfield
				// with code '?'
				if (subfields.length == 1) subfields[1] = '?'+this.variable_fields[i].value;
				var sf = '';
				for (var j=1; j<subfields.length; j++) {
					sf += '<subfield code="'+subfields[j].substr(0,1)+'">'+subfields[j].substr(1)+'</subfield>';
				}
				s += '<datafield tag="' + this.variable_fields[i].tag + '"' + ind1 + ind2 + '>' + sf + '</datafield>';
			}
		}
		s += '</record></collection>';
		return s;
	}
	if (type == 'xml-html') {
		s = this.display('xml');
		// prettifying: line breaks and indentation
		s = s.replace(/\<leader\>/,'\n <leader>');
		s = s.replace(/\<controlfield/g,'\n <controlfield');
		s = s.replace(/\<datafield/g,'\n <datafield');
		s = s.replace(/\<collection/g,'\n<collection');
		s = s.replace(/\<record/g,'\n<record');
		s = s.replace(/\<\/datafield/g,'\n </datafield');
		s = s.replace(/\<\/collection/g,'\n</collection');
		s = s.replace(/\<\/record/g,'\n</record');
		s = s.replace(/\<subfield/g,'\n <subfield');
		s = s.replace(/\x1F/g,'%1F'); s = this._ddagger(s);
		// escape the characters < and > so the markup is shown, not
		// interpreted
		s = s.replace(/\</g,'&lt;');
		s = s.replace(/\>/g,'&gt;');
		// colour the keywords
		s = s.replace(/(controlfield|datafield|collection|record|leader|subfield)/g,'<span class="cdfield">$1</span>');
		s = s.replace(/(tag|code|ind1|ind2)=/g,'<span class="attrib">$1=</span>');
		return s;
	}
	return false;
}
|
||||
|
||||
/*
 * Returns an array with one string per occurrence of _tag_: the field's
 * two indicators followed by its value. The array is empty if the record
 * has no such field.
 */
Scholar.Ingester.MARC_Record.prototype.get_field = function(tag) {
	var matches = [];
	for (var n=0; n<this.variable_fields.length; n++) {
		var field = this.variable_fields[n];
		if (field.tag != tag) continue;
		matches.push(field.ind1 + field.ind2 + field.value);
	}
	return matches;
}
|
||||
|
||||
// This function added by Simon Kornblith
|
||||
/*
 * Splits every occurrence of the given tag into its subfields.
 *
 * Returns an array with one object per occurrence. Each object maps a
 * one-character subfield code to that subfield's text. When an occurrence
 * contains no subfield delimiter, the whole string returned by get_field()
 * (indicators included) is stored under the key '?'.
 *
 * If a subfield code repeats within one occurrence, the last one wins.
 */
Scholar.Ingester.MARC_Record.prototype.get_field_subfields = function(tag) {
	var occurrences = this.get_field(tag);
	var result = [];
	// Indexed loop rather than for...in: for...in would also visit any
	// enumerable property that happens to be added to Array.prototype.
	for (var i = 0; i < occurrences.length; i++) {
		var raw = occurrences[i];
		var pieces = raw.split(this.subfield_delimiter);
		var parsed = {};
		if (pieces.length == 1) {
			parsed['?'] = raw;
		} else {
			// pieces[0] is whatever precedes the first delimiter (the
			// indicators prepended by get_field) and is not a subfield
			for (var j = 1; j < pieces.length; j++) {
				parsed[pieces[j].substr(0, 1)] = pieces[j].substr(1);
			}
		}
		result[i] = parsed;
	}
	return result;
}
|
||||
|
||||
/*
 * Adds a field to the record.
 *
 * Returns false when the tag is not exactly three characters long;
 * otherwise returns the newly created MARC_field object.
 */
Scholar.Ingester.MARC_Record.prototype.add_field = function(tag, ind1, ind2, value) {
	if (tag.length != 3) {
		return false;
	}

	var field = new this.MARC_field(this, tag, ind1, ind2, value);

	// register the field in the list of variable fields
	this.variable_fields.push(field);

	// append a directory entry: tag, zero-padded 4-digit length, and a
	// placeholder offset of '00000'
	var entryLength = field.ind1.length + field.ind2.length + field.value.length + 1;
	this.directory += field.tag + this._zero_fill(entryLength, 4) + '00000';

	// keep the directory ordered, then refresh the derived lengths
	this.sort_directory();
	this.update_base_address_of_data();
	this.update_displacements();
	this.update_record_length();

	return field;
}
|
||||
|
||||
/*
 * Removes one occurrence of a field from the record.
 *
 * The field is matched on both its tag and its occ property. Returns
 * false when no such field exists, true once the field and its directory
 * entry have been removed and the record lengths refreshed.
 */
Scholar.Ingester.MARC_Record.prototype.delete_field = function(tag, occurrence) {
	// locate the occurrence in variable_fields
	var fields = this.variable_fields;
	var pos = 0;
	while (pos < fields.length
			&& !(fields[pos].tag == tag && fields[pos].occ == occurrence)) {
		pos++;
	}
	if (pos == fields.length) {
		return false;	// field not found
	}

	// drop it, shifting the following fields down by one
	fields.splice(pos, 1);

	// locate the matching directory entry (entries are 12 characters wide;
	// it must exist, and removing it does not disturb the sort order)
	// NOTE(review): the occurrence test runs on every entry, not only on
	// entries whose tag matches -- confirm this is intended.
	var seen = 0;
	var offset;
	for (offset = 0; offset < this.directory.length; offset += 12) {
		if (this.directory.substr(offset, 3) == tag) {
			seen++;
		}
		if (occurrence == seen) {
			break;
		}
	}
	if (offset >= this.directory.length) {
		alert('Internal error!');
	}
	this.directory = this.directory.substr(0, offset) + this.directory.substr(offset + 12);

	// refresh the derived lengths
	this.update_base_address_of_data();
	this.update_displacements();
	this.update_record_length();

	return true;
}
|
||||
|
||||
/*
 * Prepares subfield markers for HTML display: each "%1F" sequence or raw
 * 0x1F character, together with the character that follows it, is replaced
 * by a <span> holding a double dagger and that character.
 */
Scholar.Ingester.MARC_Record.prototype._ddagger = function(s) {
	var markup = "<span class=\"this._ddagger\">‡$1</span>";
	// the escaped form is handled first, then the raw control character
	return s.replace(/\%1F(.)/g, markup).replace(/\x1F(.)/g, markup);
}
|
||||
|
||||
/*
 * Returns s with whitespace removed from both ends.
 */
Scholar.Ingester.MARC_Record.prototype._trim = function(s) {
	// trailing whitespace first, then leading
	return s.replace(/\s+$/,'').replace(/^\s+/,'');
}
|
||||
|
||||
/*
 * Left-pads s with '0' characters and returns the last l characters of the
 * result. Only fifteen zeros are prepended, so l is meant to be <= 15.
 */
Scholar.Ingester.MARC_Record.prototype._zero_fill = function(s, l) {
	var padded = '000000000000000' + s;
	return padded.substr(padded.length - l, l);
}
|
||||
|
||||
/*
 * Returns a display string made of the product name, this.VERSIONE and,
 * in parentheses, this.VERSIONE_data.
 */
Scholar.Ingester.MARC_Record.prototype.version = function() {
	var release = this.VERSIONE;
	var releaseDate = this.VERSIONE_data;
	return 'MARC Editor Lite ' + release + ' (' + releaseDate + ')';
}
|
File diff suppressed because it is too large
Load diff
|
@ -82,19 +82,29 @@ Scholar.Utilities.prototype.dateToISO = function(jsDate) {
|
|||
/*
|
||||
* Cleans extraneous punctuation off an author name
|
||||
*/
|
||||
/*
 * author   - raw author string
 * type     - passed through unchanged as creatorType in the result
 * useComma - when true, the name is read as "Last, First"; otherwise the
 *            last space-separated word is taken as the last name
 *
 * Returns {firstName, lastName, creatorType}. firstName is an empty string
 * when the name has only one part.
 */
Scholar.Utilities.prototype.cleanAuthor = function(author, type, useComma) {
	// strip stray punctuation and whitespace from both ends
	author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
	author = author.replace(/[\s\,\/\[\]\:\.]+$/, '');
	// collapse every run of spaces, not just the first one
	author = author.replace(/ +/g, ' ');

	var firstName = '';
	var lastName;

	if(useComma) {
		// Add period for initials: the trailing-punctuation strip above
		// removes the period from a name ending in " X."
		if(author.substr(author.length-2, 1) == " ") {
			author += ".";
		}
		var splitNames = author.split(', ');
		if(splitNames.length > 1) {
			lastName = splitNames[0];
			firstName = splitNames[1];
		} else {
			// no comma present: treat the whole string as the last name
			lastName = author;
		}
	} else {
		var spaceIndex = author.lastIndexOf(" ");
		lastName = author.substring(spaceIndex+1);
		firstName = author.substring(0, spaceIndex);
	}

	// TODO: take type into account
	return {firstName:firstName, lastName:lastName, creatorType:type};
}
|
||||
|
||||
/*
|
||||
|
@ -141,7 +151,7 @@ Scholar.Utilities.prototype.getVersion = function() {
|
|||
/*
|
||||
* Get a page range, given a user-entered set of pages
|
||||
*/
|
||||
Scholar.Utilities.prototype._pageRangeRegexp = /^\s*([0-9]+)-([0-9]+)\s*$/
|
||||
Scholar.Utilities.prototype._pageRangeRegexp = /^\s*([0-9]+)-([0-9]+)\s*$/;
|
||||
Scholar.Utilities.prototype.getPageRange = function(pages) {
|
||||
var pageNumbers;
|
||||
var m = this._pageRangeRegexp.exec(pages);
|
||||
|
@ -155,8 +165,21 @@ Scholar.Utilities.prototype.getPageRange = function(pages) {
|
|||
return pageNumbers;
|
||||
}
|
||||
|
||||
/*
|
||||
* provide inArray function
|
||||
*/
|
||||
Scholar.Utilities.prototype.inArray = Scholar.inArray;
|
||||
|
||||
/*
|
||||
* pads a number or other string with a given string on the left
|
||||
*/
|
||||
/*
 * string - value to pad; numbers are converted to strings first
 * pad    - string prepended repeatedly
 * length - minimum length of the result
 *
 * Always returns a string. When pad is longer than one character the
 * result may exceed length, because pad is only ever prepended whole.
 */
Scholar.Utilities.prototype.lpad = function(string, pad, length) {
	// a number has no .length, so without this conversion it would be
	// returned unpadded
	var padded = String(string);
	var filler = String(pad);
	// an empty filler can never lengthen the string; bail out instead of
	// looping forever
	if(filler.length == 0) {
		return padded;
	}
	while(padded.length < length) {
		padded = filler + padded;
	}
	return padded;
}
|
||||
|
||||
/*
|
||||
* END SCHOLAR FOR FIREFOX EXTENSIONS
|
||||
*/
|
||||
|
@ -169,10 +192,8 @@ Scholar.Utilities.prototype.inArray = Scholar.inArray;
|
|||
// Scholar.Utilities.Ingester extends Scholar.Utilities, offering additional
|
||||
// classes relating to data extraction specifically from HTML documents.
|
||||
|
||||
Scholar.Utilities.Ingester = function(myWindow, proxiedURL, isHidden) {
|
||||
this.window = myWindow;
|
||||
Scholar.Utilities.Ingester = function(proxiedURL) {
|
||||
this.proxiedURL = proxiedURL;
|
||||
this.isHidden = isHidden;
|
||||
}
|
||||
|
||||
Scholar.Utilities.Ingester.prototype = new Scholar.Utilities();
|
||||
|
@ -240,21 +261,6 @@ Scholar.Utilities.Ingester.prototype.getNodeString = function(doc, contextNode,
|
|||
return returnVar;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allows a user to select which items to scrape
|
||||
*/
|
||||
Scholar.Utilities.Ingester.prototype.selectItems = function(itemList) {
	// nothing can be shown when running hidden
	if(this.isHidden == true) {
		return null;
	}

	// The item list goes in through io.dataIn; io.dataOut is returned once
	// the dialog (opened with the "modal" feature) has been dismissed.
	// presumably the dialog fills in io.dataOut -- verify in selectitems.xul
	var io = { dataIn:itemList, dataOut:null };
	this.window.openDialog("chrome://scholar/content/ingester/selectitems.xul",
		"_blank","chrome,modal,centerscreen,resizable=yes", io);
	return io.dataOut;
}
|
||||
|
||||
/*
|
||||
* Grabs items based on URLs
|
||||
*/
|
||||
|
@ -300,129 +306,19 @@ Scholar.Utilities.Ingester.prototype.getItemArray = function(doc, inHere, urlRe,
|
|||
return availableItems;
|
||||
}
|
||||
|
||||
// These functions are for use by importMARCRecord. They're private, because,
|
||||
// while they are useful, it's also nice if as many of our scrapers as possible
|
||||
// are PiggyBank compatible, and if our scrapers used functions, that would
|
||||
// break compatibility
|
||||
/*
 * Strips whitespace and the punctuation . , / [ ] : from both ends of the
 * string, then replaces the first run of spaces with a single space.
 */
Scholar.Utilities.Ingester.prototype._MARCCleanString = function(author) {
	return author
		.replace(/^[\s\.\,\/\[\]\:]+/, '')
		.replace(/[\s\.\,\/\[\]\:]+$/, '')
		.replace(/ +/, ' ');
}
|
||||
|
||||
/*
 * Strips whitespace and the punctuation . , / [ ] : from both ends of the
 * string and returns everything up to (not including) the first space.
 */
Scholar.Utilities.Ingester.prototype._MARCCleanNumber = function(author) {
	var stripped = author
		.replace(/^[\s\.\,\/\[\]\:]+/, '')
		.replace(/[\s\.\,\/\[\]\:]+$/, '');
	var firstToken = /^[^ ]*/.exec(stripped);
	return firstToken ? firstToken[0] : undefined;
}
|
||||
/*
 * Returns the first run of digits found in text, or undefined when the
 * text contains no digit.
 */
Scholar.Utilities.Ingester.prototype._MARCPullYear = function(text) {
	var digits = /[0-9]+/.exec(text);
	return digits ? digits[0] : undefined;
}
|
||||
|
||||
/*
 * Adds one statement to model for each occurrence of a MARC field.
 *
 * record  - object providing get_field_subfields()
 * uri     - subject passed to model.addStatement()
 * model   - object providing addStatement(); returned to the caller
 * fieldNo - MARC tag to look up
 * rdfUri  - predicate passed to model.addStatement()
 * execMe  - optional function applied to the assembled value
 * prefix  - optional string prepended to the value after execMe has run
 * part    - subfield codes to concatenate (space-separated), one character
 *           per code; defaults to 'a'
 */
Scholar.Utilities.Ingester.prototype._MARCAssociateField = function(record, uri, model, fieldNo, rdfUri, execMe, prefix, part) {
	if(!part) {
		part = 'a';
	}

	var field = record.get_field_subfields(fieldNo);
	if(!field) {
		return model;
	}
	Scholar.debug('Found '+field.length+' matches for '+fieldNo+part);

	for(var i=0; i<field.length; i++) {
		// Start from scratch for every occurrence. A bare "var value;"
		// inside the loop is hoisted and would keep the previous
		// occurrence's (already prefixed) value.
		var value = null;
		for(var j=0; j<part.length; j++) {
			var piece = field[i][part.substr(j, 1)];
			if(piece) {
				value = value ? value + " " + piece : piece;
			}
		}
		if(!value) {
			continue;
		}
		if(execMe) {
			value = execMe(value);
		}
		if(prefix) {
			value = prefix + value;
		}
		model.addStatement(uri, rdfUri, value);
	}
	return model;
}
|
||||
|
||||
// This is an extension to PiggyBank's architecture. It's here so that we don't
|
||||
// need an enormous library for each scraper that wants to use MARC records
|
||||
/*
 * Copies bibliographic data from a MARC record into model as statements
 * about uri: identifiers, creators and contributors, title, edition,
 * place, publisher, year, series and call numbers. The item is always
 * typed as a book. Nothing is returned; model is modified in place.
 */
Scholar.Utilities.Ingester.prototype.importMARCRecord = function(record, uri, model) {
	var prefixDC = 'http://purl.org/dc/elements/1.1/';
	var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
	var prefixRDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';

	// Extract ISBNs
	model = this._MARCAssociateField(record, uri, model, '020', prefixDC + 'identifier', this._MARCCleanNumber, 'ISBN ');
	// Extract ISSNs
	model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN ');
	// Extract creators
	model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this.cleanAuthor);
	model = this._MARCAssociateField(record, uri, model, '110', prefixDummy + 'corporateCreator', this._MARCCleanString);
	model = this._MARCAssociateField(record, uri, model, '111', prefixDummy + 'corporateCreator', this._MARCCleanString);
	model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this.cleanAuthor);
	model = this._MARCAssociateField(record, uri, model, '710', prefixDummy + 'corporateContributor', this._MARCCleanString);
	model = this._MARCAssociateField(record, uri, model, '711', prefixDummy + 'corporateContributor', this._MARCCleanString);

	var statements = model.data[uri];
	var hasCreator = statements && (statements[prefixDC + 'creator']
		|| statements[prefixDC + 'contributor']
		|| statements[prefixDummy + 'corporateCreator']
		|| statements[prefixDummy + 'corporateContributor']);
	if(!hasCreator) {
		// some LOC entries have no listed author, but have the author in
		// the person subject field as the first entry
		var field = record.get_field_subfields('600');
		// a 600 field without subfield a would otherwise hand undefined
		// to cleanAuthor, which calls .replace() on its argument
		if(field[0] && field[0]['a']) {
			model.addStatement(uri, prefixDC + 'creator', this.cleanAuthor(field[0]['a']));
		}
	}

	// Extract title
	model = this._MARCAssociateField(record, uri, model, '245', prefixDC + 'title', this._MARCCleanString, '', 'ab');
	// Extract edition
	model = this._MARCAssociateField(record, uri, model, '250', prefixDC + 'hasVersion', this._MARCCleanString);
	// Extract place info
	model = this._MARCAssociateField(record, uri, model, '260', prefixDummy + 'place', this._MARCCleanString, '', 'a');
	// Extract publisher info
	model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'publisher', this._MARCCleanString, '', 'b');
	// Extract year
	model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'year', this._MARCPullYear, '', 'c');
	// Extract series
	model = this._MARCAssociateField(record, uri, model, '440', prefixDummy + 'series', this._MARCCleanString);
	// Extract call number
	model = this._MARCAssociateField(record, uri, model, '050', prefixDC + 'identifier', this._MARCCleanString, 'LCC ', 'ab');
	model = this._MARCAssociateField(record, uri, model, '060', prefixDC + 'identifier', this._MARCCleanString, 'NLM ', 'ab');
	model = this._MARCAssociateField(record, uri, model, '070', prefixDC + 'identifier', this._MARCCleanString, 'NAL ', 'ab');
	model = this._MARCAssociateField(record, uri, model, '080', prefixDC + 'identifier', this._MARCCleanString, 'UDC ', 'ab');
	model = this._MARCAssociateField(record, uri, model, '082', prefixDC + 'identifier', this._MARCCleanString, 'DDC ', 'a');
	model = this._MARCAssociateField(record, uri, model, '084', prefixDC + 'identifier', this._MARCCleanString, 'CN ', 'ab');

	// Set type
	model.addStatement(uri, prefixRDF + 'type', prefixDummy + "book", true);
}
|
||||
|
||||
/*
|
||||
* END SCHOLAR FOR FIREFOX EXTENSIONS
|
||||
*/
|
||||
|
||||
// Ingester adapters for Scholar.Utilities.HTTP to handle proxies
|
||||
|
||||
/*
 * Loads a single URL through Scholar.Utilities.HTTP.processDocuments,
 * first translating it with ProxyMonitor.properToProxy when this instance
 * has a proxiedURL set.
 *
 * succeeded and failed are handed to processDocuments as its processor and
 * exception callbacks; the "done" callback is a no-op.
 */
Scholar.Utilities.Ingester.prototype.loadDocument = function(url, succeeded, failed) {
	var target = this.proxiedURL
		? Scholar.Ingester.ProxyMonitor.properToProxy(url)
		: url;
	Scholar.Utilities.HTTP.processDocuments(null, [ target ], succeeded, function() {}, failed);
}
|
||||
Scholar.Utilities.Ingester.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
|
||||
Scholar.Utilities.Ingester.prototype.processDocuments = function(firstDoc, urls, processor, done, exception) {
|
||||
for(i in urls) {
|
||||
urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]);
|
||||
}
|
||||
|
@ -476,6 +372,7 @@ Scholar.Utilities.HTTP = new function() {
|
|||
* in our code, is required for compatibility with the Piggy Bank project
|
||||
**/
|
||||
function doGet(url, callback1, callback2) {
|
||||
Scholar.debug("HTTP GET "+url);
|
||||
if (this.browserIsOffline()){
|
||||
return false;
|
||||
}
|
||||
|
@ -508,6 +405,7 @@ Scholar.Utilities.HTTP = new function() {
|
|||
* in our code, is required for compatibility with the Piggy Bank project
|
||||
**/
|
||||
function doPost(url, body, callback1, callback2) {
|
||||
Scholar.debug("HTTP POST "+body+" to "+url);
|
||||
if (this.browserIsOffline()){
|
||||
return false;
|
||||
}
|
||||
|
@ -538,6 +436,7 @@ Scholar.Utilities.HTTP = new function() {
|
|||
* in our code, is required for compatibility with the Piggy Bank project
|
||||
**/
|
||||
function doOptions(url, body, callback1, callback2) {
|
||||
Scholar.debug("HTTP OPTIONS "+url);
|
||||
if (this.browserIsOffline()){
|
||||
return false;
|
||||
}
|
||||
|
@ -641,7 +540,6 @@ Scholar.Utilities.HTTP.processDocuments = function(firstDoc, urls, processor, do
|
|||
.hiddenDOMWindow;
|
||||
var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(myWindow);
|
||||
var prevUrl, url;
|
||||
Scholar.debug("processDocuments called");
|
||||
|
||||
try {
|
||||
if (urls.length == 0) {
|
||||
|
@ -690,14 +588,11 @@ Scholar.Utilities.HTTP.processDocuments = function(firstDoc, urls, processor, do
|
|||
}
|
||||
};
|
||||
var init = function() {
|
||||
Scholar.debug("init called");
|
||||
hiddenBrowser.addEventListener("load", onLoad, true);
|
||||
|
||||
if (firstDoc) {
|
||||
Scholar.debug("processing");
|
||||
processor(firstDoc, doLoad);
|
||||
} else {
|
||||
Scholar.debug("doing load");
|
||||
doLoad();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -45,10 +45,6 @@ Cc["@mozilla.org/moz/jssubscript-loader;1"]
|
|||
Cc["@mozilla.org/moz/jssubscript-loader;1"]
|
||||
.getService(Ci.mozIJSSubScriptLoader)
|
||||
.loadSubScript("chrome://scholar/content/xpcom/translate.js");
|
||||
|
||||
Cc["@mozilla.org/moz/jssubscript-loader;1"]
|
||||
.getService(Ci.mozIJSSubScriptLoader)
|
||||
.loadSubScript("chrome://scholar/content/xpcom/marc.js");
|
||||
|
||||
Cc["@mozilla.org/moz/jssubscript-loader;1"]
|
||||
.getService(Ci.mozIJSSubScriptLoader)
|
||||
|
|
5124
scrapers.sql
5124
scrapers.sql
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue