- Small changes to MARC record support
- Implemented loadDocument API, for loading and parsing the DOMs of HTML documents in the background
- Added scraper code to SVN repository (now includes 12 scrapers; see Writeboard for details)

To update to the latest versions of all scrapers, ensure you have an up-to-date version of sqlite3, then run:

    sqlite3 ~/Library/Application\ Support/Firefox/Profiles/profileName/scholar.sqlite < scrapers.sql
This commit is contained in:
parent
6c55e63eab
commit
152c9bf9e7
5 changed files with 1205 additions and 85 deletions
|
@ -35,6 +35,7 @@ Scholar.Ingester.Interface.init = function() {
|
||||||
*/
|
*/
|
||||||
Scholar.Ingester.Interface.chromeLoad = function() {
|
Scholar.Ingester.Interface.chromeLoad = function() {
|
||||||
Scholar.Ingester.Interface.tabBrowser = document.getElementById("content");
|
Scholar.Ingester.Interface.tabBrowser = document.getElementById("content");
|
||||||
|
Scholar.Ingester.Interface.hiddenBrowser = document.getElementById("scholar-hidden-browser");
|
||||||
Scholar.Ingester.Interface.appContent = document.getElementById("appcontent");
|
Scholar.Ingester.Interface.appContent = document.getElementById("appcontent");
|
||||||
Scholar.Ingester.Interface.statusImage = document.getElementById("scholar-status-image");
|
Scholar.Ingester.Interface.statusImage = document.getElementById("scholar-status-image");
|
||||||
|
|
||||||
|
@ -189,7 +190,7 @@ Scholar.Ingester.Interface._setDocument = function(browser) {
|
||||||
browser.setAttribute("scholar-key", key);
|
browser.setAttribute("scholar-key", key);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Scholar.Ingester.Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser);
|
Scholar.Ingester.Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser, Scholar.Ingester.Interface.hiddenBrowser);
|
||||||
Scholar.Ingester.Interface.browserDocuments[key].retrieveScraper();
|
Scholar.Ingester.Interface.browserDocuments[key].retrieveScraper();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -19,4 +19,7 @@
|
||||||
<image id="scholar-status-image" width="16" height="16" onclick="Scholar.Ingester.Interface.scrapeThisPage()" />
|
<image id="scholar-status-image" width="16" height="16" onclick="Scholar.Ingester.Interface.scrapeThisPage()" />
|
||||||
</statusbarpanel>
|
</statusbarpanel>
|
||||||
</statusbar>
|
</statusbar>
|
||||||
|
<box style="visibility: collapse">
|
||||||
|
<browser id="scholar-hidden-browser" />
|
||||||
|
</box>
|
||||||
</overlay>
|
</overlay>
|
||||||
|
|
|
@ -48,7 +48,9 @@ Scholar.Ingester.Model.prototype.detachRepository = function() {}
|
||||||
/////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////
|
||||||
// Scholar.Ingester.Utilities class, a set of methods to assist in data
|
// Scholar.Ingester.Utilities class, a set of methods to assist in data
|
||||||
// extraction. Most code here was stolen directly from the Piggy Bank project.
|
// extraction. Most code here was stolen directly from the Piggy Bank project.
|
||||||
Scholar.Ingester.Utilities = function() {}
|
Scholar.Ingester.Utilities = function(hiddenBrowser) {
|
||||||
|
this.hiddenBrowser = hiddenBrowser;
|
||||||
|
}
|
||||||
|
|
||||||
// Adapter for Piggy Bank function to print debug messages; log level is
|
// Adapter for Piggy Bank function to print debug messages; log level is
|
||||||
// fixed at 4 (could change this)
|
// fixed at 4 (could change this)
|
||||||
|
@ -99,6 +101,7 @@ Scholar.Ingester.Utilities.prototype.gatherElementsOnXPath = function(doc, paren
|
||||||
// Loads a single document for a scraper, running succeeded() on success or
|
// Loads a single document for a scraper, running succeeded() on success or
|
||||||
// failed() on failure
|
// failed() on failure
|
||||||
Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succeeded, failed) {
|
Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succeeded, failed) {
|
||||||
|
Scholar.debug("loadDocument called");
|
||||||
this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed);
|
this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -112,6 +115,9 @@ Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succe
|
||||||
// exception - a function to execute if an exception occurs (exceptions are
|
// exception - a function to execute if an exception occurs (exceptions are
|
||||||
// also logged in the Firefox Scholar log)
|
// also logged in the Firefox Scholar log)
|
||||||
Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
|
Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
|
||||||
|
var hiddenBrowser = this.hiddenBrowser;
|
||||||
|
Scholar.debug("processDocuments called");
|
||||||
|
|
||||||
try {
|
try {
|
||||||
if (urls.length == 0) {
|
if (urls.length == 0) {
|
||||||
if (firstDoc) {
|
if (firstDoc) {
|
||||||
|
@ -128,53 +134,51 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD
|
||||||
if (urlIndex < urls.length) {
|
if (urlIndex < urls.length) {
|
||||||
try {
|
try {
|
||||||
var url = urls[urlIndex];
|
var url = urls[urlIndex];
|
||||||
var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");
|
Scholar.debug("loading "+url);
|
||||||
b.loadURI(url);
|
hiddenBrowser.loadURI(url);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
exception(e);
|
|
||||||
Scholar.debug("Scholar.Ingester.Utilities.processDocuments doLoad: " + e, 2);
|
Scholar.debug("Scholar.Ingester.Utilities.processDocuments doLoad: " + e, 2);
|
||||||
|
exception(e);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
window.setTimeout(done, 10);
|
hiddenBrowser.setTimeout(done, 10);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
var onLoad = function() {
|
var onLoad = function() {
|
||||||
try {
|
Scholar.debug("onLoad called");
|
||||||
var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser").selectedBrowser;
|
if(hiddenBrowser.id == "scholar-hidden-browser") {
|
||||||
processor(b.contentDocument, doLoad);
|
hiddenBrowser.removeEventListener("DOMContentLoaded", onLoad, true);
|
||||||
} catch (e) {
|
try {
|
||||||
exception(e);
|
var newHiddenBrowser = new Object();
|
||||||
Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
|
Scholar.debug("new hidden browser");
|
||||||
|
newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
|
||||||
|
newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
|
||||||
|
Scholar.debug("added attributes");
|
||||||
|
processor(newHiddenBrowser);
|
||||||
|
Scholar.debug("called processor");
|
||||||
|
} catch (e) {
|
||||||
|
Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
|
||||||
|
exception(e);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
var init = function() {
|
var init = function() {
|
||||||
var listener;
|
Scholar.debug("init called");
|
||||||
listener.onStateChange = function(webProgress, request, stateFlags, status) {
|
hiddenBrowser.addEventListener("DOMContentLoaded", onLoad, true);
|
||||||
if ((stateFlags & Components.interfaces.nsIWebProgressListener.STATE_STOP) > 0 &&
|
|
||||||
request.name == urls[urlIndex]) {
|
|
||||||
try {
|
|
||||||
Scholar.Ingester.progressDialog.setTimeout(onLoad, 10);
|
|
||||||
} catch (e) {
|
|
||||||
exception(e);
|
|
||||||
Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLocationChange: " + e, 2);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
var tb = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");
|
|
||||||
tb.addProgressListener(listener, Components.interfaces.nsIWebProgress.NOTIFY_STATUS);
|
|
||||||
|
|
||||||
if (firstDoc) {
|
if (firstDoc) {
|
||||||
|
Scholar.debug("processing");
|
||||||
processor(firstDoc, doLoad);
|
processor(firstDoc, doLoad);
|
||||||
} else {
|
} else {
|
||||||
|
Scholar.debug("doing load");
|
||||||
doLoad();
|
doLoad();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
w.addEventListener("load", init, false);
|
init();
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
|
Scholar.debug("processDocuments: " + e);
|
||||||
exception(e);
|
exception(e);
|
||||||
PB_Debug.print("processDocuments: " + e);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -209,12 +213,18 @@ Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, su
|
||||||
// break compatibility
|
// break compatibility
|
||||||
Scholar.Ingester.Utilities.prototype._MARCCleanString = function(author) {
|
Scholar.Ingester.Utilities.prototype._MARCCleanString = function(author) {
|
||||||
author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
|
author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
|
||||||
return author.replace(/[\s\.\,\/\[\]\:]+$/, '');
|
author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
|
||||||
|
return author.replace(/ +/, ' ');
|
||||||
}
|
}
|
||||||
|
|
||||||
Scholar.Ingester.Utilities.prototype._MARCCleanAuthor = function(author) {
|
Scholar.Ingester.Utilities.prototype._MARCCleanAuthor = function(author) {
|
||||||
author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
|
author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
|
||||||
author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
|
author = author.replace(/[\s\,\/\[\]\:\.]+$/, '');
|
||||||
|
author = author.replace(/ +/, ' ');
|
||||||
|
// Add period for initials
|
||||||
|
if(author.substring(author.length-2, author.length-1) == " ") {
|
||||||
|
author += ".";
|
||||||
|
}
|
||||||
var splitNames = author.split(', ');
|
var splitNames = author.split(', ');
|
||||||
if(splitNames.length > 1) {
|
if(splitNames.length > 1) {
|
||||||
author = splitNames[1]+' '+splitNames[0];
|
author = splitNames[1]+' '+splitNames[0];
|
||||||
|
@ -222,6 +232,16 @@ Scholar.Ingester.Utilities.prototype._MARCCleanAuthor = function(author) {
|
||||||
return author;
|
return author;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Scholar.Ingester.Utilities.prototype._MARCCleanNumber = function(author) {
|
||||||
|
author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
|
||||||
|
author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
|
||||||
|
var regexp = /^[^ ]*/;
|
||||||
|
var m = regexp.exec(author);
|
||||||
|
if(m) {
|
||||||
|
return m[0];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Scholar.Ingester.Utilities.prototype._MARCAssociateField = function(record, uri, model, fieldNo, rdfUri, execMe, prefix, part) {
|
Scholar.Ingester.Utilities.prototype._MARCAssociateField = function(record, uri, model, fieldNo, rdfUri, execMe, prefix, part) {
|
||||||
if(!part) {
|
if(!part) {
|
||||||
part = 'a';
|
part = 'a';
|
||||||
|
@ -253,27 +273,29 @@ Scholar.Ingester.Utilities.prototype._MARCAssociateField = function(record, uri,
|
||||||
|
|
||||||
// This is an extension to PiggyBank's architecture. It's here so that we don't
|
// This is an extension to PiggyBank's architecture. It's here so that we don't
|
||||||
// need an enormous library for each scraper that wants to use MARC records
|
// need an enormous library for each scraper that wants to use MARC records
|
||||||
Scholar.Ingester.Utilities.prototype.importMARCRecord = function(text, format, uri, model) {
|
Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, model) {
|
||||||
var prefixDC = 'http://purl.org/dc/elements/1.1/';
|
var prefixDC = 'http://purl.org/dc/elements/1.1/';
|
||||||
var prefixDCMI = 'http://purl.org/dc/dcmitype/';
|
var prefixDCMI = 'http://purl.org/dc/dcmitype/';
|
||||||
var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
|
var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
|
||||||
|
|
||||||
var record = new Scholar.Ingester.MARC_Record();
|
|
||||||
record.load(text, format);
|
|
||||||
|
|
||||||
// Extract ISBNs
|
// Extract ISBNs
|
||||||
model = this._MARCAssociateField(record, uri, model, '020', prefixDC + 'identifier', this._MARCCleanString, 'ISBN ');
|
model = this._MARCAssociateField(record, uri, model, '020', prefixDC + 'identifier', this._MARCCleanNumber, 'ISBN ');
|
||||||
// Extract ISSNs
|
// Extract ISSNs
|
||||||
model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanString, 'ISBN ');
|
model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN ');
|
||||||
// Extract creators
|
// Extract creators
|
||||||
model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this._MARCCleanAuthor);
|
model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this._MARCCleanAuthor);
|
||||||
model = this._MARCAssociateField(record, uri, model, '110', prefixDC + 'creator', this._MARCCleanString);
|
model = this._MARCAssociateField(record, uri, model, '110', prefixDC + 'creator', this._MARCCleanString);
|
||||||
model = this._MARCAssociateField(record, uri, model, '111', prefixDC + 'creator', this._MARCCleanString);
|
model = this._MARCAssociateField(record, uri, model, '111', prefixDC + 'creator', this._MARCCleanString);
|
||||||
model = this._MARCAssociateField(record, uri, model, '130', prefixDC + 'creator', this._MARCCleanString);
|
model = this._MARCAssociateField(record, uri, model, '130', prefixDC + 'creator', this._MARCCleanString);
|
||||||
if(!model.data[uri][prefixDC + 'creator']) {
|
model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this._MARCCleanAuthor);
|
||||||
|
model = this._MARCAssociateField(record, uri, model, '710', prefixDC + 'contributor', this._MARCCleanString);
|
||||||
|
model = this._MARCAssociateField(record, uri, model, '711', prefixDC + 'contributor', this._MARCCleanString);
|
||||||
|
model = this._MARCAssociateField(record, uri, model, '730', prefixDC + 'contributor', this._MARCCleanString);
|
||||||
|
if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'])) { // some LOC entries have no listed author, but have the author
|
||||||
|
// in the person subject field as the first entry
|
||||||
var field = record.get_field_subfields('600');
|
var field = record.get_field_subfields('600');
|
||||||
if(field) {
|
if(field[0]) {
|
||||||
model = this.addStatement(uri, prefixDC + 'creator', this._MARCCleanAuthor(field[0]['a']));
|
model.addStatement(uri, prefixDC + 'creator', this._MARCCleanAuthor(field[0]['a']));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Extract title
|
// Extract title
|
||||||
|
@ -403,12 +425,13 @@ Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatu
|
||||||
/*
|
/*
|
||||||
* Constructor for Document object
|
* Constructor for Document object
|
||||||
*/
|
*/
|
||||||
Scholar.Ingester.Document = function(browserWindow){
|
Scholar.Ingester.Document = function(browserWindow, hiddenBrowser){
|
||||||
this.browser = browserWindow;
|
this.browser = browserWindow;
|
||||||
|
this.model = new Scholar.Ingester.Model();
|
||||||
this.appSvc = Cc["@mozilla.org/appshell/appShellService;1"]
|
this.appSvc = Cc["@mozilla.org/appshell/appShellService;1"]
|
||||||
.getService(Ci.nsIAppShellService);
|
.getService(Ci.nsIAppShellService);
|
||||||
this.scraper = null
|
this.scraper = null;
|
||||||
this.model = new Scholar.Ingester.Model();
|
this.hiddenBrowser = hiddenBrowser;
|
||||||
this._generateSandbox();
|
this._generateSandbox();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -530,11 +553,13 @@ Scholar.Ingester.Document.prototype._generateSandbox = function() {
|
||||||
this.sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
|
this.sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
|
||||||
this.sandbox.browser = this.browser;
|
this.sandbox.browser = this.browser;
|
||||||
this.sandbox.doc = this.sandbox.browser.contentDocument;
|
this.sandbox.doc = this.sandbox.browser.contentDocument;
|
||||||
this.sandbox.utilities = new Scholar.Ingester.Utilities;
|
this.sandbox.utilities = new Scholar.Ingester.Utilities(this.hiddenBrowser);
|
||||||
this.sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this.appSvc.hiddenDOMWindow);
|
this.sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this.appSvc.hiddenDOMWindow);
|
||||||
this.sandbox.window = this.window;
|
this.sandbox.window = this.window;
|
||||||
this.sandbox.model = this.model;
|
this.sandbox.model = this.model;
|
||||||
this.sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;
|
this.sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;
|
||||||
|
this.sandbox.MARC_Record = Scholar.Ingester.MARC_Record;
|
||||||
|
this.sandbox.MARC_Record.prototype = new Scholar.Ingester.MARC_Record();
|
||||||
|
|
||||||
var me = this;
|
var me = this;
|
||||||
this.sandbox.wait = function(){ me._waitForCompletion = true; };
|
this.sandbox.wait = function(){ me._waitForCompletion = true; };
|
||||||
|
@ -552,42 +577,16 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
|
||||||
var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
|
var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
|
||||||
|
|
||||||
for(var uri in this.model.data) {
|
for(var uri in this.model.data) {
|
||||||
var newItem = Scholar.Items.getNewItemByType(1);
|
if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) {
|
||||||
|
var newItem = Scholar.Items.getNewItemByType(2);
|
||||||
|
} else {
|
||||||
|
var newItem = Scholar.Items.getNewItemByType(1);
|
||||||
|
}
|
||||||
newItem.setField("source", uri);
|
newItem.setField("source", uri);
|
||||||
if(this.model.data[uri][prefixDC + 'title']) {
|
if(this.model.data[uri][prefixDC + 'title']) {
|
||||||
newItem.setField("title", this.model.data[uri][prefixDC + 'title'][0]);
|
newItem.setField("title", this.model.data[uri][prefixDC + 'title'][0]);
|
||||||
}
|
}
|
||||||
if(this.model.data[uri][prefixDC + 'publisher']) {
|
var creatorIndex = 0;
|
||||||
newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher'][0]);
|
|
||||||
}
|
|
||||||
if(this.model.data[uri][prefixDC + 'year']) {
|
|
||||||
if(this.model.data[uri][prefixDC + 'year'].length == 4) {
|
|
||||||
newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]);
|
|
||||||
} else {
|
|
||||||
try {
|
|
||||||
newItem.setField(this.model.data[uri][prefixDC + 'year'][0].substring(
|
|
||||||
this.model.data[uri][prefixDC + 'year'][0].lastIndexOf(" ")+1,
|
|
||||||
this.model.data[uri][prefixDC + 'year'][0].length));
|
|
||||||
} catch(e) {}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if(this.model.data[uri][prefixDC + 'edition']) {
|
|
||||||
newItem.setField("edition", this.model.data[uri][prefixDC + 'edition'][0]);
|
|
||||||
}
|
|
||||||
if(this.model.data[uri][prefixDummy + 'series']) {
|
|
||||||
newItem.setField("series", this.model.data[uri][prefixDummy + 'series'][0]);
|
|
||||||
}
|
|
||||||
if(this.model.data[uri][prefixDummy + 'place']) {
|
|
||||||
newItem.setField("place", this.model.data[uri][prefixDummy + 'place'][0]);
|
|
||||||
}
|
|
||||||
if(this.model.data[uri][prefixDC + 'identifier']) {
|
|
||||||
for(i in this.model.data[uri][prefixDC + 'identifier']) {
|
|
||||||
if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISBN') {
|
|
||||||
newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5));
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if(this.model.data[uri][prefixDC + 'creator']) {
|
if(this.model.data[uri][prefixDC + 'creator']) {
|
||||||
for(i in this.model.data[uri][prefixDC + 'creator']) {
|
for(i in this.model.data[uri][prefixDC + 'creator']) {
|
||||||
var creator = this.model.data[uri][prefixDC + 'creator'][i];
|
var creator = this.model.data[uri][prefixDC + 'creator'][i];
|
||||||
|
@ -595,7 +594,73 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
|
||||||
var lastName = creator.substring(spaceIndex+1, creator.length);
|
var lastName = creator.substring(spaceIndex+1, creator.length);
|
||||||
var firstName = creator.substring(0, spaceIndex);
|
var firstName = creator.substring(0, spaceIndex);
|
||||||
|
|
||||||
newItem.setCreator(i, firstName, lastName);
|
newItem.setCreator(creatorIndex, firstName, lastName, 1);
|
||||||
|
creatorIndex++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(this.model.data[uri][prefixDC + 'contributor']) {
|
||||||
|
for(i in this.model.data[uri][prefixDC + 'contributor']) {
|
||||||
|
var creator = this.model.data[uri][prefixDC + 'contributor'][i];
|
||||||
|
var spaceIndex = creator.lastIndexOf(" ");
|
||||||
|
var lastName = creator.substring(spaceIndex+1, creator.length);
|
||||||
|
var firstName = creator.substring(0, spaceIndex);
|
||||||
|
|
||||||
|
newItem.setCreator(creatorIndex, firstName, lastName, 2);
|
||||||
|
creatorIndex++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) {
|
||||||
|
if(this.model.data[uri][prefixDummy + 'publication']) {
|
||||||
|
newItem.setField("publication", this.model.data[uri][prefixDummy + 'publication'][0]);
|
||||||
|
}
|
||||||
|
if(this.model.data[uri][prefixDummy + 'volume']) {
|
||||||
|
newItem.setField("volume", this.model.data[uri][prefixDummy + 'volume'][0]);
|
||||||
|
}
|
||||||
|
if(this.model.data[uri][prefixDummy + 'number']) {
|
||||||
|
newItem.setField("number", this.model.data[uri][prefixDummy + 'number'][0]);
|
||||||
|
}
|
||||||
|
if(this.model.data[uri][prefixDummy + 'pages']) {
|
||||||
|
newItem.setField("pages", this.model.data[uri][prefixDummy + 'pages'][0]);
|
||||||
|
}
|
||||||
|
if(this.model.data[uri][prefixDC + 'identifier']) {
|
||||||
|
for(i in this.model.data[uri][prefixDC + 'identifier']) {
|
||||||
|
if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISSN') {
|
||||||
|
newItem.setField("ISSN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if(this.model.data[uri][prefixDC + 'publisher']) {
|
||||||
|
newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher'][0]);
|
||||||
|
}
|
||||||
|
if(this.model.data[uri][prefixDC + 'year']) {
|
||||||
|
if(this.model.data[uri][prefixDC + 'year'].length == 4) {
|
||||||
|
newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]);
|
||||||
|
} else {
|
||||||
|
try {
|
||||||
|
newItem.setField(this.model.data[uri][prefixDC + 'year'][0].substring(
|
||||||
|
this.model.data[uri][prefixDC + 'year'][0].lastIndexOf(" ")+1,
|
||||||
|
this.model.data[uri][prefixDC + 'year'][0].length));
|
||||||
|
} catch(e) {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(this.model.data[uri][prefixDC + 'edition']) {
|
||||||
|
newItem.setField("edition", this.model.data[uri][prefixDC + 'edition'][0]);
|
||||||
|
}
|
||||||
|
if(this.model.data[uri][prefixDummy + 'series']) {
|
||||||
|
newItem.setField("series", this.model.data[uri][prefixDummy + 'series'][0]);
|
||||||
|
}
|
||||||
|
if(this.model.data[uri][prefixDummy + 'place']) {
|
||||||
|
newItem.setField("place", this.model.data[uri][prefixDummy + 'place'][0]);
|
||||||
|
}
|
||||||
|
if(this.model.data[uri][prefixDC + 'identifier']) {
|
||||||
|
for(i in this.model.data[uri][prefixDC + 'identifier']) {
|
||||||
|
if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISBN') {
|
||||||
|
newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
newItem.save();
|
newItem.save();
|
||||||
|
|
|
@ -80,8 +80,7 @@ Scholar.Ingester.MARC_Record.prototype.load = function(s,f) { // loads record s
|
||||||
}
|
}
|
||||||
this.add_field(tag,ind1,ind2,value);
|
this.add_field(tag,ind1,ind2,value);
|
||||||
}
|
}
|
||||||
}
|
} else if (f == 'MARC_Harvard') {
|
||||||
if (f == 'MARC_Harvard') {
|
|
||||||
var linee = s.split('\n');
|
var linee = s.split('\n');
|
||||||
for (var i=0; i<linee.length; i++) {
|
for (var i=0; i<linee.length; i++) {
|
||||||
linee[i] = this._trim(linee[i]);
|
linee[i] = this._trim(linee[i]);
|
||||||
|
@ -128,8 +127,7 @@ Scholar.Ingester.MARC_Record.prototype.load = function(s,f) { // loads record s
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
this.add_field_005();
|
this.add_field_005();
|
||||||
}
|
} else if (f == 'MARC_BNI') {
|
||||||
if (f == 'MARC_BNI') {
|
|
||||||
var linee = s.split('\n');
|
var linee = s.split('\n');
|
||||||
for (var i=0; i<linee.length; i++) {
|
for (var i=0; i<linee.length; i++) {
|
||||||
linee[i] = this._trim(linee[i]);
|
linee[i] = this._trim(linee[i]);
|
||||||
|
@ -167,8 +165,7 @@ Scholar.Ingester.MARC_Record.prototype.load = function(s,f) { // loads record s
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
this.add_field_005();
|
this.add_field_005();
|
||||||
}
|
} else if (f == 'MARC_Loc') { // MARC copiato dal browser dal sito catalog.loc.gov
|
||||||
if (f == 'MARC_Loc') { // MARC copiato dal browser dal sito catalog.loc.gov
|
|
||||||
var linee = s.split('\n');
|
var linee = s.split('\n');
|
||||||
for (var i=0; i<linee.length; i++) {
|
for (var i=0; i<linee.length; i++) {
|
||||||
linee[i] = this._trim(linee[i]);
|
linee[i] = this._trim(linee[i]);
|
||||||
|
@ -209,6 +206,46 @@ Scholar.Ingester.MARC_Record.prototype.load = function(s,f) { // loads record s
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
this.add_field_005();
|
this.add_field_005();
|
||||||
|
} else if (f == 'MARC_PAC') {
|
||||||
|
var linee = s.split('\n');
|
||||||
|
for (var i=0; i<linee.length; i++) {
|
||||||
|
linee[i] = linee[i].replace(/\xA0/g,' '); // in some browsers, nbsp is copied as xA0
|
||||||
|
linee[i] = linee[i].replace(/_/g,' ');
|
||||||
|
linee[i] = linee[i].replace(/\t/g,'');
|
||||||
|
linee[i] = this._trim(linee[i]);
|
||||||
|
if (linee[i] == '') continue; // jumps empty lines
|
||||||
|
var replacer = this.subfield_delimiter+'$1';
|
||||||
|
linee[i] = linee[i].replace(/\|(.)/g,replacer);
|
||||||
|
linee[i] = linee[i].replace(/\|/g,this.subfield_delimiter);
|
||||||
|
var tag = linee[i].substr(0,3);
|
||||||
|
var ind1 = linee[i].substr(4,1);
|
||||||
|
var ind2 = linee[i].substr(5,1);
|
||||||
|
var value = this.subfield_delimiter+'a'+linee[i].substr(7);
|
||||||
|
if(linee[i].substr(0, 6) == "LEADER") {
|
||||||
|
value = linee[i].substr(7);
|
||||||
|
this.leader.record_length = '00000';
|
||||||
|
this.leader.record_status = value.substr(5,1);
|
||||||
|
this.leader.type_of_record = value.substr(6,1);
|
||||||
|
this.leader.bibliographic_level = value.substr(7,1);
|
||||||
|
this.leader.type_of_control = value.substr(8,1);
|
||||||
|
this.leader.character_coding_scheme = value.substr(9,1);
|
||||||
|
this.leader.indicator_count = '2';
|
||||||
|
this.leader.subfield_code_length = '2';
|
||||||
|
this.leader.base_address_of_data = '00000';
|
||||||
|
this.leader.encoding_level = value.substr(17,1);
|
||||||
|
this.leader.descriptive_cataloging_form = value.substr(18,1);
|
||||||
|
this.leader.linked_record_requirement = value.substr(19,1);
|
||||||
|
this.leader.entry_map = '4500';
|
||||||
|
|
||||||
|
this.directory = '';
|
||||||
|
this.directory_terminator = this.field_terminator;
|
||||||
|
this.variable_fields = new Array();
|
||||||
|
}
|
||||||
|
else if (tag > '008' && tag < '899') { // jumps low and high tags
|
||||||
|
if (tag != '040') this.add_field(tag,ind1,ind2,value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
this.add_field_005();
|
||||||
}
|
}
|
||||||
|
|
||||||
this.update_record_length();
|
this.update_record_length();
|
||||||
|
@ -310,7 +347,7 @@ Scholar.Ingester.MARC_Record.prototype.exists = function(tag) { // field existen
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
function MARC_field(rec,tag,ind1,ind2,value) { // new MARC gield
|
Scholar.Ingester.MARC_Record.prototype.MARC_field = function(rec,tag,ind1,ind2,value) { // new MARC gield
|
||||||
this.tag = tag;
|
this.tag = tag;
|
||||||
this.occ = rec.count_occ(tag)+1; // occurrence order no.
|
this.occ = rec.count_occ(tag)+1; // occurrence order no.
|
||||||
this.ind1 = ind1; if (this.ind1 == '') this.ind1 = ' ';
|
this.ind1 = ind1; if (this.ind1 == '') this.ind1 = ' ';
|
||||||
|
@ -428,7 +465,7 @@ Scholar.Ingester.MARC_Record.prototype.get_field_subfields = function(tag) { //
|
||||||
|
|
||||||
Scholar.Ingester.MARC_Record.prototype.add_field = function(tag,ind1,ind2,value) { // adds a field to the record
|
Scholar.Ingester.MARC_Record.prototype.add_field = function(tag,ind1,ind2,value) { // adds a field to the record
|
||||||
if (tag.length != 3) { return false; }
|
if (tag.length != 3) { return false; }
|
||||||
var F = new MARC_field(this,tag,ind1,ind2,value);
|
var F = new this.MARC_field(this,tag,ind1,ind2,value);
|
||||||
// adds pointer to list of fields
|
// adds pointer to list of fields
|
||||||
this.variable_fields[this.variable_fields.length] = F;
|
this.variable_fields[this.variable_fields.length] = F;
|
||||||
// adds the entry to the directory
|
// adds the entry to the directory
|
||||||
|
|
1014
scrapers.sql
Normal file
1014
scrapers.sql
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue