- Small changes to MARC record support

- Implemented loadDocument API, for loading and parsing the DOMs of HTML documents in the background
- Added scraper code to SVN repository (now includes 12 scrapers, see Writeboard for details)

To update to the latest versions of all scrapers, ensure you have an up-to-date version of sqlite3, then run:
sqlite3 ~/Library/Application\ Support/Firefox/Profiles/profileName/scholar.sqlite < scrapers.sql
This commit is contained in:
Simon Kornblith 2006-06-06 18:25:45 +00:00
parent 6c55e63eab
commit 152c9bf9e7
5 changed files with 1205 additions and 85 deletions

View file

@ -35,6 +35,7 @@ Scholar.Ingester.Interface.init = function() {
*/
Scholar.Ingester.Interface.chromeLoad = function() {
Scholar.Ingester.Interface.tabBrowser = document.getElementById("content");
Scholar.Ingester.Interface.hiddenBrowser = document.getElementById("scholar-hidden-browser");
Scholar.Ingester.Interface.appContent = document.getElementById("appcontent");
Scholar.Ingester.Interface.statusImage = document.getElementById("scholar-status-image");
@ -189,7 +190,7 @@ Scholar.Ingester.Interface._setDocument = function(browser) {
browser.setAttribute("scholar-key", key);
}
}
Scholar.Ingester.Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser);
Scholar.Ingester.Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser, Scholar.Ingester.Interface.hiddenBrowser);
Scholar.Ingester.Interface.browserDocuments[key].retrieveScraper();
}

View file

@ -19,4 +19,7 @@
<image id="scholar-status-image" width="16" height="16" onclick="Scholar.Ingester.Interface.scrapeThisPage()" />
</statusbarpanel>
</statusbar>
<box style="visibility: collapse">
<browser id="scholar-hidden-browser" />
</box>
</overlay>

View file

@ -48,7 +48,9 @@ Scholar.Ingester.Model.prototype.detachRepository = function() {}
/////////////////////////////////////////////////////////////////
// Scholar.Ingester.Utilities class, a set of methods to assist in data
// extraction. Most code here was stolen directly from the Piggy Bank project.
Scholar.Ingester.Utilities = function(hiddenBrowser) {
	// hiddenBrowser - the browser element that processDocuments() uses to load
	// pages out of view (it calls loadURI() and addEventListener() on it).
	// The earlier zero-argument constructor assignment was dead code, since it
	// was immediately overwritten by this one, and has been removed.
	this.hiddenBrowser = hiddenBrowser;
}
// Adapter for Piggy Bank function to print debug messages; log level is
// fixed at 4 (could change this)
@ -99,6 +101,7 @@ Scholar.Ingester.Utilities.prototype.gatherElementsOnXPath = function(doc, paren
// Convenience wrapper around processDocuments() for the single-URL case.
//   url       - address of the one document to load
//   browser   - passed straight through to processDocuments()
//   succeeded - handed to processDocuments() as the per-document processor
//   failed    - handed to processDocuments() as the exception handler
// No first document is supplied (null) and the "done" callback is a no-op.
Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succeeded, failed) {
	Scholar.debug("loadDocument called");
	var singleUrlList = [ url ];
	var ignoreDone = function() {};
	this.processDocuments(browser, null, singleUrlList, succeeded, ignoreDone, failed);
}
@ -112,6 +115,9 @@ Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succe
// exception - a function to execute if an exception occurs (exceptions are
// also logged in the Firefox Scholar log)
Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
var hiddenBrowser = this.hiddenBrowser;
Scholar.debug("processDocuments called");
try {
if (urls.length == 0) {
if (firstDoc) {
@ -128,53 +134,51 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD
if (urlIndex < urls.length) {
try {
var url = urls[urlIndex];
var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");
b.loadURI(url);
Scholar.debug("loading "+url);
hiddenBrowser.loadURI(url);
} catch (e) {
exception(e);
Scholar.debug("Scholar.Ingester.Utilities.processDocuments doLoad: " + e, 2);
exception(e);
}
} else {
window.setTimeout(done, 10);
hiddenBrowser.setTimeout(done, 10);
}
};
var onLoad = function() {
try {
var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser").selectedBrowser;
processor(b.contentDocument, doLoad);
} catch (e) {
exception(e);
Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
Scholar.debug("onLoad called");
if(hiddenBrowser.id == "scholar-hidden-browser") {
hiddenBrowser.removeEventListener("DOMContentLoaded", onLoad, true);
try {
var newHiddenBrowser = new Object();
Scholar.debug("new hidden browser");
newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
Scholar.debug("added attributes");
processor(newHiddenBrowser);
Scholar.debug("called processor");
} catch (e) {
Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
exception(e);
}
}
};
var init = function() {
var listener;
listener.onStateChange = function(webProgress, request, stateFlags, status) {
if ((stateFlags & Components.interfaces.nsIWebProgressListener.STATE_STOP) > 0 &&
request.name == urls[urlIndex]) {
try {
Scholar.Ingester.progressDialog.setTimeout(onLoad, 10);
} catch (e) {
exception(e);
Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLocationChange: " + e, 2);
}
}
};
var tb = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");
tb.addProgressListener(listener, Components.interfaces.nsIWebProgress.NOTIFY_STATUS);
Scholar.debug("init called");
hiddenBrowser.addEventListener("DOMContentLoaded", onLoad, true);
if (firstDoc) {
Scholar.debug("processing");
processor(firstDoc, doLoad);
} else {
Scholar.debug("doing load");
doLoad();
}
}
w.addEventListener("load", init, false);
init();
} catch (e) {
Scholar.debug("processDocuments: " + e);
exception(e);
PB_Debug.print("processDocuments: " + e);
}
}
@ -209,12 +213,18 @@ Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, su
// break compatibility
Scholar.Ingester.Utilities.prototype._MARCCleanString = function(author) {
	// Cleans a raw MARC string value and returns the result.
	//   author - the string to clean (any field value, not only author names)
	// Step 1: strip leading whitespace and punctuation (. , / [ ] :)
	var cleaned = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
	// Step 2: strip the same characters from the end
	cleaned = cleaned.replace(/[\s\.\,\/\[\]\:]+$/, '');
	// Step 3: collapse every run of spaces into a single space. The "g" flag
	// is required; without it only the first run would be collapsed. A stale
	// early return that made this step unreachable has been removed.
	return cleaned.replace(/ +/g, ' ');
}
Scholar.Ingester.Utilities.prototype._MARCCleanAuthor = function(author) {
author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
author = author.replace(/[\s\,\/\[\]\:\.]+$/, '');
author = author.replace(/ +/, ' ');
// Add period for initials
if(author.substring(author.length-2, author.length-1) == " ") {
author += ".";
}
var splitNames = author.split(', ');
if(splitNames.length > 1) {
author = splitNames[1]+' '+splitNames[0];
@ -222,6 +232,16 @@ Scholar.Ingester.Utilities.prototype._MARCCleanAuthor = function(author) {
return author;
}
Scholar.Ingester.Utilities.prototype._MARCCleanNumber = function(author) {
	// Strips leading and trailing whitespace/punctuation (. , / [ ] :) from the
	// value, then returns only the part before the first space character.
	//   author - the raw string to clean (used here for ISBN/ISSN field values)
	var trimmed = author
		.replace(/^[\s\.\,\/\[\]\:]+/, '')
		.replace(/[\s\.\,\/\[\]\:]+$/, '');
	var firstToken = /^[^ ]*/.exec(trimmed);
	return firstToken ? firstToken[0] : undefined;
}
Scholar.Ingester.Utilities.prototype._MARCAssociateField = function(record, uri, model, fieldNo, rdfUri, execMe, prefix, part) {
if(!part) {
part = 'a';
@ -253,27 +273,29 @@ Scholar.Ingester.Utilities.prototype._MARCAssociateField = function(record, uri,
// This is an extension to PiggyBank's architecture. It's here so that we don't
// need an enormous library for each scraper that wants to use MARC records
Scholar.Ingester.Utilities.prototype.importMARCRecord = function(text, format, uri, model) {
Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, model) {
var prefixDC = 'http://purl.org/dc/elements/1.1/';
var prefixDCMI = 'http://purl.org/dc/dcmitype/';
var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
var record = new Scholar.Ingester.MARC_Record();
record.load(text, format);
// Extract ISBNs
model = this._MARCAssociateField(record, uri, model, '020', prefixDC + 'identifier', this._MARCCleanString, 'ISBN ');
model = this._MARCAssociateField(record, uri, model, '020', prefixDC + 'identifier', this._MARCCleanNumber, 'ISBN ');
// Extract ISSNs
model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanString, 'ISBN ');
model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN ');
// Extract creators
model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this._MARCCleanAuthor);
model = this._MARCAssociateField(record, uri, model, '110', prefixDC + 'creator', this._MARCCleanString);
model = this._MARCAssociateField(record, uri, model, '111', prefixDC + 'creator', this._MARCCleanString);
model = this._MARCAssociateField(record, uri, model, '130', prefixDC + 'creator', this._MARCCleanString);
if(!model.data[uri][prefixDC + 'creator']) {
model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this._MARCCleanAuthor);
model = this._MARCAssociateField(record, uri, model, '710', prefixDC + 'contributor', this._MARCCleanString);
model = this._MARCAssociateField(record, uri, model, '711', prefixDC + 'contributor', this._MARCCleanString);
model = this._MARCAssociateField(record, uri, model, '730', prefixDC + 'contributor', this._MARCCleanString);
if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'])) { // some LOC entries have no listed author, but have the author
// in the person subject field as the first entry
var field = record.get_field_subfields('600');
if(field) {
model = this.addStatement(uri, prefixDC + 'creator', this._MARCCleanAuthor(field[0]['a']));
if(field[0]) {
model.addStatement(uri, prefixDC + 'creator', this._MARCCleanAuthor(field[0]['a']));
}
}
// Extract title
@ -403,12 +425,13 @@ Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatu
/*
 * Constructor for Document object
 *
 * browserWindow - the browser whose content is being examined; stored as
 *                 this.browser and read by _generateSandbox()
 * hiddenBrowser - an off-screen browser element; stored as this.hiddenBrowser
 *                 and handed to Scholar.Ingester.Utilities by _generateSandbox()
 */
Scholar.Ingester.Document = function(browserWindow, hiddenBrowser){
	this.browser = browserWindow;
	this.appSvc = Cc["@mozilla.org/appshell/appShellService;1"]
	             .getService(Ci.nsIAppShellService);
	this.model = new Scholar.Ingester.Model();
	this.scraper = null;
	this.hiddenBrowser = hiddenBrowser;
	// Must run last: _generateSandbox() reads the fields assigned above.
	this._generateSandbox();
}
@ -530,11 +553,13 @@ Scholar.Ingester.Document.prototype._generateSandbox = function() {
this.sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
this.sandbox.browser = this.browser;
this.sandbox.doc = this.sandbox.browser.contentDocument;
this.sandbox.utilities = new Scholar.Ingester.Utilities;
this.sandbox.utilities = new Scholar.Ingester.Utilities(this.hiddenBrowser);
this.sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this.appSvc.hiddenDOMWindow);
this.sandbox.window = this.window;
this.sandbox.model = this.model;
this.sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;
this.sandbox.MARC_Record = Scholar.Ingester.MARC_Record;
this.sandbox.MARC_Record.prototype = new Scholar.Ingester.MARC_Record();
var me = this;
this.sandbox.wait = function(){ me._waitForCompletion = true; };
@ -552,42 +577,16 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
for(var uri in this.model.data) {
var newItem = Scholar.Items.getNewItemByType(1);
if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) {
var newItem = Scholar.Items.getNewItemByType(2);
} else {
var newItem = Scholar.Items.getNewItemByType(1);
}
newItem.setField("source", uri);
if(this.model.data[uri][prefixDC + 'title']) {
newItem.setField("title", this.model.data[uri][prefixDC + 'title'][0]);
}
if(this.model.data[uri][prefixDC + 'publisher']) {
newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher'][0]);
}
if(this.model.data[uri][prefixDC + 'year']) {
if(this.model.data[uri][prefixDC + 'year'].length == 4) {
newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]);
} else {
try {
newItem.setField(this.model.data[uri][prefixDC + 'year'][0].substring(
this.model.data[uri][prefixDC + 'year'][0].lastIndexOf(" ")+1,
this.model.data[uri][prefixDC + 'year'][0].length));
} catch(e) {}
}
}
if(this.model.data[uri][prefixDC + 'edition']) {
newItem.setField("edition", this.model.data[uri][prefixDC + 'edition'][0]);
}
if(this.model.data[uri][prefixDummy + 'series']) {
newItem.setField("series", this.model.data[uri][prefixDummy + 'series'][0]);
}
if(this.model.data[uri][prefixDummy + 'place']) {
newItem.setField("place", this.model.data[uri][prefixDummy + 'place'][0]);
}
if(this.model.data[uri][prefixDC + 'identifier']) {
for(i in this.model.data[uri][prefixDC + 'identifier']) {
if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISBN') {
newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5));
break;
}
}
}
var creatorIndex = 0;
if(this.model.data[uri][prefixDC + 'creator']) {
for(i in this.model.data[uri][prefixDC + 'creator']) {
var creator = this.model.data[uri][prefixDC + 'creator'][i];
@ -595,7 +594,73 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
var lastName = creator.substring(spaceIndex+1, creator.length);
var firstName = creator.substring(0, spaceIndex);
newItem.setCreator(i, firstName, lastName);
newItem.setCreator(creatorIndex, firstName, lastName, 1);
creatorIndex++;
}
}
if(this.model.data[uri][prefixDC + 'contributor']) {
for(i in this.model.data[uri][prefixDC + 'contributor']) {
var creator = this.model.data[uri][prefixDC + 'contributor'][i];
var spaceIndex = creator.lastIndexOf(" ");
var lastName = creator.substring(spaceIndex+1, creator.length);
var firstName = creator.substring(0, spaceIndex);
newItem.setCreator(creatorIndex, firstName, lastName, 2);
creatorIndex++;
}
}
if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) {
if(this.model.data[uri][prefixDummy + 'publication']) {
newItem.setField("publication", this.model.data[uri][prefixDummy + 'publication'][0]);
}
if(this.model.data[uri][prefixDummy + 'volume']) {
newItem.setField("volume", this.model.data[uri][prefixDummy + 'volume'][0]);
}
if(this.model.data[uri][prefixDummy + 'number']) {
newItem.setField("number", this.model.data[uri][prefixDummy + 'number'][0]);
}
if(this.model.data[uri][prefixDummy + 'pages']) {
newItem.setField("pages", this.model.data[uri][prefixDummy + 'pages'][0]);
}
if(this.model.data[uri][prefixDC + 'identifier']) {
for(i in this.model.data[uri][prefixDC + 'identifier']) {
if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISSN') {
newItem.setField("ISSN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5));
break;
}
}
}
} else {
if(this.model.data[uri][prefixDC + 'publisher']) {
newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher'][0]);
}
if(this.model.data[uri][prefixDC + 'year']) {
if(this.model.data[uri][prefixDC + 'year'].length == 4) {
newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]);
} else {
try {
newItem.setField(this.model.data[uri][prefixDC + 'year'][0].substring(
this.model.data[uri][prefixDC + 'year'][0].lastIndexOf(" ")+1,
this.model.data[uri][prefixDC + 'year'][0].length));
} catch(e) {}
}
}
if(this.model.data[uri][prefixDC + 'edition']) {
newItem.setField("edition", this.model.data[uri][prefixDC + 'edition'][0]);
}
if(this.model.data[uri][prefixDummy + 'series']) {
newItem.setField("series", this.model.data[uri][prefixDummy + 'series'][0]);
}
if(this.model.data[uri][prefixDummy + 'place']) {
newItem.setField("place", this.model.data[uri][prefixDummy + 'place'][0]);
}
if(this.model.data[uri][prefixDC + 'identifier']) {
for(i in this.model.data[uri][prefixDC + 'identifier']) {
if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISBN') {
newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5));
break;
}
}
}
}
newItem.save();

View file

@ -80,8 +80,7 @@ Scholar.Ingester.MARC_Record.prototype.load = function(s,f) { // loads record s
}
this.add_field(tag,ind1,ind2,value);
}
}
if (f == 'MARC_Harvard') {
} else if (f == 'MARC_Harvard') {
var linee = s.split('\n');
for (var i=0; i<linee.length; i++) {
linee[i] = this._trim(linee[i]);
@ -128,8 +127,7 @@ Scholar.Ingester.MARC_Record.prototype.load = function(s,f) { // loads record s
}
}
this.add_field_005();
}
if (f == 'MARC_BNI') {
} else if (f == 'MARC_BNI') {
var linee = s.split('\n');
for (var i=0; i<linee.length; i++) {
linee[i] = this._trim(linee[i]);
@ -167,8 +165,7 @@ Scholar.Ingester.MARC_Record.prototype.load = function(s,f) { // loads record s
}
}
this.add_field_005();
}
if (f == 'MARC_Loc') { // MARC copiato dal browser dal sito catalog.loc.gov
} else if (f == 'MARC_Loc') { // MARC copiato dal browser dal sito catalog.loc.gov
var linee = s.split('\n');
for (var i=0; i<linee.length; i++) {
linee[i] = this._trim(linee[i]);
@ -209,6 +206,46 @@ Scholar.Ingester.MARC_Record.prototype.load = function(s,f) { // loads record s
}
}
this.add_field_005();
} else if (f == 'MARC_PAC') {
var linee = s.split('\n');
for (var i=0; i<linee.length; i++) {
linee[i] = linee[i].replace(/\xA0/g,' '); // in some browsers, nbsp is copied as xA0
linee[i] = linee[i].replace(/_/g,' ');
linee[i] = linee[i].replace(/\t/g,'');
linee[i] = this._trim(linee[i]);
if (linee[i] == '') continue; // jumps empty lines
var replacer = this.subfield_delimiter+'$1';
linee[i] = linee[i].replace(/\|(.)/g,replacer);
linee[i] = linee[i].replace(/\|/g,this.subfield_delimiter);
var tag = linee[i].substr(0,3);
var ind1 = linee[i].substr(4,1);
var ind2 = linee[i].substr(5,1);
var value = this.subfield_delimiter+'a'+linee[i].substr(7);
if(linee[i].substr(0, 6) == "LEADER") {
value = linee[i].substr(7);
this.leader.record_length = '00000';
this.leader.record_status = value.substr(5,1);
this.leader.type_of_record = value.substr(6,1);
this.leader.bibliographic_level = value.substr(7,1);
this.leader.type_of_control = value.substr(8,1);
this.leader.character_coding_scheme = value.substr(9,1);
this.leader.indicator_count = '2';
this.leader.subfield_code_length = '2';
this.leader.base_address_of_data = '00000';
this.leader.encoding_level = value.substr(17,1);
this.leader.descriptive_cataloging_form = value.substr(18,1);
this.leader.linked_record_requirement = value.substr(19,1);
this.leader.entry_map = '4500';
this.directory = '';
this.directory_terminator = this.field_terminator;
this.variable_fields = new Array();
}
else if (tag > '008' && tag < '899') { // jumps low and high tags
if (tag != '040') this.add_field(tag,ind1,ind2,value);
}
}
this.add_field_005();
}
this.update_record_length();
@ -310,7 +347,7 @@ Scholar.Ingester.MARC_Record.prototype.exists = function(tag) { // field existen
return false;
}
function MARC_field(rec,tag,ind1,ind2,value) { // new MARC gield
Scholar.Ingester.MARC_Record.prototype.MARC_field = function(rec,tag,ind1,ind2,value) { // new MARC gield
this.tag = tag;
this.occ = rec.count_occ(tag)+1; // occurrence order no.
this.ind1 = ind1; if (this.ind1 == '') this.ind1 = ' ';
@ -428,7 +465,7 @@ Scholar.Ingester.MARC_Record.prototype.get_field_subfields = function(tag) { //
Scholar.Ingester.MARC_Record.prototype.add_field = function(tag,ind1,ind2,value) { // adds a field to the record
if (tag.length != 3) { return false; }
var F = new MARC_field(this,tag,ind1,ind2,value);
var F = new this.MARC_field(this,tag,ind1,ind2,value);
// adds pointer to list of fields
this.variable_fields[this.variable_fields.length] = F;
// adds the entry to the directory

1014
scrapers.sql Normal file

File diff suppressed because it is too large Load diff