From c0251085a9d0c6bd1b87594ac9f59316715493f9 Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Wed, 5 Jul 2006 21:44:01 +0000 Subject: [PATCH] Add export filters for RIS and Dublin Core RDF --- .../content/scholar/xpcom/utilities.js | 17 ++ scrapers.sql | 283 ++++++++++++++++-- 2 files changed, 282 insertions(+), 18 deletions(-) diff --git a/chrome/chromeFiles/content/scholar/xpcom/utilities.js b/chrome/chromeFiles/content/scholar/xpcom/utilities.js index 1341886edb..40df4e30cf 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/utilities.js +++ b/chrome/chromeFiles/content/scholar/xpcom/utilities.js @@ -138,6 +138,23 @@ Scholar.Utilities.prototype.getVersion = function() { return Scholar.version; } +/* + * Get a page range, given a user-entered set of pages + */ +Scholar.Utilities.prototype._pageRangeRegexp = /^\s*([0-9]+)-([0-9]+)\s*$/ +Scholar.Utilities.prototype.getPageRange = function(pages) { + var pageNumbers; + var m = this._pageRangeRegexp.exec(pages); + if(m) { + // A page range + pageNumbers = [m[1], m[2]]; + } else { + // Assume start and end are the same + pageNumbers = [pages, pages]; + } + return pageNumbers; +} + Scholar.Utilities.prototype.inArray = Scholar.inArray; /* diff --git a/scrapers.sql b/scrapers.sql index 59596981f0..be63c2b478 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -2464,9 +2464,9 @@ utilities.processDocuments(browser, null, newUris, function(newBrowser) { wait();'); -REPLACE INTO "translators" VALUES ('0e2235e7-babf-413c-9acf-f27cce5f059c', '2006-06-28 16:00:00', 2, 'MODS', 'Simon Kornblith', 'xml', -'options.add("Export project structure", "checkbox", "true"); -options.add("Export notes", "checkbox", "true");', +REPLACE INTO "translators" VALUES ('0e2235e7-babf-413c-9acf-f27cce5f059c', '2006-06-28 16:00:00', 2, 'MODS (XML)', 'Simon Kornblith', 'xml', +'addOption("exportNotes", true); +addOption("exportFileData", true);', 'var partialItemTypes = ["bookSection", "journalArticle", "magazineArticle", "newspaperArticle"]; function doExport(items) { @@ -2619,19 +2619,8 @@ function doExport(items) { // XML tag detail; object field pages if(item.pages) { - var start, end; - - if(typeof(item.pages) == "string" && item.pages.indexOf("-")) { - // A page range - var pageNumbers = item.pages.split("-"); - start = pageNumbers[0]; - end = pageNumbers[1]; - } else { - // Assume start and end are the same - start = item.pages; - end = item.pages; - } - part += {start}{end}; + var range = utilities.getPageRange(item.pages); + part += {range[0]}{range[1]}; } // Assign part if something was assigned @@ -2681,7 +2670,7 @@ function doExport(items) { } // XML tag identifier; object fields ISBN, ISSN - var identifier = null; + var identifier = false; if(item.ISBN) { identifier = {item.ISBN}; } else if(item.ISSN) { @@ -2728,6 +2717,264 @@ function doExport(items) { modsCollection.mods += mods; } - write(modsCollection.toString()); + write(''''+"\n"); + write(modsCollection.toXMLString()); }'); +REPLACE INTO "translators" VALUES ('6e372642-ed9d-4934-b5d1-c11ac758ebb7', '2006-06-28 18:04:00', 2, 'Dublin Core (RDF/XML)', 'Simon Kornblith', 'xml', '', +'function doExport(items) { + var addSubclass = new Object(); + var partialItemTypes = ["bookSection", "journalArticle", "magazineArticle", "newspaperArticle"]; + + var rdfDoc = ; + var rdf = new Namespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"); + var dcterms = new Namespace("dcterms", "http://purl.org/dc/terms/"); + var dc = new Namespace("dc", "http://purl.org/dc/elements/1.1/"); + + for(var i in items) { + var item = items[i]; + + if(item.itemType == "note") { + continue; + } + + var isPartialItem = false; + if(utilities.inArray(item.itemType, partialItemTypes)) { + isPartialItem = true; + } + + var description = ; + if(item.ISBN) { + description.@rdf::about = "urn:isbn:"+item.ISBN; + } else if(item.ISSN) { + description.@rdf::about = "urn:issn:"+item.ISSN; + } else if(item.url) { + description.@rdf::about = item.url; + } else { + // generate a guid, bc that''s all we can do + description.@rdf::about = "urn:uuid:"+item.itemID; + } + + /** CORE FIELDS **/ + + // XML tag titleInfo; object field title + description.dc::title = item.title; + + // XML tag typeOfResource/genre; object field type + var type; + if(item.itemType == "film") { + type = "MovingImage"; + } else if(item.itemType == "artwork") { + type = "StillImage"; + } else { + type = "Text"; + } + description.dc::type.@rdf::resource = "http://purl.org/dc/dcmitype/"+type; + + // XML tag name; object field creators + for(var j in item.creators) { + // put creators in lastName, firstName format (although DC doesn''t specify) + var creator = item.creators[j].lastName; + if(item.creators[j].firstName) { + creator += ", "+item.creators[j].firstName; + } + + if(item.creators[j].creatorType == "author") { + description.dc::creator += {creator}; + } else { + description.dc::contributor.* += {creator}; + } + } + + /** FIELDS ON NEARLY EVERYTHING BUT NOT A PART OF THE CORE **/ + + // source + if(item.source) { + description.dc::source = item.source; + } + + // accessionNumber as generic ID + if(item.accessionNumber) { + description.dc::identifier = item.accessionNumber; + } + + // rights + if(item.rights) { + description.dc::rights = item.rights; + } + + /** SUPPLEMENTAL FIELDS **/ + + // publication/series -> isPartOf + if(item.publication) { + description.dcterms::isPartOf = item.publication; + addSubclass.isPartOf = true; + } else if(item.series) { + description.dcterms::isPartOf = item.series; + addSubclass.isPartOf = true; + } + + // TODO - create text citation and OpenURL citation to handle volume, number, pages, issue, place + + // edition + if(item.edition) { + description.dcterms::hasVersion = item.edition; + } + // publisher/distributor + if(item.publisher) { + description.dc::publisher = item.publisher; + } else if(item.distributor) { + description.dc::publisher = item.distributor; + } + // date/year + if(item.date) { + description.dc::date = item.date; + } else if(item.year) { + description.dc::date = item.year; + } + + // ISBN/ISSN + var resource = false; + if(item.ISBN) { + resource = "urn:isbn:"+item.ISBN; + } else if(item.ISSN) { + resource = "urn:issn:"+item.ISSN; + } + if(resource) { + if(isPartialItem) { + description.dcterms::isPartOf.@rdf::resource = resource; + addSubclass.isPartOf = true; + } else { + description.dc::identifier.@rdf::resource = resource; + } + } + + // callNumber + if(item.callNumber) { + description.dc::identifier += item.callNumber; + } + + // archiveLocation + if(item.archiveLocation) { + description.dc::coverage = item.archiveLocation; + } + + rdfDoc.rdf::Description += description; + } + + if(addSubclass.isPartOf) { + rdfDoc.rdf::Description += + + ; + } + + write(''''+"\n"); + write(rdfDoc.toXMLString()); +}'); + + +REPLACE INTO "translators" VALUES ('32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7', '2006-06-30 15:36:00', 2, 'RIS', 'Simon Kornblith', 'ris', +'addOption("exportNotes", true); +addOption("exportFileData", true);', +'function addTag(tag, value) { + if(value) { + write(tag+" - "+value+"\r\n"); + } +} + +function doExport(items) { + for(var i in items) { + var item = items[i]; + + // can''t store notes in RIS + if(item.itemType == "note") { + continue; + } + + // type + // TODO - figure out if these are the best types for letter, interview, website + if(item.itemType == "book") { + var risType = "BOOK"; + } else if(item.itemType == "bookSection") { + var risType = "CHAP"; + } else if(item.itemType == "journalArticle") { + var risType = "JOUR"; + } else if(item.itemType == "magazineArticle") { + var risType = "MGZN"; + } else if(item.itemType == "newspaperArticle") { + var risType = "NEWS"; + } else if(item.itemType == "thesis") { + var risType = "THES"; + } else if(item.itemType == "letter" || item.itemType == "interview") { + var risType = "PCOMM"; + } else if(item.itemType == "film") { + var risType = "MPCT"; + } else if(item.itemType == "artwork") { + var risType = "ART"; + } else if(item.itemType == "website") { + var risType = "ICOMM"; + } + addTag("TY", risType); + // ID + addTag("ID", item.itemID); + // primary title + addTag("T1", item.title); + // series title + addTag("T3", item.series); + // creators + for(var j in item.creators) { + // only two types, primary and secondary + var risTag = "A1" + if(item.creators[j].creatorType != "author") { + risTag = "A2"; + } + + addTag(risTag, item.creators[j].lastName+","+item.creators[j].firstName); + } + // date + if(item.date) { + var isoDate = /^[0-9]{4}-[0-9]{2}-[0-9]{2}$/; + if(isoDate.test(item.date)) { // can directly accept ISO format with minor mods + addTag("Y1", item.date.replace("-", "/")+"/"); + } else { // otherwise, extract year and attach other data + var year = /^(.*?) *([0-9]{4})/; + var m = year.exec(item.date); + if(m) { + addTag("Y1", m[2]+"///"+m[1]); + } + } + } else if(item.year) { + addTag("Y1", item.year+"///"); + } + // notes + for(var j in item.notes) { + addTag("N1", item.notes[j].note); + } + // publication + addTag("JF", item.publication); + // volume + addTag("VL", item.volume); + // number + addTag("IS", item.number); + // pages + if(item.pages) { + var range = utilities.getPageRange(item.pages); + addTag("SP", range[0]); + addTag("EP", range[1]); + } + // place + addTag("CP", item.place); + // publisher + addTag("PB", item.publisher); + // ISBN/ISSN + addTag("SN", item.ISBN); + addTag("SN", item.ISSN); + // URL + if(item.url) { + addTag("UR", item.url); + } else if(item.source && item.source.substr(0, 7) == "http://") { + addTag("UR", item.source); + } + write("\r\n"); + } +}'); \ No newline at end of file