closes #112, ingested items should be automatically added to selected project

references #178, changes to various date fields

- updates CSL to work with the latest schema. We can now generate (almost) completely valid APA style; the only remaining issue is that there is no syntax for specifying short forms of page and creator-type labels.
- updates scrapers to use date field rather than year field.
- removes now-unnecessary translation engine code pertaining to year field.
This commit is contained in:
Simon Kornblith 2006-08-14 05:12:28 +00:00
parent 1ea0e14ad3
commit 3195a1c382
4 changed files with 191 additions and 160 deletions

View file

@ -292,7 +292,7 @@ Scholar_Ingester_Interface._itemDone = function(obj, item, collection) {
// add item to collection, if one was specified
if(collection) {
collection.addItem(item);
collection.addItem(item.getID());
}
}

View file

@ -47,7 +47,6 @@ Scholar.Cite = new function() {
* want to use the Scholar data model, but does want to use CSL in JavaScript
*/
CSL = function(csl) {
Scholar.debug(csl);
this._csl = new XML(this._cleanXML(csl));
// initialize CSL
@ -72,12 +71,13 @@ CSL = function(csl) {
// load options
this._opt = this._parseOptions(this._csl.bibliography);
Scholar.debug(this._opt);
// create an associative array of available types
this._types = new Object();
this._serializations = new Object();
for each(var type in this._csl.bibliography.layout.item.choose.type) {
this._types[type.@name] = true;
this._serializations[type.@name] = new Object();
}
}
@ -90,12 +90,10 @@ CSL.prototype.createBibliography = function(items, format) {
this._preprocessItems(items);
// sort by sort order
Scholar.debug("sorting items");
var me = this;
items.sort(function(a, b) {
return me._compareItem(a, b);
});
Scholar.debug(items);
// disambiguate items
this._disambiguateItems(items);
@ -374,7 +372,7 @@ CSL.prototype._parseFieldDefaults = function(ref) {
/*
* parses a list of fields into an array of objects
*/
CSL.prototype._parseFields = function(ref, useDefaults) {
CSL.prototype._parseFields = function(ref, type) {
var typeDesc = new Array();
for each(var element in ref) {
if(element.namespace() == CSL.ns) { // ignore elements in other namespaces
@ -384,17 +382,31 @@ CSL.prototype._parseFields = function(ref, useDefaults) {
// parse attributes on this field
this._parseFieldAttrChildren(element, itemDesc);
// add defaults
if(useDefaults) {
// add defaults, but only if we're parsing as a reference type
if(type) {
var fieldDefaults = this._getFieldDefaults(itemDesc.name);
itemDesc = this._merge(fieldDefaults, itemDesc);
itemDesc = this._merge(this._opt.format, itemDesc);
// create serialized representation
itemDesc._serialized = this._serializeElement(itemDesc.name, itemDesc);
// add to serialization for type
this._serializations[itemDesc._serialized] = itemDesc;
}
// parse group children
if(itemDesc.name == "group" && itemDesc.children) {
for(var i in itemDesc.children) {
itemDesc.children[i] = this._merge(this._getFieldDefaults(itemDesc.children[i].name), itemDesc.children[i]);
// don't bother merging fieldDefaults
itemDesc.children[i] = this._merge(this._getFieldDefaults(itemDesc.children[i].name),
itemDesc.children[i]);
if(type) {
// serialize children
itemDesc.children[i]._serialized = this._serializeElement(itemDesc.children[i].name,
itemDesc.children[i]);
// add to serialization for type
this._serializations[itemDesc._serialized] = itemDesc;
}
}
}
@ -422,18 +434,6 @@ CSL.prototype._parseOptions = function(bibliography) {
opt.hangingIndent = true;
}
// author as sort order
// for our purposes, this controls whether an author is last, first or
// first last (although for internationalized names it means something
// different)
if(bibliography['@author-as-sort-order']) {
if(bibliography['@author-as-sort-order'] == "first-author") {
opt.authorAsSortOrder = "first-author";
} else if(bibliography['@author-as-sort-order'] == "all") {
opt.authorAsSortOrder = "all";
}
}
// sort order
var algorithm = bibliography.sort.@algorithm.toString();
if(algorithm) {
@ -483,7 +483,7 @@ CSL.prototype._parseOptions = function(bibliography) {
*/
CSL.prototype._parseReferenceType = function(reftype) {
var ref = this._csl.bibliography.layout.item.choose.type.(@name==reftype).children();
this._types[reftype] = this._parseFields(ref, true);
this._types[reftype] = this._parseFields(ref, reftype);
}
/*
@ -549,13 +549,19 @@ CSL.prototype._getTerm = function(term, plural) {
*/
CSL.prototype._processDate = function(string) {
var date = new Object();
var dateRe = /^([0-9]{4})-([0-9]{2})-([0-9]{2})$/;
var m = dateRe.exec(string);
if(m) { // sql date
var jsDate = new Date(m[1], m[2]-1, m[3], false, false, false);
} else { // not an sql date
var jsDate = new Date(string);
var yearRe = /^[0-9]+$/;
if(yearRe) { // is a year
date.year = string;
return date;
} else { // who knows what this is
var jsDate = new Date(string)
}
}
if(isNaN(jsDate.valueOf())) { // couldn't parse
@ -580,15 +586,24 @@ CSL.prototype._processDate = function(string) {
* formats a string according to the cs-format attributes on element
*/
CSL.prototype._formatString = function(element, string, format) {
if(format != "compare" && element.prefix) {
string = element.prefix+string;
if(element["text-transform"]) {
if(element["text-transform"] == "lowercase") {
// all lowercase
string = string.toLowerCase();
} else if(element["text-transform"] == "uppercase") {
// all uppercase
string = string.toUpperCase();
} else if(element["text-transform"] == "capitalize") {
// capitalize first
string = string[0].toUpperCase()+string.substr(1);
}
}
if(format == "HTML") {
var style = "";
var cssAttributes = ["font-family", "font-style", "font-variant",
"font-weight", "text-transform"];
"font-weight"];
for(var j in cssAttributes) {
if(element[cssAttributes[j]] && element[cssAttributes[j]].indexOf('"') == -1) {
style += cssAttributes[j]+":"+element[cssAttributes[j]];
@ -600,6 +615,9 @@ CSL.prototype._formatString = function(element, string, format) {
}
}
if(format != "compare" && element.prefix) {
string = element.prefix+string;
}
if(format != "compare" && element.suffix &&
(element.suffix.length != 1 || string[string.length-1] != element.suffix)) {
// skip if suffix is the same as the last char
@ -610,9 +628,10 @@ CSL.prototype._formatString = function(element, string, format) {
}
/*
* formats a locator (pages, volume, issue)
* formats a locator (pages, volume, issue) or an identifier (isbn, doi)
* note that label should be null for an identifier
*/
CSL.prototype._formatLocator = function(element, number, format) {
CSL.prototype._formatLocator = function(identifier, element, number, format) {
var data = "";
if(number) {
@ -623,8 +642,12 @@ CSL.prototype._formatLocator = function(element, number, format) {
if(child.name == "number") {
string = number;
} else if(child.name == "text") {
var plural = (item.pages.indexOf(",") != -1 || item.pages.indexOf("-") != -1);
var plural = (identifier && (number.indexOf(",") != -1
|| number.indexOf("-") != -1));
string = this._getTerm(child["term-name"], plural);
} else if(identifier && child.name == "label") {
var plural = (number.indexOf(",") != -1 || number.indexOf("-") != -1);
string = this._getTerm(identifier, plural);
}
if(string) {
@ -687,6 +710,21 @@ CSL.prototype._formatDate = function(element, date, format) {
return data;
}
/*
 * serializes an element into a string suitable to prevent substitutes from
 * recurring in the same style
 *
 * name    - element name; forms the start of the serialization
 * element - parsed element; its relation and role properties, when set,
 *           are appended as " relation:<value>" and " role:<value>"
 *
 * returns the serialized string
 */
CSL.prototype._serializeElement = function(name, element) {
	var string = name;
	if(element.relation) {
		string += " relation:"+element.relation;
	}
	if(element.role) {
		// same "key:value" form as relation, so the two segments stay
		// consistent and clearly delimited
		string += " role:"+element.role;
	}
	return string;
}
/*
* pads a number or other string with a given string on the left
*/
@ -730,12 +768,6 @@ CSL.prototype._preprocessItems = function(items) {
// parse
if(item.date) { // specific date
item._csl.date = CSL.prototype._processDate(item.date);
} else { // no real date, but might salvage a year
item._csl.date = new Object();
if(item.year) {
item._csl.date.year = item.year;
}
}
}
}
@ -759,10 +791,8 @@ CSL.prototype._disambiguateItems = function(items) {
var citation = author+" "+this._getFieldValue("date",
this._getFieldDefaults("date"),
item, "disambiguate");
Scholar.debug(citation);
if(usedCitations[citation]) {
Scholar.debug("disambiguation necessary");
if(!usedCitations[citation]._csl.date.disambiguation) {
usedCitations[citation]._csl.date.disambiguation = "a";
item._csl.date.disambiguation = "b";
@ -848,8 +878,11 @@ CSL.prototype._processCreators = function(type, element, creators, format) {
var authorStrings = [];
var firstName, lastName;
for(var i=0; i<maxCreators; i++) {
if(child["initialize-with"]) {
// use firist initials
if(typeof(child["initialize-with"]) == "string") {
// even if initialize-with is simply an empty string, use
// initials
// use first initials
var firstName = "";
var firstNames = creators[i].firstName.split(" ");
for(var j in firstNames) {
@ -863,8 +896,8 @@ CSL.prototype._processCreators = function(type, element, creators, format) {
}
lastName = creators[i].lastName;
if(((i == 0 && this._opt.authorAsSortOrder == "first-author")
|| this._opt.authorAsSortOrder == "all")
if(((i == 0 && element["name-as-sort-order"] == "first-author")
|| element["name-as-sort-order"] == "all")
&& child["sort-separator"]) {
// if this is the first author and name-as-sort-order="first-author"
// or if this is a subsequent author and name-as-sort-order="all"
@ -899,7 +932,7 @@ CSL.prototype._processCreators = function(type, element, creators, format) {
}
}
string = authorStrings.join(joinString);
} else if(child.name == "role") {
} else if(child.name == "label") {
string = this._getTerm(type, (maxCreators != 1));
}
@ -920,11 +953,8 @@ CSL.prototype._processCreators = function(type, element, creators, format) {
CSL.prototype._getFieldValue = function(name, element, item, format, typeName) {
var data = "";
// make sure we're not supposed to ignore this (bc it's already substituted)
for(var i in item._csl.ignore) {
if(item._csl.ignore[i] == element) {
return "";
}
if(item._csl.ignore[element._serialized] == true) {
return "";
}
if(name == "author") {
@ -1008,28 +1038,36 @@ CSL.prototype._getFieldValue = function(name, element, item, format, typeName) {
data = "";
}
} else if(name == "volume") {
data = this._formatLocator(element, item.volume, format);
data = this._formatLocator("volume", element, item.volume, format);
} else if(name == "issue") {
data = this._formatLocator(element, item.issue, format);
data = this._formatLocator("issue", element, item.issue, format);
} else if(name == "pages") {
data = this._formatLocator(element, item.pages, format);
data = this._formatLocator("page", element, item.pages, format);
} else if(name == "edition") {
data = item.edition;
} else if(name == "genre") {
data = (item.type ? item.type : item.thesisType);
} else if(name == "group") {
var childData = new Array();
for(var i in element.children) {
// get data for each child element
var child = element.children[i];
data += this._getFieldValue(child.name, child, item, format, typeName);
var string = this._getFieldValue(child.name, child, item,
format, typeName);
if(string) {
childData.push(string);
}
}
// implode with delimiter
data = childData.join((element["delimiter"] ? element["delimiter"] : ""));
} else if(name == "text") {
string = this._getTerm(child["term-name"]);
data = this._getTerm(element["term-name"]);
} else if(name == "isbn") {
data = this._formatLocator(element, item.ISBN, format);
data = this._formatLocator(null, element, item.ISBN, format);
} else if(name == "doi") {
data = this._formatLocator(element, item.DOI, format);
} else {
data = name;
data = this._formatLocator(null, element, item.DOI, format);
}
if(data) {
@ -1038,35 +1076,42 @@ CSL.prototype._getFieldValue = function(name, element, item, format, typeName) {
// try each substitute element until one returns something
for(var i in element.substitute) {
var substituteElement = element.substitute[i];
var serialization = this._serializeElement(substituteElement.name,
substituteElement);
var defaultElement;
// first try to get from the type
if(typeName && this._types[typeName]) {
for(var i in this._types[typeName]) {
var field = this._types[typeName][i];
if(field.name == substituteElement.name
&& (!substituteElement.relation
|| field.relation == substituteElement.relation)
&& (!substituteElement.role
|| field.role == substituteElement.role)) {
defaultElement = field;
// flag to be ignored later
item._csl.ignore.push(defaultElement);
}
var inheritElement;
if(CSL._inherit[substituteElement.name] == CSL._inherit[name]) {
// if both substituteElement and the parent element inherit from
// the same base element, apply styles here
inheritElement = element;
} else {
// search for elements with the same serialization
if(typeName && this._serializations[typeName]
&& this._serializations[typeName][serialization]) {
inheritElement = this._serializations[typeName][serialization];
} else {
// otherwise, use defaults
inheritElement = this._getFieldDefaults(substituteElement.name);
}
}
// otherwise, get default
if(!defaultElement) {
defaultElement = this._getFieldDefaults(substituteElement.name);
}
// copy prefix and suffix
// merge inheritElement and element
substituteElement = this._merge(inheritElement, substituteElement);
// regardless of inheritance pathway, make sure substitute inherits
// general prefix and suffix from the element it's substituting for
substituteElement.prefix = element.prefix;
substituteElement.suffix = element.suffix;
// clear substitute element off of the element we're substituting
substituteElement.substitute = undefined;
// ignore elements with the same serialization
item._csl.ignore[serialization] = true;
// get field value
data = this._getFieldValue(substituteElement.name,
this._merge(defaultElement, substituteElement), item, format);
substituteElement, item, format);
// return field value, if there is one; otherwise, keep processing
// the data
if(data) {
return data;
}

View file

@ -779,30 +779,6 @@ Scholar.Translate.prototype._itemDone = function(item) {
// makes looping through easier
item.itemType = item.complete = undefined;
if(item.date && !item.year) {
// date can serve as a year
var dateID = Scholar.ItemFields.getID("date");
var yearID = Scholar.ItemFields.getID("year");
if(!Scholar.ItemFields.isValidForType(dateID, typeID) && Scholar.ItemFields.isValidForType(yearID, typeID)) {
// year is valid but date is not
var yearRe = /[0-9]{4}/;
var m = yearRe.exec(item.date);
if(m) {
item.year = m[0]
item.date = undefined;
}
}
} else if(!item.date && item.year) {
// the converse is also true
var dateID = Scholar.ItemFields.getID("date");
var yearID = Scholar.ItemFields.getID("year");
if(Scholar.ItemFields.isValidForType(dateID, typeID) && !Scholar.ItemFields.isValidForType(yearID, typeID)) {
// date is valid but year is not
item.date = item.year;
item.year = undefined;
}
}
var fieldID, field;
for(var i in item) {
// loop through item fields

View file

@ -1,4 +1,4 @@
-- 45
-- 46
-- Set the following timestamp to the most recent scraper update date
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-11 11:18:00'));
@ -273,7 +273,7 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006
}
}
} else if(match[1] == ''Year'') {
newItem.year = match[2];
newItem.date = match[2];
}
}
}
@ -881,7 +881,7 @@ REPLACE INTO "translators" VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '2006
} else if(field == "pub date") {
var re = /[0-9]+/;
var m = re.exec(value);
newItem.year = m[0];
newItem.date = m[0];
} else if(field == "isbn") {
var re = /^[0-9](?:[0-9X]+)/;
var m = re.exec(value);
@ -2127,8 +2127,10 @@ REPLACE INTO "translators" VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '2006
newItem.publicationTitle = newDOM.journal.text();
newItem.volume = newDOM.volume.text();
newItem.issue = newDOM.issue.text();
newItem.year = newDOM.year.text();
newItem.date = newDOM.pubdate.text();
newItem.date = newDOM.pubdate.text().toString();
if(!newItem.date) {
newItem.date = newDOM.year.text();
}
newItem.title = newDOM.doctitle.text();
newItem.ISSN = newDOM.issn.text();
@ -2915,11 +2917,11 @@ function doExport() {
} else if(item.distributor) {
originInfo += <publisher>{item.distributor}</publisher>;
}
if(item.year) {
// Assume year is copyright date
originInfo += <copyrightDate encoding="iso8601">{item.year}</copyrightDate>;
}
if(item.date) {
if(inArray(item.itemType, ["book", "bookSection"]) {
// Assume year is copyright date
originInfo += <copyrightDate encoding="iso8601">{item.year}</copyrightDate>;
}
if(inArray(item.itemType, ["magazineArticle", "newspaperArticle"])) {
// Assume date is date issued
var dateType = "dateIssued";
@ -3477,15 +3479,10 @@ function doExport() {
// date/year
if(item.date) {
Scholar.RDF.addStatement(resource, n.dc+"date", item.date, true);
} else if(item.year) {
Scholar.RDF.addStatement(resource, n.dc+"date", item.year, true);
}
if(item.accessDate) { // use date submitted for access date?
Scholar.RDF.addStatement(resource, n.dcterms+"dateSubmitted", item.accessDate, true);
}
if(item.lastModified) {
Scholar.RDF.addStatement(resource, n.dcterms+"modified", item.lastModified, true);
}
// callNumber
if(item.callNumber) {
@ -3637,10 +3634,6 @@ REPLACE INTO "translators" VALUES ('6e372642-ed9d-4934-b5d1-c11ac758ebb7', '2006
// date/year
if(item.date) {
Scholar.RDF.addStatement(resource, dc+"date", item.date, true);
} else if(item.year) {
Scholar.RDF.addStatement(resource, dc+"date", item.year, true);
} else if(item.lastModified) {
Scholar.RDF.addStatement(resource, dc+"date", item.lastModified, true);
}
// ISBN/ISSN/DOI
@ -4306,7 +4299,7 @@ function doExport() {
// date
if(item.date) {
var isoDate = /^[0-9]{4}-[0-9]{2}-[0-9]{2}$/;
var isoDate = /^[0-9]{4}(-[0-9]{2}-[0-9]{2})?$/;
if(isoDate.test(item.date)) { // can directly accept ISO format with minor mods
addTag("Y1", item.date.replace("-", "/")+"/");
} else { // otherwise, extract year and attach other data
@ -4316,8 +4309,6 @@ function doExport() {
addTag("Y1", m[2]+"///"+m[1]);
}
}
} else if(item.year) {
addTag("Y1", item.year+"///");
}
// notes
@ -4805,7 +4796,7 @@ MARC_Record.prototype.translate = function(item) {
// Extract publisher info
this._associateDBField(item, ''260'', ''b'', ''publisher'');
// Extract year
this._associateDBField(item, ''260'', ''c'', ''year'', _pullNumber);
this._associateDBField(item, ''260'', ''c'', ''date'', _pullNumber);
// Extract series
this._associateDBField(item, ''440'', ''a'', ''seriesTitle'');
// Extract call number
@ -4871,30 +4862,27 @@ REPLACE INTO "csl" VALUES('id-not-yet-given', '2006-08-12 19:22:00', 'American P
<name>Bruce DArcus</name>
<email>bdarcus@sourceforge.net</email>
</author>
<updated>2006-08-03T11:01:30-05:00</updated>
<author>
<name>Simon Kornblith</name>
<email>simon@simonster.com</email>
</author>
<updated>2006-08-13T23:28:00-05:00</updated>
</info>
<defaults>
<contributor>
<contributors name-as-sort-order="no">
<name and="symbol" initialize-with="."/>
<label prefix=", " text-transform="capitalize"/>
</contributors>
<author name-as-sort-order="all">
<name and="symbol" sort-separator=", " initialize-with="."/>
<role prefix=", " />
</contributor>
<author>
<label prefix=" (" suffix=")" text-transform="capitalize"/>
<substitute>
<choose>
<editor>
<name and="symbol" sort-separator=", " initialize-with="."/>
<role prefix=" (" suffix=")" />
</editor>
<translator>
<name and="symbol" sort-separator=", " initialize-with="."/>
<role prefix=" (" suffix=")" />
</translator>
<titles relation="container" font-style="italic"/>
<titles>
<name form="short"/>
</titles>
<editor/>
<translator/>
<titles/>
</choose>
</substitute>
</substitute>
</author>
<locator>
<number/>
@ -4915,6 +4903,13 @@ REPLACE INTO "csl" VALUES('id-not-yet-given', '2006-08-12 19:22:00', 'American P
<name/>
</publisher>
<access>
<text term-name="retrieved" text-transform="capitalize"/>
<date suffix=", ">
<month/>
<day suffix=", "/>
<year/>
</date>
<text term-name="from"/>
<url/>
<date prefix=", "/>
</access>
@ -4928,11 +4923,11 @@ REPLACE INTO "csl" VALUES('id-not-yet-given', '2006-08-12 19:22:00', 'American P
<date>
<year/>
</date>
<locator prefix=": " include-label="false"/>
<locator prefix=": "/>
</item>
</layout>
</citation>
<bibliography author-as-sort-order="all" hanging-indent="true">
<bibliography hanging-indent="true">
<sort algorithm="author-date"/>
<et-al min-authors="4" use-first="3"/>
<layout>
@ -4950,7 +4945,10 @@ REPLACE INTO "csl" VALUES('id-not-yet-given', '2006-08-12 19:22:00', 'American P
</date>
<group suffix=".">
<titles font-style="italic" prefix=" "/>
<editor prefix=" (" suffix=")"/>
<group prefix=" (" suffix=")" delimiter=", ">
<editor/>
<translator/>
</group>
</group>
<publisher prefix=" "/>
<access prefix=" "/>
@ -4960,16 +4958,26 @@ REPLACE INTO "csl" VALUES('id-not-yet-given', '2006-08-12 19:22:00', 'American P
<date prefix=" (" suffix=").">
<year/>
</date>
<titles prefix=" "/>
<group class="container">
<text term-name="in"/>
<editor prefix=" "/>
<titles font-style="italic" prefix=" "/>
<group class="container" prefix=" ">
<text term-name="in" text-transform="capitalize"/>
<editor prefix=" " suffix=",">
<name and="symbol" sort-separator=", " initialize-with="."/>
<label prefix=" (" suffix=")" text-transform="capitalize"/>
</editor>
<translator prefix=" " suffix=",">
<name and="symbol" sort-separator=", " initialize-with="."/>
<label prefix=" (" suffix=")" text-transform="capitalize"/>
</translator>
<titles relation="container" font-style="italic" prefix=" " suffix="."/>
<titles relation="collection" prefix=" " suffix="."/>
<publisher prefix=" "/>
<access prefix=" "/>
<pages prefix=", "/>
<pages prefix=" (" suffix=")">
<label text-transform="capitalize" suffix=". "/>
<number/>
</pages>
</group>
<access prefix=" "/>
</type>
<type name="article">
<author/>
@ -4977,20 +4985,22 @@ REPLACE INTO "csl" VALUES('id-not-yet-given', '2006-08-12 19:22:00', 'American P
<year/>
</date>
<group suffix=".">
<titles font-style="italic" prefix=" "/>
<editor prefix=" (" suffix=")"/>
<titles prefix=" "/>
<group prefix=" (" suffix=")" delimiter=", ">
<editor/>
<translator/>
</group>
</group>
<group class="container" prefix=" " suffix=".">
<titles relation="container" font-style="italic" prefix=" "/>
<access prefix=" "/>
<volume prefix=", " font-style="italic"/>
<issue prefix="(" suffix=")"/>
<pages prefix=", "/>
<volume prefix=", " font-style="italic"/>
<issue prefix="(" suffix=")"/>
<pages prefix=", "/>
</group>
<access prefix=" "/>
</type>
</choose>
</item>
</layout>
</bibliography>
</style>
');
</style>');