- use DOIs for PDF metadata lookup when available (needs testing)

- fix accessibility of createContextObject in Zotero.Utilities
- improved CrossRef translator
This commit is contained in:
Simon Kornblith 2009-03-24 02:08:08 +00:00
parent 9ca461c59b
commit 1f0d24ceef
3 changed files with 156 additions and 82 deletions

View file

@ -26,6 +26,7 @@
const Zotero_RecognizePDF_SUCCESS_IMAGE = "chrome://zotero/skin/tick.png"; const Zotero_RecognizePDF_SUCCESS_IMAGE = "chrome://zotero/skin/tick.png";
const Zotero_RecognizePDF_FAILURE_IMAGE = "chrome://zotero/skin/cross.png"; const Zotero_RecognizePDF_FAILURE_IMAGE = "chrome://zotero/skin/cross.png";
const Zotero_RecognizePDF_LOADING_IMAGE = "chrome://zotero/skin/indicator.gif"; const Zotero_RecognizePDF_LOADING_IMAGE = "chrome://zotero/skin/indicator.gif";
const DOIre = /\bdoi\: *([^\s]+)/i;
/** /**
* Front end for recognizing PDFs * Front end for recognizing PDFs
@ -309,6 +310,16 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, callback, ca
} }
} }
inputStream.close();
cacheFile.remove(false);
// look for DOI
var allText = lines.join("\n");
var m = DOIre.exec(allText);
if(m) {
this._DOI = m[1];
}
// get (not quite) median length // get (not quite) median length
var lineLengthsLength = lineLengths.length; var lineLengthsLength = lineLengths.length;
if(lineLengthsLength < 20) { if(lineLengthsLength < 20) {
@ -328,9 +339,6 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, callback, ca
this._startLine = this._iteration = 0; this._startLine = this._iteration = 0;
} }
inputStream.close();
cacheFile.remove(false);
if(lineLengthsLength >= 20) { if(lineLengthsLength >= 20) {
this._queryGoogle(); this._queryGoogle();
} }
@ -349,53 +357,67 @@ Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() {
return; return;
} }
this._iteration++; this._iteration++;
// take the relevant parts of some lines (exclude hyphenated word)
var queryStringWords = 0;
var queryString = ""; var queryString = "";
while(queryStringWords < 25 && this._startLine < this._goodLines.length) {
var words = this._goodLines[this._startLine].split(/\s+/);
// get rid of first and last words
words.shift();
words.pop();
// make sure there are no long words (probably OCR mistakes)
var skipLine = false;
for(var i=0; i<words.length; i++) {
if(words[i].length > 20) {
skipLine = true;
break;
}
}
// add words to query
if(!skipLine && words.length) {
queryStringWords += words.length;
queryString += '"'+words.join(" ")+'" ';
}
this._startLine++;
}
Zotero.debug("RecognizePDF: Query string "+queryString);
// pass query string to Google Scholar and translate
var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search";
if(!this._hiddenBrowser) {
this._hiddenBrowser = Zotero.Browser.createHiddenBrowser();
this._hiddenBrowser.docShell.allowImages = false;
}
var me = this; var me = this;
var translate = new Zotero.Translate("web", true, false); if(this._DOI) {
translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289"); // use CrossRef to look for DOI
translate.setHandler("itemDone", function(translate, item) { translate = new Zotero.Translate("search", true, false);
Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser); translate.setTranslator("11645bd1-0420-45c1-badb-53fb41eeb753");
me._callback(item); var item = {"itemType":"journalArticle", "DOI":this._DOI};
}); translate.setSearch(item);
translate.setHandler("select", function(translate, items) { return me._selectItems(translate, items) }); translate.setHandler("itemDone", function(translate, item) { me._callback(item); });
translate.setHandler("done", function(translate, success) { if(!success) me._queryGoogle(); }); translate.setHandler("select", function(translate, items) { return me._selectItems(translate, items) });
translate.setHandler("done", function(translate, success) { if(!success) me._queryGoogle(); });
this._hiddenBrowser.addEventListener("pageshow", function() { me._scrape(translate) }, true); translate.translate();
delete this._DOI;
this._hiddenBrowser.loadURIWithFlags(url, } else {
Components.interfaces.nsIWebNavigation.LOAD_FLAGS_BYPASS_HISTORY, null, null, null); // take the relevant parts of some lines (exclude hyphenated word)
var queryStringWords = 0;
while(queryStringWords < 25 && this._startLine < this._goodLines.length) {
var words = this._goodLines[this._startLine].split(/\s+/);
// get rid of first and last words
words.shift();
words.pop();
// make sure there are no long words (probably OCR mistakes)
var skipLine = false;
for(var i=0; i<words.length; i++) {
if(words[i].length > 20) {
skipLine = true;
break;
}
}
// add words to query
if(!skipLine && words.length) {
queryStringWords += words.length;
queryString += '"'+words.join(" ")+'" ';
}
this._startLine++;
}
Zotero.debug("RecognizePDF: Query string "+queryString);
// pass query string to Google Scholar and translate
var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search";
if(!this._hiddenBrowser) {
this._hiddenBrowser = Zotero.Browser.createHiddenBrowser();
this._hiddenBrowser.docShell.allowImages = false;
}
var translate = new Zotero.Translate("web", true, false);
translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289");
translate.setHandler("itemDone", function(translate, item) {
Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser);
me._callback(item);
});
translate.setHandler("select", function(translate, items) { return me._selectItems(translate, items) });
translate.setHandler("done", function(translate, success) { if(!success) me._queryGoogle(); });
this._hiddenBrowser.addEventListener("pageshow", function() { me._scrape(translate) }, true);
this._hiddenBrowser.loadURIWithFlags(url,
Components.interfaces.nsIWebNavigation.LOAD_FLAGS_BYPASS_HISTORY, null, null, null);
}
} }
/** /**

View file

@ -581,7 +581,7 @@ Zotero.Utilities.prototype.processAsync = function (sets, callbacks, onDone) {
* @borrows Zotero.Date.formatDate as this.formatDate * @borrows Zotero.Date.formatDate as this.formatDate
* @borrows Zotero.Date.strToDate as this.strToDate * @borrows Zotero.Date.strToDate as this.strToDate
* @borrows Zotero.Date.strToISO as this.strToISO * @borrows Zotero.Date.strToISO as this.strToISO
* @borrows Zotero.OpenURL.lookupContextObject as this.lookupContextObject * @borrows Zotero.OpenURL.createContextObject as this.createContextObject
* @borrows Zotero.OpenURL.parseContextObject as this.parseContextObject * @borrows Zotero.OpenURL.parseContextObject as this.parseContextObject
* @borrows Zotero.Utilities.HTTP.processDocuments as this.processDocuments * @borrows Zotero.Utilities.HTTP.processDocuments as this.processDocuments
* @borrows Zotero.Utilities.HTTP.doPost as this.doPost * @borrows Zotero.Utilities.HTTP.doPost as this.doPost
@ -596,7 +596,7 @@ Zotero.Utilities.Translate.prototype.inArray = Zotero.inArray;
Zotero.Utilities.Translate.prototype.formatDate = Zotero.Date.formatDate; Zotero.Utilities.Translate.prototype.formatDate = Zotero.Date.formatDate;
Zotero.Utilities.Translate.prototype.strToDate = Zotero.Date.strToDate; Zotero.Utilities.Translate.prototype.strToDate = Zotero.Date.strToDate;
Zotero.Utilities.Translate.prototype.strToISO = Zotero.Date.strToISO; Zotero.Utilities.Translate.prototype.strToISO = Zotero.Date.strToISO;
Zotero.Utilities.Translate.prototype.lookupContextObject = Zotero.OpenURL.lookupContextObject; Zotero.Utilities.Translate.prototype.createContextObject = Zotero.OpenURL.createContextObject;
Zotero.Utilities.Translate.prototype.parseContextObject = Zotero.OpenURL.parseContextObject; Zotero.Utilities.Translate.prototype.parseContextObject = Zotero.OpenURL.parseContextObject;
/** /**

View file

@ -18,11 +18,17 @@ function detectSearch(item) {
return false; return false;
} }
/**
 * Normalizes the capitalization of a name that is written entirely in
 * uppercase (e.g. "SMITH" -> "Smith", "JEAN-PAUL" -> "Jean-Paul").
 * Strings that are not entirely uppercase are returned unchanged.
 *
 * @param {String} string Name (or name part) to normalize
 * @return {String} The capitalized name, or the input itself if it was not
 *                  entirely uppercase
 */
function fixAuthorCapitalization(string) {
	// leave mixed-case input alone; it is assumed to be capitalized already
	if(string.toUpperCase() != string) return string;
	
	var lowered = string.toLowerCase();
	var result = "";
	var prevIsWordChar = false;
	for(var i=0; i<lowered.length; i++) {
		var ch = lowered.charAt(i);
		// A character is a letter if it has distinct upper- and lowercase forms.
		// This is used instead of the regex \b, which only knows ASCII word
		// characters and therefore sees a word start after every accented
		// letter ("MÜLLER" would become "MüLler").
		var isLetter = ch.toUpperCase() != ch.toLowerCase();
		var isWordChar = isLetter || ch == "_" || (ch >= "0" && ch <= "9");
		
		// capitalize only the first letter of each word
		result += (isLetter && !prevIsWordChar) ? ch.toUpperCase() : ch;
		prevIsWordChar = isWordChar;
	}
	return result;
}
function processCrossRef(xmlOutput) { function processCrossRef(xmlOutput) {
xmlOutput = xmlOutput.replace(/<\?xml[^>]*\?>/, ""); xmlOutput = xmlOutput.replace(/<\?xml[^>]*\?>/, "");
// parse XML with E4X // parse XML with E4X
var qr = new Namespace("http://www.crossref.org/qrschema/2.0");
try { try {
var xml = new XML(xmlOutput); var xml = new XML(xmlOutput);
} catch(e) { } catch(e) {
@ -30,41 +36,87 @@ function processCrossRef(xmlOutput) {
} }
// ensure status is valid // ensure status is valid
var status = xml.qr::query_result.qr::body.qr::query.@status.toString(); if(!xml.doi_record.length()) return false;
if(status != "resolved" && status != "multiresolved") { if(xml.doi_record[0].crossref.journal.length()) {
return false; var item = new Zotero.Item("journalArticle");
var itemXML = xml.doi_record.crossref.journal;
var refXML = itemXML.journal_article;
var metadataXML = itemXML.journal_metadata;
item.ISSN = itemXML.journal_metadata.issn.toString();
item.publicationTitle = itemXML.journal_metadata.full_title.toString();
item.journalAbbreviation = itemXML.journal_metadata.abbrev_title.toString();
item.volume = itemXML.journal_issue.journal_volume.volume.toString();
item.issue = itemXML.journal_issue.issue.toString();
} else if(xml.doi_record[0].crossref.book.length()) {
var item = new Zotero.Item("book");
var refXML = xml.doi_record[0].crossref.book.book_metadata;
var metadataXML = refXML;
var seriesXML = metadataXML.series_metadata;
item.place = metadataXML.publisher.publisher_place.toString();
} else if(xml.doi_record[0].crossref.conference.length()) {
var item = new Zotero.Item("conferencePaper");
var itemXML = xml.doi_record[0].crossref.conference;
var refXML = itemXML.conference_paper;
var metadataXML = itemXML.proceedingsMetadata;
var seriesXML = metadataXML.series_metadata;
item.publicationTitle = itemXML.proceedings_metadata.proceedings_title.toString();
item.place = itemXML.event_metadata.conference_location.toString();
item.conferenceName = itemXML.event_metadata.conference_name.toString();
} }
var query = xml.qr::query_result.qr::body.qr::query; var contributors = refXML.contributors.children();
var item = new Zotero.Item("journalArticle");
// try to get a DOI if(metadataXML.isbn.length()) item.ISBN = metadataXML.isbn[0].toString();
item.DOI = query.qr::doi.(@type=="journal_article").text().toString(); if(metadataXML.issn.length()) item.ISSN = metadataXML.issn[0].toString();
if(!item.DOI) { item.publisher = metadataXML.publisher.publisher_name.toString();
item.DOI = query.qr::doi.(@type=="book_title").text().toString(); item.edition = metadataXML.edition_number.toString();
} if(!item.volume) item.volume = metadataXML.volume.toString();
if(!item.DOI) {
item.DOI = query.qr::doi.(@type=="book_content").text().toString(); if(seriesXML && seriesXML.length()) {
if(seriesXML.contributors.length()) {
contributors += seriesXML.contributors.children();
}
item.seriesNumber = seriesXML.series_number.toString();
} }
// try to get an ISSN (no print/electronic preferences) for each(var creatorXML in contributors) {
item.ISSN = query.qr::issn[0].text().toString(); var creator = {creatorType:"author"};
// get title if(creatorXML.contributor_role == "editor") {
item.title = query.qr::article_title.text().toString(); creator.creatorType = "editor";
// get publicationTitle } else if(creatorXML.contributor_role == "translator") {
item.publicationTitle = query.qr::journal_title.text().toString(); creator.creatorType = "translator";
// get author } else if(creatorXML.contributor_role == "chair") {
item.creators.push(Zotero.Utilities.cleanAuthor(query.qr::author.text().toString(), "author", true)); creator.creatorType = "contributor";
// get volume }
item.volume = query.qr::volume.text().toString();
// get issue if(creatorXML.localName() == "organization") {
item.issue = query.qr::issue.text().toString(); creator.fieldMode = 1;
// get year creator.lastName = creatorXML.toString();
item.date = query.qr::year.text().toString(); } else if(creatorXML.localName() == "person_name") {
// get edition creator.firstName = fixAuthorCapitalization(creatorXML.given_name.toString());
item.edition = query.qr::edition_number.text().toString(); creator.lastName = fixAuthorCapitalization(creatorXML.surname.toString());
// get first page }
item.pages = query.qr::first_page.text().toString(); item.creators.push(creator);
}
item.date = refXML.publication_date.year.toString();
if(refXML.publication_date.month.length()) {
item.date = refXML.publication_date.month.toString()+"/"+item.date;
}
if(refXML.pages.length()) {
item.pages = refXML.pages.first_page.toString();
if(refXML.pages.last_page.length()) {
item.pages += "-"+refXML.pages.last_page.toString();
}
}
item.DOI = refXML.doi_data.doi.toString();
item.url = refXML.doi_data.resource.toString();
item.title = refXML.titles.title.toString();
item.complete(); item.complete();
return true; return true;
@ -80,7 +132,7 @@ function doSearch(item) {
var co = Zotero.Utilities.createContextObject(item); var co = Zotero.Utilities.createContextObject(item);
} }
Zotero.Utilities.HTTP.doGet("http://www.crossref.org/openurl?req_dat=zter:zter321&"+co+"&noredirect=true", function(responseText) { Zotero.Utilities.HTTP.doGet("http://www.crossref.org/openurl?req_dat=zter:zter321&"+co+"&noredirect=true&format=unixref", function(responseText) {
processCrossRef(responseText); processCrossRef(responseText);
Zotero.done(); Zotero.done();
}); });