- use DOIs for PDF metadata lookup when available (needs testing)
- fix accessibility of createContextObject in Zotero.Utilities
- improved CrossRef translator
This commit is contained in:
parent
9ca461c59b
commit
1f0d24ceef
3 changed files with 156 additions and 82 deletions
|
@ -26,6 +26,7 @@
|
||||||
const Zotero_RecognizePDF_SUCCESS_IMAGE = "chrome://zotero/skin/tick.png";
|
const Zotero_RecognizePDF_SUCCESS_IMAGE = "chrome://zotero/skin/tick.png";
|
||||||
const Zotero_RecognizePDF_FAILURE_IMAGE = "chrome://zotero/skin/cross.png";
|
const Zotero_RecognizePDF_FAILURE_IMAGE = "chrome://zotero/skin/cross.png";
|
||||||
const Zotero_RecognizePDF_LOADING_IMAGE = "chrome://zotero/skin/indicator.gif";
|
const Zotero_RecognizePDF_LOADING_IMAGE = "chrome://zotero/skin/indicator.gif";
|
||||||
|
// Matches a "doi:" label followed by the identifier and captures the identifier in group 1.
// The capture must end on a character other than '.', ',' or ';' so that sentence
// punctuation directly following a DOI in running text ("... doi: 10.1000/xyz.") is not
// taken as part of the DOI.
const DOIre = /\bdoi\: *([^\s]*[^\s.,;])/i;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Front end for recognizing PDFs
|
* Front end for recognizing PDFs
|
||||||
|
@ -309,6 +310,16 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, callback, ca
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inputStream.close();
|
||||||
|
cacheFile.remove(false);
|
||||||
|
|
||||||
|
// look for DOI
|
||||||
|
var allText = lines.join("\n");
|
||||||
|
var m = DOIre.exec(allText);
|
||||||
|
if(m) {
|
||||||
|
this._DOI = m[1];
|
||||||
|
}
|
||||||
|
|
||||||
// get (not quite) median length
|
// get (not quite) median length
|
||||||
var lineLengthsLength = lineLengths.length;
|
var lineLengthsLength = lineLengths.length;
|
||||||
if(lineLengthsLength < 20) {
|
if(lineLengthsLength < 20) {
|
||||||
|
@ -328,9 +339,6 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, callback, ca
|
||||||
this._startLine = this._iteration = 0;
|
this._startLine = this._iteration = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
inputStream.close();
|
|
||||||
cacheFile.remove(false);
|
|
||||||
|
|
||||||
if(lineLengthsLength >= 20) {
|
if(lineLengthsLength >= 20) {
|
||||||
this._queryGoogle();
|
this._queryGoogle();
|
||||||
}
|
}
|
||||||
|
@ -349,53 +357,67 @@ Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
this._iteration++;
|
this._iteration++;
|
||||||
|
|
||||||
// take the relevant parts of some lines (exclude hyphenated word)
|
|
||||||
var queryStringWords = 0;
|
|
||||||
var queryString = "";
|
var queryString = "";
|
||||||
while(queryStringWords < 25 && this._startLine < this._goodLines.length) {
|
|
||||||
var words = this._goodLines[this._startLine].split(/\s+/);
|
|
||||||
// get rid of first and last words
|
|
||||||
words.shift();
|
|
||||||
words.pop();
|
|
||||||
// make sure there are no long words (probably OCR mistakes)
|
|
||||||
var skipLine = false;
|
|
||||||
for(var i=0; i<words.length; i++) {
|
|
||||||
if(words[i].length > 20) {
|
|
||||||
skipLine = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// add words to query
|
|
||||||
if(!skipLine && words.length) {
|
|
||||||
queryStringWords += words.length;
|
|
||||||
queryString += '"'+words.join(" ")+'" ';
|
|
||||||
}
|
|
||||||
this._startLine++;
|
|
||||||
}
|
|
||||||
Zotero.debug("RecognizePDF: Query string "+queryString);
|
|
||||||
|
|
||||||
// pass query string to Google Scholar and translate
|
|
||||||
var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search";
|
|
||||||
if(!this._hiddenBrowser) {
|
|
||||||
this._hiddenBrowser = Zotero.Browser.createHiddenBrowser();
|
|
||||||
this._hiddenBrowser.docShell.allowImages = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
var me = this;
|
var me = this;
|
||||||
var translate = new Zotero.Translate("web", true, false);
|
if(this._DOI) {
|
||||||
translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289");
|
// use CrossRef to look for DOI
|
||||||
translate.setHandler("itemDone", function(translate, item) {
|
translate = new Zotero.Translate("search", true, false);
|
||||||
Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser);
|
translate.setTranslator("11645bd1-0420-45c1-badb-53fb41eeb753");
|
||||||
me._callback(item);
|
var item = {"itemType":"journalArticle", "DOI":this._DOI};
|
||||||
});
|
translate.setSearch(item);
|
||||||
translate.setHandler("select", function(translate, items) { return me._selectItems(translate, items) });
|
translate.setHandler("itemDone", function(translate, item) { me._callback(item); });
|
||||||
translate.setHandler("done", function(translate, success) { if(!success) me._queryGoogle(); });
|
translate.setHandler("select", function(translate, items) { return me._selectItems(translate, items) });
|
||||||
|
translate.setHandler("done", function(translate, success) { if(!success) me._queryGoogle(); });
|
||||||
this._hiddenBrowser.addEventListener("pageshow", function() { me._scrape(translate) }, true);
|
translate.translate();
|
||||||
|
delete this._DOI;
|
||||||
this._hiddenBrowser.loadURIWithFlags(url,
|
} else {
|
||||||
Components.interfaces.nsIWebNavigation.LOAD_FLAGS_BYPASS_HISTORY, null, null, null);
|
// take the relevant parts of some lines (exclude hyphenated word)
|
||||||
|
var queryStringWords = 0;
|
||||||
|
while(queryStringWords < 25 && this._startLine < this._goodLines.length) {
|
||||||
|
var words = this._goodLines[this._startLine].split(/\s+/);
|
||||||
|
// get rid of first and last words
|
||||||
|
words.shift();
|
||||||
|
words.pop();
|
||||||
|
// make sure there are no long words (probably OCR mistakes)
|
||||||
|
var skipLine = false;
|
||||||
|
for(var i=0; i<words.length; i++) {
|
||||||
|
if(words[i].length > 20) {
|
||||||
|
skipLine = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// add words to query
|
||||||
|
if(!skipLine && words.length) {
|
||||||
|
queryStringWords += words.length;
|
||||||
|
queryString += '"'+words.join(" ")+'" ';
|
||||||
|
}
|
||||||
|
this._startLine++;
|
||||||
|
}
|
||||||
|
|
||||||
|
Zotero.debug("RecognizePDF: Query string "+queryString);
|
||||||
|
|
||||||
|
// pass query string to Google Scholar and translate
|
||||||
|
var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search";
|
||||||
|
if(!this._hiddenBrowser) {
|
||||||
|
this._hiddenBrowser = Zotero.Browser.createHiddenBrowser();
|
||||||
|
this._hiddenBrowser.docShell.allowImages = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
var translate = new Zotero.Translate("web", true, false);
|
||||||
|
translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289");
|
||||||
|
translate.setHandler("itemDone", function(translate, item) {
|
||||||
|
Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser);
|
||||||
|
me._callback(item);
|
||||||
|
});
|
||||||
|
translate.setHandler("select", function(translate, items) { return me._selectItems(translate, items) });
|
||||||
|
translate.setHandler("done", function(translate, success) { if(!success) me._queryGoogle(); });
|
||||||
|
|
||||||
|
this._hiddenBrowser.addEventListener("pageshow", function() { me._scrape(translate) }, true);
|
||||||
|
|
||||||
|
this._hiddenBrowser.loadURIWithFlags(url,
|
||||||
|
Components.interfaces.nsIWebNavigation.LOAD_FLAGS_BYPASS_HISTORY, null, null, null);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -581,7 +581,7 @@ Zotero.Utilities.prototype.processAsync = function (sets, callbacks, onDone) {
|
||||||
* @borrows Zotero.Date.formatDate as this.formatDate
|
* @borrows Zotero.Date.formatDate as this.formatDate
|
||||||
* @borrows Zotero.Date.strToDate as this.strToDate
|
* @borrows Zotero.Date.strToDate as this.strToDate
|
||||||
* @borrows Zotero.Date.strToISO as this.strToISO
|
* @borrows Zotero.Date.strToISO as this.strToISO
|
||||||
* @borrows Zotero.OpenURL.lookupContextObject as this.lookupContextObject
|
* @borrows Zotero.OpenURL.createContextObject as this.createContextObject
|
||||||
* @borrows Zotero.OpenURL.parseContextObject as this.parseContextObject
|
* @borrows Zotero.OpenURL.parseContextObject as this.parseContextObject
|
||||||
* @borrows Zotero.Utilities.HTTP.processDocuments as this.processDocuments
|
* @borrows Zotero.Utilities.HTTP.processDocuments as this.processDocuments
|
||||||
* @borrows Zotero.Utilities.HTTP.doPost as this.doPost
|
* @borrows Zotero.Utilities.HTTP.doPost as this.doPost
|
||||||
|
@ -596,7 +596,7 @@ Zotero.Utilities.Translate.prototype.inArray = Zotero.inArray;
|
||||||
Zotero.Utilities.Translate.prototype.formatDate = Zotero.Date.formatDate;
|
Zotero.Utilities.Translate.prototype.formatDate = Zotero.Date.formatDate;
|
||||||
Zotero.Utilities.Translate.prototype.strToDate = Zotero.Date.strToDate;
|
Zotero.Utilities.Translate.prototype.strToDate = Zotero.Date.strToDate;
|
||||||
Zotero.Utilities.Translate.prototype.strToISO = Zotero.Date.strToISO;
|
Zotero.Utilities.Translate.prototype.strToISO = Zotero.Date.strToISO;
|
||||||
Zotero.Utilities.Translate.prototype.lookupContextObject = Zotero.OpenURL.lookupContextObject;
|
Zotero.Utilities.Translate.prototype.createContextObject = Zotero.OpenURL.createContextObject;
|
||||||
Zotero.Utilities.Translate.prototype.parseContextObject = Zotero.OpenURL.parseContextObject;
|
Zotero.Utilities.Translate.prototype.parseContextObject = Zotero.OpenURL.parseContextObject;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -18,11 +18,17 @@ function detectSearch(item) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Converts a name that is written entirely in upper case to title case.
 * Strings that contain at least one lower-case character are returned unchanged.
 *
 * A character is capitalized when it is the first character of the string or
 * follows a separator (whitespace, hyphen, apostrophe, period, ...). Non-ASCII
 * letters count as letters rather than separators, so "MÜLLER" becomes "Müller";
 * the ASCII-only \b word boundary would produce "MüLler".
 *
 * @param {String} string Name (or name part) to normalize
 * @return {String} The normalized name
 */
function fixAuthorCapitalization(string) {
	if(string.toUpperCase() == string) {
		// "Letter" here means a-z or any code unit from U+00C0 up, excluding the
		// U+2000-U+2BFF punctuation/symbol range; a separator is anything that is
		// neither a letter nor an ASCII word character. For pure-ASCII input this
		// matches exactly the positions that /\b[a-z]/ matches.
		string = string.toLowerCase().replace(
			/(^|[^\w\u00C0-\u1FFF\u2C00-\uFFFF])([a-z\u00C0-\u1FFF\u2C00-\uFFFF])/g,
			function(m, boundary, letter) { return boundary + letter.toUpperCase(); }
		);
	}
	return string;
}
|
||||||
|
|
||||||
function processCrossRef(xmlOutput) {
|
function processCrossRef(xmlOutput) {
|
||||||
xmlOutput = xmlOutput.replace(/<\?xml[^>]*\?>/, "");
|
xmlOutput = xmlOutput.replace(/<\?xml[^>]*\?>/, "");
|
||||||
|
|
||||||
// parse XML with E4X
|
// parse XML with E4X
|
||||||
var qr = new Namespace("http://www.crossref.org/qrschema/2.0");
|
|
||||||
try {
|
try {
|
||||||
var xml = new XML(xmlOutput);
|
var xml = new XML(xmlOutput);
|
||||||
} catch(e) {
|
} catch(e) {
|
||||||
|
@ -30,41 +36,87 @@ function processCrossRef(xmlOutput) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// ensure status is valid
|
// ensure status is valid
|
||||||
var status = xml.qr::query_result.qr::body.qr::query.@status.toString();
|
if(!xml.doi_record.length()) return false;
|
||||||
if(status != "resolved" && status != "multiresolved") {
|
if(xml.doi_record[0].crossref.journal.length()) {
|
||||||
return false;
|
var item = new Zotero.Item("journalArticle");
|
||||||
|
var itemXML = xml.doi_record.crossref.journal;
|
||||||
|
var refXML = itemXML.journal_article;
|
||||||
|
var metadataXML = itemXML.journal_metadata;
|
||||||
|
|
||||||
|
item.ISSN = itemXML.journal_metadata.issn.toString();
|
||||||
|
item.publicationTitle = itemXML.journal_metadata.full_title.toString();
|
||||||
|
item.journalAbbreviation = itemXML.journal_metadata.abbrev_title.toString();
|
||||||
|
item.volume = itemXML.journal_issue.journal_volume.volume.toString();
|
||||||
|
item.issue = itemXML.journal_issue.issue.toString();
|
||||||
|
} else if(xml.doi_record[0].crossref.book.length()) {
|
||||||
|
var item = new Zotero.Item("book");
|
||||||
|
var refXML = xml.doi_record[0].crossref.book.book_metadata;
|
||||||
|
var metadataXML = refXML;
|
||||||
|
var seriesXML = metadataXML.series_metadata;
|
||||||
|
|
||||||
|
item.place = metadataXML.publisher.publisher_place.toString();
|
||||||
|
} else if(xml.doi_record[0].crossref.conference.length()) {
|
||||||
|
var item = new Zotero.Item("conferencePaper");
|
||||||
|
var itemXML = xml.doi_record[0].crossref.conference;
|
||||||
|
var refXML = itemXML.conference_paper;
|
||||||
|
var metadataXML = itemXML.proceedingsMetadata;
|
||||||
|
var seriesXML = metadataXML.series_metadata;
|
||||||
|
|
||||||
|
item.publicationTitle = itemXML.proceedings_metadata.proceedings_title.toString();
|
||||||
|
item.place = itemXML.event_metadata.conference_location.toString();
|
||||||
|
item.conferenceName = itemXML.event_metadata.conference_name.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
var query = xml.qr::query_result.qr::body.qr::query;
|
var contributors = refXML.contributors.children();
|
||||||
var item = new Zotero.Item("journalArticle");
|
|
||||||
|
|
||||||
// try to get a DOI
|
if(metadataXML.isbn.length()) item.ISBN = metadataXML.isbn[0].toString();
|
||||||
item.DOI = query.qr::doi.(@type=="journal_article").text().toString();
|
if(metadataXML.issn.length()) item.ISSN = metadataXML.issn[0].toString();
|
||||||
if(!item.DOI) {
|
item.publisher = metadataXML.publisher.publisher_name.toString();
|
||||||
item.DOI = query.qr::doi.(@type=="book_title").text().toString();
|
item.edition = metadataXML.edition_number.toString();
|
||||||
}
|
if(!item.volume) item.volume = metadataXML.volume.toString();
|
||||||
if(!item.DOI) {
|
|
||||||
item.DOI = query.qr::doi.(@type=="book_content").text().toString();
|
if(seriesXML && seriesXML.length()) {
|
||||||
|
if(seriesXML.contributors.length()) {
|
||||||
|
contributors += seriesXML.contributors.children();
|
||||||
|
}
|
||||||
|
item.seriesNumber = seriesXML.series_number.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
// try to get an ISSN (no print/electronic preferences)
|
for each(var creatorXML in contributors) {
|
||||||
item.ISSN = query.qr::issn[0].text().toString();
|
var creator = {creatorType:"author"};
|
||||||
// get title
|
if(creatorXML.contributor_role == "editor") {
|
||||||
item.title = query.qr::article_title.text().toString();
|
creator.creatorType = "editor";
|
||||||
// get publicationTitle
|
} else if(creatorXML.contributor_role == "translator") {
|
||||||
item.publicationTitle = query.qr::journal_title.text().toString();
|
creator.creatorType = "translator";
|
||||||
// get author
|
} else if(creatorXML.contributor_role == "chair") {
|
||||||
item.creators.push(Zotero.Utilities.cleanAuthor(query.qr::author.text().toString(), "author", true));
|
creator.creatorType = "contributor";
|
||||||
// get volume
|
}
|
||||||
item.volume = query.qr::volume.text().toString();
|
|
||||||
// get issue
|
if(creatorXML.localName() == "organization") {
|
||||||
item.issue = query.qr::issue.text().toString();
|
creator.fieldMode = 1;
|
||||||
// get year
|
creator.lastName = creatorXML.toString();
|
||||||
item.date = query.qr::year.text().toString();
|
} else if(creatorXML.localName() == "person_name") {
|
||||||
// get edition
|
creator.firstName = fixAuthorCapitalization(creatorXML.given_name.toString());
|
||||||
item.edition = query.qr::edition_number.text().toString();
|
creator.lastName = fixAuthorCapitalization(creatorXML.surname.toString());
|
||||||
// get first page
|
}
|
||||||
item.pages = query.qr::first_page.text().toString();
|
item.creators.push(creator);
|
||||||
|
}
|
||||||
|
|
||||||
|
item.date = refXML.publication_date.year.toString();
|
||||||
|
if(refXML.publication_date.month.length()) {
|
||||||
|
item.date = refXML.publication_date.month.toString()+"/"+item.date;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(refXML.pages.length()) {
|
||||||
|
item.pages = refXML.pages.first_page.toString();
|
||||||
|
if(refXML.pages.last_page.length()) {
|
||||||
|
item.pages += "-"+refXML.pages.last_page.toString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
item.DOI = refXML.doi_data.doi.toString();
|
||||||
|
item.url = refXML.doi_data.resource.toString();
|
||||||
|
item.title = refXML.titles.title.toString();
|
||||||
|
|
||||||
item.complete();
|
item.complete();
|
||||||
return true;
|
return true;
|
||||||
|
@ -80,7 +132,7 @@ function doSearch(item) {
|
||||||
var co = Zotero.Utilities.createContextObject(item);
|
var co = Zotero.Utilities.createContextObject(item);
|
||||||
}
|
}
|
||||||
|
|
||||||
Zotero.Utilities.HTTP.doGet("http://www.crossref.org/openurl?req_dat=zter:zter321&"+co+"&noredirect=true", function(responseText) {
|
Zotero.Utilities.HTTP.doGet("http://www.crossref.org/openurl?req_dat=zter:zter321&"+co+"&noredirect=true&format=unixref", function(responseText) {
|
||||||
processCrossRef(responseText);
|
processCrossRef(responseText);
|
||||||
Zotero.done();
|
Zotero.done();
|
||||||
});
|
});
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue