diff --git a/chrome/content/zotero/recognizePDF.js b/chrome/content/zotero/recognizePDF.js index c4d73b0a67..ec5e9879f7 100644 --- a/chrome/content/zotero/recognizePDF.js +++ b/chrome/content/zotero/recognizePDF.js @@ -26,6 +26,7 @@ const Zotero_RecognizePDF_SUCCESS_IMAGE = "chrome://zotero/skin/tick.png"; const Zotero_RecognizePDF_FAILURE_IMAGE = "chrome://zotero/skin/cross.png"; const Zotero_RecognizePDF_LOADING_IMAGE = "chrome://zotero/skin/indicator.gif"; +const DOIre = /\bdoi\: *([^\s]+)/i; /** * Front end for recognizing PDFs @@ -309,6 +310,16 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, callback, ca } } + inputStream.close(); + cacheFile.remove(false); + + // look for DOI + var allText = lines.join("\n"); + var m = DOIre.exec(allText); + if(m) { + this._DOI = m[1]; + } + // get (not quite) median length var lineLengthsLength = lineLengths.length; if(lineLengthsLength < 20) { @@ -328,9 +339,6 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, callback, ca this._startLine = this._iteration = 0; } - inputStream.close(); - cacheFile.remove(false); - if(lineLengthsLength >= 20) { this._queryGoogle(); } @@ -349,53 +357,67 @@ Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() { return; } this._iteration++; - - // take the relevant parts of some lines (exclude hyphenated word) - var queryStringWords = 0; + var queryString = ""; - while(queryStringWords < 25 && this._startLine < this._goodLines.length) { - var words = this._goodLines[this._startLine].split(/\s+/); - // get rid of first and last words - words.shift(); - words.pop(); - // make sure there are no long words (probably OCR mistakes) - var skipLine = false; - for(var i=0; i 20) { - skipLine = true; - break; - } - } - // add words to query - if(!skipLine && words.length) { - queryStringWords += words.length; - queryString += '"'+words.join(" ")+'" '; - } - this._startLine++; - } - Zotero.debug("RecognizePDF: Query string "+queryString); - - // pass query string to Google Scholar and translate - var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search"; - if(!this._hiddenBrowser) { - this._hiddenBrowser = Zotero.Browser.createHiddenBrowser(); - this._hiddenBrowser.docShell.allowImages = false; - } - var me = this; - var translate = new Zotero.Translate("web", true, false); - translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289"); - translate.setHandler("itemDone", function(translate, item) { - Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser); - me._callback(item); - }); - translate.setHandler("select", function(translate, items) { return me._selectItems(translate, items) }); - translate.setHandler("done", function(translate, success) { if(!success) me._queryGoogle(); }); - - this._hiddenBrowser.addEventListener("pageshow", function() { me._scrape(translate) }, true); - - this._hiddenBrowser.loadURIWithFlags(url, - Components.interfaces.nsIWebNavigation.LOAD_FLAGS_BYPASS_HISTORY, null, null, null); + if(this._DOI) { + // use CrossRef to look for DOI + translate = new Zotero.Translate("search", true, false); + translate.setTranslator("11645bd1-0420-45c1-badb-53fb41eeb753"); + var item = {"itemType":"journalArticle", "DOI":this._DOI}; + translate.setSearch(item); + translate.setHandler("itemDone", function(translate, item) { me._callback(item); }); + translate.setHandler("select", function(translate, items) { return me._selectItems(translate, items) }); + translate.setHandler("done", function(translate, success) { if(!success) me._queryGoogle(); }); + translate.translate(); + delete this._DOI; + } else { + // take the relevant parts of some lines (exclude hyphenated word) + var queryStringWords = 0; + while(queryStringWords < 25 && this._startLine < this._goodLines.length) { + var words = this._goodLines[this._startLine].split(/\s+/); + // get rid of first and last words + words.shift(); + words.pop(); + // make sure there are no long words (probably OCR mistakes) + var skipLine = false; + for(var i=0; i 20) { + skipLine = true; + break; + } + } + // add words to query + if(!skipLine && words.length) { + queryStringWords += words.length; + queryString += '"'+words.join(" ")+'" '; + } + this._startLine++; + } + + Zotero.debug("RecognizePDF: Query string "+queryString); + + // pass query string to Google Scholar and translate + var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search"; + if(!this._hiddenBrowser) { + this._hiddenBrowser = Zotero.Browser.createHiddenBrowser(); + this._hiddenBrowser.docShell.allowImages = false; + } + + var translate = new Zotero.Translate("web", true, false); + translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289"); + translate.setHandler("itemDone", function(translate, item) { + Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser); + me._callback(item); + }); + translate.setHandler("select", function(translate, items) { return me._selectItems(translate, items) }); + translate.setHandler("done", function(translate, success) { if(!success) me._queryGoogle(); }); + + this._hiddenBrowser.addEventListener("pageshow", function() { me._scrape(translate) }, true); + + this._hiddenBrowser.loadURIWithFlags(url, + Components.interfaces.nsIWebNavigation.LOAD_FLAGS_BYPASS_HISTORY, null, null, null); + } } /** diff --git a/chrome/content/zotero/xpcom/utilities.js b/chrome/content/zotero/xpcom/utilities.js index 2bf8ddb23a..4ca5ad5a99 100644 --- a/chrome/content/zotero/xpcom/utilities.js +++ b/chrome/content/zotero/xpcom/utilities.js @@ -581,7 +581,7 @@ Zotero.Utilities.prototype.processAsync = function (sets, callbacks, onDone) { * @borrows Zotero.Date.formatDate as this.formatDate * @borrows Zotero.Date.strToDate as this.strToDate * @borrows Zotero.Date.strToISO as this.strToISO - * @borrows Zotero.OpenURL.lookupContextObject as this.lookupContextObject + * @borrows Zotero.OpenURL.createContextObject as this.createContextObject * @borrows Zotero.OpenURL.parseContextObject as this.parseContextObject * @borrows Zotero.Utilities.HTTP.processDocuments as this.processDocuments * @borrows Zotero.Utilities.HTTP.doPost as this.doPost @@ -596,7 +596,7 @@ Zotero.Utilities.Translate.prototype.inArray = Zotero.inArray; Zotero.Utilities.Translate.prototype.formatDate = Zotero.Date.formatDate; Zotero.Utilities.Translate.prototype.strToDate = Zotero.Date.strToDate; Zotero.Utilities.Translate.prototype.strToISO = Zotero.Date.strToISO; -Zotero.Utilities.Translate.prototype.lookupContextObject = Zotero.OpenURL.lookupContextObject; +Zotero.Utilities.Translate.prototype.createContextObject = Zotero.OpenURL.createContextObject; Zotero.Utilities.Translate.prototype.parseContextObject = Zotero.OpenURL.parseContextObject; /** diff --git a/translators/CrossRef.js b/translators/CrossRef.js index 11b00e0095..7b66c9259c 100644 --- a/translators/CrossRef.js +++ b/translators/CrossRef.js @@ -18,11 +18,17 @@ function detectSearch(item) { return false; } +function fixAuthorCapitalization(string) { + if(string.toUpperCase() == string) { + string = string.toLowerCase().replace(/\b[a-z]/g, function(m) { return m[0].toUpperCase() }); + } + return string; +} + function processCrossRef(xmlOutput) { xmlOutput = xmlOutput.replace(/<\?xml[^>]*\?>/, ""); // parse XML with E4X - var qr = new Namespace("http://www.crossref.org/qrschema/2.0"); try { var xml = new XML(xmlOutput); } catch(e) { @@ -30,41 +36,87 @@ function processCrossRef(xmlOutput) { } // ensure status is valid - var status = xml.qr::query_result.qr::body.qr::query.@status.toString(); - if(status != "resolved" && status != "multiresolved") { - return false; + if(!xml.doi_record.length()) return false; + if(xml.doi_record[0].crossref.journal.length()) { + var item = new Zotero.Item("journalArticle"); + var itemXML = xml.doi_record.crossref.journal; + var refXML = itemXML.journal_article; + var metadataXML = itemXML.journal_metadata; + + item.ISSN = itemXML.journal_metadata.issn.toString(); + item.publicationTitle = itemXML.journal_metadata.full_title.toString(); + item.journalAbbreviation = itemXML.journal_metadata.abbrev_title.toString(); + item.volume = itemXML.journal_issue.journal_volume.volume.toString(); + item.issue = itemXML.journal_issue.issue.toString(); + } else if(xml.doi_record[0].crossref.book.length()) { + var item = new Zotero.Item("book"); + var refXML = xml.doi_record[0].crossref.book.book_metadata; + var metadataXML = refXML; + var seriesXML = metadataXML.series_metadata; + + item.place = metadataXML.publisher.publisher_place.toString(); + } else if(xml.doi_record[0].crossref.conference.length()) { + var item = new Zotero.Item("conferencePaper"); + var itemXML = xml.doi_record[0].crossref.conference; + var refXML = itemXML.conference_paper; + var metadataXML = itemXML.proceedingsMetadata; + var seriesXML = metadataXML.series_metadata; + + item.publicationTitle = itemXML.proceedings_metadata.proceedings_title.toString(); + item.place = itemXML.event_metadata.conference_location.toString(); + item.conferenceName = itemXML.event_metadata.conference_name.toString(); } - var query = xml.qr::query_result.qr::body.qr::query; - var item = new Zotero.Item("journalArticle"); + var contributors = refXML.contributors.children(); - // try to get a DOI - item.DOI = query.qr::doi.(@type=="journal_article").text().toString(); - if(!item.DOI) { - item.DOI = query.qr::doi.(@type=="book_title").text().toString(); - } - if(!item.DOI) { - item.DOI = query.qr::doi.(@type=="book_content").text().toString(); + if(metadataXML.isbn.length()) item.ISBN = metadataXML.isbn[0].toString(); + if(metadataXML.issn.length()) item.ISSN = metadataXML.issn[0].toString(); + item.publisher = metadataXML.publisher.publisher_name.toString(); + item.edition = metadataXML.edition_number.toString(); + if(!item.volume) item.volume = metadataXML.volume.toString(); + + if(seriesXML && seriesXML.length()) { + if(seriesXML.contributors.length()) { + contributors += seriesXML.contributors.children(); + } + item.seriesNumber = seriesXML.series_number.toString(); } - // try to get an ISSN (no print/electronic preferences) - item.ISSN = query.qr::issn[0].text().toString(); - // get title - item.title = query.qr::article_title.text().toString(); - // get publicationTitle - item.publicationTitle = query.qr::journal_title.text().toString(); - // get author - item.creators.push(Zotero.Utilities.cleanAuthor(query.qr::author.text().toString(), "author", true)); - // get volume - item.volume = query.qr::volume.text().toString(); - // get issue - item.issue = query.qr::issue.text().toString(); - // get year - item.date = query.qr::year.text().toString(); - // get edition - item.edition = query.qr::edition_number.text().toString(); - // get first page - item.pages = query.qr::first_page.text().toString(); + for each(var creatorXML in contributors) { + var creator = {creatorType:"author"}; + if(creatorXML.contributor_role == "editor") { + creator.creatorType = "editor"; + } else if(creatorXML.contributor_role == "translator") { + creator.creatorType = "translator"; + } else if(creatorXML.contributor_role == "chair") { + creator.creatorType = "contributor"; + } + + if(creatorXML.localName() == "organization") { + creator.fieldMode = 1; + creator.lastName = creatorXML.toString(); + } else if(creatorXML.localName() == "person_name") { + creator.firstName = fixAuthorCapitalization(creatorXML.given_name.toString()); + creator.lastName = fixAuthorCapitalization(creatorXML.surname.toString()); + } + item.creators.push(creator); + } + + item.date = refXML.publication_date.year.toString(); + if(refXML.publication_date.month.length()) { + item.date = refXML.publication_date.month.toString()+"/"+item.date; + } + + if(refXML.pages.length()) { + item.pages = refXML.pages.first_page.toString(); + if(refXML.pages.last_page.length()) { + item.pages += "-"+refXML.pages.last_page.toString(); + } + } + + item.DOI = refXML.doi_data.doi.toString(); + item.url = refXML.doi_data.resource.toString(); + item.title = refXML.titles.title.toString(); item.complete(); return true; @@ -80,7 +132,7 @@ function doSearch(item) { var co = Zotero.Utilities.createContextObject(item); } - Zotero.Utilities.HTTP.doGet("http://www.crossref.org/openurl?req_dat=zter:zter321&"+co+"&noredirect=true", function(responseText) { + Zotero.Utilities.HTTP.doGet("http://www.crossref.org/openurl?req_dat=zter:zter321&"+co+"&noredirect=true&format=unixref", function(responseText) { processCrossRef(responseText); Zotero.done(); });