Extract ISBNs and DOIs from EPUB content (#64)

And move EPUB functionality to class.
2023-08-06 17:52:26 -04:00 · 2023-08-06 17:52:26 -04:00 · 2ef560f7d8
commit 2ef560f7d8
parent cab0fa93e7
8 changed files with 365 additions and 88 deletions
--- a/chrome/content/zotero/xpcom/epub.js
+++ b/chrome/content/zotero/xpcom/epub.js
@ -23,16 +23,43 @@
    ***** END LICENSE BLOCK *****
 */
 var EXPORTED_SYMBOLS = ["EPUB"];
 const { XPCOMUtils } = ChromeUtils.import("resource://gre/modules/XPCOMUtils.jsm");
 XPCOMUtils.defineLazyModuleGetters(this, {
 	Zotero: "chrome://zotero/content/include.jsm"
 });
 const ZipReader = Components.Constructor(
 	"@mozilla.org/libjar/zip-reader;1",
 	"nsIZipReader",
 	"open"
 );
-Zotero.EPUB = {
+const DC_NS = 'http://purl.org/dc/elements/1.1/';
-	async* getSectionDocuments(epubPath) {
+const OPF_NS = 'http://www.idpf.org/2007/opf';
-		let zipReader = new ZipReader(Zotero.File.pathToFile(epubPath));
+
-		let contentOPFDoc = await this._getContentOPF(zipReader);
+class EPUB {
 	_zipReader;
 	_contentOPF = null;
 	_contentOPFPath = null;
 	/**
 	 * @param {String | nsIFile} file
 	 */
 	constructor(file) {
 		this._zipReader = new ZipReader(Zotero.File.pathToFile(file));
 	}
 	close() {
 		this._zipReader.close();
 	}
 	async* getSectionDocuments() {
 		let contentOPFDoc = await this._getContentOPF();
 		let manifest = contentOPFDoc.documentElement.querySelector(':scope > manifest');
 		let spine = contentOPFDoc.documentElement.querySelector(':scope > spine');
 		if (!manifest || !spine) {
@ -46,35 +73,53 @@ Zotero.EPUB = {
 					|| manifestItem.getAttribute('media-type') !== 'application/xhtml+xml') {
 				continue;
 			}
-			idToHref.set(manifestItem.getAttribute('id'), manifestItem.getAttribute('href'));
+			let href = manifestItem.getAttribute('href');
 			href = this._resolveRelativeToContentOPF(href);
 			idToHref.set(manifestItem.getAttribute('id'), href);
 		}
 		for (let spineItem of spine.querySelectorAll('itemref')) {
 			let id = spineItem.getAttribute('idref');
 			let href = idToHref.get(id);
-			if (!href || !zipReader.hasEntry(href)) {
+			if (!href || !this._zipReader.hasEntry(href)) {
 				Zotero.debug('EPUB: Skipping missing or invalid href in spine: ' + href);
 				continue;
 			}
-			let entryStream = zipReader.getInputStream(href);
+			let doc = await this._parseEntryToDocument(href, 'application/xhtml+xml');
-			let doc;
+			yield {
-			try {
+				href,
-				doc = await this._parseStreamToDocument(entryStream, 'application/xhtml+xml');
+				doc
-			}
+			};
 			finally {
 				entryStream.close();
 			}
 			yield { href, doc };
 		}
-	},
+	}
-	async getMetadataRDF(epubPath) {
 	/**
 	 * Look up the content.opf <guide> reference with the given type and parse the zip entry it points to.
 	 *
 	 * @param {String} referenceType - Value matched against the reference's `type` attribute
 	 * @return {Promise<Document | null>} The entry parsed as application/xhtml+xml, or null if the guide,
 	 *     the reference, its href, or the zip entry is missing
 	 */
+	async getDocumentByReferenceType(referenceType) {
-		const DC_NS = 'http://purl.org/dc/elements/1.1/';
+		let contentOPFDoc = await this._getContentOPF();
-		const OPF_NS = 'http://www.idpf.org/2007/opf';
+		let guide = contentOPFDoc.documentElement.querySelector(':scope > guide');
 		if (!guide) {
 			return null;
 		}
-		let zipReader = new ZipReader(Zotero.File.pathToFile(epubPath));
+		let reference = guide.querySelector(`:scope > reference[type="${referenceType}"]`);
-		let doc = await this._getContentOPF(zipReader);
+		if (!reference) {
 			return null;
 		}
 		// Keep only the part of the href before any '#'
 		let href = reference.getAttribute('href')
 			?.split('#')[0];
 		if (!href) {
 			return null;
 		}
 		// hrefs are relative to content.opf; convert to a zip entry name before the lookup
 		href = this._resolveRelativeToContentOPF(href);
 		if (!this._zipReader.hasEntry(href)) {
 			return null;
 		}
 		return this._parseEntryToDocument(href, 'application/xhtml+xml');
 	}
 	async getMetadataRDF() {
 		let doc = await this._getContentOPF();
 		let metadata = doc.documentElement.querySelector(':scope > metadata');
 		metadata = metadata.cloneNode(true);
 		if (!metadata.getAttribute('xmlns')) {
 			metadata.setAttribute('xmlns', doc.documentElement.namespaceURI || '');
@ -98,38 +143,50 @@ Zotero.EPUB = {
 		}
 		return new XMLSerializer().serializeToString(metadata);
-	},
+	}
 	/**
 	 * Load the package document named by the <rootfile full-path="..."> in META-INF/container.xml.
 	 * The parsed document and its path are stored on the instance; once set, later calls
 	 * return the stored document without reading the archive again.
 	 *
 	 * @return {Promise<XMLDocument>}
 	 * @throws {Error} If container.xml is missing or has no <rootfile full-path="...">
 	 */
-	async _getContentOPF(zipReader) {
+	async _getContentOPF() {
-		if (!zipReader.hasEntry('META-INF/container.xml')) {
+		if (this._contentOPF) {
 			return this._contentOPF;
 		}
 		if (!this._zipReader.hasEntry('META-INF/container.xml')) {
 			throw new Error('EPUB file does not contain container.xml');
 		}
-		let containerXMLStream = zipReader.getInputStream('META-INF/container.xml');
+		let containerXMLDoc = await this._parseEntryToDocument('META-INF/container.xml', 'text/xml');
 		// NOTE(review): the next two lines look like leftovers of the stream-based version in this diff view — confirm
 		let containerXMLDoc = await this._parseStreamToDocument(containerXMLStream, 'text/xml');
 		containerXMLStream.close();
 		let rootFile = containerXMLDoc.documentElement.querySelector(':scope > rootfiles > rootfile');
 		if (!rootFile || !rootFile.hasAttribute('full-path')) {
 			throw new Error('container.xml does not contain <rootfile full-path="...">');
 		}
-		let contentOPFStream = zipReader.getInputStream(rootFile.getAttribute('full-path'));
+		this._contentOPFPath = rootFile.getAttribute('full-path');
 		this._contentOPF = await this._parseEntryToDocument(this._contentOPFPath, 'text/xml');
 		return this._contentOPF;
 	}
 	_resolveRelativeToContentOPF(path) {
 		if (!this._contentOPFPath) {
 			throw new Error('content.opf not loaded');
 		}
 		// Use the URL class with a phony zip: scheme to resolve relative paths in a non-platform-defined way
 		return new URL(path, 'zip:/' + this._contentOPFPath).pathname.substring(1);
 	}
 	/**
 	 * Read the contents of a zip entry, closing the entry's input stream whether or not the read succeeds.
 	 *
 	 * NOTE(review): in this diff view the method's closing lines are interleaved with removed
 	 * stream-based code — confirm where the parsed document is returned.
 	 *
 	 * @param {String} entry - Zip entry name passed to getInputStream()
 	 * @param {String} type - MIME type; callers in this file pass 'text/xml' or 'application/xhtml+xml'
 	 */
 	async _parseEntryToDocument(entry, type) {
 		let parser = new DOMParser();
 		let stream = this._zipReader.getInputStream(entry);
 		let xml;
 		try {
-			return await this._parseStreamToDocument(contentOPFStream, 'text/xml');
+			xml = await Zotero.File.getContentsAsync(stream);
 		}
 		finally {
-			contentOPFStream.close();
+			stream.close();
 		}
 	},
 	async _parseStreamToDocument(stream, type) {
 		let parser = new DOMParser();
 		let xml = await Zotero.File.getContentsAsync(stream);
 		return parser.parseFromString(xml, type);
 	}
-};
+}
--- a/chrome/content/zotero/xpcom/fulltext.js
+++ b/chrome/content/zotero/xpcom/fulltext.js
@ -409,13 +409,16 @@ Zotero.Fulltext = Zotero.FullText = new function(){
 	 * @return {Promise}
 	 */
 	this.indexEPUB = async function (filePath, itemID, allText) {
 		const { EPUB } = ChromeUtils.import('chrome://zotero/content/EPUB.jsm');
 		let maxLength = Zotero.Prefs.get('fulltext.textMaxLength');
 		let item = await Zotero.Items.getAsync(itemID);
 		let epub = new EPUB(filePath);
 		try {
 			let text = '';
 			let totalChars = 0;
-			for await (let { href, doc } of Zotero.EPUB.getSectionDocuments(filePath)) {
+			for await (let { href, doc } of epub.getSectionDocuments(filePath)) {
 				if (!doc.body) {
 					Zotero.debug(`Skipping EPUB entry '${href}' with no body`);
 					continue;
@ -437,6 +440,9 @@ Zotero.Fulltext = Zotero.FullText = new function(){
 			Zotero.logError(e);
 			return false;
 		}
 		finally {
 			epub.close();
 		}
 	};
--- a/chrome/content/zotero/xpcom/progressQueueDialog.js
+++ b/chrome/content/zotero/xpcom/progressQueueDialog.js
@ -133,7 +133,12 @@ Zotero.ProgressQueueDialog = function (progressQueue) {
 		if (!_progressWindow) return;
 		let total = _progressQueue.getTotal();
 		let processed = _progressQueue.getProcessedTotal();
-		_progressIndicator.value = processed * 100 / total;
+		if (total === 0) {
 			_progressIndicator.value = 0;
 		}
 		else {
 			_progressIndicator.value = processed * 100 / total;
 		}
 		if (processed === total) {
 			_progressWindow.document.getElementById("cancel-button").hidden = true;
 			_progressWindow.document.getElementById("minimize-button").hidden = true;
--- a/chrome/content/zotero/xpcom/recognizeDocument.js
+++ b/chrome/content/zotero/xpcom/recognizeDocument.js
@ -27,6 +27,7 @@ Zotero.RecognizeDocument = new function () {
 	const OFFLINE_RECHECK_DELAY = 60 * 1000;
 	const MAX_PAGES = 5;
 	const UNRECOGNIZE_TIMEOUT = 86400 * 1000;
 	const EPUB_MAX_SECTIONS = 5;
 	let _newItems = new WeakMap();
@ -572,45 +573,74 @@ Zotero.RecognizeDocument = new function () {
 	}
 	async function _recognizeEPUB(item, filePath) {
-		let metadata = await Zotero.EPUB.getMetadataRDF(filePath);
+		const { EPUB } = ChromeUtils.import('chrome://zotero/content/EPUB.jsm');
 		if (!metadata) {
 			throw new Zotero.Exception.Alert("recognizePDF.couldNotRead");
 		}
 		let libraryID = item.libraryID;
 		let translate = new Zotero.Translate.Import();
 		translate.setTranslator(Zotero.Translators.TRANSLATOR_ID_RDF);
 		translate.setString(metadata);
 		let epub = new EPUB(filePath);
 		try {
-			let [rdfItemJSON] = await translate.translate({
+			let search = {};
 				libraryID: false,
 				saveAttachments: false
 			});
-			let itemJSON = rdfItemJSON;
+			let rdfItemJSON = await _translateEPUBMetadata(epub);
-			let isbn = Zotero.Utilities.cleanISBN(rdfItemJSON.ISBN || '');
+			if (rdfItemJSON && rdfItemJSON.ISBN) {
-			if (isbn) {
+				let clean = rdfItemJSON.ISBN.split(' ')
 					.map(isbn => Zotero.Utilities.cleanISBN(isbn))
 					.filter(Boolean);
 				if (clean.length) {
 					Zotero.debug('RecognizeEPUB: Found ISBN in RDF metadata');
 					search.ISBN = clean.join(' ');
 				}
 			}
 			for await (let doc of _getFirstSectionDocuments(epub)) {
 				if (search.DOI && search.ISBN) break;
 				if (!search.DOI) {
 					let dois = _getDOIsFromDocument(doc);
 					if (dois.length) {
 						Zotero.debug('RecognizeEPUB: Found DOI in section document');
 						search.DOI = dois[0];
 					}
 				}
 				if (!search.ISBN) {
 					let isbn = _getISBNFromDocument(doc);
 					if (isbn) {
 						Zotero.debug('RecognizeEPUB: Found ISBN in section document');
 						search.ISBN = isbn;
 					}
 				}
 			}
 			let itemJSON;
 			if (search.ISBN || search.DOI) {
 				try {
-					translate = new Zotero.Translate.Search();
+					Zotero.debug('RecognizeEPUB: Searching by ' + Object.keys(search)
-					translate.setSearch({ ISBN: isbn });
+						.join(', '));
-					let [isbnItemJSON] = await translate.translate({
+					let translate = new Zotero.Translate.Search();
 					translate.setSearch(search);
 					let [searchItemJSON] = await translate.translate({
 						libraryID: false,
 						saveAttachments: false
 					});
-					if (isbnItemJSON?.ISBN?.split(' ')
+					if (searchItemJSON) {
 						if (search.ISBN && searchItemJSON?.ISBN?.split(' ')
 							.map(resolvedISBN => Zotero.Utilities.cleanISBN(resolvedISBN))
-							.includes(isbn)) {
+							.includes(search.ISBN)) {
-						itemJSON = isbnItemJSON;
+							Zotero.debug('RecognizeDocument: Using ISBN search result');
 							itemJSON = searchItemJSON;
 						}
 						else {
 							Zotero.debug(`RecognizeDocument: ISBN mismatch (was ${search.ISBN}, got ${searchItemJSON.ISBN})`);
 						}
 					}
-					else if (isbnItemJSON) {
+				} catch (e) {
 						Zotero.debug(`RecognizeDocument: ISBN mismatch (was ${isbn}, got ${isbnItemJSON.ISBN})`);
 					}
 				}
 				catch (e) {
 					Zotero.debug('RecognizeDocument: Error while resolving ISBN: ' + e);
 				}
 			}
 			if (!itemJSON) {
 				Zotero.debug('RecognizeEPUB: Falling back to RDF metadata');
 				itemJSON = rdfItemJSON;
 			}
 			if (!itemJSON) {
 				throw new Zotero.Exception.Alert("recognizePDF.couldNotRead");
 			}
 			if (Zotero.Prefs.get('automaticTags')) {
 				itemJSON.tags = itemJSON.tags.map((tag) => {
@ -628,17 +658,105 @@ Zotero.RecognizeDocument = new function () {
 				itemJSON.tags = [];
 			}
-			let item = new Zotero.Item();
+			let translatedItem = new Zotero.Item();
-			item.libraryID = libraryID;
+			translatedItem.libraryID = item.libraryID;
-			item.fromJSON(itemJSON);
+			translatedItem.fromJSON(itemJSON);
-			await item.saveTx();
+			await translatedItem.saveTx();
-			return item;
+			return translatedItem;
 		}
-		catch (e) {
+		finally {
-			Zotero.debug('RecognizeDocument: ' + e);
+			epub.close();
 		}
 	}
 	/**
 	 * Run the EPUB's RDF metadata through the RDF import translator.
 	 *
 	 * @param {EPUB} epub - Object providing getMetadataRDF()
 	 * @return {Promise<Object | null>} First item returned by the translator, or null if there is
 	 *     no metadata or translation throws (the error is logged)
 	 */
 	async function _translateEPUBMetadata(epub) {
 		let metadata = await epub.getMetadataRDF();
 		if (!metadata) {
 			return null;
 		}
-		return null;
+		let translate = new Zotero.Translate.Import();
 		translate.setTranslator(Zotero.Translators.TRANSLATOR_ID_RDF);
 		translate.setString(metadata);
 		try {
 			// Only the first translated item is used
 			let [itemJSON] = await translate.translate({
 				libraryID: false,
 				saveAttachments: false
 			});
 			return itemJSON;
 		}
 		catch (e) {
 			// Best effort: a translation failure is logged and reported to the caller as "no item"
 			Zotero.logError(e);
 			return null;
 		}
 	}
 	/**
 	 * Yield the documents to scan for identifiers: the guide's 'copyright-page' document,
 	 * if there is one, followed by section documents until EPUB_MAX_SECTIONS have been yielded.
 	 *
 	 * @param {EPUB} epub - Object providing getDocumentByReferenceType() and getSectionDocuments()
 	 */
 	async function* _getFirstSectionDocuments(epub) {
 		const copyrightPage = await epub.getDocumentByReferenceType('copyright-page');
 		if (copyrightPage) yield copyrightPage;
 		let sectionsYielded = 0;
 		for await (const section of epub.getSectionDocuments()) {
 			yield section.doc;
 			sectionsYielded += 1;
 			// Stop pulling from the underlying generator once the cap is reached
 			if (sectionsYielded >= EPUB_MAX_SECTIONS) break;
 		}
 	}
 	/**
 	 * Collect DOI-looking strings from a document's text nodes and link targets.
 	 *
 	 * @param {Document} doc
 	 * @return {String[]} Unique DOIs in the order they were found (text nodes first, then links)
 	 */
 	function _getDOIsFromDocument(doc) {
 		// Copied from DOI translator
 		const doiPattern = /\b10\.[0-9]{4,}\/[^\s&"']*[^\s&"'.,]/g;
 		const skippedTags = ['script', 'style'];
 		const found = new Set();
 		// Drop a trailing ")" and then a trailing "}" when the DOI contains no matching opener
 		const trimUnbalanced = (candidate) => {
 			for (let [opener, closer] of [['(', ')'], ['{', '}']]) {
 				if (candidate.endsWith(closer) && !candidate.includes(opener)) {
 					candidate = candidate.slice(0, -1);
 				}
 			}
 			return candidate;
 		};
 		// Pass 1: every match in every text node outside <script>/<style>
 		const walker = doc.createTreeWalker(doc.documentElement, NodeFilter.SHOW_TEXT);
 		while (walker.nextNode()) {
 			let textNode = walker.currentNode;
 			if (skippedTags.includes(textNode.parentNode.tagName.toLowerCase())) continue;
 			for (let [match] of textNode.nodeValue.matchAll(doiPattern)) {
 				found.add(trimUnbalanced(match));
 			}
 		}
 		// Pass 2: the first match in each link target
 		for (let anchor of doc.querySelectorAll('a[href]')) {
 			doiPattern.lastIndex = 0;
 			let match = doiPattern.exec(anchor.href);
 			if (!match) continue;
 			let doi = trimUnbalanced(match[0]);
 			// only add new DOIs
 			if (found.has(doi) || found.has(doi.replace(/#.*/, ''))) continue;
 			found.add(doi);
 		}
 		return [...found];
 	}
 	/**
 	 * Extract an ISBN from the text of a document's body.
 	 *
 	 * @param {Document} doc
 	 * @return {String | null} Result of Zotero.Utilities.cleanISBN() on the body text, or null
 	 *     if there is no body or no ISBN was found
 	 */
 	function _getISBNFromDocument(doc) {
 		const { body } = doc;
 		if (body) {
 			const isbn = Zotero.Utilities.cleanISBN(body.innerText);
 			if (isbn) return isbn;
 		}
 		return null;
 	}
 	/**
--- a/components/zotero-service.js
+++ b/components/zotero-service.js
@ -102,7 +102,6 @@ const xpcomFilesLocal = [
 	'dictionaries',
 	'duplicates',
 	'editorInstance',
 	'epub',
 	'feedReader',
 	'fileDragDataProvider',
 	'fulltext',
--- a/test/tests/data/recognizeEPUB_test_content.epub
+++ b/test/tests/data/recognizeEPUB_test_content.epub
--- a/test/tests/data/recognizeEPUB_test_copyright_page.epub
+++ b/test/tests/data/recognizeEPUB_test_copyright_page.epub
--- a/test/tests/recognizeDocumentTest.js
+++ b/test/tests/recognizeDocumentTest.js
@ -296,9 +296,10 @@ describe("Document Recognition", function() {
 	describe("Ebooks", function () {
 		it("should recognize an EPUB by ISBN and rename the file", async function () {
 			let isbn = '9780656173822';
 			let search;
 			let translateStub = sinon.stub(Zotero.Translate.Search.prototype, 'translate')
 				.callsFake(async function () {
-					assert.equal(this.search.ISBN, isbn);
+					search = this.search;
 					return [{
 						itemType: 'book',
 						title: 'The Mania of the Nations on the Planet Mars: ISBN Database Edition',
@ -321,6 +322,8 @@ describe("Document Recognition", function() {
 			let addedIDs = await waitForItemEvent('add');
 			let modifiedIDs = await waitForItemEvent('modify');
 			assert.isTrue(translateStub.calledOnce);
 			assert.ok(search);
 			assert.equal(search.ISBN, isbn);
 			assert.lengthOf(addedIDs, 1);
 			let item = Zotero.Items.get(addedIDs[0]);
 			assert.equal(item.getField('title'), 'The Mania of the Nations on the Planet Mars: ISBN Database Edition');
@ -380,9 +383,10 @@ describe("Document Recognition", function() {
 		it("should use metadata from EPUB when search returns item with different ISBN", async function () {
 			let isbn = '9780656173822';
 			let isbnWrong = '9780656173823';
 			let search;
 			let translateStub = sinon.stub(Zotero.Translate.Search.prototype, 'translate')
 				.callsFake(async function () {
-					assert.equal(this.search.ISBN, isbn);
+					search = this.search;
 					return [{
 						itemType: 'book',
 						title: 'The Mania of the Nations on the Planet Mars: Bad Metadata Edition',
@ -405,6 +409,8 @@ describe("Document Recognition", function() {
 			let addedIDs = await waitForItemEvent('add');
 			let modifiedIDs = await waitForItemEvent('modify');
 			assert.isTrue(translateStub.calledOnce);
 			assert.ok(search);
 			assert.equal(search.ISBN, isbn);
 			assert.lengthOf(addedIDs, 1);
 			let item = Zotero.Items.get(addedIDs[0]);
 			assert.equal(item.getField('title'), 'The Mania of the Nations on the Planet Mars and its Terrific Consequences / A Combination of Fun and Wisdom');
@ -416,15 +422,57 @@ describe("Document Recognition", function() {
 		// The identifier search throws here, so the created item is expected to carry the EPUB's own metadata
 		it("should use metadata from EPUB when search fails", async function () {
 			let isbn = '9780656173822';
 			let search = null;
 			// Stub the search translator: record the search it was given, then fail
 			let translateStub = sinon.stub(Zotero.Translate.Search.prototype, 'translate')
 				.callsFake(async function () {
-					assert.equal(this.search.ISBN, isbn);
+					search = this.search;
 					throw new Error('simulated failure');
 				});
 			let testDir = getTestDataDirectory();
 			testDir.append('recognizeEPUB_test_ISBN.epub');
 			let collection = await createDataObject('collection');
 			let attachment = await Zotero.Attachments.importFromFile({
 				file: testDir,
 				collections: [collection.id]
 			});
 			await win.ZoteroPane.selectItem(attachment.id); // No idea why this is necessary for only this test
 			win.ZoteroPane.recognizeSelected();
 			let addedIDs = await waitForItemEvent('add');
 			let modifiedIDs = await waitForItemEvent('modify');
 			// The search was attempted exactly once, using the expected ISBN
 			assert.isTrue(translateStub.calledOnce);
 			assert.ok(search);
 			assert.equal(search.ISBN, isbn);
 			assert.lengthOf(addedIDs, 1);
 			let item = Zotero.Items.get(addedIDs[0]);
 			assert.equal(item.getField('title'), 'The Mania of the Nations on the Planet Mars and its Terrific Consequences / A Combination of Fun and Wisdom');
 			assert.equal(Zotero.Utilities.cleanISBN(item.getField('ISBN')), isbn);
 			assert.lengthOf(modifiedIDs, 2);
 			translateStub.restore();
 		});
 		it("should find and search by ISBN and DOI in section marked as copyright page", async function () {
 			let isbn = '9780226300481';
 			let doi = '10.7208/chicago/9780226300658.001.0001';
 			let search = null;
 			let translateStub = sinon.stub(Zotero.Translate.Search.prototype, 'translate')
 				.callsFake(async function () {
 					search = this.search;
 					return [{
 						itemType: 'book',
 						title: 'Building the American Republic, Volume 1, Library Catalog Edition',
 						ISBN: isbn,
 						attachments: [],
 						tags: []
 					}];
 				});
 			let testDir = getTestDataDirectory();
 			testDir.append('recognizeEPUB_test_copyright_page.epub');
 			let collection = await createDataObject('collection');
 			await Zotero.Attachments.importFromFile({
 				file: testDir,
 				collections: [collection.id]
@ -435,9 +483,53 @@ describe("Document Recognition", function() {
 			let addedIDs = await waitForItemEvent('add');
 			let modifiedIDs = await waitForItemEvent('modify');
 			assert.isTrue(translateStub.calledOnce);
 			assert.ok(search);
 			assert.equal(search.ISBN, isbn);
 			assert.equal(search.DOI, doi);
 			assert.lengthOf(addedIDs, 1);
 			let item = Zotero.Items.get(addedIDs[0]);
-			assert.equal(item.getField('title'), 'The Mania of the Nations on the Planet Mars and its Terrific Consequences / A Combination of Fun and Wisdom');
+			assert.equal(item.getField('title'), 'Building the American Republic, Volume 1, Library Catalog Edition');
 			assert.equal(Zotero.Utilities.cleanISBN(item.getField('ISBN')), isbn);
 			assert.lengthOf(modifiedIDs, 2);
 			translateStub.restore();
 		});
 		it("should find and search by ISBN and DOI in section not marked as copyright page", async function () {
 			let isbn = '9780226300481';
 			let doi = '10.7208/chicago/9780226300658.001.0001';
 			let search = null;
 			let translateStub = sinon.stub(Zotero.Translate.Search.prototype, 'translate')
 				.callsFake(async function () {
 					search = this.search;
 					return [{
 						itemType: 'book',
 						title: 'Building the American Republic, Volume 1, Library Catalog Edition',
 						ISBN: isbn,
 						attachments: [],
 						tags: []
 					}];
 				});
 			let testDir = getTestDataDirectory();
 			testDir.append('recognizeEPUB_test_content.epub');
 			let collection = await createDataObject('collection');
 			await Zotero.Attachments.importFromFile({
 				file: testDir,
 				collections: [collection.id]
 			});
 			win.ZoteroPane.recognizeSelected();
 			let addedIDs = await waitForItemEvent('add');
 			let modifiedIDs = await waitForItemEvent('modify');
 			assert.isTrue(translateStub.calledOnce);
 			assert.ok(search);
 			assert.equal(search.ISBN, isbn);
 			assert.equal(search.DOI, doi);
 			assert.lengthOf(addedIDs, 1);
 			let item = Zotero.Items.get(addedIDs[0]);
 			assert.equal(item.getField('title'), 'Building the American Republic, Volume 1, Library Catalog Edition');
 			assert.equal(Zotero.Utilities.cleanISBN(item.getField('ISBN')), isbn);
 			assert.lengthOf(modifiedIDs, 2);