Extract ISBNs and DOIs from EPUB content (#64)

Also move the EPUB functionality into a class.
This commit is contained in:
Abe Jellinek 2023-08-06 17:52:26 -04:00 committed by Dan Stillman
parent cab0fa93e7
commit 2ef560f7d8
8 changed files with 365 additions and 88 deletions

View file

@ -23,16 +23,43 @@
***** END LICENSE BLOCK *****
*/
var EXPORTED_SYMBOLS = ["EPUB"];
const { XPCOMUtils } = ChromeUtils.import("resource://gre/modules/XPCOMUtils.jsm");
XPCOMUtils.defineLazyModuleGetters(this, {
Zotero: "chrome://zotero/content/include.jsm"
});
const ZipReader = Components.Constructor(
"@mozilla.org/libjar/zip-reader;1",
"nsIZipReader",
"open"
);
Zotero.EPUB = {
async* getSectionDocuments(epubPath) {
let zipReader = new ZipReader(Zotero.File.pathToFile(epubPath));
let contentOPFDoc = await this._getContentOPF(zipReader);
const DC_NS = 'http://purl.org/dc/elements/1.1/';
const OPF_NS = 'http://www.idpf.org/2007/opf';
class EPUB {
_zipReader;
_contentOPF = null;
_contentOPFPath = null;
/**
* @param {String | nsIFile} file
*/
constructor(file) {
this._zipReader = new ZipReader(Zotero.File.pathToFile(file));
}
close() {
this._zipReader.close();
}
async* getSectionDocuments() {
let contentOPFDoc = await this._getContentOPF();
let manifest = contentOPFDoc.documentElement.querySelector(':scope > manifest');
let spine = contentOPFDoc.documentElement.querySelector(':scope > spine');
if (!manifest || !spine) {
@ -46,40 +73,58 @@ Zotero.EPUB = {
|| manifestItem.getAttribute('media-type') !== 'application/xhtml+xml') {
continue;
}
idToHref.set(manifestItem.getAttribute('id'), manifestItem.getAttribute('href'));
let href = manifestItem.getAttribute('href');
href = this._resolveRelativeToContentOPF(href);
idToHref.set(manifestItem.getAttribute('id'), href);
}
for (let spineItem of spine.querySelectorAll('itemref')) {
let id = spineItem.getAttribute('idref');
let href = idToHref.get(id);
if (!href || !zipReader.hasEntry(href)) {
if (!href || !this._zipReader.hasEntry(href)) {
Zotero.debug('EPUB: Skipping missing or invalid href in spine: ' + href);
continue;
}
let entryStream = zipReader.getInputStream(href);
let doc;
try {
doc = await this._parseStreamToDocument(entryStream, 'application/xhtml+xml');
}
finally {
entryStream.close();
}
yield { href, doc };
let doc = await this._parseEntryToDocument(href, 'application/xhtml+xml');
yield {
href,
doc
};
}
},
async getMetadataRDF(epubPath) {
const DC_NS = 'http://purl.org/dc/elements/1.1/';
const OPF_NS = 'http://www.idpf.org/2007/opf';
let zipReader = new ZipReader(Zotero.File.pathToFile(epubPath));
let doc = await this._getContentOPF(zipReader);
}
async getDocumentByReferenceType(referenceType) {
let contentOPFDoc = await this._getContentOPF();
let guide = contentOPFDoc.documentElement.querySelector(':scope > guide');
if (!guide) {
return null;
}
let reference = guide.querySelector(`:scope > reference[type="${referenceType}"]`);
if (!reference) {
return null;
}
let href = reference.getAttribute('href')
?.split('#')[0];
if (!href) {
return null;
}
href = this._resolveRelativeToContentOPF(href);
if (!this._zipReader.hasEntry(href)) {
return null;
}
return this._parseEntryToDocument(href, 'application/xhtml+xml');
}
async getMetadataRDF() {
let doc = await this._getContentOPF();
let metadata = doc.documentElement.querySelector(':scope > metadata');
metadata = metadata.cloneNode(true);
if (!metadata.getAttribute('xmlns')) {
metadata.setAttribute('xmlns', doc.documentElement.namespaceURI || '');
}
for (let elem of metadata.querySelectorAll('*')) {
for (let attr of Array.from(elem.attributes)) {
// Null- and unknown-namespace attributes cause rdf.js to ignore the entire element
@ -89,47 +134,59 @@ Zotero.EPUB = {
}
}
}
// If the metadata doesn't contain a dc:type, add one
if (!metadata.getElementsByTagNameNS(DC_NS, 'type').length) {
let dcType = doc.createElementNS(DC_NS, 'type');
dcType.textContent = 'book';
metadata.appendChild(dcType);
}
return new XMLSerializer().serializeToString(metadata);
},
}
/**
* @param {ZipReader} zipReader
* @return {Promise<XMLDocument>}
*/
async _getContentOPF(zipReader) {
if (!zipReader.hasEntry('META-INF/container.xml')) {
async _getContentOPF() {
if (this._contentOPF) {
return this._contentOPF;
}
if (!this._zipReader.hasEntry('META-INF/container.xml')) {
throw new Error('EPUB file does not contain container.xml');
}
let containerXMLStream = zipReader.getInputStream('META-INF/container.xml');
let containerXMLDoc = await this._parseStreamToDocument(containerXMLStream, 'text/xml');
containerXMLStream.close();
let containerXMLDoc = await this._parseEntryToDocument('META-INF/container.xml', 'text/xml');
let rootFile = containerXMLDoc.documentElement.querySelector(':scope > rootfiles > rootfile');
if (!rootFile || !rootFile.hasAttribute('full-path')) {
throw new Error('container.xml does not contain <rootfile full-path="...">');
}
let contentOPFStream = zipReader.getInputStream(rootFile.getAttribute('full-path'));
this._contentOPFPath = rootFile.getAttribute('full-path');
this._contentOPF = await this._parseEntryToDocument(this._contentOPFPath, 'text/xml');
return this._contentOPF;
}
_resolveRelativeToContentOPF(path) {
if (!this._contentOPFPath) {
throw new Error('content.opf not loaded');
}
// Use the URL class with a phony zip: scheme to resolve relative paths in a non-platform-defined way
return new URL(path, 'zip:/' + this._contentOPFPath).pathname.substring(1);
}
async _parseEntryToDocument(entry, type) {
let parser = new DOMParser();
let stream = this._zipReader.getInputStream(entry);
let xml;
try {
return await this._parseStreamToDocument(contentOPFStream, 'text/xml');
xml = await Zotero.File.getContentsAsync(stream);
}
finally {
contentOPFStream.close();
stream.close();
}
},
async _parseStreamToDocument(stream, type) {
let parser = new DOMParser();
let xml = await Zotero.File.getContentsAsync(stream);
return parser.parseFromString(xml, type);
}
};
}

View file

@ -409,13 +409,16 @@ Zotero.Fulltext = Zotero.FullText = new function(){
* @return {Promise}
*/
this.indexEPUB = async function (filePath, itemID, allText) {
const { EPUB } = ChromeUtils.import('chrome://zotero/content/EPUB.jsm');
let maxLength = Zotero.Prefs.get('fulltext.textMaxLength');
let item = await Zotero.Items.getAsync(itemID);
let epub = new EPUB(filePath);
try {
let text = '';
let totalChars = 0;
for await (let { href, doc } of Zotero.EPUB.getSectionDocuments(filePath)) {
for await (let { href, doc } of epub.getSectionDocuments(filePath)) {
if (!doc.body) {
Zotero.debug(`Skipping EPUB entry '${href}' with no body`);
continue;
@ -437,6 +440,9 @@ Zotero.Fulltext = Zotero.FullText = new function(){
Zotero.logError(e);
return false;
}
finally {
epub.close();
}
};

View file

@ -133,7 +133,12 @@ Zotero.ProgressQueueDialog = function (progressQueue) {
if (!_progressWindow) return;
let total = _progressQueue.getTotal();
let processed = _progressQueue.getProcessedTotal();
_progressIndicator.value = processed * 100 / total;
if (total === 0) {
_progressIndicator.value = 0;
}
else {
_progressIndicator.value = processed * 100 / total;
}
if (processed === total) {
_progressWindow.document.getElementById("cancel-button").hidden = true;
_progressWindow.document.getElementById("minimize-button").hidden = true;

View file

@ -27,6 +27,7 @@ Zotero.RecognizeDocument = new function () {
const OFFLINE_RECHECK_DELAY = 60 * 1000;
const MAX_PAGES = 5;
const UNRECOGNIZE_TIMEOUT = 86400 * 1000;
const EPUB_MAX_SECTIONS = 5;
let _newItems = new WeakMap();
@ -572,45 +573,74 @@ Zotero.RecognizeDocument = new function () {
}
async function _recognizeEPUB(item, filePath) {
let metadata = await Zotero.EPUB.getMetadataRDF(filePath);
if (!metadata) {
throw new Zotero.Exception.Alert("recognizePDF.couldNotRead");
}
let libraryID = item.libraryID;
let translate = new Zotero.Translate.Import();
translate.setTranslator(Zotero.Translators.TRANSLATOR_ID_RDF);
translate.setString(metadata);
const { EPUB } = ChromeUtils.import('chrome://zotero/content/EPUB.jsm');
let epub = new EPUB(filePath);
try {
let [rdfItemJSON] = await translate.translate({
libraryID: false,
saveAttachments: false
});
let itemJSON = rdfItemJSON;
let isbn = Zotero.Utilities.cleanISBN(rdfItemJSON.ISBN || '');
if (isbn) {
let search = {};
let rdfItemJSON = await _translateEPUBMetadata(epub);
if (rdfItemJSON && rdfItemJSON.ISBN) {
let clean = rdfItemJSON.ISBN.split(' ')
.map(isbn => Zotero.Utilities.cleanISBN(isbn))
.filter(Boolean);
if (clean.length) {
Zotero.debug('RecognizeEPUB: Found ISBN in RDF metadata');
search.ISBN = clean.join(' ');
}
}
for await (let doc of _getFirstSectionDocuments(epub)) {
if (search.DOI && search.ISBN) break;
if (!search.DOI) {
let dois = _getDOIsFromDocument(doc);
if (dois.length) {
Zotero.debug('RecognizeEPUB: Found DOI in section document');
search.DOI = dois[0];
}
}
if (!search.ISBN) {
let isbn = _getISBNFromDocument(doc);
if (isbn) {
Zotero.debug('RecognizeEPUB: Found ISBN in section document');
search.ISBN = isbn;
}
}
}
let itemJSON;
if (search.ISBN || search.DOI) {
try {
translate = new Zotero.Translate.Search();
translate.setSearch({ ISBN: isbn });
let [isbnItemJSON] = await translate.translate({
Zotero.debug('RecognizeEPUB: Searching by ' + Object.keys(search)
.join(', '));
let translate = new Zotero.Translate.Search();
translate.setSearch(search);
let [searchItemJSON] = await translate.translate({
libraryID: false,
saveAttachments: false
});
if (isbnItemJSON?.ISBN?.split(' ')
if (searchItemJSON) {
if (search.ISBN && searchItemJSON?.ISBN?.split(' ')
.map(resolvedISBN => Zotero.Utilities.cleanISBN(resolvedISBN))
.includes(isbn)) {
itemJSON = isbnItemJSON;
.includes(search.ISBN)) {
Zotero.debug('RecognizeDocument: Using ISBN search result');
itemJSON = searchItemJSON;
}
else {
Zotero.debug(`RecognizeDocument: ISBN mismatch (was ${search.ISBN}, got ${searchItemJSON.ISBN})`);
}
}
else if (isbnItemJSON) {
Zotero.debug(`RecognizeDocument: ISBN mismatch (was ${isbn}, got ${isbnItemJSON.ISBN})`);
}
}
catch (e) {
} catch (e) {
Zotero.debug('RecognizeDocument: Error while resolving ISBN: ' + e);
}
}
if (!itemJSON) {
Zotero.debug('RecognizeEPUB: Falling back to RDF metadata');
itemJSON = rdfItemJSON;
}
if (!itemJSON) {
throw new Zotero.Exception.Alert("recognizePDF.couldNotRead");
}
if (Zotero.Prefs.get('automaticTags')) {
itemJSON.tags = itemJSON.tags.map((tag) => {
@ -628,17 +658,105 @@ Zotero.RecognizeDocument = new function () {
itemJSON.tags = [];
}
let item = new Zotero.Item();
item.libraryID = libraryID;
item.fromJSON(itemJSON);
await item.saveTx();
return item;
let translatedItem = new Zotero.Item();
translatedItem.libraryID = item.libraryID;
translatedItem.fromJSON(itemJSON);
await translatedItem.saveTx();
return translatedItem;
}
finally {
epub.close();
}
}
/**
 * Run the metadata returned by epub.getMetadataRDF() through the RDF import translator.
 *
 * @param {EPUB} epub
 * @return {Promise<Object | null>} The first item returned by the translator, or null if
 *     there is no metadata or translation throws (the error is logged, not rethrown)
 */
async function _translateEPUBMetadata(epub) {
	let rdf = await epub.getMetadataRDF();
	if (rdf) {
		let importer = new Zotero.Translate.Import();
		importer.setTranslator(Zotero.Translators.TRANSLATOR_ID_RDF);
		importer.setString(rdf);
		
		let translateOptions = {
			libraryID: false,
			saveAttachments: false
		};
		try {
			// Only the first translated item is of interest
			let [firstItem] = await importer.translate(translateOptions);
			return firstItem;
		}
		catch (err) {
			Zotero.debug('RecognizeDocument: ' + err);
			Zotero.logError(err);
		}
	}
	return null;
}
/**
 * Yield the documents to scan for identifiers: first the document referenced as
 * 'copyright-page' (when epub.getDocumentByReferenceType() returns one), then section
 * documents from epub.getSectionDocuments(), stopping after EPUB_MAX_SECTIONS of them.
 *
 * @param {EPUB} epub
 * @return {AsyncGenerator<Document>}
 */
async function* _getFirstSectionDocuments(epub) {
	let copyrightPage = await epub.getDocumentByReferenceType('copyright-page');
	if (copyrightPage) {
		yield copyrightPage;
	}
	
	let yielded = 0;
	for await (let section of epub.getSectionDocuments()) {
		yield section.doc;
		yielded += 1;
		// The copyright page above doesn't count toward the limit
		if (yielded >= EPUB_MAX_SECTIONS) {
			return;
		}
	}
}
/**
 * Collect DOI-like strings from a document's text nodes and from its link targets.
 *
 * Text inside <script> and <style> elements is ignored. A trailing ")" or "}" is dropped
 * when the match contains no corresponding opening bracket.
 *
 * @param {Document} doc
 * @return {String[]} Unique DOIs in order of discovery (text nodes first, then links);
 *     an empty array if none are found
 */
function _getDOIsFromDocument(doc) {
	// Copied from DOI translator
	const DOIre = /\b10\.[0-9]{4,}\/[^\s&"']*[^\s&"'.,]/g;
	const ignore = ['script', 'style'];
	
	// Strip a closing bracket that was swept up from the surrounding text, e.g. "(doi:10.1000/abc)"
	let trimUnbalanced = (match) => {
		let doi = match;
		if (doi.endsWith(")") && !doi.includes("(")) {
			doi = doi.substring(0, doi.length - 1);
		}
		if (doi.endsWith("}") && !doi.includes("{")) {
			doi = doi.substring(0, doi.length - 1);
		}
		return doi;
	};
	
	let dois = new Set();
	
	let treeWalker = doc.createTreeWalker(doc.documentElement, NodeFilter.SHOW_TEXT);
	while (treeWalker.nextNode()) {
		let node = treeWalker.currentNode;
		if (ignore.includes(node.parentNode.tagName.toLowerCase())) continue;
		
		// The regex is global, so reset its position before scanning a new string
		DOIre.lastIndex = 0;
		let m;
		while ((m = DOIre.exec(node.nodeValue))) {
			dois.add(trimUnbalanced(m[0]));
		}
	}
	
	for (let link of doc.querySelectorAll('a[href]')) {
		DOIre.lastIndex = 0;
		let m = DOIre.exec(link.href);
		if (!m) continue;
		
		let doi = trimUnbalanced(m[0]);
		// only add new DOIs
		if (!dois.has(doi) && !dois.has(doi.replace(/#.*/, ''))) {
			dois.add(doi);
		}
	}
	
	return Array.from(dois);
}
/**
 * Look for an ISBN in a document's body text.
 *
 * @param {Document} doc
 * @return {String | null} Whatever Zotero.Utilities.cleanISBN() returns for the body's
 *     innerText when that is truthy; null otherwise, or when the document has no body
 */
function _getISBNFromDocument(doc) {
	let body = doc.body;
	let isbn = body ? Zotero.Utilities.cleanISBN(body.innerText) : null;
	return isbn || null;
}
/**

View file

@ -102,7 +102,6 @@ const xpcomFilesLocal = [
'dictionaries',
'duplicates',
'editorInstance',
'epub',
'feedReader',
'fileDragDataProvider',
'fulltext',

Binary file not shown.

Binary file not shown.

View file

@ -296,9 +296,10 @@ describe("Document Recognition", function() {
describe("Ebooks", function () {
it("should recognize an EPUB by ISBN and rename the file", async function () {
let isbn = '9780656173822';
let search;
let translateStub = sinon.stub(Zotero.Translate.Search.prototype, 'translate')
.callsFake(async function () {
assert.equal(this.search.ISBN, isbn);
search = this.search;
return [{
itemType: 'book',
title: 'The Mania of the Nations on the Planet Mars: ISBN Database Edition',
@ -321,6 +322,8 @@ describe("Document Recognition", function() {
let addedIDs = await waitForItemEvent('add');
let modifiedIDs = await waitForItemEvent('modify');
assert.isTrue(translateStub.calledOnce);
assert.ok(search);
assert.equal(search.ISBN, isbn);
assert.lengthOf(addedIDs, 1);
let item = Zotero.Items.get(addedIDs[0]);
assert.equal(item.getField('title'), 'The Mania of the Nations on the Planet Mars: ISBN Database Edition');
@ -380,9 +383,10 @@ describe("Document Recognition", function() {
it("should use metadata from EPUB when search returns item with different ISBN", async function () {
let isbn = '9780656173822';
let isbnWrong = '9780656173823';
let search;
let translateStub = sinon.stub(Zotero.Translate.Search.prototype, 'translate')
.callsFake(async function () {
assert.equal(this.search.ISBN, isbn);
search = this.search;
return [{
itemType: 'book',
title: 'The Mania of the Nations on the Planet Mars: Bad Metadata Edition',
@ -405,6 +409,8 @@ describe("Document Recognition", function() {
let addedIDs = await waitForItemEvent('add');
let modifiedIDs = await waitForItemEvent('modify');
assert.isTrue(translateStub.calledOnce);
assert.ok(search);
assert.equal(search.ISBN, isbn);
assert.lengthOf(addedIDs, 1);
let item = Zotero.Items.get(addedIDs[0]);
assert.equal(item.getField('title'), 'The Mania of the Nations on the Planet Mars and its Terrific Consequences / A Combination of Fun and Wisdom');
@ -416,15 +422,57 @@ describe("Document Recognition", function() {
it("should use metadata from EPUB when search fails", async function () {
let isbn = '9780656173822';
let search = null;
let translateStub = sinon.stub(Zotero.Translate.Search.prototype, 'translate')
.callsFake(async function () {
assert.equal(this.search.ISBN, isbn);
search = this.search;
throw new Error('simulated failure');
});
let testDir = getTestDataDirectory();
testDir.append('recognizeEPUB_test_ISBN.epub');
let collection = await createDataObject('collection');
let attachment = await Zotero.Attachments.importFromFile({
file: testDir,
collections: [collection.id]
});
await win.ZoteroPane.selectItem(attachment.id); // No idea why this is necessary for only this test
win.ZoteroPane.recognizeSelected();
let addedIDs = await waitForItemEvent('add');
let modifiedIDs = await waitForItemEvent('modify');
assert.isTrue(translateStub.calledOnce);
assert.ok(search);
assert.equal(search.ISBN, isbn);
assert.lengthOf(addedIDs, 1);
let item = Zotero.Items.get(addedIDs[0]);
assert.equal(item.getField('title'), 'The Mania of the Nations on the Planet Mars and its Terrific Consequences / A Combination of Fun and Wisdom');
assert.equal(Zotero.Utilities.cleanISBN(item.getField('ISBN')), isbn);
assert.lengthOf(modifiedIDs, 2);
translateStub.restore();
});
it("should find and search by ISBN and DOI in section marked as copyright page", async function () {
let isbn = '9780226300481';
let doi = '10.7208/chicago/9780226300658.001.0001';
let search = null;
let translateStub = sinon.stub(Zotero.Translate.Search.prototype, 'translate')
.callsFake(async function () {
search = this.search;
return [{
itemType: 'book',
title: 'Building the American Republic, Volume 1, Library Catalog Edition',
ISBN: isbn,
attachments: [],
tags: []
}];
});
let testDir = getTestDataDirectory();
testDir.append('recognizeEPUB_test_copyright_page.epub');
let collection = await createDataObject('collection');
await Zotero.Attachments.importFromFile({
file: testDir,
collections: [collection.id]
@ -435,9 +483,53 @@ describe("Document Recognition", function() {
let addedIDs = await waitForItemEvent('add');
let modifiedIDs = await waitForItemEvent('modify');
assert.isTrue(translateStub.calledOnce);
assert.ok(search);
assert.equal(search.ISBN, isbn);
assert.equal(search.DOI, doi);
assert.lengthOf(addedIDs, 1);
let item = Zotero.Items.get(addedIDs[0]);
assert.equal(item.getField('title'), 'The Mania of the Nations on the Planet Mars and its Terrific Consequences / A Combination of Fun and Wisdom');
assert.equal(item.getField('title'), 'Building the American Republic, Volume 1, Library Catalog Edition');
assert.equal(Zotero.Utilities.cleanISBN(item.getField('ISBN')), isbn);
assert.lengthOf(modifiedIDs, 2);
translateStub.restore();
});
it("should find and search by ISBN and DOI in section not marked as copyright page", async function () {
let isbn = '9780226300481';
let doi = '10.7208/chicago/9780226300658.001.0001';
let search = null;
let translateStub = sinon.stub(Zotero.Translate.Search.prototype, 'translate')
.callsFake(async function () {
search = this.search;
return [{
itemType: 'book',
title: 'Building the American Republic, Volume 1, Library Catalog Edition',
ISBN: isbn,
attachments: [],
tags: []
}];
});
let testDir = getTestDataDirectory();
testDir.append('recognizeEPUB_test_content.epub');
let collection = await createDataObject('collection');
await Zotero.Attachments.importFromFile({
file: testDir,
collections: [collection.id]
});
win.ZoteroPane.recognizeSelected();
let addedIDs = await waitForItemEvent('add');
let modifiedIDs = await waitForItemEvent('modify');
assert.isTrue(translateStub.calledOnce);
assert.ok(search);
assert.equal(search.ISBN, isbn);
assert.equal(search.DOI, doi);
assert.lengthOf(addedIDs, 1);
let item = Zotero.Items.get(addedIDs[0]);
assert.equal(item.getField('title'), 'Building the American Republic, Volume 1, Library Catalog Edition');
assert.equal(Zotero.Utilities.cleanISBN(item.getField('ISBN')), isbn);
assert.lengthOf(modifiedIDs, 2);