fx-compat: Update full-text indexing
Use the new PageData mechanism for character set detection, don't try to index HTML files directly without properly detecting the charset, and generally simplify the indexing code. HTML files are now considered cached files that require indexing and won't be indexed automatically in Zotero.FullText.findTextInItems(), which breaks certain expectations, including in some tests. This will need to be addressed.
This commit is contained in:
parent
1dd24f7082
commit
13adfd131c
3 changed files with 163 additions and 308 deletions
|
@ -131,7 +131,7 @@ Zotero.Attachments = new function(){
|
||||||
await attachmentItem.save(saveOptions);
|
await attachmentItem.save(saveOptions);
|
||||||
}.bind(this));
|
}.bind(this));
|
||||||
try {
|
try {
|
||||||
yield _postProcessFile(attachmentItem, newFile, contentType);
|
yield _postProcessFile(attachmentItem);
|
||||||
}
|
}
|
||||||
catch (e) {
|
catch (e) {
|
||||||
Zotero.logError(e);
|
Zotero.logError(e);
|
||||||
|
@ -194,7 +194,7 @@ Zotero.Attachments = new function(){
|
||||||
saveOptions
|
saveOptions
|
||||||
});
|
});
|
||||||
try {
|
try {
|
||||||
yield _postProcessFile(item, file, contentType);
|
yield _postProcessFile(item);
|
||||||
}
|
}
|
||||||
catch (e) {
|
catch (e) {
|
||||||
Zotero.logError(e);
|
Zotero.logError(e);
|
||||||
|
@ -258,7 +258,7 @@ Zotero.Attachments = new function(){
|
||||||
var file = this.resolveRelativePath(path);
|
var file = this.resolveRelativePath(path);
|
||||||
if (file && await OS.File.exists(file)) {
|
if (file && await OS.File.exists(file)) {
|
||||||
try {
|
try {
|
||||||
await _postProcessFile(item, file, contentType);
|
await _postProcessFile(item);
|
||||||
}
|
}
|
||||||
catch (e) {
|
catch (e) {
|
||||||
Zotero.logError(e);
|
Zotero.logError(e);
|
||||||
|
@ -334,12 +334,7 @@ Zotero.Attachments = new function(){
|
||||||
}
|
}
|
||||||
}.bind(this));
|
}.bind(this));
|
||||||
try {
|
try {
|
||||||
yield _postProcessFile(
|
yield _postProcessFile(attachmentItem);
|
||||||
attachmentItem,
|
|
||||||
Zotero.File.pathToFile(newPath),
|
|
||||||
contentType,
|
|
||||||
charset
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
catch (e) {
|
catch (e) {
|
||||||
Zotero.logError(e);
|
Zotero.logError(e);
|
||||||
|
@ -2912,115 +2907,15 @@ Zotero.Attachments = new function(){
|
||||||
/**
|
/**
|
||||||
* If necessary/possible, detect the file charset and index the file
|
* If necessary/possible, detect the file charset and index the file
|
||||||
*
|
*
|
||||||
* Since we have to load the content into the browser to get the
|
* Since we have to load the content into the browser to get the character set, we create the
|
||||||
* character set (at least until we figure out a better way to get
|
* item above and update asynchronously after the fact
|
||||||
* at the native detectors), we create the item above and update
|
|
||||||
* asynchronously after the fact
|
|
||||||
*
|
*
|
||||||
* @return {Promise}
|
* @return {Promise}
|
||||||
*/
|
*/
|
||||||
var _postProcessFile = Zotero.Promise.coroutine(function* (item, file, contentType) {
|
var _postProcessFile = async function (item) {
|
||||||
// Don't try to process if MIME type is unknown
|
return Zotero.Fulltext.indexItems([item.id]);
|
||||||
if (!contentType) {
|
};
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Items with content types that get cached by the fulltext indexer can just be indexed,
|
|
||||||
// since a charset isn't necessary
|
|
||||||
if (Zotero.Fulltext.isCachedMIMEType(contentType)) {
|
|
||||||
return Zotero.Fulltext.indexItems([item.id]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Ignore non-text types
|
|
||||||
var ext = Zotero.File.getExtension(file);
|
|
||||||
if (!Zotero.MIME.hasInternalHandler(contentType, ext) || !Zotero.MIME.isTextType(contentType)) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// If the charset is already set, index item directly
|
|
||||||
if (item.attachmentCharset) {
|
|
||||||
return Zotero.Fulltext.indexItems([item.id]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Otherwise, load in a hidden browser to get the charset, and then index the document
|
|
||||||
return new Zotero.Promise(function (resolve, reject) {
|
|
||||||
var browser = Zotero.Browser.createHiddenBrowser(
|
|
||||||
null,
|
|
||||||
// Disable JavaScript, since it can cause imports that include HTML files to hang
|
|
||||||
// (from network requests that fail?)
|
|
||||||
{ allowJavaScript: false }
|
|
||||||
);
|
|
||||||
|
|
||||||
var pageshown = false;
|
|
||||||
|
|
||||||
if (item.attachmentCharset) {
|
|
||||||
var onpageshow = async function () {
|
|
||||||
// ignore spurious about:blank loads
|
|
||||||
if(browser.contentDocument.location.href == "about:blank") return;
|
|
||||||
|
|
||||||
pageshown = true;
|
|
||||||
|
|
||||||
browser.removeEventListener("pageshow", onpageshow, false);
|
|
||||||
|
|
||||||
try {
|
|
||||||
await Zotero.Fulltext.indexDocument(browser.contentDocument, itemID);
|
|
||||||
resolve();
|
|
||||||
}
|
|
||||||
catch (e) {
|
|
||||||
reject(e);
|
|
||||||
}
|
|
||||||
finally {
|
|
||||||
Zotero.Browser.deleteHiddenBrowser(browser);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
browser.addEventListener("pageshow", onpageshow, false);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
let callback = async function (charset, args) {
|
|
||||||
// ignore spurious about:blank loads
|
|
||||||
if(browser.contentDocument.location.href == "about:blank") return;
|
|
||||||
|
|
||||||
pageshown = true;
|
|
||||||
|
|
||||||
try {
|
|
||||||
if (charset) {
|
|
||||||
charset = Zotero.CharacterSets.toCanonical(charset);
|
|
||||||
if (charset) {
|
|
||||||
item.attachmentCharset = charset;
|
|
||||||
await item.saveTx({
|
|
||||||
skipNotifier: true
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
await Zotero.Fulltext.indexDocument(browser.contentDocument, item.id);
|
|
||||||
resolve();
|
|
||||||
}
|
|
||||||
catch (e) {
|
|
||||||
reject(e);
|
|
||||||
}
|
|
||||||
finally {
|
|
||||||
Zotero.Browser.deleteHiddenBrowser(browser);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
Zotero.File.addCharsetListener(browser, callback, item.id);
|
|
||||||
}
|
|
||||||
|
|
||||||
var url = Components.classes["@mozilla.org/network/protocol;1?name=file"]
|
|
||||||
.getService(Components.interfaces.nsIFileProtocolHandler)
|
|
||||||
.getURLSpecFromFile(file);
|
|
||||||
browser.loadURI(url);
|
|
||||||
|
|
||||||
// Avoid a hang if a pageshow is never called on the hidden browser (which can happen
|
|
||||||
// if a .pdf file is really HTML, which can also result in the file being launched,
|
|
||||||
// which we should try to fix)
|
|
||||||
setTimeout(function () {
|
|
||||||
if (!pageshown) {
|
|
||||||
reject(new Error("pageshow not called in hidden browser"));
|
|
||||||
}
|
|
||||||
}, 5000);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Determines if a given document is an instance of PDFJS
|
* Determines if a given document is an instance of PDFJS
|
||||||
|
|
|
@ -24,8 +24,6 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
Zotero.Fulltext = Zotero.FullText = new function(){
|
Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
this.isCachedMIMEType = isCachedMIMEType;
|
|
||||||
|
|
||||||
this.__defineGetter__("pdfConverterCacheFile", function () { return '.zotero-ft-cache'; });
|
this.__defineGetter__("pdfConverterCacheFile", function () { return '.zotero-ft-cache'; });
|
||||||
this.__defineGetter__("pdfInfoCacheFile", function () { return '.zotero-ft-info'; });
|
this.__defineGetter__("pdfInfoCacheFile", function () { return '.zotero-ft-info'; });
|
||||||
|
|
||||||
|
@ -67,9 +65,6 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
yield Zotero.DB.queryAsync("ATTACH ':memory:' AS 'indexing'");
|
yield Zotero.DB.queryAsync("ATTACH ':memory:' AS 'indexing'");
|
||||||
yield Zotero.DB.queryAsync('CREATE TABLE indexing.fulltextWords (word NOT NULL)');
|
yield Zotero.DB.queryAsync('CREATE TABLE indexing.fulltextWords (word NOT NULL)');
|
||||||
|
|
||||||
this.unicodeConverter = Cc["@mozilla.org/intl/scriptableunicodeconverter"]
|
|
||||||
.createInstance(Ci.nsIScriptableUnicodeConverter);
|
|
||||||
|
|
||||||
let pdfConverterFileName = "pdftotext";
|
let pdfConverterFileName = "pdftotext";
|
||||||
let pdfInfoFileName = "pdfinfo";
|
let pdfInfoFileName = "pdfinfo";
|
||||||
|
|
||||||
|
@ -78,7 +73,8 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
pdfInfoFileName += '.exe';
|
pdfInfoFileName += '.exe';
|
||||||
}
|
}
|
||||||
|
|
||||||
let dir = FileUtils.getDir('AChrom', []).parent;
|
// 'AChrom' is app/chrome, so the parent of its parent is used as the base directory
|
||||||
|
let dir = FileUtils.getDir('AChrom', []).parent.parent;
|
||||||
|
|
||||||
_pdfData = dir.clone();
|
_pdfData = dir.clone();
|
||||||
_pdfData.append('poppler-data');
|
_pdfData.append('poppler-data');
|
||||||
|
@ -222,13 +218,14 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
* Returns true if MIME type is converted to text and cached before indexing
|
* Returns true if MIME type is converted to text and cached before indexing
|
||||||
* (e.g. application/pdf is run through pdftotext)
|
* (e.g. application/pdf is run through pdftotext)
|
||||||
*/
|
*/
|
||||||
function isCachedMIMEType(mimeType) {
|
this.isCachedMIMEType = function (mimeType) {
|
||||||
switch (mimeType) {
|
switch (mimeType) {
|
||||||
case 'application/pdf':
|
case 'application/pdf':
|
||||||
|
case 'text/html':
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
};
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -274,8 +271,12 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
/**
|
/**
|
||||||
* @return {Promise}
|
* @return {Promise}
|
||||||
*/
|
*/
|
||||||
var indexString = Zotero.Promise.coroutine(function* (text, charset, itemID, stats, version, synced) {
|
var indexString = Zotero.Promise.coroutine(function* (text, itemID, stats, version, synced) {
|
||||||
var words = this.semanticSplitter(text, charset);
|
if (itemID != parseInt(itemID)) {
|
||||||
|
throw new Error("itemID not provided");
|
||||||
|
}
|
||||||
|
|
||||||
|
var words = this.semanticSplitter(text);
|
||||||
|
|
||||||
while (Zotero.DB.inTransaction()) {
|
while (Zotero.DB.inTransaction()) {
|
||||||
yield Zotero.DB.waitForTransaction('indexString()');
|
yield Zotero.DB.waitForTransaction('indexString()');
|
||||||
|
@ -334,9 +335,12 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
if (!maxLength) {
|
if (!maxLength) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
var obj = yield convertItemHTMLToText(itemID, document.body.innerHTML, maxLength);
|
var text = document.documentElement.innerText;
|
||||||
var text = obj.text;
|
var totalChars = text.length;
|
||||||
var totalChars = obj.totalChars;
|
var item = Zotero.Items.get(itemID);
|
||||||
|
if (document.contentType == 'text/html') {
|
||||||
|
yield writeCacheFile(item, text, maxLength);
|
||||||
|
}
|
||||||
|
|
||||||
if (totalChars > maxLength) {
|
if (totalChars > maxLength) {
|
||||||
Zotero.debug('Only indexing first ' + maxLength + ' characters of item '
|
Zotero.debug('Only indexing first ' + maxLength + ' characters of item '
|
||||||
|
@ -345,82 +349,15 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
|
|
||||||
yield indexString(
|
yield indexString(
|
||||||
text,
|
text,
|
||||||
document.characterSet,
|
|
||||||
itemID,
|
itemID,
|
||||||
{ indexedChars: text.length, totalChars }
|
{ indexedChars: text.length, totalChars }
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @param {String} path
|
|
||||||
* @param {Boolean} [complete=FALSE] Index the file in its entirety, ignoring maxLength
|
|
||||||
*/
|
|
||||||
var indexFile = Zotero.Promise.coroutine(function* (path, contentType, charset, itemID, complete, stats) {
|
|
||||||
if (!(yield OS.File.exists(path))) {
|
|
||||||
Zotero.debug('File not found in indexFile()', 2);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!contentType) {
|
|
||||||
Zotero.debug("Content type not provided in indexFile()", 1);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!itemID) {
|
|
||||||
throw new Error('Item ID not provided');
|
|
||||||
}
|
|
||||||
|
|
||||||
if (contentType == 'application/pdf') {
|
|
||||||
return this.indexPDF(path, itemID, complete);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!Zotero.MIME.isTextType(contentType)) {
|
|
||||||
Zotero.debug('File is not text in indexFile()', 2);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!charset) {
|
|
||||||
Zotero.logError(`Item ${itemID} didn't have a charset`);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
var maxLength = Zotero.Prefs.get('fulltext.textMaxLength');
|
|
||||||
if (!maxLength) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (complete) {
|
|
||||||
maxLength = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
Zotero.debug('Indexing file ' + path);
|
|
||||||
var text = yield Zotero.File.getContentsAsync(path, charset);
|
|
||||||
var totalChars = text.length;
|
|
||||||
if (contentType == 'text/html') {
|
|
||||||
let obj = yield convertItemHTMLToText(itemID, text, maxLength);
|
|
||||||
text = obj.text;
|
|
||||||
totalChars = obj.totalChars;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
if (maxLength && text.length > maxLength) {
|
|
||||||
text = text.substr(0, maxLength);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Record the number of characters indexed (unless we're indexing a (PDF) cache file,
|
|
||||||
// in which case the stats are coming from elsewhere)
|
|
||||||
if (!stats) {
|
|
||||||
stats = { indexedChars: text.length, totalChars: totalChars };
|
|
||||||
}
|
|
||||||
yield indexString(text, charset, itemID, stats);
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}.bind(this));
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Run PDF through pdfinfo and pdftotext to generate .zotero-ft-info
|
* Run PDF through pdfinfo and pdftotext to generate .zotero-ft-info
|
||||||
* and .zotero-ft-cache, and pass the text file back to indexFile()
|
* and .zotero-ft-cache, and pass the text file to indexString()
|
||||||
*
|
*
|
||||||
* @param {nsIFile} file
|
* @param {nsIFile} file
|
||||||
* @param {Number} itemID
|
* @param {Number} itemID
|
||||||
|
@ -494,14 +431,9 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
yield indexFile(
|
var text = Zotero.File.getContentsAsync(cacheFilePath);
|
||||||
cacheFilePath,
|
var stats = { indexedPages, totalPages };
|
||||||
'text/plain',
|
yield indexString(text, itemID, stats);
|
||||||
'utf-8',
|
|
||||||
itemID,
|
|
||||||
true,
|
|
||||||
{ indexedPages, totalPages }
|
|
||||||
);
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
});
|
});
|
||||||
|
@ -554,7 +486,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
await indexFile(path, item.attachmentContentType, item.attachmentCharset, itemID, complete);
|
await indexItem(item, path, complete);
|
||||||
}
|
}
|
||||||
catch (e) {
|
catch (e) {
|
||||||
if (ignoreErrors) {
|
if (ignoreErrors) {
|
||||||
|
@ -568,6 +500,87 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Index the full text of an attachment item's file
 *
 * PDFs are handed off to indexPDF(). Plain-text files with a known charset are read directly
 * from disk. Any other text type is loaded via getPageData(), which supplies the body text and
 * the detected character set; for text/html the text is also written to the item's cache file.
 * If the item had no charset and its library is editable, the detected charset (canonicalized)
 * is saved to the item.
 *
 * @param {Zotero.Item} item - Attachment item
 * @param {String} path - Path of the attachment file
 * @param {Boolean} [complete=false] - Don't truncate the text to fulltext.textMaxLength
 *     (indexing is still skipped entirely if that pref is 0)
 * @return {Promise<Boolean>} - False if the file was skipped; for PDFs, the result of
 *     indexPDF(); otherwise true once the text has been passed to indexString()
 */
var indexItem = async function (item, path, complete) {
	if (!await OS.File.exists(path)) {
		Zotero.debug(`${path} does not exist in indexItem()`, 2);
		return false;
	}
	
	var contentType = item.attachmentContentType;
	// The property is `attachmentCharset` (as used below when checking and saving it); reading
	// a differently named property would always yield undefined and force every file through
	// the hidden browser
	var charset = item.attachmentCharset;
	
	if (!contentType) {
		Zotero.debug("No content type in indexItem()", 2);
		return false;
	}
	
	if (contentType == 'application/pdf') {
		return this.indexPDF(path, item.id, complete);
	}
	
	if (!Zotero.MIME.isTextType(contentType)) {
		Zotero.debug('File is not text in indexItem()', 2);
		return false;
	}
	
	var maxLength = Zotero.Prefs.get('fulltext.textMaxLength');
	if (!maxLength) {
		Zotero.debug('fulltext.textMaxLength is 0 -- skipping indexing');
		return false;
	}
	
	Zotero.debug('Indexing file ' + path);
	
	var text;
	
	// If it's a plain-text file and we know the charset, just get the contents
	if (contentType == 'text/plain' && charset) {
		text = await Zotero.File.getContentsAsync(path, charset);
	}
	// Otherwise load it in a hidden browser
	else {
		let pageData = await getPageData(path);
		text = pageData.bodyText;
		if (!charset) {
			charset = pageData.characterSet;
		}
		if (contentType == 'text/html') {
			await writeCacheFile(item, text, maxLength, complete);
		}
		
		// If the item didn't have a charset assigned and the library is editable, update it now
		if (charset && !item.attachmentCharset && item.library.editable) {
			let canonical = Zotero.CharacterSets.toCanonical(charset);
			let msg = `Character set is ${canonical}`;
			if (charset != canonical) {
				msg += ` (detected: ${charset})`;
				charset = canonical;
			}
			Zotero.debug(msg);
			
			// toCanonical() may not have produced a usable value, so check again before saving
			if (charset) {
				item.attachmentCharset = charset;
				await item.saveTx({
					skipNotifier: true
				});
			}
		}
		
		if (!charset) {
			Zotero.logError(`Could not detect character set for ${item.libraryKey} -- skipping indexing`);
			return false;
		}
	}
	
	// Record the full length before truncating so the stats reflect how much was left out
	var totalChars = text.length;
	if (!complete) {
		text = text.substring(0, maxLength);
	}
	var stats = { indexedChars: text.length, totalChars };
	await indexString(text, item.id, stats);
	
	return true;
}.bind(this);
|
||||||
|
|
||||||
|
|
||||||
// TEMP: Temporary mechanism to serialize indexing of new attachments
|
// TEMP: Temporary mechanism to serialize indexing of new attachments
|
||||||
//
|
//
|
||||||
// This should instead save the itemID to a table that's read by the content processor
|
// This should instead save the itemID to a table that's read by the content processor
|
||||||
|
@ -640,7 +653,7 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
let item = yield Zotero.Items.getAsync(itemID);
|
let item = yield Zotero.Items.getAsync(itemID);
|
||||||
let libraryKey = item.libraryKey;
|
let libraryKey = item.libraryKey;
|
||||||
let contentType = item.attachmentContentType;
|
let contentType = item.attachmentContentType;
|
||||||
if (contentType && (isCachedMIMEType(contentType) || Zotero.MIME.isTextType(contentType))) {
|
if (contentType && (this.isCachedMIMEType(contentType) || Zotero.MIME.isTextType(contentType))) {
|
||||||
try {
|
try {
|
||||||
let cacheFile = this.getItemCacheFile(item).path;
|
let cacheFile = this.getItemCacheFile(item).path;
|
||||||
if (yield OS.File.exists(cacheFile)) {
|
if (yield OS.File.exists(cacheFile)) {
|
||||||
|
@ -649,8 +662,8 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
content = yield Zotero.File.getContentsAsync(cacheFile);
|
content = yield Zotero.File.getContentsAsync(cacheFile);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
// If there should be a cache file and isn't, mark the full text as missing
|
// If a cache file is required, mark the full text as missing
|
||||||
if (!Zotero.MIME.isTextType(contentType)) {
|
if (this.isCachedMIMEType(contentType)) {
|
||||||
Zotero.debug("Full-text content cache file doesn't exist for item "
|
Zotero.debug("Full-text content cache file doesn't exist for item "
|
||||||
+ libraryKey, 2);
|
+ libraryKey, 2);
|
||||||
let sql = "UPDATE fulltextItems SET synced=? WHERE itemID=?";
|
let sql = "UPDATE fulltextItems SET synced=? WHERE itemID=?";
|
||||||
|
@ -671,21 +684,8 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
Zotero.debug("Getting full-text content from file for item " + libraryKey);
|
Zotero.debug("Getting full-text content from file for item " + libraryKey);
|
||||||
content = yield Zotero.File.getContentsAsync(path, item.attachmentCharset);
|
content = yield Zotero.File.getContentsAsync(path, item.attachmentCharset);
|
||||||
|
|
||||||
// If HTML, convert to plain text first, and cache the result
|
// Include only as many characters as we've indexed
|
||||||
if (item.attachmentContentType == 'text/html') {
|
content = content.substr(0, row.indexedChars);
|
||||||
let obj = yield convertItemHTMLToText(
|
|
||||||
itemID,
|
|
||||||
content,
|
|
||||||
// Include in the cache file only as many characters as we
|
|
||||||
// indexed previously
|
|
||||||
row.indexedChars
|
|
||||||
);
|
|
||||||
content = obj.text;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
// Include only as many characters as we've indexed
|
|
||||||
content = content.substr(0, row.indexedChars);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (e) {
|
catch (e) {
|
||||||
|
@ -982,7 +982,6 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
|
|
||||||
yield indexString(
|
yield indexString(
|
||||||
data.text,
|
data.text,
|
||||||
"UTF-8",
|
|
||||||
itemID,
|
itemID,
|
||||||
{
|
{
|
||||||
indexedChars: data.indexedChars,
|
indexedChars: data.indexedChars,
|
||||||
|
@ -1104,9 +1103,12 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
let maxLength = Zotero.Prefs.get('fulltext.textMaxLength');
|
let maxLength = Zotero.Prefs.get('fulltext.textMaxLength');
|
||||||
let binaryMode = mode && mode.indexOf('Binary') != -1;
|
let binaryMode = mode && mode.indexOf('Binary') != -1;
|
||||||
|
|
||||||
if (isCachedMIMEType(mimeType)) {
|
if (this.isCachedMIMEType(mimeType)) {
|
||||||
let file = this.getItemCacheFile(item).path;
|
let file = this.getItemCacheFile(item).path;
|
||||||
if (!(yield OS.File.exists(file))) {
|
if (!(yield OS.File.exists(file))) {
|
||||||
|
Zotero.debug("No cache file at " + file, 2);
|
||||||
|
// TODO: Index on-demand?
|
||||||
|
// What about a cleared full-text index?
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1122,33 +1124,13 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check for a cache file
|
let path = yield item.getFilePathAsync();
|
||||||
let cacheFile = this.getItemCacheFile(item).path;
|
if (!path) {
|
||||||
if (yield OS.File.exists(cacheFile)) {
|
continue;
|
||||||
Zotero.debug("Searching for text '" + searchText + "' in " + cacheFile);
|
|
||||||
content = yield Zotero.File.getContentsAsync(cacheFile, 'utf-8', maxLength);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
// If that doesn't exist, check for the actual file
|
|
||||||
let path = yield item.getFilePathAsync();
|
|
||||||
if (!path) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
Zotero.debug("Searching for text '" + searchText + "' in " + path);
|
|
||||||
content = yield Zotero.File.getContentsAsync(path, item.attachmentCharset);
|
|
||||||
|
|
||||||
// If HTML and not binary mode, convert to text
|
|
||||||
if (mimeType == 'text/html' && !binaryMode) {
|
|
||||||
// Include in the cache file only as many characters as we've indexed
|
|
||||||
let chars = yield getChars(itemID);
|
|
||||||
|
|
||||||
let obj = yield convertItemHTMLToText(
|
|
||||||
itemID, content, chars ? chars.indexedChars : null
|
|
||||||
);
|
|
||||||
content = obj.text;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Zotero.debug("Searching for text '" + searchText + "' in " + path);
|
||||||
|
content = yield Zotero.File.getContentsAsync(path, item.attachmentCharset, maxLength);
|
||||||
}
|
}
|
||||||
|
|
||||||
let match = findTextInString(content, searchText, mode);
|
let match = findTextInString(content, searchText, mode);
|
||||||
|
@ -1608,58 +1590,46 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Load a file in a hidden browser and retrieve its character set and body text
 *
 * The hidden browser, if one was created, is destroyed whether or not retrieval succeeds.
 *
 * @param {String} path - Path of the file to load
 * @return {Promise<Object>} - Object with 'characterSet' and 'bodyText' properties
 */
async function getPageData(path) {
	const { HiddenBrowser } = ChromeUtils.import("chrome://zotero/content/HiddenBrowser.jsm");
	let hiddenBrowser;
	try {
		hiddenBrowser = await HiddenBrowser.create(Zotero.File.pathToFileURI(path));
		// Hand back only the two requested properties
		let { characterSet, bodyText } = await HiddenBrowser.getPageData(
			hiddenBrowser,
			['characterSet', 'bodyText']
		);
		return { characterSet, bodyText };
	}
	finally {
		if (hiddenBrowser) {
			HiddenBrowser.destroy(hiddenBrowser);
		}
	}
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Convert HTML to text for an item and cache the result
|
* Write the converted text to a cache file
|
||||||
*
|
|
||||||
* @return {Promise}
|
|
||||||
*/
|
*/
|
||||||
var convertItemHTMLToText = Zotero.Promise.coroutine(function* (itemID, html, maxLength) {
|
var writeCacheFile = async function (item, text, maxLength, complete) {
|
||||||
// Split elements to avoid word concatenation
|
if (!complete) {
|
||||||
html = html.replace(/>/g, '> ');
|
|
||||||
|
|
||||||
var text = HTMLToText(html);
|
|
||||||
var totalChars = text.length;
|
|
||||||
|
|
||||||
if (maxLength) {
|
|
||||||
text = text.substr(0, maxLength);
|
text = text.substr(0, maxLength);
|
||||||
}
|
}
|
||||||
|
var cacheFile = this.getItemCacheFile(item).path;
|
||||||
// Write the converted text to a cache file
|
Zotero.debug("Writing converted full-text content to " + cacheFile);
|
||||||
var item = yield Zotero.Items.getAsync(itemID);
|
if (!await OS.File.exists(OS.Path.dirname(cacheFile))) {
|
||||||
var cacheFile = Zotero.Fulltext.getItemCacheFile(item).path;
|
await Zotero.Attachments.createDirectoryForItem(item);
|
||||||
Zotero.debug("Writing converted full-text HTML content to " + cacheFile);
|
|
||||||
if (!(yield OS.File.exists(OS.Path.dirname(cacheFile)))) {
|
|
||||||
yield Zotero.Attachments.createDirectoryForItem(item);
|
|
||||||
}
|
}
|
||||||
yield Zotero.File.putContentsAsync(cacheFile, text)
|
|
||||||
.catch(function (e) {
|
|
||||||
Zotero.debug(e, 1);
|
|
||||||
Components.utils.reportError(e);
|
|
||||||
});
|
|
||||||
|
|
||||||
return {
|
|
||||||
text: text,
|
|
||||||
totalChars: totalChars
|
|
||||||
};
|
|
||||||
});
|
|
||||||
|
|
||||||
function HTMLToText(html) {
|
|
||||||
var nsIFC = Components.classes['@mozilla.org/widget/htmlformatconverter;1']
|
|
||||||
.createInstance(Components.interfaces.nsIFormatConverter);
|
|
||||||
var from = Components.classes['@mozilla.org/supports-string;1']
|
|
||||||
.createInstance(Components.interfaces.nsISupportsString);
|
|
||||||
from.data = html;
|
|
||||||
var to = { value: null };
|
|
||||||
try {
|
try {
|
||||||
nsIFC.convert('text/html', from, from.toString().length, 'text/unicode', to, {});
|
await Zotero.File.putContentsAsync(cacheFile, text);
|
||||||
to = to.value.QueryInterface(Components.interfaces.nsISupportsString);
|
|
||||||
return to.toString();
|
|
||||||
}
|
}
|
||||||
catch(e) {
|
catch (e) {
|
||||||
Zotero.debug(e, 1);
|
Zotero.logError(e);
|
||||||
return html;
|
|
||||||
}
|
}
|
||||||
}
|
}.bind(this);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1673,16 +1643,6 @@ Zotero.Fulltext = Zotero.FullText = new function(){
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
|
||||||
if (charset && charset != 'utf-8') {
|
|
||||||
this.converter.charset = charset;
|
|
||||||
text = this.converter.ConvertToUnicode(text);
|
|
||||||
}
|
|
||||||
} catch (err) {
|
|
||||||
Zotero.debug("Error converting from charset " + charset, 1);
|
|
||||||
Zotero.debug(err, 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
var words = {};
|
var words = {};
|
||||||
var word = '';
|
var word = '';
|
||||||
var cclass = null;
|
var cclass = null;
|
||||||
|
|
|
@ -259,8 +259,7 @@ describe("Zotero.Attachments", function() {
|
||||||
assert.propertyVal(matches[0], 'id', attachment.id);
|
assert.propertyVal(matches[0], 'id', attachment.id);
|
||||||
});
|
});
|
||||||
|
|
||||||
// This isn't particularly the behavior we want, but it documents the expected behavior
|
it("should index JavaScript-created text in an HTML file", async function () {
|
||||||
it("shouldn't index JavaScript-created text in an HTML file when the charset isn't known in advance", async function () {
|
|
||||||
var item = await createDataObject('item');
|
var item = await createDataObject('item');
|
||||||
var file = getTestDataDirectory();
|
var file = getTestDataDirectory();
|
||||||
file.append('test-js.html');
|
file.append('test-js.html');
|
||||||
|
@ -275,7 +274,8 @@ describe("Zotero.Attachments", function() {
|
||||||
assert.equal(attachment.attachmentCharset, 'utf-8');
|
assert.equal(attachment.attachmentCharset, 'utf-8');
|
||||||
|
|
||||||
var matches = await Zotero.Fulltext.findTextInItems([attachment.id], 'test');
|
var matches = await Zotero.Fulltext.findTextInItems([attachment.id], 'test');
|
||||||
assert.lengthOf(matches, 0);
|
assert.lengthOf(matches, 1);
|
||||||
|
assert.propertyVal(matches[0], 'id', attachment.id);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue