Handle multibyte characters in Zotero.File.truncateFileName()

Filesystems care about byte length, not character length, so treat
maxLength as the byte length limit and truncate accordingly.

Truncation also now removes whole characters, so an emoji is either kept
intact or dropped entirely rather than being cut through the middle and
corrupted.
This commit is contained in:
Dan Stillman 2021-05-20 19:22:09 -04:00
parent b511e452a8
commit bec42fe2a5
2 changed files with 105 additions and 18 deletions

View file

@ -1202,41 +1202,60 @@ Zotero.File = new function(){
}
/**
 * Truncate a filename (excluding the extension) to the given byte length
 *
 * Filesystems limit names by byte length rather than character length, so every
 * measurement here goes through Zotero.Utilities.Internal.byteLength(). Whole characters
 * (code points) are removed from the end of the name until the result fits, so a
 * surrogate pair is never cut in half. Multi-part emoji sequences may still be broken
 * between their parts.
 *
 * If the extension is longer than 20 characters, it's treated as part of the file name.
 * If the extension on its own wouldn't fit within the limit, it is dropped.
 * If the name is removed completely, "_" is used in its place.
 *
 * @param {String} fileName
 * @param {Number} maxLength - Maximum length in bytes
 * @return {String} - The truncated filename, or fileName itself, untouched, if it is
 *     empty or already fits within maxLength
 */
function truncateFileName(fileName, maxLength) {
	const byteLength = str => Zotero.Utilities.Internal.byteLength(str);
	
	if (!fileName) {
		return fileName;
	}
	const fullName = fileName + '';
	// byteLength() returns a number, so compare it directly. Checking `.length` of the
	// result would always be undefined and this early return would never be taken.
	if (byteLength(fullName) <= maxLength) {
		return fileName;
	}
	
	// Split on the last period that still has at least one character after it
	let [name, ext] = fullName.split(/\.(?=[^.]+$)/);
	
	if (!name) {
		// The filename starts with a period (or is just a period), so the whole thing is
		// the name and there is no separate extension
		name = '.' + (ext || '');
		ext = undefined;
	}
	else if (ext && ext.length > 20) {
		// Treat long extensions as part of the file name
		name += '.' + ext;
		ext = undefined;
	}
	
	// No (usable) extension means nothing is appended after the name
	ext = ext === undefined ? '' : '.' + ext;
	
	// Drop extension if it wouldn't fit within the limit
	// E.g., for (lorem.json, 5), return "lorem" instead of ".json"
	if (byteLength(ext) >= maxLength) {
		ext = '';
	}
	
	// Split into characters once, so we don't corrupt emoji characters (though we might
	// change multi-part emoji in unfortunate ways by removing some of the characters).
	// The emptiness check keeps a nonsensical (e.g., negative) limit from throwing.
	const chars = [...name];
	while (chars.length && byteLength(chars.join('') + ext) > maxLength) {
		chars.pop();
	}
	
	// If removed completely, use underscore
	name = chars.join('') || '_';
	
	return name + ext;
}
/*

View file

@ -340,6 +340,74 @@ describe("Zotero.File", function () {
});
describe("#truncateFileName()", function () {
	// Look the function up at call time so each case exercises the current implementation
	const truncate = (name, maxBytes) => Zotero.File.truncateFileName(name, maxBytes);
	
	// A single emoji stored as a surrogate pair; counted as 4 bytes in the limits below
	const GIRAFFE = "\uD83E\uDD92";
	
	it("should drop extension if longer than limit", function () {
		assert.equal(truncate("lorem.json", 5), "lorem");
	});
	
	it("should use byte length rather than character length", function () {
		const result = truncate(GIRAFFE + "abcdefgh.pdf", 10);
		assert.equal(result, GIRAFFE + "ab.pdf");
	});
	
	it("should remove characters, not bytes", function () {
		// The emoji pushes the name over the limit, so all of it has to go
		const result = truncate("abcé" + GIRAFFE + ".pdf", 10);
		assert.equal(result, "abcé.pdf");
	});
	
	it("should replace single multi-byte character with underscore if longer than maxLength", function () {
		// Nothing of the name fits next to the extension, so "_" stands in for it
		const result = truncate(GIRAFFE + ".pdf", 5);
		assert.equal(result, "_.pdf");
	});
	
	// Ideally the whole emoji sequence would be removed as a unit, but that probably needs
	// an emoji library. For now, just verify that only whole characters are removed and
	// that nothing that remains is corrupted.
	it("should cruelly break apart families", function () {
		const woman = "\uD83D\uDC69"; // 4 bytes
		const man = "\uD83D\uDC68"; // 4 bytes
		const girl = "\uD83D\uDC67"; // 4 bytes
		const boy = "\uD83D\uDC66"; // 4 bytes
		const skinTone = "\uD83C\uDFFE"; // 4 bytes
		const joiner = "\u200D"; // zero-width joiner, 3 bytes
		
		const prefix = "abc";
		const extension = ".pdf";
		// The part of the family expected to survive truncation...
		const kept = woman + skinTone + joiner + man + skinTone + joiner;
		// ...and the part expected to be removed
		const removed = girl + skinTone + joiner + boy + skinTone;
		
		const keptBytes = 3 // 'abc'
			+ (4 + 4 + 3) // woman, skin tone, joiner
			+ (4 + 4 + 3) // man, skin tone, joiner
			+ 4; // ext
		// Leave 2 spare bytes, which isn't enough room for the next emoji character, to
		// make sure it's dropped whole rather than corrupted
		const limit = keptBytes + 2;
		
		const result = truncate(prefix + kept + removed + extension, limit);
		assert.equal(result, prefix + kept + extension);
	});
});
describe("#checkFileAccessError()", function () {
it("should catch OS.File access-denied errors", function* () {
// We can't modify a real OS.File.Error, but we also don't do an instanceof check in