Handle multibyte characters in Zotero.File.truncateFileName()

Filesystems care about byte length, not character length, so treat maxLength as the byte length limit and truncate accordingly. This will also now remove entire emoji characters without corrupting them.
2021-05-20 19:22:09 -04:00 · 2021-05-20 19:22:09 -04:00 · bec42fe2a5
commit bec42fe2a5
parent b511e452a8
2 changed files with 105 additions and 18 deletions
--- a/chrome/content/zotero/xpcom/file.js
+++ b/chrome/content/zotero/xpcom/file.js
@ -1202,41 +1202,60 @@ Zotero.File = new function(){
 	}
 	
 	/**
-	 * Truncate a filename (excluding the extension) to the given total length
-	 * If the "extension" is longer than 20 characters,
-	 * it is treated as part of the file name
+	 * Truncate a filename (excluding the extension) to the given byte length
+	 *
+	 * If the extension is longer than 20 characters, it's treated as part of the file name.
+	 * @param {String} fileName - Returned as-is if falsy or already within the limit
+	 * @param {Number} maxLength - Maximum length in bytes
+	 * @return {String} The truncated filename; '_' replaces a name that was removed entirely
 	 */
 	function truncateFileName(fileName, maxLength) {
-		if(!fileName || (fileName + '').length <= maxLength) return fileName;
+		if (!fileName || Zotero.Utilities.Internal.byteLength(fileName + '') <= maxLength) {
+			return fileName;
+		}

-		var parts = (fileName + '').split(/\.(?=[^\.]+$)/);
-		var fn = parts[0];
+		var parts = (fileName + '').split(/\.(?=[^.]+$)/);
+		var name = parts[0];
 		var ext = parts[1];
 		//if the file starts with a period , use the whole file
 		//the whole file name might also just be a period
-		if(!fn) {
-			fn = '.' + (ext || '');
+		if (!name) {
+			name = '.' + (ext || '');
 		}

 		//treat long extensions as part of the file name
-		if(ext && ext.length > 20) {
-			fn += '.' + ext;
+		if (ext && ext.length > 20) {
+			name += '.' + ext;
 			ext = undefined;
 		}
-
-		if(ext === undefined) {	//there was no period in the whole file name
+		
+		// No period in the whole filename
+		if (ext === undefined) {
 			ext = '';
-		} else {
+		}
+		else {
 			ext = '.' + ext;
 		}

-		if (ext.length >= maxLength) {
-			// Improve resulting truncated filename by dropping extension if it wouldn't fit within
-			// the limit. e.g. for (lorem.json, 5) it returns "lorem", instead of ".json"
+		// Drop extension if it wouldn't fit within the limit
+		// E.g., for (lorem.json, 5), return "lorem" instead of ".json"
+		if (Zotero.Utilities.Internal.byteLength(ext) >= maxLength) {
 			ext = '';
 		}
-
-		return fn.substr(0,maxLength-ext.length) + ext;
+		
+		while (name && Zotero.Utilities.Internal.byteLength(name + ext) > maxLength) {
+			// Drop one whole character (code point) per pass so we don't corrupt emoji characters
+			// (though removing part of a multi-part emoji sequence may change how it renders)
+			let parts = [...name];
+			name = name.substring(0, name.length - parts[parts.length - 1].length);
+		}
+		
+		// If removed completely, use underscore
+		if (name == '') {
+			name = '_';
+		}
+		
+		return name + ext;
 	}
 	
 	/*
--- a/test/tests/fileTest.js
+++ b/test/tests/fileTest.js
@ -340,6 +340,74 @@ describe("Zotero.File", function () {
 	});
 	
 	
+	describe("#truncateFileName()", function () {
+		it("should drop extension if longer than limit", function () {
+			var filename = "lorem.json";
+			var shortened = Zotero.File.truncateFileName(filename, 5);
+			assert.equal(shortened, "lorem");
+		});
+		
+		it("should use byte length rather than character length", function () {
+			var filename = "\uD83E\uDD92abcdefgh.pdf";
+			var shortened = Zotero.File.truncateFileName(filename, 10); // 4 (emoji) + 2 ('ab') + 4 ('.pdf')
+			assert.equal(shortened, "\uD83E\uDD92ab.pdf");
+		});
+		
+		it("should remove characters, not bytes", function () {
+			// Emoji would put length over limit, so it should be removed completely
+			var filename = "abcé\uD83E\uDD92.pdf";
+			var shortened = Zotero.File.truncateFileName(filename, 10);
+			assert.equal(shortened, "abcé.pdf");
+		});
+		
+		it("should replace single multi-byte character with underscore if longer than maxLength", function () {
+			// Emoji would put length over limit, so it should be replaced with _
+			var filename = "\uD83E\uDD92.pdf";
+			var shortened = Zotero.File.truncateFileName(filename, 5); // '.pdf' takes 4 bytes, leaving 1 for the name
+			assert.equal(shortened, "_.pdf");
+		});
+		
+		// The optimal behavior would probably be to remove the entire character sequence, but I'm
+		// not sure we can do that without an emoji library, so just make sure we're removing whole
+		// characters without corrupting anything.
+		it("should cruelly break apart families", function () {
+			var family = [
+				"\uD83D\uDC69", // woman (4)
+				"\uD83C\uDFFE", // skin tone (4)
+				"\u200D", // zero-width joiner (3)
+				"\uD83D\uDC68", // man (4)
+				"\uD83C\uDFFE", // skin tone (4)
+				"\u200D", // zero-width joiner (3)
+				"\uD83D\uDC67", // girl (4)
+				"\uD83C\uDFFE", // skin tone (4)
+				"\u200D", // zero-width joiner (3)
+				"\uD83D\uDC66", // boy (4)
+				"\uD83C\uDFFE" // skin tone (4)
+			].join("");
+			
+			var filename = "abc" + family + ".pdf";
+			var limit = 3 // 'abc'
+				+ 4 + 4 + 3 // woman + skin tone + zero-width joiner
+				+ 4 + 4 + 3 // man + skin tone + zero-width joiner
+				+ 4; // ext ('.pdf')
+			// Add 2 spare bytes: too few for the next 4-byte emoji (girl), so it has to be dropped
+			// whole rather than cut in the middle
+			limit += 2;
+			var shortened = Zotero.File.truncateFileName(filename, limit);
+			assert.equal(
+				shortened,
+				"abc"
+					+ "\uD83D\uDC69"
+					+ "\uD83C\uDFFE"
+					+ "\u200D"
+					+ "\uD83D\uDC68"
+					+ "\uD83C\uDFFE"
+					+ "\u200D"
+					+ ".pdf"
+			);
+		});
+	});
+	
+	
 	describe("#checkFileAccessError()", function () {
 		it("should catch OS.File access-denied errors", function* () {
 			// We can't modify a real OS.File.Error, but we also don't do an instanceof check in