From f7216298b40a64ae0c72408154bed80c9a876c85 Mon Sep 17 00:00:00 2001 From: Dan Stillman Date: Fri, 12 Jun 2015 02:20:07 -0400 Subject: [PATCH] Update character set handling Restore prepopulated charset table, but this time with just the encodings from the WHATWG Encoding Standard. Assigning a charset to Zotero.Item::attachmentCharset runs the value through Zotero.CharacterSets.toCanonical() automatically. This migrates attachment charsets to the new canonical values, clearing any that are unsupported. Other legacy mappings could still be added back, as discussed in #760. --- chrome/content/zotero/xpcom/attachments.js | 15 +++--- .../content/zotero/xpcom/data/cachedTypes.js | 19 ------- chrome/content/zotero/xpcom/data/item.js | 5 +- chrome/content/zotero/xpcom/data/items.js | 3 -- chrome/content/zotero/xpcom/file.js | 5 +- chrome/content/zotero/xpcom/schema.js | 9 +++- resource/schema/system.sql | 51 +++++++++++++++++-- resource/schema/userdata.sql | 6 --- test/tests/data/charsets/gbk.txt | 1 + .../{invalidChar.txt => charsets/invalid.txt} | 0 .../data/{utf8Char.txt => charsets/utf8.txt} | 0 test/tests/data/charsets/windows1252.txt | 1 + test/tests/fileTest.js | 22 +++++++- 13 files changed, 91 insertions(+), 46 deletions(-) create mode 100644 test/tests/data/charsets/gbk.txt rename test/tests/data/{invalidChar.txt => charsets/invalid.txt} (100%) rename test/tests/data/{utf8Char.txt => charsets/utf8.txt} (100%) create mode 100644 test/tests/data/charsets/windows1252.txt diff --git a/chrome/content/zotero/xpcom/attachments.js b/chrome/content/zotero/xpcom/attachments.js index e456a81b6c..8b26233176 100644 --- a/chrome/content/zotero/xpcom/attachments.js +++ b/chrome/content/zotero/xpcom/attachments.js @@ -1347,16 +1347,13 @@ Zotero.Attachments = new function(){ .then(function () { return Zotero.spawn(function* () { if (charset) { - var disabled = Zotero.Notifier.disable(); - - var item = yield Zotero.Items.getAsync(itemID); - if (yield 
Zotero.CharacterSets.add(charset)) { + charset = Zotero.CharacterSets.toCanonical(charset); + if (charset) { + let item = yield Zotero.Items.getAsync(itemID); item.attachmentCharset = charset; - } - yield item.saveTx(); - - if (disabled) { - Zotero.Notifier.enable(); + yield item.saveTx({ + skipNotifier: true + }); } } diff --git a/chrome/content/zotero/xpcom/data/cachedTypes.js b/chrome/content/zotero/xpcom/data/cachedTypes.js index 76ddfed44a..efbc48c5d7 100644 --- a/chrome/content/zotero/xpcom/data/cachedTypes.js +++ b/chrome/content/zotero/xpcom/data/cachedTypes.js @@ -538,27 +538,8 @@ Zotero.CharacterSets = new function() { this._nameCol = 'charset'; this._table = 'charsets'; this._ignoreCase = true; - this._allowAdd = true; - this._valueCheck = function (name) { - // Don't allow too-long or non-ASCII names - if (name.length > 50 || !name.match(/^[a-z0-9\-_]+$/)) { - return false; - } - return true; - } - - - /** - * @return {Promise} - */ - this.purge = function () { - var sql = "DELETE FROM " + this._table + " WHERE " + this._idCol + " NOT IN " - + "(SELECT " + this._idCol + " FROM itemAttachments)"; - return Zotero.DB.queryAsync(sql); - }; - // Converts charset label to charset name // https://encoding.spec.whatwg.org/#names-and-labels // @param {String} charset diff --git a/chrome/content/zotero/xpcom/data/item.js b/chrome/content/zotero/xpcom/data/item.js index 39dd0fd146..fcb8856af6 100644 --- a/chrome/content/zotero/xpcom/data/item.js +++ b/chrome/content/zotero/xpcom/data/item.js @@ -1442,7 +1442,7 @@ Zotero.Item.prototype._saveData = Zotero.Promise.coroutine(function* (env) { let linkMode = this.attachmentLinkMode; let contentType = this.attachmentContentType; let charsetID = this.attachmentCharset - ? (yield Zotero.CharacterSets.add(this.attachmentCharset)) + ? 
Zotero.CharacterSets.getID(this.attachmentCharset) : null; let path = this.attachmentPath; let syncState = this.attachmentSyncState; @@ -2652,6 +2652,9 @@ Zotero.defineProperty(Zotero.Item.prototype, 'attachmentCharset', { } oldVal = this.attachmentCharset; + if (val) { + val = Zotero.CharacterSets.toCanonical(val); + } if (!val) { val = ""; } diff --git a/chrome/content/zotero/xpcom/data/items.js b/chrome/content/zotero/xpcom/data/items.js index d7d56d1950..d003aac4fb 100644 --- a/chrome/content/zotero/xpcom/data/items.js +++ b/chrome/content/zotero/xpcom/data/items.js @@ -634,9 +634,6 @@ Zotero.Items = function() { + "(SELECT valueID FROM itemData)"; yield Zotero.DB.queryAsync(sql); - // Purge unused charsetIDs (if attachments were deleted) - yield Zotero.CharacterSets.purge(); - Zotero.Prefs.set('purge.items', false) }); diff --git a/chrome/content/zotero/xpcom/file.js b/chrome/content/zotero/xpcom/file.js index 06e45b5ebc..f99a40510a 100644 --- a/chrome/content/zotero/xpcom/file.js +++ b/chrome/content/zotero/xpcom/file.js @@ -160,7 +160,10 @@ Zotero.File = new function(){ throw new Error("File is not an nsIInputStream or nsIFile"); } - charset = charset ? Zotero.CharacterSets.getName(charset) : "UTF-8"; + if (charset) { + charset = Zotero.CharacterSets.toLabel(charset, true) + } + charset = charset || "UTF-8"; var blockSize = maxLength ? 
Math.min(maxLength, 524288) : 524288; diff --git a/chrome/content/zotero/xpcom/schema.js b/chrome/content/zotero/xpcom/schema.js index 1bc589955b..83cfac6c8e 100644 --- a/chrome/content/zotero/xpcom/schema.js +++ b/chrome/content/zotero/xpcom/schema.js @@ -118,6 +118,8 @@ Zotero.Schema = new function(){ } var updated = yield Zotero.DB.executeTransaction(function* (conn) { + yield Zotero.DB.queryAsync("PRAGMA defer_foreign_keys = true"); + var updated = yield _updateSchema('system'); // Update custom tables if they exist so that changes are in @@ -2145,9 +2147,13 @@ Zotero.Schema = new function(){ yield Zotero.DB.queryAsync("INSERT OR IGNORE INTO itemNotes SELECT * FROM itemNotesOld"); yield Zotero.DB.queryAsync("CREATE INDEX itemNotes_parentItemID ON itemNotes(parentItemID)"); + yield Zotero.DB.queryAsync("CREATE TEMPORARY TABLE charsetsOld (charsetID INT, charset UNIQUE, canonical, PRIMARY KEY (charsetID))"); + yield Zotero.DB.queryAsync("INSERT INTO charsetsOld VALUES (1,'utf-8','utf-8'), (2,'ascii','windows-1252'), (3,'windows-1250','windows-1250'), (4,'windows-1251','windows-1251'), (5,'windows-1252','windows-1252'), (6,'windows-1253','windows-1253'), (7,'windows-1254','windows-1254'), (8,'windows-1257','windows-1257'), (9,'us',NULL), (10,'us-ascii','windows-1252'), (11,'utf-7',NULL), (12,'iso8859-1','windows-1252'), (13,'iso8859-15','iso-8859-15'), (14,'iso_646.irv:1991',NULL), (15,'iso_8859-1','windows-1252'), (16,'iso_8859-1:1987','windows-1252'), (17,'iso_8859-2','iso-8859-2'), (18,'iso_8859-2:1987','iso-8859-2'), (19,'iso_8859-4','iso-8859-4'), (20,'iso_8859-4:1988','iso-8859-4'), (21,'iso_8859-5','iso-8859-5'), (22,'iso_8859-5:1988','iso-8859-5'), (23,'iso_8859-7','iso-8859-7'), (24,'iso_8859-7:1987','iso-8859-7'), (25,'iso-8859-1','windows-1252'), (26,'iso-8859-1-windows-3.0-latin-1',NULL), (27,'iso-8859-1-windows-3.1-latin-1',NULL), (28,'iso-8859-15','iso-8859-15'), (29,'iso-8859-2','iso-8859-2'), (30,'iso-8859-2-windows-latin-2',NULL), 
(31,'iso-8859-3','iso-8859-3'), (32,'iso-8859-4','iso-8859-4'), (33,'iso-8859-5','iso-8859-5'), (34,'iso-8859-5-windows-latin-5',NULL), (35,'iso-8859-6','iso-8859-6'), (36,'iso-8859-7','iso-8859-7'), (37,'iso-8859-8','iso-8859-8'), (38,'iso-8859-9','windows-1254'), (39,'l1','windows-1252'), (40,'l2','iso-8859-2'), (41,'l4','iso-8859-4'), (42,'latin1','windows-1252'), (43,'latin2','iso-8859-2'), (44,'latin4','iso-8859-4'), (45,'x-mac-ce',NULL), (46,'x-mac-cyrillic','x-mac-cyrillic'), (47,'x-mac-greek',NULL), (48,'x-mac-roman','macintosh'), (49,'x-mac-turkish',NULL), (50,'adobe-symbol-encoding',NULL), (51,'ansi_x3.4-1968','windows-1252'), (52,'ansi_x3.4-1986',NULL), (53,'big5','big5'), (54,'chinese','gbk'), (55,'cn-big5','big5'), (56,'cn-gb',NULL), (57,'cn-gb-isoir165',NULL), (58,'cp367',NULL), (59,'cp819','windows-1252'), (60,'cp850',NULL), (61,'cp852',NULL), (62,'cp855',NULL), (63,'cp857',NULL), (64,'cp862',NULL), (65,'cp864',NULL), (66,'cp866','ibm866'), (67,'csascii',NULL), (68,'csbig5','big5'), (69,'cseuckr','euc-kr'), (70,'cseucpkdfmtjapanese','euc-jp'), (71,'csgb2312','gbk'), (72,'cshalfwidthkatakana',NULL), (73,'cshppsmath',NULL), (74,'csiso103t618bit',NULL), (75,'csiso159jisx02121990',NULL), (76,'csiso2022jp','iso-2022-jp'), (77,'csiso2022jp2',NULL), (78,'csiso2022kr','replacement'), (79,'csiso58gb231280','gbk'), (80,'csisolatin4','iso-8859-4'), (81,'csisolatincyrillic','iso-8859-5'), (82,'csisolatingreek','iso-8859-7'), (83,'cskoi8r','koi8-r'), (84,'csksc56011987','euc-kr'), (85,'csshiftjis','shift_jis'), (86,'csunicode11',NULL), (87,'csunicode11utf7',NULL), (88,'csunicodeascii',NULL), (89,'csunicodelatin1',NULL), (90,'cswindows31latin5',NULL), (91,'cyrillic','iso-8859-5'), (92,'ecma-118','iso-8859-7'), (93,'elot_928','iso-8859-7'), (94,'euc-jp','euc-jp'), (95,'euc-kr','euc-kr'), (96,'extended_unix_code_packed_format_for_japanese',NULL), (97,'gb2312','gbk'), (98,'gb_2312-80','gbk'), (99,'greek','iso-8859-7'), (100,'greek8','iso-8859-7'), 
(101,'hz-gb-2312','replacement'), (102,'ibm367',NULL), (103,'ibm819','windows-1252'), (104,'ibm850',NULL), (105,'ibm852',NULL), (106,'ibm855',NULL), (107,'ibm857',NULL), (108,'ibm862',NULL), (109,'ibm864',NULL), (110,'ibm866','ibm866'), (111,'iso-10646',NULL), (112,'iso-10646-j-1',NULL), (113,'iso-10646-ucs-2',NULL), (114,'iso-10646-ucs-4',NULL), (115,'iso-10646-ucs-basic',NULL), (116,'iso-10646-unicode-latin1',NULL), (117,'iso-2022-jp','iso-2022-jp'), (118,'iso-2022-jp-2',NULL), (119,'iso-2022-kr','replacement'), (120,'iso-ir-100','windows-1252'), (121,'iso-ir-101','iso-8859-2'), (122,'iso-ir-103',NULL), (123,'iso-ir-110','iso-8859-4'), (124,'iso-ir-126','iso-8859-7'), (125,'iso-ir-144','iso-8859-5'), (126,'iso-ir-149','euc-kr'), (127,'iso-ir-159',NULL), (128,'iso-ir-58','gbk'), (129,'iso-ir-6',NULL), (130,'iso646-us',NULL), (131,'jis_x0201',NULL), (132,'jis_x0208-1983',NULL), (133,'jis_x0212-1990',NULL), (134,'koi8-r','koi8-r'), (135,'korean','euc-kr'), (136,'ks_c_5601',NULL), (137,'ks_c_5601-1987','euc-kr'), (138,'ks_c_5601-1989','euc-kr'), (139,'ksc5601','euc-kr'), (140,'ksc_5601','euc-kr'), (141,'ms_kanji','shift_jis'), (142,'shift_jis','shift_jis'), (143,'t.61',NULL), (144,'t.61-8bit',NULL), (145,'unicode-1-1-utf-7',NULL), (146,'unicode-1-1-utf-8','utf-8'), (147,'unicode-2-0-utf-7',NULL), (148,'windows-31j','shift_jis'), (149,'x-cns11643-1',NULL), (150,'x-cns11643-1110',NULL), (151,'x-cns11643-2',NULL), (152,'x-cp1250','windows-1250'), (153,'x-cp1251','windows-1251'), (154,'x-cp1253','windows-1253'), (155,'x-dectech',NULL), (156,'x-dingbats',NULL), (157,'x-euc-jp','euc-jp'), (158,'x-euc-tw',NULL), (159,'x-gb2312-11',NULL), (160,'x-imap4-modified-utf7',NULL), (161,'x-jisx0208-11',NULL), (162,'x-ksc5601-11',NULL), (163,'x-sjis','shift_jis'), (164,'x-tis620',NULL), (165,'x-unicode-2-0-utf-7',NULL), (166,'x-x-big5','big5'), (167,'x0201',NULL), (168,'x0212',NULL)"); + yield Zotero.DB.queryAsync("CREATE INDEX charsetsOld_canonical ON charsetsOld(canonical)"); + 
yield Zotero.DB.queryAsync("ALTER TABLE itemAttachments RENAME TO itemAttachmentsOld"); yield Zotero.DB.queryAsync("CREATE TABLE itemAttachments (\n itemID INTEGER PRIMARY KEY,\n parentItemID INT,\n linkMode INT,\n contentType TEXT,\n charsetID INT,\n path TEXT,\n syncState INT DEFAULT 0,\n storageModTime INT,\n storageHash TEXT,\n FOREIGN KEY (itemID) REFERENCES items(itemID) ON DELETE CASCADE,\n FOREIGN KEY (parentItemID) REFERENCES items(itemID) ON DELETE CASCADE,\n FOREIGN KEY (charsetID) REFERENCES charsets(charsetID) ON DELETE SET NULL\n)"); - yield Zotero.DB.queryAsync("INSERT OR IGNORE INTO itemAttachments SELECT itemID, sourceItemID, linkMode, mimeType, charsetID, path, syncState, storageModTime, storageHash FROM itemAttachmentsOld"); + yield Zotero.DB.queryAsync("INSERT OR IGNORE INTO itemAttachments SELECT itemID, sourceItemID, linkMode, mimeType, C.charsetID, path, syncState, storageModTime, storageHash FROM itemAttachmentsOld IA LEFT JOIN charsetsOld CO ON (IA.charsetID=CO.charsetID) LEFT JOIN charsets C ON (CO.canonical=C.charset)"); yield Zotero.DB.queryAsync("CREATE INDEX itemAttachments_parentItemID ON itemAttachments(parentItemID)"); yield Zotero.DB.queryAsync("CREATE INDEX itemAttachments_charsetID ON itemAttachments(charsetID)"); yield Zotero.DB.queryAsync("CREATE INDEX itemAttachments_contentType ON itemAttachments(contentType)"); @@ -2236,6 +2242,7 @@ Zotero.Schema = new function(){ yield Zotero.DB.queryAsync("DROP TABLE annotationsOld"); yield Zotero.DB.queryAsync("DROP TABLE collectionItemsOld"); + yield Zotero.DB.queryAsync("DROP TABLE charsetsOld"); yield Zotero.DB.queryAsync("DROP TABLE customBaseFieldMappingsOld"); yield Zotero.DB.queryAsync("DROP TABLE deletedItemsOld"); yield Zotero.DB.queryAsync("DROP TABLE fulltextItemWordsOld"); diff --git a/resource/schema/system.sql b/resource/schema/system.sql index 9205788260..c074712b7d 100644 --- a/resource/schema/system.sql +++ b/resource/schema/system.sql @@ -23,8 +23,6 @@ -- This file 
creates system tables that can be safely wiped and reinitialized -- at any time, as long as existing ids are preserved. -PRAGMA defer_foreign_keys = true; - -- Valid item types ("book," "journalArticle," etc.) DROP TABLE IF EXISTS itemTypes; CREATE TABLE itemTypes ( @@ -126,6 +124,13 @@ CREATE TABLE baseFieldMappingsCombined ( CREATE INDEX baseFieldMappingsCombined_baseFieldID ON baseFieldMappingsCombined(baseFieldID); CREATE INDEX baseFieldMappingsCombined_fieldID ON baseFieldMappingsCombined(fieldID); +DROP TABLE IF EXISTS charsets; +CREATE TABLE charsets ( + charsetID INTEGER PRIMARY KEY, + charset TEXT UNIQUE +); +CREATE INDEX charsets_charset ON charsets(charset); + DROP TABLE IF EXISTS fileTypes; CREATE TABLE fileTypes ( fileTypeID INTEGER PRIMARY KEY, @@ -1132,6 +1137,46 @@ INSERT INTO itemTypeCreatorTypes VALUES(36,3,0); INSERT INTO itemTypeCreatorTypes VALUES(36,4,0); INSERT INTO itemTypeCreatorTypes VALUES(36,5,0); +INSERT INTO "charsets" VALUES (1, "utf-8"); +INSERT INTO "charsets" VALUES (2, "big5"); +INSERT INTO "charsets" VALUES (3, "euc-jp"); +INSERT INTO "charsets" VALUES (4, "euc-kr"); +INSERT INTO "charsets" VALUES (5, "gb18030"); +INSERT INTO "charsets" VALUES (6, "gbk"); +INSERT INTO "charsets" VALUES (7, "ibm866"); +INSERT INTO "charsets" VALUES (8, "iso-2022-jp"); +INSERT INTO "charsets" VALUES (9, "iso-8859-2"); +INSERT INTO "charsets" VALUES (10, "iso-8859-3"); +INSERT INTO "charsets" VALUES (11, "iso-8859-4"); +INSERT INTO "charsets" VALUES (12, "iso-8859-5"); +INSERT INTO "charsets" VALUES (13, "iso-8859-6"); +INSERT INTO "charsets" VALUES (14, "iso-8859-7"); +INSERT INTO "charsets" VALUES (15, "iso-8859-8"); +INSERT INTO "charsets" VALUES (16, "iso-8859-8-i"); +INSERT INTO "charsets" VALUES (17, "iso-8859-10"); +INSERT INTO "charsets" VALUES (18, "iso-8859-13"); +INSERT INTO "charsets" VALUES (19, "iso-8859-14"); +INSERT INTO "charsets" VALUES (20, "iso-8859-15"); +INSERT INTO "charsets" VALUES (21, "iso-8859-16"); +INSERT INTO "charsets" 
VALUES (22, "koi8-r"); +INSERT INTO "charsets" VALUES (23, "koi8-u"); +INSERT INTO "charsets" VALUES (24, "macintosh"); +INSERT INTO "charsets" VALUES (25, "replacement"); +INSERT INTO "charsets" VALUES (26, "shift_jis"); +INSERT INTO "charsets" VALUES (27, "utf-16be"); +INSERT INTO "charsets" VALUES (28, "utf-16le"); +INSERT INTO "charsets" VALUES (29, "windows-874"); +INSERT INTO "charsets" VALUES (30, "windows-1250"); +INSERT INTO "charsets" VALUES (31, "windows-1251"); +INSERT INTO "charsets" VALUES (32, "windows-1252"); +INSERT INTO "charsets" VALUES (33, "windows-1253"); +INSERT INTO "charsets" VALUES (34, "windows-1254"); +INSERT INTO "charsets" VALUES (35, "windows-1255"); +INSERT INTO "charsets" VALUES (36, "windows-1256"); +INSERT INTO "charsets" VALUES (37, "windows-1257"); +INSERT INTO "charsets" VALUES (38, "windows-1258"); +INSERT INTO "charsets" VALUES (39, "x-mac-cyrillic"); +INSERT INTO "charsets" VALUES (40, "x-user-defined"); INSERT INTO "fileTypes" VALUES(1, 'webpage'); INSERT INTO "fileTypes" VALUES(2, 'image'); @@ -1186,5 +1231,3 @@ INSERT INTO "syncObjectTypes" VALUES(4, 'search'); INSERT INTO "syncObjectTypes" VALUES(5, 'tag'); INSERT INTO "syncObjectTypes" VALUES(6, 'relation'); INSERT INTO "syncObjectTypes" VALUES(7, 'setting'); - -PRAGMA defer_foreign_keys = false; diff --git a/resource/schema/userdata.sql b/resource/schema/userdata.sql index e1e58b6307..d70659ae0a 100644 --- a/resource/schema/userdata.sql +++ b/resource/schema/userdata.sql @@ -49,12 +49,6 @@ CREATE TABLE syncedSettings ( FOREIGN KEY (libraryID) REFERENCES libraries(libraryID) ON DELETE CASCADE ); -CREATE TABLE charsets ( - charsetID INTEGER PRIMARY KEY, - charset TEXT UNIQUE -); -CREATE INDEX charsets_charset ON charsets(charset); - -- Primary data applicable to all items CREATE TABLE items ( itemID INTEGER PRIMARY KEY, diff --git a/test/tests/data/charsets/gbk.txt b/test/tests/data/charsets/gbk.txt new file mode 100644 index 0000000000..0656406b1c --- /dev/null +++ 
b/test/tests/data/charsets/gbk.txt @@ -0,0 +1 @@ +@ \ No newline at end of file diff --git a/test/tests/data/invalidChar.txt b/test/tests/data/charsets/invalid.txt similarity index 100% rename from test/tests/data/invalidChar.txt rename to test/tests/data/charsets/invalid.txt diff --git a/test/tests/data/utf8Char.txt b/test/tests/data/charsets/utf8.txt similarity index 100% rename from test/tests/data/utf8Char.txt rename to test/tests/data/charsets/utf8.txt diff --git a/test/tests/data/charsets/windows1252.txt b/test/tests/data/charsets/windows1252.txt new file mode 100644 index 0000000000..92a39f398b --- /dev/null +++ b/test/tests/data/charsets/windows1252.txt @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/test/tests/fileTest.js b/test/tests/fileTest.js index 4d2951cac4..8ead4b3422 100644 --- a/test/tests/fileTest.js +++ b/test/tests/fileTest.js @@ -7,15 +7,33 @@ describe("Zotero.File", function () { it("should handle an extended character", function* () { var contents = yield Zotero.File.getContentsAsync( - OS.Path.join(getTestDataDirectory().path, "utf8Char.txt") + OS.Path.join(getTestDataDirectory().path, "charsets", "utf8.txt") ); assert.lengthOf(contents, 3); assert.equal(contents, "A\u72acB"); }) + it("should handle an extended Windows-1252 character", function* () { + var contents = yield Zotero.File.getContentsAsync( + OS.Path.join(getTestDataDirectory().path, "charsets", "windows1252.txt"), + "windows-1252" + ); + assert.lengthOf(contents, 1); + assert.equal(contents, "\u00E9"); + }) + + it("should handle a GBK character", function* () { + var contents = yield Zotero.File.getContentsAsync( + OS.Path.join(getTestDataDirectory().path, "charsets", "gbk.txt"), + "gbk" + ); + assert.lengthOf(contents, 1); + assert.equal(contents, "\u4e02"); + }) + it("should handle an invalid character", function* () { var contents = yield Zotero.File.getContentsAsync( - OS.Path.join(getTestDataDirectory().path, "invalidChar.txt") + 
OS.Path.join(getTestDataDirectory().path, "charsets", "invalid.txt") ); assert.lengthOf(contents, 3); assert.equal(contents, "A\uFFFDB");