Update character set handling

Restore prepopulated charset table, but this time with just the
encodings from the WHATWG Encoding Standard. Assigning a charset to
Zotero.Item::attachmentCharset runs the value through
Zotero.CharacterSets.toCanonical() automatically.

This migrates attachment charsets to the new canonical values, clearing any
that are unsupported.

Other legacy mappings could still be added back, as discussed in #760.
This commit is contained in:
Dan Stillman 2015-06-12 02:20:07 -04:00
parent 4bc5479b19
commit f7216298b4
13 changed files with 91 additions and 46 deletions

View file

@ -1347,16 +1347,13 @@ Zotero.Attachments = new function(){
.then(function () {
return Zotero.spawn(function* () {
if (charset) {
var disabled = Zotero.Notifier.disable();
var item = yield Zotero.Items.getAsync(itemID);
if (yield Zotero.CharacterSets.add(charset)) {
charset = Zotero.CharacterSets.toCanonical(charset);
if (charset) {
let item = yield Zotero.Items.getAsync(itemID);
item.attachmentCharset = charset;
}
yield item.saveTx();
if (disabled) {
Zotero.Notifier.enable();
yield item.saveTx({
skipNotifier: true
});
}
}

View file

@ -538,27 +538,8 @@ Zotero.CharacterSets = new function() {
this._nameCol = 'charset';
this._table = 'charsets';
this._ignoreCase = true;
this._allowAdd = true;
this._valueCheck = function (name) {
// Don't allow too-long or non-ASCII names
if (name.length > 50 || !name.match(/^[a-z0-9\-_]+$/)) {
return false;
}
return true;
}
/**
* @return {Promise}
*/
this.purge = function () {
var sql = "DELETE FROM " + this._table + " WHERE " + this._idCol + " NOT IN "
+ "(SELECT " + this._idCol + " FROM itemAttachments)";
return Zotero.DB.queryAsync(sql);
};
// Converts charset label to charset name
// https://encoding.spec.whatwg.org/#names-and-labels
// @param {String} charset

View file

@ -1442,7 +1442,7 @@ Zotero.Item.prototype._saveData = Zotero.Promise.coroutine(function* (env) {
let linkMode = this.attachmentLinkMode;
let contentType = this.attachmentContentType;
let charsetID = this.attachmentCharset
? (yield Zotero.CharacterSets.add(this.attachmentCharset))
? Zotero.CharacterSets.getID(this.attachmentCharset)
: null;
let path = this.attachmentPath;
let syncState = this.attachmentSyncState;
@ -2652,6 +2652,9 @@ Zotero.defineProperty(Zotero.Item.prototype, 'attachmentCharset', {
}
oldVal = this.attachmentCharset;
if (val) {
val = Zotero.CharacterSets.toCanonical(val);
}
if (!val) {
val = "";
}

View file

@ -634,9 +634,6 @@ Zotero.Items = function() {
+ "(SELECT valueID FROM itemData)";
yield Zotero.DB.queryAsync(sql);
// Purge unused charsetIDs (if attachments were deleted)
yield Zotero.CharacterSets.purge();
Zotero.Prefs.set('purge.items', false)
});

View file

@ -160,7 +160,10 @@ Zotero.File = new function(){
throw new Error("File is not an nsIInputStream or nsIFile");
}
charset = charset ? Zotero.CharacterSets.getName(charset) : "UTF-8";
if (charset) {
charset = Zotero.CharacterSets.toLabel(charset, true)
}
charset = charset || "UTF-8";
var blockSize = maxLength ? Math.min(maxLength, 524288) : 524288;

View file

@ -118,6 +118,8 @@ Zotero.Schema = new function(){
}
var updated = yield Zotero.DB.executeTransaction(function* (conn) {
yield Zotero.DB.queryAsync("PRAGMA defer_foreign_keys = true");
var updated = yield _updateSchema('system');
// Update custom tables if they exist so that changes are in
@ -2145,9 +2147,13 @@ Zotero.Schema = new function(){
yield Zotero.DB.queryAsync("INSERT OR IGNORE INTO itemNotes SELECT * FROM itemNotesOld");
yield Zotero.DB.queryAsync("CREATE INDEX itemNotes_parentItemID ON itemNotes(parentItemID)");
yield Zotero.DB.queryAsync("CREATE TEMPORARY TABLE charsetsOld (charsetID INT, charset UNIQUE, canonical, PRIMARY KEY (charsetID))");
yield Zotero.DB.queryAsync("INSERT INTO charsetsOld VALUES (1,'utf-8','utf-8'), (2,'ascii','windows-1252'), (3,'windows-1250','windows-1250'), (4,'windows-1251','windows-1251'), (5,'windows-1252','windows-1252'), (6,'windows-1253','windows-1253'), (7,'windows-1254','windows-1254'), (8,'windows-1257','windows-1257'), (9,'us',NULL), (10,'us-ascii','windows-1252'), (11,'utf-7',NULL), (12,'iso8859-1','windows-1252'), (13,'iso8859-15','iso-8859-15'), (14,'iso_646.irv:1991',NULL), (15,'iso_8859-1','windows-1252'), (16,'iso_8859-1:1987','windows-1252'), (17,'iso_8859-2','iso-8859-2'), (18,'iso_8859-2:1987','iso-8859-2'), (19,'iso_8859-4','iso-8859-4'), (20,'iso_8859-4:1988','iso-8859-4'), (21,'iso_8859-5','iso-8859-5'), (22,'iso_8859-5:1988','iso-8859-5'), (23,'iso_8859-7','iso-8859-7'), (24,'iso_8859-7:1987','iso-8859-7'), (25,'iso-8859-1','windows-1252'), (26,'iso-8859-1-windows-3.0-latin-1',NULL), (27,'iso-8859-1-windows-3.1-latin-1',NULL), (28,'iso-8859-15','iso-8859-15'), (29,'iso-8859-2','iso-8859-2'), (30,'iso-8859-2-windows-latin-2',NULL), (31,'iso-8859-3','iso-8859-3'), (32,'iso-8859-4','iso-8859-4'), (33,'iso-8859-5','iso-8859-5'), (34,'iso-8859-5-windows-latin-5',NULL), (35,'iso-8859-6','iso-8859-6'), (36,'iso-8859-7','iso-8859-7'), (37,'iso-8859-8','iso-8859-8'), (38,'iso-8859-9','windows-1254'), (39,'l1','windows-1252'), (40,'l2','iso-8859-2'), (41,'l4','iso-8859-4'), (42,'latin1','windows-1252'), (43,'latin2','iso-8859-2'), (44,'latin4','iso-8859-4'), (45,'x-mac-ce',NULL), (46,'x-mac-cyrillic','x-mac-cyrillic'), (47,'x-mac-greek',NULL), (48,'x-mac-roman','macintosh'), (49,'x-mac-turkish',NULL), (50,'adobe-symbol-encoding',NULL), (51,'ansi_x3.4-1968','windows-1252'), (52,'ansi_x3.4-1986',NULL), (53,'big5','big5'), (54,'chinese','gbk'), (55,'cn-big5','big5'), (56,'cn-gb',NULL), (57,'cn-gb-isoir165',NULL), (58,'cp367',NULL), (59,'cp819','windows-1252'), (60,'cp850',NULL), (61,'cp852',NULL), (62,'cp855',NULL), (63,'cp857',NULL), (64,'cp862',NULL), 
(65,'cp864',NULL), (66,'cp866','ibm866'), (67,'csascii',NULL), (68,'csbig5','big5'), (69,'cseuckr','euc-kr'), (70,'cseucpkdfmtjapanese','euc-jp'), (71,'csgb2312','gbk'), (72,'cshalfwidthkatakana',NULL), (73,'cshppsmath',NULL), (74,'csiso103t618bit',NULL), (75,'csiso159jisx02121990',NULL), (76,'csiso2022jp','iso-2022-jp'), (77,'csiso2022jp2',NULL), (78,'csiso2022kr','replacement'), (79,'csiso58gb231280','gbk'), (80,'csisolatin4','iso-8859-4'), (81,'csisolatincyrillic','iso-8859-5'), (82,'csisolatingreek','iso-8859-7'), (83,'cskoi8r','koi8-r'), (84,'csksc56011987','euc-kr'), (85,'csshiftjis','shift_jis'), (86,'csunicode11',NULL), (87,'csunicode11utf7',NULL), (88,'csunicodeascii',NULL), (89,'csunicodelatin1',NULL), (90,'cswindows31latin5',NULL), (91,'cyrillic','iso-8859-5'), (92,'ecma-118','iso-8859-7'), (93,'elot_928','iso-8859-7'), (94,'euc-jp','euc-jp'), (95,'euc-kr','euc-kr'), (96,'extended_unix_code_packed_format_for_japanese',NULL), (97,'gb2312','gbk'), (98,'gb_2312-80','gbk'), (99,'greek','iso-8859-7'), (100,'greek8','iso-8859-7'), (101,'hz-gb-2312','replacement'), (102,'ibm367',NULL), (103,'ibm819','windows-1252'), (104,'ibm850',NULL), (105,'ibm852',NULL), (106,'ibm855',NULL), (107,'ibm857',NULL), (108,'ibm862',NULL), (109,'ibm864',NULL), (110,'ibm866','ibm866'), (111,'iso-10646',NULL), (112,'iso-10646-j-1',NULL), (113,'iso-10646-ucs-2',NULL), (114,'iso-10646-ucs-4',NULL), (115,'iso-10646-ucs-basic',NULL), (116,'iso-10646-unicode-latin1',NULL), (117,'iso-2022-jp','iso-2022-jp'), (118,'iso-2022-jp-2',NULL), (119,'iso-2022-kr','replacement'), (120,'iso-ir-100','windows-1252'), (121,'iso-ir-101','iso-8859-2'), (122,'iso-ir-103',NULL), (123,'iso-ir-110','iso-8859-4'), (124,'iso-ir-126','iso-8859-7'), (125,'iso-ir-144','iso-8859-5'), (126,'iso-ir-149','euc-kr'), (127,'iso-ir-159',NULL), (128,'iso-ir-58','gbk'), (129,'iso-ir-6',NULL), (130,'iso646-us',NULL), (131,'jis_x0201',NULL), (132,'jis_x0208-1983',NULL), (133,'jis_x0212-1990',NULL), (134,'koi8-r','koi8-r'), 
(135,'korean','euc-kr'), (136,'ks_c_5601',NULL), (137,'ks_c_5601-1987','euc-kr'), (138,'ks_c_5601-1989','euc-kr'), (139,'ksc5601','euc-kr'), (140,'ksc_5601','euc-kr'), (141,'ms_kanji','shift_jis'), (142,'shift_jis','shift_jis'), (143,'t.61',NULL), (144,'t.61-8bit',NULL), (145,'unicode-1-1-utf-7',NULL), (146,'unicode-1-1-utf-8','utf-8'), (147,'unicode-2-0-utf-7',NULL), (148,'windows-31j','shift_jis'), (149,'x-cns11643-1',NULL), (150,'x-cns11643-1110',NULL), (151,'x-cns11643-2',NULL), (152,'x-cp1250','windows-1250'), (153,'x-cp1251','windows-1251'), (154,'x-cp1253','windows-1253'), (155,'x-dectech',NULL), (156,'x-dingbats',NULL), (157,'x-euc-jp','euc-jp'), (158,'x-euc-tw',NULL), (159,'x-gb2312-11',NULL), (160,'x-imap4-modified-utf7',NULL), (161,'x-jisx0208-11',NULL), (162,'x-ksc5601-11',NULL), (163,'x-sjis','shift_jis'), (164,'x-tis620',NULL), (165,'x-unicode-2-0-utf-7',NULL), (166,'x-x-big5','big5'), (167,'x0201',NULL), (168,'x0212',NULL)");
yield Zotero.DB.queryAsync("CREATE INDEX charsetsOld_canonical ON charsetsOld(canonical)");
yield Zotero.DB.queryAsync("ALTER TABLE itemAttachments RENAME TO itemAttachmentsOld");
yield Zotero.DB.queryAsync("CREATE TABLE itemAttachments (\n itemID INTEGER PRIMARY KEY,\n parentItemID INT,\n linkMode INT,\n contentType TEXT,\n charsetID INT,\n path TEXT,\n syncState INT DEFAULT 0,\n storageModTime INT,\n storageHash TEXT,\n FOREIGN KEY (itemID) REFERENCES items(itemID) ON DELETE CASCADE,\n FOREIGN KEY (parentItemID) REFERENCES items(itemID) ON DELETE CASCADE,\n FOREIGN KEY (charsetID) REFERENCES charsets(charsetID) ON DELETE SET NULL\n)");
yield Zotero.DB.queryAsync("INSERT OR IGNORE INTO itemAttachments SELECT itemID, sourceItemID, linkMode, mimeType, charsetID, path, syncState, storageModTime, storageHash FROM itemAttachmentsOld");
yield Zotero.DB.queryAsync("INSERT OR IGNORE INTO itemAttachments SELECT itemID, sourceItemID, linkMode, mimeType, C.charsetID, path, syncState, storageModTime, storageHash FROM itemAttachmentsOld IA LEFT JOIN charsetsOld CO ON (IA.charsetID=CO.charsetID) LEFT JOIN charsets C ON (CO.canonical=C.charset)");
yield Zotero.DB.queryAsync("CREATE INDEX itemAttachments_parentItemID ON itemAttachments(parentItemID)");
yield Zotero.DB.queryAsync("CREATE INDEX itemAttachments_charsetID ON itemAttachments(charsetID)");
yield Zotero.DB.queryAsync("CREATE INDEX itemAttachments_contentType ON itemAttachments(contentType)");
@ -2236,6 +2242,7 @@ Zotero.Schema = new function(){
yield Zotero.DB.queryAsync("DROP TABLE annotationsOld");
yield Zotero.DB.queryAsync("DROP TABLE collectionItemsOld");
yield Zotero.DB.queryAsync("DROP TABLE charsetsOld");
yield Zotero.DB.queryAsync("DROP TABLE customBaseFieldMappingsOld");
yield Zotero.DB.queryAsync("DROP TABLE deletedItemsOld");
yield Zotero.DB.queryAsync("DROP TABLE fulltextItemWordsOld");

View file

@ -23,8 +23,6 @@
-- This file creates system tables that can be safely wiped and reinitialized
-- at any time, as long as existing ids are preserved.
PRAGMA defer_foreign_keys = true;
-- Valid item types ("book," "journalArticle," etc.)
DROP TABLE IF EXISTS itemTypes;
CREATE TABLE itemTypes (
@ -126,6 +124,13 @@ CREATE TABLE baseFieldMappingsCombined (
CREATE INDEX baseFieldMappingsCombined_baseFieldID ON baseFieldMappingsCombined(baseFieldID);
CREATE INDEX baseFieldMappingsCombined_fieldID ON baseFieldMappingsCombined(fieldID);
-- Character set names, keyed by charsetID. The table is dropped and
-- recreated here and repopulated by the INSERT statements later in this file.
DROP TABLE IF EXISTS charsets;
CREATE TABLE charsets (
charsetID INTEGER PRIMARY KEY,
charset TEXT UNIQUE
);
-- Index for looking up a row by its charset name
CREATE INDEX charsets_charset ON charsets(charset);
DROP TABLE IF EXISTS fileTypes;
CREATE TABLE fileTypes (
fileTypeID INTEGER PRIMARY KEY,
@ -1132,6 +1137,46 @@ INSERT INTO itemTypeCreatorTypes VALUES(36,3,0);
INSERT INTO itemTypeCreatorTypes VALUES(36,4,0);
INSERT INTO itemTypeCreatorTypes VALUES(36,5,0);
-- Prepopulated character sets. Values are written as single-quoted string
-- literals: in standard SQL double quotes delimit identifiers, and SQLite
-- only treats an unresolvable double-quoted token as a string as a legacy
-- fallback. This also matches the quoting of the other INSERTs in this file.
INSERT INTO "charsets" VALUES (1, 'utf-8');
INSERT INTO "charsets" VALUES (2, 'big5');
INSERT INTO "charsets" VALUES (3, 'euc-jp');
INSERT INTO "charsets" VALUES (4, 'euc-kr');
INSERT INTO "charsets" VALUES (5, 'gb18030');
INSERT INTO "charsets" VALUES (6, 'gbk');
INSERT INTO "charsets" VALUES (7, 'ibm866');
INSERT INTO "charsets" VALUES (8, 'iso-2022-jp');
INSERT INTO "charsets" VALUES (9, 'iso-8859-2');
INSERT INTO "charsets" VALUES (10, 'iso-8859-3');
INSERT INTO "charsets" VALUES (11, 'iso-8859-4');
INSERT INTO "charsets" VALUES (12, 'iso-8859-5');
INSERT INTO "charsets" VALUES (13, 'iso-8859-6');
INSERT INTO "charsets" VALUES (14, 'iso-8859-7');
INSERT INTO "charsets" VALUES (15, 'iso-8859-8');
INSERT INTO "charsets" VALUES (16, 'iso-8859-8-i');
INSERT INTO "charsets" VALUES (17, 'iso-8859-10');
INSERT INTO "charsets" VALUES (18, 'iso-8859-13');
INSERT INTO "charsets" VALUES (19, 'iso-8859-14');
INSERT INTO "charsets" VALUES (20, 'iso-8859-15');
INSERT INTO "charsets" VALUES (21, 'iso-8859-16');
INSERT INTO "charsets" VALUES (22, 'koi8-r');
INSERT INTO "charsets" VALUES (23, 'koi8-u');
INSERT INTO "charsets" VALUES (24, 'macintosh');
INSERT INTO "charsets" VALUES (25, 'replacement');
INSERT INTO "charsets" VALUES (26, 'shift_jis');
INSERT INTO "charsets" VALUES (27, 'utf-16be');
INSERT INTO "charsets" VALUES (28, 'utf-16le');
INSERT INTO "charsets" VALUES (29, 'windows-874');
INSERT INTO "charsets" VALUES (30, 'windows-1250');
INSERT INTO "charsets" VALUES (31, 'windows-1251');
INSERT INTO "charsets" VALUES (32, 'windows-1252');
INSERT INTO "charsets" VALUES (33, 'windows-1253');
INSERT INTO "charsets" VALUES (34, 'windows-1254');
INSERT INTO "charsets" VALUES (35, 'windows-1255');
INSERT INTO "charsets" VALUES (36, 'windows-1256');
INSERT INTO "charsets" VALUES (37, 'windows-1257');
INSERT INTO "charsets" VALUES (38, 'windows-1258');
INSERT INTO "charsets" VALUES (39, 'x-mac-cyrillic');
INSERT INTO "charsets" VALUES (40, 'x-user-defined');
INSERT INTO "fileTypes" VALUES(1, 'webpage');
INSERT INTO "fileTypes" VALUES(2, 'image');
@ -1186,5 +1231,3 @@ INSERT INTO "syncObjectTypes" VALUES(4, 'search');
INSERT INTO "syncObjectTypes" VALUES(5, 'tag');
INSERT INTO "syncObjectTypes" VALUES(6, 'relation');
INSERT INTO "syncObjectTypes" VALUES(7, 'setting');
PRAGMA defer_foreign_keys = false;

View file

@ -49,12 +49,6 @@ CREATE TABLE syncedSettings (
FOREIGN KEY (libraryID) REFERENCES libraries(libraryID) ON DELETE CASCADE
);
CREATE TABLE charsets (
charsetID INTEGER PRIMARY KEY,
charset TEXT UNIQUE
);
CREATE INDEX charsets_charset ON charsets(charset);
-- Primary data applicable to all items
CREATE TABLE items (
itemID INTEGER PRIMARY KEY,

View file

@ -0,0 +1 @@
 

View file

@ -0,0 +1 @@
<EFBFBD>

View file

@ -7,15 +7,33 @@ describe("Zotero.File", function () {
it("should handle an extended character", function* () {
var contents = yield Zotero.File.getContentsAsync(
OS.Path.join(getTestDataDirectory().path, "utf8Char.txt")
OS.Path.join(getTestDataDirectory().path, "charsets", "utf8.txt")
);
assert.lengthOf(contents, 3);
assert.equal(contents, "A\u72acB");
})
// Reads the charsets/windows1252.txt fixture with an explicit "windows-1252"
// charset argument and expects a single decoded character, U+00E9.
it("should handle an extended Windows-1252 character", function* () {
var contents = yield Zotero.File.getContentsAsync(
OS.Path.join(getTestDataDirectory().path, "charsets", "windows1252.txt"),
"windows-1252"
);
// One character of output, not one per raw byte sequence
assert.lengthOf(contents, 1);
assert.equal(contents, "\u00E9");
})
// Reads the charsets/gbk.txt fixture with an explicit "gbk" charset argument
// and expects a single decoded character, U+4E02.
it("should handle a GBK character", function* () {
var contents = yield Zotero.File.getContentsAsync(
OS.Path.join(getTestDataDirectory().path, "charsets", "gbk.txt"),
"gbk"
);
// The fixture must decode to exactly one character
assert.lengthOf(contents, 1);
assert.equal(contents, "\u4e02");
})
it("should handle an invalid character", function* () {
var contents = yield Zotero.File.getContentsAsync(
OS.Path.join(getTestDataDirectory().path, "invalidChar.txt")
OS.Path.join(getTestDataDirectory().path, "charsets", "invalid.txt")
);
assert.lengthOf(contents, 3);
assert.equal(contents, "A\uFFFDB");