check UTF-8 in separate chunks, since running a regexp on too much data produces an error
This commit is contained in:
parent
071c05e0bb
commit
e01aa91e49
1 changed files with 30 additions and 26 deletions
|
@ -110,15 +110,10 @@ Zotero.Translate.IO.Read = function(file, mode) {
|
||||||
var charset = null;
|
var charset = null;
|
||||||
|
|
||||||
// look for a BOM in the document
|
// look for a BOM in the document
|
||||||
if(Zotero.isFx4) {
|
|
||||||
var first4 = NetUtil.readInputStreamToString(this._rawStream, 4);
|
|
||||||
} else {
|
|
||||||
var binStream = Components.classes["@mozilla.org/binaryinputstream;1"].
|
var binStream = Components.classes["@mozilla.org/binaryinputstream;1"].
|
||||||
createInstance(Components.interfaces.nsIBinaryInputStream);
|
createInstance(Components.interfaces.nsIBinaryInputStream);
|
||||||
binStream.setInputStream(this._rawStream);
|
binStream.setInputStream(this._rawStream);
|
||||||
var first4 = binStream.readByteArray(4);
|
var first4 = binStream.readBytes(4);
|
||||||
first4 = String.fromCharCode.apply(null, first4);
|
|
||||||
}
|
|
||||||
|
|
||||||
for(var possibleCharset in BOMs) {
|
for(var possibleCharset in BOMs) {
|
||||||
if(first4.substr(0, BOMs[possibleCharset].length) == BOMs[possibleCharset]) {
|
if(first4.substr(0, BOMs[possibleCharset].length) == BOMs[possibleCharset]) {
|
||||||
|
@ -184,6 +179,7 @@ Zotero.Translate.IO.Read = function(file, mode) {
|
||||||
// Check whether the user has specified a charset preference
|
// Check whether the user has specified a charset preference
|
||||||
var charsetPref = Zotero.Prefs.get("import.charset");
|
var charsetPref = Zotero.Prefs.get("import.charset");
|
||||||
if(charsetPref == "auto") {
|
if(charsetPref == "auto") {
|
||||||
|
Zotero.debug("Translate: Checking whether file is UTF-8");
|
||||||
// For auto-detect, we are basically going to check if the file could be valid
|
// For auto-detect, we are basically going to check if the file could be valid
|
||||||
// UTF-8, and if this is true, we will treat it as UTF-8. Prior likelihood of
|
// UTF-8, and if this is true, we will treat it as UTF-8. Prior likelihood of
|
||||||
// UTF-8 is very high, so this should be a reasonable strategy.
|
// UTF-8 is very high, so this should be a reasonable strategy.
|
||||||
|
@ -205,27 +201,35 @@ Zotero.Translate.IO.Read = function(file, mode) {
|
||||||
// Read all currently available bytes from file. I'm not sure how many this is
|
// Read all currently available bytes from file. I'm not sure how many this is
|
||||||
// but it's a safe bet that we don't want to try to read any more than this, since
|
// but it's a safe bet that we don't want to try to read any more than this, since
|
||||||
// it would slow things down considerably.
|
// it would slow things down considerably.
|
||||||
if(Zotero.isFx4) {
|
this._charset = "UTF-8";
|
||||||
var fileContents = NetUtil.readInputStreamToString(this._rawStream, this._rawStream.available());
|
let bytesAvailable;
|
||||||
} else {
|
while(bytesAvailable = this._rawStream.available()) {
|
||||||
var fileContents = binStream.readByteArray(this._rawStream.available());
|
// read 131072 bytes
|
||||||
fileContents = String.fromCharCode.apply(null, fileContents);
|
let fileContents = binStream.readBytes(Math.min(131072, bytesAvailable));
|
||||||
|
|
||||||
|
// on failure, try reading up to 3 more bytes and see if that makes this
|
||||||
|
// valid (since we have chunked it)
|
||||||
|
let isUTF8;
|
||||||
|
for(let i=1; !(isUTF8 = UTF8Regex.test(fileContents)) && i <= 3; i++) {
|
||||||
|
if(this._rawStream.available()) {
|
||||||
|
fileContents += binStream.readBytes(1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Seek back to beginning of file
|
// if the regexp continues to fail, this is not UTF-8
|
||||||
this._seekToStart();
|
if(!isUTF8) {
|
||||||
|
|
||||||
// See whether this could be UTF-8
|
|
||||||
if(UTF8Regex.test(fileContents)) {
|
|
||||||
// Assume this is UTF-8
|
|
||||||
this._charset = "UTF-8";
|
|
||||||
} else {
|
|
||||||
// Can't be UTF-8; see if a default charset is defined
|
// Can't be UTF-8; see if a default charset is defined
|
||||||
this._charset = Zotero.Prefs.get("intl.charset.default", true);
|
this._charset = Zotero.Prefs.get("intl.charset.default", true);
|
||||||
|
|
||||||
// ISO-8859-1 by default
|
// ISO-8859-1 by default
|
||||||
if(!this._charset) this._charset = "ISO-8859-1";
|
if(!this._charset) this._charset = "ISO-8859-1";
|
||||||
|
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Seek back to beginning of file
|
||||||
|
this._seekToStart();
|
||||||
} else {
|
} else {
|
||||||
// No need to auto-detect; user has specified a charset
|
// No need to auto-detect; user has specified a charset
|
||||||
this._charset = charsetPref;
|
this._charset = charsetPref;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue