Implement UTF-16LE encoding, update tests, adjust codec interface
Three major reasons for reimplementing UTF-16 instead of using the native codec:
 1. We want to remove StringDecoder & Buffer references due to #235.
 2. StringDecoder handles surrogates inconsistently on Node v6-9.
 3. The NPM module string_decoder gives strange results when processing chunks -
    it sometimes prepends '\u0000', likely due to a bug.

Performance was and is a major concern here. The decoder shouldn't be affected because it uses
backend methods directly. The encoder is affected because it introduces a character-level loop. It's
still very fast (~450Mb/s), so I'm not too worried. If needed, we can make it about 4x faster
in Node.js by introducing a dedicated backend method. Browser speeds will be the same.
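As an illustration of the chunk-splitting problem the new decoder has to handle (a minimal
sketch against the getDecoder()/write()/end() interface touched by this commit, not code from
the commit itself):

    const iconv = require("iconv-lite");
    // '𝄞' (U+1D11E) is the surrogate pair D834 DD1E, i.e. 4 bytes in UTF-16LE.
    const bytes = iconv.encode("\u{1D11E}", "utf-16le");
    const decoder = iconv.getDecoder("utf-16le");
    // Split mid code unit and mid surrogate pair; the decoder must buffer the partial state.
    const out = decoder.write(bytes.subarray(0, 3)) +
                decoder.write(bytes.subarray(3)) +
                (decoder.end() || "");
    console.log(out === "\u{1D11E}"); // expected: true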
ashtuchkin committed Jul 16, 2020
1 parent e567849 commit 9aa082f
Showing 8 changed files with 464 additions and 100 deletions.
13 changes: 11 additions & 2 deletions backends/web.js
@@ -1,5 +1,9 @@
"use strict";
// NOTE: This backend uses TextDecoder interface.
// NOTE: This backend uses TextDecoder class.
// NOTE: Web backend differs from Node in handling invalid surrogates when decoding to strings in rawCharsToResult() function.
// Node passes them through unchanged, while the web backend (actually TextDecoder) replaces them with '�'. I haven't found a
// performant way to unify these behaviors while keeping compatibility with Node <11 where there's no TextDecoder.
// Not too worried as it seems like an edge case mostly concerning utf-16/utf-32/cesu8 codecs, but something to be aware of.
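//
// Illustrative sketch (not part of this change): a lone high surrogate encoded as the UTF-16LE
// bytes [0x00, 0xD8] shows the divergence:
//   new TextDecoder("utf-16le").decode(new Uint8Array([0x00, 0xD8]))  // '\uFFFD' (web)
//   Buffer.from([0x00, 0xD8]).toString("ucs2")                        // '\uD800' (Node)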

module.exports = {
// Encoder string input: use str directly, .length, .charCodeAt(i).
@@ -38,7 +42,12 @@ module.exports = {
return new Uint16Array(new ArrayBuffer(numChars * Uint16Array.BYTES_PER_ELEMENT));
},
rawCharsToResult(rawChars, finalLen) {
return new TextDecoder("utf-16").decode(rawChars.subarray(0, finalLen));
rawChars = rawChars.subarray(0, finalLen);
// NOTE: TextDecoder will convert all invalid surrogates to '�'-s.
let res = new TextDecoder("utf-16", {ignoreBOM: true}).decode(rawChars);
if (res.length !== finalLen)
throw new Error("TextDecoder returned different length string on array " + rawChars);
return res;
},

// Optimizations
3 changes: 1 addition & 2 deletions encodings/internal.js
@@ -9,8 +9,7 @@ module.exports = {
cesu8: { type: "_internal", bomAware: true},
unicode11utf8: "utf8",

ucs2: { type: "_internal", bomAware: true},
utf16le: "ucs2",
// NOTE: utf-16le/ucs2 are in utf16.js.

binary: { type: "_internal" },
base64: { type: "_internal" },
249 changes: 212 additions & 37 deletions encodings/utf16.js
@@ -1,17 +1,150 @@
"use strict";

// Note: UTF16-LE (or UCS2) codec is Node.js native. See encodings/internal.js
// == UTF16-LE codec. ==========================================================
// Note: We're not using the Node.js native codec because the StringDecoder implementation is buggy
// (adds \0 in some chunks; doesn't flag an odd number of bytes). We do use raw encoding/decoding
// routines for performance where possible, though.

exports.utf16le = class Utf16LECodec {
createEncoder(options, iconv) {
return new Utf16LEEncoder(iconv.backend);
}
createDecoder(options, iconv) {
return new Utf16LEDecoder(iconv.backend, iconv.defaultCharUnicode);
}
get bomAware() { return true; }
}

class Utf16LEEncoder {
constructor(backend) {
this.backend = backend;
}

write(str) {
const bytes = this.backend.allocBytes(str.length * 2);
const chars = new Uint16Array(bytes.buffer, bytes.byteOffset, str.length);
for (let i = 0; i < str.length; i++) {
chars[i] = str.charCodeAt(i);
}
return this.backend.bytesToResult(bytes, bytes.length);
}

end() {}
}

class Utf16LEDecoder {
constructor(backend, defaultChar) {
this.backend = backend;
this.defaultChar = defaultChar;
this.leadByte = -1;
this.leadSurrogate = undefined;
}

write(buf) {
// NOTE: This function is mostly the same as Utf16BEDecoder.write() with bytes swapped.
// Please keep them in sync.
// NOTE: The logic here is more complicated than strictly necessary due to several limitations:
// 1. Input data chunks can split 2-byte code units, making 'leadByte' necessary.
// 2. Input data chunks can split valid surrogate pairs, making 'leadSurrogate' necessary.
// 3. rawCharsToResult() of Web backend converts all lone surrogates to '�', so we need to make
// sure we don't feed it parts of valid surrogate pairs.
// 4. For performance reasons we want to use initial buffer as much as we can. This is not
// possible if after our calculations the 2-byte memory alignment of a Uint16Array is lost,
// in which case we have to do a copy.
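// Illustrative (not from this commit): if the chunk's byteOffset within its ArrayBuffer ends up
// odd (e.g. a Buffer#subarray starting at an odd offset), new Uint16Array(buf.buffer, ...) throws
// a RangeError, so the bytes must be copied into a freshly allocated, 2-byte-aligned buffer.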

if (buf.length == 0) {
return '';
}
let offset = 0;
let byteLen = buf.length;

// Process previous leadByte
let prefix = '';
if (this.leadByte !== -1) {
offset++; byteLen--;
prefix = String.fromCharCode(this.leadByte | (buf[0] << 8));
}

// Set new leadByte if needed
if (byteLen & 1) {
this.leadByte = buf[buf.length-1];
byteLen--;
} else {
this.leadByte = -1;
}

// Process leadSurrogate
if (prefix.length || byteLen) {
// Add high surrogate from previous chunk.
if (this.leadSurrogate) {
if (prefix.length) {
prefix = this.leadSurrogate + prefix;
} else {
// Make sure 'chars' doesn't start with a lone low surrogate; it would mess with rawCharsToResult.
prefix = this.leadSurrogate + String.fromCharCode(buf[offset] | (buf[offset+1] << 8));
offset += 2; byteLen -= 2;
}
this.leadSurrogate = undefined;
}

// Slice off a new high surrogate at the end of the current chunk.
if (byteLen) {
const lastIdx = offset + byteLen - 2;
const lastChar = buf[lastIdx] | (buf[lastIdx+1] << 8);
if (0xD800 <= lastChar && lastChar < 0xDC00) {
this.leadSurrogate = String.fromCharCode(lastChar);
byteLen -= 2;
}
} else { // slice from prefix
const lastChar = prefix.charCodeAt(prefix.length-1);
if (0xD800 <= lastChar && lastChar < 0xDC00) {
this.leadSurrogate = prefix[prefix.length-1];
prefix = prefix.slice(0, -1);
}
}
}

let chars;
if (((buf.byteOffset + offset) & 1) === 0) {
// If byteOffset is aligned, just use the ArrayBuffer from input buf.
chars = new Uint16Array(buf.buffer, buf.byteOffset + offset, byteLen >> 1);
} else {
// If byteOffset is NOT aligned, create a new aligned buffer and copy the data.
chars = this.backend.allocRawChars(byteLen >> 1);
const srcByteView = new Uint8Array(buf.buffer, buf.byteOffset + offset, byteLen);
const destByteView = new Uint8Array(chars.buffer, chars.byteOffset, byteLen);
destByteView.set(srcByteView);
}

return prefix + this.backend.rawCharsToResult(chars, chars.length);
}

end() {
if (this.leadSurrogate || this.leadByte !== -1) {
const res = (this.leadSurrogate ? this.leadSurrogate : '') + (this.leadByte !== -1 ? this.defaultChar : '');
this.leadSurrogate = undefined;
this.leadByte = -1;
return res;
}
}
}
exports.ucs2 = "utf16le"; // Alias


// == UTF16-BE codec. ==========================================================

exports.utf16be = class Utf16BECodec {
get encoder() { return Utf16BEEncoder; }
get decoder() { return Utf16BEDecoder; }
createEncoder(options, iconv) {
return new Utf16BEEncoder(iconv.backend);
}
createDecoder(options, iconv) {
return new Utf16BEDecoder(iconv.backend, iconv.defaultCharUnicode);
}
get bomAware() { return true; }
}

class Utf16BEEncoder {
constructor(opts, codec, backend) {
constructor(backend) {
this.backend = backend;
}

@@ -30,30 +163,86 @@ class Utf16BEEncoder {
}

class Utf16BEDecoder {
constructor(opts, codec, backend) {
constructor(backend, defaultChar) {
this.backend = backend;
this.overflowByte = -1;
this.defaultChar = defaultChar;
this.leadByte = -1;
this.leadSurrogate = undefined;
}

write(buf) {
const chars = this.backend.allocRawChars((buf.length+1) >> 1);
let charsPos = 0, i = 0;

if (this.overflowByte !== -1 && i < buf.length) {
chars[charsPos++] = (this.overflowByte << 8) + buf[i++];
// NOTE: This function is mostly copy/paste from Utf16LEDecoder.write() with bytes swapped.
// Please keep them in sync. Comments in that function apply here too.
if (buf.length === 0) {
return '';
}

for (; i < buf.length-1; i += 2) {
chars[charsPos++] = (buf[i] << 8) + buf[i+1];

let offset = 0;
let byteLen = buf.length;

// Process previous leadByte
let prefix = '';
if (this.leadByte !== -1) {
offset++; byteLen--;
prefix = String.fromCharCode((this.leadByte << 8) | buf[0]);
}

// Set new leadByte
if (byteLen & 1) {
this.leadByte = buf[buf.length-1];
byteLen--;
} else {
this.leadByte = -1;
}

this.overflowByte = (i == buf.length-1) ? buf[i] : -1;
// Process leadSurrogate
if (prefix.length || byteLen) {
// Add high surrogate from previous chunk.
if (this.leadSurrogate) {
if (prefix.length) {
prefix = this.leadSurrogate + prefix;
} else {
// Make sure 'chars' doesn't start with a lone low surrogate; it would mess with rawCharsToResult.
prefix = this.leadSurrogate + String.fromCharCode((buf[offset] << 8) | buf[offset+1]);
offset += 2; byteLen -= 2;
}
this.leadSurrogate = undefined;
}

// Slice off a new high surrogate at the end of the current chunk.
if (byteLen) {
const lastIdx = offset + byteLen - 2;
const lastChar = (buf[lastIdx] << 8) | buf[lastIdx+1];
if (0xD800 <= lastChar && lastChar < 0xDC00) {
this.leadSurrogate = String.fromCharCode(lastChar);
byteLen -= 2;
}
} else { // slice from prefix
const lastChar = prefix.charCodeAt(prefix.length-1);
if (0xD800 <= lastChar && lastChar < 0xDC00) {
this.leadSurrogate = prefix[prefix.length-1];
prefix = prefix.slice(0, -1);
}
}
}

// Convert the main chunk of bytes
const chars = this.backend.allocRawChars(byteLen >> 1);
const srcBytes = new DataView(buf.buffer, buf.byteOffset + offset, byteLen);
for (let i = 0; i < chars.length; i++) {
chars[i] = srcBytes.getUint16(i*2);
}

return this.backend.rawCharsToResult(chars, charsPos);
return prefix + this.backend.rawCharsToResult(chars, chars.length);
}

end() {
this.overflowByte = -1;
if (this.leadSurrogate || this.leadByte !== -1) {
const res = (this.leadSurrogate ? this.leadSurrogate : '') + (this.leadByte !== -1 ? this.defaultChar : '');
this.leadSurrogate = undefined;
this.leadByte = -1;
return res;
}
}
}

@@ -67,39 +256,25 @@ class Utf16BEDecoder {
// Encoder uses UTF-16LE and prepends BOM (which can be overridden with addBOM: false).
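// Illustrative (not from this commit):
//   iconv.encode("abc", "utf-16")                                 // <Buffer ff fe 61 00 62 00 63 00>
//   iconv.decode(Buffer.from([0xFE, 0xFF, 0x00, 0x61]), "utf-16") // 'a' (utf-16be picked via BOM)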

exports.utf16 = class Utf16Codec {
constructor(opts, iconv) {
this.iconv = iconv;
}
get encoder() { return Utf16Encoder; }
get decoder() { return Utf16Decoder; }
}

class Utf16Encoder {
constructor(options, codec) {
createEncoder(options, iconv) {
options = options || {};
if (options.addBOM === undefined)
options.addBOM = true;
this.encoder = codec.iconv.getEncoder(options.use || 'utf-16le', options);
return iconv.getEncoder('utf-16le', options);
}

// Pass-through to this.encoder
write(str) {
return this.encoder.write(str);
}

end() {
return this.encoder.end();
createDecoder(options, iconv) {
return new Utf16Decoder(options, iconv);
}
}

class Utf16Decoder {
constructor(options, codec) {
constructor(options, iconv) {
this.decoder = null;
this.initialBufs = [];
this.initialBufsLen = 0;

this.options = options || {};
this.iconv = codec.iconv;
this.iconv = iconv;
}

write(buf) {
14 changes: 10 additions & 4 deletions lib/index.js
@@ -105,8 +105,11 @@ iconv._canonicalizeEncoding = function(encoding) {
}

iconv.getEncoder = function getEncoder(encoding, options) {
var codec = iconv.getCodec(encoding),
encoder = new codec.encoder(options, codec, iconv.backend);
const codec = iconv.getCodec(encoding);

let encoder = codec.createEncoder
? codec.createEncoder(options, iconv)
: new codec.encoder(options, codec, iconv.backend);

if (codec.bomAware && options && options.addBOM)
encoder = new bomHandling.PrependBOM(encoder, options);
@@ -115,8 +118,11 @@ iconv.getEncoder = function getEncoder(encoding, options) {
}

iconv.getDecoder = function getDecoder(encoding, options) {
var codec = iconv.getCodec(encoding),
decoder = new codec.decoder(options, codec, iconv.backend);
const codec = iconv.getCodec(encoding);

let decoder = codec.createDecoder
? codec.createDecoder(options, iconv)
: new codec.decoder(options, codec, iconv.backend);

if (codec.bomAware && !(options && options.stripBOM === false))
decoder = new bomHandling.StripBOM(decoder, options);
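With this change, iconv.getEncoder()/getDecoder() accept codecs that expose
createEncoder()/createDecoder() factories in addition to the older encoder/decoder constructor
getters. A minimal sketch of a codec written against the new factory interface, using only the
backend methods that appear in this commit (the codec itself is illustrative, not part of the commit):

    // A toy single-byte codec: each char becomes its low byte, each byte becomes a char.
    exports.toy8bit = class Toy8BitCodec {
        createEncoder(options, iconv) {
            const backend = iconv.backend;
            return {
                write(str) {
                    const bytes = backend.allocBytes(str.length);
                    for (let i = 0; i < str.length; i++)
                        bytes[i] = str.charCodeAt(i) & 0xFF;
                    return backend.bytesToResult(bytes, str.length);
                },
                end() {},
            };
        }
        createDecoder(options, iconv) {
            const backend = iconv.backend;
            return {
                write(buf) {
                    const chars = backend.allocRawChars(buf.length);
                    for (let i = 0; i < buf.length; i++)
                        chars[i] = buf[i];
                    return backend.rawCharsToResult(chars, buf.length);
                },
                end() {},
            };
        }
        get bomAware() { return false; }
    }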
1 change: 0 additions & 1 deletion package.json
@@ -37,7 +37,6 @@
"iconv": "^2.3.5",
"mocha": "^3.5.3",
"request": "^2.88.2",
"semver": "^6.3.0",
"unorm": "^1.6.0"
},
"dependencies": {