From 9aa082fa7f7fb459b579fb44f859d416a1159019 Mon Sep 17 00:00:00 2001
From: Alexander Shtuchkin
Date: Wed, 15 Jul 2020 18:08:49 -0400
Subject: [PATCH] Implement UTF-16LE encoding, update tests, adjust codec interface

Three major reasons for reimplementing UTF-16 instead of using the native codec:
 1. We want to remove StringDecoder & Buffer references due to #235.
 2. StringDecoder is inconsistent in handling surrogates on Node v6-9.
 3. The npm module string_decoder gives strange results when processing chunks:
    it sometimes prepends '\u0000', likely due to a bug.

Performance was and is a major concern here. The decoder shouldn't be affected
because it uses backend methods directly. The encoder is affected by the
introduction of a character-level loop. It's still very fast (~450 Mb/s), so I'm
not too worried. If needed, we can make it about 4x faster in Node.js by
introducing a dedicated backend method. Browser speeds will be the same.
---
 backends/web.js       |  13 ++-
 encodings/internal.js |   3 +-
 encodings/utf16.js    | 249 +++++++++++++++++++++++++++++++++++-------
 lib/index.js          |  14 ++-
 package.json          |   1 -
 test/streams-test.js  |  17 +--
 test/utf16-test.js    | 241 +++++++++++++++++++++++++++++++++-------
 test/utils.js         |  26 +++++
 8 files changed, 464 insertions(+), 100 deletions(-)

diff --git a/backends/web.js b/backends/web.js
index 888c903..531a018 100644
--- a/backends/web.js
+++ b/backends/web.js
@@ -1,5 +1,9 @@
 "use strict";
-// NOTE: This backend uses TextDecoder interface.
+// NOTE: This backend uses the TextDecoder class.
+// NOTE: The Web backend differs from Node in how invalid surrogates are handled when decoding to strings in rawCharsToResult():
+// Node passes them through unchanged; the Web backend (actually TextDecoder) replaces them with '�'. I haven't found a
+// performant way to unify these behaviors while keeping compatibility with Node <11, where there's no TextDecoder.
+// Not too worried, as it seems like an edge case mostly concerning the utf-16/utf-32/cesu8 codecs, but something to be aware of.

 module.exports = {
     // Encoder string input: use str directly, .length, .charCodeAt(i).
@@ -38,7 +42,12 @@ module.exports = {
         return new Uint16Array(new ArrayBuffer(numChars * Uint16Array.BYTES_PER_ELEMENT));
     },
     rawCharsToResult(rawChars, finalLen) {
-        return new TextDecoder("utf-16").decode(rawChars.subarray(0, finalLen));
+        rawChars = rawChars.subarray(0, finalLen);
+        // NOTE: TextDecoder will convert all invalid surrogates to '�'-s.
+        let res = new TextDecoder("utf-16", {ignoreBOM: true}).decode(rawChars);
+        if (res.length !== finalLen)
+            throw new Error("TextDecoder returned different length string on array " + rawChars);
+        return res;
     },

     // Optimizations
diff --git a/encodings/internal.js b/encodings/internal.js
index dc1074f..d04ed2f 100644
--- a/encodings/internal.js
+++ b/encodings/internal.js
@@ -9,8 +9,7 @@ module.exports = {
     cesu8: { type: "_internal", bomAware: true},
     unicode11utf8: "utf8",

-    ucs2: { type: "_internal", bomAware: true},
-    utf16le: "ucs2",
+    // NOTE: utf-16le/ucs2 are in utf16.js.

     binary: { type: "_internal" },
     base64: { type: "_internal" },
diff --git a/encodings/utf16.js b/encodings/utf16.js
index ba23f9a..2cd0452 100644
--- a/encodings/utf16.js
+++ b/encodings/utf16.js
@@ -1,17 +1,150 @@
 "use strict";

-// Note: UTF16-LE (or UCS2) codec is Node.js native. See encodings/internal.js
+// == UTF16-LE codec. ==========================================================
+// Note: We're not using the Node.js native codec because the StringDecoder implementation is buggy
+// (adds \0 in some chunks; doesn't flag an odd number of bytes). We do use raw encoding/decoding
+// routines for performance where possible, though.
+
+exports.utf16le = class Utf16LECodec {
+    createEncoder(options, iconv) {
+        return new Utf16LEEncoder(iconv.backend);
+    }
+    createDecoder(options, iconv) {
+        return new Utf16LEDecoder(iconv.backend, iconv.defaultCharUnicode);
+    }
+    get bomAware() { return true; }
+}
+
+class Utf16LEEncoder {
+    constructor(backend) {
+        this.backend = backend;
+    }
+
+    write(str) {
+        const bytes = this.backend.allocBytes(str.length * 2);
+        const chars = new Uint16Array(bytes.buffer, bytes.byteOffset, str.length);
+        for (let i = 0; i < str.length; i++) {
+            chars[i] = str.charCodeAt(i);
+        }
+        return this.backend.bytesToResult(bytes, bytes.length);
+    }
+
+    end() {}
+}
+
+class Utf16LEDecoder {
+    constructor(backend, defaultChar) {
+        this.backend = backend;
+        this.defaultChar = defaultChar;
+        this.leadByte = -1;
+        this.leadSurrogate = undefined;
+    }
+
+    write(buf) {
+        // NOTE: This function is mostly the same as Utf16BEDecoder.write() with bytes swapped.
+        // Please keep them in sync.
+        // NOTE: The logic here is more complicated than strictly necessary due to several limitations:
+        // 1. Input data chunks can split 2-byte code units, making 'leadByte' necessary.
+        // 2. Input data chunks can split valid surrogate pairs, making 'leadSurrogate' necessary.
+        // 3. rawCharsToResult() of the Web backend converts all lone surrogates to '�', so we need to make
+        //    sure we don't feed it parts of valid surrogate pairs.
+        // 4. For performance reasons we want to use the initial buffer as much as we can. This is not
+        //    possible if after our calculations the 2-byte memory alignment of a Uint16Array is lost,
+        //    in which case we have to do a copy.

+        if (buf.length === 0) {
+            return '';
+        }
+        let offset = 0;
+        let byteLen = buf.length;
+
+        // Process previous leadByte
+        let prefix = '';
+        if (this.leadByte !== -1) {
+            offset++; byteLen--;
+            prefix = String.fromCharCode(this.leadByte | (buf[0] << 8));
+        }
+
+        // Set new leadByte if needed
+        if (byteLen & 1) {
+            this.leadByte = buf[buf.length-1];
+            byteLen--;
+        } else {
+            this.leadByte = -1;
+        }
+
+        // Process leadSurrogate
+        if (prefix.length || byteLen) {
+            // Add high surrogate from previous chunk.
+            if (this.leadSurrogate) {
+                if (prefix.length) {
+                    prefix = this.leadSurrogate + prefix;
+                } else {
+                    // Make sure 'chars' doesn't start with a lone low surrogate; it will mess with rawCharsToResult.
+                    prefix = this.leadSurrogate + String.fromCharCode(buf[offset] | (buf[offset+1] << 8));
+                    offset += 2; byteLen -= 2;
+                }
+                this.leadSurrogate = undefined;
+            }
+
+            // Slice off a new high surrogate at the end of the current chunk.
+            if (byteLen) {
+                const lastIdx = offset + byteLen - 2;
+                const lastChar = buf[lastIdx] | (buf[lastIdx+1] << 8);
+                if (0xD800 <= lastChar && lastChar < 0xDC00) {
+                    this.leadSurrogate = String.fromCharCode(lastChar);
+                    byteLen -= 2;
+                }
+            } else { // slice from prefix
+                const lastChar = prefix.charCodeAt(prefix.length-1);
+                if (0xD800 <= lastChar && lastChar < 0xDC00) {
+                    this.leadSurrogate = prefix[prefix.length-1];
+                    prefix = prefix.slice(0, -1);
+                }
+            }
+        }
+
+        let chars;
+        if (((buf.byteOffset + offset) & 1) === 0) {
+            // If byteOffset is aligned, just use the ArrayBuffer from input buf.
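+            // (A Uint16Array view needs an even byteOffset, so the input ArrayBuffer can be
+            // reused as-is; this saves a per-chunk copy on the common path.)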
+            chars = new Uint16Array(buf.buffer, buf.byteOffset + offset, byteLen >> 1);
+        } else {
+            // If byteOffset is NOT aligned, create a new aligned buffer and copy the data.
+            chars = this.backend.allocRawChars(byteLen >> 1);
+            const srcByteView = new Uint8Array(buf.buffer, buf.byteOffset + offset, byteLen);
+            const destByteView = new Uint8Array(chars.buffer, chars.byteOffset, byteLen);
+            destByteView.set(srcByteView);
+        }
+
+        return prefix + this.backend.rawCharsToResult(chars, chars.length);
+    }
+
+    end() {
+        if (this.leadSurrogate || this.leadByte !== -1) {
+            const res = (this.leadSurrogate ? this.leadSurrogate : '') + (this.leadByte !== -1 ? this.defaultChar : '');
+            this.leadSurrogate = undefined;
+            this.leadByte = -1;
+            return res;
+        }
+    }
+}
+exports.ucs2 = "utf16le"; // Alias

 // == UTF16-BE codec. ==========================================================

 exports.utf16be = class Utf16BECodec {
-    get encoder() { return Utf16BEEncoder; }
-    get decoder() { return Utf16BEDecoder; }
+    createEncoder(options, iconv) {
+        return new Utf16BEEncoder(iconv.backend);
+    }
+    createDecoder(options, iconv) {
+        return new Utf16BEDecoder(iconv.backend, iconv.defaultCharUnicode);
+    }
     get bomAware() { return true; }
 }

 class Utf16BEEncoder {
-    constructor(opts, codec, backend) {
+    constructor(backend) {
         this.backend = backend;
     }
@@ -30,30 +163,86 @@ class Utf16BEEncoder {
 }

 class Utf16BEDecoder {
-    constructor(opts, codec, backend) {
+    constructor(backend, defaultChar) {
         this.backend = backend;
-        this.overflowByte = -1;
+        this.defaultChar = defaultChar;
+        this.leadByte = -1;
+        this.leadSurrogate = undefined;
     }

     write(buf) {
-        const chars = this.backend.allocRawChars((buf.length+1) >> 1);
-        let charsPos = 0, i = 0;
-
-        if (this.overflowByte !== -1 && i < buf.length) {
-            chars[charsPos++] = (this.overflowByte << 8) + buf[i++];
+        // NOTE: This function is mostly copy/paste from Utf16LEDecoder.write() with bytes swapped.
+        // Please keep them in sync. Comments in that function apply here too.
+        if (buf.length === 0) {
+            return '';
         }
-
-        for (; i < buf.length-1; i += 2) {
-            chars[charsPos++] = (buf[i] << 8) + buf[i+1];
+
+        let offset = 0;
+        let byteLen = buf.length;
+
+        // Process previous leadByte
+        let prefix = '';
+        if (this.leadByte !== -1) {
+            offset++; byteLen--;
+            prefix = String.fromCharCode((this.leadByte << 8) | buf[0]);
+        }
+
+        // Set new leadByte if needed
+        if (byteLen & 1) {
+            this.leadByte = buf[buf.length-1];
+            byteLen--;
+        } else {
+            this.leadByte = -1;
         }
-
-        this.overflowByte = (i == buf.length-1) ? buf[i] : -1;

+        // Process leadSurrogate
+        if (prefix.length || byteLen) {
+            // Add high surrogate from previous chunk.
+            if (this.leadSurrogate) {
+                if (prefix.length) {
+                    prefix = this.leadSurrogate + prefix;
+                } else {
+                    // Make sure 'chars' doesn't start with a lone low surrogate; it will mess with rawCharsToResult.
+                    prefix = this.leadSurrogate + String.fromCharCode((buf[offset] << 8) | buf[offset+1]);
+                    offset += 2; byteLen -= 2;
+                }
+                this.leadSurrogate = undefined;
+            }
+
+            // Slice off a new high surrogate at the end of the current chunk.
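+            // (Only a trailing high surrogate, 0xD800-0xDBFF, is deferred here; a trailing low
+            // surrogate can never start a pair, so it is emitted immediately.)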
+ if (byteLen) { + const lastIdx = offset + byteLen - 2; + const lastChar = (buf[lastIdx] << 8) | buf[lastIdx+1]; + if (0xD800 <= lastChar && lastChar < 0xDC00) { + this.leadSurrogate = String.fromCharCode(lastChar); + byteLen -= 2; + } + } else { // slice from prefix + const lastChar = prefix.charCodeAt(prefix.length-1); + if (0xD800 <= lastChar && lastChar < 0xDC00) { + this.leadSurrogate = prefix[prefix.length-1]; + prefix = prefix.slice(0, -1); + } + } + } + + // Convert the main chunk of bytes + const chars = this.backend.allocRawChars(byteLen >> 1); + const srcBytes = new DataView(buf.buffer, buf.byteOffset + offset, byteLen); + for (let i = 0; i < chars.length; i++) { + chars[i] = srcBytes.getUint16(i*2); + } - return this.backend.rawCharsToResult(chars, charsPos); + return prefix + this.backend.rawCharsToResult(chars, chars.length); } end() { - this.overflowByte = -1; + if (this.leadSurrogate || this.leadByte !== -1) { + const res = (this.leadSurrogate ? this.leadSurrogate : '') + (this.leadByte !== -1 ? this.defaultChar : ''); + this.leadSurrogate = undefined; + this.leadByte = -1; + return res; + } } } @@ -67,39 +256,25 @@ class Utf16BEDecoder { // Encoder uses UTF-16LE and prepends BOM (which can be overridden with addBOM: false). exports.utf16 = class Utf16Codec { - constructor(opts, iconv) { - this.iconv = iconv; - } - get encoder() { return Utf16Encoder; } - get decoder() { return Utf16Decoder; } -} - -class Utf16Encoder { - constructor(options, codec) { + createEncoder(options, iconv) { options = options || {}; if (options.addBOM === undefined) options.addBOM = true; - this.encoder = codec.iconv.getEncoder(options.use || 'utf-16le', options); + return iconv.getEncoder('utf-16le', options); } - - // Pass-through to this.encoder - write(str) { - return this.encoder.write(str); - } - - end() { - return this.encoder.end(); + createDecoder(options, iconv) { + return new Utf16Decoder(options, iconv); } } class Utf16Decoder { - constructor(options, codec) { + constructor(options, iconv) { this.decoder = null; this.initialBufs = []; this.initialBufsLen = 0; this.options = options || {}; - this.iconv = codec.iconv; + this.iconv = iconv; } write(buf) { diff --git a/lib/index.js b/lib/index.js index f85d007..89b5890 100644 --- a/lib/index.js +++ b/lib/index.js @@ -105,8 +105,11 @@ iconv._canonicalizeEncoding = function(encoding) { } iconv.getEncoder = function getEncoder(encoding, options) { - var codec = iconv.getCodec(encoding), - encoder = new codec.encoder(options, codec, iconv.backend); + const codec = iconv.getCodec(encoding); + + let encoder = codec.createEncoder + ? codec.createEncoder(options, iconv) + : new codec.encoder(options, codec, iconv.backend); if (codec.bomAware && options && options.addBOM) encoder = new bomHandling.PrependBOM(encoder, options); @@ -115,8 +118,11 @@ iconv.getEncoder = function getEncoder(encoding, options) { } iconv.getDecoder = function getDecoder(encoding, options) { - var codec = iconv.getCodec(encoding), - decoder = new codec.decoder(options, codec, iconv.backend); + const codec = iconv.getCodec(encoding); + + let decoder = codec.createDecoder + ? 
codec.createDecoder(options, iconv) + : new codec.decoder(options, codec, iconv.backend); if (codec.bomAware && !(options && options.stripBOM === false)) decoder = new bomHandling.StripBOM(decoder, options); diff --git a/package.json b/package.json index 90bb804..57b6d78 100644 --- a/package.json +++ b/package.json @@ -37,7 +37,6 @@ "iconv": "^2.3.5", "mocha": "^3.5.3", "request": "^2.88.2", - "semver": "^6.3.0", "unorm": "^1.6.0" }, "dependencies": { diff --git a/test/streams-test.js b/test/streams-test.js index 202781f..ddc8d26 100644 --- a/test/streams-test.js +++ b/test/streams-test.js @@ -1,6 +1,5 @@ var assert = require('assert'), Buffer = require('safer-buffer').Buffer, - semver = require('semver'), iconv = require(__dirname+'/../'); if (!iconv.supportsStreams) @@ -213,17 +212,7 @@ describe("Streaming mode", function() { encoding: "ucs2", input: [[0x3D], [0xD8, 0x3B], [0xDE]], // U+1F63B, 😻, SMILING CAT FACE WITH HEART-SHAPED EYES outputType: false, // Don't concat - checkOutput: function(res) { - if (semver.satisfies(process.version, '>= 6.2.1 < 10.0.0')) { - // After a string_decoder rewrite in https://github.com/nodejs/node/pull/6777, which - // was merged in Node v6.2.1, we don't merge chunks anymore. - // Not really correct, but it seems we cannot do anything with it. - // Though it has been fixed again in Node v10.0.0 - assert.deepEqual(res, ["\uD83D", "\uDE3B"]); - } else { - assert.deepEqual(res, ["\uD83D\uDE3B"]); // We should have only 1 chunk. - } - }, + checkOutput: function(res) { assert.deepEqual(res, ["\uD83D\uDE3B"]); }, // We should have only 1 chunk. })); it("Encoding using internal modules: utf8", checkEncodeStream({ @@ -264,13 +253,13 @@ describe("Streaming mode", function() { it("Decoding of uneven length buffers from UTF-16BE - 2", checkDecodeStream({ encoding: "UTF-16BE", - input: [[0x00, 0x61, 0x00], [0x62, 0x00, 0x63]], + input: [[0x00, 0x61, 0x00], [0x62, 0x00], [0x63]], output: "abc" })); it("Decoding of uneven length buffers from UTF-16", checkDecodeStream({ encoding: "UTF-16", - input: [[0x61], [0x0], [0x20], [0x0]], + input: [[0x61], [0x0, 0x20], [0x0]], output: "a " })); diff --git a/test/utf16-test.js b/test/utf16-test.js index 7a18188..3629118 100644 --- a/test/utf16-test.js +++ b/test/utf16-test.js @@ -1,80 +1,241 @@ -var assert = require('assert'), - utils = require('./utils'), - iconv = utils.requireIconv(), - hex = utils.hex; - -var testStr = "1aя中文☃💩"; - utf16beBuf = utils.bytesFrom([0, 0x31, 0, 0x61, 0x04, 0x4f, 0x4e, 0x2d, 0x65, 0x87, 0x26, 0x03, 0xd8, 0x3d, 0xdc, 0xa9]), - utf16leBuf = utils.bytesFrom([0x31, 0, 0x61, 0, 0x4f, 0x04, 0x2d, 0x4e, 0x87, 0x65, 0x03, 0x26, 0x3d, 0xd8, 0xa9, 0xdc]), - utf16beBOM = utils.bytesFrom([0xFE, 0xFF]), - utf16leBOM = utils.bytesFrom([0xFF, 0xFE]), - sampleStr = '\n<俄语>данные'; - -describe("UTF-16LE codec #node-web", function() { +"use strict"; + +const assert = require('assert'), + utils = require('./utils'), + iconv = utils.requireIconv(), + hex = utils.hex; + +const testStr = "1aя中文☃💩", + utf16beBuf = utils.bytesFrom([0, 0x31, 0, 0x61, 0x04, 0x4f, 0x4e, 0x2d, 0x65, 0x87, 0x26, 0x03, 0xd8, 0x3d, 0xdc, 0xa9]), + utf16leBuf = utils.bytesFrom([0x31, 0, 0x61, 0, 0x4f, 0x04, 0x2d, 0x4e, 0x87, 0x65, 0x03, 0x26, 0x3d, 0xd8, 0xa9, 0xdc]), + utf16beBOM = utils.bytesFrom([0xFE, 0xFF]), + utf16leBOM = utils.bytesFrom([0xFF, 0xFE]), + sampleStr = '\n<数据>נְתוּנִים', + weirdBuf = utils.bytesFrom([0x15, 0x16, 0x17, 0x18]); // Can't automatically detect whether it's LE or BE. 
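+// (weirdBuf decodes to valid-but-meaningless text under either byte order and contains no BOM
+// and no spaces, so the space-based heuristic can't pick an endianness and the default is used.)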
+
+
+describe("UTF-16LE encoder #node-web", function() {
+    const enc = 'utf16-le';
+    it("encodes basic strings correctly", function() {
+        assert.equal(hex(iconv.encode('', enc)), '');
+        assert.equal(hex(iconv.encode(testStr, enc)), hex(utf16leBuf));
+    });
+
+    it("adds BOM if asked", function() {
+        assert.equal(hex(iconv.encode(testStr, enc, {addBOM: true})), hex(utf16leBOM) + hex(utf16leBuf));
+    });
+
+    // NOTE: I'm not sure what the right behavior is here. Node.js keeps all invalid surrogates as-is for
+    // both utf-16le and ucs2 encodings. TextEncoder can't encode utf-16, but when using utf-8 it replaces
+    // these with '�'. Leaning towards the Node side for now.
+    it("keeps single and invalid surrogates as-is", function() {
+        assert.equal(hex(iconv.encode(' \uD800 \uDE00 \uDE00\uD800 \uD800', enc)),
+            "2000 00d8 2000 00de 2000 00de 00d8 2000 00d8".replace(/ /g, ""));
+    });
+
+    it("has full 16-bit transparency", function() {
+        let s = '', arr = [];
+        for (let i = 0; i < 65536; i++) {
+            s += String.fromCharCode(i);
+            arr.push(i & 0xFF, i >> 8);
+        }
+        assert.equal(hex(iconv.encode(s, enc)), hex(utils.bytesFrom(arr)));
+    });
+
+    it("keeps valid surrogate pairs split on a chunk boundary unchanged", function() {
+        const encoder = iconv.getEncoder(enc);
+        assert.equal(hex(encoder.write('\uD83D')), '3dd8');
+        assert.equal(hex(encoder.write('\uDCA9')), 'a9dc');
+        assert.strictEqual(encoder.end(), undefined);
+    });
+});
+
+describe("UTF-16LE decoder #node-web", function() {
+    const enc = 'utf16-le';
+    it("decodes basic buffers correctly", function() {
+        assert.equal(iconv.decode(utf16leBuf, enc), testStr);
+    });
+
+    it("decodes uneven length buffers with a trailing error character", function() {
+        assert.equal(iconv.decode(utils.bytesFrom([0x61, 0, 0]), enc), "a�");
+    });

     it("decodes very short buffers correctly", function() {
-        assert.equal(iconv.decode(utils.bytesFrom([]), 'utf-16le'), '');
-
-        // Looks like StringDecoder doesn't do the right thing here, returning '\u0000'. TODO: fix.
-        //assert.equal(iconv.decode(utils.bytesFrom([0x61]), 'utf-16le'), '');
+        assert.equal(iconv.decode(utils.bytesFrom([]), enc), '');
+        assert.equal(iconv.decode(utils.bytesFrom([0x61]), enc), '�');
     });
+
+    // NOTE: Node and Web backends differ in handling invalid surrogates: node passes them through, web
+    // replaces them with '�'. Don't know what to do with this, as I haven't found a performant way
+    // to unify them while keeping compatibility with Node 4.5 where there's no TextDecoder.
+    // Not too worried as it seems like an edge case, but something to be aware of.
+    // When this is resolved, please add the same tests to the utf16-be codec too.
+    it.skip("passes through invalid surrogates as-is", function() {
+        assert.equal(iconv.decode(utils.bytesFrom(
+            [0x20, 0x00, 0x00, 0xd8, 0x20, 0x00, 0x00, 0xde, 0x20, 0x00, 0x00, 0xde, 0x00, 0xd8, 0x20, 0x00, 0x00, 0xd8]), enc),
+            ' \uD800 \uDE00 \uDE00\uD800 \uD800');
+    });

+    // See comment in the test above.
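+    // ("Full 16-bit transparency" = every one of the 65536 UTF-16 code unit values, including
+    // lone surrogates, survives a decode round-trip unchanged.)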
+    it.skip("has full 16-bit transparency", function() {
+        let s = '', arr = [];
+        for (let i = 0; i < 65536; i++) {
+            s += String.fromCharCode(i);
+            arr.push(i & 0xFF, i >> 8);
+        }
+        assert.equal(iconv.decode(utils.bytesFrom(arr), enc), s);
     });
+
+    it("handles chunks with uneven lengths correctly", utils.checkDecoderChunks(enc, {
+        inputs:  [[], [0x61], [], [0x00], [0x61], [0x00, 0x61], [0x00, 0x00]],
+        outputs: ['', '', '', 'a', '', 'a', 'a', '�'],
+    }));
+
+    it("doesn't split valid surrogate pairs between chunks", utils.checkDecoderChunks(enc, [{
+        inputs:  [[0x3D, 0xD8, 0x3B], [0xDE]],
+        outputs: ['', "\uD83D\uDE3B"],
+    }, {
+        inputs:  [[0x3D, 0xD8], [0x3B], [0xDE]],
+        outputs: ['', '', "\uD83D\uDE3B"],
+    }, {
+        inputs:  [[0x3D], [0xD8, 0x3B], [0xDE]],
+        outputs: ['', '', "\uD83D\uDE3B"],
+    }, {
+        inputs:  [[0x3D], [0xD8], [0x3B], [0xDE]],
+        outputs: ['', '', '', "\uD83D\uDE3B"],
+    }]));
+
+    it("handles complex surrogate pair cases", utils.checkDecoderChunks(enc, [{
+        inputs:  [[0x3E], [0xD9], [0x3D], [0xD8], [0x3B], [0xDE]],
+        outputs: ['', '', '', '\uD93E', '', "\uD83D\uDE3B"]
+    }, {
+        inputs:  [[0x3E, 0xD9, 0x3D], [0xD8], [0x3B, 0xDE]],
+        outputs: ['', '\uD93E', "\uD83D\uDE3B"],
+    }, {
+        inputs:  [[0x3E, 0xD9, 0x3D]],
+        outputs: ['', '\uD93E�'],
+    }, {
+        inputs:  [[0x3E, 0xD9], [0x3D]],
+        outputs: ['', '', '\uD93E�'],
+    }, {
+        inputs:  [[0x3E, 0xD9]],
+        outputs: ['', '\uD93E'],
+    }]));
 });

-describe("UTF-16BE codec #node-web", function() {
+describe("UTF-16BE encoder #node-web", function() {
+    const enc = 'utf16-be';
     it("encodes basic strings correctly", function() {
-        assert.equal(hex(iconv.encode(testStr, 'utf16-be')), hex(utf16beBuf));
+        assert.equal(hex(iconv.encode('', enc)), '');
+        assert.equal(hex(iconv.encode(testStr, enc)), hex(utf16beBuf));
+    });
+
+    it("adds BOM if asked", function() {
+        assert.equal(hex(iconv.encode(testStr, enc, {addBOM: true})), hex(utf16beBOM) + hex(utf16beBuf));
+    });
+
+    // See note in the UTF-16LE encoder above; we need to keep them consistent.
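+    // (Same code units as the LE test; the expected bytes are simply byte-swapped to big-endian.)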
+    it("keeps single and invalid surrogates as-is", function() {
+        assert.equal(hex(iconv.encode(' \uD800 \uDE00 \uDE00\uD800 \uD800', enc)),
+            "0020 d800 0020 de00 0020 de00 d800 0020 d800".replace(/ /g, ""));
     });

+    it("handles valid surrogate pairs on chunk boundary correctly", function() {
+        const encoder = iconv.getEncoder(enc);
+        assert.equal(hex(encoder.write('\uD83D')), 'd83d');
+        assert.equal(hex(encoder.write('\uDCA9')), 'dca9');
+        assert.strictEqual(encoder.end(), undefined);
+    });
+});
+
+describe("UTF-16BE decoder #node-web", function() {
+    const enc = 'utf16-be';
     it("decodes basic buffers correctly", function() {
-        assert.equal(iconv.decode(utf16beBuf, 'utf16-be'), testStr);
+        assert.equal(iconv.decode(utf16beBuf, enc), testStr);
     });

-    it("decodes uneven length buffers with no error", function() {
-        assert.equal(iconv.decode(utils.bytesFrom([0, 0x61, 0]), 'utf16-be'), "a");
+    it("decodes uneven length buffers with a trailing error character", function() {
+        assert.equal(iconv.decode(utils.bytesFrom([0, 0x61, 0]), enc), "a�");
     });

     it("decodes very short buffers correctly", function() {
-        assert.equal(iconv.decode(utils.bytesFrom([]), 'utf-16be'), '');
-        assert.equal(iconv.decode(utils.bytesFrom([0x61]), 'utf-16be'), '');
+        assert.equal(iconv.decode(utils.bytesFrom([]), enc), '');
+        assert.equal(iconv.decode(utils.bytesFrom([0x61]), enc), '�');
     });
+
+    it("handles chunks with uneven lengths correctly", utils.checkDecoderChunks(enc, {
+        inputs:  [[], [0x00], [], [0x61], [0x00], [0x61, 0x00], [0x61, 0x00]],
+        outputs: ['', '', '', 'a', '', 'a', 'a', '�'],
+    }));
+
+    it("doesn't split valid surrogate pairs between chunks", utils.checkDecoderChunks(enc, [{
+        inputs:  [[0xD8, 0x3D, 0xDE], [0x3B]],
+        outputs: ['', "\uD83D\uDE3B"],
+    }, {
+        inputs:  [[0xD8, 0x3D], [0xDE], [0x3B]],
+        outputs: ['', '', "\uD83D\uDE3B"],
+    }, {
+        inputs:  [[0xD8], [0x3D, 0xDE], [0x3B]],
+        outputs: ['', '', "\uD83D\uDE3B"],
+    }, {
+        inputs:  [[0xD8], [0x3D], [0xDE], [0x3B]],
+        outputs: ['', '', '', "\uD83D\uDE3B"],
+    }]));
+
+    it("handles complex surrogate pair cases", utils.checkDecoderChunks(enc, [{
+        inputs:  [[0xD9], [0x3E], [0xD8], [0x3D], [0xDE], [0x3B]],
+        outputs: ['', '', '', '\uD93E', '', "\uD83D\uDE3B"]
+    }, {
+        inputs:  [[0xD9, 0x3E, 0xD8], [0x3D], [0xDE, 0x3B]],
+        outputs: ['', '\uD93E', "\uD83D\uDE3B"],
+    }, {
+        inputs:  [[0xD9, 0x3E, 0xD8]],
+        outputs: ['', '\uD93E�'],
+    }, {
+        inputs:  [[0xD9, 0x3E], [0xD8]],
+        outputs: ['', '', '\uD93E�'],
+    }, {
+        inputs:  [[0xD9, 0x3E]],
+        outputs: ['', '\uD93E'],
+    }]));
 });

 describe("UTF-16 encoder #node-web", function() {
+    const enc = 'utf-16';
     it("uses UTF-16LE and adds BOM when encoding", function() {
-        assert.equal(hex(iconv.encode(testStr, "utf-16")), hex(utf16leBOM) + hex(utf16leBuf));
+        assert.equal(hex(iconv.encode(testStr, enc)), hex(utf16leBOM) + hex(utf16leBuf));
     });

     it("can skip BOM", function() {
-        assert.equal(hex(iconv.encode(testStr, "utf-16", {addBOM: false})), hex(utf16leBuf));
-    });
-
-    it("can use other encodings, for example UTF-16BE, with BOM", function() {
-        assert.equal(hex(iconv.encode(testStr, "utf-16", {use: 'UTF-16BE'})), hex(utf16beBOM) + hex(utf16beBuf));
+        assert.equal(hex(iconv.encode(testStr, enc, {addBOM: false})), hex(utf16leBuf));
     });
 });

 describe("UTF-16 decoder #node-web", function() {
+    const enc = 'utf-16',
+        encLE = 'utf-16le',
+        encBE = 'utf-16be';
+
     it("uses BOM to determine encoding", function() {
-        assert.equal(iconv.decode(utils.concatBufs([utf16leBOM, utf16leBuf]), "utf-16"), testStr);
-        assert.equal(iconv.decode(utils.concatBufs([utf16beBOM, utf16beBuf]), "utf-16"), testStr);
+        assert.equal(iconv.decode(utils.concatBufs([utf16leBOM, utf16leBuf]), enc), testStr);
+        assert.equal(iconv.decode(utils.concatBufs([utf16beBOM, utf16beBuf]), enc), testStr);
     });

     it("handles very short buffers", function() {
-        assert.equal(iconv.decode(utils.bytesFrom([]), 'utf-16'), '');
-
-        // Looks like StringDecoder doesn't do the right thing here. TODO: fix.
-        //assert.equal(iconv.decode(utils.bytesFrom([0x61]), 'utf-16'), '');
+        assert.equal(iconv.decode(utils.bytesFrom([]), enc), '');
+        assert.equal(iconv.decode(utils.bytesFrom([0x61]), enc), '�');
     });

     it("uses spaces when there is no BOM to determine encoding", function() {
-        assert.equal(iconv.decode(iconv.encode(sampleStr, 'utf-16le'), 'utf-16'), sampleStr);
-        assert.equal(iconv.decode(iconv.encode(sampleStr, 'utf-16be'), 'utf-16'), sampleStr);
+        assert.equal(iconv.decode(iconv.encode(sampleStr, encLE), enc), sampleStr);
+        assert.equal(iconv.decode(iconv.encode(sampleStr, encBE), enc), sampleStr);
     });

     it("uses UTF-16LE if no BOM and heuristics failed", function() {
-        assert.equal(iconv.decode(utf16leBuf, 'utf-16'), testStr);
+        assert.equal(iconv.decode(weirdBuf, enc), iconv.decode(weirdBuf, encLE));
     });

     it("can be given a different default encoding", function() {
-        assert.equal(iconv.decode(utf16leBuf, 'utf-16', {default: 'utf-16le'}), testStr);
+        assert.equal(iconv.decode(weirdBuf, enc, {defaultEncoding: encBE}), iconv.decode(weirdBuf, encBE));
     });
 });
diff --git a/test/utils.js b/test/utils.js
index dc9e3c8..f0a430b 100644
--- a/test/utils.js
+++ b/test/utils.js
@@ -1,3 +1,5 @@
+"use strict";
+
 const assert = require("assert");

 const utils = module.exports = {
@@ -34,5 +36,29 @@ const utils = module.exports = {
         assert(nonStrict || (bytes instanceof utils.BytesType));
         return bytes.reduce((output, byte) => (output + ('0' + (byte & 0xFF).toString(16)).slice(-2)), '');
     },
+
+    checkDecoderChunks(encoding, cases) {
+        return () => {
+            const decoder = utils.iconv.getDecoder(encoding);
+            if (!Array.isArray(cases)) {
+                cases = [cases];
+            }
+
+            for (let idx = 0; idx < cases.length; idx++) {
+                const inputs = cases[idx].inputs,
+                    outputs = cases[idx].outputs;
+                for (let i = 0; i < inputs.length; i++)
+                    assert.strictEqual(decoder.write(utils.bytesFrom(inputs[i])), outputs[i], `position ${i} in case ${idx}`);
+
+                if (outputs.length === inputs.length) {
+                    assert(!decoder.end(), `end is not empty in case ${idx}`);
+                } else if (outputs.length === inputs.length + 1) {
+                    assert.strictEqual(decoder.end(), outputs[outputs.length-1], `end result unexpected in case ${idx}`);
+                } else {
+                    assert(false, `invalid outputs array size in case ${idx}`);
+                }
+            }
+        }
+    },
 };
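
For illustration, a minimal sketch of the new chunked decoding end-to-end, using the public
iconv.getDecoder() API exercised by the tests above (assumes the Node backend, so plain
Buffers can be passed to write()):

    const iconv = require('iconv-lite');

    // U+1F63B (😻) is the UTF-16LE byte sequence 3D D8 3B DE; feed it in three chunks.
    const decoder = iconv.getDecoder('utf-16le');
    let out = '';
    out += decoder.write(Buffer.from([0x3D, 0xD8])); // high surrogate held back -> ''
    out += decoder.write(Buffer.from([0x3B]));       // odd trailing byte held back -> ''
    out += decoder.write(Buffer.from([0xDE]));       // pair completed -> '\uD83D\uDE3B'
    out += decoder.end() || '';                      // nothing left pending
    console.log(out); // 😻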