From 8f93ac1de8aed3937943f8e4753576315a3e57e9 Mon Sep 17 00:00:00 2001 From: Yarchik Date: Tue, 23 Jun 2026 01:42:07 +0100 Subject: [PATCH] fix(utf32): reassemble split codepoint from overflow, not source index When a 4-byte UTF-32 unit is split across two stream chunks, the decoder filled `this.overflow` to four bytes and then read it back with the source index `i` (`overflow[i]`...`overflow[i + 3]`) instead of `overflow[0]`...`overflow[3]`. Since `overflow` only holds indices 0-3, the read landed out of range whenever `i > 0`, so every codepoint straddling a chunk boundary decoded to U+0000 (LE) or a byte-shifted character (BE). This block was copied from the main loop (which correctly uses `src[i]`); the index just was not adjusted for the overflow buffer. Whole-buffer decode was unaffected, which is why existing tests passed. --- encodings/utf32.js | 4 ++-- test/utf32-test.js | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/encodings/utf32.js b/encodings/utf32.js index 72317893..aae73751 100644 --- a/encodings/utf32.js +++ b/encodings/utf32.js @@ -113,9 +113,9 @@ Utf32Decoder.prototype.write = function (src) { // NOTE: codepoint is a signed int32 and can be negative. // NOTE: We copied this block from below to help V8 optimize it (it works with array, not buffer). 
if (isLE) { - codepoint = overflow[i] | (overflow[i + 1] << 8) | (overflow[i + 2] << 16) | (overflow[i + 3] << 24) + codepoint = overflow[0] | (overflow[1] << 8) | (overflow[2] << 16) | (overflow[3] << 24) } else { - codepoint = overflow[i + 3] | (overflow[i + 2] << 8) | (overflow[i + 1] << 16) | (overflow[i] << 24) + codepoint = overflow[3] | (overflow[2] << 8) | (overflow[1] << 16) | (overflow[0] << 24) } overflow.length = 0 diff --git a/test/utf32-test.js b/test/utf32-test.js index f43156c5..0635ecdd 100644 --- a/test/utf32-test.js +++ b/test/utf32-test.js @@ -63,6 +63,14 @@ describe("UTF-32LE codec", function () { assert.equal(iconv.decode(Buffer.from([0x61, 0, 0, 0, 0]), "UTF32-LE"), "a") }) + it("decodes correctly when codepoints are split across stream chunks", function () { + for (var at = 1; at < utf32leBuf.length; at++) { + var decoder = iconv.getDecoder("utf-32le") + var res = decoder.write(utf32leBuf.slice(0, at)) + decoder.write(utf32leBuf.slice(at)) + (decoder.end() || "") + assert.equal(res, testStr, "split at byte " + at) + } + }) + it("handles invalid surrogates gracefully", function () { var encoded = iconv.encode(testStr2, "UTF32-LE") assert.equal(escape(iconv.decode(encoded, "UTF32-LE")), escape(testStr2)) @@ -114,6 +122,14 @@ describe("UTF-32BE codec", function () { assert.equal(iconv.decode(Buffer.from([0, 0, 0, 0x61, 0]), "UTF32-BE"), "a") }) + it("decodes correctly when codepoints are split across stream chunks", function () { + for (var at = 1; at < utf32beBuf.length; at++) { + var decoder = iconv.getDecoder("utf-32be") + var res = decoder.write(utf32beBuf.slice(0, at)) + decoder.write(utf32beBuf.slice(at)) + (decoder.end() || "") + assert.equal(res, testStr, "split at byte " + at) + } + }) + it("handles invalid surrogates gracefully", function () { var encoded = iconv.encode(testStr2, "UTF32-BE") assert.equal(escape(iconv.decode(encoded, "UTF32-BE")), escape(testStr2))