diff --git a/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs b/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs index 7d4fa4e91..4819023dd 100644 --- a/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs +++ b/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs @@ -1,9 +1,12 @@ import MagicString from 'magic-string'; -const REQUIRE_POLYFILLS = - `const { TextEncoder, TextDecoder } = require('../vendor/text-encoding'); +const REQUIRE_WEB_UTILS_POLYFILLS = + `const { TextEncoder } = require('../vendor/text-encoding'); const { encode: btoa, decode: atob } = require('../vendor/base64');\n` +const REQUIRE_PARSE_UTF8_POLYFILLS = + `const { TextDecoder } = require('../vendor/text-encoding');\n`; + export class RequireVendor { /** * Take the compiled source code input; types are expected to already have been removed. @@ -14,17 +17,24 @@ export class RequireVendor { * @returns {{ code: string; map: import('magic-string').SourceMap }} */ transform(code, id) { - if (!id.includes('web_byte_utils')) { - return; - } + if (id.includes('parse_utf8')) { + // MagicString lets us edit the source code and still generate an accurate source map + const magicString = new MagicString(code); + magicString.prepend(REQUIRE_PARSE_UTF8_POLYFILLS); - // MagicString lets us edit the source code and still generate an accurate source map - const magicString = new MagicString(code); - magicString.prepend(REQUIRE_POLYFILLS); + return { + code: magicString.toString(), + map: magicString.generateMap({ hires: true }) + }; + } else if (id.includes('web_byte_utils')) { + // MagicString lets us edit the source code and still generate an accurate source map + const magicString = new MagicString(code); + magicString.prepend(REQUIRE_WEB_UTILS_POLYFILLS); - return { - code: magicString.toString(), - map: magicString.generateMap({ hires: true }) - }; + return { + code: magicString.toString(), + map: magicString.generateMap({ hires: true }) + }; + } } } diff --git a/src/parse_utf8.ts b/src/parse_utf8.ts new file mode 100644 index 000000000..045a9080b --- /dev/null +++ b/src/parse_utf8.ts @@ -0,0 +1,35 @@ +import { BSONError } from './error'; + +type TextDecoder = { + readonly encoding: string; + readonly fatal: boolean; + readonly ignoreBOM: boolean; + decode(input?: Uint8Array): string; +}; +type TextDecoderConstructor = { + new (label: 'utf8', options: { fatal: boolean; ignoreBOM?: boolean }): TextDecoder; +}; + +// parse utf8 globals +declare const TextDecoder: TextDecoderConstructor; +let TextDecoderFatal: TextDecoder; +let TextDecoderNonFatal: TextDecoder; + +/** + * Determines if the passed in bytes are valid utf8 + * @param bytes - An array of 8-bit bytes. Must be indexable and have length property + * @param start - The index to start validating + * @param end - The index to end validating + */ +export function parseUtf8(buffer: Uint8Array, start: number, end: number, fatal: boolean): string { + if (fatal) { + TextDecoderFatal ??= new TextDecoder('utf8', { fatal: true }); + try { + return TextDecoderFatal.decode(buffer.subarray(start, end)); + } catch (cause) { + throw new BSONError('Invalid UTF-8 string in BSON document', { cause }); + } + } + TextDecoderNonFatal ??= new TextDecoder('utf8', { fatal: false }); + return TextDecoderNonFatal.decode(buffer.subarray(start, end)); +} diff --git a/src/parser/deserializer.ts b/src/parser/deserializer.ts index ac2781903..165a529cf 100644 --- a/src/parser/deserializer.ts +++ b/src/parser/deserializer.ts @@ -16,7 +16,6 @@ import { BSONSymbol } from '../symbol'; import { Timestamp } from '../timestamp'; import { ByteUtils } from '../utils/byte_utils'; import { NumberUtils } from '../utils/number_utils'; -import { validateUtf8 } from '../validate_utf8'; /** @public */ export interface DeserializeOptions { @@ -604,12 +603,7 @@ function deserializeObject( ) throw new BSONError('bad string length in bson'); // Namespace - if (validation != null && validation.utf8) { - if (!validateUtf8(buffer, index, index + stringSize - 1)) { - throw new BSONError('Invalid UTF-8 string in BSON document'); - } - } - const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, false); + const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, shouldValidateKey); // Update parse index position index = index + stringSize; diff --git a/src/utils/node_byte_utils.ts b/src/utils/node_byte_utils.ts index d6a641a47..ca1482ca0 100644 --- a/src/utils/node_byte_utils.ts +++ b/src/utils/node_byte_utils.ts @@ -1,5 +1,5 @@ import { BSONError } from '../error'; -import { validateUtf8 } from '../validate_utf8'; +import { parseUtf8 } from '../parse_utf8'; import { tryReadBasicLatin, tryWriteBasicLatin } from './latin'; type NodeJsEncoding = 'base64' | 'hex' | 'utf8' | 'binary'; @@ -136,12 +136,9 @@ export const nodeJsByteUtils = { const string = nodeJsByteUtils.toLocalBufferType(buffer).toString('utf8', start, end); if (fatal) { - // TODO(NODE-4930): Insufficiently strict BSON UTF8 validation for (let i = 0; i < string.length; i++) { if (string.charCodeAt(i) === 0xfffd) { - if (!validateUtf8(buffer, start, end)) { - throw new BSONError('Invalid UTF-8 string in BSON document'); - } + parseUtf8(buffer, start, end, true); break; } } diff --git a/src/utils/web_byte_utils.ts b/src/utils/web_byte_utils.ts index 77a1f0f74..0f79f0df3 100644 --- a/src/utils/web_byte_utils.ts +++ b/src/utils/web_byte_utils.ts @@ -1,5 +1,6 @@ import { BSONError } from '../error'; import { tryReadBasicLatin } from './latin'; +import { parseUtf8 } from '../parse_utf8'; type TextDecoder = { readonly encoding: string; @@ -179,14 +180,7 @@ export const webByteUtils = { return basicLatin; } - if (fatal) { - try { - return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end)); - } catch (cause) { - throw new BSONError('Invalid UTF-8 string in BSON document', { cause }); - } - } - return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end)); + return parseUtf8(uint8array, start, end, fatal); }, utf8ByteLength(input: string): number { diff --git a/src/validate_utf8.ts b/src/validate_utf8.ts deleted file mode 100644 index e1da934c6..000000000 --- a/src/validate_utf8.ts +++ /dev/null @@ -1,47 +0,0 @@ -const FIRST_BIT = 0x80; -const FIRST_TWO_BITS = 0xc0; -const FIRST_THREE_BITS = 0xe0; -const FIRST_FOUR_BITS = 0xf0; -const FIRST_FIVE_BITS = 0xf8; - -const TWO_BIT_CHAR = 0xc0; -const THREE_BIT_CHAR = 0xe0; -const FOUR_BIT_CHAR = 0xf0; -const CONTINUING_CHAR = 0x80; - -/** - * Determines if the passed in bytes are valid utf8 - * @param bytes - An array of 8-bit bytes. Must be indexable and have length property - * @param start - The index to start validating - * @param end - The index to end validating - */ -export function validateUtf8( - bytes: { [index: number]: number }, - start: number, - end: number -): boolean { - let continuation = 0; - - for (let i = start; i < end; i += 1) { - const byte = bytes[i]; - - if (continuation) { - if ((byte & FIRST_TWO_BITS) !== CONTINUING_CHAR) { - return false; - } - continuation -= 1; - } else if (byte & FIRST_BIT) { - if ((byte & FIRST_THREE_BITS) === TWO_BIT_CHAR) { - continuation = 1; - } else if ((byte & FIRST_FOUR_BITS) === THREE_BIT_CHAR) { - continuation = 2; - } else if ((byte & FIRST_FIVE_BITS) === FOUR_BIT_CHAR) { - continuation = 3; - } else { - return false; - } - } - } - - return !continuation; -} diff --git a/test/node/byte_utils.test.ts b/test/node/byte_utils.test.ts index fa6d7f893..67a4721fe 100644 --- a/test/node/byte_utils.test.ts +++ b/test/node/byte_utils.test.ts @@ -8,6 +8,7 @@ import { webByteUtils } from '../../src/utils/web_byte_utils'; import * as sinon from 'sinon'; import { loadCJSModuleBSON, loadReactNativeCJSModuleBSON, loadESModuleBSON } from '../load_bson'; import * as crypto from 'node:crypto'; +import { utf8WebPlatformSpecTests } from './data/utf8_wpt_error_cases'; type ByteUtilTest = { name: string; @@ -399,6 +400,7 @@ const fromUTF8Tests: ByteUtilTest<'encodeUTF8Into'>[] = [ } } ]; + const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [ { name: 'should create utf8 string from buffer input', @@ -416,13 +418,6 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [ expect(output).to.be.a('string').with.lengthOf(0); } }, - { - name: 'should throw an error if fatal is set and string is invalid', - inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, true], - expectation({ error }) { - expect(error).to.match(/Invalid UTF-8 string in BSON document/i); - } - }, { name: 'should insert replacement character fatal is false and string is invalid', inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false], @@ -430,8 +425,21 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [ expect(error).to.not.exist; expect(output).to.equal('abc\uFFFD'); } - } + }, + ...utf8WebPlatformSpecTests.map(t => ({ + name: t.name, + inputs: [Uint8Array.from(t.input), 0, t.input.length, true] as [ + buffer: Uint8Array, + start: number, + end: number, + fatal: boolean + ], + expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + })) ]; + const utf8ByteLengthTests: ByteUtilTest<'utf8ByteLength'>[] = [ { name: 'should return zero for empty string', diff --git a/test/node/data/utf8_wpt_error_cases.ts b/test/node/data/utf8_wpt_error_cases.ts new file mode 100644 index 000000000..6d3a98135 --- /dev/null +++ b/test/node/data/utf8_wpt_error_cases.ts @@ -0,0 +1,67 @@ +// extra error cases copied from wpt/encoding/textdecoder-fatal.any.js +// commit sha: 7c9f867 +// link: https://github.com/web-platform-tests/wpt/commit/7c9f8674d9809731e8919073d957d6233f6e0544 + +export const utf8WebPlatformSpecTests = [ + { encoding: 'utf-8', input: [0xff], name: 'invalid code' }, + { encoding: 'utf-8', input: [0xc0], name: 'ends early' }, + { encoding: 'utf-8', input: [0xe0], name: 'ends early 2' }, + { encoding: 'utf-8', input: [0xc0, 0x00], name: 'invalid trail' }, + { encoding: 'utf-8', input: [0xc0, 0xc0], name: 'invalid trail 2' }, + { encoding: 'utf-8', input: [0xe0, 0x00], name: 'invalid trail 3' }, + { encoding: 'utf-8', input: [0xe0, 0xc0], name: 'invalid trail 4' }, + { encoding: 'utf-8', input: [0xe0, 0x80, 0x00], name: 'invalid trail 5' }, + { encoding: 'utf-8', input: [0xe0, 0x80, 0xc0], name: 'invalid trail 6' }, + { encoding: 'utf-8', input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10ffff' }, + { encoding: 'utf-8', input: [0xfe, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' }, + + // Overlong encodings + { encoding: 'utf-8', input: [0xc0, 0x80], name: 'overlong U+0000 - 2 bytes' }, + { encoding: 'utf-8', input: [0xe0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' }, + { encoding: 'utf-8', input: [0xf0, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 4 bytes' }, + { encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' }, + { + encoding: 'utf-8', + input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], + name: 'overlong U+0000 - 6 bytes' + }, + + { encoding: 'utf-8', input: [0xc1, 0xbf], name: 'overlong U+007f - 2 bytes' }, + { encoding: 'utf-8', input: [0xe0, 0x81, 0xbf], name: 'overlong U+007f - 3 bytes' }, + { encoding: 'utf-8', input: [0xf0, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 4 bytes' }, + { encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 5 bytes' }, + { + encoding: 'utf-8', + input: [0xfc, 0x80, 0x80, 0x80, 0x81, 0xbf], + name: 'overlong U+007f - 6 bytes' + }, + + { encoding: 'utf-8', input: [0xe0, 0x9f, 0xbf], name: 'overlong U+07ff - 3 bytes' }, + { encoding: 'utf-8', input: [0xf0, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 4 bytes' }, + { encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 5 bytes' }, + { + encoding: 'utf-8', + input: [0xfc, 0x80, 0x80, 0x80, 0x9f, 0xbf], + name: 'overlong U+07ff - 6 bytes' + }, + + { encoding: 'utf-8', input: [0xf0, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 4 bytes' }, + { encoding: 'utf-8', input: [0xf8, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 5 bytes' }, + { + encoding: 'utf-8', + input: [0xfc, 0x80, 0x80, 0x8f, 0xbf, 0xbf], + name: 'overlong U+ffff - 6 bytes' + }, + + { encoding: 'utf-8', input: [0xf8, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 5 bytes' }, + { + encoding: 'utf-8', + input: [0xfc, 0x80, 0x84, 0x8f, 0xbf, 0xbf], + name: 'overlong U+10ffff - 6 bytes' + }, + + // UTf-16 surrogates encoded as code points in UTf-8 + { encoding: 'utf-8', input: [0xed, 0xa0, 0x80], name: 'lead surrogate' }, + { encoding: 'utf-8', input: [0xed, 0xb0, 0x80], name: 'trail surrogate' }, + { encoding: 'utf-8', input: [0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80], name: 'surrogate pair' } +]; diff --git a/test/node/parser/deserializer.test.ts b/test/node/parser/deserializer.test.ts index 005ccefa4..30c684be5 100644 --- a/test/node/parser/deserializer.test.ts +++ b/test/node/parser/deserializer.test.ts @@ -1,6 +1,7 @@ import * as BSON from '../../register-bson'; import { expect } from 'chai'; -import { bufferFromHexArray } from '../tools/utils'; +import { bufferFromHexArray, int32LEToHex } from '../tools/utils'; +import { utf8WebPlatformSpecTests } from '../data/utf8_wpt_error_cases'; describe('deserializer()', () => { describe('when the fieldsAsRaw options is present and has a value that corresponds to a key in the object', () => { @@ -58,4 +59,47 @@ describe('deserializer()', () => { expect(resultCodeWithScope).to.have.deep.nested.property('a.scope', { b: true }); }); }); + + describe('utf8 validation', () => { + for (const test of utf8WebPlatformSpecTests) { + const inputStringSize = int32LEToHex(test.input.length + 1); // int32 size of string + const inputHexString = Buffer.from(test.input).toString('hex'); + const buffer = bufferFromHexArray([ + '02', // string + '6100', // 'a' key with null terminator + inputStringSize, + inputHexString, + '00' + ]); + context(`when utf8 validation is on and input is ${test.name}`, () => { + it(`throws error containing 'Invalid UTF-8'`, () => { + // global case + expect(() => BSON.deserialize(buffer, { validation: { utf8: true } })).to.throw( + BSON.BSONError, + /Invalid UTF-8 string in BSON document/i + ); + + // specific case + expect(() => BSON.deserialize(buffer, { validation: { utf8: { a: true } } })).to.throw( + BSON.BSONError, + /Invalid UTF-8 string in BSON document/i + ); + }); + }); + + context(`when utf8 validation is off and input is ${test.name}`, () => { + it('returns a string containing at least 1 replacement character', () => { + // global case + expect(BSON.deserialize(buffer, { validation: { utf8: false } })) + .to.have.property('a') + .that.includes('\uFFFD'); + + // specific case + expect(BSON.deserialize(buffer, { validation: { utf8: { a: false } } })) + .to.have.property('a') + .that.includes('\uFFFD'); + }); + }); + } + }); }); diff --git a/test/node/release.test.ts b/test/node/release.test.ts index da69230df..756305b38 100644 --- a/test/node/release.test.ts +++ b/test/node/release.test.ts @@ -50,7 +50,7 @@ const REQUIRED_FILES = [ 'src/utils/number_utils.ts', 'src/utils/web_byte_utils.ts', 'src/utils/latin.ts', - 'src/validate_utf8.ts', + 'src/parse_utf8.ts', 'vendor/base64/base64.js', 'vendor/base64/package.json', 'vendor/base64/LICENSE-MIT.txt',