buffer: add buffer.isUtf8 for utf8 validation #45947
Changes from all commits: be10b36, bcb19ec, 6c8ac38, 103e807, b590f06, e940f59
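For orientation, the change exposes an `isUtf8(input)` helper from the `buffer` module that reports whether a `Buffer`, `TypedArray`, or `ArrayBuffer` holds valid UTF-8. A minimal usage sketch, assuming a Node.js release that ships this API (the exact version boundary is not stated in this thread):

```js
const { isUtf8, Buffer } = require('buffer');

// Well-formed UTF-8, including multi-byte sequences, validates as true.
console.log(isUtf8(Buffer.from('hello, ğ'))); // true

// 0xC0 can only start an overlong sequence, so this is not valid UTF-8.
console.log(isUtf8(Buffer.from([0xC0, 0x80]))); // false
```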
New test file, `@@ -0,0 +1,86 @@`:

```js
'use strict';

require('../common');
const assert = require('assert');
const { isUtf8, Buffer } = require('buffer');
const { TextEncoder } = require('util');

const encoder = new TextEncoder();

assert.strictEqual(isUtf8(encoder.encode('hello')), true);
assert.strictEqual(isUtf8(encoder.encode('ğ')), true);
assert.strictEqual(isUtf8(Buffer.from([])), true);
```
Review thread on `assert.strictEqual(isUtf8(Buffer.from([])), true);`:

- **Member:** Why does a zero-length buffer return `true`? I would expect this to be `false`.
- **Member Author:** Because it does not include an invalid code point. Is there a similar Node function that behaves differently?
- **Member:** But the stated description of the API is, "This function is used to check if input contains UTF-8 code points." An empty buffer does not contain UTF-8 code points, so it really can't return `true`. Other methods we have that accept an `ArrayBuffer` or `TypedArray`, with the exception of Web Streams (which have specifically defined handling for detached buffers), treat a detached buffer as indistinguishable from zero-length input.
- **Member Author:** Hm, that's correct. What do you recommend?
- **Member:** I would just follow up with an additional PR that returns `false` for zero-length input, removing the detached check and error entirely.
- **Member:** In my opinion it should not be changed; it should return `true`. To avoid confusion, the documentation could be updated along the lines of "This function returns `true` if the input contains only valid UTF-8 code points."
- **Member:** The challenge with that logic is that the empty buffer would pass any encoding check. isASCII? Yes. isUTF16le? Yes. isUTF32be? Yes. Is Shift-JIS? Yes... which simply isn't useful. If you want the inverse check, `isInvalidUtf8()`, then implement that.
- **Member Author:** I created a pull request: #45973
- **Member:** The UTF-8 RFC (https://www.rfc-editor.org/rfc/rfc3629) specifies UTF-8 by an ABNF that explicitly includes the empty string. Note that, in general, a non-empty buffer alone does not uniquely determine the character encoding. A BOM may help, but UTF-8 is BOM-less. A string of bytes may be interpreted under different encodings, and in some cases that is by design: ASCII buffers, for example, are always valid UTF-8 and valid Latin-1.
- **Member:** Yes, I think it makes sense, and that is how it works in some other popular programming languages.
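For what it's worth, the WHATWG `TextDecoder` in fatal mode is consistent with that RFC reading: decoding an empty byte sequence does not throw. A small sketch, not part of the PR, showing both behaviours side by side:

```js
const { isUtf8 } = require('buffer'); // requires a Node.js release that ships isUtf8
const { TextDecoder } = require('util');

// A fatal TextDecoder throws on malformed UTF-8, but an empty input
// decodes to the empty string without error.
const decoder = new TextDecoder('utf-8', { fatal: true });
console.log(decoder.decode(new Uint8Array(0))); // '' (no throw)

// isUtf8 mirrors this: an empty buffer is reported as valid UTF-8.
console.log(isUtf8(new Uint8Array(0))); // true
```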
```js
// Taken from test/fixtures/wpt/encoding/textdecoder-fatal.any.js
[
  [0xFF], // 'invalid code'
  [0xC0], // 'ends early'
  [0xE0], // 'ends early 2'
  [0xC0, 0x00], // 'invalid trail'
  [0xC0, 0xC0], // 'invalid trail 2'
  [0xE0, 0x00], // 'invalid trail 3'
  [0xE0, 0xC0], // 'invalid trail 4'
  [0xE0, 0x80, 0x00], // 'invalid trail 5'
  [0xE0, 0x80, 0xC0], // 'invalid trail 6'
  [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // '> 0x10FFFF'
  [0xFE, 0x80, 0x80, 0x80, 0x80, 0x80], // 'obsolete lead byte'

  // Overlong encodings
  [0xC0, 0x80], // 'overlong U+0000 - 2 bytes'
  [0xE0, 0x80, 0x80], // 'overlong U+0000 - 3 bytes'
  [0xF0, 0x80, 0x80, 0x80], // 'overlong U+0000 - 4 bytes'
  [0xF8, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 5 bytes'
  [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 6 bytes'

  [0xC1, 0xBF], // 'overlong U+007F - 2 bytes'
  [0xE0, 0x81, 0xBF], // 'overlong U+007F - 3 bytes'
  [0xF0, 0x80, 0x81, 0xBF], // 'overlong U+007F - 4 bytes'
  [0xF8, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 5 bytes'
  [0xFC, 0x80, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 6 bytes'

  [0xE0, 0x9F, 0xBF], // 'overlong U+07FF - 3 bytes'
  [0xF0, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 4 bytes'
  [0xF8, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 5 bytes'
  [0xFC, 0x80, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 6 bytes'

  [0xF0, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 4 bytes'
  [0xF8, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 5 bytes'
  [0xFC, 0x80, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 6 bytes'

  [0xF8, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 5 bytes'
  [0xFC, 0x80, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 6 bytes'

  // UTF-16 surrogates encoded as code points in UTF-8
  [0xED, 0xA0, 0x80], // 'lead surrogate'
  [0xED, 0xB0, 0x80], // 'trail surrogate'
  [0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80], // 'surrogate pair'
].forEach((input) => {
  assert.strictEqual(isUtf8(Buffer.from(input)), false);
});

[
  null,
  undefined,
  'hello',
  true,
  false,
].forEach((input) => {
  assert.throws(
    () => { isUtf8(input); },
    {
      code: 'ERR_INVALID_ARG_TYPE',
    },
  );
});

{
  // Test with detached array buffers
  const arrayBuffer = new ArrayBuffer(1024);
  structuredClone(arrayBuffer, { transfer: [arrayBuffer] });
  assert.throws(
    () => { isUtf8(arrayBuffer); },
    {
      code: 'ERR_INVALID_STATE'
    }
  );
}
```
Review thread on the detached-`ArrayBuffer` test:

- I know it's after the fact, but why does the buffer being detached matter here? It would otherwise be indistinguishable from a zero-length buffer, which we should just return `false` for anyway.
- Without an error here, detached buffers would create a false sense of UTF-8 validation: since there is no way of accessing the underlying data store and validating it as UTF-8, I believe this error is valid.
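Context for that exchange: once an `ArrayBuffer` is transferred (for example via `structuredClone` with a transfer list), it is detached and reports a `byteLength` of 0, so without an explicit detached check it looks exactly like an empty buffer. A standalone sketch, not taken from the PR, showing the state the test provokes:

```js
// Detach an ArrayBuffer by transferring it with structuredClone.
const arrayBuffer = new ArrayBuffer(1024);
console.log(arrayBuffer.byteLength); // 1024

structuredClone(arrayBuffer, { transfer: [arrayBuffer] });

// The original buffer is now detached: its contents are gone and its
// length collapses to 0, which is why it is indistinguishable from a
// genuinely empty buffer unless the detached state is checked explicitly.
console.log(arrayBuffer.byteLength); // 0
```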