1- /* auto-generated on 2024-12-17 14:54:59  -0500. Do not edit! */
1+ /* auto-generated on 2024-12-26 12:42:33  -0500. Do not edit! */
22/* begin file src/simdutf.cpp */
33#include "simdutf.h"
44// We include base64_tables once.
@@ -697,6 +697,15 @@ static_assert(to_base64_url_value[uint8_t('_')] == 63,
697697#include <climits>
698698#include <type_traits>
699699
// Compile-time checks on the character types used throughout this file.
// The code passes buffers of char/char16_t/char32_t to routines that treat
// them as 8-, 16- and 32-bit units, so the sizes must match the fixed-width
// integer types.
static_assert(sizeof(uint8_t) == sizeof(char),
              "simdutf requires that uint8_t be a char");
static_assert(sizeof(uint16_t) == sizeof(char16_t),
              "simdutf requires that char16_t be 16 bits");
static_assert(sizeof(uint32_t) == sizeof(char32_t),
              "simdutf requires that char32_t be 32 bits");
// next line is redundant, but it is kept to catch defective systems.
static_assert(CHAR_BIT == 8, "simdutf requires 8-bit bytes");
708+ 
700709// Useful for debugging purposes
701710namespace simdutf {
702711namespace {
@@ -9746,24 +9755,23 @@ inline simdutf_warn_unused uint16_t swap_bytes(const uint16_t word) {
97469755}
97479756
97489757template <endianness big_endian>
9749- inline simdutf_warn_unused bool validate(const char16_t *buf ,
9758+ inline simdutf_warn_unused bool validate(const char16_t *data ,
97509759 size_t len) noexcept {
9751-  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
97529760 uint64_t pos = 0;
97539761 while (pos < len) {
9754-  uint16_t  word =
9762+  char16_t  word =
97559763 !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
97569764 if ((word & 0xF800) == 0xD800) {
97579765 if (pos + 1 >= len) {
97589766 return false;
97599767 }
9760-  uint16_t  diff = uint16_t (word - 0xD800);
9768+  char16_t  diff = char16_t (word - 0xD800);
97619769 if (diff > 0x3FF) {
97629770 return false;
97639771 }
9764-  uint16_t  next_word =
9772+  char16_t  next_word =
97659773 !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
9766-  uint16_t  diff2 = uint16_t (next_word - 0xDC00);
9774+  char16_t  diff2 = char16_t (next_word - 0xDC00);
97679775 if (diff2 > 0x3FF) {
97689776 return false;
97699777 }
@@ -9776,24 +9784,23 @@ inline simdutf_warn_unused bool validate(const char16_t *buf,
97769784}
97779785
97789786template <endianness big_endian>
9779- inline simdutf_warn_unused result validate_with_errors(const char16_t *buf ,
9787+ inline simdutf_warn_unused result validate_with_errors(const char16_t *data ,
97809788 size_t len) noexcept {
9781-  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
97829789 size_t pos = 0;
97839790 while (pos < len) {
9784-  uint16_t  word =
9791+  char16_t  word =
97859792 !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
97869793 if ((word & 0xF800) == 0xD800) {
97879794 if (pos + 1 >= len) {
97889795 return result(error_code::SURROGATE, pos);
97899796 }
9790-  uint16_t  diff = uint16_t (word - 0xD800);
9797+  char16_t  diff = char16_t (word - 0xD800);
97919798 if (diff > 0x3FF) {
97929799 return result(error_code::SURROGATE, pos);
97939800 }
9794-  uint16_t  next_word =
9801+  char16_t  next_word =
97959802 !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
9796-  uint16_t  diff2 = uint16_t(next_word - 0xDC00);
9803+  char16_t  diff2 = uint16_t(next_word - 0xDC00);
97979804 if (diff2 > 0x3FF) {
97989805 return result(error_code::SURROGATE, pos);
97999806 }
@@ -9806,24 +9813,22 @@ inline simdutf_warn_unused result validate_with_errors(const char16_t *buf,
98069813}
98079814
98089815template <endianness big_endian>
9809- inline size_t count_code_points(const char16_t *buf , size_t len) {
9816+ inline size_t count_code_points(const char16_t *p , size_t len) {
98109817 // We are not BOM aware.
9811-  const uint16_t *p = reinterpret_cast<const uint16_t *>(buf);
98129818 size_t counter{0};
98139819 for (size_t i = 0; i < len; i++) {
9814-  uint16_t  word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
9820+  char16_t  word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
98159821 counter += ((word & 0xFC00) != 0xDC00);
98169822 }
98179823 return counter;
98189824}
98199825
98209826template <endianness big_endian>
9821- inline size_t utf8_length_from_utf16(const char16_t *buf , size_t len) {
9827+ inline size_t utf8_length_from_utf16(const char16_t *p , size_t len) {
98229828 // We are not BOM aware.
9823-  const uint16_t *p = reinterpret_cast<const uint16_t *>(buf);
98249829 size_t counter{0};
98259830 for (size_t i = 0; i < len; i++) {
9826-  uint16_t  word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
9831+  char16_t  word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
98279832 counter++; // ASCII
98289833 counter += static_cast<size_t>(
98299834 word >
@@ -9835,25 +9840,22 @@ inline size_t utf8_length_from_utf16(const char16_t *buf, size_t len) {
98359840}
98369841
98379842template <endianness big_endian>
9838- inline size_t utf32_length_from_utf16(const char16_t *buf , size_t len) {
9843+ inline size_t utf32_length_from_utf16(const char16_t *p , size_t len) {
98399844 // We are not BOM aware.
9840-  const uint16_t *p = reinterpret_cast<const uint16_t *>(buf);
98419845 size_t counter{0};
98429846 for (size_t i = 0; i < len; i++) {
9843-  uint16_t  word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
9847+  char16_t  word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
98449848 counter += ((word & 0xFC00) != 0xDC00);
98459849 }
98469850 return counter;
98479851}
98489852
/**
 * Returns `len` unchanged.
 *
 * @param len  input length; presumably a count of UTF-16 words, with one
 *             Latin-1 byte produced per word (inferred from the name only;
 *             confirm against callers)
 * @return     the same value as `len`
 */
inline size_t latin1_length_from_utf16(size_t len) { return len; }
98509854
9851- simdutf_really_inline void change_endianness_utf16(const char16_t *in,
9852-  size_t size, char16_t *out) {
9853-  const uint16_t *input = reinterpret_cast<const uint16_t *>(in);
9854-  uint16_t *output = reinterpret_cast<uint16_t *>(out);
9855+ simdutf_really_inline void
9856+ change_endianness_utf16(const char16_t *input, size_t size, char16_t *output) {
98559857 for (size_t i = 0; i < size; i++) {
9856-  *output++ = uint16_t (input[i] >> 8 | input[i] << 8);
9858+  *output++ = char16_t (input[i] >> 8 | input[i] << 8);
98579859 }
98589860}
98599861
@@ -21042,6 +21044,9 @@ struct validating_transcoder {
2104221044 uint64_t utf8_continuation_mask =
2104321045 input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
2104421046 // this case, we also have ASCII to account for.
21047+  if (utf8_continuation_mask & 1) {
21048+  return 0; // error
21049+  }
2104521050 uint64_t utf8_leading_mask = ~utf8_continuation_mask;
2104621051 uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
2104721052 // We process in blocks of up to 12 bytes except possibly
@@ -26717,6 +26722,14 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen,
2671726722 }
2671826723
2671926724 if (!ignore_garbage && equalsigns > 0) {
26725+  if (last_chunk_options == last_chunk_handling_options::strict) {
26726+  return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
26727+  size_t(dst - dstinit)};
26728+  }
26729+  if (last_chunk_options ==
26730+  last_chunk_handling_options::stop_before_partial) {
26731+  return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
26732+  }
2672026733 if ((size_t(dst - dstinit) % 3 == 0) ||
2672126734 ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
2672226735 return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
@@ -33161,6 +33174,9 @@ struct validating_transcoder {
3316133174 uint64_t utf8_continuation_mask =
3316233175 input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
3316333176 // this case, we also have ASCII to account for.
33177+  if (utf8_continuation_mask & 1) {
33178+  return 0; // error
33179+  }
3316433180 uint64_t utf8_leading_mask = ~utf8_continuation_mask;
3316533181 uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
3316633182 // We process in blocks of up to 12 bytes except possibly
@@ -43013,6 +43029,9 @@ struct validating_transcoder {
4301343029 uint64_t utf8_continuation_mask =
4301443030 input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
4301543031 // this case, we also have ASCII to account for.
43032+  if (utf8_continuation_mask & 1) {
43033+  return 0; // error
43034+  }
4301643035 uint64_t utf8_leading_mask = ~utf8_continuation_mask;
4301743036 uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
4301843037 // We process in blocks of up to 12 bytes except possibly
@@ -48110,6 +48129,9 @@ struct validating_transcoder {
4811048129 uint64_t utf8_continuation_mask =
4811148130 input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
4811248131 // this case, we also have ASCII to account for.
48132+  if (utf8_continuation_mask & 1) {
48133+  return 0; // error
48134+  }
4811348135 uint64_t utf8_leading_mask = ~utf8_continuation_mask;
4811448136 uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
4811548137 // We process in blocks of up to 12 bytes except possibly
@@ -54454,6 +54476,9 @@ struct validating_transcoder {
5445454476 uint64_t utf8_continuation_mask =
5445554477 input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
5445654478 // this case, we also have ASCII to account for.
54479+  if (utf8_continuation_mask & 1) {
54480+  return 0; // error
54481+  }
5445754482 uint64_t utf8_leading_mask = ~utf8_continuation_mask;
5445854483 uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
5445954484 // We process in blocks of up to 12 bytes except possibly
0 commit comments