Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/132593.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 132593
summary: Strings outside BMP have 2 chars per code point
area: Mapping
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ protected Text _finishAndReturnText() throws IOException {
return null;
}
ptr += bytesToSkip;
++stringLength;
// Code points that require 4 bytes in UTF-8 will use 2 chars in UTF-16.
stringLength += (bytesToSkip == 4 ? 2 : 1);
}
default -> {
return null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,18 @@ public void testGetValueAsText() throws IOException {
assertThat(parser.getValueAsString(), Matchers.equalTo("bår"));
});

testParseJson("{\"foo\": \"\uD83D\uDE0A\"}", parser -> {
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));

var text = parser.getValueAsText();
assertThat(text, Matchers.notNullValue());
var bytes = text.bytes();
assertTextRef(bytes, "\uD83D\uDE0A");
assertThat(text.stringLength(), Matchers.equalTo(2));
});

testParseJson("{\"foo\": \"bår\"}", parser -> {
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
Expand Down Expand Up @@ -143,19 +155,37 @@ private record TestInput(String input, String result, boolean supportsOptimized)
new TestInput("\\/", "/", true),
new TestInput("\\\\", "\\", true) };

private int randomCodepoint(boolean includeAscii) {
private int randomCodepointIncludeAscii() {
while (true) {
char val = Character.toChars(randomInt(0xFFFF))[0];
if (val <= 0x7f && includeAscii == false) {
continue;
}
if (val >= Character.MIN_SURROGATE && val <= Character.MAX_SURROGATE) {
continue;
}
return val;
}
}

/**
 * Draws random code points via {@code randomInt(0x10FFFF)} until one is acceptable and returns it.
 * <p>
 * A candidate is rejected, and a new one drawn, when any of the following holds:
 * <ul>
 *   <li>it is a single UTF-16 char in the ASCII range ({@code <= 0x7F});</li>
 *   <li>it is a single UTF-16 char that falls inside the surrogate range (a lone surrogate);</li>
 *   <li>its first UTF-16 char is a surrogate and {@code remainingLength} is less than 2, i.e. there
 *       is no room left for a two-char surrogate pair.</li>
 * </ul>
 *
 * @param remainingLength number of UTF-16 chars the caller still has room for
 * @return the accepted code point (not its UTF-16 encoding)
 */
private int randomCodepointIncludeOutsideBMP(int remainingLength) {
    for (;;) {
        final int candidate = randomInt(0x10FFFF);
        final char[] units = Character.toChars(candidate);
        final char lead = units[0];
        final boolean singleUnit = units.length == 1;
        final boolean leadIsSurrogate = Character.MIN_SURROGATE <= lead && lead <= Character.MAX_SURROGATE;

        // ASCII is excluded from this generator.
        final boolean ascii = singleUnit && lead <= 0x7F;
        // A surrogate on its own is not a valid character.
        final boolean loneSurrogate = singleUnit && leadIsSurrogate;
        // A surrogate pair needs two chars; skip it when fewer than two remain.
        final boolean pairDoesNotFit = leadIsSurrogate && remainingLength < 2;

        if (ascii == false && loneSurrogate == false && pairDoesNotFit == false) {
            return candidate;
        }
    }
}

private TestInput buildRandomInput(int length) {
StringBuilder input = new StringBuilder(length);
StringBuilder result = new StringBuilder(length);
Expand All @@ -171,13 +201,14 @@ private TestInput buildRandomInput(int length) {
doesSupportOptimized = doesSupportOptimized && escape.supportsOptimized();
}
case 1 -> {
int value = randomCodepoint(true);
int value = randomCodepointIncludeAscii();
input.append(String.format(Locale.ENGLISH, "\\u%04x", value));
result.append(Character.toChars(value));
doesSupportOptimized = false;
}
default -> {
var value = Character.toChars(randomCodepoint(false));
var remainingLength = length - i;
var value = Character.toChars(randomCodepointIncludeOutsideBMP(remainingLength));
input.append(value);
result.append(value);
}
Expand Down Expand Up @@ -222,7 +253,9 @@ public void testGetValueRandomized() throws IOException {

String currVal = inputs[i].result();
if (inputs[i].supportsOptimized()) {
assertTextRef(parser.getValueAsText().bytes(), currVal);
var text = parser.getValueAsText();
assertTextRef(text.bytes(), currVal);
assertThat(text.stringLength(), Matchers.equalTo(currVal.length()));
} else {
assertThat(parser.getValueAsText(), Matchers.nullValue());
assertThat(parser.getValueAsString(), Matchers.equalTo(currVal));
Expand Down
9 changes: 0 additions & 9 deletions muted-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -503,15 +503,6 @@ tests:
- class: org.elasticsearch.xpack.esql.action.CrossClusterAsyncQueryIT
method: testBadAsyncId
issue: https://github.com/elastic/elasticsearch/issues/132353
- class: org.elasticsearch.xpack.logsdb.qa.LogsDbVersusReindexedLogsDbChallengeRestIT
method: testRandomQueries
issue: https://github.com/elastic/elasticsearch/issues/132376
- class: org.elasticsearch.xpack.logsdb.qa.LogsDbVersusLogsDbReindexedIntoStandardModeChallengeRestIT
method: testRandomQueries
issue: https://github.com/elastic/elasticsearch/issues/132377
- class: org.elasticsearch.xpack.logsdb.qa.LogsDbVersusReindexedIntoStoredSourceChallengeRestIT
method: testRandomQueries
issue: https://github.com/elastic/elasticsearch/issues/132378
- class: org.elasticsearch.xpack.esql.inference.completion.CompletionOperatorTests
method: testSimpleCircuitBreaking
issue: https://github.com/elastic/elasticsearch/issues/132382
Expand Down