Skip to content

Commit 43665f0

Browse files
authored
Store array offsets for keyword fields natively with synthetic source (#113757)
The keyword doc values field gets an extra sorted doc values field that encodes the order in which array values were specified at index time. This also captures duplicate values. It is stored as an offset-to-ordinal array that gets zigzag-vint encoded into a sorted doc values field. For example, for the following string array for a keyword field: ["c", "b", "a", "c"], the sorted set doc values are ["a", "b", "c"] with ordinals 0, 1 and 2, and the offset array will be [2, 1, 0, 2]. Null values are also supported. For example, ["c", "b", null, "c"] results in sorted set doc values ["b", "c"] with ordinals 0 and 1, and the offset array will be [1, 0, -1, 1]. Empty arrays are also supported, by encoding a zigzag-vint array of zero elements. Limitations: currently there is only doc-values-based array support for the keyword field mapper; multi-level leaf arrays are flattened, for example [[b], [c]] -> [b, c]; arrays are always synthesized as one type, so for a keyword field [1, 2] gets synthesized as ["1", "2"]. These limitations can be addressed, but some require more complexity and/or additional storage. With this PR, keyword field arrays will no longer be stored in ignored source; instead, array offsets are kept track of in an adjacent sorted doc values field. This only applies if index.mapping.synthetic_source_keep is set to arrays (the default for logsdb).
1 parent 18df4d0 commit 43665f0

File tree

19 files changed

+1106
-33
lines changed

19 files changed

+1106
-33
lines changed

docs/changelog/113757.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 113757
2+
summary: Store array offsets for keyword fields natively with synthetic source instead of falling back to ignored source.
3+
area: Mapping
4+
type: enhancement
5+
issues: []

rest-api-spec/build.gradle

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,4 +83,8 @@ tasks.named("yamlRestCompatTestTransform").configure ({ task ->
8383
"node_version warning is removed in 9.0"
8484
)
8585
task.skipTest("tsdb/20_mapping/nested fields", "nested field support in tsdb indices is now supported")
86+
task.skipTest("logsdb/10_settings/routing path allowed in logs mode with routing on sort fields", "Unknown feature routing.logsb_route_on_sort_fields")
87+
task.skipTest("indices.create/21_synthetic_source_stored/index param - field ordering", "Synthetic source keep arrays now stores leaf arrays natively")
88+
task.skipTest("indices.create/21_synthetic_source_stored/field param - keep nested array", "Synthetic source keep arrays now stores leaf arrays natively")
89+
task.skipTest("indices.create/21_synthetic_source_stored/field param - keep root array", "Synthetic source keep arrays now stores leaf arrays natively")
8690
})

rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.create/20_synthetic_source.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -922,7 +922,7 @@ subobjects auto:
922922
- match: { hits.hits.0._source.foo: 10 }
923923
- match: { hits.hits.0._source.foo\.bar: 100 }
924924
- match: { hits.hits.0._source.regular.span.id: "1" }
925-
- match: { hits.hits.0._source.regular.trace.id: [ "a", "b" ] }
925+
- match: { hits.hits.0._source.regular.trace.id: ["a", "b" ] }
926926
- match: { hits.hits.1._source.id: 2 }
927927
- match: { hits.hits.1._source.foo: 20 }
928928
- match: { hits.hits.1._source.foo\.bar: 200 }

rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.create/21_synthetic_source_stored.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1024,7 +1024,7 @@ index param - field ordering:
10241024
index: test
10251025

10261026
- length: { hits.hits.0._source: 4 }
1027-
- match: { hits.hits.0._source: { "a": "2", "b": [ { "bb": 100, "aa": 200 }, { "aa": 300, "bb": 400 } ], "c": [30, 20, 10], "d": [ { "bb": 10, "aa": 20 }, { "aa": 30, "bb": 40 } ] } }
1027+
- match: { hits.hits.0._source: { "a": "2", "b": [ { "bb": 100, "aa": 200 }, { "aa": 300, "bb": 400 } ], "c": ["30", "20", "10"], "d": [ { "bb": 10, "aa": 20 }, { "aa": 30, "bb": 40 } ] } }
10281028

10291029

10301030
---

server/src/main/java/org/elasticsearch/index/IndexVersions.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ private static Version parseUnchecked(String version) {
148148
public static final IndexVersion USE_SYNTHETIC_SOURCE_FOR_RECOVERY_BY_DEFAULT = def(9_010_00_0, Version.LUCENE_10_1_0);
149149
public static final IndexVersion TIMESTAMP_DOC_VALUES_SPARSE_INDEX = def(9_011_0_00, Version.LUCENE_10_1_0);
150150
public static final IndexVersion TIME_SERIES_ID_DOC_VALUES_SPARSE_INDEX = def(9_012_0_00, Version.LUCENE_10_1_0);
151+
public static final IndexVersion SYNTHETIC_SOURCE_STORE_ARRAYS_NATIVELY_KEYWORD = def(9_013_0_00, Version.LUCENE_10_1_0);
151152
/*
152153
* STOP! READ THIS FIRST! No, really,
153154
* ____ _____ ___ ____ _ ____ _____ _ ____ _____ _ _ ___ ____ _____ ___ ____ ____ _____ _

server/src/main/java/org/elasticsearch/index/mapper/DocumentParser.java

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ private void internalParseDocument(MetadataFieldMapper[] metadataFieldsMappers,
154154

155155
executeIndexTimeScripts(context);
156156

157+
context.processArrayOffsets(context);
157158
for (MetadataFieldMapper metadataMapper : metadataFieldsMappers) {
158159
metadataMapper.postParse(context);
159160
}
@@ -519,6 +520,7 @@ private static void throwOnCopyToOnObject(Mapper mapper, List<String> copyToFiel
519520

520521
private static void parseObject(final DocumentParserContext context, String currentFieldName) throws IOException {
521522
assert currentFieldName != null;
523+
context.setImmediateXContentParent(context.parser().currentToken());
522524
Mapper objectMapper = context.getMapper(currentFieldName);
523525
if (objectMapper != null) {
524526
doParseObject(context, currentFieldName, objectMapper);
@@ -611,6 +613,12 @@ private static void throwOnCreateDynamicNestedViaCopyTo(Mapper dynamicObjectMapp
611613
}
612614

613615
private static void parseArray(DocumentParserContext context, String lastFieldName) throws IOException {
616+
// Record previous immediate parent, so that it can be reset after array has been parsed.
617+
// This is for recording array offset with synthetic source. Only if the immediate parent is an array,
618+
// then the offsets can be accounted accurately.
619+
var prev = context.getImmediateXContentParent();
620+
context.setImmediateXContentParent(context.parser().currentToken());
621+
614622
Mapper mapper = getLeafMapper(context, lastFieldName);
615623
if (mapper != null) {
616624
// There is a concrete mapper for this field already. Need to check if the mapper
@@ -624,6 +632,8 @@ private static void parseArray(DocumentParserContext context, String lastFieldNa
624632
} else {
625633
parseArrayDynamic(context, lastFieldName);
626634
}
635+
// Reset previous immediate parent
636+
context.setImmediateXContentParent(prev);
627637
}
628638

629639
private static void parseArrayDynamic(DocumentParserContext context, String currentFieldName) throws IOException {
@@ -688,11 +698,12 @@ private static void parseNonDynamicArray(
688698
final String lastFieldName,
689699
String arrayFieldName
690700
) throws IOException {
701+
boolean supportStoringArrayOffsets = mapper != null && mapper.supportStoringArrayOffsets();
691702
String fullPath = context.path().pathAsText(arrayFieldName);
692703

693704
// Check if we need to record the array source. This only applies to synthetic source.
694705
boolean canRemoveSingleLeafElement = false;
695-
if (context.canAddIgnoredField()) {
706+
if (context.canAddIgnoredField() && supportStoringArrayOffsets == false) {
696707
Mapper.SourceKeepMode mode = Mapper.SourceKeepMode.NONE;
697708
boolean objectWithFallbackSyntheticSource = false;
698709
if (mapper instanceof ObjectMapper objectMapper) {
@@ -736,6 +747,7 @@ private static void parseNonDynamicArray(
736747

737748
XContentParser parser = context.parser();
738749
XContentParser.Token token;
750+
XContentParser.Token previousToken = parser.currentToken();
739751
int elements = 0;
740752
while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
741753
if (token == XContentParser.Token.START_OBJECT) {
@@ -754,6 +766,14 @@ private static void parseNonDynamicArray(
754766
elements++;
755767
parseValue(context, lastFieldName);
756768
}
769+
previousToken = token;
770+
}
771+
if (mapper != null
772+
&& context.canAddIgnoredField()
773+
&& mapper.supportStoringArrayOffsets()
774+
&& previousToken == XContentParser.Token.START_ARRAY
775+
&& context.isImmediateParentAnArray()) {
776+
context.getOffSetContext().maybeRecordEmptyArray(mapper.getOffsetFieldName());
757777
}
758778
if (elements <= 1 && canRemoveSingleLeafElement) {
759779
context.removeLastIgnoredField(fullPath);

server/src/main/java/org/elasticsearch/index/mapper/DocumentParserContext.java

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,31 @@ public LuceneDocument doc() {
9191
protected void addDoc(LuceneDocument doc) {
9292
in.addDoc(doc);
9393
}
94+
95+
@Override
96+
public void processArrayOffsets(DocumentParserContext context) throws IOException {
97+
in.processArrayOffsets(context);
98+
}
99+
100+
@Override
101+
public FieldArrayContext getOffSetContext() {
102+
return in.getOffSetContext();
103+
}
104+
105+
@Override
106+
public void setImmediateXContentParent(XContentParser.Token token) {
107+
in.setImmediateXContentParent(token);
108+
}
109+
110+
@Override
111+
public XContentParser.Token getImmediateXContentParent() {
112+
return in.getImmediateXContentParent();
113+
}
114+
115+
@Override
116+
public boolean isImmediateParentAnArray() {
117+
return in.isImmediateParentAnArray();
118+
}
94119
}
95120

96121
/**
@@ -141,6 +166,8 @@ private enum Scope {
141166
private final SeqNoFieldMapper.SequenceIDFields seqID;
142167
private final Set<String> fieldsAppliedFromTemplates;
143168

169+
private FieldArrayContext fieldArrayContext;
170+
144171
/**
145172
* Fields that are copied from values of other fields via copy_to.
146173
* This per-document state is needed since it is possible
@@ -460,6 +487,33 @@ public boolean isCopyToDestinationField(String name) {
460487
return copyToFields.contains(name);
461488
}
462489

490+
public void processArrayOffsets(DocumentParserContext context) throws IOException {
491+
if (fieldArrayContext != null) {
492+
fieldArrayContext.addToLuceneDocument(context);
493+
}
494+
}
495+
496+
public FieldArrayContext getOffSetContext() {
497+
if (fieldArrayContext == null) {
498+
fieldArrayContext = new FieldArrayContext();
499+
}
500+
return fieldArrayContext;
501+
}
502+
503+
private XContentParser.Token lastSetToken;
504+
505+
public void setImmediateXContentParent(XContentParser.Token token) {
506+
this.lastSetToken = token;
507+
}
508+
509+
public XContentParser.Token getImmediateXContentParent() {
510+
return lastSetToken;
511+
}
512+
513+
public boolean isImmediateParentAnArray() {
514+
return lastSetToken == XContentParser.Token.START_ARRAY;
515+
}
516+
463517
/**
464518
* Add a new mapper dynamically created while parsing.
465519
*
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
10+
package org.elasticsearch.index.mapper;
11+
12+
import org.apache.lucene.document.SortedDocValuesField;
13+
import org.apache.lucene.util.BitUtil;
14+
import org.elasticsearch.common.io.stream.BytesStreamOutput;
15+
import org.elasticsearch.common.io.stream.StreamInput;
16+
17+
import java.io.IOException;
18+
import java.util.ArrayList;
19+
import java.util.HashMap;
20+
import java.util.List;
21+
import java.util.Map;
22+
import java.util.TreeMap;
23+
24+
public class FieldArrayContext {
25+
26+
private final Map<String, Offsets> offsetsPerField = new HashMap<>();
27+
28+
void recordOffset(String field, String value) {
29+
Offsets arrayOffsets = offsetsPerField.computeIfAbsent(field, k -> new Offsets());
30+
int nextOffset = arrayOffsets.currentOffset++;
31+
var offsets = arrayOffsets.valueToOffsets.computeIfAbsent(value, s -> new ArrayList<>(2));
32+
offsets.add(nextOffset);
33+
}
34+
35+
void recordNull(String field) {
36+
Offsets arrayOffsets = offsetsPerField.computeIfAbsent(field, k -> new Offsets());
37+
int nextOffset = arrayOffsets.currentOffset++;
38+
arrayOffsets.nullValueOffsets.add(nextOffset);
39+
}
40+
41+
void maybeRecordEmptyArray(String field) {
42+
offsetsPerField.computeIfAbsent(field, k -> new Offsets());
43+
}
44+
45+
void addToLuceneDocument(DocumentParserContext context) throws IOException {
46+
for (var entry : offsetsPerField.entrySet()) {
47+
var fieldName = entry.getKey();
48+
var offset = entry.getValue();
49+
50+
int currentOrd = 0;
51+
// This array allows to retain the original ordering of elements in leaf arrays and retain duplicates.
52+
int[] offsetToOrd = new int[offset.currentOffset];
53+
for (var offsetEntry : offset.valueToOffsets.entrySet()) {
54+
for (var offsetAndLevel : offsetEntry.getValue()) {
55+
offsetToOrd[offsetAndLevel] = currentOrd;
56+
}
57+
currentOrd++;
58+
}
59+
for (var nullOffset : offset.nullValueOffsets) {
60+
offsetToOrd[nullOffset] = -1;
61+
}
62+
63+
try (var streamOutput = new BytesStreamOutput()) {
64+
// Could just use vint for array length, but this allows for decoding my_field: null as -1
65+
streamOutput.writeVInt(BitUtil.zigZagEncode(offsetToOrd.length));
66+
for (int ord : offsetToOrd) {
67+
streamOutput.writeVInt(BitUtil.zigZagEncode(ord));
68+
}
69+
context.doc().add(new SortedDocValuesField(fieldName, streamOutput.bytes().toBytesRef()));
70+
}
71+
}
72+
}
73+
74+
static int[] parseOffsetArray(StreamInput in) throws IOException {
75+
int[] offsetToOrd = new int[BitUtil.zigZagDecode(in.readVInt())];
76+
for (int i = 0; i < offsetToOrd.length; i++) {
77+
offsetToOrd[i] = BitUtil.zigZagDecode(in.readVInt());
78+
}
79+
return offsetToOrd;
80+
}
81+
82+
private static class Offsets {
83+
84+
int currentOffset;
85+
// Need to use TreeMap here, so that we maintain the order in which each value (with offset) stored inserted,
86+
// (which is in the same order the document gets parsed) so we store offsets in right order. This is the same
87+
// order in what the values get stored in SortedSetDocValues.
88+
final Map<String, List<Integer>> valueToOffsets = new TreeMap<>();
89+
final List<Integer> nullValueOffsets = new ArrayList<>(2);
90+
91+
}
92+
93+
}

server/src/main/java/org/elasticsearch/index/mapper/FieldMapper.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -200,15 +200,15 @@ public void parse(DocumentParserContext context) throws IOException {
200200
}
201201
}
202202

203-
private void doParseMultiFields(DocumentParserContext context) throws IOException {
203+
protected void doParseMultiFields(DocumentParserContext context) throws IOException {
204204
context.path().add(leafName());
205205
for (FieldMapper mapper : builderParams.multiFields.mappers) {
206206
mapper.parse(context);
207207
}
208208
context.path().remove();
209209
}
210210

211-
private static void throwIndexingWithScriptParam() {
211+
protected static void throwIndexingWithScriptParam() {
212212
throw new IllegalArgumentException("Cannot index data directly into a field with a [script] parameter");
213213
}
214214

0 commit comments

Comments
 (0)