Skip to content

Commit 7cabe1a

Browse files
Store high-cardinality keyword fields in binary doc values (#138548)
This PR adds a mapping parameter, doc_values.cardinality, to keyword fields. When this parameter is set to low (the default), keyword fields use sorted set doc values as normal. When it is set to high, keyword fields instead use binary doc values. This is an optimization that removes the overhead of looking up keyword values by ordinal when the keyword field has high cardinality.
1 parent bacd535 commit 7cabe1a

File tree

27 files changed

+802
-255
lines changed

27 files changed

+802
-255
lines changed

modules/percolator/src/test/java/org/elasticsearch/percolator/QueryBuilderStoreTests.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
import org.elasticsearch.index.mapper.TestDocumentParserContext;
3333
import org.elasticsearch.index.query.SearchExecutionContext;
3434
import org.elasticsearch.index.query.TermQueryBuilder;
35+
import org.elasticsearch.script.field.BinaryDocValuesField;
3536
import org.elasticsearch.search.SearchModule;
3637
import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
3738
import org.elasticsearch.test.ESTestCase;
@@ -88,7 +89,7 @@ public void testStoringQueryBuilders() throws IOException {
8889
when(searchExecutionContext.getWriteableRegistry()).thenReturn(writableRegistry());
8990
when(searchExecutionContext.getParserConfig()).thenReturn(parserConfig());
9091
when(searchExecutionContext.getForField(fieldMapper.fieldType(), fielddataOperation)).thenReturn(
91-
new BytesBinaryIndexFieldData(fieldMapper.fullPath(), CoreValuesSourceType.KEYWORD)
92+
new BytesBinaryIndexFieldData(fieldMapper.fullPath(), CoreValuesSourceType.KEYWORD, BinaryDocValuesField::new)
9293
);
9394
when(searchExecutionContext.getFieldType(Mockito.anyString())).thenAnswer(invocation -> {
9495
final String fieldName = (String) invocation.getArguments()[0];

qa/ccs-common-rest/build.gradle

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,21 @@ dependencies {
3232
}
3333

3434
tasks.named("yamlRestTest") {
35-
systemProperty 'tests.rest.blacklist',
36-
[
37-
'search/150_rewrite_on_coordinator/Ensure that we fetch the document only once', // terms lookup query with index
38-
'search/170_terms_query/Terms Query with No.of terms exceeding index.max_terms_count should FAIL', // terms lookup query with index
39-
'search.aggregation/220_filters_bucket/cache busting', // node_selector?
40-
'search.aggregation/220_filters_bucket/cache hits', // node_selector?
41-
'search.aggregation/50_filter/Standard queries get cached',
42-
'search.aggregation/50_filter/Terms lookup gets cached', // terms lookup by "index" doesn't seem to work correctly
43-
'search.aggregation/70_adjacency_matrix/Terms lookup' // terms lookup by "index" doesn't seem to work correctly
44-
].join(',')
35+
ArrayList<String> blacklist = [
36+
'search/150_rewrite_on_coordinator/Ensure that we fetch the document only once', // terms lookup query with index
37+
'search/170_terms_query/Terms Query with No.of terms exceeding index.max_terms_count should FAIL', // terms lookup query with index
38+
'search.aggregation/220_filters_bucket/cache busting', // node_selector?
39+
'search.aggregation/220_filters_bucket/cache hits', // node_selector?
40+
'search.aggregation/50_filter/Standard queries get cached',
41+
'search.aggregation/50_filter/Terms lookup gets cached', // terms lookup by "index" doesn't seem to work correctly
42+
'search.aggregation/70_adjacency_matrix/Terms lookup' // terms lookup by "index" doesn't seem to work correctly
43+
]
44+
if (buildParams.snapshotBuild == false) {
45+
blacklist += [
46+
// doc_values.cardinality option is only available in snapshot builds, so skip these tests on non-snapshot builds
47+
"search/395_binary_doc_values_search/*"
48+
]
49+
}
50+
systemProperty 'tests.rest.blacklist', blacklist.join(',')
4551
}
4652

qa/smoke-test-multinode/build.gradle

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,16 @@ dependencies {
2424
}
2525

2626
tasks.named("yamlRestTest").configure {
27-
systemProperty 'tests.rest.blacklist', [
27+
ArrayList<String> blacklist = [
2828
'cat.templates/10_basic/No templates',
2929
'cat.templates/10_basic/Sort templates',
3030
'cat.templates/10_basic/Multiple template',
31-
].join(',')
31+
]
32+
if (buildParams.snapshotBuild == false) {
33+
blacklist += [
34+
// doc_values.cardinality option is only available in snapshot builds, so skip these tests on non-snapshot builds
35+
"search/395_binary_doc_values_search/*"
36+
]
37+
}
38+
systemProperty 'tests.rest.blacklist', blacklist.join(',')
3239
}

rest-api-spec/build.gradle

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,15 @@ tasks.named("precommit").configure {
5454
dependsOn 'enforceYamlTestConvention'
5555
}
5656

57+
tasks.named("yamlRestTest") {
58+
if (buildParams.snapshotBuild == false) {
59+
// doc_values.cardinality option is only available in snapshot builds, so skip these tests on non-snapshot builds
60+
systemProperty 'tests.rest.blacklist', [
61+
"search/395_binary_doc_values_search/*"
62+
].join(',')
63+
}
64+
}
65+
5766
tasks.named("yamlRestCompatTestTransform").configure ({ task ->
5867
task.replaceValueInMatch("profile.shards.0.dfs.knn.0.query.0.description", "DocAndScoreQuery[0,...][0.009673266,...],0.009673266", "dfs knn vector profiling")
5968
task.replaceValueInMatch("profile.shards.0.dfs.knn.0.query.0.description", "DocAndScoreQuery[0,...][0.009673266,...],0.009673266", "dfs knn vector profiling with vector_operations_count")

rest-api-spec/src/yamlRestTest/java/org/elasticsearch/test/rest/ClientYamlTestSuiteIT.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ public class ClientYamlTestSuiteIT extends ESClientYamlSuiteTestCase {
3838
.feature(FeatureFlag.DOC_VALUES_SKIPPER)
3939
.feature(FeatureFlag.SYNTHETIC_VECTORS)
4040
.feature(FeatureFlag.RANDOM_SAMPLING)
41+
.feature(FeatureFlag.EXTENDED_DOC_VALUES_PARAMS)
4142
.build();
4243

4344
public ClientYamlTestSuiteIT(@Name("yaml") ClientYamlTestCandidate testCandidate) {
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
---
2+
setup:
3+
- requires:
4+
cluster_features: ["mapper.keyword.store_high_cardinality_in_binary_doc_values"]
5+
reason: "testing binary doc values search"
6+
7+
- do:
8+
indices.create:
9+
index: test
10+
body:
11+
mappings:
12+
dynamic: false
13+
properties:
14+
keyword:
15+
type: keyword
16+
index: false
17+
doc_values:
18+
cardinality: high
19+
20+
- do:
21+
index:
22+
index: test
23+
id: "1"
24+
body:
25+
keyword: "key1"
26+
27+
- do:
28+
index:
29+
index: test
30+
id: "2"
31+
body:
32+
keyword: "key2"
33+
- do:
34+
indices.refresh: {}
35+
36+
---
37+
"Test match query on keyword field where only binary doc values are enabled":
38+
39+
- do:
40+
search:
41+
index: test
42+
body: { query: { match: { keyword: { query: "key1" } } } }
43+
- length: { hits.hits: 1 }
44+
45+
---
46+
"Test terms query on keyword field where only binary doc values are enabled":
47+
48+
- do:
49+
search:
50+
index: test
51+
body: { query: { terms: { keyword: [ "key1", "key2" ] } } }
52+
- length: { hits.hits: 2 }
53+
54+
---
55+
"Test range query on keyword field where only binary doc values are enabled":
56+
57+
- do:
58+
search:
59+
index: test
60+
body: { query: { range: { keyword: { gte: "key1" } } } }
61+
- length: { hits.hits: 2 }
62+
63+
---
64+
"Test fuzzy query on keyword field where only binary doc values are enabled":
65+
66+
- do:
67+
search:
68+
index: test
69+
body: { query: { fuzzy: { keyword: { value: "kay1", fuzziness: 1 } } } }
70+
- length: { hits.hits: 1 }
71+
72+
---
73+
"Test prefix query on keyword field where only binary doc values are enabled":
74+
75+
- do:
76+
search:
77+
index: test
78+
body: { query: { prefix: { keyword: { value: "key" } } } }
79+
- length: { hits.hits: 2 }
80+
81+
---
82+
"Test case insensitive term query on keyword field where only binary doc values are enabled":
83+
84+
- do:
85+
search:
86+
index: test
87+
body: { query: { term: { keyword: { value: "KeY1", case_insensitive: true } } } }
88+
- length: { hits.hits: 1 }
89+
90+
---
91+
"Test wildcard query on keyword field where only binary doc values are enabled":
92+
93+
- do:
94+
search:
95+
index: test
96+
body: { query: { wildcard: { keyword: { value: "k*1" } } } }
97+
- length: { hits.hits: 1 }
98+
99+
---
100+
"Test case insensitive wildcard query on keyword field where only binary doc values are enabled":
101+
102+
- do:
103+
search:
104+
index: test
105+
body: { query: { wildcard: { keyword: { value: "K*1", case_insensitive: true } } } }
106+
- length: { hits.hits: 1 }
107+
108+
---
109+
"Test regexp query on keyword field where only binary doc values are enabled":
110+
111+
- do:
112+
search:
113+
index: test
114+
body: { query: { regexp: { keyword: { value: "k.*1" } } } }
115+
- length: { hits.hits: 1 }
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
10+
package org.elasticsearch.index.fielddata;
11+
12+
import org.apache.lucene.index.BinaryDocValues;
13+
import org.apache.lucene.util.BytesRef;
14+
import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
15+
16+
import java.io.IOException;
17+
18+
/**
19+
* Wrapper around {@link BinaryDocValues} to decode the typical multivalued encoding used by
20+
* {@link org.elasticsearch.index.mapper.BinaryFieldMapper.CustomBinaryDocValuesField}.
21+
*/
22+
public class MultiValuedSortedBinaryDocValues extends SortedBinaryDocValues {
23+
// values: the wrapped per-document binary doc values that hold the encoded payload.
// count: number of values decoded for the current document; set by advanceExact and reset to 0 when the document has no value.
24+
BinaryDocValues values;
25+
int count;
26+
27+
// the binary doc values for a document are all encoded in a single binary array, which this stream knows how to read
28+
// the doc values in the array take the form of [doc value count][length of value 1][value 1][length of value 2][value 2]...
29+
final ByteArrayStreamInput in = new ByteArrayStreamInput();
30+
final BytesRef scratch = new BytesRef();
31+
/**
 * Creates a decoder over the given binary doc values.
 *
 * @param values the underlying doc values whose per-document payload uses the multi-valued encoding described above
 */
32+
public MultiValuedSortedBinaryDocValues(BinaryDocValues values) {
33+
this.values = values;
34+
}
35+
/**
 * Positions this instance on {@code doc} and prepares its values for reading.
 * <p>
 * When the wrapped doc values have an entry for the document, the stream is reset over that entry's bytes, the leading
 * value count is read into {@code count}, and {@code scratch} is pointed at the same backing array so that
 * {@link #nextValue()} can expose each value without copying.
 *
 * @param doc the document id to advance to
 * @return {@code true} if the document has an entry in the wrapped doc values, {@code false} otherwise (in which case
 *         {@link #docValueCount()} reports 0)
 * @throws IOException if reading from the wrapped doc values or the stream fails
 */
36+
@Override
37+
public boolean advanceExact(int doc) throws IOException {
38+
if (values.advanceExact(doc)) {
39+
final BytesRef bytes = values.binaryValue();
40+
assert bytes.length > 0;
41+
in.reset(bytes.bytes, bytes.offset, bytes.length);
42+
count = in.readVInt();
43+
// share the backing array with the stream; nextValue() only adjusts offset and length
scratch.bytes = bytes.bytes;
44+
return true;
45+
} else {
46+
count = 0;
47+
return false;
48+
}
49+
}
50+
/**
 * Returns the number of values for the current document, as read by the last call to {@link #advanceExact(int)}.
 *
 * @return the value count, or 0 if the last {@link #advanceExact(int)} call returned {@code false}
 */
51+
@Override
52+
public int docValueCount() {
53+
return count;
54+
}
55+
/**
 * Reads the next length-prefixed value of the current document.
 * <p>
 * The returned {@link BytesRef} is the shared {@code scratch} instance: the same object is returned on every call and
 * it references the bytes of the current document's entry rather than a copy. This method does not itself check how
 * many values have already been consumed. NOTE(review): callers are presumably expected to call it at most
 * {@link #docValueCount()} times per document, as the {@code SortedBinaryDocValues} contract is not visible here.
 *
 * @return the shared scratch reference positioned on the next value
 * @throws IOException if reading the value length from the stream fails
 */
56+
@Override
57+
public BytesRef nextValue() throws IOException {
58+
scratch.length = in.readVInt();
59+
scratch.offset = in.getPosition();
60+
in.setPosition(scratch.offset + scratch.length);
61+
return scratch;
62+
}
63+
}

server/src/main/java/org/elasticsearch/index/fielddata/plain/AbstractBinaryDVLeafFieldData.java

Lines changed: 0 additions & 71 deletions
This file was deleted.

server/src/main/java/org/elasticsearch/index/fielddata/plain/BytesBinaryDVLeafFieldData.java

Lines changed: 0 additions & 25 deletions
This file was deleted.

0 commit comments

Comments
 (0)