Skip to content

Commit fd5bd10

Browse files
committed
lucene 4: Exposed Lucene's codec api
This feature adds the option to configure a `PostingsFormat` and assign it to a field in the mapping. This feature is very expert and in almost all cases Elasticsearch's defaults will suit your needs. ## Configuring a postings format per field There are several postings formats configured by default which can be used in your mapping: * `direct` - A codec that wraps the default postings format during write time, but loads the terms and postings lists directly into memory as raw arrays during read time. This postings format is exceptionally memory intensive, but can give a substantial increase in search performance. * `memory` - A codec that loads and stores terms and postings lists in memory using an FST. Acts like a cached postings list. * `bloom_default` - Maintains a bloom filter for the indexed terms, which is stored to disk and builds on top of the `default` postings format. This postings format is useful for low document frequency terms and offers a fail fast for seeks to terms that don't exist. * `bloom_pulsing` - Similar to the `bloom_default` postings format, but builds on top of the `pulsing` postings format. * `default` - The default postings format. Used if none is specified. On all fields it is possible to configure a `postings_format` attribute. Example mapping: ``` { "person" : { "properties" : { "second_person_id" : {"type" : "string", "postings_format" : "pulsing"} } } } ``` ## Configuring a custom postings format It is possible to instantiate custom postings formats. This can be specified via the index settings. ``` { "codec" : { "postings_format" : { "my_format" : { "type" : "pulsing40", "freq_cut_off" : "5" } } } } ``` In the above example the `freq_cut_off` is set to 5 (defaults to 1). This tells the pulsing postings format to inline the postings list of terms with a document frequency lower than or equal to 5 in the term dictionary. Closes elastic#2411
1 parent 120560b commit fd5bd10

File tree

56 files changed

+1221
-127
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+1221
-127
lines changed
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
package org.elasticsearch.index.codec;
2+
3+
import com.google.common.collect.ImmutableList;
4+
import com.google.common.collect.Lists;
5+
import com.google.common.collect.Maps;
6+
import org.apache.lucene.codecs.PostingsFormat;
7+
import org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat;
8+
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
9+
import org.apache.lucene.codecs.memory.DirectPostingsFormat;
10+
import org.apache.lucene.codecs.memory.MemoryPostingsFormat;
11+
import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat;
12+
import org.elasticsearch.ElasticSearchIllegalArgumentException;
13+
import org.elasticsearch.common.inject.AbstractModule;
14+
import org.elasticsearch.common.inject.Scopes;
15+
import org.elasticsearch.common.inject.assistedinject.FactoryProvider;
16+
import org.elasticsearch.common.inject.multibindings.MapBinder;
17+
import org.elasticsearch.common.settings.Settings;
18+
import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider;
19+
import org.elasticsearch.index.codec.postingsformat.PostingsFormatService;
20+
import org.elasticsearch.index.codec.postingsformat.PreBuiltPostingsFormatProvider;
21+
22+
import java.util.List;
23+
import java.util.Map;
24+
25+
/**
26+
*/
27+
public class CodecModule extends AbstractModule {
28+
29+
public static final ImmutableList<PreBuiltPostingsFormatProvider.Factory> preConfiguredPostingFormats;
30+
31+
static {
32+
List<PreBuiltPostingsFormatProvider.Factory> preConfiguredPostingFormatsX = Lists.newArrayList();
33+
// add defaults ones
34+
for (String luceneName : PostingsFormat.availablePostingsFormats()) {
35+
preConfiguredPostingFormatsX.add(new PreBuiltPostingsFormatProvider.Factory(PostingsFormat.forName(luceneName)));
36+
}
37+
preConfiguredPostingFormatsX.add(new PreBuiltPostingsFormatProvider.Factory("direct", new DirectPostingsFormat()));
38+
preConfiguredPostingFormatsX.add(new PreBuiltPostingsFormatProvider.Factory("memory", new MemoryPostingsFormat()));
39+
// LUCENE UPGRADE: Need to change this to the relevant ones on a lucene upgrade
40+
preConfiguredPostingFormatsX.add(new PreBuiltPostingsFormatProvider.Factory("pulsing", new Pulsing40PostingsFormat()));
41+
preConfiguredPostingFormatsX.add(new PreBuiltPostingsFormatProvider.Factory("bloom_pulsing", new BloomFilteringPostingsFormat(new Pulsing40PostingsFormat())));
42+
preConfiguredPostingFormatsX.add(new PreBuiltPostingsFormatProvider.Factory("default", new Lucene40PostingsFormat()));
43+
preConfiguredPostingFormatsX.add(new PreBuiltPostingsFormatProvider.Factory("bloom_default", new BloomFilteringPostingsFormat(new Lucene40PostingsFormat())));
44+
45+
preConfiguredPostingFormats = ImmutableList.copyOf(preConfiguredPostingFormatsX);
46+
}
47+
48+
private final Settings indexSettings;
49+
50+
private Map<String, Class<? extends PostingsFormatProvider>> customProviders = Maps.newHashMap();
51+
52+
public CodecModule(Settings indexSettings) {
53+
this.indexSettings = indexSettings;
54+
}
55+
56+
public CodecModule addPostingFormat(String name, Class<? extends PostingsFormatProvider> provider) {
57+
this.customProviders.put(name, provider);
58+
return this;
59+
}
60+
61+
@Override
62+
protected void configure() {
63+
64+
Map<String, Class<? extends PostingsFormatProvider>> postingFormatProviders = Maps.newHashMap(customProviders);
65+
66+
Map<String, Settings> postingsFormatsSettings = indexSettings.getGroups("index.codec.postings_format");
67+
for (Map.Entry<String, Settings> entry : postingsFormatsSettings.entrySet()) {
68+
String name = entry.getKey();
69+
Settings settings = entry.getValue();
70+
71+
Class<? extends PostingsFormatProvider> type =
72+
settings.getAsClass("type", null, "org.elasticsearch.index.codec.postingsformat.", "PostingsFormatProvider");
73+
74+
if (type == null) {
75+
// nothing found, see if its in bindings as a binding name
76+
throw new ElasticSearchIllegalArgumentException("PostingsFormat Factory [" + name + "] must have a type associated with it");
77+
}
78+
postingFormatProviders.put(name, type);
79+
}
80+
81+
// now bind
82+
MapBinder<String, PostingsFormatProvider.Factory> postingFormatFactoryBinder
83+
= MapBinder.newMapBinder(binder(), String.class, PostingsFormatProvider.Factory.class);
84+
85+
for (Map.Entry<String, Class<? extends PostingsFormatProvider>> entry : postingFormatProviders.entrySet()) {
86+
postingFormatFactoryBinder.addBinding(entry.getKey()).toProvider(FactoryProvider.newFactory(PostingsFormatProvider.Factory.class, entry.getValue())).in(Scopes.SINGLETON);
87+
}
88+
89+
for (PreBuiltPostingsFormatProvider.Factory factory : preConfiguredPostingFormats) {
90+
if (postingFormatProviders.containsKey(factory.name())) {
91+
continue;
92+
}
93+
postingFormatFactoryBinder.addBinding(factory.name()).toInstance(factory);
94+
}
95+
96+
bind(PostingsFormatService.class).asEagerSingleton();
97+
bind(CodecService.class).asEagerSingleton();
98+
}
99+
}
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
package org.elasticsearch.index.codec;
2+
3+
import com.google.common.collect.ImmutableMap;
4+
import org.apache.lucene.codecs.Codec;
5+
import org.elasticsearch.ElasticSearchIllegalArgumentException;
6+
import org.elasticsearch.common.collect.MapBuilder;
7+
import org.elasticsearch.common.inject.Inject;
8+
import org.elasticsearch.common.settings.ImmutableSettings;
9+
import org.elasticsearch.common.settings.Settings;
10+
import org.elasticsearch.index.AbstractIndexComponent;
11+
import org.elasticsearch.index.Index;
12+
import org.elasticsearch.index.codec.postingsformat.PostingsFormatService;
13+
import org.elasticsearch.index.mapper.MapperService;
14+
import org.elasticsearch.index.settings.IndexSettings;
15+
16+
/**
17+
*/
18+
public class CodecService extends AbstractIndexComponent {
19+
20+
private final PostingsFormatService postingsFormatService;
21+
private final MapperService mapperService;
22+
private final ImmutableMap<String, Codec> codecs;
23+
24+
public CodecService(Index index) {
25+
this(index, ImmutableSettings.Builder.EMPTY_SETTINGS);
26+
}
27+
28+
public CodecService(Index index, @IndexSettings Settings indexSettings) {
29+
this(index, indexSettings, new PostingsFormatService(index, indexSettings), null);
30+
}
31+
32+
@Inject
33+
public CodecService(Index index, @IndexSettings Settings indexSettings, PostingsFormatService postingsFormatService,
34+
MapperService mapperService) {
35+
super(index, indexSettings);
36+
this.postingsFormatService = postingsFormatService;
37+
this.mapperService = mapperService;
38+
MapBuilder<String, Codec> codecs = MapBuilder.<String, Codec>newMapBuilder();
39+
if (mapperService == null) {
40+
codecs.put("default", Codec.getDefault());
41+
} else {
42+
codecs.put("default", new PerFieldMappingPostingFormatCodec(mapperService, postingsFormatService.get("default").get()));
43+
}
44+
for (String codec : Codec.availableCodecs()) {
45+
codecs.put(codec, Codec.forName(codec));
46+
}
47+
this.codecs = codecs.immutableMap();
48+
}
49+
50+
public PostingsFormatService postingsFormatService() {
51+
return this.postingsFormatService;
52+
}
53+
54+
public MapperService mapperService() {
55+
return mapperService;
56+
}
57+
58+
public Codec codec(String name) throws ElasticSearchIllegalArgumentException {
59+
Codec codec = codecs.get(name);
60+
if (codec == null) {
61+
throw new ElasticSearchIllegalArgumentException("failed to find codec [" + name + "]");
62+
}
63+
return codec;
64+
}
65+
66+
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
package org.elasticsearch.index.codec;
2+
3+
import org.apache.lucene.codecs.PostingsFormat;
4+
import org.apache.lucene.codecs.lucene40.Lucene40Codec;
5+
import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider;
6+
import org.elasticsearch.index.mapper.MapperService;
7+
8+
/**
9+
* This one is the "default" codec we use.
10+
*/
11+
// LUCENE UPGRADE: make sure to move to a new codec depending on the lucene version
12+
public class PerFieldMappingPostingFormatCodec extends Lucene40Codec {
13+
14+
private final MapperService mapperService;
15+
private final PostingsFormat defaultPostingFormat;
16+
17+
public PerFieldMappingPostingFormatCodec(MapperService mapperService, PostingsFormat defaultPostingFormat) {
18+
this.mapperService = mapperService;
19+
this.defaultPostingFormat = defaultPostingFormat;
20+
}
21+
22+
@Override
23+
public PostingsFormat getPostingsFormatForField(String field) {
24+
PostingsFormatProvider postingsFormat = mapperService.indexName(field).mapper().postingFormatProvider();
25+
return postingsFormat != null ? postingsFormat.get() : defaultPostingFormat;
26+
}
27+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package org.elasticsearch.index.codec.postingsformat;
2+
3+
/**
4+
*/
5+
public abstract class AbstractPostingsFormatProvider implements PostingsFormatProvider {
6+
7+
private final String name;
8+
9+
protected AbstractPostingsFormatProvider(String name) {
10+
this.name = name;
11+
}
12+
13+
public String name() {
14+
return name;
15+
}
16+
17+
}
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
package org.elasticsearch.index.codec.postingsformat;
2+
3+
import org.apache.lucene.codecs.PostingsFormat;
4+
import org.apache.lucene.codecs.bloom.BloomFilterFactory;
5+
import org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat;
6+
import org.apache.lucene.codecs.bloom.FuzzySet;
7+
import org.apache.lucene.index.FieldInfo;
8+
import org.apache.lucene.index.SegmentWriteState;
9+
import org.elasticsearch.common.Nullable;
10+
import org.elasticsearch.common.inject.Inject;
11+
import org.elasticsearch.common.inject.assistedinject.Assisted;
12+
import org.elasticsearch.common.settings.Settings;
13+
import org.elasticsearch.index.settings.IndexSettings;
14+
15+
import java.util.Map;
16+
17+
/**
18+
*/
19+
public class BloomFilterPostingsFormatProvider extends AbstractPostingsFormatProvider {
20+
21+
private final float desiredMaxSaturation;
22+
private final float saturationLimit;
23+
private final PostingsFormatProvider delegate;
24+
private final BloomFilteringPostingsFormat postingsFormat;
25+
26+
@Inject
27+
public BloomFilterPostingsFormatProvider(@IndexSettings Settings indexSettings, @Nullable Map<String, Factory> postingFormatFactories, @Assisted String name, @Assisted Settings postingsFormatSettings) {
28+
super(name);
29+
this.desiredMaxSaturation = postingsFormatSettings.getAsFloat("desired_max_saturation", 0.1f);
30+
this.saturationLimit = postingsFormatSettings.getAsFloat("saturation_limit", 0.9f);
31+
this.delegate = Helper.lookup(indexSettings, postingsFormatSettings.get("delegate"), postingFormatFactories);
32+
this.postingsFormat = new BloomFilteringPostingsFormat(
33+
delegate.get(),
34+
new CustomBloomFilterFactory(desiredMaxSaturation, saturationLimit)
35+
);
36+
}
37+
38+
public float desiredMaxSaturation() {
39+
return desiredMaxSaturation;
40+
}
41+
42+
public float saturationLimit() {
43+
return saturationLimit;
44+
}
45+
46+
public PostingsFormatProvider delegate() {
47+
return delegate;
48+
}
49+
50+
@Override
51+
public PostingsFormat get() {
52+
return postingsFormat;
53+
}
54+
55+
static class CustomBloomFilterFactory extends BloomFilterFactory {
56+
57+
private final float desiredMaxSaturation;
58+
private final float saturationLimit;
59+
60+
CustomBloomFilterFactory(float desiredMaxSaturation, float saturationLimit) {
61+
this.desiredMaxSaturation = desiredMaxSaturation;
62+
this.saturationLimit = saturationLimit;
63+
}
64+
65+
@Override
66+
public FuzzySet getSetForField(SegmentWriteState state, FieldInfo info) {
67+
//Assume all of the docs have a unique term (e.g. a primary key) and we hope to maintain a set with desiredMaxSaturation% of bits set
68+
return FuzzySet.createSetBasedOnQuality(state.segmentInfo.getDocCount(), desiredMaxSaturation);
69+
}
70+
71+
@Override
72+
public boolean isSaturated(FuzzySet bloomFilter, FieldInfo fieldInfo) {
73+
// Don't bother saving bitsets if > saturationLimit % of bits are set - we don't want to
74+
// throw any more memory at this problem.
75+
return bloomFilter.getSaturation() > saturationLimit;
76+
}
77+
}
78+
}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
package org.elasticsearch.index.codec.postingsformat;
2+
3+
import org.apache.lucene.codecs.PostingsFormat;
4+
import org.apache.lucene.codecs.memory.DirectPostingsFormat;
5+
import org.elasticsearch.common.inject.Inject;
6+
import org.elasticsearch.common.inject.assistedinject.Assisted;
7+
import org.elasticsearch.common.settings.Settings;
8+
9+
/**
10+
*/
11+
public class DirectPostingsFormatProvider extends AbstractPostingsFormatProvider {
12+
13+
private final int minSkipCount;
14+
private final int lowFreqCutoff;
15+
private final DirectPostingsFormat postingsFormat;
16+
17+
@Inject
18+
public DirectPostingsFormatProvider(@Assisted String name, @Assisted Settings postingsFormatSettings) {
19+
super(name);
20+
this.minSkipCount = postingsFormatSettings.getAsInt("min_skip_count", 8); // See DirectPostingsFormat#DEFAULT_MIN_SKIP_COUNT
21+
this.lowFreqCutoff = postingsFormatSettings.getAsInt("low_freq_cutoff", 32); // See DirectPostingsFormat#DEFAULT_LOW_FREQ_CUTOFF
22+
this.postingsFormat = new DirectPostingsFormat(minSkipCount, lowFreqCutoff);
23+
}
24+
25+
public int minSkipCount() {
26+
return minSkipCount;
27+
}
28+
29+
public int lowFreqCutoff() {
30+
return lowFreqCutoff;
31+
}
32+
33+
@Override
34+
public PostingsFormat get() {
35+
return postingsFormat;
36+
}
37+
}
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
package org.elasticsearch.index.codec.postingsformat;
2+
3+
import org.apache.lucene.codecs.BlockTreeTermsWriter;
4+
import org.apache.lucene.codecs.PostingsFormat;
5+
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
6+
import org.elasticsearch.common.inject.Inject;
7+
import org.elasticsearch.common.inject.assistedinject.Assisted;
8+
import org.elasticsearch.common.settings.Settings;
9+
10+
/**
11+
*/
12+
public class Lucene40PostingsFormatProvider extends AbstractPostingsFormatProvider {
13+
14+
private final int minBlockSize;
15+
private final int maxBlockSize;
16+
private final Lucene40PostingsFormat postingsFormat;
17+
18+
@Inject
19+
public Lucene40PostingsFormatProvider(@Assisted String name, @Assisted Settings postingsFormatSettings) {
20+
super(name);
21+
this.minBlockSize = postingsFormatSettings.getAsInt("min_block_size", BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE);
22+
this.maxBlockSize = postingsFormatSettings.getAsInt("max_block_size", BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
23+
this.postingsFormat = new Lucene40PostingsFormat(minBlockSize, maxBlockSize);
24+
}
25+
26+
public int minBlockSize() {
27+
return minBlockSize;
28+
}
29+
30+
public int maxBlockSize() {
31+
return maxBlockSize;
32+
}
33+
34+
@Override
35+
public PostingsFormat get() {
36+
return postingsFormat;
37+
}
38+
}

0 commit comments

Comments
 (0)