Skip to content

Commit 32a0772

Browse files
committed
elastic#2436 expose KeepWordTokenFilter by default
1 parent 65a43d3 commit 32a0772

File tree

5 files changed

+228
-0
lines changed

5 files changed

+228
-0
lines changed

src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -480,6 +480,7 @@ public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
480480
tokenFiltersBindings.processTokenFilter("word_delimiter", WordDelimiterTokenFilterFactory.class);
481481
tokenFiltersBindings.processTokenFilter("synonym", SynonymTokenFilterFactory.class);
482482
tokenFiltersBindings.processTokenFilter("elision", ElisionTokenFilterFactory.class);
483+
tokenFiltersBindings.processTokenFilter("keep", KeepWordFilterFactory.class);
483484

484485
tokenFiltersBindings.processTokenFilter("pattern_replace", PatternReplaceTokenFilterFactory.class);
485486
tokenFiltersBindings.processTokenFilter("dictionary_decompounder", DictionaryCompoundWordTokenFilterFactory.class);
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
package org.elasticsearch.index.analysis;
2+
/*
3+
* Licensed to ElasticSearch and Shay Banon under one
4+
* or more contributor license agreements. See the NOTICE file
5+
* distributed with this work for additional information
6+
* regarding copyright ownership. ElasticSearch licenses this
7+
* file to you under the Apache License, Version 2.0 (the
8+
* "License"); you may not use this file except in compliance
9+
* with the License. You may obtain a copy of the License at
10+
*
11+
* http://www.apache.org/licenses/LICENSE-2.0
12+
*
13+
* Unless required by applicable law or agreed to in writing,
14+
* software distributed under the License is distributed on an
15+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16+
* KIND, either express or implied. See the License for the
17+
* specific language governing permissions and limitations
18+
* under the License.
19+
*/
20+
import java.util.Arrays;
21+
import java.util.Map;
22+
23+
import org.apache.lucene.analysis.TokenStream;
24+
import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
25+
import org.apache.lucene.analysis.util.CharArraySet;
26+
import org.elasticsearch.ElasticSearchIllegalArgumentException;
27+
import org.elasticsearch.common.inject.Inject;
28+
import org.elasticsearch.common.inject.assistedinject.Assisted;
29+
import org.elasticsearch.common.settings.Settings;
30+
import org.elasticsearch.env.Environment;
31+
import org.elasticsearch.index.Index;
32+
import org.elasticsearch.index.settings.IndexSettings;
33+
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
34+
35+
/**
36+
* A {@link TokenFilterFactory} for {@link KeepWordFilter}. This filter only
37+
* keep tokens that are contained in the term set configured via
38+
* {@value #KEEP_WORDS_KEY} setting. This filter acts like an inverse stop
39+
* filter.
40+
*
41+
* Configuration options:
42+
*
43+
* <ul>
44+
* <li>{@value #KEEP_WORDS_KEY} the array of words / tokens to keep.</li>
45+
*
46+
* <li>{@value #KEEP_WORDS_PATH_KEY} an reference to a file containing the words
47+
* / tokens to keep. Note: this is an alternative to {@value #KEEP_WORDS_KEY} if
48+
* both are set an exception will be thrown.</li>
49+
*
50+
* <li>{@value #ENABLE_POS_INC_KEY} <code>true</code> iff the filter should
51+
* maintain position increments for dropped tokens. The default is
52+
* <code>true</code>.</li>
53+
*
54+
* <li>{@value #KEEP_WORDS_CASE_KEY} to use case sensitive keep words. The
55+
* default is <code>false</code> which corresponds to case-sensitive.</li>
56+
* </ul>
57+
*
58+
* @see StopTokenFilterFactory
59+
*
60+
*/
61+
@AnalysisSettingsRequired
62+
public class KeepWordFilterFactory extends AbstractTokenFilterFactory {
63+
private Boolean enablePositionIncrements;
64+
private CharArraySet keepWords;
65+
private static final String KEEP_WORDS_KEY = "keep_words";
66+
private static final String KEEP_WORDS_PATH_KEY = KEEP_WORDS_KEY + "_path";
67+
private static final String KEEP_WORDS_CASE_KEY = KEEP_WORDS_KEY + "_case"; // for javadoc
68+
private static final String ENABLE_POS_INC_KEY = "enable_position_increments";
69+
70+
@Inject
71+
public KeepWordFilterFactory(Index index, @IndexSettings Settings indexSettings,
72+
Environment env, IndicesAnalysisService indicesAnalysisService, Map<String, TokenizerFactoryFactory> tokenizerFactories,
73+
@Assisted String name, @Assisted Settings settings) {
74+
super(index, indexSettings, name, settings);
75+
76+
final String[] arrayKeepWords = settings.getAsArray(KEEP_WORDS_KEY);
77+
final String keepWordsPath = settings.get(KEEP_WORDS_PATH_KEY, null);
78+
if (!(arrayKeepWords == null ^ keepWordsPath == null)) {
79+
// we don't allow both or non
80+
throw new ElasticSearchIllegalArgumentException("keep requires either `" + KEEP_WORDS_KEY + "` or `"
81+
+ KEEP_WORDS_PATH_KEY + "` to be configured");
82+
}
83+
this.enablePositionIncrements = settings.getAsBoolean(ENABLE_POS_INC_KEY, true);
84+
this.keepWords = Analysis.getWordSet(env, settings, KEEP_WORDS_KEY, version);
85+
86+
}
87+
88+
@Override
89+
public TokenStream create(TokenStream tokenStream) {
90+
return new KeepWordFilter(enablePositionIncrements, tokenStream, keepWords);
91+
}
92+
93+
94+
95+
96+
97+
}

src/test/java/org/elasticsearch/test/unit/index/analysis/AnalysisTestsHelper.java

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
import org.apache.lucene.analysis.TokenStream;
2323
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
24+
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
2425
import org.elasticsearch.common.inject.Injector;
2526
import org.elasticsearch.common.inject.ModulesBuilder;
2627
import org.elasticsearch.common.settings.ImmutableSettings;
@@ -45,6 +46,11 @@ public static AnalysisService createAnalysisServiceFromClassPath(String resource
4546
Settings settings = ImmutableSettings.settingsBuilder()
4647
.loadFromClasspath(resource).build();
4748

49+
return createAnalysisServiceFromSettings(settings);
50+
}
51+
52+
public static AnalysisService createAnalysisServiceFromSettings(
53+
Settings settings) {
4854
Index index = new Index("test");
4955

5056
Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings),
@@ -71,4 +77,19 @@ public static void assertSimpleTSOutput(TokenStream stream, String[] expected) t
7177
}
7278
Assert.assertEquals(i, expected.length, "not all tokens produced");
7379
}
80+
81+
public static void assertSimpleTSOutput(TokenStream stream, String[] expected, int[] posInc) throws IOException {
82+
stream.reset();
83+
CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
84+
PositionIncrementAttribute posIncAttr = stream.getAttribute(PositionIncrementAttribute.class);
85+
Assert.assertNotNull(termAttr);
86+
int i = 0;
87+
while (stream.incrementToken()) {
88+
Assert.assertTrue(i < expected.length, "got extra term: " + termAttr.toString());
89+
Assert.assertEquals(termAttr.toString(), expected[i], "expected different term at index " + i);
90+
Assert.assertEquals(posIncAttr.getPositionIncrement(), posInc[i]);
91+
i++;
92+
}
93+
Assert.assertEquals(i, expected.length, "not all tokens produced");
94+
}
7495
}
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
/*
2+
* Licensed to ElasticSearch and Shay Banon under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. ElasticSearch licenses this
6+
* file to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.test.unit.index.analysis;
21+
22+
import org.apache.lucene.analysis.Tokenizer;
23+
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
24+
import org.apache.lucene.util.Version;
25+
import org.elasticsearch.ElasticSearchIllegalArgumentException;
26+
import org.elasticsearch.common.settings.ImmutableSettings;
27+
import org.elasticsearch.common.settings.Settings;
28+
import org.elasticsearch.index.analysis.AnalysisService;
29+
import org.elasticsearch.index.analysis.KeepWordFilterFactory;
30+
import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
31+
import org.elasticsearch.index.analysis.TokenFilterFactory;
32+
import org.testng.Assert;
33+
import org.testng.annotations.Test;
34+
35+
import java.io.IOException;
36+
import java.io.StringReader;
37+
38+
import static org.hamcrest.MatcherAssert.assertThat;
39+
import static org.hamcrest.Matchers.instanceOf;
40+
41+
public class KeepFilterFactoryTests {
42+
43+
private static final String RESOURCE = "org/elasticsearch/test/unit/index/analysis/keep_analysis.json";
44+
45+
46+
@Test
47+
public void testLoadWithoutSettings() {
48+
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromClassPath(RESOURCE);
49+
TokenFilterFactory tokenFilter = analysisService.tokenFilter("keep");
50+
Assert.assertNull(tokenFilter);
51+
}
52+
53+
@Test
54+
public void testLoadOverConfiguredSettings() {
55+
Settings settings = ImmutableSettings.settingsBuilder()
56+
.put("index.analysis.filter.broken_keep_filter.type", "keep")
57+
.put("index.analysis.filter.broken_keep_filter.keep_words_path", "does/not/exists.txt")
58+
.put("index.analysis.filter.broken_keep_filter.keep_words", "[\"Hello\", \"worlD\"]")
59+
.build();
60+
try {
61+
AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
62+
Assert.fail("path and array are configured");
63+
} catch (Exception e) {
64+
assertThat(e.getCause(), instanceOf(ElasticSearchIllegalArgumentException.class));
65+
}
66+
}
67+
68+
@Test
69+
public void testCaseInsensitiveMapping() throws IOException {
70+
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromClassPath(RESOURCE);
71+
TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_keep_filter");
72+
assertThat(tokenFilter, instanceOf(KeepWordFilterFactory.class));
73+
String source = "hello small world";
74+
String[] expected = new String[]{"hello", "world"};
75+
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_40, new StringReader(source));
76+
AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected, new int[] {1,2});
77+
}
78+
79+
@Test
80+
public void testCaseSensitiveMapping() throws IOException {
81+
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromClassPath(RESOURCE);
82+
TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_case_sensitive_keep_filter");
83+
assertThat(tokenFilter, instanceOf(KeepWordFilterFactory.class));
84+
String source = "Hello small world";
85+
String[] expected = new String[]{"Hello"};
86+
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_40, new StringReader(source));
87+
AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected, new int[] {1});
88+
}
89+
90+
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
{
2+
"index":{
3+
"analysis":{
4+
"filter":{
5+
"my_keep_filter":{
6+
"type":"keep",
7+
"keep_words" : ["Hello", "worlD"],
8+
"enable_position_increments" : true,
9+
"keep_words_case" : true
10+
},
11+
"my_case_sensitive_keep_filter":{
12+
"type":"keep",
13+
"keep_words" : ["Hello", "worlD"],
14+
"enable_position_increments" : false
15+
}
16+
}
17+
}
18+
}
19+
}

0 commit comments

Comments
 (0)