
Commit 0ebe5f3

s1monw authored and drewr committed
Limit the number of extracted token instances per query token.

FVH uses recursive logic to extract terms from documents that need to be highlighted. For documents containing terms with a very large term frequency, such as a document that repeats a term very often, this can produce very large stacks during term extraction. Taken to an extreme, it causes StackOverflowErrors once the term frequency reaches 6000 or more. The ultimate solution is an iterative implementation of the extraction logic, but until then we should protect users from these massive term extractions, which are probably not very useful in the first place. Closes elastic#3486
1 parent 5c38d60 commit 0ebe5f3
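To make the failure mode concrete, here is a minimal hypothetical sketch (not the actual FVH code; the class and method names are made up). The only point it illustrates is that a per-occurrence recursion costs one stack frame per occurrence of a term, so the recursion depth tracks the term frequency, and capping the frequency bounds the depth.

// Hypothetical sketch, not the FVH implementation: recursion depth scales with
// term frequency because each occurrence of the term adds one stack frame.
// The real FVH frames are much larger, which is why a term frequency around
// 6000 was enough to overflow the stack.
public class RecursionDepthSketch {

    // One recursive call per remaining occurrence of the term being extracted.
    static void extractOccurrence(int remaining) {
        if (remaining == 0) {
            return;
        }
        // ... record the position/offset of one occurrence here ...
        extractOccurrence(remaining - 1);
    }

    public static void main(String[] args) {
        // With the cap introduced by this commit, the depth is bounded at 512 frames per term.
        extractOccurrence(Math.min(512, 6000));
    }
}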

File tree

2 files changed, +41 −1 lines changed


src/main/java/org/apache/lucene/search/vectorhighlight/XFieldTermStack.java

Lines changed: 4 additions & 1 deletion
@@ -114,7 +114,10 @@ public XFieldTermStack( IndexReader reader, int docId, String fieldName, final X
       // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
       final float weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( new Term(fieldName, text) ) + 1 ) ) + 1.0 );
 
-      final int freq = dpEnum.freq();
+      // ES EDIT: added a safety check to limit this to 512 terms; everything above might be meaningless anyway
+      // This limit protects the FVH from running into StackOverflowErrors if super large TF docs are highlighted.
+      final int freq = Math.min(512, dpEnum.freq());
+
 
       for(int i = 0;i < freq;i++) {
         int pos = dpEnum.nextPosition();
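A note on the practical effect of this change (a reading of the diff above, not something stated in the commit itself): because dpEnum.nextPosition() returns positions in document order, capping freq means only the first 512 occurrences of a term in a document are pushed onto the term stack, so matches beyond that point are simply not highlighted. The commit message frames the cap as a stopgap until the extraction logic is reimplemented iteratively.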

src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java

Lines changed: 37 additions & 0 deletions
@@ -62,6 +62,43 @@ public class HighlighterSearchTests extends AbstractSharedClusterTest {
     protected int numberOfNodes() {
         return 4; // why 4?
     }
+
+    @Test
+    // see #3486
+    public void testHighTermFrequencyDoc() throws ElasticSearchException, IOException {
+        wipeIndex("test");
+        client().admin().indices().prepareCreate("test")
+                .addMapping("test", jsonBuilder()
+                        .startObject()
+                            .startObject("test")
+                                .startObject("properties")
+                                    .startObject("name")
+                                        .field("type", "string")
+                                        .field("term_vector", "with_positions_offsets")
+                                        .field("store", randomBoolean() ? "yes" : "no")
+                                    .endObject()
+                                .endObject()
+                            .endObject()
+                        .endObject())
+                .setSettings(ImmutableSettings.settingsBuilder()
+                        .put("index.number_of_shards", between(1, 5)))
+                .execute().actionGet();
+        ensureYellow();
+        StringBuilder builder = new StringBuilder();
+        for (int i = 0; i < 6000; i++) {
+            builder.append("abc").append(" ");
+        }
+        client().prepareIndex("test", "test", "1")
+                .setSource(XContentFactory.jsonBuilder()
+                        .startObject()
+                            .field("name", builder.toString())
+                        .endObject())
+                .execute().actionGet();
+        refresh();
+        SearchResponse search = client().prepareSearch().setQuery(constantScoreQuery(matchQuery("name", "abc"))).addHighlightedField("name").execute().actionGet();
+        assertHighlight(search, 0, "name", 0, startsWith("<em>abc</em> <em>abc</em> <em>abc</em> <em>abc</em>"));
+    }
+
 
     @Test
     public void testNgramHighlightingWithBrokenPositions() throws ElasticSearchException, IOException {
