Skip to content
This repository was archived by the owner on Nov 16, 2018. It is now read-only.

Commit 7690b40

Browse files
nik9000 authored and jpountz committed
Allow string fields to store token counts
To use this, send a string to a field of type 'token_count'. This makes the most sense as a sub-field of a multi_field.
1 parent 3494ac2 commit 7690b40

File tree

8 files changed

+643
-35
lines changed

8 files changed

+643
-35
lines changed

docs/reference/mapping/types/core-types.asciidoc

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,49 @@ defaults to `true` or to the parent `object` type setting.
212212

213213
|=======================================================================
214214

215+
[float]
216+
[[token_count]]
217+
==== Token Count
218+
added[0.90.8]
219+
The `token_count` type maps to the JSON string type but indexes and stores
220+
the number of tokens in the string rather than the string itself. For
221+
example:
222+
223+
[source,js]
224+
--------------------------------------------------
225+
{
226+
"tweet" : {
227+
"properties" : {
228+
"message" : {
229+
"type" : "multi_field",
230+
"fields" : {
231+
"name": {
232+
"type": "string"
233+
},
234+
"word_count": {
235+
"type" : "token_count",
236+
"store" : "yes",
237+
"analyzer" : "standard"
238+
}
239+
}
240+
}
241+
}
242+
}
243+
}
244+
--------------------------------------------------
245+
246+
All the configuration that can be specified for a number can be specified
247+
for a `token_count`. The only extra configuration is the required
248+
`analyzer` field which specifies which analyzer to use to break the string
249+
into tokens. For best performance, use an analyzer with no token filters.
250+
251+
[NOTE]
252+
===================================================================
253+
Technically the `token_count` type sums position increments rather than
254+
counting tokens. This means that even if the analyzer filters out stop
255+
words, they are still included in the count.
256+
===================================================================
257+
215258
[float]
216259
[[date]]
217260
==== Date

src/main/java/org/elasticsearch/index/mapper/DocumentMapperParser.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ public DocumentMapperParser(Index index, @IndexSettings Settings indexSettings,
9595
.put(DateFieldMapper.CONTENT_TYPE, new DateFieldMapper.TypeParser())
9696
.put(IpFieldMapper.CONTENT_TYPE, new IpFieldMapper.TypeParser())
9797
.put(StringFieldMapper.CONTENT_TYPE, new StringFieldMapper.TypeParser())
98+
.put(TokenCountFieldMapper.CONTENT_TYPE, new TokenCountFieldMapper.TypeParser())
9899
.put(ObjectMapper.CONTENT_TYPE, new ObjectMapper.TypeParser())
99100
.put(ObjectMapper.NESTED_CONTENT_TYPE, new ObjectMapper.TypeParser())
100101
.put(MultiFieldMapper.CONTENT_TYPE, new MultiFieldMapper.TypeParser())

src/main/java/org/elasticsearch/index/mapper/MapperBuilders.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,10 @@ public static IntegerFieldMapper.Builder integerField(String name) {
133133
return new IntegerFieldMapper.Builder(name);
134134
}
135135

136+
public static TokenCountFieldMapper.Builder tokenCountField(String name) {
137+
return new TokenCountFieldMapper.Builder(name);
138+
}
139+
136140
public static LongFieldMapper.Builder longField(String name) {
137141
return new LongFieldMapper.Builder(name);
138142
}

src/main/java/org/elasticsearch/index/mapper/core/IntegerFieldMapper.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,10 @@ protected void innerParseCreateField(ParseContext context, List<Field> fields) t
313313
}
314314
}
315315
}
316+
addIntegerFields(fields, value, boost);
317+
}
316318

319+
protected void addIntegerFields(List<Field> fields, int value, float boost) {
317320
if (fieldType.indexed() || fieldType.stored()) {
318321
CustomIntegerNumericField field = new CustomIntegerNumericField(this, value, fieldType);
319322
field.setBoost(boost);
@@ -326,6 +329,10 @@ protected void innerParseCreateField(ParseContext context, List<Field> fields) t
326329
}
327330
}
328331

332+
protected Integer nullValue() {
333+
return nullValue;
334+
}
335+
329336
@Override
330337
protected String contentType() {
331338
return CONTENT_TYPE;

src/main/java/org/elasticsearch/index/mapper/core/StringFieldMapper.java

Lines changed: 76 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -259,57 +259,69 @@ public Filter nullValueFilter() {
259259

260260
@Override
261261
protected void parseCreateField(ParseContext context, List<Field> fields) throws IOException {
262-
String value = nullValue;
263-
float boost = this.boost;
264-
if (context.externalValueSet()) {
265-
value = (String) context.externalValue();
266-
} else {
267-
XContentParser parser = context.parser();
268-
if (parser.currentToken() == XContentParser.Token.VALUE_NULL) {
269-
value = nullValue;
270-
} else if (parser.currentToken() == XContentParser.Token.START_OBJECT) {
271-
XContentParser.Token token;
272-
String currentFieldName = null;
273-
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
274-
if (token == XContentParser.Token.FIELD_NAME) {
275-
currentFieldName = parser.currentName();
276-
} else {
277-
if ("value".equals(currentFieldName) || "_value".equals(currentFieldName)) {
278-
value = parser.textOrNull();
279-
} else if ("boost".equals(currentFieldName) || "_boost".equals(currentFieldName)) {
280-
boost = parser.floatValue();
281-
} else {
282-
throw new ElasticSearchIllegalArgumentException("unknown property [" + currentFieldName + "]");
283-
}
284-
}
285-
}
286-
} else {
287-
value = parser.textOrNull();
288-
}
289-
}
290-
if (value == null) {
262+
ValueAndBoost valueAndBoost = parseCreateFieldForString(context, nullValue, boost);
263+
if (valueAndBoost.value() == null) {
291264
return;
292265
}
293-
if (ignoreAbove > 0 && value.length() > ignoreAbove) {
266+
if (ignoreAbove > 0 && valueAndBoost.value().length() > ignoreAbove) {
294267
return;
295268
}
296269
if (context.includeInAll(includeInAll, this)) {
297-
context.allEntries().addText(names.fullName(), value, boost);
270+
context.allEntries().addText(names.fullName(), valueAndBoost.value(), valueAndBoost.boost());
298271
}
299272

300273
if (fieldType.indexed() || fieldType.stored()) {
301-
Field field = new StringField(names.indexName(), value, fieldType);
302-
field.setBoost(boost);
274+
Field field = new StringField(names.indexName(), valueAndBoost.value(), fieldType);
275+
field.setBoost(valueAndBoost.boost());
303276
fields.add(field);
304277
}
305278
if (hasDocValues()) {
306-
fields.add(new SortedSetDocValuesField(names.indexName(), new BytesRef(value)));
279+
fields.add(new SortedSetDocValuesField(names.indexName(), new BytesRef(valueAndBoost.value())));
307280
}
308281
if (fields.isEmpty()) {
309-
context.ignoredValue(names.indexName(), value);
282+
context.ignoredValue(names.indexName(), valueAndBoost.value());
310283
}
311284
}
312285

286+
/**
287+
* Parse a field as though it were a string.
288+
* @param context parse context used during parsing
289+
* @param nullValue value to use for null
290+
* @param defaultBoost default boost value returned unless overwritten in the field
291+
* @return the parsed field and the boost either parsed or defaulted
292+
* @throws IOException if thrown while parsing
293+
*/
294+
public static ValueAndBoost parseCreateFieldForString(ParseContext context, String nullValue, float defaultBoost) throws IOException {
295+
if (context.externalValueSet()) {
296+
return new ValueAndBoost((String) context.externalValue(), defaultBoost);
297+
}
298+
XContentParser parser = context.parser();
299+
if (parser.currentToken() == XContentParser.Token.VALUE_NULL) {
300+
return new ValueAndBoost(nullValue, defaultBoost);
301+
}
302+
if (parser.currentToken() == XContentParser.Token.START_OBJECT) {
303+
XContentParser.Token token;
304+
String currentFieldName = null;
305+
String value = nullValue;
306+
float boost = defaultBoost;
307+
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
308+
if (token == XContentParser.Token.FIELD_NAME) {
309+
currentFieldName = parser.currentName();
310+
} else {
311+
if ("value".equals(currentFieldName) || "_value".equals(currentFieldName)) {
312+
value = parser.textOrNull();
313+
} else if ("boost".equals(currentFieldName) || "_boost".equals(currentFieldName)) {
314+
boost = parser.floatValue();
315+
} else {
316+
throw new ElasticSearchIllegalArgumentException("unknown property [" + currentFieldName + "]");
317+
}
318+
}
319+
}
320+
return new ValueAndBoost(value, boost);
321+
}
322+
return new ValueAndBoost(parser.textOrNull(), defaultBoost);
323+
}
324+
313325
@Override
314326
protected String contentType() {
315327
return CONTENT_TYPE;
@@ -437,4 +449,33 @@ public void close() {
437449
value = null;
438450
}
439451
}
452+
453+
/**
454+
* Parsed value and boost to be returned from {@link #parseCreateFieldForString}.
455+
*/
456+
public static class ValueAndBoost {
457+
private final String value;
458+
private final float boost;
459+
460+
public ValueAndBoost(String value, float boost) {
461+
this.value = value;
462+
this.boost = boost;
463+
}
464+
465+
/**
466+
* Value of string field.
467+
* @return value of string field
468+
*/
469+
public String value() {
470+
return value;
471+
}
472+
473+
/**
474+
* Boost either parsed from the document or defaulted.
475+
* @return boost either parsed from the document or defaulted
476+
*/
477+
public float boost() {
478+
return boost;
479+
}
480+
}
440481
}

0 commit comments

Comments
 (0)