Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/95548.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 95548
summary: Create `.synonyms` system index
area: Analysis
type: enhancement
issues: []
6 changes: 6 additions & 0 deletions docs/reference/migration/apis/feature-migration.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,12 @@ Example response:
"migration_status" : "NO_MIGRATION_NEEDED",
"indices" : [ ]
},
{
"feature_name" : "synonyms",
"minimum_index_version" : "8.8.0",
"migration_status" : "NO_MIGRATION_NEEDED",
"indices" : [ ]
},
{
"feature_name" : "tasks",
"minimum_index_version" : "8.0.0",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@
import org.apache.lucene.analysis.util.ElisionFilter;
import org.apache.lucene.util.SetOnce;
import org.elasticsearch.Version;
import org.elasticsearch.analysis.common.synonyms.SynonymsManagementAPIService;
import org.elasticsearch.client.internal.Client;
import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver;
import org.elasticsearch.cluster.routing.allocation.AllocationService;
Expand All @@ -120,12 +121,14 @@
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.SystemIndexDescriptor;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
import org.elasticsearch.lucene.analysis.miscellaneous.DisableGraphAttribute;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.plugins.ScriptPlugin;
import org.elasticsearch.plugins.SystemIndexPlugin;
import org.elasticsearch.repositories.RepositoriesService;
import org.elasticsearch.script.ScriptContext;
import org.elasticsearch.script.ScriptService;
Expand All @@ -146,8 +149,7 @@

import static org.elasticsearch.plugins.AnalysisPlugin.requiresAnalysisSettings;

public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, ScriptPlugin {

public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, ScriptPlugin, SystemIndexPlugin {
private static final DeprecationLogger deprecationLogger = DeprecationLogger.getLogger(CommonAnalysisPlugin.class);

private final SetOnce<ScriptService> scriptServiceHolder = new SetOnce<>();
Expand Down Expand Up @@ -652,4 +654,23 @@ public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {

return tokenizers;
}

@Override
public Collection<SystemIndexDescriptor> getSystemIndexDescriptors(Settings settings) {
if (SynonymsManagementAPIService.isEnabled()) {
return Collections.singletonList(SynonymsManagementAPIService.getSystemIndexDescriptor());
} else {
return Collections.emptyList();
}
}

@Override
public String getFeatureName() {
return "synonyms";
}

@Override
public String getFeatureDescription() {
return "Index for storing synonyms managed through APIs";
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

package org.elasticsearch.analysis.common.synonyms;

import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.FeatureFlag;
import org.elasticsearch.indices.SystemIndexDescriptor;
import org.elasticsearch.xcontent.XContentBuilder;

import java.io.IOException;
import java.io.UncheckedIOException;

import static org.elasticsearch.index.mapper.MapperService.SINGLE_MAPPING_NAME;
import static org.elasticsearch.xcontent.XContentFactory.jsonBuilder;

public class SynonymsManagementAPIService {
private static final FeatureFlag SYNONYMS_API_FEATURE_FLAG = new FeatureFlag("synonyms_api");
public static final String SYNONYMS_INDEX = ".synonyms";
public static final String SYNONYMS_ORIGIN = "synonyms";

public static SystemIndexDescriptor getSystemIndexDescriptor() {
return SystemIndexDescriptor.builder()
.setIndexPattern(SYNONYMS_INDEX + "*")
.setDescription("Synonyms index for synonyms managed through APIs")
.setPrimaryIndex(SYNONYMS_INDEX)
.setMappings(mappings())
.setSettings(settings())
.setVersionMetaKey("version")
.setOrigin(SYNONYMS_ORIGIN)
.build();
}

private static XContentBuilder mappings() {
try {
XContentBuilder builder = jsonBuilder();
builder.startObject();
{
builder.startObject(SINGLE_MAPPING_NAME);
{
builder.startObject("_meta");
{
builder.field("version", Version.CURRENT.toString());
}
builder.endObject();
builder.field("dynamic", "strict");
builder.startObject("properties");
{
builder.startObject("synonyms");
{
builder.field("type", "object");
builder.field("enabled", "false");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For my understanding, this looks like we're leaning towards storing synonyms source-only, that is e.g. in this format:

{	"00001" : "foo => bar",	"00002": "lol, laughing out loud",	"00003": "i-phone, i phone => iphone" } 

Or maybe with more fields, but basically it would mean nothing in there is searchable and we'd have to load the whole source at once and parse stuff from a map? Also for updates this means we'd need to modify the source for every rule that is changed, appended etc...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The format, we are currently leaning towards is this:

{ "synonyms" : [ "foo => bar", "lol, laughing out loud", "i-phone, i phone => iphone" ] }

Where indeed synonyms are stored in source only inside a single field. And indeed for updates we modify the entire document (with its implications).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great, so that would mean a slightly different mapping then I guess (probably synonyms as keyword type array then?)

Copy link
Contributor Author

@mayya-sharipova mayya-sharipova Apr 27, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, it looks more like keyword, but I chose object field type because of performance:

  1. enabled parameter can only be applied to object. This will skip parsing of the contents of the field entirely, which is very performant and what we need. While doing:
"synonyms": { "type": "keyword", "doc_values": false, "index" : false }

will still parse every synonym rule and does some extra work for indexing before retuning with empty index options.

  1. Event it is defined as object, by setting enabled:false, we accept any value even arrays, as we don't do any parsing.

  2. Having object with enabled:false, allows us to modify document structure later without modifying mapping. For example, in case we decide to adopt a format with fields.

WDYT?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good to me, I didn't think about not parsing the docs here at all but its a nice idea as long as we don't have to implement any fast filtering or search on anything. So the parsing logic would only live in whatever piece of code is reading the synonyms from index instead of from file like right now and basically treats source like a blob. Sounds good to me given the current scope of the UI etc...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Drive-by comment: should we consider properly disabling parsing for fields that don't have any lucene footprint like in this example, rather than using type object? The current solution looks ok to me, but I wonder if it's solid enough given we discussed rethinking how to disable fields in the past (see #63369). This should not block progress on synonyms management API though.

}
builder.endObject();
}
builder.endObject();
}
builder.endObject();
}
builder.endObject();
return builder;
} catch (IOException e) {
throw new UncheckedIOException("Failed to build mappings for " + SYNONYMS_INDEX, e);
}
}

static Settings settings() {
return Settings.builder()
.put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1)
.put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0)
.put(IndexMetadata.SETTING_AUTO_EXPAND_REPLICAS, "0-all")
.build();
}

public static boolean isEnabled() {
return SYNONYMS_API_FEATURE_FLAG.isEnabled();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,17 @@
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.SystemIndexDescriptor;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import org.elasticsearch.test.VersionUtils;

import java.io.IOException;
import java.util.List;
import java.util.Map;

import static org.elasticsearch.analysis.common.synonyms.SynonymsManagementAPIService.SYNONYMS_INDEX;

public class CommonAnalysisPluginTests extends ESTestCase {

/**
Expand Down Expand Up @@ -220,6 +224,16 @@ public void testNGramTokenizerDeprecation() throws IOException {
);
}

public void testSystemSynonymsIndexName() {
assertEquals(
List.of(SYNONYMS_INDEX),
new CommonAnalysisPlugin().getSystemIndexDescriptors(Settings.EMPTY)
.stream()
.map(SystemIndexDescriptor::getPrimaryIndex)
.toList()
);
}

public void doTestPrebuiltTokenizerDeprecation(String deprecatedName, String replacement, Version version, boolean expectWarning)
throws IOException {
final Settings settings = Settings.builder()
Expand Down