hughsio
diff --git a/‎notebooks/model-upgrades/upgrading-index-to-use-elser.ipynb‎
Lines changed: 28 additions & 49 deletions b/‎notebooks/model-upgrades/upgrading-index-to-use-elser.ipynb‎
Lines changed: 28 additions & 49 deletions
diff --git a/‎notebooks/search/03-ELSER.ipynb‎
Lines changed: 19 additions & 19 deletions b/‎notebooks/search/03-ELSER.ipynb‎
Lines changed: 19 additions & 19 deletions
@@ -35,19 +35,9 @@
  },
  {
  "cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
  "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3\u001b[0m\n",
- "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n"
- ]
- }
- ],
+ "outputs": [],
  "source": [
  "!pip install elasticsearch -qU"
  ]
@@ -82,17 +72,9 @@
  },
  {
  "cell_type": "code",
- "execution_count": 27,
+ "execution_count": null,
  "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{'name': 'instance-0000000001', 'cluster_name': '1646af1463a8461e8bc3a33f317f8cf1', 'cluster_uuid': 'FF7uKiNRT6SejAcx2qDL-w', 'version': {'number': '8.11.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '454cd35d33aafc161b4f7238d63777e71814d834', 'build_date': '2023-10-16T22:04:47.763088486Z', 'build_snapshot': False, 'lucene_version': '9.8.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}\n"
- ]
- }
- ],
+ "outputs": [],
  "source": [
  "# Found in the 'Manage Deployment' page\n",
  "CLOUD_ID = getpass.getpass('Enter Elastic Cloud ID: ')\n",
@@ -246,7 +228,7 @@
  "source": [
  "# Upgrade index `movies` to use ELSER model\n",
  "\n",
- "we are ready to re-index `movies` to a new index with the ELSER model `.elser_model_2`. As a first step, we have to create new ingestion pipeline and a index to use ELSER model. \n",
+ "We are ready to re-index `movies` to a new index with the ELSER model `.elser_model_2`. As a first step, we have to create a new ingestion pipeline and a new index to use the ELSER model. \n",
  "\n",
  "# Create a new pipeline with ELSER \n",
  "Let's create a new ingestion pipeline with ELSER model `.elser_model_2`. "
@@ -283,8 +265,7 @@
  "source": [
  "# Create an index with mappings\n",
  "\n",
- "Next, create an index with [`text_expansion`](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-text-expansion-query.html) query supporting ELSER model and [`rank_features`](https://www.elastic.co/guide/en/elasticsearch/reference/current/rank-features.html) to work with our vectors. \n",
- "\n"
+ "Next, create an index with the required mappings for ELSER. "
  ]
  },
  {
@@ -307,13 +288,22 @@
  " }\n",
  " },\n",
  " \"plot_embedding\": { \n",
- " \"type\": \"rank_features\" \n",
+ " \"type\": \"sparse_vector\" \n",
  " }\n",
  " }\n",
  " }\n",
  ")"
  ]
  },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Note:**\n",
+ "- `plot_embedding` is the name of the field that contains the generated tokens, with the type [`sparse_vector`](https://www.elastic.co/guide/en/elasticsearch/reference/master/sparse-vector.html) \n",
+ "- `plot` is the name of the field from which the [`sparse_vector`](https://www.elastic.co/guide/en/elasticsearch/reference/master/sparse-vector.html) tokens are created. "
+ ]
+ },
  {
  "cell_type": "markdown",
  "metadata": {},
@@ -341,7 +331,7 @@
  "cell_type": "markdown",
  "metadata": {},
  "source": [
- "Once reindex is complete, inspect any document in the index `elser-movies` and notice that the document has a additional field `plot_embedding` with terms that we will be using in to search in our `text_expansion` query. \n",
+ "Once the reindex is complete, inspect any document in the index `elser-movies` and notice that the document has an additional field `plot_embedding` with terms that we will be using in the `text_expansion` query. \n",
  " "
  ]
  },
@@ -356,7 +346,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 10,
+ "execution_count": 7,
  "metadata": {},
  "outputs": [
  {
@@ -473,7 +463,7 @@
  " }\n",
  " },\n",
  " \"plot_embedding\": {\n",
- " \"type\": \"rank_features\"\n",
+ " \"type\": \"sparse_vector\"\n",
  " },\n",
  " }\n",
  " }\n",
@@ -493,25 +483,14 @@
  },
  {
  "cell_type": "code",
- "execution_count": 30,
+ "execution_count": null,
  "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "ObjectApiResponse({'took': 2271, 'timed_out': False, 'total': 12, 'updated': 0, 'created': 12, 'deleted': 0, 'batches': 1, 'version_conflicts': 0, 'noops': 0, 'retries': {'bulk': 0, 'search': 0}, 'throttled_millis': 0, 'requests_per_second': -1.0, 'throttled_until_millis': 0, 'failures': []})"
- ]
- },
- "execution_count": 30,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
  "source": [
  "client.reindex(source={\n",
- " \"index\": \"elser-example-movies\", # replace with your index name\n",
+ " \"index\": \"my-index\", # replace with your index name\n",
  " \"_source\": {\n",
- " \"excludes\": [\"ml\"] # replace with the field-name from your index, that has previously generated tokens\n",
+ " \"excludes\": [\"my-tokens-field\"] # replace with the field-name from your index, that has previously generated tokens\n",
  " }}, \n",
  " dest={\n",
  " \"index\": \"elser-upgrade-index-demo\",\n",
@@ -530,7 +509,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 31,
+ "execution_count": 21,
  "metadata": {},
  "outputs": [
  {
@@ -594,7 +573,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 14,
+ "execution_count": 8,
  "metadata": {},
  "outputs": [
  {
@@ -603,7 +582,7 @@
  "ObjectApiResponse({'blogs': {'aliases': {}, 'mappings': {'properties': {'text_embedding': {'properties': {'is_truncated': {'type': 'boolean'}, 'model_id': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'predicted_value': {'type': 'dense_vector', 'dims': 384, 'index': True, 'similarity': 'l2_norm'}}}, 'title': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}}}, 'settings': {'index': {'routing': {'allocation': {'include': {'_tier_preference': 'data_content'}}}, 'number_of_shards': '1', 'blocks': {'read_only_allow_delete': 'false'}, 'provided_name': 'blogs', 'default_pipeline': 'vectorize_blogs', 'creation_date': '1697651466693', 'number_of_replicas': '1', 'uuid': 'JWkPyTphQ2GV0sLadHWjjw', 'version': {'created': '8500003'}}}}})"
  ]
  },
- "execution_count": 14,
+ "execution_count": 8,
  "metadata": {},
  "output_type": "execute_result"
  }
@@ -682,7 +661,7 @@
  " }\n",
  " },\n",
  " \"title_embedding\": {\n",
- " \"type\": \"rank_features\"\n",
+ " \"type\": \"sparse_vector\"\n",
  " },\n",
  " }\n",
  " }\n",
@@ -725,7 +704,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 24,
+ "execution_count": 11,
  "metadata": {},
  "outputs": [
  {
 
@@ -64,7 +64,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 44,
+ "execution_count": 2,
  "metadata": {
  "id": "uP_GTVRi-d96"
  },
@@ -91,7 +91,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 64,
+ "execution_count": 3,
  "metadata": {
  "colab": {
  "base_uri": "https://localhost:8080/"
@@ -126,7 +126,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 65,
+ "execution_count": 4,
  "metadata": {
  "colab": {
  "base_uri": "https://localhost:8080/"
@@ -139,7 +139,7 @@
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "{'name': 'instance-0000000000', 'cluster_name': '96f39514682c4f9a8dcdf46927f22675', 'cluster_uuid': 'jFtkpt2CSYyfYgxa_G4bTA', 'version': {'number': '8.10.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'c63272efed16b5a1c25f3ce500715b7fddf9a9fb', 'build_date': '2023-10-05T10:15:55.152563867Z', 'build_snapshot': False, 'lucene_version': '9.7.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}\n"
+ "{'name': 'instance-0000000001', 'cluster_name': 'cfdd4b889b2548928bbfb1103e887f40', 'cluster_uuid': '9W4B3wPmQ8WGcl0eKN39uA', 'version': {'number': '8.11.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '454cd35d33aafc161b4f7238d63777e71814d834', 'build_date': '2023-10-16T22:04:47.763088486Z', 'build_snapshot': False, 'lucene_version': '9.8.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}\n"
  ]
  }
  ],
@@ -257,7 +257,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 36,
+ "execution_count": 5,
  "metadata": {
  "colab": {
  "base_uri": "https://localhost:8080/"
@@ -272,7 +272,7 @@
  "ObjectApiResponse({'acknowledged': True})"
  ]
  },
- "execution_count": 36,
+ "execution_count": 5,
  "metadata": {},
  "output_type": "execute_result"
  }
@@ -309,7 +309,7 @@
  "- `inference`: A processor that performs inference using a machine learning model.\n",
  "- `model_id`: Specifies the ID of the machine learning model to be used. In this example, the model ID is set to `.elser_model_2`.\n",
  "- `input_output`: Specifies input and output fields\n",
- "- `input_field`: Field name from which the `rank_features` vector representation are created.\n",
+ "- `input_field`: Field name from which the `sparse_vector` representation is created.\n",
  "- `output_field`: Field name which contains inference results. "
  ]
  },
@@ -323,15 +323,15 @@
  "## Create index\n",
  "\n",
  "To use the ELSER model at index time, we'll need to create an index mapping that supports a [`text_expansion`](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-text-expansion-query.html) query.\n",
- "The mapping must include a field of type [`rank_features`](https://www.elastic.co/guide/en/elasticsearch/reference/current/rank-features.html) to work with our feature vectors of interest.\n",
+ "The mapping includes a field of type [`sparse_vector`](https://www.elastic.co/guide/en/elasticsearch/reference/master/sparse-vector.html) to work with our feature vectors of interest.\n",
  "This field contains the token-weight pairs the ELSER model created based on the input text.\n",
  "\n",
  "Let's create an index named `elser-example-movies` with the mappings we need.\n"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 37,
+ "execution_count": 6,
  "metadata": {
  "colab": {
  "base_uri": "https://localhost:8080/"
@@ -346,7 +346,7 @@
  "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'elser-example-movies'})"
  ]
  },
- "execution_count": 37,
+ "execution_count": 6,
  "metadata": {},
  "output_type": "execute_result"
  }
@@ -373,7 +373,7 @@
  " }\n",
  " },\n",
  " \"plot_embedding\": { \n",
- " \"type\": \"rank_features\" \n",
+ " \"type\": \"sparse_vector\" \n",
  " }\n",
  " }\n",
  " }\n",
@@ -395,7 +395,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 50,
+ "execution_count": 7,
  "metadata": {
  "colab": {
  "base_uri": "https://localhost:8080/"
@@ -408,7 +408,7 @@
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "Done indexing documents into `search-movies` index!\n"
+ "Done indexing documents into `elser-example-movies` index!\n"
  ]
  }
  ],
@@ -430,7 +430,7 @@
  "# Use helpers.bulk to index\n",
  "helpers.bulk(client, documents)\n",
  "\n",
- "print(\"Done indexing documents into `search-movies` index!\")"
+ "print(\"Done indexing documents into `elser-example-movies` index!\")"
  ]
  },
  {
@@ -462,7 +462,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 52,
+ "execution_count": 8,
  "metadata": {
  "colab": {
  "base_uri": "https://localhost:8080/"
@@ -475,15 +475,15 @@
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "Score: 12.7633505\n",
+ "Score: 12.763346\n",
  "Title: Fight Club\n",
  "Plot: An insomniac office worker and a devil-may-care soapmaker form an underground fight club that evolves into something much, much more.\n",
  "\n",
- "Score: 9.930428\n",
+ "Score: 9.930427\n",
  "Title: Pulp Fiction\n",
  "Plot: The lives of two mob hitmen, a boxer, a gangster and his wife, and a pair of diner bandits intertwine in four tales of violence and redemption.\n",
  "\n",
- "Score: 9.488338\n",
+ "Score: 9.4883375\n",
  "Title: The Matrix\n",
  "Plot: A computer hacker learns from mysterious rebels about the true nature of his reality and his role in the war against its controllers.\n",
  "\n"
@@ -540,7 +540,7 @@
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
- "version": "3.10.3"
+ "version": "3.11.4"
  },
  "vscode": {
  "interpreter": {