Skip to content

Commit c4ea894

Browse files
committed
use sparse_vector in ELSER notebooks
1 parent bc36bfa commit c4ea894

File tree

2 files changed

+47
-68
lines changed

2 files changed

+47
-68
lines changed

notebooks/model-upgrades/upgrading-index-to-use-elser.ipynb

Lines changed: 28 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -35,19 +35,9 @@
3535
},
3636
{
3737
"cell_type": "code",
38-
"execution_count": 1,
38+
"execution_count": null,
3939
"metadata": {},
40-
"outputs": [
41-
{
42-
"name": "stdout",
43-
"output_type": "stream",
44-
"text": [
45-
"\n",
46-
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3\u001b[0m\n",
47-
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n"
48-
]
49-
}
50-
],
40+
"outputs": [],
5141
"source": [
5242
"!pip install elasticsearch -qU"
5343
]
@@ -82,17 +72,9 @@
8272
},
8373
{
8474
"cell_type": "code",
85-
"execution_count": 27,
75+
"execution_count": null,
8676
"metadata": {},
87-
"outputs": [
88-
{
89-
"name": "stdout",
90-
"output_type": "stream",
91-
"text": [
92-
"{'name': 'instance-0000000001', 'cluster_name': '1646af1463a8461e8bc3a33f317f8cf1', 'cluster_uuid': 'FF7uKiNRT6SejAcx2qDL-w', 'version': {'number': '8.11.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '454cd35d33aafc161b4f7238d63777e71814d834', 'build_date': '2023-10-16T22:04:47.763088486Z', 'build_snapshot': False, 'lucene_version': '9.8.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}\n"
93-
]
94-
}
95-
],
77+
"outputs": [],
9678
"source": [
9779
"# Found in the 'Manage Deployment' page\n",
9880
"CLOUD_ID = getpass.getpass('Enter Elastic Cloud ID: ')\n",
@@ -246,7 +228,7 @@
246228
"source": [
247229
"# Upgrade index `movies` to use ELSER model\n",
248230
"\n",
249-
"we are ready to re-index `movies` to a new index with the ELSER model `.elser_model_2`. As a first step, we have to create new ingestion pipeline and a index to use ELSER model. \n",
231+
"We are ready to re-index `movies` to a new index with the ELSER model `.elser_model_2`. As a first step, we have to create a new ingestion pipeline and a new index that use the ELSER model. \n",
250232
"\n",
251233
"# Create a new pipeline with ELSER \n",
252234
"Let's create a new ingestion pipeline with ELSER model `.elser_model_2`. "
@@ -283,8 +265,7 @@
283265
"source": [
284266
"# Create an index with mappings\n",
285267
"\n",
286-
"Next, create an index with [`text_expansion`](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-text-expansion-query.html) query supporting ELSER model and [`rank_features`](https://www.elastic.co/guide/en/elasticsearch/reference/current/rank-features.html) to work with our vectors. \n",
287-
"\n"
268+
"Next, create an index with the required mappings for ELSER. "
288269
]
289270
},
290271
{
@@ -307,13 +288,22 @@
307288
" }\n",
308289
" },\n",
309290
" \"plot_embedding\": { \n",
310-
" \"type\": \"rank_features\" \n",
291+
" \"type\": \"sparse_vector\" \n",
311292
" }\n",
312293
" }\n",
313294
" }\n",
314295
")"
315296
]
316297
},
298+
{
299+
"cell_type": "markdown",
300+
"metadata": {},
301+
"source": [
302+
"**Note:**\n",
303+
"- `plot_embedding` is the name of the field that contains the generated tokens, with the type [`sparse_vector`](https://www.elastic.co/guide/en/elasticsearch/reference/master/sparse-vector.html) \n",
304+
"- `plot` is the name of the field from which the [`sparse_vector`](https://www.elastic.co/guide/en/elasticsearch/reference/master/sparse-vector.html) tokens are created. "
305+
]
306+
},
317307
{
318308
"cell_type": "markdown",
319309
"metadata": {},
@@ -341,7 +331,7 @@
341331
"cell_type": "markdown",
342332
"metadata": {},
343333
"source": [
344-
"Once reindex is complete, inspect any document in the index `elser-movies` and notice that the document has a additional field `plot_embedding` with terms that we will be using in to search in our `text_expansion` query. \n",
334+
"Once the reindex is complete, inspect any document in the index `elser-movies` and notice that the document has an additional field `plot_embedding` with terms that we will be using in the `text_expansion` query. \n",
345335
" "
346336
]
347337
},
@@ -356,7 +346,7 @@
356346
},
357347
{
358348
"cell_type": "code",
359-
"execution_count": 10,
349+
"execution_count": 7,
360350
"metadata": {},
361351
"outputs": [
362352
{
@@ -473,7 +463,7 @@
473463
" }\n",
474464
" },\n",
475465
" \"plot_embedding\": {\n",
476-
" \"type\": \"rank_features\"\n",
466+
" \"type\": \"sparse_vector\"\n",
477467
" },\n",
478468
" }\n",
479469
" }\n",
@@ -493,25 +483,14 @@
493483
},
494484
{
495485
"cell_type": "code",
496-
"execution_count": 30,
486+
"execution_count": null,
497487
"metadata": {},
498-
"outputs": [
499-
{
500-
"data": {
501-
"text/plain": [
502-
"ObjectApiResponse({'took': 2271, 'timed_out': False, 'total': 12, 'updated': 0, 'created': 12, 'deleted': 0, 'batches': 1, 'version_conflicts': 0, 'noops': 0, 'retries': {'bulk': 0, 'search': 0}, 'throttled_millis': 0, 'requests_per_second': -1.0, 'throttled_until_millis': 0, 'failures': []})"
503-
]
504-
},
505-
"execution_count": 30,
506-
"metadata": {},
507-
"output_type": "execute_result"
508-
}
509-
],
488+
"outputs": [],
510489
"source": [
511490
"client.reindex(source={\n",
512-
" \"index\": \"elser-example-movies\", # replace with your index name\n",
491+
" \"index\": \"my-index\", # replace with your index name\n",
513492
" \"_source\": {\n",
514-
" \"excludes\": [\"ml\"] # replace with the field-name from your index, that has previously generated tokens\n",
493+
" \"excludes\": [\"my-tokens-field\"] # replace with the field-name from your index, that has previously generated tokens\n",
515494
" }}, \n",
516495
" dest={\n",
517496
" \"index\": \"elser-upgrade-index-demo\",\n",
@@ -530,7 +509,7 @@
530509
},
531510
{
532511
"cell_type": "code",
533-
"execution_count": 31,
512+
"execution_count": 21,
534513
"metadata": {},
535514
"outputs": [
536515
{
@@ -594,7 +573,7 @@
594573
},
595574
{
596575
"cell_type": "code",
597-
"execution_count": 14,
576+
"execution_count": 8,
598577
"metadata": {},
599578
"outputs": [
600579
{
@@ -603,7 +582,7 @@
603582
"ObjectApiResponse({'blogs': {'aliases': {}, 'mappings': {'properties': {'text_embedding': {'properties': {'is_truncated': {'type': 'boolean'}, 'model_id': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'predicted_value': {'type': 'dense_vector', 'dims': 384, 'index': True, 'similarity': 'l2_norm'}}}, 'title': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}}}, 'settings': {'index': {'routing': {'allocation': {'include': {'_tier_preference': 'data_content'}}}, 'number_of_shards': '1', 'blocks': {'read_only_allow_delete': 'false'}, 'provided_name': 'blogs', 'default_pipeline': 'vectorize_blogs', 'creation_date': '1697651466693', 'number_of_replicas': '1', 'uuid': 'JWkPyTphQ2GV0sLadHWjjw', 'version': {'created': '8500003'}}}}})"
604583
]
605584
},
606-
"execution_count": 14,
585+
"execution_count": 8,
607586
"metadata": {},
608587
"output_type": "execute_result"
609588
}
@@ -682,7 +661,7 @@
682661
" }\n",
683662
" },\n",
684663
" \"title_embedding\": {\n",
685-
" \"type\": \"rank_features\"\n",
664+
" \"type\": \"sparse_vector\"\n",
686665
" },\n",
687666
" }\n",
688667
" }\n",
@@ -725,7 +704,7 @@
725704
},
726705
{
727706
"cell_type": "code",
728-
"execution_count": 24,
707+
"execution_count": 11,
729708
"metadata": {},
730709
"outputs": [
731710
{

notebooks/search/03-ELSER.ipynb

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@
6464
},
6565
{
6666
"cell_type": "code",
67-
"execution_count": 44,
67+
"execution_count": 2,
6868
"metadata": {
6969
"id": "uP_GTVRi-d96"
7070
},
@@ -91,7 +91,7 @@
9191
},
9292
{
9393
"cell_type": "code",
94-
"execution_count": 64,
94+
"execution_count": 3,
9595
"metadata": {
9696
"colab": {
9797
"base_uri": "https://localhost:8080/"
@@ -126,7 +126,7 @@
126126
},
127127
{
128128
"cell_type": "code",
129-
"execution_count": 65,
129+
"execution_count": 4,
130130
"metadata": {
131131
"colab": {
132132
"base_uri": "https://localhost:8080/"
@@ -139,7 +139,7 @@
139139
"name": "stdout",
140140
"output_type": "stream",
141141
"text": [
142-
"{'name': 'instance-0000000000', 'cluster_name': '96f39514682c4f9a8dcdf46927f22675', 'cluster_uuid': 'jFtkpt2CSYyfYgxa_G4bTA', 'version': {'number': '8.10.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'c63272efed16b5a1c25f3ce500715b7fddf9a9fb', 'build_date': '2023-10-05T10:15:55.152563867Z', 'build_snapshot': False, 'lucene_version': '9.7.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}\n"
142+
"{'name': 'instance-0000000001', 'cluster_name': 'cfdd4b889b2548928bbfb1103e887f40', 'cluster_uuid': '9W4B3wPmQ8WGcl0eKN39uA', 'version': {'number': '8.11.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '454cd35d33aafc161b4f7238d63777e71814d834', 'build_date': '2023-10-16T22:04:47.763088486Z', 'build_snapshot': False, 'lucene_version': '9.8.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}\n"
143143
]
144144
}
145145
],
@@ -257,7 +257,7 @@
257257
},
258258
{
259259
"cell_type": "code",
260-
"execution_count": 36,
260+
"execution_count": 5,
261261
"metadata": {
262262
"colab": {
263263
"base_uri": "https://localhost:8080/"
@@ -272,7 +272,7 @@
272272
"ObjectApiResponse({'acknowledged': True})"
273273
]
274274
},
275-
"execution_count": 36,
275+
"execution_count": 5,
276276
"metadata": {},
277277
"output_type": "execute_result"
278278
}
@@ -309,7 +309,7 @@
309309
"- `inference`: A processor that performs inference using a machine learning model.\n",
310310
"- `model_id`: Specifies the ID of the machine learning model to be used. In this example, the model ID is set to `.elser_model_2`.\n",
311311
"- `input_output`: Specifies input and output fields\n",
312-
"- `input_field`: Field name from which the `rank_features` vector representation are created.\n",
312+
"- `input_field`: Field name from which the `sparse_vector` representation is created.\n",
313313
"- `output_field`: Field name which contains inference results. "
314314
]
315315
},
@@ -323,15 +323,15 @@
323323
"## Create index\n",
324324
"\n",
325325
"To use the ELSER model at index time, we'll need to create an index mapping that supports a [`text_expansion`](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-text-expansion-query.html) query.\n",
326-
"The mapping must include a field of type [`rank_features`](https://www.elastic.co/guide/en/elasticsearch/reference/current/rank-features.html) to work with our feature vectors of interest.\n",
326+
"The mapping includes a field of type [`sparse_vector`](https://www.elastic.co/guide/en/elasticsearch/reference/master/sparse-vector.html) to work with our feature vectors of interest.\n",
327327
"This field contains the token-weight pairs the ELSER model created based on the input text.\n",
328328
"\n",
329329
"Let's create an index named `elser-example-movies` with the mappings we need.\n"
330330
]
331331
},
332332
{
333333
"cell_type": "code",
334-
"execution_count": 37,
334+
"execution_count": 6,
335335
"metadata": {
336336
"colab": {
337337
"base_uri": "https://localhost:8080/"
@@ -346,7 +346,7 @@
346346
"ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'elser-example-movies'})"
347347
]
348348
},
349-
"execution_count": 37,
349+
"execution_count": 6,
350350
"metadata": {},
351351
"output_type": "execute_result"
352352
}
@@ -373,7 +373,7 @@
373373
" }\n",
374374
" },\n",
375375
" \"plot_embedding\": { \n",
376-
" \"type\": \"rank_features\" \n",
376+
" \"type\": \"sparse_vector\" \n",
377377
" }\n",
378378
" }\n",
379379
" }\n",
@@ -395,7 +395,7 @@
395395
},
396396
{
397397
"cell_type": "code",
398-
"execution_count": 50,
398+
"execution_count": 7,
399399
"metadata": {
400400
"colab": {
401401
"base_uri": "https://localhost:8080/"
@@ -408,7 +408,7 @@
408408
"name": "stdout",
409409
"output_type": "stream",
410410
"text": [
411-
"Done indexing documents into `search-movies` index!\n"
411+
"Done indexing documents into `elser-example-movies` index!\n"
412412
]
413413
}
414414
],
@@ -430,7 +430,7 @@
430430
"# Use helpers.bulk to index\n",
431431
"helpers.bulk(client, documents)\n",
432432
"\n",
433-
"print(\"Done indexing documents into `search-movies` index!\")"
433+
"print(\"Done indexing documents into `elser-example-movies` index!\")"
434434
]
435435
},
436436
{
@@ -462,7 +462,7 @@
462462
},
463463
{
464464
"cell_type": "code",
465-
"execution_count": 52,
465+
"execution_count": 8,
466466
"metadata": {
467467
"colab": {
468468
"base_uri": "https://localhost:8080/"
@@ -475,15 +475,15 @@
475475
"name": "stdout",
476476
"output_type": "stream",
477477
"text": [
478-
"Score: 12.7633505\n",
478+
"Score: 12.763346\n",
479479
"Title: Fight Club\n",
480480
"Plot: An insomniac office worker and a devil-may-care soapmaker form an underground fight club that evolves into something much, much more.\n",
481481
"\n",
482-
"Score: 9.930428\n",
482+
"Score: 9.930427\n",
483483
"Title: Pulp Fiction\n",
484484
"Plot: The lives of two mob hitmen, a boxer, a gangster and his wife, and a pair of diner bandits intertwine in four tales of violence and redemption.\n",
485485
"\n",
486-
"Score: 9.488338\n",
486+
"Score: 9.4883375\n",
487487
"Title: The Matrix\n",
488488
"Plot: A computer hacker learns from mysterious rebels about the true nature of his reality and his role in the war against its controllers.\n",
489489
"\n"
@@ -540,7 +540,7 @@
540540
"name": "python",
541541
"nbconvert_exporter": "python",
542542
"pygments_lexer": "ipython3",
543-
"version": "3.10.3"
543+
"version": "3.11.4"
544544
},
545545
"vscode": {
546546
"interpreter": {

0 commit comments

Comments
 (0)