Skip to content

Commit aaad6cc

Browse files
hwchase17bstadt
andauthored
Harrison/atlas db (langchain-ai#1315)
Co-authored-by: Brandon Duderstadt <brandonduderstadt@gmail.com>
1 parent 3989c79 commit aaad6cc

File tree

7 files changed

+777
-24
lines changed

7 files changed

+777
-24
lines changed

docs/ecosystem/atlas.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# AtlasDB
2+
3+
This page covers how to Nomic's Atlas ecosystem within LangChain.
4+
It is broken into two parts: installation and setup, and then references to specific Atlas wrappers.
5+
6+
## Installation and Setup
7+
- Install the Python package with `pip install nomic`
8+
- Nomic is also included in langchains poetry extras `poetry install -E all`
9+
-
10+
## Wrappers
11+
12+
### VectorStore
13+
14+
There exists a wrapper around the Atlas neural database, allowing you to use it as a vectorstore.
15+
This vectorstore also gives you full access to the underlying AtlasProject object, which will allow you to use the full range of Atlas map interactions, such as bulk tagging and automatic topic modeling.
16+
Please see [the Nomic docs](https://docs.nomic.ai/atlas_api.html) for more detailed information.
17+
18+
19+
20+
To import this vectorstore:
21+
```python
22+
from langchain.vectorstores import AtlasDB
23+
```
24+
25+
For a more detailed walkthrough of the Chroma wrapper, see [this notebook](../modules/indexes/examples/vectorstores.ipynb)
Lines changed: 266 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,266 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {
6+
"pycharm": {
7+
"name": "#%% md\n"
8+
}
9+
},
10+
"source": [
11+
"# AtlasDB\n",
12+
"\n",
13+
"This notebook shows you how to use functionality related to the AtlasDB"
14+
]
15+
},
16+
{
17+
"cell_type": "code",
18+
"execution_count": 1,
19+
"metadata": {
20+
"pycharm": {
21+
"name": "#%%\n"
22+
}
23+
},
24+
"outputs": [],
25+
"source": [
26+
"import time\n",
27+
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
28+
"from langchain.text_splitter import SpacyTextSplitter\n",
29+
"from langchain.vectorstores import AtlasDB\n",
30+
"from langchain.document_loaders import TextLoader"
31+
]
32+
},
33+
{
34+
"cell_type": "code",
35+
"execution_count": 2,
36+
"metadata": {
37+
"scrolled": true
38+
},
39+
"outputs": [
40+
{
41+
"name": "stdout",
42+
"output_type": "stream",
43+
"text": [
44+
"Collecting en-core-web-sm==3.5.0\n",
45+
" Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)\n",
46+
"\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m12.8/12.8 MB\u001B[0m \u001B[31m90.8 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m00:01\u001B[0m00:01\u001B[0m\n",
47+
"\u001B[?25hRequirement already satisfied: spacy<3.6.0,>=3.5.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from en-core-web-sm==3.5.0) (3.5.0)\n",
48+
"Requirement already satisfied: packaging>=20.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (23.0)\n",
49+
"Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.1.1)\n",
50+
"Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.3.0)\n",
51+
"Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.4.5)\n",
52+
"Requirement already satisfied: pathy>=0.10.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.10.1)\n",
53+
"Requirement already satisfied: setuptools in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (67.4.0)\n",
54+
"Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (4.64.1)\n",
55+
"Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.0.4)\n",
56+
"Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (6.3.0)\n",
57+
"Requirement already satisfied: thinc<8.2.0,>=8.1.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (8.1.7)\n",
58+
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.0.7)\n",
59+
"Requirement already satisfied: typer<0.8.0,>=0.3.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.7.0)\n",
60+
"Requirement already satisfied: requests<3.0.0,>=2.13.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.28.2)\n",
61+
"Requirement already satisfied: jinja2 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.1.2)\n",
62+
"Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.10.5)\n",
63+
"Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.0.8)\n",
64+
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.0.12)\n",
65+
"Requirement already satisfied: numpy>=1.15.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.24.2)\n",
66+
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.0.9)\n",
67+
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.0.8)\n",
68+
"Requirement already satisfied: typing-extensions>=4.2.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (4.5.0)\n",
69+
"Requirement already satisfied: charset-normalizer<4,>=2 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.0.1)\n",
70+
"Requirement already satisfied: idna<4,>=2.5 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.4)\n",
71+
"Requirement already satisfied: certifi>=2017.4.17 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2022.12.7)\n",
72+
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.26.14)\n",
73+
"Requirement already satisfied: blis<0.8.0,>=0.7.8 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from thinc<8.2.0,>=8.1.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.7.9)\n",
74+
"Requirement already satisfied: confection<1.0.0,>=0.0.1 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from thinc<8.2.0,>=8.1.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.0.4)\n",
75+
"Requirement already satisfied: click<9.0.0,>=7.1.1 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from typer<0.8.0,>=0.3.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (8.1.3)\n",
76+
"Requirement already satisfied: MarkupSafe>=2.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from jinja2->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.1.2)\n",
77+
"\n",
78+
"\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m23.0\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.0.1\u001B[0m\n",
79+
"\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n",
80+
"\u001B[38;5;2m✔ Download and installation successful\u001B[0m\n",
81+
"You can now load the package via spacy.load('en_core_web_sm')\n"
82+
]
83+
}
84+
],
85+
"source": [
86+
"!python -m spacy download en_core_web_sm"
87+
]
88+
},
89+
{
90+
"cell_type": "code",
91+
"execution_count": 3,
92+
"metadata": {},
93+
"outputs": [],
94+
"source": [
95+
"ATLAS_TEST_API_KEY = '7xDPkYXSYDc1_ErdTPIcoAR9RNd8YDlkS3nVNXcVoIMZ6'"
96+
]
97+
},
98+
{
99+
"cell_type": "code",
100+
"execution_count": 4,
101+
"metadata": {},
102+
"outputs": [],
103+
"source": [
104+
"loader = TextLoader('../../state_of_the_union.txt')\n",
105+
"documents = loader.load()\n",
106+
"text_splitter = SpacyTextSplitter(separator='|')\n",
107+
"texts = []\n",
108+
"for doc in text_splitter.split_documents(documents):\n",
109+
" texts.extend(doc.page_content.split('|'))\n",
110+
" \n",
111+
"texts = [e.strip() for e in texts]"
112+
]
113+
},
114+
{
115+
"cell_type": "code",
116+
"execution_count": 5,
117+
"metadata": {},
118+
"outputs": [
119+
{
120+
"name": "stderr",
121+
"output_type": "stream",
122+
"text": [
123+
"2023-02-24 16:13:49.696 | INFO | nomic.project:_create_project:884 - Creating project `test_index_1677255228.136989` in organization `Atlas Demo`\n",
124+
"2023-02-24 16:13:51.087 | INFO | nomic.project:wait_for_project_lock:993 - test_index_1677255228.136989: Project lock is released.\n",
125+
"2023-02-24 16:13:51.225 | INFO | nomic.project:wait_for_project_lock:993 - test_index_1677255228.136989: Project lock is released.\n",
126+
"2023-02-24 16:13:51.481 | INFO | nomic.project:add_text:1351 - Uploading text to Atlas.\n",
127+
"1it [00:00, 1.20it/s]\n",
128+
"2023-02-24 16:13:52.318 | INFO | nomic.project:add_text:1422 - Text upload succeeded.\n",
129+
"2023-02-24 16:13:52.628 | INFO | nomic.project:wait_for_project_lock:993 - test_index_1677255228.136989: Project lock is released.\n",
130+
"2023-02-24 16:13:53.380 | INFO | nomic.project:create_index:1192 - Created map `test_index_1677255228.136989_index` in project `test_index_1677255228.136989`: https://atlas.nomic.ai/map/ee2354a3-7f9a-4c6b-af43-b0cda09d7198/db996d77-8981-48a0-897a-ff2c22bbf541\n"
131+
]
132+
}
133+
],
134+
"source": [
135+
"db = AtlasDB.from_texts(texts=texts,\n",
136+
" name='test_index_'+str(time.time()),\n",
137+
" description='test_index',\n",
138+
" api_key=ATLAS_TEST_API_KEY,\n",
139+
" index_kwargs={'build_topic_model': True})"
140+
]
141+
},
142+
{
143+
"cell_type": "code",
144+
"execution_count": 6,
145+
"metadata": {
146+
"scrolled": false
147+
},
148+
"outputs": [
149+
{
150+
"name": "stderr",
151+
"output_type": "stream",
152+
"text": [
153+
"2023-02-24 16:14:09.106 | INFO | nomic.project:wait_for_project_lock:993 - test_index_1677255228.136989: Project lock is released.\n"
154+
]
155+
}
156+
],
157+
"source": [
158+
"with db.project.wait_for_project_lock():\n",
159+
" time.sleep(1)"
160+
]
161+
},
162+
{
163+
"cell_type": "code",
164+
"execution_count": 7,
165+
"metadata": {},
166+
"outputs": [
167+
{
168+
"data": {
169+
"text/html": [
170+
"\n",
171+
" <strong><a href=\"https://atlas.nomic.ai/dashboard/project/ee2354a3-7f9a-4c6b-af43-b0cda09d7198\">test_index_1677255228.136989</strong></a>\n",
172+
" <br>\n",
173+
" A description for your project 508 datums inserted.\n",
174+
" <br>\n",
175+
" 1 index built.\n",
176+
" <br><strong>Projections</strong>\n",
177+
"<ul>\n",
178+
"<li>test_index_1677255228.136989_index. Status Completed. <a target=\"_blank\" href=\"https://atlas.nomic.ai/map/ee2354a3-7f9a-4c6b-af43-b0cda09d7198/db996d77-8981-48a0-897a-ff2c22bbf541\">view online</a></li></ul><hr><script>\n",
179+
" destroy = function() {\n",
180+
" document.getElementById(\"iframedb996d77-8981-48a0-897a-ff2c22bbf541\").remove()\n",
181+
" }\n",
182+
" </script>\n",
183+
"\n",
184+
" <h4>Projection ID: db996d77-8981-48a0-897a-ff2c22bbf541</h4>\n",
185+
" <div class=\"actions\">\n",
186+
" <div id=\"hide\" class=\"action\" onclick=\"destroy()\">Hide embedded project</div>\n",
187+
" <div class=\"action\" id=\"out\">\n",
188+
" <a href=\"https://atlas.nomic.ai/map/ee2354a3-7f9a-4c6b-af43-b0cda09d7198/db996d77-8981-48a0-897a-ff2c22bbf541\" target=\"_blank\">Explore on atlas.nomic.ai</a>\n",
189+
" </div>\n",
190+
" </div>\n",
191+
" \n",
192+
" <iframe class=\"iframe\" id=\"iframedb996d77-8981-48a0-897a-ff2c22bbf541\" allow=\"clipboard-read; clipboard-write\" src=\"https://atlas.nomic.ai/map/ee2354a3-7f9a-4c6b-af43-b0cda09d7198/db996d77-8981-48a0-897a-ff2c22bbf541\">\n",
193+
" </iframe>\n",
194+
"\n",
195+
" <style>\n",
196+
" .iframe {\n",
197+
" /* vh can be **very** large in vscode ipynb. */\n",
198+
" height: min(75vh, 66vw);\n",
199+
" width: 100%;\n",
200+
" }\n",
201+
" </style>\n",
202+
" \n",
203+
" <style>\n",
204+
" .actions {\n",
205+
" display: block;\n",
206+
" }\n",
207+
" .action {\n",
208+
" min-height: 18px;\n",
209+
" margin: 5px;\n",
210+
" transition: all 500ms ease-in-out;\n",
211+
" }\n",
212+
" .action:hover {\n",
213+
" cursor: pointer;\n",
214+
" }\n",
215+
" #hide:hover::after {\n",
216+
" content: \" X\";\n",
217+
" }\n",
218+
" #out:hover::after {\n",
219+
" content: \"\";\n",
220+
" }\n",
221+
" </style>\n",
222+
" "
223+
],
224+
"text/plain": [
225+
"AtlasProject: <{'id': 'ee2354a3-7f9a-4c6b-af43-b0cda09d7198', 'owner': '9c29afbb-a002-4d49-958e-ecf5ae1351ac', 'project_name': 'test_index_1677255228.136989', 'creator': 'auth0|63efc4b5462246f4d9a6ecf2', 'description': 'A description for your project', 'opensearch_index_id': 'f61fb8dd-0abf-4f31-9130-41870e443902', 'is_public': True, 'project_fields': ['atlas_id', 'text'], 'unique_id_field': 'atlas_id', 'modality': 'text', 'total_datums_in_project': 508, 'created_timestamp': '2023-02-24T16:13:50.313363+00:00', 'atlas_indices': [{'id': 'b1b01833-0964-4597-a4bc-a2d60700949d', 'project_id': 'ee2354a3-7f9a-4c6b-af43-b0cda09d7198', 'index_name': 'test_index_1677255228.136989_index', 'indexed_field': 'text', 'created_timestamp': '2023-02-24T16:13:52.957101+00:00', 'updated_timestamp': '2023-02-24T16:14:03.469621+00:00', 'atoms': ['charchunk', 'document'], 'colorable_fields': [], 'embedders': [{'id': '7ec0868a-4eed-4414-a482-25cce9803e1b', 'atlas_index_id': 'b1b01833-0964-4597-a4bc-a2d60700949d', 'ready': True, 'model_name': 'NomicEmbed', 'hyperparameters': {'norm': 'both', 'batch_size': 20, 'polymerize_by': 'charchunk', 'dataset_buffer_size': 1000}}], 'nearest_neighbor_indices': [{'id': '86f8e3ff-e07c-4678-a4d7-144db4b0301d', 'index_name': 'NomicOrganize', 'ready': True, 'hyperparameters': {'dim': 384, 'space': 'l2'}, 'atom_strategies': ['document']}], 'projections': [{'id': 'db996d77-8981-48a0-897a-ff2c22bbf541', 'projection_name': 'NomicProject', 'ready': True, 'hyperparameters': {'spread': 1.0, 'n_epochs': 50, 'n_neighbors': 15}, 'atom_strategies': ['document'], 'created_timestamp': '2023-02-24T16:13:52.979561+00:00', 'updated_timestamp': '2023-02-24T16:14:03.466309+00:00'}]}], 'insert_update_delete_lock': False}>"
226+
]
227+
},
228+
"execution_count": 7,
229+
"metadata": {},
230+
"output_type": "execute_result"
231+
}
232+
],
233+
"source": [
234+
"db.project"
235+
]
236+
},
237+
{
238+
"cell_type": "code",
239+
"execution_count": null,
240+
"metadata": {},
241+
"outputs": [],
242+
"source": []
243+
}
244+
],
245+
"metadata": {
246+
"kernelspec": {
247+
"display_name": "Python 3 (ipykernel)",
248+
"language": "python",
249+
"name": "python3"
250+
},
251+
"language_info": {
252+
"codemirror_mode": {
253+
"name": "ipython",
254+
"version": 3
255+
},
256+
"file_extension": ".py",
257+
"mimetype": "text/x-python",
258+
"name": "python",
259+
"nbconvert_exporter": "python",
260+
"pygments_lexer": "ipython3",
261+
"version": "3.9.4"
262+
}
263+
},
264+
"nbformat": 4,
265+
"nbformat_minor": 1
266+
}

langchain/vectorstores/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""Wrappers on top of vector stores."""
2+
from langchain.vectorstores.atlas import AtlasDB
23
from langchain.vectorstores.base import VectorStore
34
from langchain.vectorstores.chroma import Chroma
45
from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch
@@ -19,4 +20,5 @@
1920
"Milvus",
2021
"Chroma",
2122
"OpenSearchVectorSearch",
23+
"AtlasDB",
2224
]

0 commit comments

Comments
 (0)