
Commit 61c4eb2

add non-streaming chat api; add documentation; redefine setup script
1 parent 26e2d45 commit 61c4eb2

File tree: 13 files changed, +1212 −955 lines

README.md

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
# EmbeddedLLM

Run local LLMs on iGPU and APU (AMD, Intel, and Qualcomm (coming soon)).

| Support matrix | Supported now | Under Development | On the roadmap |
|--------------|--------------|-------------------|---------------|
| Model architectures | Gemma <br/> Llama * <br/> Mistral + <br/> Phi | | |
| Platform | Linux <br/> Windows | | |
| Architecture | x86 <br/> x64 | Arm64 | |
| Hardware Acceleration | CUDA <br/> DirectML | QNN <br/> ROCm | OpenVINO |

\* The Llama model architecture supports similar model families such as CodeLlama, Vicuna, Yi, and more.

\+ The Mistral model architecture supports similar model families such as Zephyr.

## 🚀 Latest News

- [2024/06] Support chat inference on iGPU and CPU.

## Supported Models (Quick Start)

| Models | Parameters | Context Length | Link |
|---------------------|------------|----------------|------|
| Gemma-2b-Instruct v1 | 2B | 8192 | [EmbeddedLLM/gemma-2b-it-onnx](https://huggingface.co/EmbeddedLLM/gemma-2b-it-onnx) |
| Llama-2-7b-chat | 7B | 4096 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) |
| Llama-2-13b-chat | 13B | 4096 | [EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml) |
| Llama-3-8b-chat | 8B | 8192 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) |
| Mistral-7b-v0.3-instruct | 7B | 32768 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) |
| Phi3-mini-4k-instruct | 3.8B | 4096 | [microsoft/Phi-3-mini-4k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) |
| Phi3-mini-128k-instruct | 3.8B | 128k | [microsoft/Phi-3-mini-128k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx) |
| Phi3-medium-4k-instruct | 14B | 4096 | [microsoft/Phi-3-medium-4k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct-onnx-directml) |
| Phi3-medium-128k-instruct | 14B | 128k | [microsoft/Phi-3-medium-128k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct-onnx-directml) |
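Each entry links to a Hugging Face repository. A minimal sketch for fetching one of the listed checkpoints with the `huggingface_hub` package (already a project dependency); the local directory below is an arbitrary example path:

```python
from huggingface_hub import snapshot_download

# Download the Phi-3-mini (4k context) ONNX checkpoint from the table above.
# local_dir is illustrative; any writable folder works.
model_path = snapshot_download(
    repo_id="microsoft/Phi-3-mini-4k-instruct-onnx",
    local_dir="./models/phi3-mini-4k-instruct-onnx",
)
print(f"Model files downloaded to: {model_path}")
```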
## Acknowledgements

* Excellent open-source projects: [vLLM](https://github.com/vllm-project/vllm.git), [onnxruntime-genai](https://github.com/microsoft/onnxruntime-genai.git) and many others.

* Thanks to all the [contributors](./docs/contributors.md).

pyproject.toml

Lines changed: 63 additions & 125 deletions
@@ -1,125 +1,63 @@
-# See https://gitlab.liris.cnrs.fr/pagoda/tools/mkdocs_template/-/blob/master/user_config/pyproject.toml
-
-# -----------------------------------------------------------------------------
-# Pytest configuration
-# https://docs.pytest.org/en/latest/customize.html?highlight=pyproject#pyproject-toml
-
-[tool.pytest.ini_options]
-log_cli = true
-asyncio_mode = "auto"
-# log_cli_level = "DEBUG"
-addopts = "--cov=embeddedllm --doctest-modules"
-testpaths = ["tests"]
-filterwarnings = [
-    "ignore::DeprecationWarning:tensorflow.*",
-    "ignore::DeprecationWarning:tensorboard.*",
-    "ignore::DeprecationWarning:matplotlib.*",
-    "ignore::DeprecationWarning:flatbuffers.*",
-]
-
-
-# -----------------------------------------------------------------------------
-# Black (Option-less formatter) configuration
-# https://black.readthedocs.io/en/stable/index.html
-
-[tool.black]
-line-length = 99
-target-version = ["py310"]
-include = '\.pyi?$|\.ipynb'
-
-# -----------------------------------------------------------------------------
-# For sorting imports
-# This is used by VS Code to sort imports
-# https://code.visualstudio.com/docs/python/editing#_sort-imports
-# https://timothycrosley.github.io/isort/
-
-[tool.isort]
-# Profile
-# Base profile type to use for configuration. Profiles include: black, django,
-# pycharm, google, open_stack, plone, attrs, hug. As well as any shared profiles.
-# Default: ``
-profile = "black"
-# Treat project as a git repository and ignore files listed in .gitignore
-# Default: `False`
-skip_gitignore = true
-# The max length of an import line (used for wrapping long imports).
-# Default: `79`
-line_length = 99
-known_first_party = []
-
-# -----------------------------------------------------------------------------
-# setuptools
-# https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html
-
-[build-system]
-# setuptools-scm considers all files tracked by git to be data files
-requires = ["setuptools>=62.0", "setuptools-scm"]
-build-backend = "setuptools.build_meta"
-
-[project]
-name = "embeddedllm"
-description = "EmbeddedLLM: API server for Embedded Device Deployment. Currently support ONNX-DirectML."
-readme = "README.md"
-requires-python = "~=3.10"
-# keywords = ["one", "two"]
-license = { text = "Proprietary" }
-classifiers = [ # https://pypi.org/classifiers/
-    "Development Status :: 3 - Alpha",
-    "Programming Language :: Python :: 3 :: Only",
-    "Intended Audience :: Information Technology",
-    "Operating System :: Unix",
-]
-dependencies = [
-    "huggingface-hub[cli]",
-    "fastapi~=0.110.0",
-    "gunicorn~=21.2.0",
-    "loguru~=0.7.2",
-    "numpy~=1.26.4",
-    "pydantic-settings>=2.2.1",
-    "pydantic~=2.6.3",
-    "onnxruntime-directml",
-    "onnxruntime-genai-directml",
-    "loguru",
-    "openai",
-    "torch",
-    "transformers",
-    "uvicorn"
-] # Sort your dependencies https://sortmylist.com/
-dynamic = ["version"]
-
-[project.optional-dependencies]
-lint = ["black~=24.4.2", "flake8~=7.0.0"]
-test = [
-    "flaky~=3.7.0",
-    "locust~=2.24.1",
-    "mypy~=1.5.1",
-    "pytest-cov~=4.1.0",
-    "pytest~=7.4.2",
-]
-docs = [
-    "furo~=2023.9.10", # Sphinx theme (nice looking, with dark mode)
-    "myst-parser~=2.0.0",
-    "sphinx-autobuild~=2021.3.14",
-    "sphinx-copybutton~=0.5.2",
-    "sphinx~=7.2.6",
-    "sphinx_rtd_theme~=1.3.0", # Sphinx theme
-]
-build = [
-    "build",
-    "twine",
-] # https://realpython.com/pypi-publish-python-package/#build-your-package
-all = [
-    "embeddedllm[lint,test,docs,build]", # https://hynek.me/articles/python-recursive-optional-dependencies/
-]
-
-# [project.scripts]
-# embeddedllm = "embeddedllm.scripts.example:main_cli"
-
-[tool.setuptools.dynamic]
-version = { attr = "embeddedllm.version.__version__" }
-
-[tool.setuptools.packages.find]
-where = ["src"]
-
-[tool.setuptools.package-data]
-owl = ["**/*.json"]
+# See https://gitlab.liris.cnrs.fr/pagoda/tools/mkdocs_template/-/blob/master/user_config/pyproject.toml
+
+# -----------------------------------------------------------------------------
+# Pytest configuration
+# https://docs.pytest.org/en/latest/customize.html?highlight=pyproject#pyproject-toml
+
+[tool.pytest.ini_options]
+log_cli = true
+asyncio_mode = "auto"
+# log_cli_level = "DEBUG"
+addopts = "--cov=embeddedllm --doctest-modules"
+testpaths = ["tests"]
+filterwarnings = [
+    "ignore::DeprecationWarning:tensorflow.*",
+    "ignore::DeprecationWarning:tensorboard.*",
+    "ignore::DeprecationWarning:matplotlib.*",
+    "ignore::DeprecationWarning:flatbuffers.*",
+]
+
+
+# -----------------------------------------------------------------------------
+# Black (Option-less formatter) configuration
+# https://black.readthedocs.io/en/stable/index.html
+
+[tool.black]
+line-length = 99
+target-version = ["py310"]
+include = '\.pyi?$|\.ipynb'
+
+# -----------------------------------------------------------------------------
+# For sorting imports
+# This is used by VS Code to sort imports
+# https://code.visualstudio.com/docs/python/editing#_sort-imports
+# https://timothycrosley.github.io/isort/
+
+[tool.isort]
+# Profile
+# Base profile type to use for configuration. Profiles include: black, django,
+# pycharm, google, open_stack, plone, attrs, hug. As well as any shared profiles.
+# Default: ``
+profile = "black"
+# Treat project as a git repository and ignore files listed in .gitignore
+# Default: `False`
+skip_gitignore = true
+# The max length of an import line (used for wrapping long imports).
+# Default: `79`
+line_length = 99
+known_first_party = []
+
+# -----------------------------------------------------------------------------
+# setuptools
+# https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html
+
+[build-system]
+# setuptools-scm considers all files tracked by git to be data files
+requires = [
+    "setuptools>=62.0",
+    "packaging",
+    "setuptools>=49.4.0",
+    "torch==2.3.1",
+    "wheel"
+]
+build-backend = "setuptools.build_meta"

requirements-build.txt

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
# Should be mirrored in pyproject.toml
packaging
setuptools>=49.4.0
torch==2.3.1
wheel

requirements-common.txt

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
huggingface-hub[cli]
fastapi~=0.110.0
gunicorn~=21.2.0
loguru~=0.7.2
numpy~=1.26.4
pydantic-settings>=2.2.1
pydantic~=2.6.3
loguru
openai
torch
transformers
uvicorn

requirements-cpu.txt

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
onnxruntime
onnxruntime-genai

requirements-directml.txt

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
onnxruntime-directml~=1.18.0
onnxruntime-genai-directml~=0.2.0
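The requirements files split shared dependencies (requirements-common.txt) from per-backend ones (requirements-cpu.txt, requirements-directml.txt), which fits the "redefine setup script" part of the commit message. A hedged sketch of how a setup script might combine them; the helper names and the EMBEDDEDLLM_TARGET_DEVICE variable are illustrative assumptions, not taken from this commit:

```python
# Hypothetical helpers for a setup.py that picks backend-specific requirements.
# The environment variable name and its default are assumptions for illustration.
import os


def read_requirements(filename: str) -> list[str]:
    """Return the non-empty, non-comment lines of a requirements file."""
    with open(filename) as f:
        return [
            line.strip()
            for line in f
            if line.strip() and not line.strip().startswith("#")
        ]


def get_install_requires() -> list[str]:
    requirements = read_requirements("requirements-common.txt")
    target = os.environ.get("EMBEDDEDLLM_TARGET_DEVICE", "directml")
    if target == "directml":
        requirements += read_requirements("requirements-directml.txt")
    else:
        requirements += read_requirements("requirements-cpu.txt")
    return requirements
```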

requirements-lint.txt

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
# formatting
yapf==0.32.0
toml==0.10.2
tomli==2.0.1
ruff==0.1.5
codespell==2.3.0
isort==5.13.2
clang-format==18.1.5
black

# type checking
mypy==1.9.0
types-PyYAML
types-requests
types-setuptools

scripts/python/httpx_client.py

Lines changed: 22 additions & 25 deletions
@@ -1,25 +1,22 @@
-import httpx
-import asyncio
-
-async def stream_chat_completion(url: str, payload: dict):
-    async with httpx.AsyncClient() as client:
-        async with client.stream("POST", url, json=payload) as response:
-            if response.status_code == 200:
-                async for data in response.aiter_bytes():
-                    if data:
-                        print(data.decode('utf-8'))
-            else:
-                print(f"Error: {response.status_code}")
-                print(await response.text())
-
-# Example usage
-if __name__ == "__main__":
-    url = "http://localhost:6979/v1/chat/completions"
-    payload = {
-        "messages": [{"role": "user", "content": "Hello!"}],
-        "model": "phi3-mini-int4",
-        "max_tokens": 80,
-        "temperature": 0.0,
-        "stream": True
-    }
-    asyncio.run(stream_chat_completion(url, payload))
+import httpx
+
+def chat_completion(url: str, payload: dict):
+    with httpx.Client() as client:
+        response = client.post(url, json=payload)
+        if response.status_code == 200:
+            print(response.text)
+        else:
+            print(f"Error: {response.status_code}")
+            print(response.text)
+
+# Example usage
+if __name__ == "__main__":
+    url = "http://localhost:6979/v1/chat/completions"
+    payload = {
+        "messages": [{"role": "user", "content": "Hello!"}],
+        "model": "phi3-mini-int4",
+        "max_tokens": 80,
+        "temperature": 0.0,
+        "stream": False  # Set stream to False
+    }
+    chat_completion(url, payload)
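Since the endpoint lives at /v1/chat/completions and `openai` is among the project dependencies, the same non-streaming request can presumably be issued with the standard OpenAI client. A minimal sketch, assuming the local server accepts a placeholder API key:

```python
from openai import OpenAI

# Point the stock OpenAI client at the local EmbeddedLLM server.
# The api_key value is a placeholder; whether it is validated is an assumption.
client = OpenAI(base_url="http://localhost:6979/v1", api_key="not-needed")

response = client.chat.completions.create(
    model="phi3-mini-int4",
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=80,
    temperature=0.0,
)
print(response.choices[0].message.content)
```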
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
import httpx
import asyncio
import time

async def stream_chat_completion(url: str, payload: dict):
    async with httpx.AsyncClient() as client:
        async with client.stream("POST", url, json=payload) as response:
            if response.status_code == 200:
                async for data in response.aiter_bytes():
                    if data:
                        print(data.decode('utf-8'))
                        # time.sleep(1)
            else:
                print(f"Error: {response.status_code}")
                print((await response.aread()).decode("utf-8"))  # a streamed error body must be read explicitly

# Example usage
if __name__ == "__main__":
    url = "http://localhost:6979/v1/chat/completions"
    payload = {
        "messages": [{"role": "user", "content": "Hello!"}],
        "model": "phi3-mini-int4",
        "max_tokens": 80,
        "temperature": 0.0,
        "stream": True
    }
    asyncio.run(stream_chat_completion(url, payload))
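For the streaming path, an equivalent sketch with the `openai` client under the same assumptions as above (base_url, placeholder api_key); the exact chunk payloads depend on the server's implementation:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:6979/v1", api_key="not-needed")

# Request a streamed completion and print deltas as they arrive.
stream = client.chat.completions.create(
    model="phi3-mini-int4",
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=80,
    temperature=0.0,
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()
```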
