
Commit 61c4eb2

add non-streaming chat api; add documentation; redefine setup script
1 parent 26e2d45 commit 61c4eb2

File tree: 13 files changed, +1212 −955 lines

README.md

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
# EmbeddedLLM

Run local LLMs on iGPU and APU (AMD, Intel, and Qualcomm (coming soon)).

| Support matrix | Supported now | Under Development | On the roadmap |
|--------------|--------------|-------------------|---------------|
| Model architectures | Gemma <br/> Llama * <br/> Mistral + <br/> Phi | | |
| Platform | Linux <br/> Windows | | |
| Architecture | x86 <br/> x64 | Arm64 | |
| Hardware Acceleration | CUDA <br/> DirectML | QNN <br/> ROCm | OpenVINO |

\* The Llama model architecture supports similar model families such as CodeLlama, Vicuna, Yi, and more.

\+ The Mistral model architecture supports similar model families such as Zephyr.

## 🚀 Latest News

- [2024/06] Support chat inference on iGPU and CPU.

## Supported Models (Quick Start)

| Models | Parameters | Context Length | Link |
|---------------------|------------|----------------|------|
| Gemma-2b-Instruct v1 | 2B | 8192 | [EmbeddedLLM/gemma-2b-it-onnx](https://huggingface.co/EmbeddedLLM/gemma-2b-it-onnx) |
| Llama-2-7b-chat | 7B | 4096 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) |
| Llama-2-13b-chat | 13B | 4096 | [EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml) |
| Llama-3-8b-chat | 8B | 8192 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) |
| Mistral-7b-v0.3-instruct | 7B | 32768 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) |
| Phi3-mini-4k-instruct | 3.8B | 4096 | [microsoft/Phi-3-mini-4k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) |
| Phi3-mini-128k-instruct | 3.8B | 128k | [microsoft/Phi-3-mini-128k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx) |
| Phi3-medium-4k-instruct | 14B | 4096 | [microsoft/Phi-3-medium-4k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct-onnx-directml) |
| Phi3-medium-128k-instruct | 14B | 128k | [microsoft/Phi-3-medium-128k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct-onnx-directml) |
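Each entry links to a Hugging Face repository. A minimal sketch for fetching one of the listed checkpoints with the `huggingface_hub` package (already a project dependency); the local directory below is an arbitrary example path:

```python
from huggingface_hub import snapshot_download

# Download the Phi-3-mini (4k context) ONNX checkpoint from the table above.
# local_dir is illustrative; any writable folder works.
model_path = snapshot_download(
    repo_id="microsoft/Phi-3-mini-4k-instruct-onnx",
    local_dir="./models/phi3-mini-4k-instruct-onnx",
)
print(f"Model files downloaded to: {model_path}")
```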
## Acknowledgements

* Excellent open-source projects: [vLLM](https://github.com/vllm-project/vllm.git), [onnxruntime-genai](https://github.com/microsoft/onnxruntime-genai.git) and many others.

* Thanks to all the [contributors](./docs/contributors.md).

pyproject.toml

Lines changed: 63 additions & 125 deletions
@@ -1,125 +1,63 @@
-# See https://gitlab.liris.cnrs.fr/pagoda/tools/mkdocs_template/-/blob/master/user_config/pyproject.toml
-
-# -----------------------------------------------------------------------------
-# Pytest configuration
-# https://docs.pytest.org/en/latest/customize.html?highlight=pyproject#pyproject-toml
-
-[tool.pytest.ini_options]
-log_cli = true
-asyncio_mode = "auto"
-# log_cli_level = "DEBUG"
-addopts = "--cov=embeddedllm --doctest-modules"
-testpaths = ["tests"]
-filterwarnings = [
-    "ignore::DeprecationWarning:tensorflow.*",
-    "ignore::DeprecationWarning:tensorboard.*",
-    "ignore::DeprecationWarning:matplotlib.*",
-    "ignore::DeprecationWarning:flatbuffers.*",
-]
-
-
-# -----------------------------------------------------------------------------
-# Black (Option-less formatter) configuration
-# https://black.readthedocs.io/en/stable/index.html
-
-[tool.black]
-line-length = 99
-target-version = ["py310"]
-include = '\.pyi?$|\.ipynb'
-
-# -----------------------------------------------------------------------------
-# For sorting imports
-# This is used by VS Code to sort imports
-# https://code.visualstudio.com/docs/python/editing#_sort-imports
-# https://timothycrosley.github.io/isort/
-
-[tool.isort]
-# Profile
-# Base profile type to use for configuration. Profiles include: black, django,
-# pycharm, google, open_stack, plone, attrs, hug. As well as any shared profiles.
-# Default: ``
-profile = "black"
-# Treat project as a git repository and ignore files listed in .gitignore
-# Default: `False`
-skip_gitignore = true
-# The max length of an import line (used for wrapping long imports).
-# Default: `79`
-line_length = 99
-known_first_party = []
-
-# -----------------------------------------------------------------------------
-# setuptools
-# https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html
-
-[build-system]
-# setuptools-scm considers all files tracked by git to be data files
-requires = ["setuptools>=62.0", "setuptools-scm"]
-build-backend = "setuptools.build_meta"
-
-[project]
-name = "embeddedllm"
-description = "EmbeddedLLM: API server for Embedded Device Deployment. Currently support ONNX-DirectML."
-readme = "README.md"
-requires-python = "~=3.10"
-# keywords = ["one", "two"]
-license = { text = "Proprietary" }
-classifiers = [ # https://pypi.org/classifiers/
-    "Development Status :: 3 - Alpha",
-    "Programming Language :: Python :: 3 :: Only",
-    "Intended Audience :: Information Technology",
-    "Operating System :: Unix",
-]
-dependencies = [
-    "huggingface-hub[cli]",
-    "fastapi~=0.110.0",
-    "gunicorn~=21.2.0",
-    "loguru~=0.7.2",
-    "numpy~=1.26.4",
-    "pydantic-settings>=2.2.1",
-    "pydantic~=2.6.3",
-    "onnxruntime-directml",
-    "onnxruntime-genai-directml",
-    "loguru",
-    "openai",
-    "torch",
-    "transformers",
-    "uvicorn"
-] # Sort your dependencies https://sortmylist.com/
-dynamic = ["version"]
-
-[project.optional-dependencies]
-lint = ["black~=24.4.2", "flake8~=7.0.0"]
-test = [
-    "flaky~=3.7.0",
-    "locust~=2.24.1",
-    "mypy~=1.5.1",
-    "pytest-cov~=4.1.0",
-    "pytest~=7.4.2",
-]
-docs = [
-    "furo~=2023.9.10", # Sphinx theme (nice looking, with dark mode)
-    "myst-parser~=2.0.0",
-    "sphinx-autobuild~=2021.3.14",
-    "sphinx-copybutton~=0.5.2",
-    "sphinx~=7.2.6",
-    "sphinx_rtd_theme~=1.3.0", # Sphinx theme
-]
-build = [
-    "build",
-    "twine",
-] # https://realpython.com/pypi-publish-python-package/#build-your-package
-all = [
-    "embeddedllm[lint,test,docs,build]", # https://hynek.me/articles/python-recursive-optional-dependencies/
-]
-
-# [project.scripts]
-# embeddedllm = "embeddedllm.scripts.example:main_cli"
-
-[tool.setuptools.dynamic]
-version = { attr = "embeddedllm.version.__version__" }
-
-[tool.setuptools.packages.find]
-where = ["src"]
-
-[tool.setuptools.package-data]
-owl = ["**/*.json"]
+# See https://gitlab.liris.cnrs.fr/pagoda/tools/mkdocs_template/-/blob/master/user_config/pyproject.toml
+
+# -----------------------------------------------------------------------------
+# Pytest configuration
+# https://docs.pytest.org/en/latest/customize.html?highlight=pyproject#pyproject-toml
+
+[tool.pytest.ini_options]
+log_cli = true
+asyncio_mode = "auto"
+# log_cli_level = "DEBUG"
+addopts = "--cov=embeddedllm --doctest-modules"
+testpaths = ["tests"]
+filterwarnings = [
+    "ignore::DeprecationWarning:tensorflow.*",
+    "ignore::DeprecationWarning:tensorboard.*",
+    "ignore::DeprecationWarning:matplotlib.*",
+    "ignore::DeprecationWarning:flatbuffers.*",
+]
+
+
+# -----------------------------------------------------------------------------
+# Black (Option-less formatter) configuration
+# https://black.readthedocs.io/en/stable/index.html
+
+[tool.black]
+line-length = 99
+target-version = ["py310"]
+include = '\.pyi?$|\.ipynb'
+
+# -----------------------------------------------------------------------------
+# For sorting imports
+# This is used by VS Code to sort imports
+# https://code.visualstudio.com/docs/python/editing#_sort-imports
+# https://timothycrosley.github.io/isort/
+
+[tool.isort]
+# Profile
+# Base profile type to use for configuration. Profiles include: black, django,
+# pycharm, google, open_stack, plone, attrs, hug. As well as any shared profiles.
+# Default: ``
+profile = "black"
+# Treat project as a git repository and ignore files listed in .gitignore
+# Default: `False`
+skip_gitignore = true
+# The max length of an import line (used for wrapping long imports).
+# Default: `79`
+line_length = 99
+known_first_party = []
+
+# -----------------------------------------------------------------------------
+# setuptools
+# https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html
+
+[build-system]
+# setuptools-scm considers all files tracked by git to be data files
+requires = [
+    "setuptools>=62.0",
+    "packaging",
+    "setuptools>=49.4.0",
+    "torch==2.3.1",
+    "wheel"
+]
+build-backend = "setuptools.build_meta"

requirements-build.txt

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
# Should be mirrored in pyproject.toml
packaging
setuptools>=49.4.0
torch==2.3.1
wheel

requirements-common.txt

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
huggingface-hub[cli]
fastapi~=0.110.0
gunicorn~=21.2.0
loguru~=0.7.2
numpy~=1.26.4
pydantic-settings>=2.2.1
pydantic~=2.6.3
loguru
openai
torch
transformers
uvicorn

requirements-cpu.txt

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
onnxruntime
onnxruntime-genai

requirements-directml.txt

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
onnxruntime-directml~=1.18.0
onnxruntime-genai-directml~=0.2.0
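The requirements files split shared dependencies (requirements-common.txt) from per-backend ones (requirements-cpu.txt, requirements-directml.txt), which fits the "redefine setup script" part of the commit message. A hedged sketch of how a setup script might combine them; the helper names and the EMBEDDEDLLM_TARGET_DEVICE variable are illustrative assumptions, not taken from this commit:

```python
# Hypothetical helpers for a setup.py that picks backend-specific requirements.
# The environment variable name and its default are assumptions for illustration.
import os


def read_requirements(filename: str) -> list[str]:
    """Return the non-empty, non-comment lines of a requirements file."""
    with open(filename) as f:
        return [
            line.strip()
            for line in f
            if line.strip() and not line.strip().startswith("#")
        ]


def get_install_requires() -> list[str]:
    requirements = read_requirements("requirements-common.txt")
    target = os.environ.get("EMBEDDEDLLM_TARGET_DEVICE", "directml")
    if target == "directml":
        requirements += read_requirements("requirements-directml.txt")
    else:
        requirements += read_requirements("requirements-cpu.txt")
    return requirements
```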

requirements-lint.txt

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
# formatting
yapf==0.32.0
toml==0.10.2
tomli==2.0.1
ruff==0.1.5
codespell==2.3.0
isort==5.13.2
clang-format==18.1.5
black

# type checking
mypy==1.9.0
types-PyYAML
types-requests
types-setuptools

scripts/python/httpx_client.py

Lines changed: 22 additions & 25 deletions
@@ -1,25 +1,22 @@
-import httpx
-import asyncio
-
-async def stream_chat_completion(url: str, payload: dict):
-    async with httpx.AsyncClient() as client:
-        async with client.stream("POST", url, json=payload) as response:
-            if response.status_code == 200:
-                async for data in response.aiter_bytes():
-                    if data:
-                        print(data.decode('utf-8'))
-            else:
-                print(f"Error: {response.status_code}")
-                print(await response.text())
-
-# Example usage
-if __name__ == "__main__":
-    url = "http://localhost:6979/v1/chat/completions"
-    payload = {
-        "messages": [{"role": "user", "content": "Hello!"}],
-        "model": "phi3-mini-int4",
-        "max_tokens": 80,
-        "temperature": 0.0,
-        "stream": True
-    }
-    asyncio.run(stream_chat_completion(url, payload))
+import httpx
+
+def chat_completion(url: str, payload: dict):
+    with httpx.Client() as client:
+        response = client.post(url, json=payload)
+        if response.status_code == 200:
+            print(response.text)
+        else:
+            print(f"Error: {response.status_code}")
+            print(response.text)
+
+# Example usage
+if __name__ == "__main__":
+    url = "http://localhost:6979/v1/chat/completions"
+    payload = {
+        "messages": [{"role": "user", "content": "Hello!"}],
+        "model": "phi3-mini-int4",
+        "max_tokens": 80,
+        "temperature": 0.0,
+        "stream": False  # Set stream to False
+    }
+    chat_completion(url, payload)
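Since the endpoint lives at /v1/chat/completions and `openai` is among the project dependencies, the same non-streaming request can presumably be issued with the standard OpenAI client. A minimal sketch, assuming the local server accepts a placeholder API key:

```python
from openai import OpenAI

# Point the stock OpenAI client at the local EmbeddedLLM server.
# The api_key value is a placeholder; whether it is validated is an assumption.
client = OpenAI(base_url="http://localhost:6979/v1", api_key="not-needed")

response = client.chat.completions.create(
    model="phi3-mini-int4",
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=80,
    temperature=0.0,
)
print(response.choices[0].message.content)
```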
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
import httpx
import asyncio
import time

async def stream_chat_completion(url: str, payload: dict):
    async with httpx.AsyncClient() as client:
        async with client.stream("POST", url, json=payload) as response:
            if response.status_code == 200:
                async for data in response.aiter_bytes():
                    if data:
                        print(data.decode('utf-8'))
                        # time.sleep(1)
            else:
                print(f"Error: {response.status_code}")
                print((await response.aread()).decode("utf-8"))  # a streamed error body must be read explicitly

# Example usage
if __name__ == "__main__":
    url = "http://localhost:6979/v1/chat/completions"
    payload = {
        "messages": [{"role": "user", "content": "Hello!"}],
        "model": "phi3-mini-int4",
        "max_tokens": 80,
        "temperature": 0.0,
        "stream": True
    }
    asyncio.run(stream_chat_completion(url, payload))
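For the streaming path, an equivalent sketch with the `openai` client under the same assumptions as above (base_url, placeholder api_key); the exact chunk payloads depend on the server's implementation:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:6979/v1", api_key="not-needed")

# Request a streamed completion and print deltas as they arrive.
stream = client.chat.completions.create(
    model="phi3-mini-int4",
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=80,
    temperature=0.0,
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()
```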
