Commit adcbef5: Update README.md
Parent: 1e37d0e
File tree: 12 files changed, +98 −58 lines

README.md

Lines changed: 43 additions & 4 deletions
@@ -16,7 +16,8 @@ Run local LLMs on iGPU and APU (AMD , Intel, and Qualcomm (Coming Soon))
 
 
 ## 🚀 Latest News
-- [2024/06] Support chat inference on iGPU, APU and CPU.
+- [2024/06] Support Phi-3 (mini, small, medium), Phi-3-Vision-Mini, Llama-2, Llama-3, Gemma (v1), Mistral v0.3, Starling-LM, Yi-1.5.
+- [2024/06] Support vision/chat inference on iGPU, APU, CPU and CUDA.
 
 
 ## Supported Models (Quick Start)

@@ -32,8 +33,46 @@ Run local LLMs on iGPU and APU (AMD , Intel, and Qualcomm (Coming Soon))
 | Phi3-medium-4k-instruct | 17B | 4096 | [microsoft/Phi-3-medium-4k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct-onnx-directml)|
 | Phi3-medium-128k-instruct | 17B | 128k | [microsoft/Phi-3-medium-128k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct-onnx-directml)|
 
+## Getting Started
 
-## Acknowledgements
-* Excellent open-source projects: [vLLM](https://github.com/vllm-project/vllm.git), [onnxruntime-genai](https://github.com/microsoft/onnxruntime-genai.git) and many others.
+### Installation
+
+#### From Source
+**Windows**
+1. Install embeddedllm package. `$env:ELLM_TARGET_DEVICE='directml'; pip install -e .`. Note: currently support `cpu`, `directml` and `cuda`.
+   - **DirectML:** `$env:ELLM_TARGET_DEVICE='directml'; pip install -e .[directml]`
+   - **CPU:** `$env:ELLM_TARGET_DEVICE='cpu'; pip install -e .[cpu]`
+   - **CUDA:** `$env:ELLM_TARGET_DEVICE='cuda'; pip install -e .[cuda]`
+
+**Linux**
+1. Install embeddedllm package. `ELLM_TARGET_DEVICE='directml' pip install -e .`. Note: currently support `cpu`, `directml` and `cuda`.
+   - **DirectML:** `ELLM_TARGET_DEVICE='directml' pip install -e .[directml]`
+   - **CPU:** `ELLM_TARGET_DEVICE='cpu' pip install -e .[cpu]`
+   - **CUDA:** `ELLM_TARGET_DEVICE='cuda' pip install -e .[cuda]`
+
+
+### Launch OpenAI API Compatible Server
+```
+usage: ellm_server.exe [-h] [--port int] [--host str] [--response_role str] [--uvicorn_log_level str]
+                       [--served_model_name str] [--model_path str] [--vision bool]
 
-* Thanks to all the [contributors](./docs/contributors.md).
+options:
+  -h, --help            show this help message and exit
+  --port int            Server port. (default: 6979)
+  --host str            Server host. (default: 0.0.0.0)
+  --response_role str   Server response role. (default: assistant)
+  --uvicorn_log_level str
+                        Uvicorn logging level. `debug`, `info`, `trace`, `warning`, `critical` (default: info)
+  --served_model_name str
+                        Model name. (default: phi3-mini-int4)
+  --model_path str      Path to model weights. (required)
+  --vision bool         Enable vision capability, only if model supports vision input. (default: False)
+```
+
+1. `ellm_server --model_path <path/to/model/weight>`.
+2. Example code to connect to the api server can be found in `scripts/python`.
+
+
+
+## Acknowledgements
+* Excellent open-source projects: [vLLM](https://github.com/vllm-project/vllm.git), [onnxruntime-genai](https://github.com/microsoft/onnxruntime-genai.git) and many others.
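
The new README points readers to `scripts/python` for client code. For context, here is a minimal client sketch, not taken from the repo, that uses the `openai` package (already listed in requirements-common.txt) against the defaults from the usage text above: port 6979, served model name `phi3-mini-int4`, and the `/v1/chat/completions` route added in api_server.py below. The placeholder API key is an assumption; adjust it if your deployment checks one.

```
# Hypothetical client sketch: chat with a local ellm_server instance
# started as `ellm_server --model_path <path/to/model/weight>`.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:6979/v1",  # default host/port from the usage text above
    api_key="EMPTY",                      # assumed placeholder value
)

response = client.chat.completions.create(
    model="phi3-mini-int4",  # default served_model_name
    messages=[{"role": "user", "content": "Give me one sentence about on-device LLMs."}],
    stream=False,
)
print(response.choices[0].message.content)
```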

requirements-common.txt

Lines changed: 3 additions & 2 deletions
@@ -3,8 +3,9 @@ fastapi~=0.110.0
 gunicorn~=21.2.0
 loguru~=0.7.2
 numpy~=1.26.4
-pydantic-settings>=2.2.1
-pydantic~=2.6.3
+pydantic-settings>=2.3.3
+pydantic-core~=2.18.4
+pydantic~=2.7.4
 loguru
 openai
 torch

requirements-cuda.txt

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
 onnxruntime-gpu~=1.18.0
-onnxruntime-genai-cuda~=0.2.0
+onnxruntime-genai-cuda~=0.3.0rc2

scripts/benchmark/benchmark_api_server.py

Whitespace-only changes.

scripts/python/httpx_client_vision.py

Lines changed: 0 additions & 1 deletion
@@ -20,7 +20,6 @@ def chat_completion(url: str, payload: dict):
 
 # Example usage
 if __name__ == "__main__":
-
     current_file_path = os.path.abspath(__file__)
     IMAGE_PATH = os.path.join(os.path.dirname(current_file_path), "..", "images", "catdog.png")
 

scripts/python/httpx_client_vision_stream.py

Lines changed: 0 additions & 1 deletion
@@ -20,7 +20,6 @@ async def stream_chat_completion(url: str, payload: dict):
 
 # Example usage
 if __name__ == "__main__":
-
     current_file_path = os.path.abspath(__file__)
     IMAGE_PATH = os.path.join(os.path.dirname(current_file_path), "..", "images", "catdog.png")
 

scripts/python/test_prompt_template.py

Lines changed: 0 additions & 16 deletions
This file was deleted.

setup.py

Lines changed: 24 additions & 9 deletions
@@ -2,21 +2,21 @@
 import os
 import re
 from typing import List
+import platform
 
 from setuptools import find_packages, setup
 
 ROOT_DIR = os.path.dirname(__file__)
 
-# # Custom function to check for DirectML support
-# def check_directml_support():
-#     if platform.system() != "Windows":
-#         raise RuntimeError("This package requires a Windows system with DirectML support.")
-#     # Add additional checks for DirectML support if necessary
 
-# # Run the check before proceeding with the setup
-# check_directml_support()
+ELLM_TARGET_DEVICE = os.environ.get("ELLM_TARGET_DEVICE", "directml")
 
-ELLM_TARGET_DEVICE = "cuda"
+
+# Custom function to check for DirectML support
+def check_directml_support():
+    if platform.system() != "Windows":
+        raise RuntimeError("This package requires a Windows system with DirectML support.")
+    # Add additional checks for DirectML support if necessary
 
 
 def read_readme() -> str:

@@ -29,6 +29,8 @@ def read_readme() -> str:
 
 
 def _is_directml() -> bool:
+    # Run the check before proceeding with the setup
+    check_directml_support()
     return ELLM_TARGET_DEVICE == "directml"
 
 

@@ -97,6 +99,8 @@ def get_ellm_version() -> str:
     return version
 
 
+print(get_requirements().extend(_read_requirements("requirements-common.txt")))
+
 setup(
     name="embeddedllm",
     version=get_ellm_version(),

@@ -120,9 +124,20 @@ def get_ellm_version() -> str:
         "License :: OSI Approved :: Apache Software License",
         "Topic :: Scientific/Engineering :: Artificial Intelligence",
     ],
-    install_requires=get_requirements().extend(_read_requirements("requirements-common.txt")),
+    install_requires=get_requirements()
+    + _read_requirements("requirements-common.txt")
+    + _read_requirements("requirements-build.txt"),
     # Add other metadata and dependencies as needed
     extras_require={
        "lint": _read_requirements("requirements-lint.txt"),
+        "cuda": ["onnxruntime-genai-cuda==0.3.0rc2"],
+    },
+    dependency_links=[
+        "https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-genai/pypi/simple/"
+    ],
+    entry_points={
+        "console_scripts": [
+            "ellm_server=embeddedllm.entrypoints.api_server:main",
+        ],
     },
 )
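
One detail worth calling out in the install_requires change: `list.extend()` mutates its receiver and returns `None`, so the old expression handed `None` to setuptools, while the new `+` concatenation builds the combined requirements list. A standalone illustration, not repo code:

```
# list.extend() vs. list concatenation, the root of the install_requires fix.
device_reqs = ["onnxruntime-gpu~=1.18.0"]
common_reqs = ["fastapi~=0.110.0"]

combined = device_reqs + common_reqs       # new list: what setup() needs for install_requires
broken = device_reqs.extend(common_reqs)   # extend() mutates device_reqs and returns None
print(combined)  # ['onnxruntime-gpu~=1.18.0', 'fastapi~=0.110.0']
print(broken)    # None -- what the old install_requires evaluated to
```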

src/embeddedllm/engine.py

Lines changed: 0 additions & 3 deletions
@@ -36,7 +36,6 @@ def onnx_generator_context(model, params):
 
 
 class EmbeddedLLMEngine:
-
    def __init__(self, model_path: str, vision: bool):
        self.model_path = model_path
        self.model_config = AutoConfig.from_pretrained(self.model_path, trust_remote_code=True)

@@ -103,10 +102,8 @@ async def generate_vision(
        request_id: str,
        stream: bool = True,
    ) -> AsyncIterator[RequestOutput]:
-
        prompt_text = inputs["prompt"]
        # print(f"inputs: {str(inputs)}")
-        print(inputs.keys())
        input_tokens = self.onnx_tokenizer.encode(prompt_text)
        # logger.debug(f"inputs: {str(inputs)}")
        # logger.debug(f'inputs["multi_model_data"]: {str(inputs.multi_model_data)}')

src/embeddedllm/entrypoints/api_server.py

Lines changed: 22 additions & 12 deletions
@@ -4,6 +4,7 @@
 from fastapi.exceptions import RequestValidationError
 from fastapi.responses import JSONResponse, Response, StreamingResponse
 from pydantic_settings import BaseSettings, SettingsConfigDict
+from pydantic import Field
 
 from embeddedllm.entrypoints.chat_server import OpenAPIChatServer
 from embeddedllm.protocol import (  # noqa: E501

@@ -18,14 +19,21 @@
 
 
 class Config(BaseSettings):
-    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")
-    port: int = 6979
-    host: str = "0.0.0.0"
-    response_role: str = "assistant"
-    uvicorn_log_level: str = "info"
-    served_model_name: str = "phi3-mini-int4"
-    model_path: str = None
-    vision: bool = False
+    model_config = SettingsConfigDict(
+        env_file=".env", env_file_encoding="utf-8", extra="ignore", cli_parse_args=True
+    )
+    port: int = Field(default=6979, description="Server port.")
+    host: str = Field(default="0.0.0.0", description="Server host.")
+    response_role: str = Field(default="assistant", description="Server response role.")
+    uvicorn_log_level: str = Field(
+        default="info",
+        description="Uvicorn logging level. `debug`, `info`, `trace`, `warning`, `critical`",
+    )
+    served_model_name: str = Field(default="phi3-mini-int4", description="Model name.")
+    model_path: str = Field(description="Path to model weights.")
+    vision: bool = Field(
+        default=False, description="Enable vision capability, only if model supports vision input."
+    )
 
 
 config = Config()

@@ -52,20 +60,18 @@ async def show_available_models():
 
 @app.post("/v1/chat/completions")
 async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request):
-
     generator = await openai_chat_server.create_chat_completion(request, raw_request)
     if isinstance(generator, ErrorResponse):
         return JSONResponse(content=generator.model_dump(), status_code=generator.code)
     if request.stream:
         return StreamingResponse(content=generator, media_type="text/event-stream")
     else:
-        # return JSONResponse(content="Non-streaming Chat Generation Yet to be Implemented.",
-        #                     status_code=404)
         assert isinstance(generator, ChatCompletionResponse)
         return JSONResponse(content=generator.model_dump())
 
 
-if __name__ == "__main__":
+def main():
+    global openai_chat_server
     import os
 
     import uvicorn

@@ -90,3 +96,7 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
     uvicorn.run(
         app, host=config.host, port=config.port, log_level=config.uvicorn_log_level, loop="asyncio"
     )
+
+
+if __name__ == "__main__":
+    main()
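
For readers unfamiliar with the pattern above: with `cli_parse_args=True`, pydantic-settings builds the command-line interface shown in the README's usage text directly from the `Field` declarations, with environment variables and the `.env` file as fallbacks. A stripped-down, standalone sketch of that behavior follows; the file name and fields are illustrative, not the repo's module.

```
# sketch_settings.py -- minimal pydantic-settings CLI example (assumed standalone script).
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict


class Config(BaseSettings):
    model_config = SettingsConfigDict(env_file=".env", extra="ignore", cli_parse_args=True)

    port: int = Field(default=6979, description="Server port.")
    model_path: str = Field(description="Path to model weights.")  # required: no default


if __name__ == "__main__":
    # e.g. `python sketch_settings.py --model_path ./phi3-mini --port 8080`
    cfg = Config()  # parses sys.argv, then environment variables, then .env
    print(cfg.port, cfg.model_path)
```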
