Commit 4a8af77

Author: xusenlin
Support for Text Generation Inference (TGI)
1 parent 69c55fd commit 4a8af77

File tree

20 files changed: +672 -40 lines changed

api/config.py

Lines changed: 6 additions & 0 deletions
@@ -239,6 +239,12 @@ class Settings(BaseModel):
         description="RoPE frequency scaling factor",
     )
 
+    # support for tgi
+    tgi_endpoint: Optional[str] = Field(
+        default=get_env("TGI_ENDPOINT", None),
+        description="Text Generation Inference endpoint.",
+    )
+
 
 SETTINGS = Settings()
 logger.debug(f"SETTINGS: {model_json(SETTINGS, indent=4)}")

api/core/default.py

Lines changed: 4 additions & 4 deletions
@@ -397,7 +397,7 @@ def _create_chat_completion_stream(self, params: Dict[str, Any]) -> Iterator:
         finish_reason=None,
     )
     yield ChatCompletionChunk(
-        id=_id,
+        id=f"chat{_id}",
         choices=[choice],
         created=_created,
         model=_model,
@@ -443,7 +443,7 @@ def _create_chat_completion_stream(self, params: Dict[str, Any]) -> Iterator:
         finish_reason=finish_reason
     )
     yield ChatCompletionChunk(
-        id=_id,
+        id=f"chat{_id}",
         choices=[choice],
         created=_created,
         model=_model,
@@ -457,7 +457,7 @@ def _create_chat_completion_stream(self, params: Dict[str, Any]) -> Iterator:
         finish_reason="stop"
     )
     yield ChatCompletionChunk(
-        id=_id,
+        id=f"chat{_id}",
         choices=[choice],
         created=_created,
         model=_model,
@@ -521,7 +521,7 @@ def _create_chat_completion(self, params: Dict[str, Any]) -> Union[ChatCompletio
     )
     usage = model_parse(CompletionUsage, last_output["usage"])
     return ChatCompletion(
-        id=last_output["id"],
+        id=f"chat{last_output['id']}",
         choices=[choice],
         created=last_output["created"],
         model=last_output["model"],
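
For illustration only (not from the repository): the change namespaces chat-completion ids with a `chat` prefix so they can be told apart from plain completion ids; the raw id below is made up.

```python
_id = "cmpl-9f2c1e"        # hypothetical id produced by the engine
chunk_id = f"chat{_id}"    # id now emitted in ChatCompletionChunk / ChatCompletion
print(chunk_id)            # -> chatcmpl-9f2c1e
```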

api/core/llama_cpp_engine.py

Lines changed: 2 additions & 2 deletions
@@ -129,7 +129,7 @@ def _create_chat_completion_stream(self, prompt, **kwargs) -> Iterator:
         finish_reason=None,
     )
     yield ChatCompletionChunk(
-        id=_id,
+        id=f"chat{_id}",
         choices=[choice],
         created=_created,
         model=_model,
@@ -147,7 +147,7 @@ def _create_chat_completion_stream(self, prompt, **kwargs) -> Iterator:
         finish_reason=output["choices"][0]["finish_reason"],
     )
     yield ChatCompletionChunk(
-        id=_id,
+        id=f"chat{_id}",
         choices=[choice],
         created=_created,
         model=_model,

api/core/tgi.py

Lines changed: 257 additions & 0 deletions
@@ -0,0 +1,257 @@
import json
from typing import Optional, List, AsyncIterator

from aiohttp import ClientSession
from openai.types.chat import ChatCompletionMessageParam
from pydantic import ValidationError
from text_generation import AsyncClient
from text_generation.errors import parse_error
from text_generation.types import Request, Parameters
from text_generation.types import Response, StreamResponse

from api.adapter import get_prompt_adapter
from api.utils.compat import model_dump


class TGIEngine:
    def __init__(
        self,
        model: AsyncClient,
        model_name: str,
        prompt_name: Optional[str] = None,
    ):
        """
        Initializes the TGIEngine object.

        Args:
            model: The text-generation-inference AsyncClient object.
            model_name: The name of the model.
            prompt_name: The name of the prompt (optional).
        """
        self.model = model
        self.model_name = model_name.lower()
        self.prompt_name = prompt_name.lower() if prompt_name is not None else None
        self.prompt_adapter = get_prompt_adapter(self.model_name, prompt_name=self.prompt_name)

    def apply_chat_template(
        self, messages: List[ChatCompletionMessageParam],
    ) -> str:
        """
        Applies a chat template to the given messages and returns the processed output.

        Args:
            messages: A list of ChatCompletionMessageParam objects representing the chat messages.

        Returns:
            str: The processed output as a string.
        """
        return self.prompt_adapter.apply_chat_template(messages)

    async def generate(
        self,
        prompt: str,
        do_sample: bool = True,
        max_new_tokens: int = 20,
        best_of: Optional[int] = None,
        repetition_penalty: Optional[float] = None,
        return_full_text: bool = False,
        seed: Optional[int] = None,
        stop_sequences: Optional[List[str]] = None,
        temperature: Optional[float] = None,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        truncate: Optional[int] = None,
        typical_p: Optional[float] = None,
        watermark: bool = False,
        decoder_input_details: bool = True,
        top_n_tokens: Optional[int] = None,
    ) -> Response:
        """
        Given a prompt, generate the following text asynchronously.

        Args:
            prompt (`str`):
                Input text
            do_sample (`bool`):
                Activate logits sampling
            max_new_tokens (`int`):
                Maximum number of generated tokens
            best_of (`int`):
                Generate best_of sequences and return the one with the highest token logprobs
            repetition_penalty (`float`):
                The parameter for repetition penalty. 1.0 means no penalty. See [this
                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
            return_full_text (`bool`):
                Whether to prepend the prompt to the generated text
            seed (`int`):
                Random sampling seed
            stop_sequences (`List[str]`):
                Stop generating tokens if a member of `stop_sequences` is generated
            temperature (`float`):
                The value used to modulate the logits distribution.
            top_k (`int`):
                The number of the highest probability vocabulary tokens to keep for top-k-filtering.
            top_p (`float`):
                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
                higher are kept for generation.
            truncate (`int`):
                Truncate input tokens to the given size
            typical_p (`float`):
                Typical Decoding mass
                See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
            watermark (`bool`):
                Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
            decoder_input_details (`bool`):
                Return the decoder input token logprobs and ids
            top_n_tokens (`int`):
                Return the `n` most likely tokens at each step

        Returns:
            Response: generated response
        """
        # Validate parameters
        parameters = Parameters(
            best_of=best_of,
            details=True,
            decoder_input_details=decoder_input_details,
            do_sample=do_sample,
            max_new_tokens=max_new_tokens,
            repetition_penalty=repetition_penalty,
            return_full_text=return_full_text,
            seed=seed,
            stop=stop_sequences if stop_sequences is not None else [],
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            truncate=truncate,
            typical_p=typical_p,
            watermark=watermark,
            top_n_tokens=top_n_tokens,
        )
        request = Request(inputs=prompt, stream=False, parameters=parameters)

        async with ClientSession(
            headers=self.model.headers, cookies=self.model.cookies, timeout=self.model.timeout
        ) as session:
            async with session.post(f"{self.model.base_url}/generate", json=model_dump(request)) as resp:
                payload = await resp.json()

                if resp.status != 200:
                    raise parse_error(resp.status, payload)
                return Response(**payload)

    async def generate_stream(
        self,
        prompt: str,
        do_sample: bool = False,
        max_new_tokens: int = 20,
        best_of: Optional[int] = 1,
        repetition_penalty: Optional[float] = None,
        return_full_text: bool = False,
        seed: Optional[int] = None,
        stop_sequences: Optional[List[str]] = None,
        temperature: Optional[float] = None,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        truncate: Optional[int] = None,
        typical_p: Optional[float] = None,
        watermark: bool = False,
        top_n_tokens: Optional[int] = None,
    ) -> AsyncIterator[StreamResponse]:
        """
        Given a prompt, generate the following stream of tokens asynchronously.

        Args:
            prompt (`str`):
                Input text
            do_sample (`bool`):
                Activate logits sampling
            max_new_tokens (`int`):
                Maximum number of generated tokens
            best_of (`int`):
                Generate best_of sequences and return the one with the highest token logprobs
            repetition_penalty (`float`):
                The parameter for repetition penalty. 1.0 means no penalty. See [this
                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
            return_full_text (`bool`):
                Whether to prepend the prompt to the generated text
            seed (`int`):
                Random sampling seed
            stop_sequences (`List[str]`):
                Stop generating tokens if a member of `stop_sequences` is generated
            temperature (`float`):
                The value used to modulate the logits distribution.
            top_k (`int`):
                The number of the highest probability vocabulary tokens to keep for top-k-filtering.
            top_p (`float`):
                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
                higher are kept for generation.
            truncate (`int`):
                Truncate input tokens to the given size
            typical_p (`float`):
                Typical Decoding mass
                See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
            watermark (`bool`):
                Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
            top_n_tokens (`int`):
                Return the `n` most likely tokens at each step

        Returns:
            AsyncIterator: stream of generated tokens
        """
        # Validate parameters
        parameters = Parameters(
            best_of=best_of,
            details=True,
            do_sample=do_sample,
            max_new_tokens=max_new_tokens,
            repetition_penalty=repetition_penalty,
            return_full_text=return_full_text,
            seed=seed,
            stop=stop_sequences if stop_sequences is not None else [],
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            truncate=truncate,
            typical_p=typical_p,
            watermark=watermark,
            top_n_tokens=top_n_tokens,
        )
        request = Request(inputs=prompt, parameters=parameters)

        async with ClientSession(
            headers=self.model.headers, cookies=self.model.cookies, timeout=self.model.timeout
        ) as session:
            async with session.post(f"{self.model.base_url}/generate_stream", json=model_dump(request)) as resp:
                if resp.status != 200:
                    raise parse_error(resp.status, await resp.json())

                # Parse ServerSentEvents
                async for byte_payload in resp.content:
                    # Skip line
                    if byte_payload == b"\n":
                        continue

                    payload = byte_payload.decode("utf-8")

                    # Event data
                    if payload.startswith("data:"):
                        # Decode payload
                        json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
                        # Parse payload
                        try:
                            response = StreamResponse(**json_payload)
                        except ValidationError:
                            # If we failed to parse the payload, then it is an error payload
                            raise parse_error(resp.status, json_payload)
                        yield response

    @property
    def stop(self):
        """
        Gets the stop property of the prompt adapter.

        Returns:
            The stop property of the prompt adapter, or None if it does not exist.
        """
        return self.prompt_adapter.stop if hasattr(self.prompt_adapter, "stop") else None
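
Below is a short usage sketch that is not part of the commit: it shows how the new `TGIEngine` could be driven end to end, assuming a TGI server is reachable at the placeholder URL and that the placeholder model name resolves to one of the project's prompt adapters.

```python
# Hypothetical driver for api/core/tgi.py; URL, model name, and prompt are placeholders.
import asyncio

from text_generation import AsyncClient

from api.core.tgi import TGIEngine


async def main() -> None:
    client = AsyncClient("http://127.0.0.1:8080")              # address of a running TGI server
    engine = TGIEngine(client, model_name="llama-2-7b-chat")   # name must match a prompt adapter

    # Build the raw prompt from OpenAI-style chat messages.
    prompt = engine.apply_chat_template([{"role": "user", "content": "Hello!"}])

    # One-shot generation: returns a text_generation.types.Response.
    response = await engine.generate(prompt, max_new_tokens=64, temperature=0.7)
    print(response.generated_text)

    # Streaming generation: yields text_generation.types.StreamResponse objects.
    async for chunk in engine.generate_stream(prompt, max_new_tokens=64, do_sample=True):
        if not chunk.token.special:
            print(chunk.token.text, end="", flush=True)
    print()


if __name__ == "__main__":
    asyncio.run(main())
```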

api/core/vllm_engine.py

Lines changed: 7 additions & 0 deletions
@@ -140,6 +140,13 @@ def generate(self, params: Dict[str, Any], request_id: str) -> AsyncIterator:
         stop=params.get("stop", []),
         stop_token_ids=params.get("stop_token_ids", []),
         max_tokens=params.get("max_tokens", 256),
+        repetition_penalty=params.get("repetition_penalty", 1.03),
+        min_p=params.get("min_p", 0.0),
+        best_of=params.get("best_of", 1),
+        ignore_eos=params.get("ignore_eos", False),
+        use_beam_search=params.get("use_beam_search", False),
+        skip_special_tokens=params.get("skip_special_tokens", True),
+        spaces_between_special_tokens=params.get("spaces_between_special_tokens", True),
     )
     result_generator = self.model.generate(
         prompt_or_messages if isinstance(prompt_or_messages, str) else None,
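
The hunk above forwards more request fields into vLLM's SamplingParams. As a standalone illustration (not from the repository), assuming a vLLM release contemporary with this commit in which `best_of` and `use_beam_search` are still accepted arguments, the equivalent construction looks like this; the values are arbitrary examples except where they mirror the fallbacks above:

```python
# Sketch of the SamplingParams call that the updated generate() effectively makes.
from vllm import SamplingParams

sampling_params = SamplingParams(
    temperature=0.7,                     # example request value
    top_p=0.9,                           # example request value
    stop=[],                             # stop strings from the request
    max_tokens=256,                      # fallback used above
    repetition_penalty=1.03,             # new: fallback used above
    min_p=0.0,                           # new
    best_of=1,                           # new
    ignore_eos=False,                    # new
    use_beam_search=False,               # new
    skip_special_tokens=True,            # new
    spaces_between_special_tokens=True,  # new
)
```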

api/llama_cpp_routes/chat.py

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ async def create_chat_completion(
 
     prompt = engine.apply_chat_template(request.messages, request.functions, request.tools)
     include = {
-        "temperature", "temperature", "top_p", "stream", "stop",
+        "temperature", "top_p", "stream", "stop",
        "max_tokens", "presence_penalty", "frequency_penalty", "model"
     }
     kwargs = model_dump(request, include=include)
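
Worth noting (illustration, not part of the diff): because Python set literals deduplicate, the repeated "temperature" key never changed behaviour, so this fix is purely cosmetic.

```python
# Duplicate elements in a set literal collapse silently.
print({"temperature", "temperature", "top_p"} == {"temperature", "top_p"})  # True
```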

api/llama_cpp_routes/completion.py

Lines changed: 2 additions & 2 deletions
@@ -30,10 +30,10 @@ async def create_completion(
     request.prompt = request.prompt[0] if len(request.prompt) > 0 else ""
 
     request.max_tokens = request.max_tokens or 256
-    request, stop_token_ids = await handle_request(request, engine.stop)
+    request = await handle_request(request, engine.stop)
 
     include = {
-        "temperature", "temperature", "top_p", "stream", "stop",
+        "temperature", "top_p", "stream", "stop",
        "max_tokens", "presence_penalty", "frequency_penalty", "model"
     }
     kwargs = model_dump(request, include=include)

api/models.py

Lines changed: 16 additions & 0 deletions
@@ -131,6 +131,20 @@ def create_llama_cpp_engine():
     return LlamaCppEngine(engine, SETTINGS.model_name, SETTINGS.chat_template)
 
 
+def create_tgi_engine():
+    """ get TGI generate engine for chat or completion. """
+    try:
+        from text_generation import AsyncClient
+        from api.core.tgi import TGIEngine
+    except ImportError:
+        return None
+
+    client = AsyncClient(SETTINGS.tgi_endpoint)
+    logger.info("Using TGI engine")
+
+    return TGIEngine(client, SETTINGS.model_name, SETTINGS.chat_template)
+
+
 # fastapi app
 app = create_app()
 
@@ -145,6 +159,8 @@ def create_llama_cpp_engine():
     GENERATE_ENGINE = create_vllm_engine()
 elif SETTINGS.engine == "llama.cpp":
     GENERATE_ENGINE = create_llama_cpp_engine()
+elif SETTINGS.engine == "tgi":
+    GENERATE_ENGINE = create_tgi_engine()
 else:
     GENERATE_ENGINE = None
 
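
A hedged end-to-end sketch of selecting the new backend (not part of the commit). The `ENGINE` and `MODEL_NAME` environment-variable names are assumptions inferred from the `get_env("TGI_ENDPOINT", ...)` pattern in api/config.py, and all values are placeholders:

```python
import os

# Assumed variable names; set them before importing api.models so SETTINGS picks them up.
os.environ["ENGINE"] = "tgi"
os.environ["MODEL_NAME"] = "llama-2-7b-chat"
os.environ["TGI_ENDPOINT"] = "http://127.0.0.1:8080"

from api.models import GENERATE_ENGINE  # noqa: E402

# With the text_generation client installed, the dispatch above yields a TGIEngine.
print(type(GENERATE_ENGINE).__name__)
```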
