Commit abaf4f2

Update vllm arguments
Author: xusenlin
1 parent 9c6fc00

3 files changed, +39 -11 lines

api/config.py

Lines changed: 25 additions & 11 deletions
```diff
@@ -1,5 +1,5 @@
 import os
-
+from loguru import logger
 import dotenv
 
 dotenv.load_dotenv()
@@ -11,29 +11,39 @@
     'MODEL_NAME': '',
     'MODEL_PATH': '',
     'ADAPTER_MODEL_PATH': '',
+
     'DEVICE': 'cuda',
     'DEVICE_MAP': "",
     'GPUS': '',
     'NUM_GPUs': 1,
-    'QUANTIZE': 16,
+
     'EMBEDDING_NAME': '',
-    'CONTEXT_LEN': '',
+    'EMBEDDING_SIZE': '',
+    'EMBEDDING_DEVICE': 'cuda',
+
+    'QUANTIZE': 16,
     'LOAD_IN_8BIT': 'False',
     'LOAD_IN_4BIT': 'False',
     'USING_PTUNING_V2': 'False',
+
+    'CONTEXT_LEN': '',
     'STREAM_INTERVERL': 2,
     'PROMPT_NAME': '',
+
     'PATCH_TYPE': '',
     'TRAINING_LENGTH': 4096,
     'WINDOW_SIZE': 512,
+
     'API_PREFIX': '/v1',
+
     'USE_VLLM': 'False',
     'TRUST_REMOTE_CODE': "False",
     'TOKENIZE_MODE': "auto",
     'TENSOR_PARALLEL_SIZE': 1,
     'DTYPE': "half",
-    'EMBEDDING_SIZE': '',
-    'EMBEDDING_DEVICE': 'cuda',
+    "GPU_MEMORY_UTILIZATION": 0.9,
+    "MAX_NUM_BATCHED_TOKENS": 5120,
+    "MAX_NUM_SEQS": 256,
 }
 
 
@@ -61,15 +71,19 @@ def __init__(self):
         self.GPUS = get_env('GPUS')
         self.NUM_GPUs = int(get_env('NUM_GPUs'))
 
-        self.QUANTIZE = int(get_env('QUANTIZE'))
         self.EMBEDDING_NAME = get_env('EMBEDDING_NAME') if get_env('EMBEDDING_NAME') else None
-        self.CONTEXT_LEN = int(get_env('CONTEXT_LEN')) if get_env('CONTEXT_LEN') else None
+        self.EMBEDDING_SIZE = int(get_env('EMBEDDING_SIZE')) if get_env('EMBEDDING_SIZE') else None
+        self.EMBEDDING_DEVICE = get_env('EMBEDDING_DEVICE')
+
+        self.QUANTIZE = int(get_env('QUANTIZE'))
         self.LOAD_IN_8BIT = get_bool_env('LOAD_IN_8BIT')
         self.LOAD_IN_4BIT = get_bool_env('LOAD_IN_4BIT')
         self.USING_PTUNING_V2 = get_bool_env('USING_PTUNING_V2')
 
+        self.CONTEXT_LEN = int(get_env('CONTEXT_LEN')) if get_env('CONTEXT_LEN') else None
         self.STREAM_INTERVERL = int(get_env('STREAM_INTERVERL'))
         self.PROMPT_NAME = get_env('PROMPT_NAME') if get_env('PROMPT_NAME') else None
+
         self.PATCH_TYPE = get_env('PATCH_TYPE') if get_env('PATCH_TYPE') else None
         self.TRAINING_LENGTH = int(get_env('TRAINING_LENGTH'))
         self.WINDOW_SIZE = int(get_env('WINDOW_SIZE'))
@@ -81,13 +95,13 @@ def __init__(self):
         self.TOKENIZE_MODE = get_env('TOKENIZE_MODE')
         self.TENSOR_PARALLEL_SIZE = int(get_env('TENSOR_PARALLEL_SIZE'))
         self.DTYPE = get_env('DTYPE')
-
-        self.EMBEDDING_SIZE = int(get_env('EMBEDDING_SIZE')) if get_env('EMBEDDING_SIZE') else None
-        self.EMBEDDING_DEVICE = get_env('EMBEDDING_DEVICE')
+        self.GPU_MEMORY_UTILIZATION = float(get_env('GPU_MEMORY_UTILIZATION'))
+        self.MAX_NUM_BATCHED_TOKENS = int(get_env('MAX_NUM_BATCHED_TOKENS'))
+        self.MAX_NUM_SEQS = int(get_env('MAX_NUM_SEQS'))
 
 
 config = Config()
-print(f"Config: {config.__dict__}")
+logger.debug(f"Config: {config.__dict__}")
 if config.GPUS:
     if len(config.GPUS.split(",")) < config.NUM_GPUs:
         raise ValueError(
```
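The settings above are resolved through `get_env`/`get_bool_env`, helpers defined elsewhere in `api/config.py` and untouched by this commit. A minimal sketch of how such helpers plausibly behave, assuming a defaults table like the dict in the diff (the names and fallback logic here are illustrative assumptions, not the repo's actual code):

```python
import os

# Illustrative defaults table; the real one is the dict shown in the diff above.
DEFAULTS = {
    'USE_VLLM': 'False',
    'GPU_MEMORY_UTILIZATION': 0.9,
    'MAX_NUM_BATCHED_TOKENS': 5120,
    'MAX_NUM_SEQS': 256,
}


def get_env(key):
    # Assumed behavior: prefer the process environment, else the default.
    return os.environ.get(key, DEFAULTS.get(key))


def get_bool_env(key):
    # Assumed behavior: boolean settings are stored as strings ('True'/'False').
    return str(get_env(key)).lower() == 'true'
```

With helpers of this shape, `float(get_env('GPU_MEMORY_UTILIZATION'))` works whether the value comes from the environment (a string) or from the defaults table (a number).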

api/models.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -69,6 +69,9 @@ def get_vllm_engine():
         trust_remote_code=config.TRUST_REMOTE_CODE,
         dtype=config.DTYPE,
         tensor_parallel_size=config.TENSOR_PARALLEL_SIZE,
+        gpu_memory_utilization=config.GPU_MEMORY_UTILIZATION,
+        max_num_batched_tokens=config.MAX_NUM_BATCHED_TOKENS,
+        max_num_seqs=config.MAX_NUM_SEQS,
     )
     engine = AsyncLLMEngine.from_engine_args(engine_args)
 
@@ -91,6 +94,7 @@ def get_vllm_engine():
     )
 
     engine_model_config = asyncio.run(engine.get_model_config())
+    engine.engine.scheduler_config.max_model_len = get_context_len(engine_model_config)
     engine.max_model_len = get_context_len(engine_model_config)
 
     return engine
```
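For orientation, a self-contained sketch of what the three new arguments configure when building a vLLM engine. The parameter names (`gpu_memory_utilization`, `max_num_batched_tokens`, `max_num_seqs`) are standard `AsyncEngineArgs` fields; the model path and literal values below are placeholders, not taken from this commit:

```python
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

# Placeholder values; in this repo they come from config.* (env-driven).
engine_args = AsyncEngineArgs(
    model="/path/to/model",        # local path or model id (placeholder)
    trust_remote_code=True,
    dtype="half",
    tensor_parallel_size=1,
    gpu_memory_utilization=0.9,    # fraction of GPU memory vLLM may claim
    max_num_batched_tokens=5120,   # token budget per scheduler iteration
    max_num_seqs=256,              # max sequences scheduled per iteration
)
engine = AsyncLLMEngine.from_engine_args(engine_args)
```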

docs/VLLM_SCRIPT.md

Lines changed: 10 additions & 0 deletions
```diff
@@ -23,6 +23,7 @@ pip uninstall transformer-engine -y
 
 ### Environment variable reference
 
+
 + `MODEL_NAME`: model name, e.g. `qwen` or `baichuan-13b-chat`
 
 
@@ -44,6 +45,15 @@ pip uninstall transformer-engine -y
 + `EMBEDDING_NAME` (optional): path to the embedding model files; `moka-ai/m3e-base` or `BAAI/bge-large-zh` is recommended
 
 
++ `GPU_MEMORY_UTILIZATION` (optional): fraction of `GPU` memory to use
+
+
++ `MAX_NUM_BATCHED_TOKENS` (optional): maximum `token` count per batch
+
+
++ `MAX_NUM_SEQS` (optional): batch size
+
+
 ### How to launch
 
 Choose one of the two methods below to start the model API service
```
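Because `api/config.py` loads settings via `dotenv`, the new variables can be supplied in a `.env` file. A hypothetical example using the defaults introduced by this commit:

```
USE_VLLM=True
GPU_MEMORY_UTILIZATION=0.9
MAX_NUM_BATCHED_TOKENS=5120
MAX_NUM_SEQS=256
```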
