[LLM INFER] Fix some bugs and chatglm_v2 support block_attn #9271
Changes from all commits:
```diff
@@ -134,16 +134,16 @@ class PredictorArgument:
         },
     )
 
-    @property
-    def total_max_length(self):
-        if self.device == "npu":
-            return self.src_length + self.max_length
-        else:
-            return 8192  # Maximum sequence length.
+    total_max_length: int = field(
+        default=4096, metadata={"help": "Super parameter. Maximum sequence length(encoder+decoder)."}
+    )
+
+    def __post_init__(self):
+        if self.append_attn:
+            self.block_attn = True
+        assert (
+            self.src_length + self.max_length <= self.total_max_length
+        ), "src_length + max_length should smaller than total_max_length."
 
 
 @dataclass
```
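For readers outside the codebase, the hunk above replaces a computed property with an explicit, user-settable field plus a `__post_init__` check. A minimal, self-contained sketch of that pattern, using a hypothetical `ExampleArgs` class rather than the real `PredictorArgument`:

```python
from dataclasses import dataclass, field


@dataclass
class ExampleArgs:  # hypothetical stand-in for PredictorArgument
    src_length: int = 1024    # prompt (encoder) token budget
    max_length: int = 1024    # generation (decoder) token budget
    append_attn: bool = False
    block_attn: bool = False
    total_max_length: int = field(
        default=4096, metadata={"help": "Maximum sequence length (encoder + decoder)."}
    )

    def __post_init__(self):
        # append_attn implies the block-attention code path.
        if self.append_attn:
            self.block_attn = True
        # The prompt plus the generation budget must fit inside the configured window.
        assert (
            self.src_length + self.max_length <= self.total_max_length
        ), "src_length + max_length should be smaller than total_max_length."


args = ExampleArgs(src_length=3072, max_length=1024)  # fine: 3072 + 1024 <= 4096
# ExampleArgs(src_length=4096, max_length=1024) would raise an AssertionError.
```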
```diff
@@ -520,7 +520,7 @@ def _preprocess(self, source):
             alibi_slopes = llm_utils.get_alibi_slopes(self.model_config.n_head)
             inputs["position_ids"] = paddle.to_tensor(alibi_slopes, dtype="float32")
             arange_tensor_encoder = paddle.arange(self.config.total_max_length, dtype=self.config.dtype)
-            alibi = alibi_slopes[None, :, None, None] * arange_tensor_encoder
+            alibi = (alibi_slopes[None, :, None, None] * arange_tensor_encoder).astype(self.config.dtype)
             if self.model_config.tensor_parallel_degree > 1:
                 block_size = self.model_config.n_head // self.model_config.tensor_parallel_degree
```

**Contributor** (on the `.astype(self.config.dtype)` line): Hmm, is relying on `config.dtype` here safe? Users can change that value. How about using the dtype of one of the tensors instead?

**Collaborator (author):** This dtype does need to stay consistent with `config.dtype`.
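The added `.astype(self.config.dtype)` matters because the slopes come back as float32 while the attention kernels expect the bias in the model's compute dtype (typically float16 or bfloat16). Below is a rough, standalone sketch of the broadcast-and-cast pattern; the inline `alibi_slopes` helper is a simplified stand-in for `llm_utils.get_alibi_slopes` and only handles power-of-two head counts:

```python
import math

import paddle


def alibi_slopes(n_heads):
    # Classic ALiBi slope schedule; valid only when n_heads is a power of two (illustration).
    start = 2 ** (-(2 ** -(math.log2(n_heads) - 3)))
    return [start * (start ** i) for i in range(n_heads)]


n_head, total_max_length, model_dtype = 8, 4096, "float16"

slopes = paddle.to_tensor(alibi_slopes(n_head), dtype="float32")  # [n_head]
positions = paddle.arange(total_max_length, dtype="float32")      # [total_max_length]

# Broadcast slopes over positions, then cast so the bias matches the model's compute dtype.
alibi = (slopes[None, :, None, None] * positions).astype(model_dtype)
print(alibi.shape, alibi.dtype)  # [1, 8, 1, 4096], paddle.float16
```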
```diff
@@ -1352,13 +1352,19 @@ def create_predictor(
                 predictor_args.model_name_or_path, config=config, dtype=predictor_args.dtype
             )
             model.eval()
 
         elif "chatglmv2forcausallm" in config.architectures[0].lower():
-            from paddlenlp.experimental.transformers import (
-                ChatGLMv2ForCausalLMInferenceModel as Model,
-            )
-
-            model = Model.from_pretrained(
+            predictor_args.total_max_length = config.seq_length
+            if predictor_args.block_attn:
+                config.block_size = predictor_args.block_size
+                config.max_seq_len = predictor_args.total_max_length
+                from paddlenlp.experimental.transformers import (
+                    ChatGLMv2ForCausalLMBlockInferenceModel as ChatGLMv2InferenceModel,
+                )
+            else:
+                from paddlenlp.experimental.transformers import (
+                    ChatGLMv2ForCausalLMInferenceModel as ChatGLMv2InferenceModel,
+                )
+            model = ChatGLMv2InferenceModel.from_pretrained(
                 predictor_args.model_name_or_path, config=config, dtype=predictor_args.dtype
             )
             model.eval()
```

**Contributor** (on the `if predictor_args.block_attn:` line): Hmm, I would suggest putting `block_attn` into the config as an attribute and letting `ChatGLMv2InferenceModel` handle it internally.

**Collaborator (author):** Strictly speaking this does not belong in each model's config; if we added it to something like `LlamaConfig`, every model's config would need the same field. Let's keep it this way for now and look for a better approach when this code is refactored.
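The shape of this change is a small dispatch: `block_attn` decides which inference class gets imported, and the block branch also has to push `block_size` and `max_seq_len` into the model config before `from_pretrained` runs. A stripped-down sketch of that pattern with hypothetical stand-in classes (not the actual PaddleNLP ones):

```python
from dataclasses import dataclass


@dataclass
class DummyConfig:          # hypothetical stand-in for the ChatGLMv2 config
    seq_length: int = 4096
    block_size: int = 0
    max_seq_len: int = 0


@dataclass
class DummyArgs:            # hypothetical stand-in for PredictorArgument
    block_attn: bool = False
    block_size: int = 64
    total_max_length: int = 4096


class DenseCacheModel:      # stand-in for ChatGLMv2ForCausalLMInferenceModel
    @classmethod
    def from_pretrained(cls, config):
        return cls()


class BlockCacheModel:      # stand-in for ChatGLMv2ForCausalLMBlockInferenceModel
    @classmethod
    def from_pretrained(cls, config):
        # The block-attention path reads these two fields off the config.
        assert config.block_size > 0 and config.max_seq_len > 0
        return cls()


def build_model(args: DummyArgs, config: DummyConfig):
    # Mirrors the branch added in create_predictor: pick the class by flag,
    # and only the block branch writes the extra config fields.
    args.total_max_length = config.seq_length
    if args.block_attn:
        config.block_size = args.block_size
        config.max_seq_len = args.total_max_length
        model_cls = BlockCacheModel
    else:
        model_cls = DenseCacheModel
    return model_cls.from_pretrained(config)


print(type(build_model(DummyArgs(block_attn=True), DummyConfig())).__name__)  # BlockCacheModel
```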
```diff
@@ -1522,19 +1528,19 @@ def create_predictor(
                 config, predictor_args.batch_size, predictor_args.total_max_length
             )
         elif "chatglmv2forcausallm" in config.architectures[0].lower():
-            from paddlenlp.experimental.transformers import (
-                ChatGLMv2ForCausalLMInferenceModel,
-            )
-
-            cache_kvs_shape = ChatGLMv2ForCausalLMInferenceModel.get_cache_kvs_shape(
-                config, predictor_args.batch_size, predictor_args.total_max_length
-            )
-        elif "chatglmv2forcausallm" in config.architectures[0].lower():
-            from paddlenlp.experimental.transformers import (
-                ChatGLMv2ForCausalLMInferenceModel,
-            )
+            predictor_args.total_max_length = config.seq_length
+            if predictor_args.block_attn:
+                config.block_size = predictor_args.block_size
+                config.max_seq_len = predictor_args.total_max_length
+                from paddlenlp.experimental.transformers import (
+                    ChatGLMv2ForCausalLMBlockInferenceModel as ChatGLMv2InferenceModel,
+                )
+            else:
+                from paddlenlp.experimental.transformers import (
+                    ChatGLMv2ForCausalLMInferenceModel as ChatGLMv2InferenceModel,
+                )
 
-            cache_kvs_shape = ChatGLMv2ForCausalLMInferenceModel.get_cache_kvs_shape(
+            cache_kvs_shape = ChatGLMv2InferenceModel.get_cache_kvs_shape(
                 config, predictor_args.batch_size, predictor_args.total_max_length
             )
         elif "chatglmforcausallm" in config.architectures[0].lower():
```
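This hunk also drops a duplicated `elif "chatglmv2forcausallm"` branch and applies the same dispatch to `get_cache_kvs_shape`, which is where `block_size` and `max_seq_len` actually change the cache layout. The shapes below are illustrative only, not the exact layout PaddleNLP returns; they just show why a block (paged) KV cache is sized in blocks rather than in tokens:

```python
def dense_cache_shape(batch_size, num_layers, num_kv_heads, max_seq_len, head_dim):
    # One contiguous K/V buffer per layer, sized for the full sequence up front.
    return [[2, batch_size, num_kv_heads, max_seq_len, head_dim] for _ in range(num_layers)]


def block_cache_shape(batch_size, num_layers, num_kv_heads, max_seq_len, head_dim, block_size):
    # Block attention slices each sequence into fixed-size blocks that can be
    # allocated on demand, so the cache is sized in blocks, not in tokens.
    max_blocks = batch_size * -(-max_seq_len // block_size)  # ceiling division
    return [[2, max_blocks, num_kv_heads, block_size, head_dim] for _ in range(num_layers)]


print(dense_cache_shape(2, 28, 2, 4096, 128)[0])      # [2, 2, 2, 4096, 128]
print(block_cache_shape(2, 28, 2, 4096, 128, 64)[0])  # [2, 128, 2, 64, 128]
```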
**Reviewer:** Has this been confirmed with the colleagues working on NPU support?

**Author:** Confirmed, no problem.