@@ -647,8 +647,12 @@ def _create_predictor(self, predictor_args: PredictorArgument):
         if predictor_args.dtype == "bfloat16":
             config.delete_pass("gpu_cpu_map_matmul_v2_to_matmul_pass")
 
-        device_id = int(os.environ.get("FLAGS_selected_gpus", 0))
-        config.enable_use_gpu(100, device_id)
+        if predictor_args.device in paddle.device.get_all_custom_device_type():
+            device_id = int(os.environ.get("FLAGS_selected_{}s".format(predictor_args.device), 0))
+            config.enable_custom_device(predictor_args.device, device_id)
+        else:
+            device_id = int(os.environ.get("FLAGS_selected_gpus", 0))
+            config.enable_use_gpu(100, device_id)
         config.enable_new_executor()
 
         if self.tensor_parallel_degree > 1:
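Review note: this hunk replaces the hard-coded CUDA path with a dispatch on `paddle.device.get_all_custom_device_type()`. A minimal standalone sketch of the same pattern, assuming a Paddle build with a custom-device plugin registered (the helper name is illustrative, not part of this PR):

```python
import os

import paddle


def select_inference_device(config, device, memory_pool_mb=100):
    # Illustrative helper. Custom-device plugins export
    # FLAGS_selected_<device>s (e.g. FLAGS_selected_npus) analogously to the
    # FLAGS_selected_gpus set by CUDA launchers, so the per-rank device id
    # can be read from the environment on both paths.
    if device in paddle.device.get_all_custom_device_type():
        device_id = int(os.environ.get("FLAGS_selected_{}s".format(device), 0))
        config.enable_custom_device(device, device_id)
    else:
        device_id = int(os.environ.get("FLAGS_selected_gpus", 0))
        config.enable_use_gpu(memory_pool_mb, device_id)  # pool size in MB
    return config
```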
@@ -793,6 +797,8 @@ def __init__(self, config: PredictorArgument, tokenizer: PretrainedTokenizer):
         self.free_list = [i for i in range(self.max_block_nums)][::-1]
         self.used_list = [[] for _ in range(config.batch_size)]
 
+        self.benchmark = config.benchmark
+
     def init_inputs(self, config: PredictorArgument):
         self.inputs = {}
 
@@ -909,19 +915,20 @@ def _get_rotary_position_embedding(self, position_ids, head_dim):
         return rot_emb
 
     def _preprocess(self, source):
-        if self.tokenizer.chat_template is not None:
+        if not self.benchmark and self.tokenizer.chat_template is not None:
            source = [source] if isinstance(source, str) else source
            source = [self.tokenizer.apply_chat_template(sentence, tokenize=False) for sentence in source]
 
         for i, text in enumerate(source):
+            add_special_tokens = self.tokenizer.chat_template is None or isinstance(self.tokenizer, (ChatGLMv2Tokenizer, ChatGLMTokenizer))
+            add_special_tokens = add_special_tokens if not self.benchmark else False
             tokens = self.tokenizer(
                 text,
                 return_tensors="np",
                 padding=True,
                 max_length=self.config.src_length,
                 # if a chat_template is used, special tokens were already added by apply_chat_template
-                add_special_tokens=self.tokenizer.chat_template is None
-                or isinstance(self.tokenizer, (ChatGLMv2Tokenizer, ChatGLMTokenizer)),
+                add_special_tokens=add_special_tokens,
             )
             input_ids = tokens["input_ids"][0]
             length = len(input_ids)
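The two-step `add_special_tokens` computation above reads more easily pulled out into a predicate. A sketch under the same assumptions as the hunk (PaddleNLP tokenizers; `benchmark` is the flag stored in `__init__`; the function name is illustrative):

```python
from paddlenlp.transformers import ChatGLMTokenizer, ChatGLMv2Tokenizer


def resolve_add_special_tokens(tokenizer, benchmark):
    # If a chat_template is in use, apply_chat_template() has already inserted
    # the special tokens; the ChatGLM(v2) tokenizers are the exception and
    # still need them added during tokenization.
    add_special = tokenizer.chat_template is None or isinstance(
        tokenizer, (ChatGLMv2Tokenizer, ChatGLMTokenizer)
    )
    # Benchmark inputs are synthetic pad sequences of an exact length, so
    # BOS/EOS tokens are suppressed to keep the measured length deterministic.
    return add_special and not benchmark
```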
@@ -1066,11 +1073,22 @@ def _create_predictor(self, predictor_args: PredictorArgument):
         config = paddle.inference.Config(infer_model_path + ".pdmodel", infer_model_path + ".pdiparams")
 
         config.switch_ir_optim(False)
-        device_id = int(os.environ.get("FLAGS_selected_gpus", 0))
-        config.enable_use_gpu(100, device_id)
+        if predictor_args.device in paddle.device.get_all_custom_device_type():
+            device_id = int(os.environ.get("FLAGS_selected_{}s".format(predictor_args.device), 0))
+            config.enable_custom_device(predictor_args.device, device_id)
+        else:
+            device_id = int(os.environ.get("FLAGS_selected_gpus", 0))
+            config.enable_use_gpu(100, device_id)
         # config.disable_glog_info()
         # config.enable_memory_optim()
 
+        if predictor_args.device == "npu":
+            import paddle_custom_device.npu.passes as passes
+
+            config.switch_ir_optim(True)
+            pass_builder = config.pass_builder()
+            passes.addPasses(pass_builder, self.model_config.model_type, self.model_config.quant_type)
+
         if self.tensor_parallel_degree > 1:
             trainer_endpoints = fleet.worker_endpoints()
             current_endpoint = trainer_endpoints[self.tensor_parallel_rank]
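On the NPU branch: `paddle_custom_device.npu.passes` ships with the PaddleCustomDevice NPU plugin and is only importable when that plugin is installed; the `addPasses(pass_builder, model_type, quant_type)` signature is taken from this PR rather than from a documented API. A hedged sketch of the same step in isolation:

```python
def enable_npu_fusion_passes(config, model_config):
    # Deferred import: fails cleanly on machines without the NPU plugin.
    import paddle_custom_device.npu.passes as passes

    # IR optimization is switched off above for the generic path; the NPU
    # plugin re-enables it so its model/quant-specific fusion passes can run.
    config.switch_ir_optim(True)
    pass_builder = config.pass_builder()
    passes.addPasses(pass_builder, model_config.model_type, model_config.quant_type)
```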
@@ -1516,6 +1534,11 @@ def predict():
     fleet.init(is_collective=True, strategy=strategy)
 
     predictor = create_predictor(predictor_args, model_args)
+
+    if predictor_args.benchmark:
+        benchmark(predictor, predictor_args, model_args)
+        return
+
     source_texts = []
     target_texts = []
     if model_args.data_file:
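Note on the relocated dispatch: `benchmark()` used to be invoked after the normal predict loop (see the removal in the next hunk), so a `--benchmark` run still exercised the data-file path. Returning right after the predictor is created makes benchmark runs independent of `model_args.data_file` and skips the output-writing loop entirely.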
@@ -1559,14 +1582,10 @@ def predict():
             out = {"src": source, "tgt": target, "output": output}
             f.write(json.dumps(out, ensure_ascii=False) + "\n")
 
-    if predictor_args.benchmark:
-        benchmark(predictor, predictor_args, model_args)
-
 
 def benchmark(predictor, predictor_args, model_args):
     # Construct a simple benchmark input: each sample is padded out to src_length.
-    test_texts = "hello world, how are you?"
-    benchmark_texts = [test_texts + "<pad>" * predictor_args.src_length for _ in range(predictor_args.batch_size)]
+    benchmark_texts = [predictor.tokenizer.pad_token * predictor_args.src_length for _ in range(predictor_args.batch_size)]
 
     batch_benchmark_texts = batchfy_text(benchmark_texts, predictor_args.batch_size)
     print("***********Start Benchmark**********")
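The rewritten input construction also fixes a subtle length issue: the old `test_texts + "<pad>" * src_length` prepended a free-text prompt whose token count varied with the tokenizer, and the literal `"<pad>"` is not the pad token of every vocabulary. A sketch of the new scheme, assuming the pad token maps to a single id (the helper name is illustrative):

```python
def build_benchmark_batch(tokenizer, src_length, batch_size):
    # Repeating the tokenizer's own pad token src_length times gives every
    # sample a deterministic token count (assuming pad_token tokenizes to
    # exactly one id), so throughput numbers are comparable across models.
    assert tokenizer.pad_token is not None, "tokenizer must define a pad token"
    return [tokenizer.pad_token * src_length for _ in range(batch_size)]
```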