15 changes: 5 additions & 10 deletions scripts/distribute/ci_case_auto.sh
@@ -164,7 +164,7 @@ function llm_gpt_case_list_auto() {
llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2
llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2-PP2
llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2
- # llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2_intermediate
+ llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2_intermediate
llm_gpt_pir_auto_bs4_TP2
llm_gpt_pir_auto_bs4_TP2_PP2
llm_gpt_pir_auto_bs8_DP2_TP2_PP2
@@ -223,7 +223,7 @@ function llama_dygraph_auto_bs4_bf16_SD2() {

export CUDA_DEVICE_MAX_CONNECTIONS=1

flags=("" "FLAGS_fuse_allreduce_in_opt" "FLAGS_fuse_reducescatter_in_opt" "FLAGS_enable_tensor_fusion FLAGS_enable_sharding_overlap")
flags=("" "FLAGS_enable_tensor_fusion FLAGS_enable_sharding_overlap")
for i in "${!flags[@]}"; do
flag="${flags[$i]}"

@@ -290,7 +290,7 @@ function llama_dygraph_auto_bs4_bf16_SD2() {
--tensor_parallel_degree 1 \
--sharding "stage1" \
--data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
--sharding_parallel_config "" \
--sharding_parallel_config "enable_tensor_fusion enable_overlap" \
--to_static 0 \
--amp_custom_black_list "reduce_sum" "c_softmax_with_cross_entropy" \
--amp_custom_white_list "lookup_table" "lookup_table_v2" \
@@ -302,10 +302,6 @@ function llama_dygraph_auto_bs4_bf16_SD2() {
echo "result: loss=$loss ips=$ips mem=$mem"
echo "flag=$flag acc_step=$acc_step"
if [ -z "$flag" ]; then
loss_base=9.23504791
elif [ "$flag" = "FLAGS_fuse_allreduce_in_opt" ]; then
loss_base=9.23502579
elif [ "$flag" = "FLAGS_fuse_reducescatter_in_opt" ]; then
loss_base=9.23504105
elif [ "$flag" = "FLAGS_enable_tensor_fusion FLAGS_enable_sharding_overlap" ]; then
if [ $acc_step -eq 1 ]; then
@@ -2653,11 +2649,11 @@ function llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2_intermediate() {
mem=-1
echo "result: loss=$loss ips=$ips mem=$mem loss_md5=$loss_md5"
# loss_base=10.58456802 # note: need to debug
- loss_base=10.56716251
+ loss_base=10.56668091
ips_base=-1
mem_base=-1
if [ $IS_A100 -ne 0 ];then
- loss_base=10.56166935 # after add dropout spmd
+ loss_base=10.56199837 # after add dropout spmd
fi
check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
echo "=========== $FUNCNAME run end ==========="
@@ -3844,7 +3840,6 @@ function llama_baichuan_dygraph_auto_sp_async_reduce_scatter_bs8_bf16_DP4-MP2-SP
export NVIDIA_TF32_OVERRIDE=0

export CUDA_DEVICE_MAX_CONNECTIONS=1
- export FLAGS_fuse_reducescatter_in_opt=1
export FLAGS_enable_inplace_master_grad=1
export FLAGS_auto_parallel_align_mode=1
export FLAGS_max_inplace_grad_add=65536
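For context on the hunks above: after this change, llama_dygraph_auto_bs4_bf16_SD2 sweeps only two flag combinations (the empty baseline and "FLAGS_enable_tensor_fusion FLAGS_enable_sharding_overlap") and checks each run's loss against a per-combination baseline via check_result. The sketch below is a minimal, self-contained illustration of that sweep pattern, not the CI function itself; run_case and compare are hypothetical stand-ins for the real distributed-training launch and the check_result helper, and only the empty-flag baseline value is taken from the diff.

#!/usr/bin/env bash
# Minimal sketch of the flag-sweep pattern in llama_dygraph_auto_bs4_bf16_SD2
# after this change. Hypothetical helpers: run_case (stands in for the real
# training launch) and compare (stands in for check_result).

flags=("" "FLAGS_enable_tensor_fusion FLAGS_enable_sharding_overlap")

run_case() {
    # Placeholder: the real script launches training and parses loss/ips/mem
    # from the logs; here we just echo the baseline loss.
    echo "9.23504791"
}

compare() {
    # Placeholder for a check_result-style comparison.
    local base="$1" actual="$2"
    if [ "$base" != "$actual" ]; then
        echo "loss mismatch: expected $base, got $actual"
    fi
}

for flag in "${flags[@]}"; do
    # Enable every FLAGS_* switch listed in this entry (space-separated);
    # the empty entry is the no-optimization baseline run.
    for f in $flag; do
        export "$f"=1
    done

    loss=$(run_case)

    # The empty-flag baseline comes from the diff; the real script also
    # branches on acc_step for the tensor-fusion + sharding-overlap entry.
    if [ -z "$flag" ]; then
        loss_base=9.23504791
    else
        loss_base=9.23504791   # assumed stand-in; see the per-acc_step values in the script
    fi
    compare "$loss_base" "$loss"

    # Reset so the next combination starts clean.
    for f in $flag; do
        unset "$f"
    done
done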
---- next changed file (model_type=baichuan2_13b; path not shown) ----
@@ -20,7 +20,6 @@ param+="nnodes=4 "
param+="model_type=baichuan2_13b "
param+='dynamic_auto=_dynamic_auto '

- export FLAGS_fuse_reducescatter_in_opt=1
export FLAGS_enable_sharding_overlap=1
export FLAGS_enable_tensor_fusion=1

---- next changed file (model_type=llama2_13b; path not shown) ----
@@ -20,9 +20,6 @@ param+="nnodes=4 "
param+="model_type=llama2_13b "
param+='dynamic_auto=_dynamic_auto '

- # This optimization currently only runs in the dynamic automatic parallelism of Llama7B.
- export FLAGS_fuse_reducescatter_in_opt=1
-
# Enable tensor fusion and sharding overlap optimization
export FLAGS_enable_tensor_fusion=1
export FLAGS_enable_sharding_overlap=1
---- next changed file (model_type=llama2_7b; path not shown) ----
@@ -20,9 +20,6 @@ param+="nnodes=4 "
param+="model_type=llama2_7b "
param+='dynamic_auto=_dynamic_auto '

- # This optimization currently only runs in the dynamic automatic parallelism of Llama7B.
- export FLAGS_fuse_reducescatter_in_opt=1
-
# Enable tensor fusion and sharding overlap optimization
export FLAGS_enable_tensor_fusion=1
export FLAGS_enable_sharding_overlap=1
---- next changed file (model_type=qwen_14b; path not shown) ----
@@ -20,7 +20,6 @@ param+="nnodes=4 "
param+="model_type=qwen_14b "
param+='dynamic_auto=_dynamic_auto '

- export FLAGS_fuse_reducescatter_in_opt=1
export FLAGS_enable_tensor_fusion=1
export FLAGS_enable_sharding_overlap=1

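Taken together, the four benchmark scripts above make the same substitution: the FLAGS_fuse_reducescatter_in_opt export is dropped and tensor fusion plus sharding overlap are enabled instead, in line with the --sharding_parallel_config "enable_tensor_fusion enable_overlap" setting added in ci_case_auto.sh. A minimal sketch of the environment block each of these scripts ends up with is shown below; the launch command is not part of this diff, so run_benchmark.sh here is only an assumed placeholder.

#!/usr/bin/env bash
# Sketch of the per-model benchmark setup after this change.
param=""
param+="nnodes=4 "
param+="model_type=llama2_7b "   # baichuan2_13b / llama2_13b / qwen_14b in the other scripts
param+='dynamic_auto=_dynamic_auto '

# Enable tensor fusion and sharding overlap optimization
export FLAGS_enable_tensor_fusion=1
export FLAGS_enable_sharding_overlap=1
# (FLAGS_fuse_reducescatter_in_opt is no longer exported.)

bash run_benchmark.sh ${param}   # hypothetical invocation; not shown in the diff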