2 files changed: +10 −4 lines changed

@@ -1062,11 +1062,12 @@ def fused_allreduce_gradients_no_sync(paramlist, hcg):
             if optimizer_was_run:
                 self.lr_scheduler.step()

-            if enable_release_grads and args.pipeline_parallel_degree > 1:
+            if args.release_grads or enable_release_grads:
                 self.optimizer.clear_grad(set_to_zero=False)
-                for _, buffers in model._chunk_2_comm_buffers.items():
-                    for buffer in buffers:
-                        buffer._clear_grad_storage()
+                if args.pipeline_parallel_degree > 1:
+                    for _, buffers in model._chunk_2_comm_buffers.items():
+                        for buffer in buffers:
+                            buffer._clear_grad_storage()
             else:
                 self.optimizer.clear_grad()
@@ -344,6 +344,8 @@ class TrainingArguments:
             Whether skip profile timer, timer will record time usage of forward/ backward/ step, etc.
         distributed_dataloader (`bool`, *optional*):
             Whether to use distributed dataloader. Default is `False`.
+        release_grads (`bool`, *optional*):
+            Whether to release gradients during training. Default is `False`.
     """

     output_dir: str = field(
@@ -791,6 +793,9 @@ class TrainingArguments:
         default=False,
         metadata={"help": "Enable MoE (Mixture of Experts) expert parallel training"},
     )
+    release_grads: Optional[bool] = field(
+        default=False, metadata={"help": "Whether to release gradients during training. Default is `False`."}
+    )

     def __post_init__(self):
         env_local_rank = int(os.environ.get("PADDLE_RANK_IN_NODE", -1))
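A minimal usage sketch of the new option, assuming the standard PaddleNLP TrainingArguments entry point; only the release_grads name comes from this diff, the output directory and script name are illustrative placeholders:

    from paddlenlp.trainer import TrainingArguments

    # Hypothetical example: enable the new flag so gradient storage is released
    # (and, under pipeline parallelism, the fused comm buffers are cleared)
    # right after each optimizer step.
    args = TrainingArguments(
        output_dir="./checkpoints",  # placeholder path
        release_grads=True,
    )

    # Or on the command line of any trainer-based script (script name is illustrative):
    #   python train.py --output_dir ./checkpoints --release_grads true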