@@ -257,7 +257,7 @@ class Flops2e17TFM41MC4L2048BS128(BaseExperimentConfig):
257257 batch_size : int = 128
258258 num_train_steps : int = 2588
259259 lr_schedule_name : str = 'cosine_decay'
260- weight_decay : float = 3e-4
260+ weight_decay : float = 1e-1
261261 lr_schedule_config : Tuple [Tuple [str , Any ], ...] = (
262262 ('lr' , 3e-3 ), ('steps_after_decay' , 0 ),
263263 ('end_decay' , 0.1 ))
@@ -294,7 +294,7 @@ class Flops1e18TFM111MC4L2048BS256(Flops2e17TFM41MC4L2048BS128):
294294 batch_size : int = 256
295295 num_train_steps : int = 3626
296296 lr_schedule_name : str = 'cosine_decay'
297- weight_decay : float = 3e-4
297+ weight_decay : float = 1e-1
298298 lr_schedule_config : Tuple [Tuple [str , Any ], ...] = (
299299 ('lr' , 3e-3 ), ('steps_after_decay' , 0 ),
300300 ('end_decay' , 0.1 ))
@@ -329,7 +329,7 @@ class Flops1e19TFM338MC4L2048BS512(Flops2e17TFM41MC4L2048BS128):
329329 batch_size : int = 512
330330 num_train_steps : int = 5800
331331 lr_schedule_name : str = 'cosine_decay'
332- weight_decay : float = 3e-4
332+ weight_decay : float = 1e-1
333333 lr_schedule_config : Tuple [Tuple [str , Any ], ...] = (
334334 ('lr' , 3e-3 ),
335335 ('steps_after_decay' , 0 ),
0 commit comments