@@ -1629,6 +1629,7 @@ def run(
16291629 tensorboard : Optional [str ] = None ,
16301630 sync : bool = True ,
16311631 create_request_timeout : Optional [float ] = None ,
1632+ disable_retries : bool = False ,
16321633 ) -> None :
16331634 """Run this configured CustomJob.
16341635
@@ -1686,6 +1687,10 @@ def run(
16861687 will unblock and it will be executed in a concurrent Future.
16871688 create_request_timeout (float):
16881689 Optional. The timeout for the create request in seconds.
1690+ disable_retries (bool):
1691+ Optional. Whether to disable retries for internal errors after
1692+ the job starts running. If True, the job is not retried and
1693+ `restart_job_on_worker_restart` is overridden to False.
16891694 """
16901695 network = network or initializer .global_config .network
16911696
@@ -1700,6 +1705,7 @@ def run(
17001705 tensorboard = tensorboard ,
17011706 sync = sync ,
17021707 create_request_timeout = create_request_timeout ,
1708+ disable_retries = disable_retries ,
17031709 )
17041710
17051711 @base .optional_sync ()
@@ -1715,6 +1721,7 @@ def _run(
17151721 tensorboard : Optional [str ] = None ,
17161722 sync : bool = True ,
17171723 create_request_timeout : Optional [float ] = None ,
1724+ disable_retries : bool = False ,
17181725 ) -> None :
17191726 """Helper method to ensure network synchronization and to run the configured CustomJob.
17201727
@@ -1770,6 +1777,10 @@ def _run(
17701777 will unblock and it will be executed in a concurrent Future.
17711778 create_request_timeout (float):
17721779 Optional. The timeout for the create request in seconds.
1780+ disable_retries (bool):
1781+ Optional. Whether to disable retries for internal errors after
1782+ the job starts running. If True, the job is not retried and
1783+ `restart_job_on_worker_restart` is overridden to False.
17731784 """
17741785 self .submit (
17751786 service_account = service_account ,
@@ -1781,6 +1792,7 @@ def _run(
17811792 experiment_run = experiment_run ,
17821793 tensorboard = tensorboard ,
17831794 create_request_timeout = create_request_timeout ,
1795+ disable_retries = disable_retries ,
17841796 )
17851797
17861798 self ._block_until_complete ()
@@ -1797,6 +1809,7 @@ def submit(
17971809 experiment_run : Optional [Union ["aiplatform.ExperimentRun" , str ]] = None ,
17981810 tensorboard : Optional [str ] = None ,
17991811 create_request_timeout : Optional [float ] = None ,
1812+ disable_retries : bool = False ,
18001813 ) -> None :
18011814 """Submit the configured CustomJob.
18021815
@@ -1849,6 +1862,10 @@ def submit(
18491862 https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training
18501863 create_request_timeout (float):
18511864 Optional. The timeout for the create request in seconds.
1865+ disable_retries (bool):
1866+ Optional. Whether to disable retries for internal errors after
1867+ the job starts running. If True, the job is not retried and
1868+ `restart_job_on_worker_restart` is overridden to False.
18521869
18531870 Raises:
18541871 ValueError:
@@ -1869,11 +1886,12 @@ def submit(
18691886 if network :
18701887 self ._gca_resource .job_spec .network = network
18711888
1872- if timeout or restart_job_on_worker_restart :
1889+ if timeout or restart_job_on_worker_restart or disable_retries :
18731890 timeout = duration_pb2 .Duration (seconds = timeout ) if timeout else None
18741891 self ._gca_resource .job_spec .scheduling = gca_custom_job_compat .Scheduling (
18751892 timeout = timeout ,
18761893 restart_job_on_worker_restart = restart_job_on_worker_restart ,
1894+ disable_retries = disable_retries ,
18771895 )
18781896
18791897 if enable_web_access :
@@ -2287,6 +2305,7 @@ def run(
22872305 tensorboard : Optional [str ] = None ,
22882306 sync : bool = True ,
22892307 create_request_timeout : Optional [float ] = None ,
2308+ disable_retries : bool = False ,
22902309 ) -> None :
22912310 """Run this configured CustomJob.
22922311
@@ -2331,6 +2350,10 @@ def run(
23312350 will unblock and it will be executed in a concurrent Future.
23322351 create_request_timeout (float):
23332352 Optional. The timeout for the create request in seconds.
2353+ disable_retries (bool):
2354+ Optional. Whether to disable retries for internal errors after
2355+ the job starts running. If True, the job is not retried and
2356+ `restart_job_on_worker_restart` is overridden to False.
23342357 """
23352358 network = network or initializer .global_config .network
23362359
@@ -2343,6 +2366,7 @@ def run(
23432366 tensorboard = tensorboard ,
23442367 sync = sync ,
23452368 create_request_timeout = create_request_timeout ,
2369+ disable_retries = disable_retries ,
23462370 )
23472371
23482372 @base .optional_sync ()
@@ -2356,6 +2380,7 @@ def _run(
23562380 tensorboard : Optional [str ] = None ,
23572381 sync : bool = True ,
23582382 create_request_timeout : Optional [float ] = None ,
2383+ disable_retries : bool = False ,
23592384 ) -> None :
23602385 """Helper method to ensure network synchronization and to run the configured CustomJob.
23612386
@@ -2398,19 +2423,24 @@ def _run(
23982423 will unblock and it will be executed in a concurrent Future.
23992424 create_request_timeout (float):
24002425 Optional. The timeout for the create request in seconds.
2426+ disable_retries (bool):
2427+ Optional. Whether to disable retries for internal errors after
2428+ the job starts running. If True, the job is not retried and
2429+ `restart_job_on_worker_restart` is overridden to False.
24012430 """
24022431 if service_account :
24032432 self ._gca_resource .trial_job_spec .service_account = service_account
24042433
24052434 if network :
24062435 self ._gca_resource .trial_job_spec .network = network
24072436
2408- if timeout or restart_job_on_worker_restart :
2437+ if timeout or restart_job_on_worker_restart or disable_retries :
24092438 duration = duration_pb2 .Duration (seconds = timeout ) if timeout else None
24102439 self ._gca_resource .trial_job_spec .scheduling = (
24112440 gca_custom_job_compat .Scheduling (
24122441 timeout = duration ,
24132442 restart_job_on_worker_restart = restart_job_on_worker_restart ,
2443+ disable_retries = disable_retries ,
24142444 )
24152445 )
24162446
0 commit comments