
Commit 9db8fac

[PJRT] Support torchrun with pjrt:// init_method (#5438)
* Support torchrun with `pjrt://` `init_method`
* move import
* fix error
* Fix NameError
* Fix path
* Remove from TPU CI
1 parent 5b88b5f commit 9db8fac

3 files changed: +68, -10 lines
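
For context, the usage this commit enables looks roughly like the sketch below. It is illustrative and not taken from the repository: the script body and the all_reduce payload are assumptions. A script like this would be launched with torchrun, for example `torchrun --nproc_per_node=<N> script.py`, with the process count chosen to match the host's devices.

# Hedged sketch: a torchrun-launched script using the pjrt:// init_method.
# The tensor payload and printout are illustrative assumptions.
import torch
import torch.distributed as dist
import torch_xla.core.xla_model as xm
import torch_xla.experimental.pjrt_backend  # registers the pjrt:// rendezvous handler


def main():
  dist.init_process_group('xla', init_method='pjrt://')
  # One rank per replicated XLA device across all torchrun processes.
  t = torch.ones(2, device=xm.xla_device()) * dist.get_rank()
  dist.all_reduce(t)  # sums the per-rank tensors
  xm.mark_step()
  print(f'rank {dist.get_rank()}/{dist.get_world_size()}: {t.cpu()}')


if __name__ == '__main__':
  main()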

test/pjrt/test_torchrun.py (new file: 38 additions, 0 deletions)

@@ -0,0 +1,38 @@
+from absl.testing import absltest
+from absl import logging
+import torch
+import torch.distributed as dist
+import torch_xla.core.xla_model as xm
+import torch_xla.experimental.pjrt_backend
+import torch_xla.runtime as xr
+import torch_xla.utils.utils as xu
+
+
+class TestTorchrun(absltest.TestCase):
+
+  def test_all_gather(self):
+    dist.init_process_group('xla', init_method='pjrt://')
+
+    dist_world_size = xu.getenv_as('WORLD_SIZE', int)
+    devices_per_thread = xr.addressable_device_count()
+
+    expected_world_size = dist_world_size * devices_per_thread
+
+    rank = torch.tensor([dist.get_rank()],
+                        dtype=torch.float32,
+                        device=xm.xla_device())
+    output = [rank.clone() for _ in range(expected_world_size)]
+    dist.all_gather(output, rank)
+    result = torch.concat(output)
+    xm.mark_step()
+
+    expected = torch.arange(0, expected_world_size, step=1, dtype=torch.float32)
+    torch.testing.assert_close(result.cpu(), expected)
+
+
+if __name__ == '__main__':
+  if not dist.is_torchelastic_launched():
+    logging.error('Test must be launched with torchrun!')
+    exit(1)
+
+  absltest.main()
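
A note on the test's arithmetic: the pjrt:// handler yields one rank per replicated device, so the group's world size is torchrun's WORLD_SIZE multiplied by xr.addressable_device_count() in each process, which is exactly what expected_world_size computes. Since the test exits unless it is running under torchelastic, it would be launched along the lines of `torchrun --nproc_per_node=<local device count> test/pjrt/test_torchrun.py`; the exact process count to use is not specified by this commit.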

torch_xla/_internal/pjrt.py (5 additions, 3 deletions)

@@ -56,7 +56,6 @@ def _run_thread_per_device(
   initializer_fn(local_rank, local_world_size)

   devices = xm.get_xla_supported_devices()
-  xm.set_replication(xm.xla_device(), devices)
   num_threads = len(devices)

   @functools.wraps(fn)
@@ -104,13 +103,16 @@ def _run_singleprocess(fn: Callable[..., R], *args, **kwargs) -> Dict[int, R]:


 @runtime.requires_pjrt
-def _initialize_multiprocess(local_rank: int, local_world_size: int):
+def initialize_multiprocess(local_rank: int, local_world_size: int):
   os.environ.setdefault(xenv.PJRT_LOCAL_PROCESS_RANK, str(local_rank))
   os.environ.setdefault(xenv.PJRT_LOCAL_PROCESS_COUNT, str(local_world_size))

   if runtime.device_type() == 'TPU':
     tpu.configure_topology(local_rank, local_world_size)

+  devices = xm.get_xla_supported_devices()
+  xm.set_replication(xm.xla_device(), devices)
+

 @runtime.requires_pjrt
 def run_multiprocess(fn: Callable[..., R],
@@ -148,7 +150,7 @@ def run_multiprocess(fn: Callable[..., R],
         _run_thread_per_device,
         local_world_size=num_processes,
         fn=functools.partial(fn, *args, **kwargs),
-        initializer_fn=_initialize_multiprocess)
+        initializer_fn=initialize_multiprocess)
     process_results = executor.map(mp_fn, range(num_processes))
     replica_results = list(
         itertools.chain.from_iterable(
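
Design note: xm.set_replication is moved out of _run_thread_per_device and into the initializer, which is also renamed from the private _initialize_multiprocess to the public initialize_multiprocess. With that, the existing xmp.spawn / run_multiprocess path and the new torchrun path (via the pjrt:// rendezvous handler in the next file) run the same per-process setup, including replication across the addressable devices.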

Third changed file (25 additions, 7 deletions)

@@ -1,10 +1,11 @@
 import datetime
+import logging
 import threading

 import torch.distributed as dist
-from torch.testing._internal.distributed import multi_threaded_pg
 from torch_xla.distributed import xla_backend
 from torch_xla import runtime as xr
+from torch_xla._internal import pjrt
 from torch_xla._internal import tpu
 import torch_xla.utils.utils as xu

@@ -15,6 +16,12 @@
 def _pjrt_rendezvous_handler(url: str,
                              timeout: datetime.timedelta = ...,
                              **kwargs):
+  # Assume `xmp.spawn` has not been called when using torchrun
+  if dist.is_torchelastic_launched():
+    local_world_size = xu.getenv_as('LOCAL_WORLD_SIZE', int)
+    local_rank = xu.getenv_as('LOCAL_RANK', int)
+    pjrt.initialize_multiprocess(local_rank, local_world_size)
+
   master_ip = xu.getenv_as('MASTER_ADDR', str)
   if not master_ip:
     master_ip = tpu.discover_master_worker_ip() if xr.device_type(
@@ -24,15 +31,26 @@ def _pjrt_rendezvous_handler(url: str,
   with _store_lock:
     global _store
     if not _store:
-      _store = dist.TCPStore(
-          master_ip,
-          master_port,
-          xr.process_count(),
-          is_master=xr.process_index() == 0)
+      if xu.getenv_as('TORCHELASTIC_USE_AGENT_STORE', str) == 'True':
+        attempt = xu.getenv_as('TORCHELASTIC_RESTART_COUNT', int, defval=0)
+        tcp_store = dist.TCPStore(
+            master_ip, master_port, xr.process_count(), is_master=False)
+        _store = dist.PrefixStore(f"/worker/attempt_{attempt}", tcp_store)
+      else:
+        _store = dist.TCPStore(
+            master_ip,
+            master_port,
+            xr.process_count(),
+            is_master=xr.process_index() == 0)

   yield (_store, xr.global_ordinal(), xr.world_size())


-multi_threaded_pg._install_threaded_pg()
+if tpu.num_available_chips() > 0 and tpu.version() <= 3:
+  from torch.testing._internal.distributed import multi_threaded_pg
+  logging.warning('Patching torch.distributed state to support multithreading.')
+  logging.warning('torch.distributed support on TPU v2 and v3 is experimental '
+                  'and does not support torchrun.')
+  multi_threaded_pg._install_threaded_pg()

 dist.register_rendezvous_handler('pjrt', _pjrt_rendezvous_handler)
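
Design note: the rendezvous handler now distinguishes two store setups. When torchrun's elastic agent already hosts the TCPStore (TORCHELASTIC_USE_AGENT_STORE is 'True'), each worker connects as a client (is_master=False) and wraps the store in a PrefixStore keyed by TORCHELASTIC_RESTART_COUNT, so keys written after a restart do not collide with stale ones from a previous attempt; otherwise it keeps the original behavior of creating a TCPStore with process 0 acting as master. The multithreaded process-group patch is also narrowed: multi_threaded_pg is now imported and installed only on TPU v2/v3, where, per the new warnings, torch.distributed support is experimental and torchrun is not supported.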
