 from contextlib import contextmanager

 import torch
+import torch.cuda
 import torch.distributed as dist
 from common import TestCase

@@ -22,6 +23,20 @@
     print('Distributed not available, skipping tests')
     sys.exit(0)

+SKIP_IF_NO_CUDA_EXIT_CODE = 75
+
+
+def skip_if_no_cuda_distributed(func):
+    func.skip_if_no_cuda_distributed = True
+
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        if not torch.cuda.is_available():
+            sys.exit(SKIP_IF_NO_CUDA_EXIT_CODE)
+
+        return func(*args, **kwargs)
+    return wrapper
+

 @contextmanager
 def _lock():
@@ -228,6 +243,7 @@ def test_broadcast(self):
         self._test_broadcast_helper(group, group_id, rank)

     @unittest.skipIf(BACKEND != 'gloo', "Only Gloo backend supports CUDA allReduce")
+    @skip_if_no_cuda_distributed
     def test_broadcast_cuda(self):
         group, group_id, rank = self._init_global_test()
         self._test_broadcast_helper(group, group_id, rank, True)
@@ -333,6 +349,7 @@ def test_all_reduce_sum(self):
         )

     @unittest.skipIf(BACKEND != 'gloo', "Only Gloo backend supports CUDA allReduce")
+    @skip_if_no_cuda_distributed
     def test_all_reduce_sum_cuda(self):
         group, group_id, rank = self._init_global_test()
         self._test_all_reduce_helper(
@@ -487,7 +504,7 @@ def manager_join(fn):
             @wraps(fn)
             def wrapper(self):
                 if self.rank == self.MANAGER_PROCESS_RANK:
-                    self._join_and_reduce()
+                    self._join_and_reduce(fn)
                 else:
                     fn(self)
             return wrapper
@@ -533,10 +550,22 @@ def _run(self, rank):
             getattr(self, self.id().split(".")[2])()
             sys.exit(0)

-        def _join_and_reduce(self):
+        def _join_and_reduce(self, fn):
+            skip_ok = getattr(fn, "skip_if_no_cuda_distributed", False)
             for p in self.processes:
                 p.join(self.JOIN_TIMEOUT)
-                self.assertEqual(p.exitcode, 0)
+                if not skip_ok:
+                    self.assertEqual(p.exitcode, 0)
+
+            if skip_ok:
+                first_process = self.processes[0]
+                # do this first so we don't give an error message about mismatched exit codes if the first isn't valid
+                assert first_process.exitcode == 0 or first_process.exitcode == SKIP_IF_NO_CUDA_EXIT_CODE
+
+                for p in self.processes:
+                    self.assertEqual(p.exitcode, first_process.exitcode)
+                if first_process.exitcode == SKIP_IF_NO_CUDA_EXIT_CODE:
+                    raise unittest.SkipTest("cuda is not available")

 elif BACKEND == 'mpi':
     dist.init_process_group(init_method=INIT_METHOD, backend='mpi')
|
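For reference, the pattern this diff introduces works around the fact that a worker process cannot raise unittest.SkipTest across a process boundary: the worker exits with a reserved code (SKIP_IF_NO_CUDA_EXIT_CODE = 75), and the manager process converts a unanimous skip code into a SkipTest. Below is a minimal, self-contained sketch of the same idea with hypothetical names (SKIP_EXIT_CODE, skip_if_unavailable, worker); it is an illustration under those assumptions, not the code added by this patch.

import sys
import unittest
from functools import wraps
from multiprocessing import Process

# Hypothetical names; the patch above uses SKIP_IF_NO_CUDA_EXIT_CODE and
# skip_if_no_cuda_distributed instead.
SKIP_EXIT_CODE = 75  # reserved exit code meaning "environment cannot run this test"


def skip_if_unavailable(predicate):
    """Make the calling process exit with SKIP_EXIT_CODE when predicate() is false."""
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            if not predicate():
                sys.exit(SKIP_EXIT_CODE)
            return func(*args, **kwargs)
        return wrapper
    return decorator


def worker(available):
    # Stand-in for one rank of a multi-process test.
    @skip_if_unavailable(lambda: available)
    def body():
        pass
    body()
    sys.exit(0)


class SkipByExitCodeTest(unittest.TestCase):
    def _join_and_check(self, procs, skip_allowed):
        for p in procs:
            p.join()
        first = procs[0].exitcode
        if skip_allowed and first == SKIP_EXIT_CODE:
            # Only skip when every rank agreed that the capability is missing.
            self.assertTrue(all(p.exitcode == SKIP_EXIT_CODE for p in procs))
            raise unittest.SkipTest("capability not available")
        for p in procs:
            self.assertEqual(p.exitcode, 0)

    def test_workers_report_skip(self):
        # With available=False every worker exits with the skip code,
        # so the whole test is reported as skipped rather than failed.
        procs = [Process(target=worker, args=(False,)) for _ in range(2)]
        for p in procs:
            p.start()
        self._join_and_check(procs, skip_allowed=True)


if __name__ == '__main__':
    unittest.main()

In the actual patch the "skipping is allowed here" flag is recorded as an attribute on the decorated test function (skip_if_no_cuda_distributed) and the test function itself is forwarded to the manager, which is why manager_join now calls self._join_and_reduce(fn).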