Merged

33 commits
36c9d3d
fixed it
vanbasten23 Dec 1, 2023
e1164be
added the test
vanbasten23 Dec 4, 2023
61d4cda
remove prints
vanbasten23 Dec 4, 2023
a025c23
fix up
vanbasten23 Dec 4, 2023
cb37524
replace gpu with cuda
vanbasten23 Dec 5, 2023
93f1dc5
fix up
vanbasten23 Jan 4, 2024
b6c58ca
trigger another ci
vanbasten23 Jan 5, 2024
d724f64
fix the test_runtime.py test so it runs for GPU as well.
vanbasten23 Jan 5, 2024
1dd4261
remove pdb
vanbasten23 Jan 5, 2024
b39fbb2
skip a test temperarily.
vanbasten23 Jan 6, 2024
9a7b423
Fix the test XLAShardingTest.CreateTensorsData
vanbasten23 Jan 8, 2024
b90b342
skip another multithreading test for gpu
vanbasten23 Jan 8, 2024
ee243d0
remove prints
vanbasten23 Jan 9, 2024
3fa038f
remove more prints
vanbasten23 Jan 9, 2024
9823269
remove global_runtime_device_count test case
vanbasten23 Jan 9, 2024
b10570b
fix linter
vanbasten23 Jan 16, 2024
1832477
fix the broken tests
vanbasten23 Jan 16, 2024
d590e30
fix a build issue
vanbasten23 Jan 17, 2024
4d74168
fix linter
vanbasten23 Jan 17, 2024
b154538
fix build after pin update and a failing spmd test
vanbasten23 Jan 19, 2024
6bfdf19
fix linter
vanbasten23 Jan 19, 2024
c5e717e
fix comments
vanbasten23 Jan 20, 2024
0291311
Incorporate the fix cl/601517680
vanbasten23 Jan 26, 2024
3c774fe
add comment to the patch and fix linter
vanbasten23 Jan 27, 2024
862f7da
add a single processing gpu test.
vanbasten23 Jan 31, 2024
5d64007
add print in spmd tests. Local test works but fail in the CI
vanbasten23 Jan 31, 2024
d59457f
fix test and linter
vanbasten23 Feb 1, 2024
e8e1697
clean up prints
vanbasten23 Feb 1, 2024
db3d0c8
clean up another prints
vanbasten23 Feb 1, 2024
10e459b
fix linter
vanbasten23 Feb 1, 2024
aee08df
fix BasicShardingTest.test_2d_tensor_3d_mesh on cpu
vanbasten23 Feb 2, 2024
b3991a7
fix comments
vanbasten23 Feb 2, 2024
953d9b4
remove unwanted function
vanbasten23 Feb 3, 2024
1 change: 1 addition & 0 deletions WORKSPACE
@@ -49,6 +49,7 @@ http_archive(
"//openxla_patches:cache_urls.diff",
"//openxla_patches:gpu_race_condition.diff",
"//openxla_patches:f16_abi_clang.diff",
"//openxla_patches:gpu_hanging.diff",
"//openxla_patches:quant_dequant_converter.diff",
"//openxla_patches:stablehlo_quant_seralization.diff",
],
36 changes: 36 additions & 0 deletions openxla_patches/gpu_hanging.diff
@@ -0,0 +1,36 @@
// This patch is for https://github.com/openxla/xla/commit/ec0177de1748b4ebb0ecbd6f26043fdb1eb47d24.
// It can be removed in the next openXLA pin update after 01/26/2024.
diff --git a/xla/service/gpu/gpu_executable.cc b/xla/service/gpu/gpu_executable.cc
Collaborator: Can you add a comment with the corresponding git commit hash in openXLA?

Author: done

index 0f1818be2..c181f3025 100644
--- a/xla/service/gpu/gpu_executable.cc
+++ b/xla/service/gpu/gpu_executable.cc
@@ -382,9 +382,13 @@ absl::Status ExecuteThunks(const std::string& module_name,
}
}

- // Maybe join a round of rendezvous after thunk initialization.
- TF_RETURN_IF_ERROR(
- MaybeRendezvousAfterInitialization(run_options, thunks_initialized));
+ // Maybe join a round of rendezvous after thunk initialization. We do this
+ // only in presence of collective cliques which means that we have collective
+ // operations in the XLA operations that tend to cause deadlocks.
+ if (!collective_cliques.empty()) {
+ TF_RETURN_IF_ERROR(
+ MaybeRendezvousAfterInitialization(run_options, thunks_initialized));
+ }

// Prepare parameters for thunks execution.
Thunk::ExecuteParams execute_params = Thunk::ExecuteParams::Create(
diff --git a/xla/service/gpu/thunk.h b/xla/service/gpu/thunk.h
index 51a566b8f..94bab421f 100644
--- a/xla/service/gpu/thunk.h
+++ b/xla/service/gpu/thunk.h
@@ -175,6 +175,8 @@ class Thunk {
absl::StatusOr<NcclComm::Lock> GetComm(const NcclCliqueKey& clique_key,
int32_t rank) const;

+ bool empty() const { return cliques_map_.empty(); }
+
private:
CliquesMap cliques_map_;
};
16 changes: 11 additions & 5 deletions test/cpp/test_replication.cpp
@@ -46,14 +46,17 @@ void TestSingleReplication(
instances.emplace_back(CreateCrsComputation(shape), device_str,
all_device_strings, &shape);
}
auto compiled_computations =
torch_xla::runtime::GetComputationClient()->Compile(std::move(instances));
std::vector<torch_xla::runtime::ComputationClient::ComputationPtr>
compiled_computations =
torch_xla::runtime::GetComputationClient()->Compile(
std::move(instances));

std::vector<at::Tensor> tensors;
for (size_t i = 0; i < device_strings.size(); ++i) {
tensors.push_back(at::ones({8, 8}, at::TensorOptions(at::kFloat)));
}
auto tensors_data = CreateTensorsData(tensors, device_strings);
std::vector<torch::lazy::BackendDataPtr> tensors_data =
CreateTensorsData(tensors, device_strings);

std::vector<std::vector<torch_xla::runtime::ComputationClient::DataPtr>>
results(device_strings.size());
@@ -75,7 +78,7 @@ void TestSingleReplication(
counter.Wait();

for (size_t i = 0; i < results.size(); ++i) {
auto literals =
std::vector<xla::Literal> literals =
torch_xla::runtime::GetComputationClient()->TransferFromDevice(
results[i]);
ASSERT_EQ(literals.size(), 1);
@@ -92,9 +95,12 @@ void TestSingleReplication(

class ReplicationTest : public AtenXlaTensorTestBase {};

// Parallelism for DataParallel uses multi-threads. But cuda assumes one GPU
// device per process instead of relying on threads so we will not run the test
// on GPU.
TEST_F(ReplicationTest, TestNSingleReplication) {
WithAllDevices(
{XlaDeviceType::TPU, XlaDeviceType::CUDA},
{XlaDeviceType::TPU},
[&](const std::vector<torch::lazy::BackendDevice>& devices,
const std::vector<torch::lazy::BackendDevice>& all_devices) {
TestSingleReplication(devices, all_devices);
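
For context (not part of this diff): on CUDA the expected pattern is one process per GPU rather than one thread per device, typically launched via torch_xla.distributed.xla_multiprocessing. A minimal sketch, assuming a CUDA PJRT runtime is already configured (e.g. PJRT_DEVICE=CUDA):

import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp


def _mp_fn(index):
  # Each spawned process owns exactly one CUDA device.
  device = xm.xla_device()
  print(index, device)


if __name__ == '__main__':
  xmp.spawn(_mp_fn, args=())
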
5 changes: 0 additions & 5 deletions test/cpp/test_xla_sharding.cpp
@@ -309,11 +309,6 @@ TEST_F(XLAShardingTest, EqualShardingSpecs) {
}

TEST_F(XLAShardingTest, CreateTensorsData) {
if (torch_xla::runtime::sys_util::GetEnvString(
torch_xla::runtime::env::kEnvPjRtDevice, "") == "") {
GTEST_SKIP() << "`PJRT_DEVICE` is not set.";
}

std::vector<at::Tensor> tensors(2);
auto tensor = at::ones({8, 8}, at::TensorOptions(at::kFloat));
xla::Shape tensor_shape =
test/pjrt/test_runtime_gpu.py → test/pjrt/test_runtime_multi_gpu.py
@@ -19,7 +19,7 @@

@unittest.skipIf(xr.device_type() != "CUDA",
f"GPU tests should only run on GPU devices.")
class TestExperimentalPjrtGpu(parameterized.TestCase):
class TestExperimentalPjrtMultiGpu(parameterized.TestCase):

def setUp(self):
xr.set_device_type('CUDA')
49 changes: 49 additions & 0 deletions test/pjrt/test_runtime_single_proc_gpu.py
@@ -0,0 +1,49 @@
import concurrent.futures
import itertools
import os
import queue
import requests
import unittest
import subprocess

import numpy as np
import torch
import torch.nn as nn
import torch_xla
import torch_xla.core.xla_env_vars as xenv
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
from torch_xla import runtime as xr
from torch_xla._internal import pjrt
from absl.testing import absltest, parameterized


@unittest.skipIf(xr.device_type() != "CUDA",
                 f"GPU tests should only run on GPU devices.")
class TestExperimentalSingleProcPjrtGpu(parameterized.TestCase):

  @classmethod
  def setUpClass(cls):
    command = 'nvidia-smi --list-gpus | wc -l'
    result = subprocess.run(
        command,
        capture_output=True,
        shell=True,
        check=True,
        text=True,
    )
    cls.num_cuda_devices = int(result.stdout)

  def test_num_local_devices(self):
    self.assertLen(xm.get_xla_supported_devices(),
                   xr.addressable_device_count())
    self.assertEqual(self.num_cuda_devices, xr.addressable_device_count())

  def test_num_global_devices(self):
    self.assertLen(torch_xla._XLAC._xla_get_all_devices(),
                   xr.global_device_count())
    self.assertEqual(self.num_cuda_devices, xr.global_device_count())


if __name__ == '__main__':
  absltest.main()
4 changes: 4 additions & 0 deletions test/pjrt/test_torchrun.py
@@ -16,6 +16,10 @@ def setUp(self):
def tearDown(self) -> None:
dist.destroy_process_group()

def test_addressable_device_count(self):
devices_per_process = xr.addressable_device_count()
self.assertEqual(devices_per_process, 1)

def test_all_gather(self):
dist_world_size = xu.getenv_as('WORLD_SIZE', int)
devices_per_thread = xr.addressable_device_count()
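
A rough sketch (not part of this PR) of the invariant the new test_addressable_device_count test asserts, assuming the script is launched under torchrun so that each rank is a separate process:

import torch_xla.runtime as xr
import torch_xla.utils.utils as xu

world_size = xu.getenv_as('WORLD_SIZE', int)  # set by torchrun
devices_per_process = xr.addressable_device_count()  # expected to be 1 per process
expected_world_size = world_size * devices_per_process
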
3 changes: 2 additions & 1 deletion test/run_tests.sh
@@ -167,7 +167,8 @@ function run_xla_op_tests1 {
run_test "$CDIR/test_hlo_metadata.py"
run_test "$CDIR/test_profiler.py"
run_test "$CDIR/pjrt/test_runtime.py"
run_test "$CDIR/pjrt/test_runtime_gpu.py"
run_test "$CDIR/pjrt/test_runtime_single_proc_gpu.py"
run_test "$CDIR/pjrt/test_runtime_multi_gpu.py"
run_test "$CDIR/pjrt/test_runtime_multi_cpu.py"
run_test "$CDIR/pjrt/test_internal_tpu.py"
run_test "$CDIR/pjrt/test_ddp.py"
55 changes: 30 additions & 25 deletions test/spmd/test_xla_sharding.py
@@ -207,10 +207,13 @@ def test_xla_sharding_type(self):
t = torch.randn(10, 20).to(xm.xla_device())
self.assertEqual(torch_xla._XLAC._get_xla_sharding_type(t), None)

x_dim = 2 if self.n_devices % 4 == 0 else 1
x_dim = 2 if self.n_devices >= 2 else 1
# if self.n_devices==4, mesh=(2,2)
# if self.n_devices==2, mesh=(2,1)
# if self.n_devices==1, mesh=(1,1)
mesh = self._get_mesh((x_dim, self.n_devices // x_dim))
xt = xs.mark_sharding(t, mesh, (0, 1))
if self.n_devices > 1:
if self.n_devices >= 2:
self.assertEqual(xt.sharding_type, xs.ShardingType.TILED)
else:
self.assertEqual(xt.sharding_type, xs.ShardingType.REPLICATED)
@@ -221,7 +224,7 @@ def test_xla_sharding_type(self):

xs.clear_sharding(t)
xt = xs.mark_sharding(t, mesh, (None, 1))
if self.n_devices > 1:
if mesh.get_logical_mesh().shape[1] > 1:
self.assertEqual(xt.sharding_type, xs.ShardingType.PARTIAL)
else:
self.assertEqual(xt.sharding_type, xs.ShardingType.REPLICATED)
@@ -339,14 +342,13 @@ def test_mark_sharding_partial(self):
mesh = self._get_mesh((z_dim, self.n_devices // z_dim))
xt1 = xs.mark_sharding(t1, mesh, (0, None))

# partial replication requires >1 devices; otherwise, it's replicated.
if self.n_devices > 1:
# partial replication requires >= 4 devices; otherwise, it's replicated.
if self.n_devices >= 4:
# xt1 is sharded `z_dim`-way, replicated `n_devices/z_dim`-way.
self.assertTrue('last_tile_dim_replicate' in
torch_xla._XLAC._get_xla_sharding_spec(t1))
self.assertTrue('[%d,1,%d]' %
(z_dim, self.n_devices //
z_dim) in torch_xla._XLAC._get_xla_sharding_spec(t1))
self.assertIn('last_tile_dim_replicate',
torch_xla._XLAC._get_xla_sharding_spec(t1))
self.assertIn('[%d,1,%d]' % (z_dim, self.n_devices // z_dim),
torch_xla._XLAC._get_xla_sharding_spec(t1))
# replicated group should share the same data content.
if (self.n_devices // z_dim) > 1:
shards = xt1.local_shards
@@ -381,14 +383,13 @@ def test_mark_sharding_partial_unordered(self):
mesh = self._get_mesh((z_dim, 1, self.n_devices // z_dim))
xt1 = xs.mark_sharding(t1, mesh, (1, None, 0))

# partial replication requires >1 devices; otherwise, it's replicated.
if self.n_devices > 1:
# partial replication requires >= 4 devices; otherwise, it's replicated.
if self.n_devices >= 4:
# xt1 is sharded `z_dim`-way, replicated `n_devices/z_dim`-way.
self.assertTrue('last_tile_dim_replicate' in
torch_xla._XLAC._get_xla_sharding_spec(t1))
self.assertTrue('[1,1,%d,%d]' %
(z_dim, self.n_devices //
z_dim) in torch_xla._XLAC._get_xla_sharding_spec(t1))
self.assertIn('last_tile_dim_replicate',
torch_xla._XLAC._get_xla_sharding_spec(t1))
self.assertIn('[1,1,%d,%d]' % (z_dim, self.n_devices // z_dim),
torch_xla._XLAC._get_xla_sharding_spec(t1))
# replicated group should share the same data content.
if (self.n_devices // z_dim) > 1:
shards = xt1.local_shards
@@ -485,14 +486,14 @@ def test_partial_replication_addmm(self):
xs.mark_sharding(xw, mesh, (None, 1))

# Check if the partial replication annotations are passed to the compiler.
# Note that partial replication requires >1 devices; otherwise, it's replicated.
if self.n_devices > 1:
self.assertTrue('last_tile_dim_replicate' in
torch_xla._XLAC._get_xla_sharding_spec(xx))
self.assertTrue('last_tile_dim_replicate' in
torch_xla._XLAC._get_xla_sharding_spec(xw))
# Note that partial replication requires >= 4 devices; otherwise, it's replicated.
if self.n_devices >= 4:
self.assertIn('last_tile_dim_replicate',
torch_xla._XLAC._get_xla_sharding_spec(xx))
self.assertIn('last_tile_dim_replicate',
torch_xla._XLAC._get_xla_sharding_spec(xw))
actual = (xx @ xw + xb).cpu()
self.assertTrue(torch.allclose(expected, actual))
self.assertTrue(torch.allclose(expected, actual, atol=1e-5))

def test_clear_sharding(self):
xt = torch.randn(2, 4, 8, 16).to(xm.xla_device())
@@ -723,10 +724,14 @@ def test_2d_tensor_3d_mesh(self):
# Meaningful test for higher-order mesh with extra replication
# requires multiple devices. Otherwise, this should defaults back to
# full replication.
if self.n_devices > 1:
if self.n_devices >= 4:
mesh = self._get_mesh((2, self.n_devices // 2, 1))
xs.mark_sharding(t1, mesh, partition_spec=(2, 1))
sharding_annotation = 'sharding={devices=[1,%d,2]' % (self.n_devices // 2)
elif self.n_devices == 2:
mesh = self._get_mesh((2, 1, 1))
xs.mark_sharding(t1, mesh, partition_spec=(2, 1))
sharding_annotation = "sharding={replicated}"
else:
mesh = self._get_mesh((1, 1, 1))
xs.mark_sharding(t1, mesh, partition_spec=(2, 1))
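
An illustrative sketch (not part of this diff) of the mesh shapes the updated test_xla_sharding_type distinguishes; it assumes an SPMD-enabled runtime and the torch_xla.distributed.spmd API:

import torch
import torch_xla.core.xla_model as xm
import torch_xla.runtime as xr
import torch_xla.distributed.spmd as xs

xr.use_spmd()
n_devices = xr.global_runtime_device_count()

# Mirrors the updated test: mesh is (2, 2) for 4 devices, (2, 1) for 2, (1, 1) for 1.
x_dim = 2 if n_devices >= 2 else 1
mesh = xs.Mesh(list(range(n_devices)), (x_dim, n_devices // x_dim))

t = torch.randn(10, 20).to(xm.xla_device())
xt = xs.mark_sharding(t, mesh, (0, 1))  # TILED when n_devices >= 2, REPLICATED otherwise
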
2 changes: 0 additions & 2 deletions test/test_core_aten_ops.py
@@ -16,8 +16,6 @@ def diff_output(testcase, output1, output2, rtol, atol, equal_nan=True):
output2_cpu = output2.detach().cpu()
if output2_cpu.dtype != output1.dtype:
output2_cpu = output2_cpu.to(output1.dtype)
# import pdb
# pdb.set_trace()
testcase.assertTrue(
torch.allclose(
output1, output2_cpu, atol=atol, rtol=rtol, equal_nan=equal_nan))
11 changes: 9 additions & 2 deletions test/test_operations.py
@@ -237,9 +237,14 @@ def forward(self, x):
return F.log_softmax(x, dim=1)


@unittest.skipIf(
xr.device_type() == 'CUDA',
'Parallelism for DataParallel uses multi-threads. But cuda assumes one GPU device per process instead of relying on threads.'
)
class TestParallelTensorMNIST(test_utils.XlaTestCase):

def test(self):
# devices=['xla:0', 'xla:1', 'xla:2', 'xla:3'] for example.
devices = xm.get_xla_supported_devices()
batch_size = xu.getenv_as('BATCH_SIZE', int, defval=8)
sample_count = xu.getenv_as('SAMPLE_COUNT', int, defval=10)
@@ -267,6 +272,10 @@ def loop_fn(model, loader, device, context):
model_parallel(loop_fn, train_loader)


@unittest.skipIf(
xr.device_type() == 'CUDA',
'Parallelism for DataParallel uses multi-threads. But cuda assumes one GPU device per process instead of relying on threads.'
)
class TestParallelTensorResnet18(test_utils.XlaTestCase):

def test(self):
@@ -1247,8 +1256,6 @@ def test_fn(a):

self.runAtenTest(torch.zeros([4, 4]), test_fn)

@unittest.skipIf(xr.device_type() == 'GPU',
"This test fails only on GPU with 07/05 XLA pin update.")
def test_stack_pred(self):

def test_fn(a):
12 changes: 11 additions & 1 deletion torch_xla/core/xla_model.py
@@ -87,7 +87,7 @@ def get_xla_supported_devices(devkind=None, max_devices=None):
that kind.

Returns:
The list of device strings.
The list of device strings such as ['xla:0', 'xla:1', ...]
"""
# TODO(wcromar): Remove `devkind` after 2.3 release cut. We no longer support
# multiple device types.
@@ -217,6 +217,14 @@ def _xla_real_device(device):


def xla_real_devices(devices: Optional[List[torch.device]] = None):
"""Returns the real devices' name.

Args:
devices: The list of torch devices such as ['xla:0', 'xla:1'].

Returns:
A list of real devices' name such as ['CUDA:0', 'CUDA:1'].
"""
if not devices:
devices = get_xla_supported_devices()

@@ -257,6 +265,7 @@ def xla_replication_devices(local_devices):
format(len(local_devices), len(kind_devices)))
replication_devices = []
for device in torch_xla._XLAC._xla_get_all_devices():
# device is like 'CUDA:0'
xdev = parse_xla_device(device)
if not xdev:
raise RuntimeError('Invalid device format: {}'.format(device))
@@ -284,6 +293,7 @@ def set_replication(device, devices):
devctx = _get_device_context(device=device)
devices = [str(x) for x in devices]
if devices:
# sample replication_devices: ['CUDA:0', 'CUDA:1', 'CUDA:2', 'CUDA:3']
replication_devices = xla_replication_devices(devices)
torch_xla._XLAC._xla_set_replication_devices(replication_devices)
devctx.device_index = devices.index(device)
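
A small usage sketch (not part of this diff) of the mapping the new docstrings describe, assuming a CUDA PJRT runtime with two visible GPUs; the printed values are illustrative:

import torch_xla.core.xla_model as xm

logical_devices = xm.get_xla_supported_devices()  # e.g. ['xla:0', 'xla:1']
real_devices = xm.xla_real_devices(logical_devices)  # e.g. ['CUDA:0', 'CUDA:1']
print(logical_devices, real_devices)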