Commit 03881b0

pritamdamania87 authored and pytorchmergebot committed
Ensure ncclCommAbort can abort stuck ncclCommInitRank (pytorch#103264)
pytorch#95715 made it possible to abort `ncclCommInitRankConfig` by specifying `blocking=0`, which enables non-blocking behavior. However, calling `pg._abort()` did not recover from a stuck `ncclCommInitRankConfig`, since `_abort` only looked through the `devNCCLCommMap_` map and aborted those communicators. Because `ncclCommInitRankConfig` was stuck, the communicator was never added to that map, and the host thread remained blocked on this line: https://github.com/pytorch/pytorch/blob/main/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp#L1171. As a result, `_abort` was a no-op.

To resolve this, I added the communicators to `inInitializationCommMap_` as soon as they are created and removed them once they have been moved into `devNCCLCommMap_`, so `_abort` can also reach communicators that are still initializing. I also added a unit test that fails without the changes to ProcessGroupNCCL.cpp.

Pull Request resolved: pytorch#103264
Approved by: https://github.com/kwen2501
1 parent 1985c49 commit 03881b0
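
The core idea of the fix is easiest to see in isolation: keep a second map of communicators that are still inside their (possibly blocking) init call, and have the abort path walk both maps. The following is a minimal standalone sketch of that pattern, not PyTorch source; FakeComm, CommCache, getOrCreate, and abortAll are made-up names, and the real NCCL init/abort calls are replaced by placeholders.

// Minimal sketch of the two-map pattern this commit adopts (illustrative only).
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

struct FakeComm {
  bool aborted = false;
  void abort() { aborted = true; }  // stand-in for ncclCommAbort
};

class CommCache {
 public:
  // Register the communicators *before* the potentially blocking init so
  // that abortAll() can reach them even if init never completes.
  std::vector<std::shared_ptr<FakeComm>>& getOrCreate(const std::string& key) {
    auto comms = std::vector<std::shared_ptr<FakeComm>>{std::make_shared<FakeComm>()};
    {
      std::lock_guard<std::mutex> lock(mutex_);
      inInitializationCommMap_.emplace(key, comms);
    }

    // ... a potentially blocking ncclCommInitRank-style call would happen here ...

    std::lock_guard<std::mutex> lock(mutex_);
    // Another thread may have already promoted this key to the main map.
    auto it = inInitializationCommMap_.find(key);
    if (it != inInitializationCommMap_.end()) {
      devCommMap_.emplace(key, std::move(it->second));
      inInitializationCommMap_.erase(it);
    }
    return devCommMap_.at(key);
  }

  // Abort everything we know about: fully initialized communicators *and*
  // the ones still stuck in initialization.
  void abortAll() {
    std::lock_guard<std::mutex> lock(mutex_);
    for (auto* map : {&devCommMap_, &inInitializationCommMap_}) {
      for (auto& kv : *map) {
        for (auto& comm : kv.second) comm->abort();
      }
    }
  }

 private:
  std::mutex mutex_;
  std::unordered_map<std::string, std::vector<std::shared_ptr<FakeComm>>> devCommMap_;
  std::unordered_map<std::string, std::vector<std::shared_ptr<FakeComm>>> inInitializationCommMap_;
};

int main() {
  CommCache cache;
  auto& comms = cache.getOrCreate("0,1");
  cache.abortAll();
  std::cout << "aborted: " << std::boolalpha << comms.front()->aborted << "\n";
}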

3 files changed: +76 −7 lines

torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp

Lines changed: 31 additions & 7 deletions
@@ -772,12 +772,14 @@ uint64_t ProcessGroupNCCL::getSequenceNumberForGroup() {
   return seq_;
 }

-// Abort all communicators on this rank
-void ProcessGroupNCCL::abort(c10::optional<std::string> abortReason) {
-  std::lock_guard<std::mutex> lock(mutex_);
+void abortCommsFromMap(
+    std::unordered_map<std::string, std::vector<std::shared_ptr<NCCLComm>>>&
+        ncclCommsMap,
+    const int rank,
+    c10::optional<std::string> abortReason) {
   // The process may control multiple devices, loop through the communicators on
   // each device
-  for (auto& it : devNCCLCommMap_) {
+  for (auto& it : ncclCommsMap) {
     auto& devName = it.first;
     auto& ncclComms = it.second;

@@ -794,11 +796,18 @@ void ProcessGroupNCCL::abort(c10::optional<std::string> abortReason) {
     // their responsibility to destroy the process group and recreate
     // it to recover from errors.

-    LOG(INFO) << "[Rank " << rank_ << "] Destroyed " << ncclComms.size()
+    LOG(INFO) << "[Rank " << rank << "] Destroyed " << ncclComms.size()
              << "communicators on CUDA device " << devName;
   }
 }

+// Abort all communicators on this rank
+void ProcessGroupNCCL::abort(c10::optional<std::string> abortReason) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  abortCommsFromMap(devNCCLCommMap_, rank_, abortReason);
+  abortCommsFromMap(inInitializationCommMap_, rank_, abortReason);
+}
+
 ProcessGroupNCCL::~ProcessGroupNCCL() {
   terminateProcessGroup_.store(true);

@@ -1160,6 +1169,11 @@ std::vector<std::shared_ptr<NCCLComm>>& ProcessGroupNCCL::getNCCLComm(
         at::cuda::getStreamFromPool(options_->is_high_priority_stream));
   }

+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    inInitializationCommMap_.emplace(devicesKey, ncclComms);
+  }
+
   // [Note 2 ]
 #ifndef NCCL_HAS_COMM_NONBLOCKING
   C10D_NCCL_CHECK(ncclGroupEnd(), c10::nullopt);
@@ -1201,8 +1215,18 @@ std::vector<std::shared_ptr<NCCLComm>>& ProcessGroupNCCL::getNCCLComm(
   ncclIdToCommMap_.emplace(buildNcclUniqueIdStr(ncclID), ncclComms);

   // Move the NCCL resource to cache
-  devNCCLCommMap_.emplace(devicesKey, std::move(ncclComms));
-  return devNCCLCommMap_[devicesKey];
+  auto it = inInitializationCommMap_.find(devicesKey);
+  // A previous thread could've already removed devicesKey from
+  // inInitializationCommMap_ and added it to devNCCLCommMap_
+  if (it != inInitializationCommMap_.end()) {
+    devNCCLCommMap_.emplace(devicesKey, std::move(it->second));
+    inInitializationCommMap_.erase(devicesKey);
+  }
+
+  it = devNCCLCommMap_.find(devicesKey);
+  TORCH_INTERNAL_ASSERT(
+      it != devNCCLCommMap_.end(), "Communicators not populated in cache!");
+  return it->second;
 }

 namespace {
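
The reason the emplace into inInitializationCommMap_ happens before the potentially blocking group-end/wait is that another thread must be able to find and abort the communicator while this thread is still stuck, which is exactly what the new test below exercises with a background _abort() thread. A standalone sketch of that interaction, assuming made-up names (StuckComm, waitForInit, the watchdog thread) and no real NCCL calls:

// Illustrative only: register the communicator first, then block; a second
// thread can then abort it and unstick the blocked initialization.
#include <chrono>
#include <condition_variable>
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <unordered_map>

struct StuckComm {
  std::mutex m;
  std::condition_variable cv;
  bool aborted = false;

  // Simulates a non-blocking-mode init that never finishes on its own:
  // it only returns once someone aborts the communicator.
  void waitForInit() {
    std::unique_lock<std::mutex> lk(m);
    cv.wait(lk, [this] { return aborted; });
  }
  void abort() {
    { std::lock_guard<std::mutex> lk(m); aborted = true; }
    cv.notify_all();
  }
};

int main() {
  std::mutex mapMutex;
  std::unordered_map<std::string, std::shared_ptr<StuckComm>> inInitialization;

  // Register first, then block: this is the ordering the commit introduces.
  auto comm = std::make_shared<StuckComm>();
  {
    std::lock_guard<std::mutex> lk(mapMutex);
    inInitialization.emplace("0,1", comm);
  }

  // Background "watchdog" thread, analogous to the abort() loop in the new test.
  std::thread watchdog([&] {
    std::this_thread::sleep_for(std::chrono::milliseconds(200));
    std::lock_guard<std::mutex> lk(mapMutex);
    for (auto& kv : inInitialization) kv.second->abort();
  });

  comm->waitForInit();  // would hang forever if abort() could not see the comm
  std::cout << "init unblocked by abort\n";
  watchdog.join();
}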

torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp

Lines changed: 4 additions & 0 deletions
@@ -640,6 +640,10 @@ class TORCH_API ProcessGroupNCCL : public Backend {
   std::unordered_map<std::string, std::vector<std::shared_ptr<NCCLComm>>>
       devNCCLCommMap_;

+  // The NCCL communicators currently in process of being initialized.
+  std::unordered_map<std::string, std::vector<std::shared_ptr<NCCLComm>>>
+      inInitializationCommMap_;
+
   // Map from ncclUniqueId to appropriate communicator.
   std::unordered_map<std::string, std::vector<std::shared_ptr<NCCLComm>>>
       ncclIdToCommMap_;
torch/testing/_internal/distributed/distributed_test.py

Lines changed: 41 additions & 0 deletions
@@ -9725,6 +9725,47 @@ def forward(self, inp):
         ddp._check_reducer_finalized()
         ddp(input)

+    @skip_if_lt_x_gpu(2)
+    @skip_but_pass_in_sandcastle_if(
+        BACKEND != "nccl",
+        "TORCH_NCCL_USE_COMM_NONBLOCKING only applies to NCCL"
+    )
+    def test_nccl_init_abort(self):
+        """
+        Tests that we can abort a NCCL communicator during initialization and
+        recover appropriately.
+        """
+        # Reinitialize global process group with TORCH_NCCL_USE_COMM_NONBLOCKING=1
+        os.environ["TORCH_NCCL_USE_COMM_NONBLOCKING"] = "1"
+        dist.destroy_process_group()
+        timeout = timedelta(seconds=1)
+        dist.init_process_group(
+            init_method=INIT_METHOD,
+            backend=BACKEND,
+            world_size=int(os.environ["WORLD_SIZE"]),
+            rank=self.rank,
+            timeout=timeout,
+        )
+
+        # Abort pg in background thread.
+        running = True
+
+        def abort():
+            pg = _get_default_group()
+            while running:
+                pg._get_backend(torch.device(0))._abort()
+                time.sleep(1)
+
+        if self.rank != 1:
+            import threading
+            t = threading.Thread(target=abort)
+            t.start()
+            with self.assertRaises(RuntimeError):
+                # First collective triggers initialization via ncclCommInitRank.
+                torch.distributed.barrier()
+            running = False
+            t.join()
+


     @skip_if_lt_x_gpu(2)
