3 changes: 3 additions & 0 deletions third_party/xla_client/computation_client.h
@@ -224,6 +224,9 @@ class ComputationClient {
absl::Span<const TensorSource> tensor_shards, std::string device,
xla::Shape shape) = 0;

// Copies `data->buffer` to a buffer on the `dst` device.
virtual DataPtr CopyToDevice(DataPtr data, std::string dst) = 0;

// Reads the tensor literal values stored at TPU server sites, behind the
// supplied handles.
virtual std::vector<Literal> TransferFromServer(
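For context, a minimal caller-side sketch of the new entry point (illustrative only, not part of this diff; it assumes the existing ComputationClient::GetLocalDevices() and Data::device() accessors, and the include path may differ by build setup):

#include <string>
#include <vector>

#include "third_party/xla_client/computation_client.h"  // adjust path to your build

// Replicates `data` onto every local device with CopyToDevice; the source
// device keeps its existing buffer.
std::vector<xla::ComputationClient::DataPtr> ReplicateToLocalDevices(
    xla::ComputationClient* client, xla::ComputationClient::DataPtr data) {
  std::vector<xla::ComputationClient::DataPtr> replicas;
  for (const std::string& device : client->GetLocalDevices()) {
    replicas.push_back(device == data->device()
                           ? data  // already resident on this device
                           : client->CopyToDevice(data, device));
  }
  return replicas;
}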
22 changes: 22 additions & 0 deletions third_party/xla_client/pjrt_computation_client.cc
@@ -4,6 +4,7 @@

#include "absl/strings/ascii.h"
#include "absl/types/span.h"
#include "pjrt_computation_client.h"
#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/literal.h"
@@ -171,6 +172,27 @@ ComputationClient::DataPtr PjRtComputationClient::TransferShardsToServer(
return std::make_shared<PjRtShardedData>(device, shape, pjrt_data_shards);
}

ComputationClient::DataPtr PjRtComputationClient::CopyToDevice(
ComputationClient::DataPtr data, std::string dst) {
tensorflow::profiler::TraceMe activity(
"PjRtComputationClient::CopyToDevice",
tensorflow::profiler::TraceMeLevel::kInfo);
const PjRtData* pjrt_data = dynamic_cast<PjRtData*>(data.get());
XLA_CHECK(pjrt_data->HasValue()) << "Can't copy invalid device data.";

PjRtDevice* dst_device = StringToPjRtDevice(dst);
XLA_CHECK(dst_device->IsAddressable()) << dst << " is not addressable.";

// PjRtBuffer::CopyToDevice returns an error if the buffer is already on `dst_device`.
StatusOr<std::unique_ptr<PjRtBuffer>> status_or =
pjrt_data->buffer->CopyToDevice(dst_device);
XLA_CHECK(status_or.ok())
<< pjrt_data->device() << " buffer already exists on " << dst;

return std::make_shared<PjRtData>(dst, pjrt_data->shape(),
std::move(status_or.value()));
}

std::vector<xla::Literal> PjRtComputationClient::TransferFromServer(
absl::Span<const DataPtr> handles) {
metrics::TimedSection timed(TransferFromServerMetric());
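As the comment in CopyToDevice above notes, PjRtBuffer::CopyToDevice fails when the buffer already lives on the destination device. A hypothetical caller-side guard (not part of this diff; same includes as the sketch above) can short-circuit that case:

// Skips the copy when the data is already resident on `dst`; calling
// CopyToDevice in that case would trip the XLA_CHECK in the implementation.
xla::ComputationClient::DataPtr CopyIfNeeded(
    xla::ComputationClient* client, xla::ComputationClient::DataPtr data,
    const std::string& dst) {
  if (data->device() == dst) {
    return data;
  }
  return client->CopyToDevice(std::move(data), dst);
}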
2 changes: 2 additions & 0 deletions third_party/xla_client/pjrt_computation_client.h
@@ -32,6 +32,8 @@ class PjRtComputationClient : public ComputationClient {
DataPtr TransferShardsToServer(absl::Span<const TensorSource> tensor_shards,
std::string device, xla::Shape shape) override;

DataPtr CopyToDevice(DataPtr data, std::string dst) override;

std::vector<ComputationPtr> Compile(
std::vector<CompileInstance> instances) override;

4 changes: 4 additions & 0 deletions third_party/xla_client/xrt_computation_client.h
@@ -264,6 +264,10 @@ class XrtComputationClient : public ComputationClient {
XLA_ERROR() << __FUNCTION__ << " not implemented";
}

DataPtr CopyToDevice(DataPtr data, std::string dst) override {
XLA_ERROR() << __FUNCTION__ << " not implemented";
}

std::vector<Literal> TransferFromServer(
absl::Span<const DataPtr> handles) override;

31 changes: 3 additions & 28 deletions torch_xla/csrc/xla_sharding_util.cpp
@@ -195,38 +195,13 @@ ShardingUtil::InputHandler(
      int64_t source_device_i =
          ParseDeviceString(shards[0]->device()).ordinal();
      arguments_by_device[source_device_i][argument_i] = shards[0];

-      auto literal = std::make_shared<xla::Literal>(std::move(
-          xla::ComputationClient::Get()->TransferFromServer(shards)[0]));
-      std::vector<xla::ComputationClient::TensorSource> source_tensors;
-      for (int64_t device_i = 0; device_i < devices.size(); ++device_i) {
-        if (device_i != source_device_i) {
-          auto populate_fn =
-              [&](const xla::ComputationClient::TensorSource& source_tensor,
-                  void* dest_buffer, size_t dest_buffer_size) {
-                std::memcpy(dest_buffer, literal->untyped_data(),
-                            dest_buffer_size);
-              };
-          source_tensors.emplace_back(
-              xla::Shape(shards[0]->shape().ToProto()),
-              ParseDeviceString(absl::StrFormat(":%d", device_i)).toString(),
-              std::move(populate_fn));
-        }
-      }
-
-      std::vector<xla::ComputationClient::DataPtr> replicated_shards =
-          xla::ComputationClient::Get()->TransferToServer(source_tensors);
-      auto itr = replicated_shards.begin();
      for (int64_t device_i = 0; device_i < devices.size(); ++device_i) {
        if (device_i != source_device_i) {
-          arguments_by_device[device_i][argument_i] = *itr;
-          ++itr;
+          arguments_by_device[device_i][argument_i] =
+              xla::ComputationClient::Get()->CopyToDevice(shards[0],
+                                                          devices[device_i]);
        }
      }
-      XLA_CHECK(itr == replicated_shards.end())
-          << "Replicated arguments[" << argument_i << "] on "
-          << shards[0]->device() << " " << replicated_shards.size()
-          << " times (expected " << (devices.size() - 1) << ").";
    }
  }

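Net effect of the InputHandler hunk above (informal summary, not from the PR description): replicating an unsharded argument no longer round-trips through the host. The old path materialized the shard as a literal with TransferFromServer and re-uploaded it to each destination with TransferToServer; the new path issues one direct CopyToDevice per non-source device, which also removes the replicated_shards bookkeeping and its XLA_CHECK.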