Commit 9dfce88

[XPU] support python streams api for xpu

1 parent cf30d11 commit 9dfce88

File tree

17 files changed: +578 -37 lines changed

paddle/fluid/pybind/tensor.cc

Lines changed: 1 addition & 1 deletion

@@ -902,7 +902,7 @@ void BindTensor(pybind11::module &m) {  // NOLINT
           const auto &device_id =
               paddle::platform::GetXPUCurrentDeviceId();
           auto stream = paddle::platform::get_current_stream(device_id);
-          xpu_wait(stream);
+          xpu_wait(stream->raw_stream());
           int type_idx = static_cast<int>(self.type());
           size_t data_size = self.numel() *
                              framework::SizeOfType(
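
This call site adapts to the signature change in xpu_streams_py.h below: get_current_stream now returns a phi::XPUStreamHandle* wrapper instead of a raw XPUStream, so the raw stream has to be unwrapped with raw_stream() before it is handed to xpu_wait. The wrapper is also what the Python side receives; a minimal sketch of inspecting it, assuming the private binding is reachable via paddle.base.core (the module path is an assumption; only the binding name appears in this commit):

    import paddle

    if paddle.is_compiled_with_xpu():
        from paddle.base import core  # assumed location of the private bindings

        handle = core._xpu_get_current_stream(0)  # phi::XPUStreamHandle wrapper
        print(handle.idx)              # pool index of the stream
        print(hex(handle.xpu_stream))  # address of the underlying raw XPUStream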

paddle/fluid/pybind/xpu_streams_py.cc

Lines changed: 151 additions & 15 deletions

@@ -33,19 +33,24 @@ namespace py = pybind11;
 namespace paddle {
 namespace platform {
 #ifdef PADDLE_WITH_XPU
-XPUStream get_current_stream(int device_id) {
-  if (device_id == -1) {
-    device_id = phi::backends::xpu::GetXPUCurrentDeviceId();
-  }
+phi::XPUStreamHandle *get_current_stream(int device_id) {
+  auto handle = new phi::XPUStreamHandle();
+  return handle;
+}
+
+phi::XPUStreamHandle *set_current_stream(int idx) {
+  int device_id = phi::backends::xpu::GetXPUCurrentDeviceId();
+  auto original_stream = get_current_stream(device_id);
   auto place = phi::XPUPlace(device_id);
   auto *dev_ctx = static_cast<phi::XPUContext *>(
       phi::DeviceContextPool::Instance().Get(place));
-  dev_ctx->Wait();
-  return dev_ctx->stream();
+  dev_ctx->SetCurrentStream(idx);
+  return original_stream;
 }
 
 #endif
 }  // namespace platform
+
 namespace pybind {
 void BindXpuStream(py::module *m_ptr) {
   auto &m = *m_ptr;
@@ -69,7 +74,7 @@ void BindXpuStream(py::module *m_ptr) {
 #endif
   });
   m.def(
-      "_get_current_stream",
+      "_xpu_get_current_stream",
       [](int device_id) {
 #ifdef PADDLE_WITH_XPU
         if (device_id == -1) {
@@ -79,7 +84,19 @@ void BindXpuStream(py::module *m_ptr) {
         return platform::get_current_stream(device_id);
 #else
         PADDLE_THROW(
-            common::errors::Unavailable("Paddle is not compiled with CUDA. "
+            common::errors::Unavailable("Paddle is not compiled with XPU. "
+                                        "Cannot visit device synchronize."));
+#endif
+      },
+      py::return_value_policy::reference);
+  m.def(
+      "_xpu_set_current_stream",
+      [](int stream_id) {
+#ifdef PADDLE_WITH_XPU
+        return platform::set_current_stream(stream_id);
+#else
+        PADDLE_THROW(
+            common::errors::Unavailable("Paddle is not compiled with XPU. "
                                         "Cannot visit device synchronize."));
 #endif
       },
@@ -101,11 +118,11 @@ void BindXpuStream(py::module *m_ptr) {
   });
 
 #ifdef PADDLE_WITH_XPU
-  py::class_<XPUStream>(m, "XPUStream", R"DOC(
+  py::class_<phi::XPUStreamHandle>(m, "XPUStream", R"DOC(
       The handle of the CUDA stream.
 
       Parameters:
-        device(paddle.CUDAPlace()|int|None, optional): The device which wanted to allocate the stream.
+        device(paddle.XPUPlace()|int|None, optional): The device which wanted to allocate the stream.
         If device is None or negative integer, device will be the current device.
         If device is positive integer, it must less than the device count. Default: None.
         priority(int|None, optional): The priority of stream. The priority can be 1(high) or 2(normal).
@@ -116,14 +133,115 @@ void BindXpuStream(py::module *m_ptr) {
 
             >>> # doctest: +REQUIRES(env:GPU)
             >>> import paddle
-            >>> s1 = paddle.device.cuda.Stream(paddle.CUDAPlace(0), 1)
-            >>> s2 = paddle.device.cuda.Stream(0, 1)
-            >>> s3 = paddle.device.cuda.Stream()
+            >>> s1 = paddle.device.xpu.Stream(paddle.XPUPlace(0))
+            >>> s2 = paddle.device.xpu.Stream(0)
+            >>> s3 = paddle.device.xpu.Stream()
+
+  )DOC")
+      .def("__init__",
+           [](phi::XPUStreamHandle &self) {
+             new (&self) phi::XPUStreamHandle();
+           })
+      .def_property_readonly(
+          "xpu_stream",
+          [](phi::XPUStreamHandle &self) {
+            return reinterpret_cast<std::uintptr_t>(self.raw_stream());
+          })
+      .def("wait_stream",
+           [](phi::XPUStreamHandle &self, phi::XPUStreamHandle &other) {
+             auto *dev_ctx = phi::get_xpu_context();
+             dev_ctx->StreamWaitStreamInPool(self.id(), other.id());
+           })
+      .def("wait_event",
+           [](phi::XPUStreamHandle &self, phi::XPUEventHandle &other) {
+             self.wait_event(other.get_event());
+           })
+      .def("__init__",
+           [](phi::XPUStreamHandle &self, phi::XPUPlace *place) {
+             if (place == nullptr) {
+               int curr_device_id = platform::GetXPUCurrentDeviceId();
+               auto place_tmp = phi::XPUPlace(curr_device_id);
+               new (&self) phi::XPUStreamHandle(place_tmp);
+             } else {
+               new (&self) phi::XPUStreamHandle(*place);
+             }
+           })
+      .def(
+          "__init__",
+          [](phi::XPUStreamHandle &self, int device) {
+            if (device < 0) {
+              device = platform::GetXPUCurrentDeviceId();
+            }
+            auto place_tmp = phi::XPUPlace(device);
+            new (&self) phi::XPUStreamHandle(place_tmp);
+          },
+          py::arg("device") = -1)
+      .def_property_readonly(
+          "place",
+          [](phi::XPUStreamHandle &self) {
+            return phi::XPUPlace(platform::GetXPUCurrentDeviceId());
+          })
+      .def_property_readonly(
+          "idx", [](phi::XPUStreamHandle &self) { return self.id(); });
+  py::class_<phi::XPUEventHandle>(m, "XPUEvent", R"DOC(
+      The handle of the XPU event.
+
+      Parameters:
+        enable_timing(bool, optional): Whether the event will measure time. Default: False.
+        blocking(bool, optional): Whether the wait() func will be blocking. Default: False;
+        interprocess(bool, optional): Whether the event can be shared between processes. Default: False.
+
+      Examples:
+        .. code-block:: python
+
+            >>> # doctest: +REQUIRES(env:XPU)
+            >>> import paddle
+            >>> event = paddle.device.xpu.Event()
+
+  )DOC")
+      .def("__init__",
+           [](phi::XPUEventHandle &self) { new (&self) phi::XPUEventHandle(); })
+      .def(
+          "record",
+          [](phi::XPUEventHandle &self, phi::XPUStreamHandle *stream) {
+            if (stream == nullptr) {
+              auto stream_handle = phi::get_current_stream_handle();
+              self.record(stream_handle.raw_stream());
+            } else {
+              self.record(stream->raw_stream());
+            }
+          },
+          py::arg("stream") = nullptr)
+      .def("query", [](phi::XPUEventHandle &self) { return self.query(); })
+      .def("elapsed_time",
+           [](phi::XPUEventHandle &self) {
+             PADDLE_THROW(common::errors::Unavailable(
+                 "XPUEvent elapsed_time is not supported now"));
+           })
+      .def("synchronize",
+           [](phi::XPUEventHandle &self) { self.synchronize(); });
+
+  py::class_<phi::XPUCUDAStream>(m, "XPUCUDAStream", R"DOC(
+      The handle of the XPU stream.
+
+      Parameters:
+        device(paddle.XPUPlace()|int|None, optional): The device which wanted to allocate the stream.
+        If device is None or negative integer, device will be the current device.
+        If device is positive integer, it must less than the device count. Default: None.
+
+      Examples:
+        .. code-block:: python
+
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> s1 = paddle.device.xpu.Stream(paddle.XPUPlace(0), 1)
+            >>> s2 = paddle.device.xpu.Stream(0, 1)
+            >>> s3 = paddle.device.xpu.Stream()
 
   )DOC")
       .def(
           "synchronize",
-          [](XPUStream &self) { xpu_wait(self); },
+          [](phi::XPUCUDAStream &self) { self.Synchronize(); },
          R"DOC(
       Waits for stream tasks to complete.
 
@@ -135,7 +253,25 @@ void BindXpuStream(py::module *m_ptr) {
             >>> s = paddle.device.cuda.Stream(paddle.CUDAPlace(0), 1)
             >>> s.synchronize()
 
-  )DOC");
+  )DOC")
+      .def("__init__",
+           [](phi::XPUCUDAStream &self, phi::XPUPlace *place, int priority) {
+             if (priority != 1 && priority != 2) {
+               PADDLE_THROW(common::errors::InvalidArgument(
+                   "Priority should be 1(high) or 2(normal) "));
+             }
+             auto stream_flag =
+                 phi::XPUCUDAStream::StreamFlag::kStreamNonBlocking;
+             if (place == nullptr) {
+               int curr_device_id = platform::GetXPUCurrentDeviceId();
+               auto place_tmp = phi::XPUPlace(curr_device_id);
+               new (&self)
+                   phi::XPUCUDAStream(place_tmp, priority - 2, stream_flag);
+             } else {
+               new (&self)
+                   phi::XPUCUDAStream(*place, priority - 2, stream_flag);
+             }
+           });
 #endif
 }
 }  // namespace pybind
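
Together these bindings give XPU a CUDA-style Python streams API: XPUStream wraps a stream-pool entry with wait_stream/wait_event, XPUEvent supports record/query/synchronize (with elapsed_time explicitly unsupported), and XPUCUDAStream keeps the priority-based constructor and synchronize of the CUDA binding. A minimal usage sketch, assuming the classes are exposed as paddle.device.xpu.Stream and paddle.device.xpu.Event as the docstrings above suggest (the Python wrapper layer is not part of this file):

    import paddle

    if paddle.is_compiled_with_xpu():
        paddle.set_device("xpu:0")

        s1 = paddle.device.xpu.Stream()   # stream on the current device
        s2 = paddle.device.xpu.Stream(0)  # stream on device 0
        e = paddle.device.xpu.Event()

        e.record(s1)        # record the event on s1 (defaults to the current stream)
        s2.wait_event(e)    # s2 will not run past this point until e completes
        s2.wait_stream(s1)  # or make s2 wait on all work already queued on s1
        e.synchronize()     # block the host until the recorded work finishes
        print(e.query())    # True once the event has completed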

paddle/fluid/pybind/xpu_streams_py.h

Lines changed: 4 additions & 1 deletion

@@ -18,9 +18,11 @@
 #include "pybind11/stl.h"
 
 #ifdef PADDLE_WITH_XPU
+#include "paddle/phi/backends/xpu/xpu_context.h"
 #include "paddle/phi/core/xpu_cuda_stream.h"
 #include "xpu/runtime.h"
 #include "xpu/runtime_ex.h"
+
 #else
 namespace phi {
 class XPUCUDAStream {};
@@ -32,7 +34,8 @@ namespace py = pybind11;
 namespace paddle {
 namespace platform {
 #ifdef PADDLE_WITH_XPU
-XPUStream get_current_stream(int device_id = -1);
+phi::XPUStreamHandle* get_current_stream(int device_id = -1);
+phi::XPUStreamHandle* set_current_stream(int idx);
 #endif
 }  // namespace platform
 namespace pybind {
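
The header now declares both halves of the current-stream protocol: get_current_stream(device_id) returns a heap-allocated handle, and set_current_stream(idx) installs the pool stream at index idx on the current device's XPUContext and returns the handle that was current before, so callers can restore it. A round-trip sketch against the private bindings, assuming they are importable from paddle.base.core (an assumption; only the binding names _xpu_get_current_stream and _xpu_set_current_stream appear in this commit):

    import paddle

    if paddle.is_compiled_with_xpu():
        from paddle.base import core  # assumed location of the private bindings

        s = paddle.device.xpu.Stream(0)             # fresh stream from the pool
        prev = core._xpu_set_current_stream(s.idx)  # install it, keep the old handle
        try:
            ...  # kernels launched here target the newly current stream
        finally:
            core._xpu_set_current_stream(prev.idx)  # restore the previous stream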

paddle/phi/api/include/tensor.h

Lines changed: 9 additions & 0 deletions

@@ -29,6 +29,11 @@ using gpuStream_t = cudaStream_t;
 using gpuStream_t = hipStream_t;
 #endif
 
+#ifdef PADDLE_WITH_XPU
+#include "xpu/runtime.h"
+#include "xpu/runtime_ex.h"
+#endif
+
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
 #include "paddle/phi/backends/stream.h"
 #endif
@@ -434,6 +439,10 @@ class PADDLE_API Tensor final {
    * @return gpuStream_t
    */
  gpuStream_t stream() const;
+#elif defined(PADDLE_WITH_XPU)
+
+  void record_stream(XPUStream stream) const;
+
 #elif defined(PADDLE_WITH_CUSTOM_DEVICE)
   /**
    * @brief Get the stream where the tensor is currently located

paddle/phi/api/lib/tensor.cc

Lines changed: 10 additions & 0 deletions

@@ -40,6 +40,8 @@ limitations under the License. */
 #include "paddle/phi/core/tensor_meta.h"
 #include "paddle/phi/core/tensor_utils.h"
 
+#include "paddle/phi/core/memory/malloc.h"
+
 namespace paddle {
 
 using DeviceContextPool = experimental::DeviceContextPool;
@@ -394,6 +396,14 @@ Tensor Tensor::slice(int64_t begin_idx, int64_t end_idx) const {
 
 const std::shared_ptr<phi::TensorBase> &Tensor::impl() const { return impl_; }
 
+#ifdef PADDLE_WITH_XPU
+
+void Tensor::record_stream(XPUStream stream) const {
+  paddle::memory::RecordStream(
+      std::dynamic_pointer_cast<phi::DenseTensor>(impl_)->Holder(), stream);
+}
+
+#endif
 void Tensor::set_impl(const std::shared_ptr<phi::TensorBase> &impl) {
   impl_ = impl;
 }
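
Tensor::record_stream ties the tensor's allocation to a stream through paddle::memory::RecordStream, so the allocator will not recycle the buffer until the work queued on that stream has finished, mirroring the CUDA record_stream semantics. Only the C++ entry point is added here; a hedged sketch of the hazard it guards against, with a hypothetical Python-level record_stream binding (not part of this commit):

    import paddle

    if paddle.is_compiled_with_xpu():
        paddle.set_device("xpu:0")
        s = paddle.device.xpu.Stream()
        x = paddle.randn([1024])
        # ... enqueue work on s that reads x asynchronously ...
        # x.record_stream(s)  # hypothetical binding: pins x's allocation
        #                     # until s finishes with it
        del x  # without record_stream, the pool could hand x's memory to a
               # new allocation while s is still reading it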
