Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion paddle/phi/kernels/funcs/cross_entropy.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ struct TolerableValue<phi::dtype::bfloat16> {
template <typename DeviceContext, typename T>
class CrossEntropyFunctor {
public:
void operator()(const DeviceContext& context,
void operator()(const DeviceContext& dev_ctx,
phi::DenseTensor* out,
const phi::DenseTensor* prob,
const phi::DenseTensor* labels,
Expand Down
4 changes: 2 additions & 2 deletions paddle/phi/kernels/funcs/fc_functor.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ namespace funcs {
template <typename DeviceContext, typename T>
class FCFunctor {
public:
void operator()(const DeviceContext& context,
void operator()(const DeviceContext& dev_ctx,
const int M,
const int N,
const int K,
Expand All @@ -40,7 +40,7 @@ class FCFunctor {
template <typename DeviceContext, typename T>
class FCInt8Functor {
public:
void operator()(const DeviceContext& context,
void operator()(const DeviceContext& dev_ctx,
const int M,
const int N,
const int K,
Expand Down
8 changes: 4 additions & 4 deletions paddle/phi/kernels/funcs/math/unpooling.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,15 @@ namespace math {
// Functor declaration parameterised on the device context type and the
// element type T. Only the call-operator signature is visible here; the
// definition is not part of this excerpt.
// The operator receives the device context, two read-only tensors (`input`,
// `indices`) and a writable `output` tensor.
// NOTE(review): judging by the name this is presumably 2-D max-unpooling
// driven by `indices` — confirm against the definition.
// NOTE(review): the two consecutive `void operator()(` lines differ only in
// the context parameter name (`context` vs `dev_ctx`); they appear to be the
// before/after pair of a rename captured from a diff view.
template <typename DeviceContext, typename T>
class Unpool2dMaxFunctor {
public:
void operator()(const DeviceContext& context,
void operator()(const DeviceContext& dev_ctx,
const phi::DenseTensor& input,
const phi::DenseTensor& indices,
phi::DenseTensor* output);
};
template <typename DeviceContext, class T>
class Unpool2dMaxGradFunctor {
public:
void operator()(const DeviceContext& context,
void operator()(const DeviceContext& dev_ctx,
const phi::DenseTensor& input,
const phi::DenseTensor& indices,
const phi::DenseTensor& output,
Expand All @@ -41,15 +41,15 @@ class Unpool2dMaxGradFunctor {
// Functor declaration parameterised on the device context type and the
// element type T. Only the call-operator signature is visible here; the
// definition is not part of this excerpt.
// The operator receives the device context, two read-only tensors (`input`,
// `indices`) and a writable `output` tensor.
// NOTE(review): judging by the name this is presumably the 3-D counterpart of
// Unpool2dMaxFunctor — confirm against the definition.
// NOTE(review): the two consecutive `void operator()(` lines differ only in
// the context parameter name (`context` vs `dev_ctx`); they appear to be the
// before/after pair of a rename captured from a diff view.
template <typename DeviceContext, typename T>
class Unpool3dMaxFunctor {
public:
void operator()(const DeviceContext& context,
void operator()(const DeviceContext& dev_ctx,
const phi::DenseTensor& input,
const phi::DenseTensor& indices,
phi::DenseTensor* output);
};
template <typename DeviceContext, class T>
class Unpool3dMaxGradFunctor {
public:
void operator()(const DeviceContext& context,
void operator()(const DeviceContext& dev_ctx,
const phi::DenseTensor& input,
const phi::DenseTensor& indices,
const phi::DenseTensor& output,
Expand Down
20 changes: 10 additions & 10 deletions paddle/phi/kernels/funcs/math_function.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,70 +43,70 @@ void BatchTranspose(T* output,
// Transpose functor declaration parameterised on device context and element
// type T. Unlike `Transpose` it has no compile-time rank parameter; the
// permutation is passed at run time through `axis`.
// Only the signature is visible here; the definition is not part of this
// excerpt.
// NOTE(review): the two consecutive `void operator()(` lines differ only in
// the context parameter name (`context` vs `dev_ctx`); they appear to be the
// before/after pair of a rename captured from a diff view.
template <typename DeviceContext, typename T>
struct TransposeNormal {
// Original author's note: intended for the "dims >= 7" case.
void operator()(const DeviceContext& context,
void operator()(const DeviceContext& dev_ctx,
const phi::DenseTensor& in,
phi::DenseTensor* out,
const std::vector<int>& axis);
};

// Transpose functor declaration parameterised on device context, element
// type T and a compile-time tensor rank `Rank`.
// The call operator reads `in`, writes `out`, and takes the axis order as a
// vector of ints. The visible part of the definition performs the work as an
// Eigen `shuffle` of a Rank-dimensional view of `in` assigned into `out`.
// NOTE(review): how `axis` is turned into the shuffle permutation is outside
// this excerpt — confirm against the full definition.
// NOTE(review): the two consecutive `void operator()(` lines differ only in
// the context parameter name (`context` vs `dev_ctx`); they appear to be the
// before/after pair of a rename captured from a diff view.
template <typename DeviceContext, typename T, int Rank>
struct Transpose {
void operator()(const DeviceContext& context,
void operator()(const DeviceContext& dev_ctx,
const phi::DenseTensor& in,
phi::DenseTensor* out,
const std::vector<int>& axis);
};

// Functor declaration that fills `tensor` with the value `num`.
// The generic definition (in math_function_impl.h) flattens the tensor to an
// Eigen vector and assigns a constant expression on the context's Eigen
// device.
// NOTE(review): the two consecutive `void operator()(` lines differ only in
// the context parameter name (`context` vs `dev_ctx`); they appear to be the
// before/after pair of a rename captured from a diff view.
template <typename DeviceContext, typename T>
struct SetConstant {
void operator()(const DeviceContext& context,
void operator()(const DeviceContext& dev_ctx,
phi::DenseTensor* tensor,
T num);
};

// Partial specialization of SetConstant for phi::XPUContext; only compiled
// when PADDLE_WITH_XPU is defined. Same call signature as the generic
// functor: fills `tensor` with `num`.
// NOTE(review): the two consecutive `void operator()(` lines differ only in
// the context parameter name (`context` vs `dev_ctx`); they appear to be the
// before/after pair of a rename captured from a diff view.
#ifdef PADDLE_WITH_XPU
template <typename T>
struct SetConstant<phi::XPUContext, T> {
void operator()(const phi::XPUContext& context,
void operator()(const phi::XPUContext& dev_ctx,
phi::DenseTensor* tensor,
T num);
};
#endif

// Function template declaration parameterised on a `Place` type. Takes the
// base phi::DeviceContext, a writable tensor and a float `value`.
// Only the declaration is visible here.
// NOTE(review): presumably sets every element of `tensor` to `value` for the
// given place type — confirm against the definition.
// NOTE(review): the two consecutive `void set_constant_with_place(` lines
// differ only in the context parameter name (`context` vs `dev_ctx`); they
// appear to be the before/after pair of a rename captured from a diff view.
template <typename Place>
void set_constant_with_place(const phi::DeviceContext& context,
void set_constant_with_place(const phi::DeviceContext& dev_ctx,
phi::DenseTensor* tensor,
float value);

// Non-template declaration taking the base phi::DeviceContext, a writable
// tensor and a float `value`. Only the declaration is visible here.
// NOTE(review): presumably dispatches on the context's place and sets every
// element of `tensor` to `value` — confirm against the definition.
// NOTE(review): the two consecutive `void set_constant(` lines differ only in
// the context parameter name (`context` vs `dev_ctx`); they appear to be the
// before/after pair of a rename captured from a diff view.
void set_constant(const phi::DeviceContext& context,
void set_constant(const phi::DeviceContext& dev_ctx,
phi::DenseTensor* tensor,
float value);

// Functor declaration parameterised on device context and element type T.
// The call operator reads `input` and `vec` and writes `output`. Only the
// signature is visible here.
// NOTE(review): by its name this presumably adds `vec` to each row of
// `input` — confirm against the definition.
// NOTE(review): the two consecutive `void operator()(` lines differ only in
// the context parameter name (`context` vs `dev_ctx`); they appear to be the
// before/after pair of a rename captured from a diff view.
template <typename DeviceContext, typename T>
struct RowwiseAdd {
void operator()(const DeviceContext& context,
void operator()(const DeviceContext& dev_ctx,
const phi::DenseTensor& input,
const phi::DenseTensor& vec,
phi::DenseTensor* output);
};

// Functor declaration: reduces `input` into `vec`.
// The generic definition (in math_function_impl.h) views `input` as an Eigen
// matrix and assigns its sum over dimension 0 to the flattened output; a
// separate phi::CPUContext specialization also exists there.
// Note that the definition names the output parameter `out`, not `vec`.
// NOTE(review): the two consecutive `void operator()(` lines differ only in
// the context parameter name (`context` vs `dev_ctx`); they appear to be the
// before/after pair of a rename captured from a diff view.
template <typename DeviceContext, typename T>
struct ColwiseSum {
void operator()(const DeviceContext& context,
void operator()(const DeviceContext& dev_ctx,
const phi::DenseTensor& input,
phi::DenseTensor* vec);
};

// Functor declaration: reduces `input` into `vec`.
// The generic definition (in math_function_impl.h) views `input` as an Eigen
// matrix and assigns its sum over dimension 1 to the flattened output; a
// separate phi::CPUContext specialization also exists there.
// Note that the definition names the output parameter `out`, not `vec`.
// NOTE(review): the two consecutive `void operator()(` lines differ only in
// the context parameter name (`context` vs `dev_ctx`); they appear to be the
// before/after pair of a rename captured from a diff view.
template <typename DeviceContext, typename T>
struct RowwiseSum {
void operator()(const DeviceContext& context,
void operator()(const DeviceContext& dev_ctx,
const phi::DenseTensor& input,
phi::DenseTensor* vec);
};

// Functor declaration: reduces `input` into `vec`.
// The generic definition (in math_function_impl.h) views `input` as an Eigen
// matrix and assigns its mean over dimension 1 to the flattened output; a
// separate phi::CPUContext specialization also exists there.
// Note that the definition names the output parameter `out`, not `vec`.
// NOTE(review): the two consecutive `void operator()(` lines differ only in
// the context parameter name (`context` vs `dev_ctx`); they appear to be the
// before/after pair of a rename captured from a diff view.
template <typename DeviceContext, typename T>
struct RowwiseMean {
void operator()(const DeviceContext& context,
void operator()(const DeviceContext& dev_ctx,
const phi::DenseTensor& input,
phi::DenseTensor* vec);
};
Expand Down
38 changes: 19 additions & 19 deletions paddle/phi/kernels/funcs/math_function_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,26 +26,26 @@ namespace funcs {
using phi::To32BitIndex;

// Generic SetConstant definition: fills every element of `tensor` with `num`.
// NOTE(review): this block contains two before/after line pairs that differ
// only in the context identifier (`context` vs `dev_ctx`): the signature
// line and the `.device(...)` assignment. They appear to be the removed and
// added lines of a rename captured from a diff view.
template <typename DeviceContext, typename T>
void SetConstant<DeviceContext, T>::operator()(const DeviceContext& context,
void SetConstant<DeviceContext, T>::operator()(const DeviceContext& dev_ctx,
phi::DenseTensor* tensor,
T num) {
// View the tensor as a flat Eigen vector so one expression covers all
// elements.
auto t = phi::EigenVector<T>::Flatten(*tensor);
// Evaluate the constant-fill expression on the context's Eigen device.
t.device(*context.eigen_device()) = t.constant(static_cast<T>(num));
t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(num));
}

// XPU SetConstant definition; only compiled when PADDLE_WITH_XPU is defined.
// Instead of an Eigen expression it hands a TensorSetConstantXPU<T> visitor
// (built from the tensor, the value and the context's place) to
// phi::VisitDataType, keyed on the tensor's runtime dtype.
// NOTE(review): TensorSetConstantXPU is defined outside this excerpt; what it
// does per dtype should be confirmed there.
// NOTE(review): this block contains two before/after line pairs that differ
// only in the context identifier (`context` vs `dev_ctx`): the signature
// line and the TensorSetConstantXPU argument line. They appear to be the
// removed and added lines of a rename captured from a diff view.
#ifdef PADDLE_WITH_XPU
template <typename T>
void SetConstant<phi::XPUContext, T>::operator()(const phi::XPUContext& context,
void SetConstant<phi::XPUContext, T>::operator()(const phi::XPUContext& dev_ctx,
phi::DenseTensor* tensor,
T num) {
phi::VisitDataType(tensor->dtype(),
TensorSetConstantXPU<T>(tensor, num, context.GetPlace()));
TensorSetConstantXPU<T>(tensor, num, dev_ctx.GetPlace()));
}
#endif

template <typename DeviceContext, typename T, int Rank>
void Transpose<DeviceContext, T, Rank>::operator()(
const DeviceContext& context,
const DeviceContext& dev_ctx,
const phi::DenseTensor& in,
phi::DenseTensor* out,
const std::vector<int>& axis) {
Expand All @@ -55,10 +55,10 @@ void Transpose<DeviceContext, T, Rank>::operator()(
}
auto eigen_in = phi::EigenTensor<T, Rank>::From(in);
auto eigen_out = phi::EigenTensor<T, Rank>::From(*out);
auto* dev = context.eigen_device();
auto* dev = dev_ctx.eigen_device();
// use 32bit index to speed up computation
bool use_32bit_index = eigen_out.size() < Eigen::NumTraits<int>::highest();
bool is_gpu_place = context.GetPlace().GetType() == phi::AllocationType::GPU;
bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU;
if (use_32bit_index && is_gpu_place) {
To32BitIndex(eigen_out).device(*dev) =
To32BitIndex(eigen_in).shuffle(permute);
Expand All @@ -68,7 +68,7 @@ void Transpose<DeviceContext, T, Rank>::operator()(
}

template <typename DeviceContext, typename T>
void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& dev_ctx,
const phi::DenseTensor& input,
phi::DenseTensor* out) {
auto in_dims = input.dims();
Expand All @@ -85,7 +85,7 @@ void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
auto in = phi::EigenMatrix<T>::From(input);
auto vec = phi::EigenVector<T>::Flatten(*out);

vec.device(*context.eigen_device()) = in.sum(Eigen::array<int, 1>({{0}}));
vec.device(*dev_ctx.eigen_device()) = in.sum(Eigen::array<int, 1>({{0}}));
}

// Specialize for CPU, since Eigen implements a general reduce. However,
Expand All @@ -94,7 +94,7 @@ void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
template <typename T>
class ColwiseSum<phi::CPUContext, T> {
public:
void operator()(const phi::CPUContext& context,
void operator()(const phi::CPUContext& dev_ctx,
const phi::DenseTensor& input,
phi::DenseTensor* out) {
auto& in_dims = input.dims();
Expand All @@ -110,7 +110,7 @@ class ColwiseSum<phi::CPUContext, T> {
size,
out->numel()));

T* out_buf = context.template Alloc<T>(out);
T* out_buf = dev_ctx.template Alloc<T>(out);
const T* in_buf = input.data<T>();

for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
Expand All @@ -126,7 +126,7 @@ class ColwiseSum<phi::CPUContext, T> {
};

template <typename DeviceContext, typename T>
void RowwiseMean<DeviceContext, T>::operator()(const DeviceContext& context,
void RowwiseMean<DeviceContext, T>::operator()(const DeviceContext& dev_ctx,
const phi::DenseTensor& input,
phi::DenseTensor* out) {
auto in_dims = input.dims();
Expand All @@ -148,7 +148,7 @@ void RowwiseMean<DeviceContext, T>::operator()(const DeviceContext& context,
auto in = phi::EigenMatrix<T>::From(input);
auto vec = phi::EigenVector<T>::Flatten(*out);

vec.device(*context.eigen_device()) = in.mean(Eigen::array<int, 1>({{1}}));
vec.device(*dev_ctx.eigen_device()) = in.mean(Eigen::array<int, 1>({{1}}));
}
// TODO(zcd): Following ColwiseSum format, need to confirm.
// Specialize for CPU, since Eigen implements a general reduce. However,
Expand All @@ -157,7 +157,7 @@ void RowwiseMean<DeviceContext, T>::operator()(const DeviceContext& context,
template <typename T>
class RowwiseMean<phi::CPUContext, T> {
public:
void operator()(const phi::CPUContext& context,
void operator()(const phi::CPUContext& dev_ctx,
const phi::DenseTensor& input,
phi::DenseTensor* out) {
auto& in_dims = input.dims();
Expand All @@ -179,7 +179,7 @@ class RowwiseMean<phi::CPUContext, T> {
height,
out->numel()));
auto inv_size = 1.0 / size;
T* out_buf = context.template Alloc<T>(out);
T* out_buf = dev_ctx.template Alloc<T>(out);
const T* in_buf = input.data<T>();

for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
Expand All @@ -193,7 +193,7 @@ class RowwiseMean<phi::CPUContext, T> {
};

template <typename DeviceContext, typename T>
void RowwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
void RowwiseSum<DeviceContext, T>::operator()(const DeviceContext& dev_ctx,
const phi::DenseTensor& input,
phi::DenseTensor* out) {
auto in_dims = input.dims();
Expand All @@ -215,7 +215,7 @@ void RowwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
auto in = phi::EigenMatrix<T>::From(input);
auto vec = phi::EigenVector<T>::Flatten(*out);

vec.device(*context.eigen_device()) = in.sum(Eigen::array<int, 1>({{1}}));
vec.device(*dev_ctx.eigen_device()) = in.sum(Eigen::array<int, 1>({{1}}));
}
// TODO(zcd): Following ColwiseSum format, need to confirm.
// Specialize for CPU, since Eigen implements a general reduce. However,
Expand All @@ -224,7 +224,7 @@ void RowwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
template <typename T>
class RowwiseSum<phi::CPUContext, T> {
public:
void operator()(const phi::CPUContext& context,
void operator()(const phi::CPUContext& dev_ctx,
const phi::DenseTensor& input,
phi::DenseTensor* out) {
auto& in_dims = input.dims();
Expand All @@ -246,7 +246,7 @@ class RowwiseSum<phi::CPUContext, T> {
height,
out->numel()));

T* out_buf = context.template Alloc<T>(out);
T* out_buf = dev_ctx.template Alloc<T>(out);
const T* in_buf = input.data<T>();

for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
Expand Down
8 changes: 4 additions & 4 deletions paddle/phi/kernels/funcs/maxouting.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ namespace phi::funcs {

// All tensors are in NCHW or NHWC format, and the groups must be greater than 1
template <typename DeviceContext, typename T>
void MaxOutFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
void MaxOutFunctor<DeviceContext, T>::operator()(const DeviceContext& dev_ctx,
const phi::DenseTensor& input,
phi::DenseTensor* output,
const int groups,
Expand All @@ -35,7 +35,7 @@ void MaxOutFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
// c_size means the output size of each sample
int c_size = fea_size * output_channels;
const T* input_data = input.data<T>();
T* output_data = context.template Alloc<T>(output);
T* output_data = dev_ctx.template Alloc<T>(output);
for (int i = 0; i < batch_size; ++i) {
int new_bindex = c_size * i;
for (int c = 0; c < output_channels; ++c) {
Expand Down Expand Up @@ -65,7 +65,7 @@ void MaxOutFunctor<DeviceContext, T>::operator()(const DeviceContext& context,

template <typename DeviceContext, typename T>
void MaxOutGradFunctor<DeviceContext, T>::operator()(
const DeviceContext& context,
const DeviceContext& dev_ctx,
const phi::DenseTensor& input,
phi::DenseTensor* input_grad,
const phi::DenseTensor& output,
Expand All @@ -82,7 +82,7 @@ void MaxOutGradFunctor<DeviceContext, T>::operator()(
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = context.template Alloc<T>(input_grad);
T* input_grad_data = dev_ctx.template Alloc<T>(input_grad);
for (int i = 0; i < batch_size; ++i) {
int blen = fea_size * output_channels * i;
for (int c = 0; c < output_channels; ++c) {
Expand Down
12 changes: 6 additions & 6 deletions paddle/phi/kernels/funcs/maxouting.cu
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ __global__ void KernelMaxoutGrad(const int64_t nthreads,
}

template <typename DeviceContext, typename T>
void MaxOutFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
void MaxOutFunctor<DeviceContext, T>::operator()(const DeviceContext& dev_ctx,
const phi::DenseTensor& input,
phi::DenseTensor* output,
const int groups,
Expand All @@ -122,13 +122,13 @@ void MaxOutFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
const int64_t input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]);

const T* input_data = input.data<T>();
T* output_data = context.template Alloc<T>(output);
T* output_data = dev_ctx.template Alloc<T>(output);
int64_t nthreads = static_cast<int64_t>(output->numel());
int64_t blocks = static_cast<int64_t>((nthreads + 1024 - 1) / 1024);
dim3 threads(1024, 1);
dim3 grid(blocks, 1);

KernelMaxOut<T><<<grid, threads, 0, context.stream()>>>(nthreads,
KernelMaxOut<T><<<grid, threads, 0, dev_ctx.stream()>>>(nthreads,
input_data,
input_channels,
input_height,
Expand All @@ -140,7 +140,7 @@ void MaxOutFunctor<DeviceContext, T>::operator()(const DeviceContext& context,

template <typename DeviceContext, typename T>
void MaxOutGradFunctor<DeviceContext, T>::operator()(
const DeviceContext& context,
const DeviceContext& dev_ctx,
const phi::DenseTensor& input,
phi::DenseTensor* input_grad,
const phi::DenseTensor& output,
Expand All @@ -154,13 +154,13 @@ void MaxOutGradFunctor<DeviceContext, T>::operator()(
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = context.template Alloc<T>(input_grad);
T* input_grad_data = dev_ctx.template Alloc<T>(input_grad);
int64_t nthreads = static_cast<int64_t>(output.numel());
int64_t blocks = static_cast<int64_t>((nthreads + 1024 - 1) / 1024);
dim3 threads(1024, 1);
dim3 grid(blocks, 1);

KernelMaxoutGrad<T><<<grid, threads, 0, context.stream()>>>(nthreads,
KernelMaxoutGrad<T><<<grid, threads, 0, dev_ctx.stream()>>>(nthreads,
input_data,
output_data,
output_grad_data,
Expand Down
Loading
Loading