12 changes: 6 additions & 6 deletions paddle/phi/kernels/gpu/adagrad_kernel.cu
@@ -153,7 +153,7 @@ __global__ void SparseAdagradFunctorKernel(const T* grad,

template <typename T>
struct SparseAdagradFunctor<phi::GPUContext, T> {
- void operator()(const phi::GPUContext& context,
+ void operator()(const phi::GPUContext& dev_ctx,
const phi::SelectedRows& grad,
const DenseTensor& learning_rate,
T epsilon,
@@ -162,15 +162,15 @@ struct SparseAdagradFunctor<phi::GPUContext, T> {
// 1. g_m.rows = set(g.rows)
auto grad_width = grad.value().dims()[1];
phi::funcs::scatter::MergeAdd<phi::GPUContext, T> merge_func;
- auto grad_merge = merge_func(context, grad);
+ auto grad_merge = merge_func(dev_ctx, grad);
auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
phi::Vector<int64_t> merge_rows(grad_merge.rows());
// 2. m += g_m * g_m
auto grad_square =
- SquareSelectedRows<phi::GPUContext, T>(context, grad_merge);
+ SquareSelectedRows<phi::GPUContext, T>(dev_ctx, grad_merge);

phi::funcs::SelectedRowsAddToTensor<phi::GPUContext, T> functor;
- functor(context, grad_square, moment);
+ functor(dev_ctx, grad_square, moment);

// 3. update parameter
auto* lr = learning_rate.data<T>();
@@ -185,9 +185,9 @@ struct SparseAdagradFunctor<phi::GPUContext, T> {
<<<grid2,
threads,
0,
- reinterpret_cast<const phi::GPUContext&>(context).stream()>>>(
+ reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream()>>>(
grad_merge_data,
- mixv_merge_rows.CUDAMutableData(context.GetPlace()),
+ mixv_merge_rows.CUDAMutableData(dev_ctx.GetPlace()),
lr,
param_data,
moment_data,
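For context on what this functor computes: the numbered comments (1. merge rows, 2. m += g * g, 3. update parameter) are the standard Adagrad rule, applied only to the rows touched by the sparse gradient. A minimal dense sketch of that update follows; the names are illustrative and this is not the Paddle kernel.

```cpp
// Hedged sketch, not Paddle code: the element-wise Adagrad rule that
// SparseAdagradFunctor applies to the merged gradient rows.
#include <cmath>
#include <cstddef>

void AdagradUpdate(float* param, float* moment, const float* grad,
                   float lr, float epsilon, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    moment[i] += grad[i] * grad[i];                               // m += g * g
    param[i] -= lr * grad[i] / (std::sqrt(moment[i]) + epsilon);  // update
  }
}
```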
44 changes: 22 additions & 22 deletions paddle/phi/kernels/gpu/depthwise_conv.h
@@ -133,7 +133,7 @@ template <typename DeviceContext,
bool fuse_relu_before_conv = false>
class DepthwiseConvFunctor {
public:
- void operator()(const DeviceContext& context,
+ void operator()(const DeviceContext& dev_ctx,
const phi::DenseTensor& input,
const phi::DenseTensor& filter,
const std::vector<int>& strides,
@@ -148,7 +148,7 @@ template <typename DeviceContext,
bool fuse_relu_before_conv = false>
class DepthwiseConvInputGradFunctor {
public:
- void operator()(const DeviceContext& context,
+ void operator()(const DeviceContext& dev_ctx,
const phi::DenseTensor& input,
const phi::DenseTensor& filter,
const phi::DenseTensor& output_grad,
@@ -164,7 +164,7 @@ template <typename DeviceContext,
bool fuse_relu_before_conv = false>
class DepthwiseConvFilterGradFunctor {
public:
- void operator()(const DeviceContext& context,
+ void operator()(const DeviceContext& dev_ctx,
const phi::DenseTensor& input,
const phi::DenseTensor& output_grad,
const std::vector<int>& strides,
@@ -1450,7 +1450,7 @@ __global__ void KernelDepthwiseConvFilterGradSp(const T* output_grad_data,
template <class T, bool fuse_relu_before_conv>
class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
public:
- void operator()(const phi::GPUContext& context,
+ void operator()(const phi::GPUContext& dev_ctx,
const phi::DenseTensor& input,
const phi::DenseTensor& filter,
const std::vector<int>& strides,
@@ -1485,7 +1485,7 @@ class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {

const T* input_data = input.data<T>();
const T* filter_data = filter.data<T>();
- T* output_data = context.template Alloc<T>(output);
+ T* output_data = dev_ctx.template Alloc<T>(output);

phi::DenseTensor filter_hwc;
if (data_layout == DataLayout::kNHWC) {
@@ -1494,10 +1494,10 @@ class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
filter.dims()[0],
filter.dims()[1]});
filter_hwc.Resize(filter_hwc_dims);
- context.template Alloc<T>(&filter_hwc);
+ dev_ctx.template Alloc<T>(&filter_hwc);
std::vector<int> perm_axis({2, 3, 0, 1});
phi::funcs::TransposeNormal<phi::GPUContext, T> trans;
- trans(context, filter, &filter_hwc, perm_axis);
+ trans(dev_ctx, filter, &filter_hwc, perm_axis);
filter_data = filter_hwc.data<T>();
}

@@ -1546,7 +1546,7 @@ class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
c_filter, \
DataLayout::kNCHW, \
fuse_relu_before_conv> \
- <<<grid, threads, 0, context.stream()>>>(input_data, \
+ <<<grid, threads, 0, dev_ctx.stream()>>>(input_data, \
filter_data, \
batch_size, \
output_channels, \
@@ -1572,7 +1572,7 @@ class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
c_filter, \
DataLayout::kNHWC, \
fuse_relu_before_conv> \
- <<<grid, threads, 0, context.stream()>>>(input_data, \
+ <<<grid, threads, 0, dev_ctx.stream()>>>(input_data, \
filter_data, \
batch_size, \
output_channels, \
@@ -1616,7 +1616,7 @@ class DepthwiseConvFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
template <typename T, bool fuse_relu_before_conv>
class DepthwiseConvInputGradFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
public:
- void operator()(const phi::GPUContext& context,
+ void operator()(const phi::GPUContext& dev_ctx,
const phi::DenseTensor& input,
const phi::DenseTensor& filter,
const phi::DenseTensor& output_grad,
@@ -1653,7 +1653,7 @@ class DepthwiseConvInputGradFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
const T* input_data = input.data<T>();
const T* filter_data = filter.data<T>();
const T* output_grad_data = output_grad.data<T>();
- T* input_grad_data = context.template Alloc<T>(input_grad);
+ T* input_grad_data = dev_ctx.template Alloc<T>(input_grad);

phi::DenseTensor filter_hwc;
if (data_layout == DataLayout::kNHWC) {
@@ -1662,10 +1662,10 @@ class DepthwiseConvInputGradFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
filter.dims()[0],
filter.dims()[1]});
filter_hwc.Resize(filter_hwc_dims);
- context.template Alloc<T>(&filter_hwc);
+ dev_ctx.template Alloc<T>(&filter_hwc);
std::vector<int> perm_axis({2, 3, 0, 1});
phi::funcs::TransposeNormal<phi::GPUContext, T> trans;
- trans(context, filter, &filter_hwc, perm_axis);
+ trans(dev_ctx, filter, &filter_hwc, perm_axis);
filter_data = filter_hwc.data<T>();
}

@@ -1715,7 +1715,7 @@ class DepthwiseConvInputGradFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
c_filter, \
DataLayout::kNCHW, \
fuse_relu_before_conv> \
- <<<grid, threads, 0, context.stream()>>>(input_data, \
+ <<<grid, threads, 0, dev_ctx.stream()>>>(input_data, \
output_grad_data, \
filter_data, \
batch_size, \
@@ -1742,7 +1742,7 @@ class DepthwiseConvInputGradFunctor<phi::GPUContext, T, fuse_relu_before_conv> {
c_filter, \
DataLayout::kNHWC, \
fuse_relu_before_conv> \
- <<<grid, threads, 0, context.stream()>>>(input_data, \
+ <<<grid, threads, 0, dev_ctx.stream()>>>(input_data, \
output_grad_data, \
filter_data, \
batch_size, \
@@ -1789,7 +1789,7 @@ class DepthwiseConvFilterGradFunctor<phi::GPUContext,
T,
fuse_relu_before_conv> {
public:
- void operator()(const phi::GPUContext& context,
+ void operator()(const phi::GPUContext& dev_ctx,
const phi::DenseTensor& input,
const phi::DenseTensor& output_grad,
const std::vector<int>& strides,
@@ -1824,7 +1824,7 @@ class DepthwiseConvFilterGradFunctor<phi::GPUContext,

const T* input_data = input.data<T>();
const T* output_grad_data = output_grad.data<T>();
- T* filter_grad_data = context.template Alloc<T>(filter_grad);
+ T* filter_grad_data = dev_ctx.template Alloc<T>(filter_grad);

int block_size = 512;
int blocks;
@@ -1875,7 +1875,7 @@ class DepthwiseConvFilterGradFunctor<phi::GPUContext,
c_filter, \
DataLayout::kNCHW, \
fuse_relu_before_conv> \
- <<<grid, threads, 0, context.stream()>>>(output_grad_data, \
+ <<<grid, threads, 0, dev_ctx.stream()>>>(output_grad_data, \
input_data, \
batch_size, \
output_channels, \
@@ -1902,9 +1902,9 @@ class DepthwiseConvFilterGradFunctor<phi::GPUContext,
filter_grad->dims()[0], \
filter_grad->dims()[1]}); \
filter_grad_hwc.Resize(filter_grad_hwc_dims); \
- context.template Alloc<T>(&filter_grad_hwc); \
+ dev_ctx.template Alloc<T>(&filter_grad_hwc); \
phi::funcs::SetConstant<phi::GPUContext, T> set_zero; \
- set_zero(context, &filter_grad_hwc, static_cast<T>(0)); \
+ set_zero(dev_ctx, &filter_grad_hwc, static_cast<T>(0)); \
filter_grad_data = filter_grad_hwc.data<T>(); \
} else { \
block_size = 512; \
@@ -1924,7 +1924,7 @@ class DepthwiseConvFilterGradFunctor<phi::GPUContext,
c_filter, \
DataLayout::kNHWC, \
fuse_relu_before_conv> \
- <<<grid, threads, 0, context.stream()>>>(output_grad_data, \
+ <<<grid, threads, 0, dev_ctx.stream()>>>(output_grad_data, \
input_data, \
batch_size, \
output_channels, \
@@ -1946,7 +1946,7 @@ class DepthwiseConvFilterGradFunctor<phi::GPUContext,
if (c_filter != -1) { \
std::vector<int> perm_axis({2, 3, 0, 1}); \
phi::funcs::TransposeNormal<phi::GPUContext, T> trans; \
- trans(context, filter_grad_hwc, filter_grad, perm_axis); \
+ trans(dev_ctx, filter_grad_hwc, filter_grad, perm_axis); \
} \
} \
return; \
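For orientation, the functors in this header implement depthwise convolution: each input channel is convolved with its own filter and there is no cross-channel accumulation. A naive NCHW reference loop follows, assuming a channel multiplier of 1, unit dilation and no padding; it is illustrative only and not the CUDA kernels above.

```cpp
// Hedged reference, not the Paddle GPU kernel: naive NCHW depthwise conv,
// one kh x kw filter per input channel.
void DepthwiseConvNaive(const float* in, const float* flt, float* out,
                        int n, int c, int h, int w,      // input dims
                        int kh, int kw, int sh, int sw,  // kernel / stride
                        int oh, int ow) {                // output spatial dims
  for (int b = 0; b < n; ++b)
    for (int ch = 0; ch < c; ++ch)
      for (int y = 0; y < oh; ++y)
        for (int x = 0; x < ow; ++x) {
          float acc = 0.f;
          for (int ky = 0; ky < kh; ++ky)
            for (int kx = 0; kx < kw; ++kx) {
              int iy = y * sh + ky, ix = x * sw + kx;
              if (iy < h && ix < w)
                acc += in[((b * c + ch) * h + iy) * w + ix] *
                       flt[(ch * kh + ky) * kw + kx];
            }
          out[((b * c + ch) * oh + y) * ow + x] = acc;
        }
}
```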
8 changes: 4 additions & 4 deletions paddle/phi/kernels/gpu/expand_as_grad_kernel.cu
@@ -23,7 +23,7 @@
namespace phi {

template <typename T, typename Context>
- void ExpandAsGradKernel(const Context& context,
+ void ExpandAsGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out_grad,
const std::vector<int64_t>& target_shape,
@@ -41,14 +41,14 @@ void ExpandAsGradKernel(const Context& context,
"to 6, but the value received is %d.",
out_rank));

- context.template Alloc<T>(in_grad);
+ dev_ctx.template Alloc<T>(in_grad);
if (in_dims == out_dims) {
- phi::Copy(context, out_grad, context.GetPlace(), false, in_grad);
+ phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, in_grad);
} else {
std::vector<int> reduce_dims = funcs::GetReduceDim(in_dims, out_dims, -1);

phi::SumKernel<T, Context>(
- context, out_grad, reduce_dims, out_grad.dtype(), false, in_grad);
+ dev_ctx, out_grad, reduce_dims, out_grad.dtype(), false, in_grad);
}
}

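The kernel above relies on expand_as being a broadcast, which is linear, so its backward pass either copies out_grad straight through (when the shapes already match) or sums it over the broadcast axes via SumKernel. A minimal sketch for a single broadcast axis; names are illustrative and this is not the GetReduceDim/SumKernel path itself.

```cpp
// Hedged sketch: x of shape [rows, 1] expanded to [rows, cols]; the gradient
// w.r.t. x reduces out_grad over the expanded axis.
void ExpandAsGradRowwise(const float* out_grad, float* in_grad,
                         int rows, int cols) {
  for (int r = 0; r < rows; ++r) {
    float sum = 0.f;
    for (int c = 0; c < cols; ++c) sum += out_grad[r * cols + c];
    in_grad[r] = sum;  // in_grad has shape [rows, 1]
  }
}
```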
12 changes: 6 additions & 6 deletions paddle/phi/kernels/gpu/sequence_expand_grad_kernel.cu
@@ -49,7 +49,7 @@ inline __global__ void sequence_expand_grad_kernel(const T* dout_data,

template <typename T>
struct SequenceExpandGradFunctor<phi::GPUContext, T> {
- void operator()(const phi::GPUContext& context,
+ void operator()(const phi::GPUContext& dev_ctx,
const DenseTensor& dout,
const phi::Vector<size_t>& x_lod, /*expand source lod*/
const phi::Vector<size_t>& ref_lod, /*expand based lod*/
@@ -67,14 +67,14 @@ struct SequenceExpandGradFunctor<phi::GPUContext, T> {
phi::MixVector<size_t> mixv_ref_lod(&ref_lod);
phi::MixVector<size_t> mixv_x_lod(&x_lod);
phi::MixVector<size_t> mixv_out_offset(&out_offset);
- sequence_expand_grad_kernel<<<grid_size, block_size, 0, context.stream()>>>(
+ sequence_expand_grad_kernel<<<grid_size, block_size, 0, dev_ctx.stream()>>>(
dout.data<T>(),
- mixv_ref_lod.CUDAData(context.GetPlace()),
- mixv_x_lod.CUDAData(context.GetPlace()),
- mixv_out_offset.CUDAData(context.GetPlace()),
+ mixv_ref_lod.CUDAData(dev_ctx.GetPlace()),
+ mixv_x_lod.CUDAData(dev_ctx.GetPlace()),
+ mixv_out_offset.CUDAData(dev_ctx.GetPlace()),
ref_lod.size(),
x_item_length,
- context.template Alloc<T>(dx));
+ dev_ctx.template Alloc<T>(dx));
}
};

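As a reading aid for the kernel above: every repeated copy of a source sequence contributes additively, so dx accumulates the matching rows of dout. A CPU sketch follows under the assumption that x_lod and ref_lod hold offsets and that sequence i of x is repeated once per element of reference sequence i; this is an illustration of the gradient rule, not the CUDA kernel.

```cpp
// Hedged sketch, assumptions as noted above; dx must be zero-initialized.
#include <cstddef>
#include <vector>

void SequenceExpandGradCPU(const float* dout, float* dx, int row_width,
                           const std::vector<std::size_t>& x_lod,
                           const std::vector<std::size_t>& ref_lod) {
  std::size_t dout_row = 0;
  for (std::size_t i = 0; i + 1 < x_lod.size(); ++i) {
    std::size_t x_start = x_lod[i];
    std::size_t x_len = x_lod[i + 1] - x_lod[i];
    std::size_t repeats = ref_lod[i + 1] - ref_lod[i];
    for (std::size_t r = 0; r < repeats; ++r)
      for (std::size_t row = 0; row < x_len; ++row, ++dout_row)
        for (int k = 0; k < row_width; ++k)
          dx[(x_start + row) * row_width + k] +=
              dout[dout_row * row_width + k];
  }
}
```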
18 changes: 9 additions & 9 deletions paddle/phi/kernels/gpu/sequence_expand_kernel.cu
@@ -18,7 +18,7 @@
namespace phi {

template <typename T>
- static inline int ExpandByMemoryCopy(const phi::GPUContext& context,
+ static inline int ExpandByMemoryCopy(const phi::GPUContext& dev_ctx,
const DenseTensor& x,
DenseTensor* out,
const phi::Vector<size_t>& x_lod,
@@ -27,7 +27,7 @@ static inline int ExpandByMemoryCopy(const phi::GPUContext& context,
auto out_data = out->data<T>();
auto x_data = x.data<T>();

- const auto& gpu_place = context.GetPlace();
+ const auto& gpu_place = dev_ctx.GetPlace();

int x_item_length = x.numel() / x.dims()[0];
int out_offset = 0;
@@ -51,7 +51,7 @@ static inline int ExpandByMemoryCopy(const phi::GPUContext& context,
gpu_place,
x_data + (x_start + k) * x_item_length,
sizeof(T) * x_item_length,
- context.stream());
+ dev_ctx.stream());
}
}
} else {
@@ -93,16 +93,16 @@ inline __global__ void sequence_expand_kernel(const T* x_data,

template <typename T>
struct SequenceExpandFunctor<phi::GPUContext, T> {
- void operator()(const phi::GPUContext& context,
+ void operator()(const phi::GPUContext& dev_ctx,
const DenseTensor& x,
const phi::Vector<size_t>& x_lod, /*expand source lod*/
const phi::Vector<size_t>& ref_lod, /*expand referenced lod*/
DenseTensor* out) {
int num_copies =
- ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, false);
+ ExpandByMemoryCopy<T>(dev_ctx, x, out, x_lod, ref_lod, false);
// Sometimes direct copies will be faster, this maybe need deeply analysis.
if (num_copies < 5) {
- ExpandByMemoryCopy<T>(context, x, out, x_lod, ref_lod, true);
+ ExpandByMemoryCopy<T>(dev_ctx, x, out, x_lod, ref_lod, true);
} else {
int x_item_length = x.numel() / x.dims()[0];
size_t x_lod_size = x_lod.size();
@@ -118,7 +118,7 @@ struct SequenceExpandFunctor<phi::GPUContext, T> {

phi::MixVector<size_t> mixv_out_offset(&out_offset);
const size_t* out_offset_data =
- mixv_out_offset.CUDAData(context.GetPlace());
+ mixv_out_offset.CUDAData(dev_ctx.GetPlace());
const size_t* x_lod_data = out_offset_data + x_lod_size;
const size_t* ref_lod_data = out_offset_data + 2 * x_lod_size;

@@ -130,14 +130,14 @@ struct SequenceExpandFunctor<phi::GPUContext, T> {
dim3 block_size(thread_x, thread_y, thread_z);
dim3 grid_size(block_x, 1);

- sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
+ sequence_expand_kernel<<<grid_size, block_size, 0, dev_ctx.stream()>>>(
x.data<T>(),
x_lod_data,
ref_lod_data,
out_offset_data,
x_lod_size,
x_item_length,
- context.template Alloc<T>(out));
+ dev_ctx.template Alloc<T>(out));
}
}
};
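For background on the two code paths above (per-row device memory copies in ExpandByMemoryCopy versus the custom kernel when many copies are needed): sequence expand repeats each source sequence, delimited by x_lod, once per element of the corresponding reference sequence in ref_lod. A CPU sketch under that reading; lods are assumed to hold offsets and the names are illustrative, not the Paddle implementation.

```cpp
// Hedged CPU sketch, assumptions as noted above.
#include <cstddef>
#include <cstring>
#include <vector>

void SequenceExpandCPU(const float* x, float* out, int row_width,
                       const std::vector<std::size_t>& x_lod,
                       const std::vector<std::size_t>& ref_lod) {
  std::size_t out_row = 0;
  for (std::size_t i = 0; i + 1 < x_lod.size(); ++i) {
    std::size_t x_start = x_lod[i];
    std::size_t x_len = x_lod[i + 1] - x_lod[i];
    std::size_t repeats = ref_lod[i + 1] - ref_lod[i];
    for (std::size_t r = 0; r < repeats; ++r, out_row += x_len) {
      std::memcpy(out + out_row * row_width, x + x_start * row_width,
                  sizeof(float) * x_len * row_width);
    }
  }
}
```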
10 changes: 5 additions & 5 deletions paddle/phi/kernels/gpu/sequence_softmax_grad_kernel.cu
@@ -71,7 +71,7 @@ __global__ void sequence_softmax_grad_kernel(const T *softmax_grad_data,

template <typename T>
struct SequenceSoftmaxGradFunctor<phi::GPUContext, T> {
- void operator()(const phi::GPUContext &context,
+ void operator()(const phi::GPUContext &dev_ctx,
const DenseTensor &dout,
const DenseTensor &out,
const phi::Vector<size_t> &ref_lod, /*referenced lod*/
@@ -80,20 +80,20 @@ struct SequenceSoftmaxGradFunctor<phi::GPUContext, T> {

const int kThreadsPerBlock = 32;
int thread_x = kThreadsPerBlock;
- int max_threads = context.GetMaxPhysicalThreadCount();
+ int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);

dim3 block_size(thread_x);
dim3 grid_size(max_blocks);

phi::MixVector<size_t> mixv_ref_lod(&ref_lod);
sequence_softmax_grad_kernel<T, kThreadsPerBlock>
- <<<grid_size, block_size, 0, context.stream()>>>(
+ <<<grid_size, block_size, 0, dev_ctx.stream()>>>(
dout.data<T>(),
out.data<T>(),
- mixv_ref_lod.CUDAData(context.GetPlace()),
+ mixv_ref_lod.CUDAData(dev_ctx.GetPlace()),
height,
- context.Alloc<T>(dx));
+ dev_ctx.Alloc<T>(dx));
}
};

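For reference, the rule this kernel evaluates per LoD segment is the standard softmax backward pass, dx_i = out_i * (dout_i - sum_j dout_j * out_j), applied independently to each sequence. A scalar sketch for a single sequence; it illustrates the formula only, not the per-block CUDA reduction above.

```cpp
// Hedged sketch: softmax backward for one sequence of length n.
#include <cstddef>

void SoftmaxGrad1D(const float* dout, const float* out, float* dx,
                   std::size_t n) {
  float dot = 0.f;
  for (std::size_t j = 0; j < n; ++j) dot += dout[j] * out[j];
  for (std::size_t i = 0; i < n; ++i) dx[i] = out[i] * (dout[i] - dot);
}
```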