Skip to content

Commit 08a2d79

Browse files

[API] fix nn.functional.fused_layer_norm api large tensor bug (#74120)

Authored commit 08a2d79 (1 parent: 7b9507a)

File tree

4 files changed: +25 / -21 lines

paddle/phi/kernels/funcs/dropout_impl_util.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ inline bool GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx,
2626
const phi::DenseTensor* seed,
2727
const bool is_fix_seed,
2828
const int seed_val,
29-
const int offset,
29+
const uint64_t offset,
3030
uint64_t* seed_data,
3131
uint64_t* increment,
3232
bool use_copy = true) {

paddle/phi/kernels/fusion/gpu/fused_dropout_common.h

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -56,33 +56,33 @@ inline phi::backends::gpu::GpuLaunchConfig Get1DBlocksAnd2DGrids(
5656
const uint64_t rows,
5757
const uint64_t cols,
5858
const int vec_size) {
59-
const uint64_t tmp_cols = cols / vec_size;
59+
const uint64_t tmp_cols = cols / static_cast<uint64_t>(vec_size);
6060
// NOTE(wangxi): We set max_block_size to 512, for `FusedResidualDropoutBias`
6161
// needs too many register resources. If data_type is float16, CUDA
6262
// error(701) will occur when block_size is 1024. Which error is
6363
// 'cudaErrorLaunchOutOfResources', this indicates that a launch did not
6464
// occur because it did not have appropriate resources.
6565
// Of course, this kernel can be optimized later to reduce the use
6666
// of registers.
67-
const int threads =
67+
const uint64_t threads =
6868
std::max(static_cast<uint64_t>(32),
6969
std::min(tmp_cols,
7070
static_cast<uint64_t>(
7171
std::min(dev_ctx.GetMaxThreadsPerBlock(), 512))));
72-
73-
const int blocks_x =
74-
std::max(static_cast<uint64_t>(1), (tmp_cols + threads - 1) / threads);
72+
const uint64_t blocks_x = std::min(
73+
static_cast<uint64_t>(65536),
74+
std::max(static_cast<uint64_t>(1), (tmp_cols + threads - 1) / threads));
7575
uint64_t blocks_y = std::max(static_cast<uint64_t>(1), rows);
7676
int blocks_z = 1;
77-
if (blocks_y > 65536) {
77+
if (blocks_y >= 65536) {
7878
blocks_z = 1024;
7979
blocks_y = (blocks_y + blocks_z - 1) / blocks_z;
80-
blocks_y = blocks_y > 65536 ? 65535 : blocks_y;
80+
blocks_y = blocks_y >= 65536 ? 65535 : blocks_y;
8181
}
8282
phi::backends::gpu::GpuLaunchConfig config;
83-
config.block_per_grid.x = blocks_x;
84-
config.block_per_grid.y = blocks_y;
85-
config.block_per_grid.z = blocks_z;
83+
config.block_per_grid.x = static_cast<uint32_t>(blocks_x);
84+
config.block_per_grid.y = static_cast<uint32_t>(blocks_y);
85+
config.block_per_grid.z = static_cast<uint32_t>(blocks_z);
8686
config.thread_per_block.x = threads;
8787
return config;
8888
}

paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,8 @@ struct DropoutParam {
7272
seed_val = seed_val_;
7373
}
7474

75-
int UpdateSeedAndIncrement(const phi::GPUContext& dev_ctx, const int offset) {
75+
uint64_t UpdateSeedAndIncrement(const phi::GPUContext& dev_ctx,
76+
const uint64_t offset) {
7677
uint64_t tmp_increment;
7778
phi::funcs::GetSeedDataAndIncrement(dev_ctx,
7879
tensor_seed,
@@ -81,7 +82,7 @@ struct DropoutParam {
8182
offset,
8283
&seed,
8384
&tmp_increment);
84-
increment = static_cast<int>(tmp_increment);
85+
increment = tmp_increment;
8586
return increment;
8687
}
8788
};
@@ -104,26 +105,29 @@ template <typename T,
104105
typename OutType = T>
105106
class FusedDropoutHelper {
106107
private:
107-
int GetIncrement(const phi::GPUContext& dev_ctx) {
108+
uint64_t GetIncrement(const phi::GPUContext& dev_ctx) {
108109
const int VecSize = MAX_CACHE_BYTES / sizeof(T);
109110
const int real_vec_size = cols_ % VecSize == 0 ? VecSize : 1;
110111
auto config = Get1DBlocksAnd2DGrids(dev_ctx,
111112
static_cast<uint64_t>(rows_),
112113
static_cast<uint64_t>(cols_),
113114
real_vec_size);
114-
int increment = ((cols_ - 1) / (config.thread_per_block.x *
115-
config.block_per_grid.x * real_vec_size) +
116-
1) *
117-
real_vec_size;
115+
uint64_t increment =
116+
((cols_ - static_cast<uint64_t>(1)) /
117+
(static_cast<uint64_t>(config.thread_per_block.x) *
118+
static_cast<uint64_t>(config.block_per_grid.x) *
119+
static_cast<uint64_t>(real_vec_size)) +
120+
static_cast<uint64_t>(1)) *
121+
static_cast<uint64_t>(real_vec_size);
118122
increment = dropout_param_.UpdateSeedAndIncrement(dev_ctx, increment);
119123
return increment;
120124
}
121125

122126
public:
123127
FusedDropoutHelper() {}
124128
FusedDropoutHelper(const phi::GPUContext& dev_ctx,
125-
const int rows,
126-
const int cols,
129+
const int64_t rows,
130+
const int64_t cols,
127131
const DropoutParam& dropout_param,
128132
const float residual_alpha = 1.0) {
129133
rows_ = rows;

paddle/phi/kernels/fusion/gpu/fused_residual_dropout_bias.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -339,7 +339,7 @@ template <typename T,
339339
typename OutType = T>
340340
void LaunchResidualDropoutBias(const uint64_t rows,
341341
const uint64_t cols,
342-
const int increment,
342+
const uint64_t increment,
343343
uint64_t seed,
344344
const float dropout_prob,
345345
const bool is_test,

Commit comments (0)