Skip to content

Commit 4d7ab59

Browse files
authored
[GPU] fix matrix_nms_ref kernel GPU memory allocation issue (#32456)
### Details fixed matrix_nms_ref stage 0 kernel GPU memory allocation issue ### Description of the issue #### Symptom pp_yolo model will be failed to run inference, and CL_OUT_OF_RESOUCE will be prompted when creating stage 0 matrix_nms_kernel. #### Root cause - It will try to allocate 1 * 80 * 22743 * 22742/2 * 4 = 82.7GB GPU memory (for batch 1 * classes 80 = 80 GPU work items) in the matrix_nms_ref stage 0 kernel, which exceeds the GPU's total memory size and will be failed to create matrix_nms_kernel. - Also, the matrix_nms_kernel has large size (22743) "for loop" which takes much GPU resources. #### How to fix it - Use global memory buffer allocated by host instead of private memory. - Use chunking for the "for loop". #### The code and line that caused this issue - big memory allocation https://github.com/openvinotoolkit/openvino/blob/bbed74ed1a6575ac40a85fba704a59eab214eb0b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/matrix_nms_ref.cl#L192 - big size (22743) for loop https://github.com/openvinotoolkit/openvino/blob/bbed74ed1a6575ac40a85fba704a59eab214eb0b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/matrix_nms_ref.cl#L182 #### Reproduction step and snapshot - benchmark_app benchmark_app -inference_only false -b 1 -d GPU.0 -hint none -infer_precision f32 -m FP32/1/ov/pp-yolo.xml #### Problematic graph - N/A #### Checklist - [x] Is it a proper fix? (not a workaround) - [x] Did you include test case for this fix, if necessary? Yes, testcase matrix_nms_test_inputs.get_matrix_nms_large_value_of_max_boxes_per_class is added. - [ ] Did you review existing test that can be extended to cover this scenario? Which test did you review? No existing test can cover the issue. ### Tickets: - [CVS-141140](https://jira.devtools.intel.com/browse/CVS-141140) --------- Signed-off-by: yuan.xiong <yuan.xiong@intel.com>
1 parent 3010f95 commit 4d7ab59

File tree

3 files changed

+99
-23
lines changed

3 files changed

+99
-23
lines changed

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/matrix_nms_ref.cl

Lines changed: 35 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -166,45 +166,66 @@ KERNEL(matrix_nms_ref_stage_0)
166166
(const __global INPUT0_TYPE* input_boxes,
167167
const __global INPUT1_TYPE* input_scores,
168168
__global uchar* buffer0,
169-
__global int* selected_boxes_num) {
169+
__global int* selected_boxes_num,
170+
__global INPUT1_TYPE* input_iou_matrix,
171+
__global INPUT1_TYPE* input_iou_max,
172+
__global INPUT1_TYPE* input_min_decays) {
170173
const int batchId = get_global_id(0);
171174
const int classId = get_global_id(1);
172175

173176
if (classId == BACKGROUND_CLASS)
174177
return;
175178

179+
const int offset = batchId * NUM_CLASSES + classId;
176180
int sorted_score_indices[NUM_BOXES];
177-
178-
for (int i = 0; i < NUM_BOXES; ++i)
179-
sorted_score_indices[i] = i;
180-
181181
int valid_boxes_num = 0;
182-
for (int i = 0; i < NUM_BOXES; i++) {
183-
if (input_scores[INPUT1_GET_INDEX(batchId, classId, 0, i)] > SCORE_THRESHOLD)
184-
++valid_boxes_num;
182+
183+
const int BLOCK_SIZE = 256;
184+
const int num_blocks = (NUM_BOXES + BLOCK_SIZE - 1) / BLOCK_SIZE;
185+
for (int i = 0; i < num_blocks; i++) {
186+
for (int j = 0; j < BLOCK_SIZE; j++) {
187+
const int idx = i * BLOCK_SIZE + j;
188+
if (idx >= NUM_BOXES)
189+
break;
190+
if (input_scores[INPUT1_GET_INDEX(batchId, classId, 0, idx)] > SCORE_THRESHOLD) {
191+
sorted_score_indices[valid_boxes_num] = idx;
192+
++valid_boxes_num;
193+
}
194+
}
185195
}
186196

197+
for (int i = valid_boxes_num; i < NUM_BOXES; ++i)
198+
sorted_score_indices[i] = 0;
199+
187200
// TODO: consider faster sorting algorithm
188-
FUNC_CALL(sortIterative)(input_scores, batchId, classId, sorted_score_indices, NUM_BOXES);
201+
FUNC_CALL(sortIterative)(input_scores, batchId, classId, sorted_score_indices, valid_boxes_num);
189202

190203
valid_boxes_num = min(valid_boxes_num, MAX_BOXES_PER_CLASS);
191204

192-
const int matrix_size = MAX_BOXES_PER_CLASS < 3 ? 1 : (MAX_BOXES_PER_CLASS * (MAX_BOXES_PER_CLASS - 1)) >> 1;
193-
INPUT1_TYPE iou_matrix[matrix_size];
194-
INPUT1_TYPE iou_max[MAX_BOXES_PER_CLASS];
205+
__global INPUT1_TYPE* iou_matrix = input_iou_matrix + offset * MAX_BOXES_PER_CLASS * sizeof(INPUT1_TYPE);
206+
__global INPUT1_TYPE* iou_max = input_iou_max + offset * MAX_BOXES_PER_CLASS * sizeof(INPUT1_TYPE);
207+
__global INPUT1_TYPE* min_decays = input_min_decays + offset * MAX_BOXES_PER_CLASS * sizeof(INPUT1_TYPE);
195208

196209
iou_max[0] = INPUT1_VAL_ZERO;
197210
for (int i = 1; i < valid_boxes_num; ++i) {
198211
INPUT1_TYPE max_iou = INPUT1_VAL_ZERO;
212+
INPUT1_TYPE min_decay = INPUT1_VAL_ONE;
199213
const COORD_TYPE_4 box_i = FUNC_CALL(getBoxCoords)(input_boxes, batchId, sorted_score_indices[i]);
200214
for (int j = 0; j < i; ++j) {
201215
const COORD_TYPE_4 box_j = FUNC_CALL(getBoxCoords)(input_boxes, batchId, sorted_score_indices[j]);
202216
const INPUT1_TYPE iou = FUNC_CALL(intersectionOverUnion)(box_i, box_j);
203217

204218
max_iou = max(iou, max_iou);
205-
iou_matrix[i * (i - 1) / 2 + j] = iou;
219+
iou_matrix[j] = iou;
206220
}
207221
iou_max[i] = max_iou;
222+
223+
for (int j = 0; j < i; ++j) {
224+
INPUT1_TYPE decay =
225+
DECAY_FUNC == 0 ? FUNC_CALL(decay_gaussian)(iou_matrix[j], iou_max[j]) : FUNC_CALL(decay_linear)(iou_matrix[j], iou_max[j]);
226+
min_decay = min(min_decay, decay);
227+
}
228+
min_decays[i] = min_decay;
208229
}
209230

210231
const INPUT1_TYPE first_score = input_scores[INPUT1_GET_INDEX(batchId, classId, 0, sorted_score_indices[0])];
@@ -222,15 +243,7 @@ KERNEL(matrix_nms_ref_stage_0)
222243
}
223244

224245
for (int i = 1; i < valid_boxes_num; ++i) {
225-
INPUT1_TYPE min_decay = INPUT1_VAL_ONE;
226-
for (int j = 0; j < i; ++j) {
227-
INPUT1_TYPE iou = iou_matrix[i * (i - 1) / 2 + j];
228-
INPUT1_TYPE decay =
229-
DECAY_FUNC == 0 ? FUNC_CALL(decay_gaussian)(iou, iou_max[j]) : FUNC_CALL(decay_linear)(iou, iou_max[j]);
230-
min_decay = min(min_decay, decay);
231-
}
232-
233-
INPUT1_TYPE ds = min_decay * input_scores[INPUT1_GET_INDEX(batchId, classId, 0, sorted_score_indices[i])];
246+
INPUT1_TYPE ds = min_decays[i] * input_scores[INPUT1_GET_INDEX(batchId, classId, 0, sorted_score_indices[i])];
234247

235248
if (ds <= POST_THRESHOLD)
236249
continue;

src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_ref.cpp

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,15 +84,25 @@ KernelsData MatrixNmsKernelRef::GetKernelsData(const Params& params) const {
8484

8585
int max_boxes_per_class, max_boxes_per_batch;
8686
std::tie(max_boxes_per_class, max_boxes_per_batch) = GetMaxBoxes(new_params);
87+
max_boxes_per_class = std::min(max_boxes_per_class, batches_num * max_boxes_per_batch);
8788

8889
const size_t box_info_num = batches_num * classes_num * max_boxes_per_class;
8990

9091
const size_t box_info_buffer_size = box_info_num * BOX_INFO_SIZE;
9192
const size_t sel_boxes_num_buffer_size = batches_num * classes_num * sizeof(int);
9293

94+
size_t datatype_size = BytesPerElement(new_params.inputs[1].GetDType());
95+
96+
const size_t iou_matrix_buffer_size = batches_num * classes_num * max_boxes_per_class * datatype_size;
97+
const size_t iou_max_buffer_size = iou_matrix_buffer_size;
98+
const size_t min_decays_buffer_size = iou_matrix_buffer_size;
99+
93100
kernel_data.internalBuffers.push_back(box_info_buffer_size);
94101
kernel_data.internalBuffers.push_back(sel_boxes_num_buffer_size);
95-
kernel_data.internalBufferDataType = Datatype::F32;
102+
kernel_data.internalBuffers.push_back(iou_matrix_buffer_size);
103+
kernel_data.internalBuffers.push_back(iou_max_buffer_size);
104+
kernel_data.internalBuffers.push_back(min_decays_buffer_size);
105+
kernel_data.internalBufferDataType = new_params.inputs[1].GetDType(); // input_scores
96106

97107
for (size_t i{}; i < kernels_num; ++i) {
98108
auto entry_point = GetEntryPoint(kernelName, new_params.layerID, params, i);
@@ -167,6 +177,9 @@ void MatrixNmsKernelRef::SetKernelArguments(const matrix_nms_params& params, clK
167177
kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1});
168178
kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
169179
kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
180+
kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 2});
181+
kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 3});
182+
kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 4});
170183
break;
171184

172185
case 1:

src/plugins/intel_gpu/tests/unit/test_cases/matrix_nms_gpu_test.cpp

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -626,6 +626,54 @@ matrix_nms_test_inputs get_matrix_nms_no_output_inputs() {
626626
"matrix_nms_no_output"};
627627
}
628628

629+
matrix_nms_test_inputs get_matrix_nms_large_value_of_max_boxes_per_class() {
630+
const int num_boxes = 22743;
631+
const int num_classes = 2;
632+
633+
// [batch, boxes, 1, 4]
634+
std::vector<float> boxes = {
635+
0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, 0.0, -0.1, 1.0, 0.9,
636+
0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0};
637+
boxes.resize(num_boxes * 4, PAD);
638+
639+
// [batch, classes, 1, boxes]
640+
std::vector<float> scores = {
641+
0.9, 0.75, 0.6, 0.95, 0.5, 0.3};
642+
scores.resize(num_boxes * num_classes, PAD);
643+
scores[num_boxes * (num_classes - 1)] = 0.95;
644+
scores[num_boxes * (num_classes - 1) + 1] = 0.75;
645+
scores[num_boxes * (num_classes - 1) + 2] = 0.6;
646+
scores[num_boxes * (num_classes - 1) + 3] = 0.80;
647+
scores[num_boxes * (num_classes - 1) + 4] = 0.5;
648+
scores[num_boxes * (num_classes - 1) + 5] = 0.3;
649+
650+
std::vector<float> expected_output = {
651+
1.00, 0.95, 0.00, 0.00, 1.00, 1.00, 1.00, 0.8, 0.00, 10.00, 1.00, 11.00,
652+
1.00, 0.13636364, 0.0, 0.1, 1.0, 1.1};
653+
654+
return {
655+
1, // num_butches
656+
num_boxes,// num_boxes
657+
num_classes,// num_classes
658+
3, // num_selected_boxes
659+
false, // sort_result_across_bch
660+
0.01f, // score_threshold
661+
-1, // nms_top_k
662+
3, // keep_top_k
663+
0, // background_class
664+
2.0f, // gaussian_sigma
665+
0.01f, // post_threshold
666+
true, // normalized
667+
boxes,
668+
scores,
669+
expected_output,// expected_output
670+
std::vector<int>{0, 3, 1},// expected_selected_boxes
671+
std::vector<int>{3},// expected_valid_output
672+
ov::op::v8::MatrixNms::SortResultType::SCORE, // sort_result_type
673+
ov::op::v8::MatrixNms::DecayFunction::LINEAR, // decay_function
674+
"large_value_of_max_boxes_per_class"};
675+
}
676+
629677
const std::vector<format::type> layout_formats = {format::bfyx,
630678
format::b_fs_yx_fsv16,
631679
format::b_fs_yx_fsv32,
@@ -663,6 +711,7 @@ INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_identical_boxes_inputs)
663711
INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_top_k_inputs)
664712
INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_single_box_inputs)
665713
INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_no_output_inputs)
714+
INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_large_value_of_max_boxes_per_class)
666715

667716
using ov::float16;
668717
INSTANTIATE_MATRIX_NMS_TEST_SUITE(float16, get_matrix_nms_smoke_inputs)
@@ -678,6 +727,7 @@ INSTANTIATE_MATRIX_NMS_TEST_SUITE(float16, get_matrix_nms_identical_boxes_inputs
678727
INSTANTIATE_MATRIX_NMS_TEST_SUITE(float16, get_matrix_nms_top_k_inputs)
679728
INSTANTIATE_MATRIX_NMS_TEST_SUITE(float16, get_matrix_nms_single_box_inputs)
680729
INSTANTIATE_MATRIX_NMS_TEST_SUITE(float16, get_matrix_nms_no_output_inputs)
730+
INSTANTIATE_MATRIX_NMS_TEST_SUITE(float16, get_matrix_nms_large_value_of_max_boxes_per_class)
681731

682732
#ifndef RUN_ALL_MODEL_CACHING_TESTS
683733
INSTANTIATE_TEST_SUITE_P(matrix_nms_test_float16get_matrix_nms_smoke_inputs_cached,

0 commit comments

Comments
 (0)