openvinotoolkit
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/matrix_nms_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/matrix_nms_ref.cl
Lines changed: 35 additions & 22 deletions
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_ref.cpp
Lines changed: 14 additions & 1 deletion
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/matrix_nms_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/matrix_nms_gpu_test.cpp
Lines changed: 50 additions & 0 deletions
@@ -166,45 +166,66 @@ KERNEL(matrix_nms_ref_stage_0)
 (const __global INPUT0_TYPE* input_boxes,
  const __global INPUT1_TYPE* input_scores,
  __global uchar* buffer0,
- __global int* selected_boxes_num) {
+ __global int* selected_boxes_num,
+ __global INPUT1_TYPE* input_iou_matrix,  // scratch: one IoU row per (batch, class) slice
+ __global INPUT1_TYPE* input_iou_max,  // scratch: per-box maximum IoU per (batch, class) slice
+ __global INPUT1_TYPE* input_min_decays) {  // scratch: per-box minimum decay per (batch, class) slice
  const int batchId = get_global_id(0);
  const int classId = get_global_id(1);
 
  if (classId == BACKGROUND_CLASS)
  return;
 
+ const int offset = batchId * NUM_CLASSES + classId;  // flat (batch, class) index used to slice the scratch buffers
  int sorted_score_indices[NUM_BOXES];
-
- for (int i = 0; i < NUM_BOXES; ++i)
- sorted_score_indices[i] = i;
-
  int valid_boxes_num = 0;
- for (int i = 0; i < NUM_BOXES; i++) {
- if (input_scores[INPUT1_GET_INDEX(batchId, classId, 0, i)] > SCORE_THRESHOLD)
- ++valid_boxes_num;
+
+ const int BLOCK_SIZE = 256;  // the scan below visits boxes 0..NUM_BOXES-1 in order, in chunks of this size
+ const int num_blocks = (NUM_BOXES + BLOCK_SIZE - 1) / BLOCK_SIZE;
+ for (int i = 0; i < num_blocks; i++) {
+ for (int j = 0; j < BLOCK_SIZE; j++) {
+ const int idx = i * BLOCK_SIZE + j;
+ if (idx >= NUM_BOXES)
+ break;
+ if (input_scores[INPUT1_GET_INDEX(batchId, classId, 0, idx)] > SCORE_THRESHOLD) {
+ sorted_score_indices[valid_boxes_num] = idx;  // compact: keep only indices whose score passes the threshold
+ ++valid_boxes_num;
+ }
+ }
  }
 
+ for (int i = valid_boxes_num; i < NUM_BOXES; ++i)
+ sorted_score_indices[i] = 0;  // defined tail: sorted_score_indices[0] is read below even when no box passed
+
  // TODO: consider faster sorting algorithm
- FUNC_CALL(sortIterative)(input_scores, batchId, classId, sorted_score_indices, NUM_BOXES);
+ FUNC_CALL(sortIterative)(input_scores, batchId, classId, sorted_score_indices, valid_boxes_num);  // only the valid prefix
 
  valid_boxes_num = min(valid_boxes_num, MAX_BOXES_PER_CLASS);
 
- const int matrix_size = MAX_BOXES_PER_CLASS < 3 ? 1 : (MAX_BOXES_PER_CLASS * (MAX_BOXES_PER_CLASS - 1)) >> 1;
- INPUT1_TYPE iou_matrix[matrix_size];
- INPUT1_TYPE iou_max[MAX_BOXES_PER_CLASS];
+ __global INPUT1_TYPE* iou_matrix = input_iou_matrix + offset * MAX_BOXES_PER_CLASS;  // element offset: pointer arithmetic already scales by sizeof(INPUT1_TYPE)
+ __global INPUT1_TYPE* iou_max = input_iou_max + offset * MAX_BOXES_PER_CLASS;  // each (batch, class) slice holds MAX_BOXES_PER_CLASS elements
+ __global INPUT1_TYPE* min_decays = input_min_decays + offset * MAX_BOXES_PER_CLASS;  // multiplying by sizeof again would step past the slice
 
  iou_max[0] = INPUT1_VAL_ZERO;
  for (int i = 1; i < valid_boxes_num; ++i) {
  INPUT1_TYPE max_iou = INPUT1_VAL_ZERO;
+ INPUT1_TYPE min_decay = INPUT1_VAL_ONE;
  const COORD_TYPE_4 box_i = FUNC_CALL(getBoxCoords)(input_boxes, batchId, sorted_score_indices[i]);
  for (int j = 0; j < i; ++j) {
  const COORD_TYPE_4 box_j = FUNC_CALL(getBoxCoords)(input_boxes, batchId, sorted_score_indices[j]);
  const INPUT1_TYPE iou = FUNC_CALL(intersectionOverUnion)(box_i, box_j);
 
  max_iou = max(iou, max_iou);
- iou_matrix[i * (i - 1) / 2 + j] = iou;
+ iou_matrix[j] = iou;  // only row i is kept; it is consumed right below and overwritten on the next i
  }
  iou_max[i] = max_iou;
+
+ for (int j = 0; j < i; ++j) {  // iou_max[j] is already final for every j < i
+ INPUT1_TYPE decay =
+ DECAY_FUNC == 0 ? FUNC_CALL(decay_gaussian)(iou_matrix[j], iou_max[j]) : FUNC_CALL(decay_linear)(iou_matrix[j], iou_max[j]);
+ min_decay = min(min_decay, decay);
+ }
+ min_decays[i] = min_decay;  // index 0 is never written; the scoring loop that reads min_decays starts at i = 1
  }
 
  const INPUT1_TYPE first_score = input_scores[INPUT1_GET_INDEX(batchId, classId, 0, sorted_score_indices[0])];
@@ -222,15 +243,7 @@ KERNEL(matrix_nms_ref_stage_0)
  }
 
  for (int i = 1; i < valid_boxes_num; ++i) {
- INPUT1_TYPE min_decay = INPUT1_VAL_ONE;
- for (int j = 0; j < i; ++j) {
- INPUT1_TYPE iou = iou_matrix[i * (i - 1) / 2 + j];
- INPUT1_TYPE decay =
- DECAY_FUNC == 0 ? FUNC_CALL(decay_gaussian)(iou, iou_max[j]) : FUNC_CALL(decay_linear)(iou, iou_max[j]);
- min_decay = min(min_decay, decay);
- }
-
- INPUT1_TYPE ds = min_decay * input_scores[INPUT1_GET_INDEX(batchId, classId, 0, sorted_score_indices[i])];
+ INPUT1_TYPE ds = min_decays[i] * input_scores[INPUT1_GET_INDEX(batchId, classId, 0, sorted_score_indices[i])];  // decayed score; min_decays[i] was filled in the IoU loop
 
  if (ds <= POST_THRESHOLD)
  continue;
 
@@ -84,15 +84,25 @@ KernelsData MatrixNmsKernelRef::GetKernelsData(const Params& params) const {
 
  int max_boxes_per_class, max_boxes_per_batch;
  std::tie(max_boxes_per_class, max_boxes_per_batch) = GetMaxBoxes(new_params);
+ max_boxes_per_class = std::min(max_boxes_per_class, batches_num * max_boxes_per_batch);  // NOTE(review): caps the per-class count that sizes every buffer below - confirm the kernel's MAX_BOXES_PER_CLASS uses the same value
 
  const size_t box_info_num = batches_num * classes_num * max_boxes_per_class;
 
  const size_t box_info_buffer_size = box_info_num * BOX_INFO_SIZE;
  const size_t sel_boxes_num_buffer_size = batches_num * classes_num * sizeof(int);
 
+ size_t datatype_size = BytesPerElement(new_params.inputs[1].GetDType());  // bytes per element of inputs[1]
+
+ const size_t iou_matrix_buffer_size = batches_num * classes_num * max_boxes_per_class * datatype_size;  // bytes: max_boxes_per_class elements per (batch, class)
+ const size_t iou_max_buffer_size = iou_matrix_buffer_size;
+ const size_t min_decays_buffer_size = iou_matrix_buffer_size;
+
  kernel_data.internalBuffers.push_back(box_info_buffer_size);
  kernel_data.internalBuffers.push_back(sel_boxes_num_buffer_size);
- kernel_data.internalBufferDataType = Datatype::F32;
+ kernel_data.internalBuffers.push_back(iou_matrix_buffer_size);  // third, fourth and fifth entries pushed here; presumably the INTERNAL_BUFFER 2..4 kernel arguments - verify
+ kernel_data.internalBuffers.push_back(iou_max_buffer_size);
+ kernel_data.internalBuffers.push_back(min_decays_buffer_size);
+ kernel_data.internalBufferDataType = new_params.inputs[1].GetDType(); // element type of inputs[1] (input_scores)
 
  for (size_t i{}; i < kernels_num; ++i) {
  auto entry_point = GetEntryPoint(kernelName, new_params.layerID, params, i);
@@ -167,6 +177,9 @@ void MatrixNmsKernelRef::SetKernelArguments(const matrix_nms_params& params, clK
  kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1});
  kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
  kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
+ kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 2});  // presumably the iou_matrix scratch buffer - verify against GetKernelsData push order
+ kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 3});  // presumably the iou_max scratch buffer
+ kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 4});  // presumably the min_decays scratch buffer
  break;
 
  case 1:
 
@@ -626,6 +626,54 @@ matrix_nms_test_inputs get_matrix_nms_no_output_inputs() {
  "matrix_nms_no_output"};
 }
 
+matrix_nms_test_inputs get_matrix_nms_large_value_of_max_boxes_per_class() {  // test inputs with 22743 boxes, 2 classes and nms_top_k = -1
+ const int num_boxes = 22743;
+ const int num_classes = 2;
+
+ // [batch, boxes, 1, 4]
+ std::vector<float> boxes = {
+ 0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, 0.0, -0.1, 1.0, 0.9,
+ 0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0};
+ boxes.resize(num_boxes * 4, PAD);  // only the first 6 boxes are explicit; the rest are PAD
+
+ // [batch, classes, 1, boxes]
+ std::vector<float> scores = {
+ 0.9, 0.75, 0.6, 0.95, 0.5, 0.3};
+ scores.resize(num_boxes * num_classes, PAD);  // every score not set explicitly is PAD
+ scores[num_boxes * (num_classes - 1)] = 0.95;  // scores for the last class (class 1) start at index num_boxes
+ scores[num_boxes * (num_classes - 1) + 1] = 0.75;
+ scores[num_boxes * (num_classes - 1) + 2] = 0.6;
+ scores[num_boxes * (num_classes - 1) + 3] = 0.80;
+ scores[num_boxes * (num_classes - 1) + 4] = 0.5;
+ scores[num_boxes * (num_classes - 1) + 5] = 0.3;
+
+ std::vector<float> expected_output = {  // 3 rows of 6 values; presumably class id, score, then 4 box coordinates (boxes 0, 3, 1) - confirm against the fixture
+1.00, 0.95, 0.00, 0.00, 1.00, 1.00, 1.00, 0.8, 0.00, 10.00, 1.00, 11.00,
+ 1.00, 0.13636364, 0.0, 0.1, 1.0, 1.1};
+
+ return {
+ 1, // num_batches
+ num_boxes,// num_boxes
+ num_classes,// num_classes
+ 3, // num_selected_boxes
+ false, // sort_result_across_batch
+ 0.01f, // score_threshold
+ -1, // nms_top_k
+ 3, // keep_top_k
+ 0, // background_class
+ 2.0f, // gaussian_sigma
+ 0.01f, // post_threshold
+ true, // normalized
+ boxes,
+ scores,
+ expected_output,// expected_output
+ std::vector<int>{0, 3, 1},// expected_selected_boxes
+ std::vector<int>{3},// expected_valid_output
+ ov::op::v8::MatrixNms::SortResultType::SCORE, // sort_result_type
+ ov::op::v8::MatrixNms::DecayFunction::LINEAR, // decay_function
+ "large_value_of_max_boxes_per_class"};
+}
+
 const std::vector<format::type> layout_formats = {format::bfyx,
  format::b_fs_yx_fsv16,
  format::b_fs_yx_fsv32,
@@ -663,6 +711,7 @@ INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_identical_boxes_inputs)
 INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_top_k_inputs)
 INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_single_box_inputs)
 INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_no_output_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_large_value_of_max_boxes_per_class)  // inputs with 22743 boxes and nms_top_k = -1
 
 using ov::float16;
 INSTANTIATE_MATRIX_NMS_TEST_SUITE(float16, get_matrix_nms_smoke_inputs)
@@ -678,6 +727,7 @@ INSTANTIATE_MATRIX_NMS_TEST_SUITE(float16, get_matrix_nms_identical_boxes_inputs
 INSTANTIATE_MATRIX_NMS_TEST_SUITE(float16, get_matrix_nms_top_k_inputs)
 INSTANTIATE_MATRIX_NMS_TEST_SUITE(float16, get_matrix_nms_single_box_inputs)
 INSTANTIATE_MATRIX_NMS_TEST_SUITE(float16, get_matrix_nms_no_output_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(float16, get_matrix_nms_large_value_of_max_boxes_per_class)  // inputs with 22743 boxes and nms_top_k = -1
 
 #ifndef RUN_ALL_MODEL_CACHING_TESTS
 INSTANTIATE_TEST_SUITE_P(matrix_nms_test_float16get_matrix_nms_smoke_inputs_cached,