Commit 2d86626

[CPU] Revert "Fix Alibaba-NLP accuracy issue (#32507)" (#32653)
Details:
- The MLP fusion pattern changes introduced a performance regression for some models, so the pattern limitation settings need to be reconsidered to fix the accuracy issue without regressing performance on currently supported models.

Tickets:
- CVS-176051, CVS-176019
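For context, a minimal sketch of the gated-MLP subgraph shape that MLPFusionPass targets, mirroring the graph built in the functional test changed below (down(act(gate(x)) * up(x))). The helper name make_gated_mlp and its parameters are illustrative assumptions, not part of this commit:

// Minimal sketch, assuming hypothetical helper/parameter names; mirrors the
// down( act(gate(x)) * up(x) ) structure the fusion pass looks for.
#include <memory>

#include "openvino/op/matmul.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/op/swish.hpp"

std::shared_ptr<ov::Node> make_gated_mlp(const ov::Output<ov::Node>& src,
                                         const ov::Output<ov::Node>& gate_w,
                                         const ov::Output<ov::Node>& up_w,
                                         const ov::Output<ov::Node>& down_w) {
    // gate and up projections share the same activation input
    auto gate_proj = std::make_shared<ov::op::v0::MatMul>(src, gate_w, false, true);
    auto up_proj = std::make_shared<ov::op::v0::MatMul>(src, up_w, false, true);
    // gated activation: act(gate) * up (Swish here; the tests also cover Gelu)
    auto gate_act = std::make_shared<ov::op::v4::Swish>(gate_proj);
    auto gate_up = std::make_shared<ov::op::v1::Multiply>(gate_act, up_proj);
    // down projection produces the MLP output
    return std::make_shared<ov::op::v0::MatMul>(gate_up, down_w, false, true);
}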
1 parent 4d7ab59 commit 2d86626

File tree

2 files changed: +13 additions, -68 deletions

src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/mlp_fusion.cpp

Lines changed: 0 additions & 28 deletions
@@ -122,34 +122,6 @@ ov::intel_cpu::MLPFusionPass::MLPFusionPass() {
     matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) {
         const auto& pattern_map = m.get_pattern_value_map();
         auto root = m.get_match_root();
-        // Check that the first input of Multiply is the gate (activation) branch and the second input is the up branch;
-        // otherwise, do not fuse. Input order is critical for correctness: mismatched input order can silently cause
-        // accuracy issues.
-        auto mlp_gated_up_node = pattern_map.at(mlp_gated_up).get_node_shared_ptr();
-        auto input0 = mlp_gated_up_node->input_value(0);
-        auto input1 = mlp_gated_up_node->input_value(1);
-
-        bool input0_is_gate = false;
-        bool input1_is_up = false;
-
-        if (pattern_map.count(mlp_silu_gate) && input0.get_node() == pattern_map.at(mlp_silu_gate).get_node()) {
-            input0_is_gate = true;
-        }
-        if (pattern_map.count(mlp_gelu_gate) && input0.get_node() == pattern_map.at(mlp_gelu_gate).get_node()) {
-            input0_is_gate = true;
-        }
-
-        if (pattern_map.count(mlp_up_proj) && input1.get_node() == pattern_map.at(mlp_up_proj).get_node()) {
-            input1_is_up = true;
-        }
-        if (pattern_map.count(gate_up_proj_split) &&
-            input1.get_node() == pattern_map.at(gate_up_proj_split).get_node() && input1.get_index() == 1) {
-            input1_is_up = true;
-        }
-
-        if (!input0_is_gate || !input1_is_up) {
-            return false;
-        }
         auto src = pattern_map.at(input);
         if (!src.get_element_type().is_real()) {
             // FakeQuantize, should skip fusion

src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mlp_fusion.cpp

Lines changed: 13 additions & 40 deletions
@@ -6,13 +6,13 @@
 #include <vector>
 
 #include "common_test_utils/ov_tensor_utils.hpp"
+#include "openvino/runtime/exec_model_info.hpp"
+#include "shared_test_classes/base/ov_subgraph.hpp"
 #include "openvino/op/convert.hpp"
 #include "openvino/op/gelu.hpp"
 #include "openvino/op/matmul.hpp"
 #include "openvino/op/multiply.hpp"
 #include "openvino/op/swish.hpp"
-#include "openvino/runtime/exec_model_info.hpp"
-#include "shared_test_classes/base/ov_subgraph.hpp"
 
 namespace ov {
 namespace test {
@@ -23,7 +23,6 @@ struct LLMMLPFusionParams {
     size_t up_size;
     std::string act_type;
     bool use_dynamic_quant;
-    bool swap_inputs;  // true = swap inputs to prevent fusion, false = normal order for fusion
 };
 
 class LLMMLPFusionTest : public testing::WithParamInterface<LLMMLPFusionParams>, public ov::test::SubgraphBaseTest {
@@ -40,7 +39,6 @@ class LLMMLPFusionTest : public testing::WithParamInterface<LLMMLPFusionParams>,
         result << "up_size=" << obj.param.up_size << "_";
         result << "act_type=" << obj.param.act_type << "_";
         result << "use_dynamic_quant=" << obj.param.use_dynamic_quant << "_";
-        result << "swap_inputs=" << obj.param.swap_inputs << "_";
         result << obj.index;
         return result.str();
     }
@@ -72,8 +70,7 @@ class LLMMLPFusionTest : public testing::WithParamInterface<LLMMLPFusionParams>,
             in_data.start_from = 0;
             in_data.range = 1;
             in_data.resolution = 128;
-            auto tensor_scale_per_oc =
-                ov::test::utils::create_and_fill_tensor(ov::element::f32, ov::Shape{OC, 1}, in_data);
+            auto tensor_scale_per_oc = ov::test::utils::create_and_fill_tensor(ov::element::f32, ov::Shape{OC, 1}, in_data);
             auto scale_per_oc = std::make_shared<ov::op::v0::Constant>(tensor_scale_per_oc);
 
             auto weight_deq = std::make_shared<ov::op::v1::Multiply>(weight_const_f32, scale_per_oc);
@@ -88,8 +85,7 @@ class LLMMLPFusionTest : public testing::WithParamInterface<LLMMLPFusionParams>,
             return std::make_shared<ov::op::v0::Constant>(tensor);
         };
         if (param.use_dynamic_quant)
-            configuration.insert(
-                {ov::hint::dynamic_quantization_group_size.name(), std::numeric_limits<uint64_t>::max()});
+            configuration.insert({ov::hint::dynamic_quantization_group_size.name(), std::numeric_limits<uint64_t>::max()});
 
         auto gate_weight = create_const(param.up_size, param.down_size, 100);
         auto up_weight = create_const(param.up_size, param.down_size, 100);
@@ -105,22 +101,13 @@ class LLMMLPFusionTest : public testing::WithParamInterface<LLMMLPFusionParams>,
         if (param.act_type == "Gelu")
             gate_act = std::make_shared<ov::op::v7::Gelu>(gate_proj);
 
-        // Control input order based on swap_inputs parameter
-        std::shared_ptr<ov::op::v1::Multiply> gate_up;
-        if (param.swap_inputs) {
-            // Swapped order should prevent fusion
-            gate_up = std::make_shared<ov::op::v1::Multiply>(up_proj, gate_act);
-        } else {
-            // Normal order should allow fusion
-            gate_up = std::make_shared<ov::op::v1::Multiply>(gate_act, up_proj);
-        }
-
+        auto gate_up = std::make_shared<ov::op::v1::Multiply>(gate_act, up_proj);
         auto output = std::make_shared<ov::op::v0::MatMul>(gate_up, down_weight, false, true);
 
         function = std::make_shared<ov::Model>(ov::OutputVector{output}, ov::ParameterVector{src});
     }
 
-    void check_fusion_result() {
+    void check_results() {
         auto exec_model = compiledModel.get_runtime_model();
 
         int fused_node_found = 0;
@@ -129,40 +116,26 @@ class LLMMLPFusionTest : public testing::WithParamInterface<LLMMLPFusionParams>,
             if (layer_type == "LLMMLP")
                 fused_node_found++;
         }
-
-        auto& param = this->GetParam();
-        if (param.swap_inputs) {
-            // When inputs are swapped, fusion should NOT happen
-            ASSERT_EQ(fused_node_found, 0) << "Fusion should not occur with swapped inputs";
-        } else {
-            // Normal case, fusion should happen
-            ASSERT_EQ(fused_node_found, 1) << "Fusion should occur with correct input order";
-        }
+        ASSERT_EQ(fused_node_found, 1);
     }
 };
 
 TEST_P(LLMMLPFusionTest, CompareWithRefs) {
     if (!ov::with_cpu_x86_avx512_core_amx_bf16())
         GTEST_SKIP();
     run();
-    check_fusion_result();
+    check_results();
 }
 
 namespace {
 
-static ov::test::InputShape ishape{ov::PartialShape{-1, -1, 4096 / 4},
-                                   {ov::Shape{1, 8, 4096 / 4}, ov::Shape{5, 37, 4096 / 4}}};
+static ov::test::InputShape ishape{ov::PartialShape{-1, -1, 4096 / 4}, {ov::Shape{1, 8, 4096 / 4}, ov::Shape{5, 37, 4096 / 4}}};
 
-// Test parameters combining both normal fusion and no-fusion cases
 const std::vector<LLMMLPFusionParams> mlp_params = {
-    // Normal cases - should fuse (swap_inputs = false)
-    {ishape, 4096 / 4, 11008 / 4, "Gelu", false, false},
-    {ishape, 4096 / 4, 11008 / 4, "Gelu", true, false},
-    {ishape, 4096 / 4, 11008 / 4, "Swish", false, false},
-    {ishape, 4096 / 4, 11008 / 4, "Swish", true, false},
-
-    // Port order issue cases - should NOT fuse (swap_inputs = true)
-    {ishape, 4096 / 4, 11008 / 4, "Gelu", false, true},
+    {ishape, 4096 / 4, 11008 / 4, "Gelu", false},
+    {ishape, 4096 / 4, 11008 / 4, "Gelu", true},
+    {ishape, 4096 / 4, 11008 / 4, "Swish", false},
+    {ishape, 4096 / 4, 11008 / 4, "Swish", true},
 };
 
 INSTANTIATE_TEST_SUITE_P(smoke_LLMMLPFusion,
