
Commit c314d6c

airMeng and luoyu-intel authored

[Graph] Add AMX_BF16 and AMX_INT8 kernels (#126)

Signed-off-by: Hengyu Meng <hengyu.meng@intel.com>
Co-authored-by: luoyu-intel <yu.luo@intel.com>

1 parent 69c1f33 commit c314d6c


45 files changed: 7,377 additions and 1,052 deletions

.github/CODEOWNERS

Lines changed: 2 additions & 1 deletion

@@ -26,4 +26,5 @@
 /intel_extension_for_transformers/backends/neural_engine/Cmake* yu.luo@intel.com
-/intel_extension_for_transformers/backends/neural_engine/cmake/ yu.luo@intel.com
+/intel_extension_for_transformers/backends/neural_engine/cmake/ yu.luo@intel.com
+/intel_extension_for_transformers/backends/neural_engine/graph/jblas yu.luo@intel.com

intel_extension_for_transformers/backends/neural_engine/graph/CMakeLists.txt

Lines changed: 8 additions & 0 deletions

@@ -74,6 +74,14 @@ option(NE_PROFILING "neural_engine: use Profiling"
 if (NE_PROFILING)
   add_compile_definitions(NE_PERF)
 endif()
+option(NE_GELU_VEC "neural_engine: enable vec in gelu" ON)
+if (NE_GELU_VEC)
+  add_compile_definitions(NE_GELU_USE_VEC)
+endif()
+
+if(NE_BUILD_TESTS)
+  enable_testing()
+endif()
 
 if (MSVC)
   add_compile_definitions(_CRT_SECURE_NO_WARNINGS NOMINMAX)
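
Both options follow the standard CMake cache-variable pattern, so assuming a conventional out-of-source build they can be toggled at configure time, e.g.:

cmake -B build -DNE_GELU_VEC=OFF -DNE_BUILD_TESTS=ON

-DNE_GELU_VEC=OFF falls back to the scalar GELU loop (see the ele_wise.h diff below), while -DNE_BUILD_TESTS=ON enables the ctest targets registered in core/CMakeLists.txt.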

intel_extension_for_transformers/backends/neural_engine/graph/CMakePresets.json

Lines changed: 0 additions & 33 deletions

@@ -23,21 +23,6 @@
     "inherits": "linux-debug",
     "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" }
   },
-  {
-    "name": "macos-debug",
-    "displayName": "macOS Debug",
-    "description": "Target a remote macOS system.",
-    "generator": "Ninja",
-    "binaryDir": "${sourceDir}/out/build/${presetName}",
-    "installDir": "${sourceDir}/out/install/${presetName}",
-    "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" },
-    "condition": {
-      "type": "equals",
-      "lhs": "${hostSystemName}",
-      "rhs": "Darwin"
-    },
-    "vendor": { "microsoft.com/VisualStudioRemoteSettings/CMake/1.0": { "sourceDir": "$env{HOME}/.vs/$ms{projectDirName}" } }
-  },
   {
     "name": "windows-base",
     "description": "Target Windows with the Visual Studio development environment.",
@@ -72,24 +57,6 @@
     "description": "Target Windows (64-bit) with the Visual Studio development environment. (RelWithDebInfo)",
     "inherits": "x64-debug",
     "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" }
-  },
-  {
-    "name": "x86-debug",
-    "displayName": "x86 Debug",
-    "description": "Target Windows (32-bit) with the Visual Studio development environment. (Debug)",
-    "inherits": "windows-base",
-    "architecture": {
-      "value": "x86",
-      "strategy": "external"
-    },
-    "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" }
-  },
-  {
-    "name": "x86-release",
-    "displayName": "x86 Release",
-    "description": "Target Windows (32-bit) with the Visual Studio development environment. (RelWithDebInfo)",
-    "inherits": "x86-debug",
-    "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" }
   }
 ]
}

intel_extension_for_transformers/backends/neural_engine/graph/application/ChatGPTJ/pybind_gptj.cpp

Lines changed: 184 additions & 82 deletions

(Large diff not rendered by default.)

intel_extension_for_transformers/backends/neural_engine/graph/application/ChatGPTJ/quant_gptj.cpp

Lines changed: 2 additions & 2 deletions

@@ -33,9 +33,9 @@ class gptj_quant_layer : public quant_layer_base {
   virtual quant_params_internal get_layer_config(std::string layername, std::vector<int64_t> ne,
                                                  ne_type type) override {
     bool quantize = layername.rfind("weight") == layername.size() - 6;  // ends with 'weight'?
-    if (layername == "transformer.wte.weight") {
+    if (layername == "transformer.wte.weight" || layername == "lm_head.weight") {
       // special layer process, can be loaded by config file
-      return quant_params_internal();  // return q4_0 to cover the usage of getrow
+      return quant_params_internal{quant_bits::count};  // skip for head and tail layers
     }
     quantize &= (ne.size() == 2);
     if (quantize) {
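
Here quant_params_internal{quant_bits::count} works as an out-of-range sentinel: count is the enumerator past the last real bit-width, so the quantizer leaves the embedding (transformer.wte.weight) and lm_head.weight tensors in full precision. A minimal sketch of that sentinel pattern, with hypothetical quant_bits/quant_params_internal shapes standing in for the repo's real definitions:

#include <cstdint>
#include <string>

// Hypothetical mirror of the quantizer's config types (sketch only).
enum class quant_bits : uint8_t { q4 = 0, q8, count /* sentinel: not a real bit-width */ };

struct quant_params_internal {
  quant_bits bits = quant_bits::q4;  // default low-bit config
};

// Consumer side: a sentinel config means "keep this tensor in fp32".
static bool should_quantize(const std::string& name, const quant_params_internal& p) {
  if (p.bits == quant_bits::count) return false;        // head/tail layers skipped
  return name.rfind("weight") == name.size() - 6;       // ends with "weight"
}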

intel_extension_for_transformers/backends/neural_engine/graph/application/common.cpp

Lines changed: 4 additions & 12 deletions

@@ -786,21 +786,13 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params param
     if (params.compute_type == "int8") {
       using GemmKernel = jblas::wrapper::gemm_default::weight_comp::avx512_vnni::GemmKernelDynamicQuantS4KBlock;
       static GemmKernel kernel;
-      if (cd->AVX512F()) {
-        packedw =
-            kernel.getWeightPtr()->compressWeightTranspose<JblasAVX512F>(n, k, f32ptr, k, params.block_size, type);
-      } else {
-        packedw = kernel.getWeightPtr()->compressWeightTranspose<JblasNoSIMD>(n, k, f32ptr, k, params.block_size, type);
-      }
+      assert(cd->AVX512F());
+      packedw = kernel.getWeightPtr()->compressWeightTranspose(n, k, f32ptr, k, params.block_size, type);
     } else if (params.compute_type == "fp32") {
       using GemmKernel = jblas::wrapper::gemm_default::weight_comp::avx512f::GemmKernelS4KBlock;
       static GemmKernel kernel;
-      if (cd->AVX512F()) {
-        packedw =
-            kernel.getWeightPtr()->compressWeightTranspose<JblasAVX512F>(n, k, f32ptr, k, params.block_size, type);
-      } else {
-        packedw = kernel.getWeightPtr()->compressWeightTranspose<JblasNoSIMD>(n, k, f32ptr, k, params.block_size, type);
-      }
+      assert(cd->AVX512F());
+      packedw = kernel.getWeightPtr()->compressWeightTranspose(n, k, f32ptr, k, params.block_size, type);
     }
   } else if (params.bits == 8) {
     // TODO add 8bit quantization
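
The runtime JblasAVX512F/JblasNoSIMD branch is gone: compressWeightTranspose now relies on its default ISA template argument, and the code simply asserts that the host CPU has AVX512F. A minimal sketch of the same assert-instead-of-dispatch idea, using the GCC/Clang builtin __builtin_cpu_supports as a stand-in for jblas's cd-> CPU-device query:

#include <cassert>

// Sketch: the packing kernel is compiled for one ISA, so instead of
// branching per CPU at runtime we fail fast when the host lacks it.
void pack_weight_avx512f_only(const float* src, int n, int k) {
  assert(__builtin_cpu_supports("avx512f") && "AVX512F required");
  // ... AVX512F-only weight packing would run here ...
  (void)src;
  (void)n;
  (void)k;
}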

intel_extension_for_transformers/backends/neural_engine/graph/core/CMakeLists.txt

Lines changed: 29 additions & 0 deletions

@@ -26,3 +26,32 @@ if(NOT WIN32)
   target_link_libraries(ne_layers PUBLIC rt)
 endif()
 
+add_compile_definitions(NE_USE_RN_BF16FP16=1)
+
+
+if (NE_BUILD_TESTS)
+
+  function(add_test_target src)
+    get_filename_component(test_target ${src} NAME_WE)
+    get_filename_component(src_dir ${src} DIRECTORY)
+    string(REGEX REPLACE [/\\] "_" src_dir ${src_dir})
+    if(src_dir)
+      set (test_target "${src_dir}_${test_target}")
+    endif()
+    set (test_target "test_${test_target}")
+    add_executable_w_warning(${test_target} ${src})
+    target_compile_definitions(${test_target} PRIVATE NE_TESTS)
+    target_compile_options(${test_target} PRIVATE -fsanitize=address)
+    target_link_options(${test_target} PRIVATE -fsanitize=address)
+    target_include_directories(${test_target} PUBLIC .)
+    target_link_libraries(${test_target} PUBLIC Threads::Threads jblas::jblas ne_vec)
+    if(NOT WIN32)
+      target_link_libraries(${test_target} PUBLIC rt)
+    endif()
+    add_test(NAME ${test_target} COMMAND ${test_target})
+    set_tests_properties(${test_target} PROPERTIES LABELS "${src_dir}_test")
+  endfunction()
+
+  add_test_target(layers/mha_dense.cpp)
+
+endif()
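
Each registered test builds with AddressSanitizer and is labeled from its source directory, so for layers/mha_dense.cpp the label becomes layers_test; assuming standard ctest label matching, the new test could be run in isolation with ctest -L layers_test from the build directory.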

intel_extension_for_transformers/backends/neural_engine/graph/core/data_types.h

Lines changed: 5 additions & 3 deletions

@@ -13,10 +13,11 @@
 // limitations under the License.
 #pragma once
 
-#include <stdint.h>
-#include <stddef.h>
-#include <stdbool.h>
 #include <assert.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
 #include <string.h>
 
 #ifdef __cplusplus
@@ -26,6 +27,7 @@ extern "C" {
 // floating point type used to accumulate sums
 typedef double ne_float;
 typedef uint16_t ne_fp16_t;
+typedef uint16_t ne_bf16_t;
 
 enum ne_type {
   NE_TYPE_F32 = 0,
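
ne_bf16_t mirrors ne_fp16_t as a raw uint16_t: bfloat16 is just the top 16 bits of an IEEE-754 float. The NE_USE_RN_BF16FP16=1 define added in core/CMakeLists.txt suggests round-to-nearest conversion rather than plain truncation. A minimal sketch of both directions under that assumption (the repo's actual conversion helpers may differ; NaN handling is omitted):

#include <cstdint>
#include <cstring>

typedef uint16_t ne_bf16_t;  // raw storage: high 16 bits of an fp32 value

static inline float ne_bf16_to_fp32_sketch(ne_bf16_t h) {
  uint32_t bits = (uint32_t)h << 16;  // low mantissa bits become zero
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

static inline ne_bf16_t ne_fp32_to_bf16_sketch(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
#ifdef NE_USE_RN_BF16FP16
  // round-to-nearest-even on the 16 discarded mantissa bits
  bits += 0x7FFFu + ((bits >> 16) & 1u);
#endif
  return (ne_bf16_t)(bits >> 16);  // without the define: truncation
}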

intel_extension_for_transformers/backends/neural_engine/graph/core/layers/Ops.h

Lines changed: 3 additions & 1 deletion

@@ -47,7 +47,7 @@ enum ne_op {
   NE_OP_RMS_NORM_BACK,
 
   NE_OP_MUL_MAT,
-
+  NE_OP_MUL_MAT_BIAS,
   NE_OP_SCALE,
   NE_OP_SET,
   NE_OP_CPY,
@@ -72,6 +72,8 @@ enum ne_op {
   // LLM related
   NE_OP_MUL_QKV,
   NE_OP_MUL_FFN_SILU,
+  NE_OP_MUL_FFN_GELU,
+  NE_OP_MUL_FFN_ADD_GELU,
   NE_OP_FLASH_ATTN,
   NE_OP_FLASH_FF,

intel_extension_for_transformers/backends/neural_engine/graph/core/layers/ele_wise.h

Lines changed: 34 additions & 10 deletions

@@ -40,6 +40,14 @@ inline static void ne_vec_set_f16(const int n, ne_fp16_t* x, const int32_t v) {
   for (int i = 0; i < n; ++i) x[i] = v;
 }
 
+inline static void ne_vec_srl_i32(const int n, int32_t* z, const int32_t* x, int32_t v) {
+  for (int i = 0; i < n; ++i) z[i] = x[i] >> v;
+}
+
+inline static void ne_vec_and_i32(const int n, int32_t* z, const int32_t* x, const int32_t* y) {
+  for (int i = 0; i < n; ++i) z[i] = x[i] & y[i];
+}
+
 inline static void ne_vec_add_f32(const int n, float* z, const float* x, const float* y) {
   for (int i = 0; i < n; ++i) z[i] = x[i] + y[i];
 }
@@ -171,22 +179,38 @@ inline static void ne_vec_gelu_f16(const int n, ne_fp16_t* y, const ne_fp16_t* x
   }
 }
 
-#ifdef NE_GELU_FP16
-inline static void ne_vec_gelu_f32(const int n, float* y, const float* x) {
-  uint16_t t;
-  for (int i = 0; i < n; ++i) {
-    ne_fp16_t fp16 = NE_FP32_TO_FP16(x[i]);
-    memcpy(&t, &fp16, sizeof(uint16_t));
-    y[i] = NE_FP16_TO_FP32(table_gelu_f16[t]);
-  }
+inline static void ne_vec_tanh_f32(const int n, float* y, const float* x) {
+  for (int i = 0; i < n; i++) y[i] = tanhf(x[i]);
 }
-#else
+
 inline static void ne_vec_gelu_f32(const int n, float* y, const float* x) {
+#ifdef NE_GELU_USE_VEC
+  // compute G(x) = sqrt_root_two_over_pi * x * (1 + fitting_const * x * x)
+  float* aux0 = (float*)malloc(n * sizeof(float));
+  ne_vec_sqr_f32(n, aux0, x);
+  float* aux1 = (float*)malloc(n * sizeof(float));
+  ne_vec_set_f32(n, aux1, 1.0f);
+  ne_vec_mad_f32(n, aux1, aux0, GELU_COEF_A);
+  ne_vec_mul_f32(n, aux0, x, aux1);
+  ne_vec_set_f32(n, aux1, SQRT_2_OVER_PI);
+  ne_vec_mul_f32(n, aux1, aux0, aux1);
+
+  // compute tanh(G(x))
+  ne_vec_tanh_f32(n, aux0, aux1);
+  // Gelu(x) = 0.5f * x * (1.0f + tanh(G(x)))
+  ne_vec_acc1_f32(n, aux0, 1.0f);
+  ne_vec_mul_f32(n, y, x, aux0);
+  ne_vec_set_f32(n, aux0, 0.5f);
+  ne_vec_mul_f32(n, y, y, aux0);
+
+  free(aux0);
+  free(aux1);
+#else
   for (int i = 0; i < n; ++i) {
     y[i] = ne_gelu_f32(x[i]);
  }
-}
 #endif
+}
 
 // Sigmoid Linear Unit (SiLU) function
 inline static float ne_silu_f32(float x) { return x / (1.0f + expf(-x)); }
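
The vectorized path evaluates the standard tanh approximation GELU(x) ≈ 0.5·x·(1 + tanh(√(2/π)·x·(1 + GELU_COEF_A·x²))) in bulk over the helper vec primitives: aux0/aux1 carry x², the polynomial, and G(x) before the tanh and final scaling. A minimal scalar reference of the same formula; the constant values below are assumptions matching the common ggml-style definitions, not lines shown in this diff:

#include <cmath>
#include <cstdio>

// Assumed constant values (ggml-style); the repo defines its own macros.
static const float GELU_COEF_A = 0.044715f;
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;

// Scalar reference for the tanh-approximation GELU the vec path implements:
// GELU(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * x * (1 + 0.044715 * x^2)))
static float gelu_ref(float x) {
  float g = SQRT_2_OVER_PI * x * (1.0f + GELU_COEF_A * x * x);
  return 0.5f * x * (1.0f + tanhf(g));
}

int main() {
  for (float x = -2.0f; x <= 2.01f; x += 1.0f)
    printf("gelu(% .1f) = % .6f\n", x, gelu_ref(x));
}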
