Skip to content

Commit 9f8403b

Browse files
Merge branch '25-add-avx2-support' into 'develop'
Resolve "Add avx2 support" See merge request geon/composite-comps!22
2 parents 401dfc9 + 4d4f178 commit 9f8403b

File tree

18 files changed

+594
-281
lines changed

18 files changed

+594
-281
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ include(FetchContent)
2727
# composite dependency
2828
FetchContent_Declare(composite
2929
GIT_REPOSITORY https://github.com/geontech/composite.git
30-
GIT_TAG v0.3.0
30+
GIT_TAG v0.3.3
3131
)
3232
# vrtgen dependency
3333
FetchContent_Declare(vrtgen

docker/Dockerfile.rocky9

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ WORKDIR /opt/composite-comps
2828
RUN set -ex; \
2929
. /opt/rh/gcc-toolset-13/enable; \
3030
cmake -B docker-build \
31-
-DCOMPOSITE_USE_NATS=ON \
31+
-DCOMPOSITE_USE_NATS=OFF \
3232
-DCOMPOSITE_INSTALL=ON \
3333
-DCMAKE_INSTALL_PREFIX=/opt/usr/local \
3434
-DCMAKE_BUILD_TYPE=Release; \

examples/write.json

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -23,23 +23,6 @@
2323
"value" : "9999"
2424
}
2525
]
26-
},
27-
{
28-
"name" : "histogram",
29-
"properties" : [
30-
{
31-
"name" : "transport",
32-
"value" : "vita49"
33-
},
34-
{
35-
"name" : "sample_rate",
36-
"value" : "32768"
37-
},
38-
{
39-
"name" : "adc_bits",
40-
"value" : "16"
41-
}
42-
]
4326
}
4427
],
4528
"connections" : [
@@ -49,9 +32,9 @@
4932
"port" : "data_out"
5033
},
5134
"input" : {
52-
"component" : "histogram",
35+
"component" : "stov",
5336
"port" : "data_in"
5437
}
55-
}
38+
},
5639
]
5740
}

src/components/exp_smooth/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ set(CMAKE_CXX_FLAGS_RELEASE_INIT "-O3")
3333
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
3434

3535
# Custom compile options
36-
add_compile_options(-march=cascadelake)
36+
add_compile_options(-march=x86-64-v2 -mtune=generic)
3737

3838
# Library
3939
add_library(exp_smooth MODULE

src/components/exp_smooth/work.hpp

Lines changed: 106 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -28,58 +28,144 @@ class work {};
2828

2929
template <>
3030
class work<float> {
31-
using psd_data_t = aligned::aligned_mem<float>;
31+
using data_type = aligned::aligned_mem<float>;
32+
static constexpr auto STRIDE_256 = std::size_t{256u / 8u / sizeof(float)};
33+
static constexpr auto STRIDE_512 = std::size_t{512u / 8u / sizeof(float)};
3234
public:
3335
explicit work(float alpha) {
34-
m_alpha_vec = _mm512_set1_ps(alpha);
35-
m_one_minus_alpha_vec = _mm512_set1_ps(1 - alpha);
36+
// Initialize the function pointer based on CPU features
37+
if (__builtin_cpu_supports("avx512f")) {
38+
init_avx512(alpha);
39+
} else if (__builtin_cpu_supports("avx2") && __builtin_cpu_supports("fma")) {
40+
init_avx2(alpha);
41+
}
42+
}
43+
44+
auto process(data_type* curr_psd, data_type* prev_psd) const -> void {
45+
(this->*process_func)(curr_psd, prev_psd);
46+
}
47+
48+
private:
49+
[[gnu::target("avx2,fma")]]
50+
auto init_avx2(double alpha) -> void {
51+
process_func = &work::process_avx2;
52+
m_alpha_vec_256 = _mm256_set1_ps(alpha);
53+
m_one_minus_alpha_vec_256 = _mm256_set1_ps(1 - alpha);
54+
}
55+
56+
[[gnu::target("avx512f")]]
57+
auto init_avx512(double alpha) -> void {
58+
process_func = &work::process_avx512;
59+
m_alpha_vec_512 = _mm512_set1_ps(alpha);
60+
m_one_minus_alpha_vec_512 = _mm512_set1_ps(1 - alpha);
3661
}
3762

38-
auto process(psd_data_t* curr_psd, psd_data_t* prev_psd) const -> void {
39-
for (auto i=0u; i < curr_psd->size(); i += 16) {
63+
[[gnu::target("avx2,fma")]]
64+
auto process_avx2(data_type* curr_psd, data_type* prev_psd) const -> void {
65+
for (auto i=0u; i < curr_psd->size(); i += STRIDE_256) {
66+
// Load data
67+
auto curr_data = _mm256_load_ps(curr_psd->data() + i);
68+
auto prev_data = _mm256_load_ps(prev_psd->data() + i);
69+
// Multiply current data by alpha
70+
curr_data = _mm256_mul_ps(curr_data, m_alpha_vec_256);
71+
// Multiply prev by (1-alpha) and add to current
72+
curr_data = _mm256_fmadd_ps(prev_data, m_one_minus_alpha_vec_256, curr_data);
73+
// Store result into psd
74+
_mm256_store_ps(curr_psd->data() + i, curr_data);
75+
}
76+
}
77+
78+
[[gnu::target("avx512f")]]
79+
auto process_avx512(data_type* curr_psd, data_type* prev_psd) const -> void {
80+
for (auto i=0u; i < curr_psd->size(); i += STRIDE_512) {
4081
// Load data
4182
auto curr_data = _mm512_load_ps(curr_psd->data() + i);
4283
auto prev_data = _mm512_load_ps(prev_psd->data() + i);
4384
// Multiply current data by alpha
44-
curr_data = _mm512_mul_ps(curr_data, m_alpha_vec);
85+
curr_data = _mm512_mul_ps(curr_data, m_alpha_vec_512);
4586
// Multiply prev by (1-alpha) and add to current
46-
curr_data = _mm512_fmadd_ps(prev_data, m_one_minus_alpha_vec, curr_data);
87+
curr_data = _mm512_fmadd_ps(prev_data, m_one_minus_alpha_vec_512, curr_data);
4788
// Store result into psd
4889
_mm512_store_ps(curr_psd->data() + i, curr_data);
4990
}
5091
}
5192

52-
private:
53-
__m512 m_alpha_vec;
54-
__m512 m_one_minus_alpha_vec;
93+
auto (work::*process_func)(data_type*, data_type*) const -> void;
94+
__m256 m_alpha_vec_256;
95+
__m512 m_alpha_vec_512;
96+
__m256 m_one_minus_alpha_vec_256;
97+
__m512 m_one_minus_alpha_vec_512;
5598

5699
}; // class work<float>
57100

58101
template <>
59102
class work<double> {
60-
using psd_data_t = aligned::aligned_mem<double>;
103+
using data_type = aligned::aligned_mem<double>;
104+
static constexpr auto STRIDE_256 = std::size_t{256u / 8u / sizeof(double)};
105+
static constexpr auto STRIDE_512 = std::size_t{512u / 8u / sizeof(double)};
61106
public:
62107
explicit work(double alpha) {
63-
m_alpha_vec = _mm512_set1_pd(alpha);
64-
m_one_minus_alpha_vec = _mm512_set1_pd(1 - alpha);
108+
// Initialize the function pointer based on CPU features
109+
if (__builtin_cpu_supports("avx512f")) {
110+
init_avx512(alpha);
111+
} else if (__builtin_cpu_supports("avx2") && __builtin_cpu_supports("fma")) {
112+
init_avx2(alpha);
113+
}
114+
}
115+
116+
auto process(data_type* curr_psd, data_type* prev_psd) const -> void {
117+
(this->*process_func)(curr_psd, prev_psd);
118+
}
119+
120+
private:
121+
[[gnu::target("avx2,fma")]]
122+
auto init_avx2(double alpha) -> void {
123+
process_func = &work::process_avx2;
124+
m_alpha_vec_256 = _mm256_set1_pd(alpha);
125+
m_one_minus_alpha_vec_256 = _mm256_set1_pd(1 - alpha);
126+
}
127+
128+
[[gnu::target("avx512f")]]
129+
auto init_avx512(double alpha) -> void {
130+
process_func = &work::process_avx512;
131+
m_alpha_vec_512 = _mm512_set1_pd(alpha);
132+
m_one_minus_alpha_vec_512 = _mm512_set1_pd(1 - alpha);
65133
}
66134

67-
auto process(psd_data_t* curr_psd, psd_data_t* prev_psd) const -> void {
68-
for (auto i=0u; i < curr_psd->size(); i += 8) {
135+
[[gnu::target("avx2,fma")]]
136+
auto process_avx2(data_type* curr_psd, data_type* prev_psd) const -> void {
137+
for (auto i=0u; i < curr_psd->size(); i += STRIDE_256) {
138+
// Load data
139+
auto curr_data = _mm256_load_pd(curr_psd->data() + i);
140+
auto prev_data = _mm256_load_pd(prev_psd->data() + i);
141+
// Multiply current data by alpha
142+
curr_data = _mm256_mul_pd(curr_data, m_alpha_vec_256);
143+
// Multiply prev by (1-alpha) and add to current
144+
curr_data = _mm256_fmadd_pd(prev_data, m_one_minus_alpha_vec_256, curr_data);
145+
// Store result into psd
146+
_mm256_store_pd(curr_psd->data() + i, curr_data);
147+
}
148+
}
149+
150+
[[gnu::target("avx512f")]]
151+
auto process_avx512(data_type* curr_psd, data_type* prev_psd) const -> void {
152+
for (auto i=0u; i < curr_psd->size(); i += STRIDE_512) {
69153
// Load data
70154
auto curr_data = _mm512_load_pd(curr_psd->data() + i);
71155
auto prev_data = _mm512_load_pd(prev_psd->data() + i);
72156
// Multiply current data by alpha
73-
curr_data = _mm512_mul_pd(curr_data, m_alpha_vec);
157+
curr_data = _mm512_mul_pd(curr_data, m_alpha_vec_512);
74158
// Multiply prev by (1-alpha) and add to current
75-
curr_data = _mm512_fmadd_pd(prev_data, m_one_minus_alpha_vec, curr_data);
159+
curr_data = _mm512_fmadd_pd(prev_data, m_one_minus_alpha_vec_512, curr_data);
76160
// Store result into psd
77161
_mm512_store_pd(curr_psd->data() + i, curr_data);
78162
}
79163
}
80164

81-
private:
82-
__m512d m_alpha_vec;
83-
__m512d m_one_minus_alpha_vec;
165+
auto (work::*process_func)(data_type*, data_type*) const -> void;
166+
__m256d m_alpha_vec_256;
167+
__m512d m_alpha_vec_512;
168+
__m256d m_one_minus_alpha_vec_256;
169+
__m512d m_one_minus_alpha_vec_512;
84170

85171
}; // class work<double>

src/components/fft/CMakeLists.txt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ set(CMAKE_CXX_FLAGS_RELEASE_INIT "-O3")
3333
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
3434

3535
# Custom compile options
36-
add_compile_options(-march=cascadelake -ffast-math)
36+
add_compile_options(-march=x86-64-v2 -mtune=generic -ffast-math)
3737

3838
# Library
3939
add_library(fft MODULE
@@ -43,7 +43,6 @@ add_library(fft MODULE
4343
target_include_directories(fft
4444
PRIVATE
4545
${PROJECT_SOURCE_DIR}/../../../include
46-
${vrtgen_SOURCE_DIR}/include
4746
)
4847
target_link_libraries(fft
4948
PRIVATE

src/components/fft/apply_window.hpp

Lines changed: 0 additions & 44 deletions
This file was deleted.

0 commit comments

Comments
 (0)