Skip to content

Commit d3175c8

Browse files
committed
harmonic_mean
1 parent 0eec1b3 commit d3175c8

File tree

7 files changed

+354
-1
lines changed

7 files changed

+354
-1
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ list(APPEND link_libs gtest
164164
${Boost_LIBRARIES}
165165
${REQUIRED_LLVM_LIBRARIES}
166166
)
167-
167+
list(APPEND link_libs curses)
168168
add_subdirectory(src/modules)
169169
message(STATUS "link_libs=${link_libs}")
170170
add_subdirectory(unittest)

benchmark/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ set(BENCHMARKS
1717
benchmark_memequal_shuffle1.cc
1818
benchmark_raw_allocator.cc
1919
benchmark_interpreters.cc
20+
benchmark_calc_harmonic_mean.cc
2021
)
2122
foreach (src ${BENCHMARKS})
2223
get_filename_component(exe ${src} NAME_WE)
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
//
2+
// Created by grakra on 23-1-9.
3+
//
4+
#include <benchmark/benchmark.h>
5+
#include "hll.hh"
6+
std::vector<int8_t> data14(2<<14);
7+
std::vector<int8_t> data17(2<<14);
8+
std::vector<int8_t> data20(2<<14);
9+
10+
DataInitializer data14_init(data14);
11+
DataInitializer data17_init(data17);
12+
DataInitializer data20_init(data20);
13+
constexpr size_t num_round = 10;
14+
void BM_calc_harmonic_mean1_14(benchmark::State& state) {
15+
for (auto _ : state) {
16+
for (auto i=0; i<num_round;++i)
17+
benchmark::DoNotOptimize(calc_harmonic_mean1(data14.data(), data14.size()));
18+
}
19+
}
20+
21+
void BM_calc_harmonic_mean2_14(benchmark::State& state) {
22+
for (auto _ : state) {
23+
for (auto i=0; i<num_round;++i)
24+
benchmark::DoNotOptimize(calc_harmonic_mean2(data14.data(), data14.size()));
25+
}
26+
}
27+
28+
void BM_calc_harmonic_mean3_14(benchmark::State& state) {
29+
for (auto _ : state) {
30+
for (auto i=0; i<num_round;++i)
31+
benchmark::DoNotOptimize(calc_harmonic_mean3(data14.data(), data14.size()));
32+
}
33+
}
34+
35+
void BM_calc_harmonic_mean4_14(benchmark::State& state) {
36+
for (auto _ : state) {
37+
for (auto i=0; i<num_round;++i)
38+
benchmark::DoNotOptimize(calc_harmonic_mean4(data14.data(), data14.size()));
39+
}
40+
}
41+
42+
void BM_calc_harmonic_mean1_20(benchmark::State& state) {
43+
for (auto _ : state) {
44+
for (auto i=0; i<num_round;++i)
45+
benchmark::DoNotOptimize(calc_harmonic_mean1(data20.data(), data20.size()));
46+
}
47+
}
48+
49+
void BM_calc_harmonic_mean2_20(benchmark::State& state) {
50+
for (auto _ : state) {
51+
for (auto i=0; i<num_round;++i)
52+
benchmark::DoNotOptimize(calc_harmonic_mean2(data20.data(), data20.size()));
53+
}
54+
}
55+
56+
void BM_calc_harmonic_mean3_20(benchmark::State& state) {
57+
for (auto _ : state) {
58+
for (auto i=0; i<num_round;++i)
59+
benchmark::DoNotOptimize(calc_harmonic_mean3(data20.data(), data20.size()));
60+
}
61+
}
62+
63+
void BM_calc_harmonic_mean4_20(benchmark::State& state) {
64+
for (auto _ : state) {
65+
for (auto i=0; i<num_round;++i)
66+
benchmark::DoNotOptimize(calc_harmonic_mean4(data20.data(), data20.size()));
67+
}
68+
}
69+
70+
BENCHMARK(BM_calc_harmonic_mean4_14);
71+
BENCHMARK(BM_calc_harmonic_mean3_14);
72+
BENCHMARK(BM_calc_harmonic_mean1_14);
73+
BENCHMARK(BM_calc_harmonic_mean2_14);
74+
75+
BENCHMARK(BM_calc_harmonic_mean4_20);
76+
BENCHMARK(BM_calc_harmonic_mean3_20);
77+
BENCHMARK(BM_calc_harmonic_mean1_20);
78+
BENCHMARK(BM_calc_harmonic_mean2_20);
79+
80+
BENCHMARK_MAIN();

include/hll.hh

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
#pragma once
2+
#include <immintrin.h>
3+
4+
#include <cmath>
5+
#include <cstddef>
6+
#include <cstdint>
7+
#include <vector>
8+
#include <random>
9+
#include <iostream>
10+
11+
struct DataInitializer {
12+
explicit DataInitializer(std::vector<int8_t>& data){
13+
std::random_device rd;
14+
std::mt19937 gen(rd());
15+
std::uniform_int_distribution<int8_t> r(0, 64);
16+
for (auto i=0; i<data.size(); ++i){
17+
data[i] = r(gen);
18+
}
19+
}
20+
};
21+
22+
std::pair<float, int> calc_harmonic_mean1(int8_t* data, size_t n) {
23+
float harmonic_mean = 0;
24+
int num_zeros = 0;
25+
26+
for (int i = 0; i < n; ++i) {
27+
harmonic_mean += powf(2.0f, -data[i]);
28+
29+
if (data[i] == 0) {
30+
++num_zeros;
31+
}
32+
}
33+
harmonic_mean = 1.0f / harmonic_mean;
34+
return std::make_pair(harmonic_mean, num_zeros);
35+
}
36+
37+
__m256 exp256_ps(__m256 x) {
38+
/* Modified code. The original code is here: https://github.com/reyoung/avx_mathfun
39+
40+
AVX implementation of exp
41+
Based on "sse_mathfun.h", by Julien Pommier
42+
http://gruntthepeon.free.fr/ssemath/
43+
Copyright (C) 2012 Giovanni Garberoglio
44+
Interdisciplinary Laboratory for Computational Science (LISC)
45+
Fondazione Bruno Kessler and University of Trento
46+
via Sommarive, 18
47+
I-38123 Trento (Italy)
48+
This software is provided 'as-is', without any express or implied
49+
warranty. In no event will the authors be held liable for any damages
50+
arising from the use of this software.
51+
Permission is granted to anyone to use this software for any purpose,
52+
including commercial applications, and to alter it and redistribute it
53+
freely, subject to the following restrictions:
54+
1. The origin of this software must not be misrepresented; you must not
55+
claim that you wrote the original software. If you use this software
56+
in a product, an acknowledgment in the product documentation would be
57+
appreciated but is not required.
58+
2. Altered source versions must be plainly marked as such, and must not be
59+
misrepresented as being the original software.
60+
3. This notice may not be removed or altered from any source distribution.
61+
(this is the zlib license)
62+
*/
63+
/*
64+
To increase the compatibility across different compilers the original code is
65+
converted to plain AVX2 intrinsics code without ingenious macro's,
66+
gcc style alignment attributes etc. The modified code requires AVX2
67+
*/
68+
__m256 exp_hi = _mm256_set1_ps(88.3762626647949f);
69+
__m256 exp_lo = _mm256_set1_ps(-88.3762626647949f);
70+
71+
__m256 cephes_LOG2EF = _mm256_set1_ps(1.44269504088896341);
72+
__m256 cephes_exp_C1 = _mm256_set1_ps(0.693359375);
73+
__m256 cephes_exp_C2 = _mm256_set1_ps(-2.12194440e-4);
74+
75+
__m256 cephes_exp_p0 = _mm256_set1_ps(1.9875691500E-4);
76+
__m256 cephes_exp_p1 = _mm256_set1_ps(1.3981999507E-3);
77+
__m256 cephes_exp_p2 = _mm256_set1_ps(8.3334519073E-3);
78+
__m256 cephes_exp_p3 = _mm256_set1_ps(4.1665795894E-2);
79+
__m256 cephes_exp_p4 = _mm256_set1_ps(1.6666665459E-1);
80+
__m256 cephes_exp_p5 = _mm256_set1_ps(5.0000001201E-1);
81+
__m256 tmp = _mm256_setzero_ps(), fx;
82+
__m256i imm0;
83+
__m256 one = _mm256_set1_ps(1.0f);
84+
85+
x = _mm256_min_ps(x, exp_hi);
86+
x = _mm256_max_ps(x, exp_lo);
87+
88+
/* express exp(x) as exp(g + n*log(2)) */
89+
fx = _mm256_mul_ps(x, cephes_LOG2EF);
90+
fx = _mm256_add_ps(fx, _mm256_set1_ps(0.5f));
91+
tmp = _mm256_floor_ps(fx);
92+
__m256 mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
93+
mask = _mm256_and_ps(mask, one);
94+
fx = _mm256_sub_ps(tmp, mask);
95+
tmp = _mm256_mul_ps(fx, cephes_exp_C1);
96+
__m256 z = _mm256_mul_ps(fx, cephes_exp_C2);
97+
x = _mm256_sub_ps(x, tmp);
98+
x = _mm256_sub_ps(x, z);
99+
z = _mm256_mul_ps(x, x);
100+
101+
__m256 y = cephes_exp_p0;
102+
y = _mm256_mul_ps(y, x);
103+
y = _mm256_add_ps(y, cephes_exp_p1);
104+
y = _mm256_mul_ps(y, x);
105+
y = _mm256_add_ps(y, cephes_exp_p2);
106+
y = _mm256_mul_ps(y, x);
107+
y = _mm256_add_ps(y, cephes_exp_p3);
108+
y = _mm256_mul_ps(y, x);
109+
y = _mm256_add_ps(y, cephes_exp_p4);
110+
y = _mm256_mul_ps(y, x);
111+
y = _mm256_add_ps(y, cephes_exp_p5);
112+
y = _mm256_mul_ps(y, z);
113+
y = _mm256_add_ps(y, x);
114+
y = _mm256_add_ps(y, one);
115+
116+
/* build 2^n */
117+
imm0 = _mm256_cvttps_epi32(fx);
118+
imm0 = _mm256_add_epi32(imm0, _mm256_set1_epi32(0x7f));
119+
imm0 = _mm256_slli_epi32(imm0, 23);
120+
__m256 pow2n = _mm256_castsi256_ps(imm0);
121+
y = _mm256_mul_ps(y, pow2n);
122+
return y;
123+
}
124+
125+
std::pair<float, int> calc_harmonic_mean2(int8_t* data, size_t n) {
126+
float harmonic_mean = 0;
127+
int num_zeros = 0;
128+
#if defined(__AVX2__)
129+
auto* p = data;
130+
const auto end = data + n;
131+
constexpr auto BLOCK_SIZE = sizeof(__m256i);
132+
const auto end0 = data + (n & ~(BLOCK_SIZE - 1));
133+
const auto ln2 = _mm256_set1_ps(0.69314718055995f);
134+
const auto zerof32 = _mm256_setzero_ps();
135+
const auto zeroi8 = _mm256_setzero_si256();
136+
auto sumf32 = _mm256_setzero_ps();
137+
for (; p < end0; p += BLOCK_SIZE) {
138+
auto d = _mm256_load_si256(reinterpret_cast<__m256i*>(p));
139+
num_zeros += _mm_popcnt_u32(_mm256_movemask_epi8(_mm256_cmpeq_epi8(d, zeroi8)));
140+
141+
auto pp = p;
142+
for (int i = 0; i < 4; ++i) {
143+
auto x = _mm256_set_ps(pp[0], pp[1], pp[2], pp[3], pp[4], pp[5], pp[6], pp[7]);
144+
sumf32 = _mm256_add_ps(exp256_ps((_mm256_mul_ps(_mm256_sub_ps(zerof32, x), ln2))), sumf32);
145+
pp += 8;
146+
}
147+
}
148+
for (int i = 0; i < sizeof(sumf32) / sizeof(float); ++i) {
149+
harmonic_mean += (reinterpret_cast<float*>(&sumf32))[i];
150+
}
151+
#endif
152+
for (; p < end; ++p) {
153+
harmonic_mean += powf(2.0f, p[0]);
154+
if (p[0] == 0) {
155+
++num_zeros;
156+
}
157+
}
158+
159+
harmonic_mean = 1.0f / harmonic_mean;
160+
return std::make_pair(harmonic_mean, num_zeros);
161+
}
162+
163+
std::pair<float, int> calc_harmonic_mean3(int8_t* data, size_t n) {
164+
float harmonic_mean = 0;
165+
int num_zeros = 0;
166+
167+
for (int i = 0; i < n; ++i) {
168+
harmonic_mean += 1.0f / static_cast<float>((1L << data[i]));
169+
if (data[i] == 0) {
170+
++num_zeros;
171+
}
172+
}
173+
harmonic_mean = 1.0f / harmonic_mean;
174+
return std::make_pair(harmonic_mean, num_zeros);
175+
}
176+
177+
static float tables[65] = {
178+
1.0f / static_cast<float>(1L << 0), 1.0f / static_cast<float>(1L << 1), 1.0f / static_cast<float>(1L << 2),
179+
1.0f / static_cast<float>(1L << 3), 1.0f / static_cast<float>(1L << 4), 1.0f / static_cast<float>(1L << 5),
180+
1.0f / static_cast<float>(1L << 6), 1.0f / static_cast<float>(1L << 7), 1.0f / static_cast<float>(1L << 8),
181+
1.0f / static_cast<float>(1L << 9), 1.0f / static_cast<float>(1L << 10), 1.0f / static_cast<float>(1L << 11),
182+
1.0f / static_cast<float>(1L << 12), 1.0f / static_cast<float>(1L << 13), 1.0f / static_cast<float>(1L << 14),
183+
1.0f / static_cast<float>(1L << 15), 1.0f / static_cast<float>(1L << 16), 1.0f / static_cast<float>(1L << 17),
184+
1.0f / static_cast<float>(1L << 18), 1.0f / static_cast<float>(1L << 19), 1.0f / static_cast<float>(1L << 20),
185+
1.0f / static_cast<float>(1L << 21), 1.0f / static_cast<float>(1L << 22), 1.0f / static_cast<float>(1L << 23),
186+
1.0f / static_cast<float>(1L << 24), 1.0f / static_cast<float>(1L << 25), 1.0f / static_cast<float>(1L << 26),
187+
1.0f / static_cast<float>(1L << 27), 1.0f / static_cast<float>(1L << 28), 1.0f / static_cast<float>(1L << 29),
188+
1.0f / static_cast<float>(1L << 30), 1.0f / static_cast<float>(1L << 31), 1.0f / static_cast<float>(1L << 32),
189+
1.0f / static_cast<float>(1L << 33), 1.0f / static_cast<float>(1L << 34), 1.0f / static_cast<float>(1L << 35),
190+
1.0f / static_cast<float>(1L << 36), 1.0f / static_cast<float>(1L << 37), 1.0f / static_cast<float>(1L << 38),
191+
1.0f / static_cast<float>(1L << 39), 1.0f / static_cast<float>(1L << 40), 1.0f / static_cast<float>(1L << 41),
192+
1.0f / static_cast<float>(1L << 42), 1.0f / static_cast<float>(1L << 43), 1.0f / static_cast<float>(1L << 44),
193+
1.0f / static_cast<float>(1L << 45), 1.0f / static_cast<float>(1L << 46), 1.0f / static_cast<float>(1L << 47),
194+
1.0f / static_cast<float>(1L << 48), 1.0f / static_cast<float>(1L << 49), 1.0f / static_cast<float>(1L << 50),
195+
1.0f / static_cast<float>(1L << 51), 1.0f / static_cast<float>(1L << 52), 1.0f / static_cast<float>(1L << 53),
196+
1.0f / static_cast<float>(1L << 54), 1.0f / static_cast<float>(1L << 55), 1.0f / static_cast<float>(1L << 56),
197+
1.0f / static_cast<float>(1L << 57), 1.0f / static_cast<float>(1L << 58), 1.0f / static_cast<float>(1L << 59),
198+
1.0f / static_cast<float>(1L << 60), 1.0f / static_cast<float>(1L << 61), 1.0f / static_cast<float>(1L << 62),
199+
1.0f / static_cast<float>(1L << 63), 1.0f / static_cast<float>(1L << 64),
200+
};
201+
std::pair<float, int> calc_harmonic_mean4(int8_t* data, size_t n) {
202+
float harmonic_mean = 0;
203+
int num_zeros = 0;
204+
205+
for (int i = 0; i < n; ++i) {
206+
harmonic_mean += tables[data[i]];
207+
if (data[i] == 0) {
208+
++num_zeros;
209+
}
210+
}
211+
harmonic_mean = 1.0f / harmonic_mean;
212+
return std::make_pair(harmonic_mean, num_zeros);
213+
}

src/main/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ set(MAINS
99
atomic_fetch_add_strength_test.cc
1010
interpreter_demo.cc
1111
cache_oom.cc
12+
simd.cc
1213
)
1314
foreach (src ${MAINS})
1415
get_filename_component(exe ${src} NAME_WE)

src/main/simd.cc

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
//
2+
// Created by grakra on 23-1-6.
3+
//
4+
5+
#include <immintrin.h>
6+
7+
#include <cstddef>
8+
#include <cstdint>
9+
#include <cstdio>
10+
#include <iostream>
11+
#include <vector>
12+
float calc(int8_t* p) {
13+
auto d = _mm256_load_si256(reinterpret_cast<__m256i*>(p));
14+
auto x0 = _mm256_setzero_ps();
15+
auto y0 = _mm256_setzero_ps();
16+
auto* p8 = reinterpret_cast<int8_t*>(&d);
17+
for (auto i = 0; i < 4; ++i) {
18+
p8 += 8;
19+
y0 = _mm256_set_ps(p8[0], p8[1], p8[2], p8[3], p8[4], p8[5], p8[6], p8[7]);
20+
x0 = _mm256_add_ps(x0, y0);
21+
}
22+
//x0 = _mm256_add_ps(x0, _mm256_srli_si256(x0, 16));
23+
//x0 = _mm256_add_ps(x0, _mm256_srli_si256(x0, 8));
24+
//x0 = _mm256_add_ps(x0, _mm256_srli_si256(x0, 4));
25+
return _mm256_cvtss_f32(x0);
26+
}
27+
28+
int main(int argc, char**argv) {
29+
std::vector<int8_t> data;
30+
data.resize(256);
31+
for (int i=0; i <256; ++i) {
32+
data[i] = i;
33+
}
34+
auto r = calc(data.data());
35+
std::cout<<r<<std::endl;
36+
}

unittest/test_llvm.cc

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
//
99
#include <gtest/gtest.h>
1010
#include <llvm-c/Core.h>
11+
#include <llvm-c/TargetMachine.h>
1112

1213
#include <memory>
1314
namespace test {
@@ -73,7 +74,28 @@ TEST_F(LLVMTest, testTupleReference) {
7374
}
7475
std::cout << "end" << std::endl;
7576
}
77+
TEST_F(LLVMTest, testBasic2) {
78+
auto llvmCtx = LLVMContextCreate();
79+
ASSERT_TRUE(LLVMInitializeNativeTarget() == 0);
80+
auto module = LLVMModuleCreateWithName("m0");
81+
std::string triple = LLVMNormalizeTargetTriple(LLVMGetDefaultTargetTriple());
82+
auto target = LLVMGetFirstTarget();
83+
ASSERT_TRUE(target != nullptr);
84+
std::string hostCPU = LLVMGetHostCPUName();
85+
std::string hostCPUFeatures = LLVMGetHostCPUFeatures();
86+
auto targetMachine = LLVMCreateTargetMachine(target, triple.c_str(), hostCPU.c_str(), hostCPUFeatures.c_str(),
87+
LLVMCodeGenOptLevel::LLVMCodeGenLevelDefault,
88+
LLVMRelocMode::LLVMRelocPIC, LLVMCodeModel::LLVMCodeModelDefault);
89+
auto targetDataLayout = LLVMCreateTargetDataLayout(targetMachine);
90+
auto dataLayout = LLVMCopyStringRepOfTargetData(targetDataLayout);
91+
LLVMSetTarget(module, triple.c_str());
92+
LLVMSetDataLayout(module, dataLayout);
93+
auto builder = LLVMCreateBuilder();
94+
std::vector<LLVMTypeRef> funcParams(2, LLVMInt32Type());
95+
auto funcType = LLVMFunctionType(LLVMInt32Type(),funcParams.data(), funcParams.size(), false);
96+
LLVMAddFunction(module, "foobar", funcType);
7697

98+
}
7799
} // namespace test
78100
int main(int argc, char** argv) {
79101
::testing::InitGoogleTest(&argc, argv);

0 commit comments

Comments
 (0)