Skip to content

Commit 6bb92f9

Browse files
mikejuliet13 and vuvova
authored and committed
MDEV-36184 - mhnsw: support powerpc64 SIMD instructions
This patch optimises the dot_product function by leveraging vectorisation through SIMD intrinsics. This transformation enables parallel execution of multiple operations, significantly improving the performance of dot product computation on supported architectures. The original dot_product function does undergo auto-vectorisation when compiled with -O3. However, performance analysis has shown that the newly optimised implementation performs better on Power10 and achieves comparable performance on Power9 machines. Benchmark tests were conducted on both Power9 and Power10 machines, comparing the time taken by the original (auto-vectorised) code and the new vectorised code. GCC 11.5.0 on the RHEL 9.5 operating system with -O3 was used. The benchmarks were performed using a sample test program with a vector size of 4096 and 10⁷ loop iterations. Here are the average execution times (in seconds) over multiple runs: Power9: Before change: ~16.364 s. After change: ~16.180 s. The performance gain is modest but measurable. Power10: Before change: ~8.989 s. After change: ~6.446 s. This is a significant improvement, roughly 28–30% faster. Signed-off-by: Manjul Mohan <manjul.mohan@ibm.com>
1 parent db5bb6f commit 6bb92f9

File tree

2 files changed

+56
-0
lines changed

2 files changed

+56
-0
lines changed

sql/bloom_filters.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ SOFTWARE.
5353
#define NEON_IMPLEMENTATION
5454
#endif
5555
#endif
56+
#if defined __powerpc64__ && defined __VSX__
57+
#include <altivec.h>
58+
#define POWER_IMPLEMENTATION
59+
#endif
5660

5761
template <typename T>
5862
struct PatternedSimdBloomFilter

sql/vector_mhnsw.cc

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,58 @@ struct FVector
229229
}
230230
#endif
231231

232+
#ifdef POWER_IMPLEMENTATION
  /************* POWERPC *****************************************************/
  // VSX vectors are 128 bits wide: 16 bytes, i.e. 8 int16_t lanes per vector.
  static constexpr size_t POWER_bytes= 128 / 8;
  static constexpr size_t POWER_dims= POWER_bytes / sizeof(int16_t);

  /*
    Dot product of two int16_t arrays using VSX SIMD intrinsics.

    Both arrays must be POWER_bytes-aligned (guaranteed by align_ptr())
    and padded with zeroes up to a whole SIMD vector (guaranteed by
    fix_tail()), so reading past `len` up to the rounded-up length is
    safe and contributes nothing to the sum.
  */
  static float dot_product(const int16_t *v1, const int16_t *v2, size_t len)
  {
    // Two int64 lanes of accumulator: int16*int16 products widened to
    // 64 bits cannot overflow for any realistic vector length.
    vector long long acc= {0, 0};
    // Round len up so the loop always consumes whole SIMD vectors,
    // including the zeroed padding.
    size_t padded_len= ((len + POWER_dims - 1) / POWER_dims) * POWER_dims;

    for (size_t i= 0; i < padded_len; i+= POWER_dims)
    {
      // vec_ld needs 16-byte aligned addresses — see align_ptr()
      vector short x= vec_ld(0, &v1[i]);
      vector short y= vec_ld(0, &v2[i]);

      // products of even/odd lanes, each widened to 32 bits
      vector int even_prod= vec_mule(x, y);
      vector int odd_prod= vec_mulo(x, y);

      // sign-extend the 32-bit partial products to 64 bits and accumulate
      acc+= vec_unpackh(even_prod) + vec_unpackl(even_prod) +
            vec_unpackh(odd_prod) + vec_unpackl(odd_prod);
    }

    // fold the two accumulator lanes into the scalar result
    return static_cast<float>(static_cast<int64_t>(acc[0]) +
                              static_cast<int64_t>(acc[1]));
  }
265+
266+
static size_t alloc_size(size_t n)
267+
{
268+
return alloc_header + MY_ALIGN(n * 2, POWER_bytes) + POWER_bytes - 1;
269+
}
270+
271+
static FVector *align_ptr(void *ptr)
272+
{
273+
return (FVector*)(MY_ALIGN(((intptr)ptr) + alloc_header, POWER_bytes)
274+
- alloc_header);
275+
}
276+
277+
void fix_tail(size_t vec_len)
278+
{
279+
bzero(dims + vec_len, (MY_ALIGN(vec_len, POWER_dims) - vec_len) * 2);
280+
}
281+
#undef DEFAULT_IMPLEMENTATION
282+
#endif
283+
232284
/************* no-SIMD default ******************************************/
233285
#ifdef DEFAULT_IMPLEMENTATION
234286
DEFAULT_IMPLEMENTATION

0 commit comments

Comments
 (0)