|
| 1 | +/* Copyright (C) 2024 Povilas Kanapickas <povilas@radix.lt> |
| 2 | +
|
| 3 | + Distributed under the Boost Software License, Version 1.0. |
| 4 | + (See accompanying file LICENSE_1_0.txt or copy at |
| 5 | + http://www.boost.org/LICENSE_1_0.txt) |
| 6 | +*/ |
| 7 | + |
| 8 | +#ifndef LIBSIMDPP_SIMDPP_MATH_LOG2_APPROX_H |
| 9 | +#define LIBSIMDPP_SIMDPP_MATH_LOG2_APPROX_H |
| 10 | + |
| 11 | +#include <simdpp/simd.h> |
| 12 | + |
| 13 | +namespace simdpp { |
| 14 | +namespace SIMDPP_ARCH_NAMESPACE { |
| 15 | + |
| 16 | +/** Calculates approximate log2(x). The function is optimized for maximum speed. |
| 17 | + The absolute error of the result is ... over entire range. |
| 18 | +
|
| 19 | + This version of the function requires that the argument is nonzero positive number that |
| 20 | + is also not an infinity. |
| 21 | +*/ |
| 22 | +template<unsigned N> |
| 23 | +float32<N> log2_approx_positive_finite(const float32<N>& a) |
| 24 | +{ |
| 25 | + uint32<N> exponent_mask = make_uint(0x7f800000); |
| 26 | + uint32<N> exponent_for_1 = make_uint(0x3f800000); |
| 27 | + |
| 28 | + // IEEE-754 floating-point numbers are convenient as they store the 2-based exponent as a port |
| 29 | + // of their format already. The algorithm below extracts the exponent and then appliyes |
| 30 | + // a polynomial to map the [1..2) mantissa to approximate value. |
| 31 | + |
| 32 | + // extract the exponent into float value |
| 33 | + auto a_int = bit_cast<uint32<N>>(a); |
| 34 | + auto a_exponent_int = shift_r<23>(a_int) & 0xff; |
| 35 | + auto res = to_float32(bit_cast<int32<N>>(a_exponent_int) - 128); |
| 36 | + |
| 37 | + // extract the mantissa to the range [1..2) |
| 38 | + auto mantissa = bit_cast<float32<N>>(bit_select(exponent_for_1, a_int, exponent_mask)); |
| 39 | + |
| 40 | + auto mantissa_res = -0.34484362f * mantissa + 2.02466192f; |
| 41 | + mantissa_res = mantissa_res * mantissa - 0.67487591f; |
| 42 | + res = res + mantissa_res; |
| 43 | + |
| 44 | + return res; |
| 45 | +} |
| 46 | + |
| 47 | +/** Calculates approximate log2(x). The function is optimized for maximum speed. |
| 48 | + The absolute error of the result is ... over entire range. |
| 49 | +
|
| 50 | + This version of the function handles full range of inputs including special cases correctly. |
| 51 | +*/ |
| 52 | +template<unsigned N> |
| 53 | +float32<N> log2_approx(const float32<N>& a) |
| 54 | +{ |
| 55 | + uint32<N> exponent_mask = make_uint(0x7f800000); |
| 56 | + uint32<N> exponent_for_1 = make_uint(0x3f800000); |
| 57 | + float32<N> neg_infinity = make_float(-std::numeric_limits<float>::infinity()); |
| 58 | + |
| 59 | + // IEEE-754 floating-point numbers are convenient as they store the 2-based exponent as a port |
| 60 | + // of their format already. The algorithm below extracts the exponent and then appliyes |
| 61 | + // a polynomial to map the [1..2) mantissa to approximate value. |
| 62 | + |
| 63 | + auto nan_mask = a < 0; |
| 64 | + |
| 65 | + auto zero_mask = a == 0; |
| 66 | + auto finite_mask = isfinite(a); |
| 67 | + |
| 68 | + // extract the exponent into float value |
| 69 | + auto a_int = bit_cast<uint32<N>>(a); |
| 70 | + auto a_exponent_int = shift_r<23>(a_int) & 0xff; |
| 71 | + auto res = to_float32(bit_cast<int32<N>>(a_exponent_int) - 128); |
| 72 | + |
| 73 | + // extract the mantissa to the range [1..2) |
| 74 | + auto mantissa = bit_cast<float32<N>>(bit_select(exponent_for_1, a_int, exponent_mask)); |
| 75 | + |
| 76 | + auto mantissa_res = -0.34484362f * mantissa + 2.02466192f; |
| 77 | + mantissa_res = mantissa_res * mantissa - 0.67487591f; |
| 78 | + res = res + mantissa_res; |
| 79 | + |
| 80 | + // put back infinity if the argument was infinity |
| 81 | + res = blend(res, a, finite_mask); |
| 82 | + // put negative infinity if argument was zero |
| 83 | + res = blend(neg_infinity, res, zero_mask); |
| 84 | + // put NaN if argument was negative |
| 85 | + res = res | nan_mask; // 0xffffffff mask is convenient because it's NaN itself |
| 86 | + |
| 87 | + return res; |
| 88 | +} |
| 89 | + |
| 90 | +} // namespace simdpp |
| 91 | +} // namespace SIMDPP_ARCH_NAMESPACE |
| 92 | + |
| 93 | +#endif // LIBSIMDPP_SIMDPP_ALGORITHM_BITONIC_SORT |
0 commit comments