|
| 1 | +/* |
| 2 | + * Doube-precision vector log10(1+x) function. |
| 3 | + * |
| 4 | + * Copyright (c) 2025, Arm Limited. |
| 5 | + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception |
| 6 | + */ |
| 7 | + |
| 8 | +#include "test_defs.h" |
| 9 | +#include "v_math.h" |
| 10 | + |
| 11 | +static const struct data |
| 12 | +{ |
| 13 | + float64x2_t c1, c2, c3, c4, c6, c8, c10, c12, c14, c16, c18, c20; |
| 14 | + double c5, c7, c9, c11, c13, c15, c17, c19; |
| 15 | + double inv_log2_10, inv_ln10, one, minus_one; |
| 16 | + int64x2_t one_top; |
| 17 | + uint64x2_t one_m_hf_rt2_top, umask, hf_rt2_top, bottom_mask; |
| 18 | + uint64x2_t inf, minf, nan; |
| 19 | +} data = { |
| 20 | + /* Coefficients generated using FPMinimax deg=20, in |
| 21 | + [sqrt(2)/2-1, sqrt(2)-1]. */ |
| 22 | + .c1 = V2 (0x1.bcb7b1526e50fp-2), |
| 23 | + .c2 = V2 (-0x1.bcb7b1526e4fep-3), |
| 24 | + .c3 = V2 (0x1.287a7636f3cfp-3), |
| 25 | + .c4 = V2 (-0x1.bcb7b152707cap-4), |
| 26 | + .c5 = 0x1.63c62775e6667p-4, |
| 27 | + .c6 = V2 (-0x1.287a76368cf37p-4), |
| 28 | + .c7 = 0x1.fc3fa57ed69cep-5, |
| 29 | + .c8 = V2 (-0x1.bcb7b0eed8335p-5), |
| 30 | + .c9 = 0x1.8b4e10d4ecd69p-5, |
| 31 | + .c10 = V2 (-0x1.63c65554f2db4p-5), |
| 32 | + .c11 = 0x1.436b003e5358p-5, |
| 33 | + .c12 = V2 (-0x1.2872d378b6363p-5), |
| 34 | + .c13 = 0x1.11e15dd8ac0efp-5, |
| 35 | + .c14 = V2 (-0x1.fd8dc08e6b21p-6), |
| 36 | + .c15 = 0x1.d6fabf7e5c622p-6, |
| 37 | + .c16 = V2 (-0x1.ad53855566e62p-6), |
| 38 | + .c17 = 0x1.a9547f6043884p-6, |
| 39 | + .c18 = V2 (-0x1.e4a167fcd3e22p-6), |
| 40 | + .c19 = 0x1.c2f6859a15a65p-6, |
| 41 | + .c20 = V2 (-0x1.91c6df82d809bp-7), |
| 42 | + .hf_rt2_top = V2 (0x3fe6a09e00000000), |
| 43 | + .one_m_hf_rt2_top = V2 (0x00095f6200000000), |
| 44 | + .umask = V2 (0x000fffff00000000), |
| 45 | + .one_top = V2 (0x3ff), |
| 46 | + .inv_ln10 = 0x1.bcb7b1526e50ep-2, |
| 47 | + .inv_log2_10 = 0x1.34413509f79ffp-2, |
| 48 | + .inf = V2 (0x7ff0000000000000), |
| 49 | + .minf = V2 (0xfff0000000000000), |
| 50 | + .nan = V2 (0x7fffffffffffffff), |
| 51 | + .bottom_mask = V2 (0xffffffff), |
| 52 | + .minus_one = -1.0f, |
| 53 | + .one = 1.0f, |
| 54 | +}; |
| 55 | + |
| 56 | +static inline float64x2_t |
| 57 | +special_case (const struct data *d, float64x2_t x, float64x2_t y, |
| 58 | + uint64x2_t cmp) |
| 59 | +{ |
| 60 | + uint64x2_t ret_inf = vcgeq_f64 (x, vreinterpretq_f64_u64 (d->inf)); |
| 61 | + uint64x2_t neg_val |
| 62 | + = vbslq_u64 (vcgeq_f64 (x, v_f64 (d->minus_one)), d->minf, d->nan); |
| 63 | + float64x2_t s = vreinterpretq_f64_u64 (vbslq_u64 (ret_inf, d->inf, neg_val)); |
| 64 | + return vbslq_f64 (cmp, s, y); |
| 65 | +} |
| 66 | + |
| 67 | +/* Vector log10p1 approximation using polynomial on reduced interval. |
| 68 | + Worst-case error is 2.69 ULP: |
| 69 | + _ZGVnN2v_log10p1(-0x1.2582542cd267p-15) got -0x1.fde2ee0eb629p-17 |
| 70 | + want -0x1.fde2ee0eb628dp-17 . */ |
| 71 | +VPCS_ATTR float64x2_t V_NAME_D1 (log10p1) (float64x2_t x) |
| 72 | +{ |
| 73 | + const struct data *d = ptr_barrier (&data); |
| 74 | + |
| 75 | + /* Calculate scaling factor k. */ |
| 76 | + float64x2_t m = vaddq_f64 (x, v_f64 (d->one)); |
| 77 | + uint64x2_t mi = vreinterpretq_u64_f64 (m); |
| 78 | + uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top); |
| 79 | + int64x2_t ki |
| 80 | + = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top); |
| 81 | + float64x2_t k = vcvtq_f64_s64 (ki); |
| 82 | + |
| 83 | + /* Reduce x to f in [sqrt(2)/2, sqrt (2)]. */ |
| 84 | + uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top); |
| 85 | + uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, d->bottom_mask)); |
| 86 | + float64x2_t f |
| 87 | + = vaddq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (d->minus_one)); |
| 88 | + /* Correction term c/m. */ |
| 89 | + float64x2_t cm |
| 90 | + = vdivq_f64 (vsubq_f64 (x, vaddq_f64 (m, v_f64 (d->minus_one))), m); |
| 91 | + |
| 92 | + /* Order-18 Pairwise Horner evaluation scheme. */ |
| 93 | + float64x2_t f2 = vmulq_f64 (f, f); |
| 94 | + float64x2_t c57 = vld1q_f64 (&d->c5); |
| 95 | + float64x2_t c911 = vld1q_f64 (&d->c9); |
| 96 | + float64x2_t c1315 = vld1q_f64 (&d->c13); |
| 97 | + float64x2_t c1719 = vld1q_f64 (&d->c17); |
| 98 | + float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, f, c1719, 1); |
| 99 | + float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, f, c1719, 0); |
| 100 | + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, f, c1315, 1); |
| 101 | + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, f, c1315, 0); |
| 102 | + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, f, c911, 1); |
| 103 | + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, f, c911, 0); |
| 104 | + |
| 105 | + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, f, c57, 1); |
| 106 | + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, f, c57, 0); |
| 107 | + float64x2_t p23 = vfmaq_f64 (d->c2, f, d->c3); |
| 108 | + float64x2_t p = vfmaq_f64 (p1819, f2, d->c20); |
| 109 | + p = vfmaq_f64 (p1617, f2, p); |
| 110 | + p = vfmaq_f64 (p1415, f2, p); |
| 111 | + p = vfmaq_f64 (p1213, f2, p); |
| 112 | + p = vfmaq_f64 (p1011, f2, p); |
| 113 | + p = vfmaq_f64 (p89, f2, p); |
| 114 | + p = vfmaq_f64 (p67, f2, p); |
| 115 | + p = vfmaq_f64 (p45, f2, p); |
| 116 | + p = vfmaq_f64 (p23, f2, p); |
| 117 | + p = vfmaq_f64 (d->c1, f, p); |
| 118 | + |
| 119 | + float64x2_t inv_log_consts = vld1q_f64 (&d->inv_log2_10); |
| 120 | + /* Assemble log10p1(x) = k/log2(10) + log10p1(f) + c/(m * ln10). */ |
| 121 | + float64x2_t y = vfmaq_laneq_f64 (vmulq_f64 (p, f), k, inv_log_consts, 0); |
| 122 | + y = vfmaq_laneq_f64 (y, cm, inv_log_consts, 1); |
| 123 | + |
| 124 | + /* Special cases: x == (-inf), x == nan, x <= -1 . */ |
| 125 | + uint64x2_t special |
| 126 | + = vorrq_u64 (vcleq_f64 (x, v_f64 (d->minus_one)), |
| 127 | + vcgeq_f64 (x, vreinterpretq_f64_u64 (d->inf))); |
| 128 | + if (unlikely (v_any_u64 (special))) |
| 129 | + return special_case (d, x, y, special); |
| 130 | + return y; |
| 131 | +} |
| 132 | +#if WANT_C23_TESTS |
| 133 | +TEST_ULP (V_NAME_D1 (log10p1), 2.20) |
| 134 | +TEST_SYM_INTERVAL (V_NAME_D1 (log10p1), 0.0, 0x1p-23, 30000) |
| 135 | +TEST_SYM_INTERVAL (V_NAME_D1 (log10p1), 0x1p-23, 1, 50000) |
| 136 | +TEST_INTERVAL (V_NAME_D1 (log10p1), 1, inf, 50000) |
| 137 | +TEST_INTERVAL (V_NAME_D1 (log10p1), -1.0, -inf, 1000) |
| 138 | +#endif |
0 commit comments