Skip to content

Commit 7905ec3

Browse files
authored
[libc] Use UMAXV.4S to reduce bcmp result.
We can use UMAXV.4S to reduce the comparison result in a single instruction. This improves performance by roughly 4% on Apple M1: Summary bin/libc.src.string.bcmp_benchmark3 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 ran 1.01 ± 0.02 times faster than bin/libc.src.string.bcmp_benchmark3 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 1.01 ± 0.03 times faster than bin/libc.src.string.bcmp_benchmark3 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 1.01 ± 0.03 times faster than bin/libc.src.string.bcmp_benchmark3 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 1.01 ± 0.02 times faster than bin/libc.src.string.bcmp_benchmark2 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 1.02 ± 0.03 times faster than bin/libc.src.string.bcmp_benchmark2 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 1.03 ± 0.03 times faster than bin/libc.src.string.bcmp_benchmark2 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 1.03 ± 0.03 times faster than bin/libc.src.string.bcmp_benchmark2 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 1.05 ± 0.02 times faster than bin/libc.src.string.bcmp_benchmark1 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 1.05 ± 0.02 times faster than bin/libc.src.string.bcmp_benchmark1 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 1.05 ± 0.03 times faster than bin/libc.src.string.bcmp_benchmark1 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 1.05 ± 0.02 times faster than bin/libc.src.string.bcmp_benchmark1 --study-name="new bcmp" --sweep-mode --sweep-max-size=128 --output=/dev/null --num-trials=10 (1 = original, 2 = a variant of this patch that uses UMAXV.16B, 3 = this patch) Reviewers: michaelrj-google, gchatelet, overmighty, SchrodingerZhu Pull Request: #99260
1 parent d742903 commit 7905ec3

File tree

1 file changed

+6
-12
lines changed

1 file changed

+6
-12
lines changed

libc/src/string/memory_utils/op_aarch64.h

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,7 @@ template <size_t Size> struct Bcmp {
8484
uint8x16_t a = vld1q_u8(_p1);
8585
uint8x16_t n = vld1q_u8(_p2);
8686
uint8x16_t an = veorq_u8(a, n);
87-
uint32x2_t an_reduced = vqmovn_u64(vreinterpretq_u64_u8(an));
88-
return vmaxv_u32(an_reduced);
87+
return vmaxvq_u32(vreinterpretq_u32_u8(an));
8988
} else if constexpr (Size == 32) {
9089
auto _p1 = as_u8(p1);
9190
auto _p2 = as_u8(p2);
@@ -97,12 +96,9 @@ template <size_t Size> struct Bcmp {
9796
uint8x16_t bo = veorq_u8(b, o);
9897
// anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is
9998
// a difference between the two buffers. We reduce this value down to 4
100-
// bytes in two steps. First, calculate the saturated move value when
101-
// going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get
102-
// a single 32 bit nonzero value if a mismatch occurred.
99+
// bytes using the UMAXV instruction to compute the max across the vector.
103100
uint8x16_t anbo = vorrq_u8(an, bo);
104-
uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
105-
return vmaxv_u32(anbo_reduced);
101+
return vmaxvq_u32(vreinterpretq_u32_u8(anbo));
106102
} else if constexpr ((Size % BlockSize) == 0) {
107103
for (size_t offset = 0; offset < Size; offset += BlockSize)
108104
if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
@@ -129,8 +125,7 @@ template <size_t Size> struct Bcmp {
129125
uint8x16_t bo = veorq_u8(b, o);
130126
// anbo = (a ^ n) | (b ^ o)
131127
uint8x16_t anbo = vorrq_u8(an, bo);
132-
uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
133-
return vmaxv_u32(anbo_reduced);
128+
return vmaxvq_u32(vreinterpretq_u32_u8(anbo));
134129
} else if constexpr (Size == 32) {
135130
auto _p1 = as_u8(p1);
136131
auto _p2 = as_u8(p2);
@@ -150,9 +145,8 @@ template <size_t Size> struct Bcmp {
150145
uint8x16_t cpdq = vorrq_u8(cp, dq);
151146
// abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to
152147
// a nonzero 32 bit value if a mismatch occurred.
153-
uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq);
154-
uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
155-
return vmaxv_u32(abnocpdq_reduced);
148+
uint8x16_t abnocpdq = anbo | cpdq;
149+
return vmaxvq_u32(vreinterpretq_u32_u8(abnocpdq));
156150
} else {
157151
static_assert(cpp::always_false<decltype(Size)>, "SIZE not implemented");
158152
}

0 commit comments

Comments
 (0)