Skip to content

Commit 3427f16

Browse files
committed
Revert "hash/crc32: improve the AMD64 implementation using SSE4.2"
This reverts commit 54d7de7.

It was breaking non-amd64 builds.

Change-Id: I22650e922498eeeba3d4fa08bb4ea40a210c8f97
Reviewed-on: https://go-review.googlesource.com/27925
Reviewed-by: Keith Randall <khr@golang.org>
1 parent 54d7de7 commit 3427f16

File tree

7 files changed

+14
-292
lines changed

7 files changed

+14
-292
lines changed

src/hash/crc32/crc32.go

Lines changed: 2 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -52,14 +52,8 @@ var castagnoliTable8 *slicing8Table
5252
var castagnoliOnce sync.Once
5353

5454
func castagnoliInit() {
55-
// Call the arch-specific init function and let it decide if we will need
56-
// the tables for the generic implementation.
57-
needGenericTables := castagnoliInitArch()
58-
59-
if needGenericTables {
60-
castagnoliTable = makeTable(Castagnoli)
61-
castagnoliTable8 = makeTable8(Castagnoli)
62-
}
55+
castagnoliTable = makeTable(Castagnoli)
56+
castagnoliTable8 = makeTable8(Castagnoli)
6357
}
6458

6559
// IEEETable is the table for the IEEE polynomial.

src/hash/crc32/crc32_amd64.go

Lines changed: 7 additions & 163 deletions
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,6 @@
44

55
package crc32
66

7-
import "unsafe"
8-
97
// This file contains the code to call the SSE 4.2 version of the Castagnoli
108
// and IEEE CRC.
119

@@ -15,20 +13,11 @@ func haveSSE41() bool
1513
func haveSSE42() bool
1614
func haveCLMUL() bool
1715

18-
// castagnoliSSE42 is defined in crc32_amd64.s and uses the SSE4.2 CRC32
16+
// castagnoliSSE42 is defined in crc_amd64.s and uses the SSE4.2 CRC32
1917
// instruction.
2018
//go:noescape
2119
func castagnoliSSE42(crc uint32, p []byte) uint32
2220

23-
// castagnoliSSE42Triple is defined in crc32_amd64.s and uses the SSE4.2 CRC32
24-
// instruction.
25-
//go:noescape
26-
func castagnoliSSE42Triple(
27-
crcA, crcB, crcC uint32,
28-
a, b, c []byte,
29-
rounds uint32,
30-
) (retA uint32, retB uint32, retC uint32)
31-
3221
// ieeeCLMUL is defined in crc_amd64.s and uses the PCLMULQDQ
3322
// instruction as well as SSE 4.1.
3423
//go:noescape
@@ -37,160 +26,15 @@ func ieeeCLMUL(crc uint32, p []byte) uint32
3726
var sse42 = haveSSE42()
3827
var useFastIEEE = haveCLMUL() && haveSSE41()
3928

40-
const castagnoliK1 = 168
41-
const castagnoliK2 = 1344
42-
43-
type sse42Table [4]Table
44-
45-
var castagnoliSSE42TableK1 *sse42Table
46-
var castagnoliSSE42TableK2 *sse42Table
47-
48-
func castagnoliInitArch() (needGenericTables bool) {
49-
if !sse42 {
50-
return true
51-
}
52-
castagnoliSSE42TableK1 = new(sse42Table)
53-
castagnoliSSE42TableK2 = new(sse42Table)
54-
// See description in updateCastagnoli.
55-
// t[0][i] = CRC(i000, O)
56-
// t[1][i] = CRC(0i00, O)
57-
// t[2][i] = CRC(00i0, O)
58-
// t[3][i] = CRC(000i, O)
59-
// where O is a sequence of K zeros.
60-
var tmp [castagnoliK2]byte
61-
for b := 0; b < 4; b++ {
62-
for i := 0; i < 256; i++ {
63-
val := uint32(i) << uint32(b*8)
64-
castagnoliSSE42TableK1[b][i] = castagnoliSSE42(val, tmp[:castagnoliK1])
65-
castagnoliSSE42TableK2[b][i] = castagnoliSSE42(val, tmp[:])
66-
}
67-
}
68-
return false
69-
}
70-
71-
// castagnoliShift computes the CRC32-C of K1 or K2 zeroes (depending on the
72-
// table given) with the given initial crc value. This corresponds to
73-
// CRC(crc, O) in the description in updateCastagnoli.
74-
func castagnoliShift(table *sse42Table, crc uint32) uint32 {
75-
return table[3][crc>>24] ^
76-
table[2][(crc>>16)&0xFF] ^
77-
table[1][(crc>>8)&0xFF] ^
78-
table[0][crc&0xFF]
79-
}
80-
8129
func updateCastagnoli(crc uint32, p []byte) uint32 {
82-
if !sse42 {
83-
// Use slicing-by-8 on larger inputs.
84-
if len(p) >= sliceBy8Cutoff {
85-
return updateSlicingBy8(crc, castagnoliTable8, p)
86-
}
87-
return update(crc, castagnoliTable, p)
88-
}
89-
90-
// This method is inspired from the algorithm in Intel's white paper:
91-
// "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction"
92-
// The same strategy of splitting the buffer in three is used but the
93-
// combining calculation is different; the complete derivation is explained
94-
// below.
95-
//
96-
// -- The basic idea --
97-
//
98-
// The CRC32 instruction (available in SSE4.2) can process 8 bytes at a
99-
// time. In recent Intel architectures the instruction takes 3 cycles;
100-
// however the processor can pipeline up to three instructions if they
101-
// don't depend on each other.
102-
//
103-
// Roughly this means that we can process three buffers in about the same
104-
// time we can process one buffer.
105-
//
106-
// The idea is then to split the buffer in three, CRC the three pieces
107-
// separately and then combine the results.
108-
//
109-
// Combining the results requires precomputed tables, so we must choose a
110-
// fixed buffer length to optimize. The longer the length, the faster; but
111-
// only buffers longer than this length will use the optimization. We choose
112-
// two cutoffs and compute tables for both:
113-
// - one around 512: 168*3=504
114-
// - one around 4KB: 1344*3=4032
115-
//
116-
// -- The nitty gritty --
117-
//
118-
// Let CRC(I, X) be the non-inverted CRC32-C of the sequence X (with
119-
// initial non-inverted CRC I). This function has the following properties:
120-
// (a) CRC(I, AB) = CRC(CRC(I, A), B)
121-
// (b) CRC(I, A xor B) = CRC(I, A) xor CRC(0, B)
122-
//
123-
// Say we want to compute CRC(I, ABC) where A, B, C are three sequences of
124-
// K bytes each, where K is a fixed constant. Let O be the sequence of K zero
125-
// bytes.
126-
//
127-
// CRC(I, ABC) = CRC(I, ABO xor C)
128-
// = CRC(I, ABO) xor CRC(0, C)
129-
// = CRC(CRC(I, AB), O) xor CRC(0, C)
130-
// = CRC(CRC(I, AO xor B), O) xor CRC(0, C)
131-
// = CRC(CRC(I, AO) xor CRC(0, B), O) xor CRC(0, C)
132-
// = CRC(CRC(CRC(I, A), O) xor CRC(0, B), O) xor CRC(0, C)
133-
//
134-
// The castagnoliSSE42Triple function can compute CRC(I, A), CRC(0, B),
135-
// and CRC(0, C) efficiently. We just need to find a way to quickly compute
136-
// CRC(uvwx, O) given a 4-byte initial value uvwx. We can precompute these
137-
// values; since we can't have a 32-bit table, we break it up into four
138-
// 8-bit tables:
139-
//
140-
// CRC(uvwx, O) = CRC(u000, O) xor
141-
// CRC(0v00, O) xor
142-
// CRC(00w0, O) xor
143-
// CRC(000x, O)
144-
//
145-
// We can compute tables corresponding to the four terms for all 8-bit
146-
// values.
147-
148-
crc = ^crc
149-
150-
// If a buffer is long enough to use the optimization, process the first few
151-
// bytes to align the buffer to an 8 byte boundary (if necessary).
152-
if len(p) >= castagnoliK1*3 {
153-
delta := int(uintptr(unsafe.Pointer(&p[0])) & 7)
154-
if delta != 0 {
155-
delta = 8 - delta
156-
crc = castagnoliSSE42(crc, p[:delta])
157-
p = p[delta:]
158-
}
159-
}
160-
161-
// Process 3*K2 at a time.
162-
for len(p) >= castagnoliK2*3 {
163-
// Compute CRC(I, A), CRC(0, B), and CRC(0, C).
164-
crcA, crcB, crcC := castagnoliSSE42Triple(
165-
crc, 0, 0,
166-
p, p[castagnoliK2:], p[castagnoliK2*2:],
167-
castagnoliK2/24)
168-
169-
// CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B)
170-
crcAB := castagnoliShift(castagnoliSSE42TableK2, crcA) ^ crcB
171-
// CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)
172-
crc = castagnoliShift(castagnoliSSE42TableK2, crcAB) ^ crcC
173-
p = p[castagnoliK2*3:]
30+
if sse42 {
31+
return castagnoliSSE42(crc, p)
17432
}
175-
176-
// Process 3*K1 at a time.
177-
for len(p) >= castagnoliK1*3 {
178-
// Compute CRC(I, A), CRC(0, B), and CRC(0, C).
179-
crcA, crcB, crcC := castagnoliSSE42Triple(
180-
crc, 0, 0,
181-
p, p[castagnoliK1:], p[castagnoliK1*2:],
182-
castagnoliK1/24)
183-
184-
// CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B)
185-
crcAB := castagnoliShift(castagnoliSSE42TableK1, crcA) ^ crcB
186-
// CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)
187-
crc = castagnoliShift(castagnoliSSE42TableK1, crcAB) ^ crcC
188-
p = p[castagnoliK1*3:]
33+
// Use slicing-by-8 on larger inputs.
34+
if len(p) >= sliceBy8Cutoff {
35+
return updateSlicingBy8(crc, castagnoliTable8, p)
18936
}
190-
191-
// Use the simple implementation for what's left.
192-
crc = castagnoliSSE42(crc, p)
193-
return ^crc
37+
return update(crc, castagnoliTable, p)
19438
}
19539

19640
func updateIEEE(crc uint32, p []byte) uint32 {

src/hash/crc32/crc32_amd64.s

Lines changed: 3 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -4,14 +4,14 @@
44

55
#include "textflag.h"
66

7-
// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
8-
//
97
// func castagnoliSSE42(crc uint32, p []byte) uint32
108
TEXT ·castagnoliSSE42(SB),NOSPLIT,$0
119
MOVL crc+0(FP), AX // CRC value
1210
MOVQ p+8(FP), SI // data pointer
1311
MOVQ p_len+16(FP), CX // len(p)
1412

13+
NOTL AX
14+
1515
// If there are fewer than 8 bytes to process, skip alignment.
1616
CMPQ CX, $8
1717
JL less_than_8
@@ -87,53 +87,10 @@ less_than_2:
8787
CRC32B (SI), AX
8888

8989
done:
90+
NOTL AX
9091
MOVL AX, ret+32(FP)
9192
RET
9293

93-
// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
94-
// bytes from each buffer.
95-
//
96-
// func castagnoliSSE42Triple(
97-
// crc1, crc2, crc3 uint32,
98-
// a, b, c []byte,
99-
// rounds uint32,
100-
// ) (retA uint32, retB uint32, retC uint32)
101-
TEXT ·castagnoliSSE42Triple(SB),NOSPLIT,$0
102-
MOVL crcA+0(FP), AX
103-
MOVL crcB+4(FP), CX
104-
MOVL crcC+8(FP), DX
105-
106-
MOVQ a+16(FP), R8 // data pointer
107-
MOVQ b+40(FP), R9 // data pointer
108-
MOVQ c+64(FP), R10 // data pointer
109-
110-
MOVL rounds+88(FP), R11
111-
112-
loop:
113-
CRC32Q (R8), AX
114-
CRC32Q (R9), CX
115-
CRC32Q (R10), DX
116-
117-
CRC32Q 8(R8), AX
118-
CRC32Q 8(R9), CX
119-
CRC32Q 8(R10), DX
120-
121-
CRC32Q 16(R8), AX
122-
CRC32Q 16(R9), CX
123-
CRC32Q 16(R10), DX
124-
125-
ADDQ $24, R8
126-
ADDQ $24, R9
127-
ADDQ $24, R10
128-
129-
DECQ R11
130-
JNZ loop
131-
132-
MOVL AX, retA+96(FP)
133-
MOVL CX, retB+100(FP)
134-
MOVL DX, retC+104(FP)
135-
RET
136-
13794
// func haveSSE42() bool
13895
TEXT ·haveSSE42(SB),NOSPLIT,$0
13996
XORQ AX, AX

src/hash/crc32/crc32_amd64p32.go

Lines changed: 2 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -7,22 +7,17 @@ package crc32
77
// This file contains the code to call the SSE 4.2 version of the Castagnoli
88
// CRC.
99

10-
// haveSSE42 is defined in crc32_amd64p32.s and uses CPUID to test for SSE 4.2
10+
// haveSSE42 is defined in crc_amd64p32.s and uses CPUID to test for SSE 4.2
1111
// support.
1212
func haveSSE42() bool
1313

14-
// castagnoliSSE42 is defined in crc32_amd64.s and uses the SSE4.2 CRC32
14+
// castagnoliSSE42 is defined in crc_amd64.s and uses the SSE4.2 CRC32
1515
// instruction.
1616
//go:noescape
1717
func castagnoliSSE42(crc uint32, p []byte) uint32
1818

1919
var sse42 = haveSSE42()
2020

21-
func castagnoliInitArch() (needGenericTables bool) {
22-
// We only need the generic implementation tables if we don't have SSE4.2.
23-
return !sse42
24-
}
25-
2621
func updateCastagnoli(crc uint32, p []byte) uint32 {
2722
if sse42 {
2823
return castagnoliSSE42(crc, p)

src/hash/crc32/crc32_generic.go

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -9,10 +9,6 @@ package crc32
99
// This file contains the generic version of updateCastagnoli which does
1010
// slicing-by-8, or uses the fallback for very small sizes.
1111

12-
func castagnoliInitArch() (needGenericTables bool) {
13-
return true
14-
}
15-
1612
func updateCastagnoli(crc uint32, p []byte) uint32 {
1713
// Use slicing-by-8 on larger inputs.
1814
if len(p) >= sliceBy8Cutoff {

src/hash/crc32/crc32_s390x.go

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -25,10 +25,6 @@ func vectorizedCastagnoli(crc uint32, p []byte) uint32
2525
//go:noescape
2626
func vectorizedIEEE(crc uint32, p []byte) uint32
2727

28-
func castagnoliInitArch() (needGenericTables bool) {
29-
return true
30-
}
31-
3228
func genericCastagnoli(crc uint32, p []byte) uint32 {
3329
// Use slicing-by-8 on larger inputs.
3430
if len(p) >= sliceBy8Cutoff {

0 commit comments

Comments
 (0)