44
55package crc32
66
7- import "unsafe"
8-
97// This file contains the code to call the SSE 4.2 version of the Castagnoli
108// and IEEE CRC.
119
@@ -15,20 +13,11 @@ func haveSSE41() bool
1513func haveSSE42 () bool
1614func haveCLMUL () bool
1715
18- // castagnoliSSE42 is defined in crc32_amd64.s and uses the SSE4.2 CRC32
16+ // castagnoliSSE42 is defined in crc_amd64.s and uses the SSE4.2 CRC32
1917// instruction.
2018//go:noescape
2119func castagnoliSSE42 (crc uint32 , p []byte ) uint32
2220
23- // castagnoliSSE42Triple is defined in crc32_amd64.s and uses the SSE4.2 CRC32
24- // instruction.
25- //go:noescape
26- func castagnoliSSE42Triple (
27- crcA , crcB , crcC uint32 ,
28- a , b , c []byte ,
29- rounds uint32 ,
30- ) (retA uint32 , retB uint32 , retC uint32 )
31-
3221// ieeeCLMUL is defined in crc_amd64.s and uses the PCLMULQDQ
3322// instruction as well as SSE 4.1.
3423//go:noescape
@@ -37,160 +26,15 @@ func ieeeCLMUL(crc uint32, p []byte) uint32
3726var sse42 = haveSSE42 ()
3827var useFastIEEE = haveCLMUL () && haveSSE41 ()
3928
40- const castagnoliK1 = 168
41- const castagnoliK2 = 1344
42-
43- type sse42Table [4 ]Table
44-
45- var castagnoliSSE42TableK1 * sse42Table
46- var castagnoliSSE42TableK2 * sse42Table
47-
48- func castagnoliInitArch () (needGenericTables bool ) {
49- if ! sse42 {
50- return true
51- }
52- castagnoliSSE42TableK1 = new (sse42Table )
53- castagnoliSSE42TableK2 = new (sse42Table )
54- // See description in updateCastagnoli.
55- // t[0][i] = CRC(i000, O)
56- // t[1][i] = CRC(0i00, O)
57- // t[2][i] = CRC(00i0, O)
58- // t[3][i] = CRC(000i, O)
59- // where O is a sequence of K zeros.
60- var tmp [castagnoliK2 ]byte
61- for b := 0 ; b < 4 ; b ++ {
62- for i := 0 ; i < 256 ; i ++ {
63- val := uint32 (i ) << uint32 (b * 8 )
64- castagnoliSSE42TableK1 [b ][i ] = castagnoliSSE42 (val , tmp [:castagnoliK1 ])
65- castagnoliSSE42TableK2 [b ][i ] = castagnoliSSE42 (val , tmp [:])
66- }
67- }
68- return false
69- }
70-
71- // castagnoliShift computes the CRC32-C of K1 or K2 zeroes (depending on the
72- // table given) with the given initial crc value. This corresponds to
73- // CRC(crc, O) in the description in updateCastagnoli.
74- func castagnoliShift (table * sse42Table , crc uint32 ) uint32 {
75- return table [3 ][crc >> 24 ] ^
76- table [2 ][(crc >> 16 )& 0xFF ] ^
77- table [1 ][(crc >> 8 )& 0xFF ] ^
78- table [0 ][crc & 0xFF ]
79- }
80-
8129func updateCastagnoli (crc uint32 , p []byte ) uint32 {
82- if ! sse42 {
83- // Use slicing-by-8 on larger inputs.
84- if len (p ) >= sliceBy8Cutoff {
85- return updateSlicingBy8 (crc , castagnoliTable8 , p )
86- }
87- return update (crc , castagnoliTable , p )
88- }
89-
90- // This method is inspired by the algorithm in Intel's white paper:
91- // "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction"
92- // The same strategy of splitting the buffer in three is used but the
93- // combining calculation is different; the complete derivation is explained
94- // below.
95- //
96- // -- The basic idea --
97- //
98- // The CRC32 instruction (available in SSE4.2) can process 8 bytes at a
99- // time. In recent Intel architectures the instruction takes 3 cycles;
100- // however the processor can pipeline up to three instructions if they
101- // don't depend on each other.
102- //
103- // Roughly this means that we can process three buffers in about the same
104- // time we can process one buffer.
105- //
106- // The idea is then to split the buffer in three, CRC the three pieces
107- // separately and then combine the results.
108- //
109- // Combining the results requires precomputed tables, so we must choose a
110- // fixed buffer length to optimize. The longer the length, the faster; but
111- // only buffers longer than this length will use the optimization. We choose
112- // two cutoffs and compute tables for both:
113- // - one around 512: 168*3=504
114- // - one around 4KB: 1344*3=4032
115- //
116- // -- The nitty gritty --
117- //
118- // Let CRC(I, X) be the non-inverted CRC32-C of the sequence X (with
119- // initial non-inverted CRC I). This function has the following properties:
120- // (a) CRC(I, AB) = CRC(CRC(I, A), B)
121- // (b) CRC(I, A xor B) = CRC(I, A) xor CRC(0, B)
122- //
123- // Say we want to compute CRC(I, ABC) where A, B, C are three sequences of
124- // K bytes each, where K is a fixed constant. Let O be the sequence of K zero
125- // bytes.
126- //
127- // CRC(I, ABC) = CRC(I, ABO xor C)
128- // = CRC(I, ABO) xor CRC(0, C)
129- // = CRC(CRC(I, AB), O) xor CRC(0, C)
130- // = CRC(CRC(I, AO xor B), O) xor CRC(0, C)
131- // = CRC(CRC(I, AO) xor CRC(0, B), O) xor CRC(0, C)
132- // = CRC(CRC(CRC(I, A), O) xor CRC(0, B), O) xor CRC(0, C)
133- //
134- // The castagnoliSSE42Triple function can compute CRC(I, A), CRC(0, B),
135- // and CRC(0, C) efficiently. We just need to find a way to quickly compute
136- // CRC(uvwx, O) given a 4-byte initial value uvwx. We can precompute these
137- // values; since we can't have a 32-bit table, we break it up into four
138- // 8-bit tables:
139- //
140- // CRC(uvwx, O) = CRC(u000, O) xor
141- // CRC(0v00, O) xor
142- // CRC(00w0, O) xor
143- // CRC(000x, O)
144- //
145- // We can compute tables corresponding to the four terms for all 8-bit
146- // values.
147-
148- crc = ^ crc
149-
150- // If a buffer is long enough to use the optimization, process the first few
151- // bytes to align the buffer to an 8-byte boundary (if necessary).
152- if len (p ) >= castagnoliK1 * 3 {
153- delta := int (uintptr (unsafe .Pointer (& p [0 ])) & 7 )
154- if delta != 0 {
155- delta = 8 - delta
156- crc = castagnoliSSE42 (crc , p [:delta ])
157- p = p [delta :]
158- }
159- }
160-
161- // Process 3*K2 at a time.
162- for len (p ) >= castagnoliK2 * 3 {
163- // Compute CRC(I, A), CRC(0, B), and CRC(0, C).
164- crcA , crcB , crcC := castagnoliSSE42Triple (
165- crc , 0 , 0 ,
166- p , p [castagnoliK2 :], p [castagnoliK2 * 2 :],
167- castagnoliK2 / 24 )
168-
169- // CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B)
170- crcAB := castagnoliShift (castagnoliSSE42TableK2 , crcA ) ^ crcB
171- // CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)
172- crc = castagnoliShift (castagnoliSSE42TableK2 , crcAB ) ^ crcC
173- p = p [castagnoliK2 * 3 :]
30+ if sse42 {
31+ return castagnoliSSE42 (crc , p )
17432}
175-
176- // Process 3*K1 at a time.
177- for len (p ) >= castagnoliK1 * 3 {
178- // Compute CRC(I, A), CRC(0, B), and CRC(0, C).
179- crcA , crcB , crcC := castagnoliSSE42Triple (
180- crc , 0 , 0 ,
181- p , p [castagnoliK1 :], p [castagnoliK1 * 2 :],
182- castagnoliK1 / 24 )
183-
184- // CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B)
185- crcAB := castagnoliShift (castagnoliSSE42TableK1 , crcA ) ^ crcB
186- // CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)
187- crc = castagnoliShift (castagnoliSSE42TableK1 , crcAB ) ^ crcC
188- p = p [castagnoliK1 * 3 :]
33+ // Use slicing-by-8 on larger inputs.
34+ if len (p ) >= sliceBy8Cutoff {
35+ return updateSlicingBy8 (crc , castagnoliTable8 , p )
18936}
190-
191- // Use the simple implementation for what's left.
192- crc = castagnoliSSE42 (crc , p )
193- return ^ crc
37+ return update (crc , castagnoliTable , p )
19438}
19539
19640func updateIEEE (crc uint32 , p []byte ) uint32 {
0 commit comments