
Commit a6a129b

floodyberry authored and hackmod committed
add avx version for salsa64
1 parent 951bd6d commit a6a129b

File tree

4 files changed (+402 −8 lines)

Lines changed: 367 additions & 0 deletions
@@ -0,0 +1,367 @@
/* x64 */
#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))

#define SCRYPT_SALSA64_AVX

asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_avx)
    a1(push rbp)
    a2(mov rbp, rsp)
    a2(and rsp, ~63)
    a2(sub rsp, 128)
    a2(lea rcx,[rcx*2])
    a2(shl rcx,7)
    a2(lea r9,[rcx-128])
    a2(lea rax,[rsi+r9])
    a2(lea r9,[rdx+r9])
    a2(and rdx, rdx)
    a2(vmovdqa xmm0,[rax+0])
    a2(vmovdqa xmm1,[rax+16])
    a2(vmovdqa xmm2,[rax+32])
    a2(vmovdqa xmm3,[rax+48])
    a2(vmovdqa xmm4,[rax+64])
    a2(vmovdqa xmm5,[rax+80])
    a2(vmovdqa xmm6,[rax+96])
    a2(vmovdqa xmm7,[rax+112])
    a1(jz scrypt_ChunkMix_avx_no_xor1)
    a3(vpxor xmm0,xmm0,[r9+0])
    a3(vpxor xmm1,xmm1,[r9+16])
    a3(vpxor xmm2,xmm2,[r9+32])
    a3(vpxor xmm3,xmm3,[r9+48])
    a3(vpxor xmm4,xmm4,[r9+64])
    a3(vpxor xmm5,xmm5,[r9+80])
    a3(vpxor xmm6,xmm6,[r9+96])
    a3(vpxor xmm7,xmm7,[r9+112])
    a1(scrypt_ChunkMix_avx_no_xor1:)
    a2(xor r9,r9)
    a2(xor r8,r8)
    a1(scrypt_ChunkMix_avx_loop:)
    a2(and rdx, rdx)
    a3(vpxor xmm0,xmm0,[rsi+r9+0])
    a3(vpxor xmm1,xmm1,[rsi+r9+16])
    a3(vpxor xmm2,xmm2,[rsi+r9+32])
    a3(vpxor xmm3,xmm3,[rsi+r9+48])
    a3(vpxor xmm4,xmm4,[rsi+r9+64])
    a3(vpxor xmm5,xmm5,[rsi+r9+80])
    a3(vpxor xmm6,xmm6,[rsi+r9+96])
    a3(vpxor xmm7,xmm7,[rsi+r9+112])
    a1(jz scrypt_ChunkMix_avx_no_xor2)
    a3(vpxor xmm0,xmm0,[rdx+r9+0])
    a3(vpxor xmm1,xmm1,[rdx+r9+16])
    a3(vpxor xmm2,xmm2,[rdx+r9+32])
    a3(vpxor xmm3,xmm3,[rdx+r9+48])
    a3(vpxor xmm4,xmm4,[rdx+r9+64])
    a3(vpxor xmm5,xmm5,[rdx+r9+80])
    a3(vpxor xmm6,xmm6,[rdx+r9+96])
    a3(vpxor xmm7,xmm7,[rdx+r9+112])
    a1(scrypt_ChunkMix_avx_no_xor2:)
    a2(vmovdqa [rsp+0],xmm0)
    a2(vmovdqa [rsp+16],xmm1)
    a2(vmovdqa [rsp+32],xmm2)
    a2(vmovdqa [rsp+48],xmm3)
    a2(vmovdqa [rsp+64],xmm4)
    a2(vmovdqa [rsp+80],xmm5)
    a2(vmovdqa [rsp+96],xmm6)
    a2(vmovdqa [rsp+112],xmm7)
    a2(mov rax,8)
    a1(scrypt_salsa64_avx_loop: )
    a3(vpaddq xmm8, xmm0, xmm2)
    a3(vpaddq xmm9, xmm1, xmm3)
    a3(vpshufd xmm8, xmm8, 0xb1)
    a3(vpshufd xmm9, xmm9, 0xb1)
    a3(vpxor xmm6, xmm6, xmm8)
    a3(vpxor xmm7, xmm7, xmm9)
    a3(vpaddq xmm10, xmm0, xmm6)
    a3(vpaddq xmm11, xmm1, xmm7)
    a3(vpsrlq xmm8, xmm10, 51)
    a3(vpsrlq xmm9, xmm11, 51)
    a3(vpsllq xmm10, xmm10, 13)
    a3(vpsllq xmm11, xmm11, 13)
    a3(vpxor xmm4, xmm4, xmm8)
    a3(vpxor xmm5, xmm5, xmm9)
    a3(vpxor xmm4, xmm4, xmm10)
    a3(vpxor xmm5, xmm5, xmm11)
    a3(vpaddq xmm8, xmm6, xmm4)
    a3(vpaddq xmm9, xmm7, xmm5)
    a3(vpsrlq xmm10, xmm8, 25)
    a3(vpsrlq xmm11, xmm9, 25)
    a3(vpsllq xmm8, xmm8, 39)
    a3(vpsllq xmm9, xmm9, 39)
    a3(vpxor xmm2, xmm2, xmm10)
    a3(vpxor xmm3, xmm3, xmm11)
    a3(vpxor xmm2, xmm2, xmm8)
    a3(vpxor xmm3, xmm3, xmm9)
    a3(vpaddq xmm10, xmm4, xmm2)
    a3(vpaddq xmm11, xmm5, xmm3)
    a3(vpshufd xmm10, xmm10, 0xb1)
    a3(vpshufd xmm11, xmm11, 0xb1)
    a3(vpxor xmm0, xmm0, xmm10)
    a3(vpxor xmm1, xmm1, xmm11)
    a2(vmovdqa xmm8, xmm2)
    a2(vmovdqa xmm9, xmm3)
    a4(vpalignr xmm2, xmm6, xmm7, 8)
    a4(vpalignr xmm3, xmm7, xmm6, 8)
    a4(vpalignr xmm6, xmm9, xmm8, 8)
    a4(vpalignr xmm7, xmm8, xmm9, 8)
    a2(sub rax, 2)
    a3(vpaddq xmm10, xmm0, xmm2)
    a3(vpaddq xmm11, xmm1, xmm3)
    a3(vpshufd xmm10, xmm10, 0xb1)
    a3(vpshufd xmm11, xmm11, 0xb1)
    a3(vpxor xmm6, xmm6, xmm10)
    a3(vpxor xmm7, xmm7, xmm11)
    a3(vpaddq xmm8, xmm0, xmm6)
    a3(vpaddq xmm9, xmm1, xmm7)
    a3(vpsrlq xmm10, xmm8, 51)
    a3(vpsrlq xmm11, xmm9, 51)
    a3(vpsllq xmm8, xmm8, 13)
    a3(vpsllq xmm9, xmm9, 13)
    a3(vpxor xmm5, xmm5, xmm10)
    a3(vpxor xmm4, xmm4, xmm11)
    a3(vpxor xmm5, xmm5, xmm8)
    a3(vpxor xmm4, xmm4, xmm9)
    a3(vpaddq xmm10, xmm6, xmm5)
    a3(vpaddq xmm11, xmm7, xmm4)
    a3(vpsrlq xmm8, xmm10, 25)
    a3(vpsrlq xmm9, xmm11, 25)
    a3(vpsllq xmm10, xmm10, 39)
    a3(vpsllq xmm11, xmm11, 39)
    a3(vpxor xmm2, xmm2, xmm8)
    a3(vpxor xmm3, xmm3, xmm9)
    a3(vpxor xmm2, xmm2, xmm10)
    a3(vpxor xmm3, xmm3, xmm11)
    a3(vpaddq xmm8, xmm5, xmm2)
    a3(vpaddq xmm9, xmm4, xmm3)
    a3(vpshufd xmm8, xmm8, 0xb1)
    a3(vpshufd xmm9, xmm9, 0xb1)
    a3(vpxor xmm0, xmm0, xmm8)
    a3(vpxor xmm1, xmm1, xmm9)
    a2(vmovdqa xmm10, xmm2)
    a2(vmovdqa xmm11, xmm3)
    a4(vpalignr xmm2, xmm6, xmm7, 8)
    a4(vpalignr xmm3, xmm7, xmm6, 8)
    a4(vpalignr xmm6, xmm11, xmm10, 8)
    a4(vpalignr xmm7, xmm10, xmm11, 8)
    a1(ja scrypt_salsa64_avx_loop)
    a3(vpaddq xmm0,xmm0,[rsp+0])
    a3(vpaddq xmm1,xmm1,[rsp+16])
    a3(vpaddq xmm2,xmm2,[rsp+32])
    a3(vpaddq xmm3,xmm3,[rsp+48])
    a3(vpaddq xmm4,xmm4,[rsp+64])
    a3(vpaddq xmm5,xmm5,[rsp+80])
    a3(vpaddq xmm6,xmm6,[rsp+96])
    a3(vpaddq xmm7,xmm7,[rsp+112])
    a2(lea rax,[r8+r9])
    a2(xor r8,rcx)
    a2(and rax,~0xff)
    a2(add r9,128)
    a2(shr rax,1)
    a2(add rax, rdi)
    a2(cmp r9,rcx)
    a2(vmovdqa [rax+0],xmm0)
    a2(vmovdqa [rax+16],xmm1)
    a2(vmovdqa [rax+32],xmm2)
    a2(vmovdqa [rax+48],xmm3)
    a2(vmovdqa [rax+64],xmm4)
    a2(vmovdqa [rax+80],xmm5)
    a2(vmovdqa [rax+96],xmm6)
    a2(vmovdqa [rax+112],xmm7)
    a1(jne scrypt_ChunkMix_avx_loop)
    a2(mov rsp, rbp)
    a1(pop rbp)
    a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_avx)

#endif
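
/*
 * Note shared by both paths in this file: each 128-byte block is held as
 * 16 uint64_t words spread across eight xmm registers (two words per
 * register). The running state is XORed with each input block (and, when
 * present, the matching Bxor block), mixed for 8 rounds using rotations of
 * 32, 13, 39 and 32 bits, added back to the saved pre-mix state, and written
 * out even/odd interleaved as described in the spec comments below.
 */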


/* intrinsic */
#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_AVX)

#define SCRYPT_SALSA64_AVX

static void STDCALL
scrypt_ChunkMix_avx(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
    uint32_t i, blocksPerChunk = r * 2, half = 0;
    xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
    size_t rounds;

    /* 1: X = B_{2r - 1} */
    xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
    x0 = xmmp[0];
    x1 = xmmp[1];
    x2 = xmmp[2];
    x3 = xmmp[3];
    x4 = xmmp[4];
    x5 = xmmp[5];
    x6 = xmmp[6];
    x7 = xmmp[7];

    if (Bxor) {
        xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);
        x4 = _mm_xor_si128(x4, xmmp[4]);
        x5 = _mm_xor_si128(x5, xmmp[5]);
        x6 = _mm_xor_si128(x6, xmmp[6]);
        x7 = _mm_xor_si128(x7, xmmp[7]);
    }

    /* 2: for i = 0 to 2r - 1 do */
    for (i = 0; i < blocksPerChunk; i++, half ^= r) {
        /* 3: X = H(X ^ B_i) */
        xmmp = (xmmi *)scrypt_block(Bin, i);
        x0 = _mm_xor_si128(x0, xmmp[0]);
        x1 = _mm_xor_si128(x1, xmmp[1]);
        x2 = _mm_xor_si128(x2, xmmp[2]);
        x3 = _mm_xor_si128(x3, xmmp[3]);
        x4 = _mm_xor_si128(x4, xmmp[4]);
        x5 = _mm_xor_si128(x5, xmmp[5]);
        x6 = _mm_xor_si128(x6, xmmp[6]);
        x7 = _mm_xor_si128(x7, xmmp[7]);

        if (Bxor) {
            xmmp = (xmmi *)scrypt_block(Bxor, i);
            x0 = _mm_xor_si128(x0, xmmp[0]);
            x1 = _mm_xor_si128(x1, xmmp[1]);
            x2 = _mm_xor_si128(x2, xmmp[2]);
            x3 = _mm_xor_si128(x3, xmmp[3]);
            x4 = _mm_xor_si128(x4, xmmp[4]);
            x5 = _mm_xor_si128(x5, xmmp[5]);
            x6 = _mm_xor_si128(x6, xmmp[6]);
            x7 = _mm_xor_si128(x7, xmmp[7]);
        }

        t0 = x0;
        t1 = x1;
        t2 = x2;
        t3 = x3;
        t4 = x4;
        t5 = x5;
        t6 = x6;
        t7 = x7;

        for (rounds = 8; rounds; rounds -= 2) {
            z0 = _mm_add_epi64(x0, x2);
            z1 = _mm_add_epi64(x1, x3);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x6 = _mm_xor_si128(x6, z0);
            x7 = _mm_xor_si128(x7, z1);

            z0 = _mm_add_epi64(x6, x0);
            z1 = _mm_add_epi64(x7, x1);
            z2 = _mm_srli_epi64(z0, 64-13);
            z3 = _mm_srli_epi64(z1, 64-13);
            z0 = _mm_slli_epi64(z0, 13);
            z1 = _mm_slli_epi64(z1, 13);
            x4 = _mm_xor_si128(x4, z2);
            x5 = _mm_xor_si128(x5, z3);
            x4 = _mm_xor_si128(x4, z0);
            x5 = _mm_xor_si128(x5, z1);

            z0 = _mm_add_epi64(x4, x6);
            z1 = _mm_add_epi64(x5, x7);
            z2 = _mm_srli_epi64(z0, 64-39);
            z3 = _mm_srli_epi64(z1, 64-39);
            z0 = _mm_slli_epi64(z0, 39);
            z1 = _mm_slli_epi64(z1, 39);
            x2 = _mm_xor_si128(x2, z2);
            x3 = _mm_xor_si128(x3, z3);
            x2 = _mm_xor_si128(x2, z0);
            x3 = _mm_xor_si128(x3, z1);

            z0 = _mm_add_epi64(x2, x4);
            z1 = _mm_add_epi64(x3, x5);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x0 = _mm_xor_si128(x0, z0);
            x1 = _mm_xor_si128(x1, z1);

            z0 = x2;
            z1 = x3;
            x2 = _mm_alignr_epi8(x6, x7, 8);
            x3 = _mm_alignr_epi8(x7, x6, 8);
            x6 = _mm_alignr_epi8(z1, z0, 8);
            x7 = _mm_alignr_epi8(z0, z1, 8);

            z0 = _mm_add_epi64(x0, x2);
            z1 = _mm_add_epi64(x1, x3);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x6 = _mm_xor_si128(x6, z0);
            x7 = _mm_xor_si128(x7, z1);

            z0 = _mm_add_epi64(x6, x0);
            z1 = _mm_add_epi64(x7, x1);
            z2 = _mm_srli_epi64(z0, 64-13);
            z3 = _mm_srli_epi64(z1, 64-13);
            z0 = _mm_slli_epi64(z0, 13);
            z1 = _mm_slli_epi64(z1, 13);
            x5 = _mm_xor_si128(x5, z2);
            x4 = _mm_xor_si128(x4, z3);
            x5 = _mm_xor_si128(x5, z0);
            x4 = _mm_xor_si128(x4, z1);

            z0 = _mm_add_epi64(x5, x6);
            z1 = _mm_add_epi64(x4, x7);
            z2 = _mm_srli_epi64(z0, 64-39);
            z3 = _mm_srli_epi64(z1, 64-39);
            z0 = _mm_slli_epi64(z0, 39);
            z1 = _mm_slli_epi64(z1, 39);
            x2 = _mm_xor_si128(x2, z2);
            x3 = _mm_xor_si128(x3, z3);
            x2 = _mm_xor_si128(x2, z0);
            x3 = _mm_xor_si128(x3, z1);

            z0 = _mm_add_epi64(x2, x5);
            z1 = _mm_add_epi64(x3, x4);
            z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
            z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
            x0 = _mm_xor_si128(x0, z0);
            x1 = _mm_xor_si128(x1, z1);

            z0 = x2;
            z1 = x3;
            x2 = _mm_alignr_epi8(x6, x7, 8);
            x3 = _mm_alignr_epi8(x7, x6, 8);
            x6 = _mm_alignr_epi8(z1, z0, 8);
            x7 = _mm_alignr_epi8(z0, z1, 8);
        }

        x0 = _mm_add_epi64(x0, t0);
        x1 = _mm_add_epi64(x1, t1);
        x2 = _mm_add_epi64(x2, t2);
        x3 = _mm_add_epi64(x3, t3);
        x4 = _mm_add_epi64(x4, t4);
        x5 = _mm_add_epi64(x5, t5);
        x6 = _mm_add_epi64(x6, t6);
        x7 = _mm_add_epi64(x7, t7);

        /* 4: Y_i = X */
        /* 6: B'[0..r-1] = Y_even */
        /* 6: B'[r..2r-1] = Y_odd */
        xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
        xmmp[0] = x0;
        xmmp[1] = x1;
        xmmp[2] = x2;
        xmmp[3] = x3;
        xmmp[4] = x4;
        xmmp[5] = x5;
        xmmp[6] = x6;
        xmmp[7] = x7;
    }
}

#endif

#if defined(SCRYPT_SALSA64_AVX)
    /* uses salsa64_core_tangle_sse2 */

    #undef SCRYPT_MIX
    #define SCRYPT_MIX "Salsa64/8-AVX"
    #undef SCRYPT_SALSA64_INCLUDED
    #define SCRYPT_SALSA64_INCLUDED
#endif
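
For reference, both the assembly and the intrinsic path vectorize a Salsa-style quarter-round over 64-bit words; the rotation amounts (32, 13, 39, 32) can be read directly off the vpshufd 0xb1 shuffles and the vpsrlq/vpsllq pairs above. The scalar sketch below is illustrative only: the names rotl64, salsa64_column_quarter and w0..w3 do not appear in the diff, and the mapping of w0..w3 to the xmm0/xmm2/xmm4/xmm6 register groups is an assumption drawn from the order of the add/rotate/xor steps.

#include <stdint.h>

/* Illustrative scalar sketch (not part of the commit): one column
 * quarter-round of the Salsa64 core, rotation amounts taken from the AVX
 * code above (vpshufd 0xb1 == rotate-by-32, srl 51 / sll 13 == rotate-by-13,
 * srl 25 / sll 39 == rotate-by-39). */
static uint64_t rotl64(uint64_t x, unsigned n) {
    return (x << n) | (x >> (64 - n));
}

static void salsa64_column_quarter(uint64_t *w0, uint64_t *w1, uint64_t *w2, uint64_t *w3) {
    *w3 ^= rotl64(*w0 + *w2, 32);  /* vpaddq + vpshufd 0xb1, xor into xmm6/xmm7 */
    *w1 ^= rotl64(*w3 + *w0, 13);  /* vpaddq + shift pair, xor into xmm4/xmm5 */
    *w2 ^= rotl64(*w1 + *w3, 39);  /* vpaddq + shift pair, xor into xmm2/xmm3 */
    *w0 ^= rotl64(*w2 + *w1, 32);  /* vpaddq + vpshufd 0xb1, xor into xmm0/xmm1 */
}

The vpalignr / _mm_alignr_epi8 steps between the two halves of each double round re-align the lanes so the same four-step pattern can be reused, and the round counter (8, decremented by 2 per iteration) gives the 8 rounds implied by the "Salsa64/8-AVX" mix name.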
