Skip to content

Commit 2a95b06

Browse files
committed
Proper matrix transposal code for SSE and SSE2
1 parent eac6e15 commit 2a95b06

File tree

5 files changed

+243
-12
lines changed

5 files changed

+243
-12
lines changed

include/private/dsp/arch/x86/sse/3dmath.h

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -164,19 +164,34 @@
164164
__ASM_EMIT("mulps %" x3 ", %" x0) /* xmm0 = dx1*dy2 dy1*dz2 dz1*dx2 dw1*dw1 */ \
165165
__ASM_EMIT("subps %" x1 ", %" x0) /* xmm0 = dx1*dy2-dx2*dy1 dy1*dz2-dy2*dz1 dz1*dx2-dz2*dx1 dw1*dw1-dw2*dw2 = NY NZ NX NW */
166166

167+
/* 1x matrix transpose
168+
* Input:
169+
* x0 = row 0 (a1 a2 a3 a4)
170+
* x1 = row 1 (b1 b2 b3 b4)
171+
* x2 = row 2 (c1 c2 c3 c4)
172+
* x3 = row 3 (d1 d2 d3 d4)
173+
* x4 = temporary
174+
*
175+
* Output:
176+
* x0 = row 0 (a1 b1 c1 d1)
177+
* x1 = row 1 (a2 b2 c2 d2)
178+
* x2 = row 2 (a3 b3 c3 d3)
179+
* x3 = row 3 (a4 b4 c4 d4)
180+
*/
167181
#define MAT4_TRANSPOSE(x0, x1, x2, x3, x4) \
168-
__ASM_EMIT("movaps %" x2 ", %" x4) /* xmm4 = c1 c2 c3 c4 */ \
169-
__ASM_EMIT("punpckldq %" x3 ", %" x2) /* xmm2 = c1 d1 c2 d2 */ \
170-
__ASM_EMIT("punpckhdq %" x3 ", %" x4) /* xmm4 = c3 d3 c4 d4 */ \
171-
__ASM_EMIT("movaps %" x0 ", %" x3) /* xmm3 = a1 a2 a3 a4 */ \
172-
__ASM_EMIT("punpckldq %" x1 ", %" x0) /* xmm0 = a1 b1 a2 b2 */ \
173-
__ASM_EMIT("punpckhdq %" x1 ", %" x3) /* xmm3 = a3 b3 a4 b4 */ \
174-
__ASM_EMIT("movaps %" x0 ", %" x1) /* xmm1 = a1 b2 a2 b2 */ \
175-
__ASM_EMIT("punpcklqdq %" x2 ", %" x0) /* xmm0 = a1 b1 c1 d1 */ \
176-
__ASM_EMIT("punpckhqdq %" x2 ", %" x1) /* xmm1 = a2 b2 c2 d2 */ \
177-
__ASM_EMIT("movaps %" x3 ", %" x2) /* xmm2 = a3 b3 a4 b4 */ \
178-
__ASM_EMIT("punpcklqdq %" x4 ", %" x2) /* xmm2 = a3 b3 c3 d3 */ \
179-
__ASM_EMIT("punpckhqdq %" x4 ", %" x3) /* xmm3 = a4 b4 c4 d4 */
182+
__ASM_EMIT("movaps %" x2 ", %" x4) /* xmm4 = c1 c2 c3 c4 */ \
183+
__ASM_EMIT("unpcklps %" x3 ", %" x2) /* xmm2 = c1 d1 c2 d2 */ \
184+
__ASM_EMIT("unpckhps %" x3 ", %" x4) /* xmm4 = c3 d3 c4 d4 */ \
185+
__ASM_EMIT("movaps %" x0 ", %" x3) /* xmm3 = a1 a2 a3 a4 */ \
186+
__ASM_EMIT("unpcklps %" x1 ", %" x0) /* xmm0 = a1 b1 a2 b2 */ \
187+
__ASM_EMIT("unpckhps %" x1 ", %" x3) /* xmm3 = a3 b3 a4 b4 */ \
188+
__ASM_EMIT("movaps %" x2 ", %" x1) /* xmm1 = c1 d1 c2 d2 */ \
189+
__ASM_EMIT("movhlps %" x0 ", %" x1) /* xmm1 = a2 b2 c2 d2 */ \
190+
__ASM_EMIT("movlhps %" x2 ", %" x0) /* xmm0 = a1 b1 c1 d1 */ \
191+
__ASM_EMIT("movaps %" x3 ", %" x2) /* xmm2 = a3 b3 a4 b4 */ \
192+
__ASM_EMIT("movaps %" x4 ", %" x3) /* xmm3 = c3 d3 c4 d4 */ \
193+
__ASM_EMIT("movhlps %" x2 ", %" x3) /* xmm3 = a4 b4 c4 d4 */ \
194+
__ASM_EMIT("movlhps %" x4 ", %" x2) /* xmm2 = a3 b3 c3 d3 */
180195

181196
#define MATRIX_LOAD(ptr, x0, x1, x2, x3) \
182197
__ASM_EMIT("movups 0x00(%[" ptr "]), %" x0 ) \

include/private/dsp/arch/x86/sse2/3dmath.h

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,20 @@
2626
#error "This header should not be included directly"
2727
#endif /* PRIVATE_DSP_ARCH_X86_SSE2_IMPL */
2828

29+
/* 1x matrix transpose
30+
* Input:
31+
* x0 = row 0 (a1 a2 a3 a4)
32+
* x1 = row 1 (b1 b2 b3 b4)
33+
* x2 = row 2 (c1 c2 c3 c4)
34+
* x3 = row 3 (d1 d2 d3 d4)
35+
* x4 = temporary
36+
*
37+
* Output:
38+
* x0 = row 0 (a1 b1 c1 d1)
39+
* x1 = row 1 (a2 b2 c2 d2)
40+
* x2 = row 2 (a3 b3 c3 d3)
41+
* x3 = row 3 (a4 b4 c4 d4)
42+
*/
2943
#define MAT4_TRANSPOSE(x0, x1, x2, x3, x4) \
3044
__ASM_EMIT("movaps %" x2 ", %" x4) /* xmm4 = c1 c2 c3 c4 */ \
3145
__ASM_EMIT("punpckldq %" x3 ", %" x2) /* xmm2 = c1 d1 c2 d2 */ \
@@ -40,6 +54,18 @@
4054
__ASM_EMIT("punpcklqdq %" x4 ", %" x2) /* xmm2 = a3 b3 c3 d3 */ \
4155
__ASM_EMIT("punpckhqdq %" x4 ", %" x3) /* xmm3 = a4 b4 c4 d4 */
4256

57+
#define MATRIX_LOAD(ptr, x0, x1, x2, x3) \
58+
__ASM_EMIT("movups 0x00(%[" ptr "]), %" x0 ) \
59+
__ASM_EMIT("movups 0x10(%[" ptr "]), %" x1 ) \
60+
__ASM_EMIT("movups 0x20(%[" ptr "]), %" x2 ) \
61+
__ASM_EMIT("movups 0x30(%[" ptr "]), %" x3 )
62+
63+
#define MATRIX_STORE(ptr, x0, x1, x2, x3) \
64+
__ASM_EMIT("movups %" x0 ", 0x00(%[" ptr "])") \
65+
__ASM_EMIT("movups %" x1 ", 0x10(%[" ptr "])") \
66+
__ASM_EMIT("movups %" x2 ", 0x20(%[" ptr "])") \
67+
__ASM_EMIT("movups %" x3 ", 0x30(%[" ptr "])")
68+
4369
namespace lsp
4470
{
4571
namespace sse2
@@ -354,9 +380,39 @@ namespace lsp
354380
return pt;
355381
}
356382

383+
void transpose_matrix3d1(matrix3d_t *r)
384+
{
385+
ARCH_X86_ASM
386+
(
387+
MATRIX_LOAD("m", "%xmm0", "%xmm1", "%xmm2", "%xmm3")
388+
MAT4_TRANSPOSE("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4")
389+
MATRIX_STORE("m", "%xmm0", "%xmm1", "%xmm2", "%xmm3")
390+
:
391+
: [m] "r" (r)
392+
: "memory",
393+
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4"
394+
);
395+
}
396+
397+
void transpose_matrix3d2(matrix3d_t *r, const matrix3d_t *m)
398+
{
399+
ARCH_X86_ASM
400+
(
401+
MATRIX_LOAD("m", "%xmm0", "%xmm1", "%xmm2", "%xmm3")
402+
MAT4_TRANSPOSE("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4")
403+
MATRIX_STORE("r", "%xmm0", "%xmm1", "%xmm2", "%xmm3")
404+
:
405+
: [r] "r" (r), [m] "r" (m)
406+
: "memory",
407+
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4"
408+
);
409+
}
410+
357411
} /* namespace sse2 */
358412
} /* namespace lsp */
359413

360414
#undef MAT4_TRANSPOSE
415+
#undef MAT4_LOAD
416+
#undef MAT4_STORE
361417

362418
#endif /* PRIVATE_DSP_ARCH_X86_SSE2_3DMATH_H_ */

src/main/x86/sse2.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,9 @@
163163
EXPORT1(dexpander_x1_curve)
164164

165165
// 3D Math
166+
EXPORT1(transpose_matrix3d1);
167+
EXPORT1(transpose_matrix3d2);
168+
166169
EXPORT1(colocation_x2_v1p2);
167170
EXPORT1(colocation_x2_v1pv);
168171
EXPORT1(colocation_x3_v1p3);
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
/*
2+
* Copyright (C) 2025 Linux Studio Plugins Project <https://lsp-plug.in/>
3+
* (C) 2025 Vladimir Sadovnikov <sadko4u@gmail.com>
4+
*
5+
* This file is part of lsp-dsp-lib
6+
* Created on: 18 авг. 2025 г.
7+
*
8+
* lsp-dsp-lib is free software: you can redistribute it and/or modify
9+
* it under the terms of the GNU Lesser General Public License as published by
10+
* the Free Software Foundation, either version 3 of the License, or
11+
* any later version.
12+
*
13+
* lsp-dsp-lib is distributed in the hope that it will be useful,
14+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
15+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16+
* GNU Lesser General Public License for more details.
17+
*
18+
* You should have received a copy of the GNU Lesser General Public License
19+
* along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
20+
*/
21+
22+
#include <lsp-plug.in/common/alloc.h>
23+
#include <lsp-plug.in/common/types.h>
24+
#include <lsp-plug.in/dsp/dsp.h>
25+
#include <lsp-plug.in/stdlib/math.h>
26+
#include <lsp-plug.in/test-fw/helpers.h>
27+
#include <lsp-plug.in/test-fw/ptest.h>
28+
29+
#define N_MATRICES 0x1000
30+
31+
namespace lsp
32+
{
33+
namespace generic
34+
{
35+
void transpose_matrix3d1(dsp::matrix3d_t *r);
36+
void transpose_matrix3d2(dsp::matrix3d_t *r, const dsp::matrix3d_t *m);
37+
}
38+
39+
IF_ARCH_X86(
40+
namespace sse
41+
{
42+
void transpose_matrix3d1(dsp::matrix3d_t *r);
43+
void transpose_matrix3d2(dsp::matrix3d_t *r, const dsp::matrix3d_t *m);
44+
}
45+
46+
namespace sse2
47+
{
48+
void transpose_matrix3d1(dsp::matrix3d_t *r);
49+
void transpose_matrix3d2(dsp::matrix3d_t *r, const dsp::matrix3d_t *m);
50+
}
51+
52+
namespace avx
53+
{
54+
void transpose_matrix3d1(dsp::matrix3d_t *r);
55+
void transpose_matrix3d2(dsp::matrix3d_t *r, const dsp::matrix3d_t *m);
56+
}
57+
)
58+
59+
typedef void (* transpose_matrix3d1_t)(dsp::matrix3d_t *r);
60+
typedef void (* transpose_matrix3d2_t)(dsp::matrix3d_t *r, const dsp::matrix3d_t *m);
61+
}
62+
63+
//-----------------------------------------------------------------------------
64+
// Performance test
65+
PTEST_BEGIN("dsp.3d", mat4_transpose, 5, 1000)
66+
67+
void call(const char *label, dsp::matrix3d_t *dst, const dsp::matrix3d_t *src, transpose_matrix3d2_t func)
68+
{
69+
if (!PTEST_SUPPORTED(func))
70+
return;
71+
72+
char buf[80];
73+
snprintf(buf, sizeof(buf), "%s", label);
74+
printf("Testing %s ...\n", buf);
75+
76+
PTEST_LOOP(buf,
77+
for (size_t i=0; i<N_MATRICES; ++i)
78+
func(&dst[i], &src[i]);
79+
);
80+
}
81+
82+
void call(const char *label, dsp::matrix3d_t *dst, transpose_matrix3d1_t func)
83+
{
84+
if (!PTEST_SUPPORTED(func))
85+
return;
86+
87+
char buf[80];
88+
snprintf(buf, sizeof(buf), "%s", label);
89+
printf("Testing %s ...\n", buf);
90+
91+
PTEST_LOOP(buf,
92+
for (size_t i=0; i<N_MATRICES; ++i)
93+
func(&dst[i]);
94+
);
95+
}
96+
97+
PTEST_MAIN
98+
{
99+
size_t buf_size = N_MATRICES * sizeof(dsp::matrix3d_t);
100+
uint8_t *data = NULL;
101+
uint8_t *ptr = alloc_aligned<uint8_t>(data, buf_size * 2, 64);
102+
103+
dsp::matrix3d_t *dst = reinterpret_cast<dsp::matrix3d_t *>(ptr);
104+
dsp::matrix3d_t *src = reinterpret_cast<dsp::matrix3d_t *>(&dst[N_MATRICES]);
105+
106+
// Initialize matrices
107+
for (size_t i=0; i < N_MATRICES;)
108+
{
109+
randomize_sign(src[i].m, 16);
110+
++i;
111+
}
112+
113+
#define CALL1(func) \
114+
memcpy(dst, src, buf_size); \
115+
call(#func, dst, func);
116+
117+
#define CALL2(func) \
118+
memcpy(dst, src, buf_size); \
119+
call(#func, dst, src, func);
120+
121+
CALL1(generic::transpose_matrix3d1);
122+
CALL1(sse::transpose_matrix3d1);
123+
CALL1(sse2::transpose_matrix3d1);
124+
CALL1(avx::transpose_matrix3d1);
125+
PTEST_SEPARATOR;
126+
127+
CALL2(generic::transpose_matrix3d2);
128+
CALL2(sse::transpose_matrix3d2);
129+
CALL2(sse2::transpose_matrix3d2);
130+
CALL2(avx::transpose_matrix3d2);
131+
PTEST_SEPARATOR;
132+
133+
free_aligned(data);
134+
}
135+
PTEST_END
136+
137+
138+
139+
140+

src/test/utest/3d/matrix.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,12 @@ namespace lsp
7676
void apply_matrix3d_mm1(dsp::matrix3d_t *r, const dsp::matrix3d_t *m);
7777
}
7878

79+
namespace sse2
80+
{
81+
void transpose_matrix3d1(dsp::matrix3d_t *r);
82+
void transpose_matrix3d2(dsp::matrix3d_t *r, const dsp::matrix3d_t *m);
83+
}
84+
7985
namespace avx
8086
{
8187
void init_matrix3d(dsp::matrix3d_t *dst, const dsp::matrix3d_t *src);
@@ -519,6 +525,17 @@ UTEST_BEGIN("dsp.3d", matrix)
519525
sse::transpose_matrix3d2
520526
));
521527

528+
IF_ARCH_X86(
529+
init_data(
530+
"sse2 init_matrix",
531+
sse::init_matrix3d,
532+
sse::init_matrix3d_zero,
533+
sse::init_matrix3d_one,
534+
sse::init_matrix3d_identity,
535+
sse2::transpose_matrix3d1,
536+
sse2::transpose_matrix3d2
537+
));
538+
522539
IF_ARCH_X86(
523540
init_data(
524541
"avx init_matrix",

0 commit comments

Comments
 (0)