116
116
117
117
/* 3x vector cross product (each vector is crossed with the next one, cyclically)
118
118
* Input:
119
- * x0 = vector1 [dx dy dz ? ]
120
- * x1 = vector2 [dx dy dz ? ]
121
- * x2 = vector3 [dx dy dz ? ]
119
+ * x0 = vector1 [dx dy dz 0 ]
120
+ * x1 = vector2 [dx dy dz 0 ]
121
+ * x2 = vector3 [dx dy dz 0 ]
122
122
*
123
123
* Output:
124
- * x0 = vector1 * vector2 [ vz vx vy ? ]
125
- * x1 = vector2 * vector3 [ vz vx vy ? ]
126
- * x2 = vector3 * vector1 [ vz vx vy ? ]
124
+ * x0 = vector1 * vector2 [ vz vx vy 0 ]
125
+ * x1 = vector2 * vector3 [ vz vx vy 0 ]
126
+ * x2 = vector3 * vector1 [ vz vx vy 0 ]
127
127
*/
128
128
#define VECTOR_MUL3 (x0, x1, x2, x3, x4, x5, x6, x7 ) \
129
- __ASM_EMIT (" movaps %" x0 " , %" x3) /* xmm3 = dx0 dy0 dz0 dw0 */ \
130
- __ASM_EMIT(" movaps %" x1 " , %" x4) /* xmm4 = dx1 dy1 dz1 dw1 */ \
131
- __ASM_EMIT(" movaps %" x2 " , %" x5) /* xmm5 = dx2 dy2 dz2 dw2 */ \
132
- __ASM_EMIT(" shufps $0xc9, %" x3 " , %" x3) /* xmm3 = dy0 dz0 dx0 dw0 */ \
133
- __ASM_EMIT(" shufps $0xc9, %" x4 " , %" x4) /* xmm4 = dy1 dz1 dx1 dw1 */ \
134
- __ASM_EMIT(" shufps $0xc9, %" x5 " , %" x5) /* xmm5 = dy2 dz2 dx2 dw2 */ \
135
- __ASM_EMIT(" movaps %" x0 " , %" x6) /* xmm6 = dx0 dy0 dz0 dw0 */ \
136
- __ASM_EMIT(" movaps %" x3 " , %" x7) /* xmm7 = dy0 dz0 dx0 dw0 */ \
129
+ __ASM_EMIT (" movaps %" x0 " , %" x3) /* xmm3 = dx0 dy0 dz0 0 */ \
130
+ __ASM_EMIT(" movaps %" x1 " , %" x4) /* xmm4 = dx1 dy1 dz1 0 */ \
131
+ __ASM_EMIT(" movaps %" x2 " , %" x5) /* xmm5 = dx2 dy2 dz2 0 */ \
132
+ __ASM_EMIT(" shufps $0xc9, %" x3 " , %" x3) /* xmm3 = dy0 dz0 dx0 0 */ \
133
+ __ASM_EMIT(" shufps $0xc9, %" x4 " , %" x4) /* xmm4 = dy1 dz1 dx1 0 */ \
134
+ __ASM_EMIT(" shufps $0xc9, %" x5 " , %" x5) /* xmm5 = dy2 dz2 dx2 0 */ \
135
+ __ASM_EMIT(" movaps %" x0 " , %" x6) /* xmm6 = dx0 dy0 dz0 0 */ \
136
+ __ASM_EMIT(" movaps %" x3 " , %" x7) /* xmm7 = dy0 dz0 dx0 0 */ \
137
137
\
138
- __ASM_EMIT(" mulps %" x4 " , %" x0) /* xmm0 = dx0*dy1 dy0*dz1 dz0*dx1 dw0*dw1 */ \
139
- __ASM_EMIT(" mulps %" x1 " , %" x3) /* xmm3 = dy0*dx1 dz0*dy1 dx0*dz1 dw0*dw1 */ \
140
- __ASM_EMIT(" mulps %" x5 " , %" x1) /* xmm1 = dx1*dy2 dy1*dz2 dz1*dx2 dw1*dw2 */ \
141
- __ASM_EMIT(" mulps %" x2 " , %" x4) /* xmm4 = dy1*dx2 dz1*dy2 dx1*dz2 dw1*dw2 */ \
142
- __ASM_EMIT(" mulps %" x7 " , %" x2) /* xmm2 = dx2*dy0 dy2*dz0 dz2*dx0 dw2*dw0 */ \
143
- __ASM_EMIT(" mulps %" x6 " , %" x5) /* xmm5 = dy2*dx0 dz2*dy0 dx2*dz0 dw2*dw0 */ \
144
- __ASM_EMIT(" subps %" x3 " , %" x0) /* xmm0 = nz0 nx0 ny0 nw0 */ \
145
- __ASM_EMIT(" subps %" x4 " , %" x1) /* xmm1 = nz1 nx1 ny1 nw1 */ \
146
- __ASM_EMIT(" subps %" x5 " , %" x2) /* xmm2 = nz2 nx2 ny2 nw2 */
138
+ __ASM_EMIT(" mulps %" x4 " , %" x0) /* xmm0 = dx0*dy1 dy0*dz1 dz0*dx1 0 */ \
139
+ __ASM_EMIT(" mulps %" x1 " , %" x3) /* xmm3 = dy0*dx1 dz0*dy1 dx0*dz1 0 */ \
140
+ __ASM_EMIT(" mulps %" x5 " , %" x1) /* xmm1 = dx1*dy2 dy1*dz2 dz1*dx2 0 */ \
141
+ __ASM_EMIT(" mulps %" x2 " , %" x4) /* xmm4 = dy1*dx2 dz1*dy2 dx1*dz2 0 */ \
142
+ __ASM_EMIT(" mulps %" x7 " , %" x2) /* xmm2 = dx2*dy0 dy2*dz0 dz2*dx0 0 */ \
143
+ __ASM_EMIT(" mulps %" x6 " , %" x5) /* xmm5 = dy2*dx0 dz2*dy0 dx2*dz0 0 */ \
144
+ __ASM_EMIT(" subps %" x3 " , %" x0) /* xmm0 = nz0 nx0 ny0 0 */ \
145
+ __ASM_EMIT(" subps %" x4 " , %" x1) /* xmm1 = nz1 nx1 ny1 0 */ \
146
+ __ASM_EMIT(" subps %" x5 " , %" x2) /* xmm2 = nz2 nx2 ny2 0 */
147
147
148
148
/* 1x vector multiplication
149
149
* Input:
@@ -1331,49 +1331,39 @@ namespace lsp
1331
1331
ARCH_X86_ASM
1332
1332
(
1333
1333
/* Load vectors */
1334
- __ASM_EMIT (" movups (%[p]), %[x3]" ) /* xmm3 = px py pz pw */
1335
- __ASM_EMIT (" movups 0x00(%[pv]), %[x0]" ) /* xmm0 = x0 y0 z0 w0 */
1336
- __ASM_EMIT (" movups 0x10(%[pv]), %[x1]" ) /* xmm1 = x1 y1 z1 w1 */
1337
- __ASM_EMIT (" movups 0x20(%[pv]), %[x2]" ) /* xmm2 = x2 y2 z2 w2 */
1338
- __ASM_EMIT (" subps %[x3], %[x0]" ) /* xmm0 = dx0 dy0 dz0 dw0 */
1339
- __ASM_EMIT (" subps %[x3], %[x1]" ) /* xmm1 = dx1 dy1 dz1 dw1 */
1340
- __ASM_EMIT (" subps %[x3], %[x2]" ) /* xmm2 = dx2 dy2 dz2 dw2 */
1334
+ __ASM_EMIT (" movups (%[p]), %[x3]" ) /* xmm3 = px py pz 1 */
1335
+ __ASM_EMIT (" movups 0x00(%[pv]), %[x0]" ) /* xmm0 = x0 y0 z0 1 */
1336
+ __ASM_EMIT (" movups 0x10(%[pv]), %[x1]" ) /* xmm1 = x1 y1 z1 1 */
1337
+ __ASM_EMIT (" movups 0x20(%[pv]), %[x2]" ) /* xmm2 = x2 y2 z2 1 */
1338
+ __ASM_EMIT (" subps %[x3], %[x0]" ) /* xmm0 = v0 = dx0 dy0 dz0 0 */
1339
+ __ASM_EMIT (" subps %[x3], %[x1]" ) /* xmm1 = v1 = dx1 dy1 dz1 0 */
1340
+ __ASM_EMIT (" subps %[x3], %[x2]" ) /* xmm2 = v2 = dx2 dy2 dz2 0 */
1341
1341
/* 3x vector multiplications */
1342
1342
VECTOR_MUL3 (" [x0]" , " [x1]" , " [x2]" , " [x3]" , " [x4]" , " [x5]" , " [x6]" , " [x7]" )
1343
1343
/* 3x scalar (dot) multiplications of the cross products */
1344
+ /* xmm0 = m0 = v0 cross v1 */
1345
+ /* xmm1 = m1 = v1 cross v2 */
1346
+ /* xmm2 = m2 = v2 cross v0 */
1344
1347
SCALAR_MUL3 (" [x0]" , " [x1]" , " [x2]" , " [x3]" )
1345
1348
/* Compare with zeros */
1346
- __ASM_EMIT (" xorps %[x4], %[x4]" )
1347
- __ASM_EMIT (" ucomiss %[x4], %[x0]" )
1348
- __ASM_EMIT (" jb 110f" )
1349
- __ASM_EMIT (" ucomiss %[x4], %[x1]" )
1350
- __ASM_EMIT (" jb 109f" )
1351
- __ASM_EMIT (" ucomiss %[x4], %[x2]" )
1352
- __ASM_EMIT (" jb 108f" )
1353
- __ASM_EMIT (" mulss %[x1], %[x0]" )
1354
- __ASM_EMIT (" mulss %[x2], %[x0]" )
1355
- __ASM_EMIT (" ucomiss %[x4], %[x0]" )
1356
- __ASM_EMIT (" jne 110f" )
1357
- /* There is somewhere zero, need additional check */
1358
- /* Load vectors */
1359
- __ASM_EMIT (" movups (%[p]), %[x3]" ) /* xmm3 = px py pz pw */
1360
- __ASM_EMIT (" movups 0x00(%[pv]), %[x0]" ) /* xmm0 = x0 y0 z0 w0 */
1361
- __ASM_EMIT (" movups 0x10(%[pv]), %[x1]" ) /* xmm1 = x1 y1 z1 w1 */
1362
- __ASM_EMIT (" movups 0x20(%[pv]), %[x2]" ) /* xmm2 = x2 y2 z2 w2 */
1363
- __ASM_EMIT (" subps %[x3], %[x0]" ) /* xmm0 = dx0 dy0 dz0 dw0 */
1364
- __ASM_EMIT (" subps %[x3], %[x1]" ) /* xmm1 = dx1 dy1 dz1 dw1 */
1365
- __ASM_EMIT (" subps %[x3], %[x2]" ) /* xmm2 = dx2 dy2 dz2 dw2 */
1366
- /* Do 3x scalar multiplications */
1367
- SCALAR_MUL3 (" [x0]" , " [x1]" , " [x2]" , " [x3]" )
1368
- __ASM_EMIT (" mulss %[x1], %[x0]" )
1369
- __ASM_EMIT (" mulss %[x2], %[x0]" )
1370
- __ASM_EMIT (" jmp 110f" )
1371
-
1372
- __ASM_EMIT (" 108:" )
1373
- __ASM_EMIT (" movss %[x2], %[x0]" )
1349
+ /* xmm0 = r0 = m0 dot m1 */
1350
+ /* xmm1 = r1 = m1 dot m2 */
1351
+ /* xmm2 = r2 = m2 dot m0 */
1352
+ __ASM_EMIT (" xorps %[x4], %[x4]" ) /* xmm4 = 0 */
1353
+ __ASM_EMIT (" ucomiss %[x4], %[x0]" ) /* r0 <=> 0 */
1354
+ __ASM_EMIT (" jb 110f" ) /* r0 < 0 */
1355
+ __ASM_EMIT (" mulss %[x1], %[x0]" ) /* xmm0 = r0 * r1 */
1356
+ __ASM_EMIT (" ucomiss %[x4], %[x1]" ) /* r1 <=> 0 */
1357
+ __ASM_EMIT (" jb 109f" ) /* r1 < 0 */
1358
+ __ASM_EMIT (" mulss %[x2], %[x0]" ) /* xmm0 = r0 * r1 * r2 */
1359
+ __ASM_EMIT (" ucomiss %[x4], %[x2]" ) /* r2 <=> 0 */
1360
+ __ASM_EMIT (" jae 110f" ) /* r2 >= 0 */
1361
+ /* Fail cases: r2 < 0 (fall-through) or r1 < 0 (label 109) - copy that value over the product in xmm0 */
1362
+ __ASM_EMIT (" movaps %[x2], %[x0]" )
1374
1363
__ASM_EMIT (" jmp 110f" )
1375
1364
__ASM_EMIT (" 109:" )
1376
- __ASM_EMIT (" movss %[x1], %[x0]" )
1365
+ __ASM_EMIT (" movaps %[x1], %[x0]" )
1366
+ /* End */
1377
1367
__ASM_EMIT (" 110:" )
1378
1368
: [x0] " =&x" (x0), [x1] " =&x" (x1), [x2] " =&x" (x2), [x3] " =&x" (x3),
1379
1369
[x4] " =&x" (x4), [x5] " =&x" (x5), [x6] " =&x" (x6), [x7] " =&x" (x7)
@@ -1390,49 +1380,39 @@ namespace lsp
1390
1380
ARCH_X86_ASM
1391
1381
(
1392
1382
/* Load vectors */
1393
- __ASM_EMIT (" movups (%[p]), %[x3]" ) /* xmm3 = px py pz pw */
1394
- __ASM_EMIT (" movups (%[p1]), %[x0]" ) /* xmm0 = x0 y0 z0 w0 */
1395
- __ASM_EMIT (" movups (%[p2]), %[x1]" ) /* xmm1 = x1 y1 z1 w1 */
1396
- __ASM_EMIT (" movups (%[p3]), %[x2]" ) /* xmm2 = x2 y2 z2 w2 */
1397
- __ASM_EMIT (" subps %[x3], %[x0]" ) /* xmm0 = dx0 dy0 dz0 dw0 */
1398
- __ASM_EMIT (" subps %[x3], %[x1]" ) /* xmm1 = dx1 dy1 dz1 dw1 */
1399
- __ASM_EMIT (" subps %[x3], %[x2]" ) /* xmm2 = dx2 dy2 dz2 dw2 */
1383
+ __ASM_EMIT (" movups (%[p]), %[x3]" ) /* xmm3 = px py pz 1 */
1384
+ __ASM_EMIT (" movups (%[p1]), %[x0]" ) /* xmm0 = x0 y0 z0 1 */
1385
+ __ASM_EMIT (" movups (%[p2]), %[x1]" ) /* xmm1 = x1 y1 z1 1 */
1386
+ __ASM_EMIT (" movups (%[p3]), %[x2]" ) /* xmm2 = x2 y2 z2 1 */
1387
+ __ASM_EMIT (" subps %[x3], %[x0]" ) /* xmm0 = v0 = dx0 dy0 dz0 0 */
1388
+ __ASM_EMIT (" subps %[x3], %[x1]" ) /* xmm1 = v1 = dx1 dy1 dz1 0 */
1389
+ __ASM_EMIT (" subps %[x3], %[x2]" ) /* xmm2 = v2 = dx2 dy2 dz2 0 */
1400
1390
/* 3x vector multiplications */
1401
1391
VECTOR_MUL3 (" [x0]" , " [x1]" , " [x2]" , " [x3]" , " [x4]" , " [x5]" , " [x6]" , " [x7]" )
1402
1392
/* 3x scalar (dot) multiplications of the cross products */
1393
+ /* xmm0 = m0 = v0 cross v1 */
1394
+ /* xmm1 = m1 = v1 cross v2 */
1395
+ /* xmm2 = m2 = v2 cross v0 */
1403
1396
SCALAR_MUL3 (" [x0]" , " [x1]" , " [x2]" , " [x3]" )
1404
1397
/* Compare with zeros */
1405
- __ASM_EMIT (" xorps %[x4], %[x4]" )
1406
- __ASM_EMIT (" ucomiss %[x4], %[x0]" )
1407
- __ASM_EMIT (" jb 110f" )
1408
- __ASM_EMIT (" ucomiss %[x4], %[x1]" )
1409
- __ASM_EMIT (" jb 109f" )
1410
- __ASM_EMIT (" ucomiss %[x4], %[x2]" )
1411
- __ASM_EMIT (" jb 108f" )
1412
- __ASM_EMIT (" mulss %[x1], %[x0]" )
1413
- __ASM_EMIT (" mulss %[x2], %[x0]" )
1414
- __ASM_EMIT (" ucomiss %[x4], %[x0]" )
1415
- __ASM_EMIT (" jne 110f" )
1416
- /* There is somewhere zero, need additional check */
1417
- /* Load vectors */
1418
- __ASM_EMIT (" movups (%[p]), %[x3]" ) /* xmm3 = px py pz pw */
1419
- __ASM_EMIT (" movups (%[p1]), %[x0]" ) /* xmm0 = x0 y0 z0 w0 */
1420
- __ASM_EMIT (" movups (%[p2]), %[x1]" ) /* xmm1 = x1 y1 z1 w1 */
1421
- __ASM_EMIT (" movups (%[p3]), %[x2]" ) /* xmm2 = x2 y2 z2 w2 */
1422
- __ASM_EMIT (" subps %[x3], %[x0]" ) /* xmm0 = dx0 dy0 dz0 dw0 */
1423
- __ASM_EMIT (" subps %[x3], %[x1]" ) /* xmm1 = dx1 dy1 dz1 dw1 */
1424
- __ASM_EMIT (" subps %[x3], %[x2]" ) /* xmm2 = dx2 dy2 dz2 dw2 */
1425
- /* Do 3x scalar multiplications */
1426
- SCALAR_MUL3 (" [x0]" , " [x1]" , " [x2]" , " [x3]" )
1427
- __ASM_EMIT (" mulss %[x1], %[x0]" )
1428
- __ASM_EMIT (" mulss %[x2], %[x0]" )
1429
- __ASM_EMIT (" jmp 110f" )
1430
-
1431
- __ASM_EMIT (" 108:" )
1432
- __ASM_EMIT (" movss %[x2], %[x0]" )
1398
+ /* xmm0 = r0 = m0 dot m1 */
1399
+ /* xmm1 = r1 = m1 dot m2 */
1400
+ /* xmm2 = r2 = m2 dot m0 */
1401
+ __ASM_EMIT (" xorps %[x4], %[x4]" ) /* xmm4 = 0 */
1402
+ __ASM_EMIT (" ucomiss %[x4], %[x0]" ) /* r0 <=> 0 */
1403
+ __ASM_EMIT (" jb 110f" ) /* r0 < 0 */
1404
+ __ASM_EMIT (" mulss %[x1], %[x0]" ) /* xmm0 = r0 * r1 */
1405
+ __ASM_EMIT (" ucomiss %[x4], %[x1]" ) /* r1 <=> 0 */
1406
+ __ASM_EMIT (" jb 109f" ) /* r1 < 0 */
1407
+ __ASM_EMIT (" mulss %[x2], %[x0]" ) /* xmm0 = r0 * r1 * r2 */
1408
+ __ASM_EMIT (" ucomiss %[x4], %[x2]" ) /* r2 <=> 0 */
1409
+ __ASM_EMIT (" jae 110f" ) /* r2 >= 0 */
1410
+ /* Fail cases: r2 < 0 (fall-through) or r1 < 0 (label 109) - copy that value over the product in xmm0 */
1411
+ __ASM_EMIT (" movaps %[x2], %[x0]" )
1433
1412
__ASM_EMIT (" jmp 110f" )
1434
1413
__ASM_EMIT (" 109:" )
1435
- __ASM_EMIT (" movss %[x1], %[x0]" )
1414
+ __ASM_EMIT (" movaps %[x1], %[x0]" )
1415
+ /* End */
1436
1416
__ASM_EMIT (" 110:" )
1437
1417
: [x0] " =&x" (x0), [x1] " =&x" (x1), [x2] " =&x" (x2), [x3] " =&x" (x3),
1438
1418
[x4] " =&x" (x4), [x5] " =&x" (x5), [x6] " =&x" (x6), [x7] " =&x" (x7)
0 commit comments