Skip to content

Commit 5d8f037

Browse files
committed
zstd: Optimize seqdec amd64 asm
copyMemoryPrecise now generates a loop over 16-byte blocks with a single branchless 16-byte fixup after it. This is a tiny bit faster on the whole and quite a bit faster for some inputs. Benchmark results on Intel Core i7-3770K: name old speed new speed delta Decoder_DecoderSmall/kppkn.gtb.zst-8 369MB/s ± 0% 374MB/s ± 1% +1.56% (p=0.008 n=5+5) Decoder_DecoderSmall/geo.protodata.zst-8 977MB/s ± 0% 1056MB/s ± 1% +8.17% (p=0.008 n=5+5) Decoder_DecoderSmall/plrabn12.txt.zst-8 291MB/s ± 0% 289MB/s ± 0% -0.74% (p=0.008 n=5+5) Decoder_DecoderSmall/lcet10.txt.zst-8 329MB/s ± 1% 333MB/s ± 0% +1.23% (p=0.008 n=5+5) Decoder_DecoderSmall/asyoulik.txt.zst-8 310MB/s ± 0% 310MB/s ± 1% ~ (p=1.000 n=5+5) Decoder_DecoderSmall/alice29.txt.zst-8 291MB/s ± 0% 291MB/s ± 1% ~ (p=0.421 n=5+5) Decoder_DecoderSmall/html_x_4.zst-8 2.07GB/s ± 0% 2.15GB/s ± 2% +4.05% (p=0.008 n=5+5) Decoder_DecoderSmall/paper-100k.pdf.zst-8 3.58GB/s ± 3% 3.74GB/s ± 1% +4.31% (p=0.008 n=5+5) Decoder_DecoderSmall/fireworks.jpeg.zst-8 8.57GB/s ± 0% 8.60GB/s ± 0% ~ (p=0.056 n=5+5) Decoder_DecoderSmall/urls.10K.zst-8 474MB/s ± 1% 507MB/s ± 1% +6.80% (p=0.008 n=5+5) Decoder_DecoderSmall/html.zst-8 745MB/s ± 0% 803MB/s ± 0% +7.68% (p=0.008 n=5+5) Decoder_DecoderSmall/comp-data.bin.zst-8 399MB/s ± 1% 400MB/s ± 0% ~ (p=0.841 n=5+5) Decoder_DecodeAll/kppkn.gtb.zst-8 521MB/s ± 0% 521MB/s ± 0% ~ (p=0.841 n=5+5) Decoder_DecodeAll/geo.protodata.zst-8 1.27GB/s ± 1% 1.29GB/s ± 0% +1.19% (p=0.008 n=5+5) Decoder_DecodeAll/plrabn12.txt.zst-8 429MB/s ± 0% 427MB/s ± 0% -0.51% (p=0.032 n=5+5) Decoder_DecodeAll/lcet10.txt.zst-8 435MB/s ± 0% 439MB/s ± 0% +0.94% (p=0.008 n=5+5) Decoder_DecodeAll/asyoulik.txt.zst-8 438MB/s ± 0% 436MB/s ± 0% -0.39% (p=0.008 n=5+5) Decoder_DecodeAll/alice29.txt.zst-8 423MB/s ± 0% 420MB/s ± 1% -0.72% (p=0.008 n=5+5) Decoder_DecodeAll/html_x_4.zst-8 1.59GB/s ± 0% 1.59GB/s ± 1% +0.54% (p=0.032 n=5+5) Decoder_DecodeAll/paper-100k.pdf.zst-8 4.53GB/s ± 1% 4.54GB/s ± 1% ~ (p=0.310 n=5+5) 
Decoder_DecodeAll/fireworks.jpeg.zst-8 9.64GB/s ± 1% 9.57GB/s ± 0% ~ (p=0.151 n=5+5) Decoder_DecodeAll/urls.10K.zst-8 683MB/s ± 0% 681MB/s ± 0% ~ (p=0.056 n=5+5) Decoder_DecodeAll/html.zst-8 1.04GB/s ± 1% 1.06GB/s ± 0% +1.77% (p=0.008 n=5+5) Decoder_DecodeAll/comp-data.bin.zst-8 398MB/s ± 1% 399MB/s ± 1% ~ (p=1.000 n=5+5) Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/fastest-8 439MB/s ± 0% 437MB/s ± 0% -0.39% (p=0.016 n=5+5) Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/default-8 448MB/s ± 0% 448MB/s ± 0% ~ (p=0.841 n=5+5) Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/better-8 478MB/s ± 0% 477MB/s ± 0% ~ (p=0.151 n=5+5) Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/best-8 463MB/s ± 0% 460MB/s ± 0% -0.57% (p=0.008 n=5+5) Decoder_DecodeAllFiles/e.txt/fastest-8 9.62GB/s ± 3% 9.66GB/s ± 1% ~ (p=0.841 n=5+5) Decoder_DecodeAllFiles/e.txt/default-8 394MB/s ± 0% 395MB/s ± 0% ~ (p=0.056 n=5+5) Decoder_DecodeAllFiles/e.txt/better-8 438MB/s ± 0% 442MB/s ± 0% +0.82% (p=0.008 n=5+5) Decoder_DecodeAllFiles/e.txt/best-8 501MB/s ± 0% 506MB/s ± 0% +1.07% (p=0.008 n=5+5) Decoder_DecodeAllFiles/fse-artifact3.bin/fastest-8 1.04GB/s ± 0% 1.05GB/s ± 1% ~ (p=0.056 n=5+5) Decoder_DecodeAllFiles/fse-artifact3.bin/default-8 1.20GB/s ± 1% 1.20GB/s ± 1% ~ (p=0.095 n=5+5) Decoder_DecodeAllFiles/fse-artifact3.bin/better-8 1.01GB/s ± 0% 1.00GB/s ± 1% -0.82% (p=0.008 n=5+5) Decoder_DecodeAllFiles/fse-artifact3.bin/best-8 386MB/s ± 0% 383MB/s ± 0% -0.57% (p=0.008 n=5+5) Decoder_DecodeAllFiles/gettysburg.txt/fastest-8 271MB/s ± 1% 275MB/s ± 1% +1.59% (p=0.008 n=5+5) Decoder_DecodeAllFiles/gettysburg.txt/default-8 224MB/s ± 1% 223MB/s ± 1% ~ (p=0.222 n=5+5) Decoder_DecodeAllFiles/gettysburg.txt/better-8 228MB/s ± 0% 226MB/s ± 0% -0.89% (p=0.008 n=5+5) Decoder_DecodeAllFiles/gettysburg.txt/best-8 223MB/s ± 1% 221MB/s ± 1% -1.03% (p=0.016 n=5+5) Decoder_DecodeAllFiles/html.txt/fastest-8 592MB/s ± 1% 611MB/s ± 0% +3.20% (p=0.008 n=5+5) Decoder_DecodeAllFiles/html.txt/default-8 597MB/s ± 0% 
607MB/s ± 0% +1.71% (p=0.008 n=5+5) Decoder_DecodeAllFiles/html.txt/better-8 623MB/s ± 0% 633MB/s ± 0% +1.57% (p=0.008 n=5+5) Decoder_DecodeAllFiles/html.txt/best-8 603MB/s ± 0% 610MB/s ± 0% +1.25% (p=0.008 n=5+5) Decoder_DecodeAllFiles/pi.txt/fastest-8 9.59GB/s ± 1% 9.70GB/s ± 1% +1.16% (p=0.032 n=5+5) Decoder_DecodeAllFiles/pi.txt/default-8 391MB/s ± 0% 393MB/s ± 0% +0.62% (p=0.008 n=5+5) Decoder_DecodeAllFiles/pi.txt/better-8 437MB/s ± 1% 441MB/s ± 2% ~ (p=0.087 n=5+5) Decoder_DecodeAllFiles/pi.txt/best-8 501MB/s ± 0% 507MB/s ± 0% +1.22% (p=0.008 n=5+5) Decoder_DecodeAllFiles/pngdata.bin/fastest-8 1.66GB/s ± 1% 1.70GB/s ± 0% +2.49% (p=0.008 n=5+5) Decoder_DecodeAllFiles/pngdata.bin/default-8 1.49GB/s ± 0% 1.51GB/s ± 0% +1.18% (p=0.008 n=5+5) Decoder_DecodeAllFiles/pngdata.bin/better-8 1.87GB/s ± 0% 1.90GB/s ± 1% ~ (p=0.056 n=5+5) Decoder_DecodeAllFiles/pngdata.bin/best-8 1.44GB/s ± 1% 1.46GB/s ± 0% +1.75% (p=0.008 n=5+5) Decoder_DecodeAllFiles/sharnd.out/fastest-8 9.64GB/s ± 1% 9.66GB/s ± 1% ~ (p=0.841 n=5+5) Decoder_DecodeAllFiles/sharnd.out/default-8 9.70GB/s ± 1% 9.70GB/s ± 2% ~ (p=1.000 n=5+5) Decoder_DecodeAllFiles/sharnd.out/better-8 9.71GB/s ± 1% 9.79GB/s ± 1% ~ (p=0.151 n=5+5) Decoder_DecodeAllFiles/sharnd.out/best-8 9.76GB/s ± 0% 9.80GB/s ± 0% ~ (p=0.056 n=5+5) Decoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/fastest-8 1.85GB/s ± 0% 1.85GB/s ± 0% -0.31% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/default-8 1.86GB/s ± 0% 1.85GB/s ± 0% -0.47% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/better-8 2.00GB/s ± 0% 2.00GB/s ± 0% -0.32% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/best-8 1.93GB/s ± 0% 1.93GB/s ± 0% -0.22% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/e.txt/fastest-8 37.7GB/s ± 0% 37.5GB/s ± 0% -0.38% (p=0.016 n=5+5) Decoder_DecodeAllFilesP/e.txt/default-8 1.68GB/s ± 0% 1.69GB/s ± 0% +0.55% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/e.txt/better-8 1.91GB/s ± 0% 1.92GB/s ± 0% +0.96% (p=0.008 
n=5+5) Decoder_DecodeAllFilesP/e.txt/best-8 2.22GB/s ± 0% 2.25GB/s ± 0% +1.50% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/fse-artifact3.bin/fastest-8 5.18GB/s ± 0% 5.05GB/s ± 2% -2.50% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/fse-artifact3.bin/default-8 5.50GB/s ± 1% 5.34GB/s ± 1% -2.86% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/fse-artifact3.bin/better-8 5.11GB/s ± 0% 5.14GB/s ± 0% +0.57% (p=0.016 n=5+5) Decoder_DecodeAllFilesP/fse-artifact3.bin/best-8 2.36GB/s ± 0% 2.37GB/s ± 0% +0.20% (p=0.032 n=5+5) Decoder_DecodeAllFilesP/gettysburg.txt/fastest-8 1.16GB/s ± 0% 1.16GB/s ± 0% ~ (p=0.056 n=5+5) Decoder_DecodeAllFilesP/gettysburg.txt/default-8 1.09GB/s ± 0% 1.08GB/s ± 0% -1.19% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/gettysburg.txt/better-8 1.09GB/s ± 0% 1.08GB/s ± 1% -0.96% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/gettysburg.txt/best-8 1.03GB/s ± 3% 1.02GB/s ± 0% ~ (p=0.151 n=5+5) Decoder_DecodeAllFilesP/html.txt/fastest-8 2.50GB/s ± 1% 2.56GB/s ± 0% +2.39% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/html.txt/default-8 2.51GB/s ± 0% 2.55GB/s ± 0% +1.69% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/html.txt/better-8 2.61GB/s ± 0% 2.66GB/s ± 0% +1.93% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/html.txt/best-8 2.53GB/s ± 0% 2.56GB/s ± 0% +1.13% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/pi.txt/fastest-8 37.8GB/s ± 0% 37.6GB/s ± 0% -0.44% (p=0.016 n=5+5) Decoder_DecodeAllFilesP/pi.txt/default-8 1.67GB/s ± 0% 1.68GB/s ± 0% +0.61% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/pi.txt/better-8 1.91GB/s ± 0% 1.93GB/s ± 0% +0.82% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/pi.txt/best-8 2.23GB/s ± 0% 2.26GB/s ± 0% +1.35% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/pngdata.bin/fastest-8 6.99GB/s ± 0% 7.00GB/s ± 0% ~ (p=0.690 n=5+5) Decoder_DecodeAllFilesP/pngdata.bin/default-8 6.88GB/s ± 0% 6.87GB/s ± 0% ~ (p=0.222 n=5+5) Decoder_DecodeAllFilesP/pngdata.bin/better-8 8.49GB/s ± 0% 8.44GB/s ± 1% ~ (p=0.310 n=5+5) Decoder_DecodeAllFilesP/pngdata.bin/best-8 6.59GB/s ± 1% 6.53GB/s ± 1% -0.96% (p=0.032 n=5+5) 
Decoder_DecodeAllFilesP/sharnd.out/fastest-8 37.8GB/s ± 0% 37.5GB/s ± 0% -0.86% (p=0.008 n=5+5) Decoder_DecodeAllFilesP/sharnd.out/default-8 37.9GB/s ± 1% 38.0GB/s ± 1% ~ (p=0.310 n=5+5) Decoder_DecodeAllFilesP/sharnd.out/better-8 37.9GB/s ± 0% 37.8GB/s ± 2% ~ (p=0.841 n=5+5) Decoder_DecodeAllFilesP/sharnd.out/best-8 37.8GB/s ± 0% 38.0GB/s ± 1% ~ (p=0.310 n=5+5) Decoder_DecodeAllParallel/kppkn.gtb.zst-8 2.20GB/s ± 0% 2.20GB/s ± 0% ~ (p=1.000 n=5+5) Decoder_DecodeAllParallel/geo.protodata.zst-8 5.37GB/s ± 0% 5.39GB/s ± 0% +0.35% (p=0.008 n=5+5) Decoder_DecodeAllParallel/plrabn12.txt.zst-8 1.77GB/s ± 0% 1.76GB/s ± 0% -0.19% (p=0.008 n=5+5) Decoder_DecodeAllParallel/lcet10.txt.zst-8 1.90GB/s ± 0% 1.92GB/s ± 0% +0.80% (p=0.008 n=5+5) Decoder_DecodeAllParallel/asyoulik.txt.zst-8 1.83GB/s ± 0% 1.83GB/s ± 0% ~ (p=0.841 n=5+5) Decoder_DecodeAllParallel/alice29.txt.zst-8 1.74GB/s ± 0% 1.74GB/s ± 0% ~ (p=0.548 n=5+5) Decoder_DecodeAllParallel/html_x_4.zst-8 6.55GB/s ± 0% 6.49GB/s ± 0% -0.97% (p=0.008 n=5+5) Decoder_DecodeAllParallel/paper-100k.pdf.zst-8 18.3GB/s ± 0% 18.3GB/s ± 0% ~ (p=0.056 n=5+5) Decoder_DecodeAllParallel/fireworks.jpeg.zst-8 37.4GB/s ± 0% 37.2GB/s ± 1% -0.57% (p=0.016 n=4+5) Decoder_DecodeAllParallel/urls.10K.zst-8 2.97GB/s ± 0% 2.96GB/s ± 0% ~ (p=0.310 n=5+5) Decoder_DecodeAllParallel/html.zst-8 4.42GB/s ± 1% 4.43GB/s ± 0% ~ (p=0.556 n=5+4) Decoder_DecodeAllParallel/comp-data.bin.zst-8 1.69GB/s ± 1% 1.70GB/s ± 0% +0.84% (p=0.008 n=5+5) [Geo mean] 1.77GB/s 1.78GB/s +0.57%
1 parent 51e1025 commit 5d8f037

File tree

2 files changed

+542
-283
lines changed

2 files changed

+542
-283
lines changed

zstd/_generate/gen.go

Lines changed: 80 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1135,9 +1135,9 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
11351135
e.copyMemoryPrecise("1", c.literals, c.outBase, ll)
11361136
} else {
11371137
e.copyMemoryND("1", c.literals, c.outBase, ll)
1138+
ADDQ(ll, c.literals)
1139+
ADDQ(ll, c.outBase)
11381140
}
1139-
ADDQ(ll, c.literals)
1140-
ADDQ(ll, c.outBase)
11411141
ADDQ(ll, c.outPosition)
11421142
}
11431143

@@ -1203,7 +1203,6 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
12031203
*/
12041204
e.copyMemoryPrecise("4", ptr, c.outBase, ml)
12051205
ADDQ(ml, c.outPosition)
1206-
ADDQ(ml, c.outBase)
12071206
// Note: for the current go tests this branch is taken in 99.53% cases,
12081207
// this is why we repeat a little code here.
12091208
handleLoop()
@@ -1219,7 +1218,6 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
12191218
}
12201219
*/
12211220
e.copyMemoryPrecise("5", ptr, c.outBase, v)
1222-
ADDQ(v, c.outBase)
12231221
ADDQ(v, c.outPosition)
12241222
SUBQ(v, ml)
12251223
// fallback to the next block
@@ -1254,7 +1252,6 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
12541252
ADDQ(ml, c.outPosition)
12551253
if e.safeMem {
12561254
e.copyMemoryPrecise("2", src, c.outBase, ml)
1257-
ADDQ(ml, c.outBase)
12581255
} else {
12591256
dst := GP64()
12601257
MOVQ(c.outBase, dst)
@@ -1312,62 +1309,93 @@ func (e executeSimple) copyMemoryND(suffix string, src, dst, length reg.GPVirtua
13121309
}
13131310

13141311
// copyMemoryPrecise will copy memory in blocks of 16 bytes,
1315-
// without overwriting nor overreading.
1312+
// without overreading. It adds length to src and dst,
1313+
// preserving length.
13161314
func (e executeSimple) copyMemoryPrecise(suffix string, src, dst, length reg.GPVirtual) {
1317-
label := "copy_" + suffix
1318-
ofs := GP64()
1319-
s := Mem{Base: src, Index: ofs, Scale: 1}
1320-
d := Mem{Base: dst, Index: ofs, Scale: 1}
1321-
1322-
tmp := GP64()
1323-
XORQ(ofs, ofs)
1324-
1325-
Label("copy_" + suffix + "_byte")
1326-
TESTQ(U32(0x1), length)
1327-
JZ(LabelRef("copy_" + suffix + "_word"))
1315+
n := GP64()
1316+
MOVQ(length, n)
1317+
SUBQ(U8(16), n)
1318+
JB(LabelRef("copy_" + suffix + "_small"))
13281319

1329-
// copy one byte if length & 0x01 != 0
1330-
MOVB(s, tmp.As8())
1331-
MOVB(tmp.As8(), d)
1332-
ADDQ(U8(1), ofs)
1320+
// If length >= 16, copy blocks of 16 bytes and handle any remainder
1321+
// by a block copy that overlaps with the last full block.
1322+
{
1323+
t := XMM()
13331324

1334-
Label("copy_" + suffix + "_word")
1335-
TESTQ(U32(0x2), length)
1336-
JZ(LabelRef("copy_" + suffix + "_dword"))
1325+
loop := "copy_" + suffix + "_loop"
1326+
Label(loop)
1327+
{
1328+
MOVUPS(Mem{Base: src}, t)
1329+
MOVUPS(t, Mem{Base: dst})
1330+
ADDQ(U8(16), src)
1331+
ADDQ(U8(16), dst)
1332+
SUBQ(U8(16), n)
1333+
JAE(LabelRef(loop))
1334+
}
13371335

1338-
// copy two bytes if length & 0x02 != 0
1339-
MOVW(s, tmp.As16())
1340-
MOVW(tmp.As16(), d)
1341-
ADDQ(U8(2), ofs)
1336+
// n is now in the range [-16,-1].
1337+
// -16 means we copy the entire last block again.
1338+
// That should happen about 1/16th of the time,
1339+
// so we don't bother to check for it.
1340+
LEAQ(Mem{Base: src, Index: n, Disp: 16, Scale: 1}, src)
1341+
LEAQ(Mem{Base: dst, Index: n, Disp: 16, Scale: 1}, dst)
1342+
MOVUPS(Mem{Base: src, Disp: -16}, t)
1343+
MOVUPS(t, Mem{Base: dst, Disp: -16})
13421344

1343-
Label("copy_" + suffix + "_dword")
1344-
TESTQ(U32(0x4), length)
1345-
JZ(LabelRef("copy_" + suffix + "_qword"))
1345+
JMP(LabelRef("copy_" + suffix + "_end"))
1346+
}
13461347

1347-
// copy four bytes if length & 0x04 != 0
1348-
MOVL(s, tmp.As32())
1349-
MOVL(tmp.As32(), d)
1350-
ADDQ(U8(4), ofs)
1348+
Label("copy_" + suffix + "_small")
1349+
{
1350+
ofs := GP64()
1351+
s := Mem{Base: src, Index: ofs, Scale: 1}
1352+
d := Mem{Base: dst, Index: ofs, Scale: 1}
13511353

1352-
Label("copy_" + suffix + "_qword")
1353-
TESTQ(U32(0x8), length)
1354-
JZ(LabelRef("copy_" + suffix + "_test"))
1354+
tmp := GP64()
1355+
XORQ(ofs, ofs)
1356+
1357+
Label("copy_" + suffix + "_byte")
1358+
TESTQ(U32(0x1), length)
1359+
JZ(LabelRef("copy_" + suffix + "_word"))
1360+
1361+
// copy one byte if length & 0x01 != 0
1362+
MOVB(s, tmp.As8())
1363+
MOVB(tmp.As8(), d)
1364+
ADDQ(U8(1), ofs)
1365+
1366+
Label("copy_" + suffix + "_word")
1367+
TESTQ(U32(0x2), length)
1368+
JZ(LabelRef("copy_" + suffix + "_dword"))
1369+
1370+
// copy two bytes if length & 0x02 != 0
1371+
MOVW(s, tmp.As16())
1372+
MOVW(tmp.As16(), d)
1373+
ADDQ(U8(2), ofs)
1374+
1375+
Label("copy_" + suffix + "_dword")
1376+
TESTQ(U32(0x4), length)
1377+
JZ(LabelRef("copy_" + suffix + "_qword"))
1378+
1379+
// copy four bytes if length & 0x04 != 0
1380+
MOVL(s, tmp.As32())
1381+
MOVL(tmp.As32(), d)
1382+
ADDQ(U8(4), ofs)
1383+
1384+
Label("copy_" + suffix + "_qword")
1385+
TESTQ(U32(0x8), length)
1386+
JZ(LabelRef("copy_" + suffix + "_add"))
1387+
1388+
// copy eight bytes if length & 0x08 != 0
1389+
MOVQ(s, tmp)
1390+
MOVQ(tmp, d)
1391+
ADDQ(U8(8), ofs)
1392+
}
13551393

1356-
// copy eight bytes if length & 0x08 != 0
1357-
MOVQ(s, tmp)
1358-
MOVQ(tmp, d)
1359-
ADDQ(U8(8), ofs)
1360-
JMP(LabelRef("copy_" + suffix + "_test"))
1394+
Label("copy_" + suffix + "_add")
1395+
ADDQ(length, dst)
1396+
ADDQ(length, src)
13611397

1362-
// copy in 16-byte chunks
1363-
Label(label)
1364-
t := XMM()
1365-
MOVUPS(s, t)
1366-
MOVUPS(t, d)
1367-
ADDQ(U8(16), ofs)
1368-
Label("copy_" + suffix + "_test")
1369-
CMPQ(ofs, length)
1370-
JB(LabelRef(label))
1398+
Label("copy_" + suffix + "_end")
13711399
}
13721400

13731401
// copyOverlappedMemory will copy one byte at the time from src to dst.

0 commit comments

Comments
 (0)