@@ -1135,9 +1135,9 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
11351135e .copyMemoryPrecise ("1" , c .literals , c .outBase , ll )
11361136} else {
11371137e .copyMemoryND ("1" , c .literals , c .outBase , ll )
1138+ ADDQ (ll , c .literals )
1139+ ADDQ (ll , c .outBase )
11381140}
1139- ADDQ (ll , c .literals )
1140- ADDQ (ll , c .outBase )
11411141ADDQ (ll , c .outPosition )
11421142}
11431143
@@ -1203,7 +1203,6 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
12031203*/
12041204e .copyMemoryPrecise ("4" , ptr , c .outBase , ml )
12051205ADDQ (ml , c .outPosition )
1206- ADDQ (ml , c .outBase )
12071206// Note: for the current go tests this branch is taken in 99.53% of cases,
12081207// this is why we repeat a little code here.
12091208handleLoop ()
@@ -1219,7 +1218,6 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
12191218 }
12201219*/
12211220e .copyMemoryPrecise ("5" , ptr , c .outBase , v )
1222- ADDQ (v , c .outBase )
12231221ADDQ (v , c .outPosition )
12241222SUBQ (v , ml )
12251223// fallback to the next block
@@ -1254,7 +1252,6 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
12541252ADDQ (ml , c .outPosition )
12551253if e .safeMem {
12561254e .copyMemoryPrecise ("2" , src , c .outBase , ml )
1257- ADDQ (ml , c .outBase )
12581255} else {
12591256dst := GP64 ()
12601257MOVQ (c .outBase , dst )
@@ -1312,62 +1309,93 @@ func (e executeSimple) copyMemoryND(suffix string, src, dst, length reg.GPVirtua
13121309}
13131310
13141311// copyMemoryPrecise will copy memory in blocks of 16 bytes,
1315- // without overwriting nor overreading.
1312+ // without reading or writing outside the length-byte ranges at src and dst.
1313+ // On exit src and dst have each been advanced by length; length itself is preserved.
// Lengths below 16 are copied as 1-, 2-, 4- and 8-byte pieces selected by the low bits of length.
13161314func (e executeSimple ) copyMemoryPrecise (suffix string , src , dst , length reg.GPVirtual ) {
1317- label := "copy_" + suffix
1318- ofs := GP64 ()
1319- s := Mem {Base : src , Index : ofs , Scale : 1 }
1320- d := Mem {Base : dst , Index : ofs , Scale : 1 }
1321-
1322- tmp := GP64 ()
1323- XORQ (ofs , ofs )
1324-
1325- Label ("copy_" + suffix + "_byte" )
1326- TESTQ (U32 (0x1 ), length )
1327- JZ (LabelRef ("copy_" + suffix + "_word" ))
// n is a scratch copy of length, biased by -16, so that length stays unmodified.
// A borrow from the subtraction means length < 16 (unsigned compare): take the small path.
1315+ n := GP64 ()
1316+ MOVQ (length , n )
1317+ SUBQ (U8 (16 ), n )
1318+ JB (LabelRef ("copy_" + suffix + "_small" ))
13281319
1329- // copy one byte if length & 0x01 != 0
1330- MOVB ( s , tmp . As8 ())
1331- MOVB ( tmp . As8 (), d )
1332- ADDQ ( U8 ( 1 ), ofs )
1320+ // If length >= 16, copy blocks of 16 bytes and handle any remainder
1321+ // by a block copy that overlaps with the last full block.
1322+ {
1323+ t := XMM ( )
13331324
1334- Label ("copy_" + suffix + "_word" )
1335- TESTQ (U32 (0x2 ), length )
1336- JZ (LabelRef ("copy_" + suffix + "_dword" ))
1325+ loop := "copy_" + suffix + "_loop"
1326+ Label (loop )
1327+ {
1328+ MOVUPS (Mem {Base : src }, t )
1329+ MOVUPS (t , Mem {Base : dst })
1330+ ADDQ (U8 (16 ), src )
1331+ ADDQ (U8 (16 ), dst )
1332+ SUBQ (U8 (16 ), n )
1333+ JAE (LabelRef (loop ))
1334+ }
13371335
1338- // copy two bytes if length & 0x02 != 0
1339- MOVW (s , tmp .As16 ())
1340- MOVW (tmp .As16 (), d )
1341- ADDQ (U8 (2 ), ofs )
1336+ // n is now in the range [-16,-1].
1337+ // -16 means we copy the entire last block again.
1338+ // That should happen about 1/16th of the time,
1339+ // so we don't bother to check for it.
// src+n+16 and dst+n+16 are the ends of the two ranges (original pointer + length),
// so the 16 bytes just before them form the final, possibly overlapping, block.
1340+ LEAQ (Mem {Base : src , Index : n , Disp : 16 , Scale : 1 }, src )
1341+ LEAQ (Mem {Base : dst , Index : n , Disp : 16 , Scale : 1 }, dst )
1342+ MOVUPS (Mem {Base : src , Disp : - 16 }, t )
1343+ MOVUPS (t , Mem {Base : dst , Disp : - 16 })
13421344
1343- Label ("copy_" + suffix + "_dword" )
1344- TESTQ (U32 (0x4 ), length )
1345- JZ (LabelRef ("copy_" + suffix + "_qword" ))
// src and dst already point past the copied data here, so skip the final adds.
1345+ JMP (LabelRef ("copy_" + suffix + "_end" ))
1346+ }
13461347
1347- // copy four bytes if length & 0x04 != 0
1348- MOVL (s , tmp .As32 ())
1349- MOVL (tmp .As32 (), d )
1350- ADDQ (U8 (4 ), ofs )
1348+ Label ("copy_" + suffix + "_small" )
1349+ {
1350+ ofs := GP64 ()
1351+ s := Mem {Base : src , Index : ofs , Scale : 1 }
1352+ d := Mem {Base : dst , Index : ofs , Scale : 1 }
13511353
1352- Label ("copy_" + suffix + "_qword" )
1353- TESTQ (U32 (0x8 ), length )
1354- JZ (LabelRef ("copy_" + suffix + "_test" ))
1354+ tmp := GP64 ()
1355+ XORQ (ofs , ofs )
1356+
1357+ Label ("copy_" + suffix + "_byte" )
1358+ TESTQ (U32 (0x1 ), length )
1359+ JZ (LabelRef ("copy_" + suffix + "_word" ))
1360+
1361+ // copy one byte if length & 0x01 != 0
1362+ MOVB (s , tmp .As8 ())
1363+ MOVB (tmp .As8 (), d )
1364+ ADDQ (U8 (1 ), ofs )
1365+
1366+ Label ("copy_" + suffix + "_word" )
1367+ TESTQ (U32 (0x2 ), length )
1368+ JZ (LabelRef ("copy_" + suffix + "_dword" ))
1369+
1370+ // copy two bytes if length & 0x02 != 0
1371+ MOVW (s , tmp .As16 ())
1372+ MOVW (tmp .As16 (), d )
1373+ ADDQ (U8 (2 ), ofs )
1374+
1375+ Label ("copy_" + suffix + "_dword" )
1376+ TESTQ (U32 (0x4 ), length )
1377+ JZ (LabelRef ("copy_" + suffix + "_qword" ))
1378+
1379+ // copy four bytes if length & 0x04 != 0
1380+ MOVL (s , tmp .As32 ())
1381+ MOVL (tmp .As32 (), d )
1382+ ADDQ (U8 (4 ), ofs )
1383+
1384+ Label ("copy_" + suffix + "_qword" )
1385+ TESTQ (U32 (0x8 ), length )
1386+ JZ (LabelRef ("copy_" + suffix + "_add" ))
1387+
1388+ // copy eight bytes if length & 0x08 != 0
1389+ MOVQ (s , tmp )
1390+ MOVQ (tmp , d )
1391+ ADDQ (U8 (8 ), ofs )
1392+ }
13551393
1356- // copy eight bytes if length & 0x08 != 0
1357- MOVQ (s , tmp )
1358- MOVQ (tmp , d )
1359- ADDQ (U8 (8 ), ofs )
1360- JMP (LabelRef ("copy_" + suffix + "_test" ))
// The small path addressed memory through ofs and left src and dst untouched,
// so advance both by length here.
1394+ Label ("copy_" + suffix + "_add" )
1395+ ADDQ (length , dst )
1396+ ADDQ (length , src )
13611397
1362- // copy in 16-byte chunks
1363- Label (label )
1364- t := XMM ()
1365- MOVUPS (s , t )
1366- MOVUPS (t , d )
1367- ADDQ (U8 (16 ), ofs )
1368- Label ("copy_" + suffix + "_test" )
1369- CMPQ (ofs , length )
1370- JB (LabelRef (label ))
1398+ Label ("copy_" + suffix + "_end" )
13711399}
13721400
13731401// copyOverlappedMemory will copy one byte at a time from src to dst.
0 commit comments