2 changes: 1 addition & 1 deletion src/System.Private.CoreLib/shared/System/Buffer.Unix.cs
@@ -20,7 +20,7 @@ public static partial class Buffer
#elif ARM
private const nuint MemmoveNativeThreshold = 512;
#else
private const nuint MemmoveNativeThreshold = 2048;
private const nuint MemmoveNativeThreshold = 4096;
#endif
}
}
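For orientation, this constant is the cutoff below which Memmove stays in managed code; longer (or overlapping) copies P/Invoke into the platform's memmove. A minimal sketch of that gate, with hypothetical names and a libc import standing in for the runtime's actual QCall:

```csharp
using System.Runtime.InteropServices;

static unsafe class MemmoveGateSketch
{
    // Hypothetical stand-in for MemmoveNativeThreshold (raised 2048 -> 4096 here).
    private const nuint NativeThreshold = 4096;

    public static void Copy(byte* dest, byte* src, nuint len)
    {
        if (len > NativeThreshold)
        {
            NativeMemmove(dest, src, len); // large copies: platform memmove
            return;
        }

        // Small and medium copies stay managed; the real code copies
        // in 16/32-byte blocks rather than bytewise.
        for (nuint i = 0; i < len; i++)
            dest[i] = src[i];
    }

    [DllImport("libc", EntryPoint = "memmove")]
    private static extern void* NativeMemmove(void* dest, void* src, nuint len);
}
```

Raising the cutoff means more medium-sized copies take the managed block loop, which the rest of this diff makes cheaper with wider SIMD stores.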
2 changes: 1 addition & 1 deletion src/System.Private.CoreLib/shared/System/Buffer.Windows.cs
@@ -17,7 +17,7 @@ public static partial class Buffer
// https://github.com/dotnet/coreclr/issues/13843
private const nuint MemmoveNativeThreshold = ulong.MaxValue;
#else
private const nuint MemmoveNativeThreshold = 2048;
private const nuint MemmoveNativeThreshold = 4096;
#endif
}
}
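On Windows ARM64 the threshold stays pinned at ulong.MaxValue per the linked issue, which effectively disables the native fallback: no copy length can compare greater than it. A tiny illustration of why that branch is unreachable:

```csharp
using System;

// Sketch only: with the threshold at ulong.MaxValue, `len > threshold`
// is false even for the largest representable length, so the P/Invoke
// fallback is never taken on that configuration.
const ulong threshold = ulong.MaxValue;
ulong len = ulong.MaxValue;
Console.WriteLine(len > threshold); // False
```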
279 changes: 19 additions & 260 deletions src/System.Private.CoreLib/shared/System/Buffer.cs
@@ -2,14 +2,10 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

#if AMD64 || ARM64 || (BIT32 && !ARM)
#define HAS_CUSTOM_BLOCKS
#endif

using System.Diagnostics;
using System.Runtime;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

using Internal.Runtime.CompilerServices;

@@ -20,6 +16,8 @@
using nint = System.Int32;
using nuint = System.UInt32;
#endif
using Block16 = System.Runtime.Intrinsics.Vector128<byte>;
using Block32 = System.Runtime.Intrinsics.Vector256<byte>;

namespace System
{
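The new Block16/Block32 aliases replace the old empty Block16/Block64 structs (deleted at the bottom of this file): 16- and 32-byte copies become Vector128&lt;byte&gt;/Vector256&lt;byte&gt; assignments, which the JIT lowers to unaligned SIMD moves where the hardware allows. A self-contained sketch of the idiom; the type and method names here are illustrative, not CoreLib's:

```csharp
using System.Runtime.CompilerServices;
using Block16 = System.Runtime.Intrinsics.Vector128<byte>;
using Block32 = System.Runtime.Intrinsics.Vector256<byte>;

static class BlockCopySketch
{
    // Copies exactly 48 bytes: one 32-byte block, then one 16-byte block.
    // Each struct-sized assignment compiles down to a SIMD load/store pair.
    public static void Copy48(ref byte dest, ref byte src)
    {
        Unsafe.As<byte, Block32>(ref dest) = Unsafe.As<byte, Block32>(ref src);
        Unsafe.As<byte, Block16>(ref Unsafe.Add(ref dest, 32)) =
            Unsafe.As<byte, Block16>(ref Unsafe.Add(ref src, 32));
    }
}
```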
@@ -87,7 +85,7 @@ internal static unsafe void ZeroMemory(byte* dest, nuint len)

// The attributes on this method are chosen for best JIT performance.
// Please do not edit unless intentional.
[MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CLSCompliant(false)]
public static unsafe void MemoryCopy(void* source, void* destination, long destinationSizeInBytes, long sourceBytesToCopy)
{
@@ -100,7 +98,7 @@ public static unsafe void MemoryCopy

// The attributes on this method are chosen for best JIT performance.
// Please do not edit unless intentional.
[MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CLSCompliant(false)]
public static unsafe void MemoryCopy(void* source, void* destination, ulong destinationSizeInBytes, ulong sourceBytesToCopy)
{
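For reference, a minimal usage sketch of the public MemoryCopy API these attribute cleanups touch; the third argument is the destination's capacity and guards against overruns:

```csharp
using System;

class MemoryCopyDemo
{
    static unsafe void Main()
    {
        byte[] srcArr = { 1, 2, 3, 4, 5 };
        byte[] dstArr = new byte[5];

        fixed (byte* src = srcArr)
        fixed (byte* dst = dstArr)
        {
            // (source, destination, destinationSizeInBytes, sourceBytesToCopy)
            Buffer.MemoryCopy(src, dst, dstArr.Length, srcArr.Length);
        }

        Console.WriteLine(string.Join(",", dstArr)); // 1,2,3,4,5
    }
}
```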
@@ -141,171 +139,7 @@ internal static unsafe void Memcpy

// This method has a different signature on x64 than on other platforms, for performance reasons.
internal static unsafe void Memmove(byte* dest, byte* src, nuint len)
{
// P/Invoke into the native version when the buffers are overlapping.
if (((nuint)dest - (nuint)src < len) || ((nuint)src - (nuint)dest < len))
{
goto PInvoke;
}

byte* srcEnd = src + len;
byte* destEnd = dest + len;

if (len <= 16) goto MCPY02;
if (len > 64) goto MCPY05;

MCPY00:
// Copy bytes which are multiples of 16 and leave the remainder for MCPY01 to handle.
Debug.Assert(len > 16 && len <= 64);
#if HAS_CUSTOM_BLOCKS
*(Block16*)dest = *(Block16*)src; // [0,16]
#elif BIT64
*(long*)dest = *(long*)src;
*(long*)(dest + 8) = *(long*)(src + 8); // [0,16]
#else
*(int*)dest = *(int*)src;
*(int*)(dest + 4) = *(int*)(src + 4);
*(int*)(dest + 8) = *(int*)(src + 8);
*(int*)(dest + 12) = *(int*)(src + 12); // [0,16]
#endif
if (len <= 32) goto MCPY01;
#if HAS_CUSTOM_BLOCKS
*(Block16*)(dest + 16) = *(Block16*)(src + 16); // [0,32]
#elif BIT64
*(long*)(dest + 16) = *(long*)(src + 16);
*(long*)(dest + 24) = *(long*)(src + 24); // [0,32]
#else
*(int*)(dest + 16) = *(int*)(src + 16);
*(int*)(dest + 20) = *(int*)(src + 20);
*(int*)(dest + 24) = *(int*)(src + 24);
*(int*)(dest + 28) = *(int*)(src + 28); // [0,32]
#endif
if (len <= 48) goto MCPY01;
#if HAS_CUSTOM_BLOCKS
*(Block16*)(dest + 32) = *(Block16*)(src + 32); // [0,48]
#elif BIT64
*(long*)(dest + 32) = *(long*)(src + 32);
*(long*)(dest + 40) = *(long*)(src + 40); // [0,48]
#else
*(int*)(dest + 32) = *(int*)(src + 32);
*(int*)(dest + 36) = *(int*)(src + 36);
*(int*)(dest + 40) = *(int*)(src + 40);
*(int*)(dest + 44) = *(int*)(src + 44); // [0,48]
#endif

MCPY01:
// Unconditionally copy the last 16 bytes using destEnd and srcEnd and return.
Debug.Assert(len > 16 && len <= 64);
#if HAS_CUSTOM_BLOCKS
*(Block16*)(destEnd - 16) = *(Block16*)(srcEnd - 16);
#elif BIT64
*(long*)(destEnd - 16) = *(long*)(srcEnd - 16);
*(long*)(destEnd - 8) = *(long*)(srcEnd - 8);
#else
*(int*)(destEnd - 16) = *(int*)(srcEnd - 16);
*(int*)(destEnd - 12) = *(int*)(srcEnd - 12);
*(int*)(destEnd - 8) = *(int*)(srcEnd - 8);
*(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
#endif
return;

MCPY02:
// Copy the first 8 bytes and then unconditionally copy the last 8 bytes and return.
if ((len & 24) == 0) goto MCPY03;
Debug.Assert(len >= 8 && len <= 16);
#if BIT64
*(long*)dest = *(long*)src;
*(long*)(destEnd - 8) = *(long*)(srcEnd - 8);
#else
*(int*)dest = *(int*)src;
*(int*)(dest + 4) = *(int*)(src + 4);
*(int*)(destEnd - 8) = *(int*)(srcEnd - 8);
*(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
#endif
return;

MCPY03:
// Copy the first 4 bytes and then unconditionally copy the last 4 bytes and return.
if ((len & 4) == 0) goto MCPY04;
Debug.Assert(len >= 4 && len < 8);
*(int*)dest = *(int*)src;
*(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
return;

MCPY04:
// Copy the first byte. For any remaining bytes, unconditionally copy the last 2 bytes and return.
Debug.Assert(len < 4);
if (len == 0) return;
*dest = *src;
if ((len & 2) == 0) return;
*(short*)(destEnd - 2) = *(short*)(srcEnd - 2);
return;

MCPY05:
// PInvoke to the native version when the copy length exceeds the threshold.
if (len > MemmoveNativeThreshold)
{
goto PInvoke;
}

// Copy 64-bytes at a time until the remainder is less than 64.
// If remainder is greater than 16 bytes, then jump to MCPY00. Otherwise, unconditionally copy the last 16 bytes and return.
Debug.Assert(len > 64 && len <= MemmoveNativeThreshold);
nuint n = len >> 6;

MCPY06:
#if HAS_CUSTOM_BLOCKS
*(Block64*)dest = *(Block64*)src;
#elif BIT64
*(long*)dest = *(long*)src;
*(long*)(dest + 8) = *(long*)(src + 8);
*(long*)(dest + 16) = *(long*)(src + 16);
*(long*)(dest + 24) = *(long*)(src + 24);
*(long*)(dest + 32) = *(long*)(src + 32);
*(long*)(dest + 40) = *(long*)(src + 40);
*(long*)(dest + 48) = *(long*)(src + 48);
*(long*)(dest + 56) = *(long*)(src + 56);
#else
*(int*)dest = *(int*)src;
*(int*)(dest + 4) = *(int*)(src + 4);
*(int*)(dest + 8) = *(int*)(src + 8);
*(int*)(dest + 12) = *(int*)(src + 12);
*(int*)(dest + 16) = *(int*)(src + 16);
*(int*)(dest + 20) = *(int*)(src + 20);
*(int*)(dest + 24) = *(int*)(src + 24);
*(int*)(dest + 28) = *(int*)(src + 28);
*(int*)(dest + 32) = *(int*)(src + 32);
*(int*)(dest + 36) = *(int*)(src + 36);
*(int*)(dest + 40) = *(int*)(src + 40);
*(int*)(dest + 44) = *(int*)(src + 44);
*(int*)(dest + 48) = *(int*)(src + 48);
*(int*)(dest + 52) = *(int*)(src + 52);
*(int*)(dest + 56) = *(int*)(src + 56);
*(int*)(dest + 60) = *(int*)(src + 60);
#endif
dest += 64;
src += 64;
n--;
if (n != 0) goto MCPY06;

len %= 64;
if (len > 16) goto MCPY00;
#if HAS_CUSTOM_BLOCKS
*(Block16*)(destEnd - 16) = *(Block16*)(srcEnd - 16);
#elif BIT64
*(long*)(destEnd - 16) = *(long*)(srcEnd - 16);
*(long*)(destEnd - 8) = *(long*)(srcEnd - 8);
#else
*(int*)(destEnd - 16) = *(int*)(srcEnd - 16);
*(int*)(destEnd - 12) = *(int*)(srcEnd - 12);
*(int*)(destEnd - 8) = *(int*)(srcEnd - 8);
*(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
#endif
return;

PInvoke:
_Memmove(dest, src, len);
}
=> Memmove(ref Unsafe.AsRef<byte>(dest), ref Unsafe.AsRef<byte>(src), len);

// This method has a different signature on x64 than on other platforms, for performance reasons.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
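The entire pointer-based MCPY routine above is deleted; the pointer overload now forwards to the ref-based implementation, so a single copy loop serves both entry points. The reinterpretation idiom in isolation, with hypothetical names and a bytewise loop standing in for the real block copy:

```csharp
using System.Runtime.CompilerServices;

static class ForwardingSketch
{
    // The unsafe overload reinterprets its pointers as managed refs
    // and forwards, mirroring the shape of the change above.
    public static unsafe void Copy(byte* dest, byte* src, nuint len)
        => Copy(ref Unsafe.AsRef<byte>(dest), ref Unsafe.AsRef<byte>(src), len);

    private static void Copy(ref byte dest, ref byte src, nuint len)
    {
        for (nuint i = 0; i < len; i++)
            Unsafe.Add(ref dest, (nint)i) = Unsafe.Add(ref src, (nint)i);
    }
}
```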
@@ -358,58 +192,25 @@ private static void Memmove(ref byte dest, ref byte src, nuint len)
MCPY00:
// Copy bytes which are multiples of 16 and leave the remainder for MCPY01 to handle.
Debug.Assert(len > 16 && len <= 64);
#if HAS_CUSTOM_BLOCKS

Unsafe.As<byte, Block16>(ref dest) = Unsafe.As<byte, Block16>(ref src); // [0,16]
#elif BIT64
Unsafe.As<byte, long>(ref dest) = Unsafe.As<byte, long>(ref src);
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 8)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 8)); // [0,16]
#else
Unsafe.As<byte, int>(ref dest) = Unsafe.As<byte, int>(ref src);
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 4)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 4));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 8)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 8));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 12)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 12)); // [0,16]
#endif

if (len <= 32)
goto MCPY01;
#if HAS_CUSTOM_BLOCKS

Unsafe.As<byte, Block16>(ref Unsafe.Add(ref dest, 16)) = Unsafe.As<byte, Block16>(ref Unsafe.Add(ref src, 16)); // [0,32]
#elif BIT64
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 16)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 16));
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 24)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 24)); // [0,32]
#else
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 16)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 16));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 20)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 20));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 24)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 24));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 28)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 28)); // [0,32]
#endif

if (len <= 48)
goto MCPY01;
#if HAS_CUSTOM_BLOCKS

Unsafe.As<byte, Block16>(ref Unsafe.Add(ref dest, 32)) = Unsafe.As<byte, Block16>(ref Unsafe.Add(ref src, 32)); // [0,48]
#elif BIT64
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 32)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 32));
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 40)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 40)); // [0,48]
#else
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 32)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 32));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 36)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 36));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 40)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 40));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 44)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 44)); // [0,48]
#endif

MCPY01:
// Unconditionally copy the last 16 bytes using destEnd and srcEnd and return.
Debug.Assert(len > 16 && len <= 64);
#if HAS_CUSTOM_BLOCKS

Unsafe.As<byte, Block16>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, Block16>(ref Unsafe.Add(ref srcEnd, -16));
#elif BIT64
Unsafe.As<byte, long>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref srcEnd, -16));
Unsafe.As<byte, long>(ref Unsafe.Add(ref destEnd, -8)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref srcEnd, -8));
#else
Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -16));
Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -12)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -12));
Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -8)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -8));
Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -4)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -4));
#endif

return;

MCPY02:
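Worth spelling out for MCPY00/MCPY01: the code copies 16-byte blocks from the front, then unconditionally stores the final 16 bytes addressed from the end. That last store may rewrite a few bytes already copied, which is harmless because this path only runs for non-overlapping buffers, and it avoids any per-byte tail loop. A hedged standalone sketch for lengths in (16, 64]:

```csharp
using System.Runtime.CompilerServices;
using Block16 = System.Runtime.Intrinsics.Vector128<byte>;

static class TailCopySketch
{
    // Sketch of the MCPY00/MCPY01 shape; valid for 16 < len <= 64.
    public static void Copy(ref byte dest, ref byte src, nuint len)
    {
        Unsafe.As<byte, Block16>(ref dest) = Unsafe.As<byte, Block16>(ref src); // [0,16)
        if (len > 32)
            Unsafe.As<byte, Block16>(ref Unsafe.Add(ref dest, 16)) =
                Unsafe.As<byte, Block16>(ref Unsafe.Add(ref src, 16));          // [16,32)
        if (len > 48)
            Unsafe.As<byte, Block16>(ref Unsafe.Add(ref dest, 32)) =
                Unsafe.As<byte, Block16>(ref Unsafe.Add(ref src, 32));          // [32,48)

        // Final 16 bytes, addressed from the end; may overlap earlier stores.
        Unsafe.As<byte, Block16>(ref Unsafe.Add(ref dest, (nint)(len - 16))) =
            Unsafe.As<byte, Block16>(ref Unsafe.Add(ref src, (nint)(len - 16)));
    }
}
```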
Expand Down Expand Up @@ -461,35 +262,9 @@ private static void Memmove(ref byte dest, ref byte src, nuint len)
nuint n = len >> 6;

MCPY06:
#if HAS_CUSTOM_BLOCKS
Unsafe.As<byte, Block64>(ref dest) = Unsafe.As<byte, Block64>(ref src);
#elif BIT64
Unsafe.As<byte, long>(ref dest) = Unsafe.As<byte, long>(ref src);
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 8)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 8));
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 16)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 16));
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 24)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 24));
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 32)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 32));
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 40)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 40));
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 48)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 48));
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 56)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 56));
#else
Unsafe.As<byte, int>(ref dest) = Unsafe.As<byte, int>(ref src);
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 4)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 4));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 8)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 8));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 12)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 12));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 16)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 16));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 20)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 20));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 24)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 24));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 28)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 28));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 32)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 32));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 36)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 36));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 40)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 40));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 44)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 44));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 48)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 48));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 52)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 52));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 56)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 56));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 60)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 60));
#endif
Unsafe.As<byte, Block32>(ref dest) = Unsafe.As<byte, Block32>(ref src);
Unsafe.As<byte, Block32>(ref Unsafe.Add(ref dest, 32)) = Unsafe.As<byte, Block32>(ref Unsafe.Add(ref src, 32));

dest = ref Unsafe.Add(ref dest, 64);
src = ref Unsafe.Add(ref src, 64);
n--;
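This is the heart of the diff: the 64-byte main loop collapses from eight long stores (sixteen int stores on 32-bit, or a custom Block64 struct) to two Vector256&lt;byte&gt; assignments per iteration. The loop shape in isolation; the sketch assumes non-overlapping buffers and, for brevity, len a multiple of 64:

```csharp
using System.Runtime.CompilerServices;
using Block32 = System.Runtime.Intrinsics.Vector256<byte>;

static class Loop64Sketch
{
    public static void Copy(ref byte dest, ref byte src, nuint len)
    {
        // Two 32-byte SIMD copies move 64 bytes per iteration.
        for (nuint n = len >> 6; n != 0; n--)
        {
            Unsafe.As<byte, Block32>(ref dest) = Unsafe.As<byte, Block32>(ref src);
            Unsafe.As<byte, Block32>(ref Unsafe.Add(ref dest, 32)) =
                Unsafe.As<byte, Block32>(ref Unsafe.Add(ref src, 32));

            dest = ref Unsafe.Add(ref dest, 64);
            src = ref Unsafe.Add(ref src, 64);
        }
    }
}
```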
@@ -499,17 +274,9 @@ private static void Memmove(ref byte dest, ref byte src, nuint len)
len %= 64;
if (len > 16)
goto MCPY00;
#if HAS_CUSTOM_BLOCKS

Unsafe.As<byte, Block16>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, Block16>(ref Unsafe.Add(ref srcEnd, -16));
#elif BIT64
Unsafe.As<byte, long>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref srcEnd, -16));
Unsafe.As<byte, long>(ref Unsafe.Add(ref destEnd, -8)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref srcEnd, -8));
#else
Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -16));
Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -12)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -12));
Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -8)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -8));
Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -4)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -4));
#endif

return;

BuffersOverlap:
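The BuffersOverlap path is reached via the unsigned-distance trick: if either pointer difference, reinterpreted as unsigned, is smaller than len, the two regions overlap and the copy must be routed to the native memmove, which handles overlap correctly. A small verification sketch:

```csharp
using System;

static class OverlapSketch
{
    // True when [dest, dest+len) and [src, src+len) overlap. Unsigned
    // wraparound lets two subtractions cover both pointer orderings.
    static unsafe bool Overlaps(byte* dest, byte* src, nuint len)
        => (nuint)dest - (nuint)src < len || (nuint)src - (nuint)dest < len;

    static unsafe void Main()
    {
        byte* p = stackalloc byte[32];
        Console.WriteLine(Overlaps(p, p + 8, 16));  // True:  ranges share 8 bytes
        Console.WriteLine(Overlaps(p, p + 16, 16)); // False: ranges are adjacent
    }
}
```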
@@ -525,28 +292,20 @@ private static void Memmove(ref byte dest, ref byte src, nuint len)

// Non-inlinable wrapper around the QCall that avoids polluting the fast path
// with P/Invoke prolog/epilog.
[MethodImplAttribute(MethodImplOptions.NoInlining)]
[MethodImpl(MethodImplOptions.NoInlining)]
private static unsafe void _Memmove(byte* dest, byte* src, nuint len)
{
__Memmove(dest, src, len);
}

// Non-inlinable wrapper around the QCall that avoids polluting the fast path
// with P/Invoke prolog/epilog.
[MethodImplAttribute(MethodImplOptions.NoInlining)]
[MethodImpl(MethodImplOptions.NoInlining)]
private static unsafe void _Memmove(ref byte dest, ref byte src, nuint len)
{
fixed (byte* pDest = &dest)
fixed (byte* pSrc = &src)
__Memmove(pDest, pSrc, len);
}

#if HAS_CUSTOM_BLOCKS
[StructLayout(LayoutKind.Sequential, Size = 16)]
private struct Block16 { }

[StructLayout(LayoutKind.Sequential, Size = 64)]
private struct Block64 { }
#endif // HAS_CUSTOM_BLOCKS
}
}
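Finally, the ref-based _Memmove wrapper shows the pinning pattern: managed refs must be fixed into stable pointers before crossing into native code, and NoInlining keeps the P/Invoke prolog/epilog out of callers on the hot path. The pattern in isolation; the libc import is a stand-in assumption for the runtime's internal __Memmove QCall:

```csharp
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

static class PinningSketch
{
    // NoInlining keeps interop frame setup out of the caller's fast path.
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static unsafe void NativeCopy(ref byte dest, ref byte src, nuint len)
    {
        // fixed pins the refs so the GC cannot relocate them mid-call.
        fixed (byte* pDest = &dest)
        fixed (byte* pSrc = &src)
            Memmove(pDest, pSrc, len);
    }

    [DllImport("libc", EntryPoint = "memmove")]
    private static extern unsafe void* Memmove(void* dest, void* src, nuint len);
}
```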