Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions src/Microsoft.Data.Analysis/DataFrame.Arrow.cs
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,19 @@ private static void AppendDataFrameColumnFromArrowArray(Field field, IArrowArray
AppendDataFrameColumnFromArrowArray(fieldsEnumerator.Current, structArrayEnumerator.Current, ret, field.Name + "_");
}
break;
#if NET8_0_OR_GREATER
case ArrowTypeId.Date32:
{
Date32Array arrowDate32Array = (Date32Array)arrowArray;
DateOnlyDataFrameColumn dateTimeDataFrameColumn = new DateOnlyDataFrameColumn(fieldName, arrowDate32Array.Data.Length);
for (int i = 0; i < arrowDate32Array.Data.Length; ++i)
{
dateTimeDataFrameColumn[i] = arrowDate32Array.GetDateOnly(i);
}
dataFrameColumn = dateTimeDataFrameColumn;
}
break;
#endif
case ArrowTypeId.Date64:
{
Date64Array arrowDate64Array = (Date64Array)arrowArray;
Expand All @@ -115,18 +128,20 @@ private static void AppendDataFrameColumnFromArrowArray(Field field, IArrowArray
case ArrowTypeId.Timestamp:
{
TimestampArray arrowTimeStampArray = (TimestampArray)arrowArray;
var dataTimeDataFrameColumn = new DateTimeDataFrameColumn(fieldName, arrowTimeStampArray.Data.Length);
for (int i = 0; i < arrowTimeStampArray.Data.Length; i++)
DateTimeOffsetDataFrameColumn dataTimeDataFrameColumn = new DateTimeOffsetDataFrameColumn(fieldName, arrowTimeStampArray.Data.Length);
for (int i = 0; i < arrowTimeStampArray.Data.Length; ++i)
{
dataTimeDataFrameColumn[i] = arrowTimeStampArray.GetTimestamp(i)?.DateTime;
dataTimeDataFrameColumn[i] = arrowTimeStampArray.GetTimestamp(i);
}
dataFrameColumn = dataTimeDataFrameColumn;
}
break;
case ArrowTypeId.Decimal128:
case ArrowTypeId.Decimal256:
case ArrowTypeId.Binary:
#if !NET8_0_OR_GREATER
case ArrowTypeId.Date32:
#endif
case ArrowTypeId.Dictionary:
case ArrowTypeId.FixedSizedBinary:
case ArrowTypeId.HalfFloat:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

#if NET8_0_OR_GREATER
using System;
using System.Collections.Generic;

namespace Microsoft.Data.Analysis
{
/// <summary>
/// A <see cref="PrimitiveDataFrameColumn{T}"/> specialized for <see cref="DateOnly"/> values.
/// Every constructor simply forwards its arguments to the matching base-class constructor.
/// </summary>
public partial class DateOnlyDataFrameColumn : PrimitiveDataFrameColumn<DateOnly>
{
    /// <summary>
    /// Creates a column called <paramref name="name"/> from a sequence of nullable dates.
    /// </summary>
    /// <param name="name">The column name.</param>
    /// <param name="values">The nullable values handed to the base constructor.</param>
    public DateOnlyDataFrameColumn(string name, IEnumerable<DateOnly?> values)
        : base(name, values)
    {
    }

    /// <summary>
    /// Creates a column called <paramref name="name"/> from a sequence of non-nullable dates.
    /// </summary>
    /// <param name="name">The column name.</param>
    /// <param name="values">The values handed to the base constructor.</param>
    public DateOnlyDataFrameColumn(string name, IEnumerable<DateOnly> values)
        : base(name, values)
    {
    }

    /// <summary>
    /// Creates a column called <paramref name="name"/> with the requested <paramref name="length"/>.
    /// </summary>
    /// <param name="name">The column name.</param>
    /// <param name="length">The length handed to the base constructor; defaults to 0.</param>
    public DateOnlyDataFrameColumn(string name, long length = 0)
        : base(name, length)
    {
    }

    /// <summary>
    /// Creates a column called <paramref name="name"/> from a raw value buffer and a null bitmap.
    /// </summary>
    /// <param name="name">The column name.</param>
    /// <param name="buffer">The value bytes handed to the base constructor.</param>
    /// <param name="nullBitMap">The null bitmap bytes handed to the base constructor.</param>
    /// <param name="length">The length handed to the base constructor; defaults to 0.</param>
    /// <param name="nullCount">The null count handed to the base constructor; defaults to 0.</param>
    public DateOnlyDataFrameColumn(
        string name,
        ReadOnlyMemory<byte> buffer,
        ReadOnlyMemory<byte> nullBitMap,
        int length = 0,
        int nullCount = 0)
        : base(name, buffer, nullBitMap, length, nullCount)
    {
    }

    /// <summary>
    /// Creates a column called <paramref name="name"/> around an existing container.
    /// </summary>
    internal DateOnlyDataFrameColumn(string name, PrimitiveColumnContainer<DateOnly> values)
        : base(name, values)
    {
    }

    /// <summary>
    /// Overrides the base factory so that the new column is a <see cref="DateOnlyDataFrameColumn"/>.
    /// </summary>
    /// <param name="name">The name of the new column.</param>
    /// <param name="length">The length of the new column; defaults to 0.</param>
    /// <returns>A new <see cref="DateOnlyDataFrameColumn"/> built from the given name and length.</returns>
    protected override PrimitiveDataFrameColumn<DateOnly> CreateNewColumn(string name, long length = 0)
        => new DateOnlyDataFrameColumn(name, length);

    /// <summary>
    /// Overrides the base factory so that the column created around <paramref name="container"/>
    /// is a <see cref="DateOnlyDataFrameColumn"/>.
    /// </summary>
    internal override PrimitiveDataFrameColumn<DateOnly> CreateNewColumn(string name, PrimitiveColumnContainer<DateOnly> container)
        => new DateOnlyDataFrameColumn(name, container);
}
}
#endif
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

using System;
using System.Collections.Generic;
using System.Text;

namespace Microsoft.Data.Analysis
{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;

namespace Microsoft.Data.Analysis
{
/// <summary>
/// A <see cref="PrimitiveDataFrameColumn{T}"/> specialized for <see cref="DateTimeOffset"/> values.
/// Every constructor simply forwards its arguments to the matching base-class constructor.
/// </summary>
public partial class DateTimeOffsetDataFrameColumn : PrimitiveDataFrameColumn<DateTimeOffset>
{
    /// <summary>
    /// Creates a column called <paramref name="name"/> from a sequence of nullable timestamps.
    /// </summary>
    /// <param name="name">The column name.</param>
    /// <param name="values">The nullable values handed to the base constructor.</param>
    public DateTimeOffsetDataFrameColumn(string name, IEnumerable<DateTimeOffset?> values)
        : base(name, values)
    {
    }

    /// <summary>
    /// Creates a column called <paramref name="name"/> from a sequence of non-nullable timestamps.
    /// </summary>
    /// <param name="name">The column name.</param>
    /// <param name="values">The values handed to the base constructor.</param>
    public DateTimeOffsetDataFrameColumn(string name, IEnumerable<DateTimeOffset> values)
        : base(name, values)
    {
    }

    /// <summary>
    /// Creates a column called <paramref name="name"/> with the requested <paramref name="length"/>.
    /// </summary>
    /// <param name="name">The column name.</param>
    /// <param name="length">The length handed to the base constructor; defaults to 0.</param>
    public DateTimeOffsetDataFrameColumn(string name, long length = 0)
        : base(name, length)
    {
    }

    /// <summary>
    /// Creates a column called <paramref name="name"/> from a raw value buffer and a null bitmap.
    /// </summary>
    /// <param name="name">The column name.</param>
    /// <param name="buffer">The value bytes handed to the base constructor.</param>
    /// <param name="nullBitMap">The null bitmap bytes handed to the base constructor.</param>
    /// <param name="length">The length handed to the base constructor; defaults to 0.</param>
    /// <param name="nullCount">The null count handed to the base constructor; defaults to 0.</param>
    public DateTimeOffsetDataFrameColumn(
        string name,
        ReadOnlyMemory<byte> buffer,
        ReadOnlyMemory<byte> nullBitMap,
        int length = 0,
        int nullCount = 0)
        : base(name, buffer, nullBitMap, length, nullCount)
    {
    }

    /// <summary>
    /// Creates a column called <paramref name="name"/> around an existing container.
    /// </summary>
    internal DateTimeOffsetDataFrameColumn(string name, PrimitiveColumnContainer<DateTimeOffset> values)
        : base(name, values)
    {
    }

    /// <summary>
    /// Overrides the base factory so that the new column is a <see cref="DateTimeOffsetDataFrameColumn"/>.
    /// </summary>
    /// <param name="name">The name of the new column.</param>
    /// <param name="length">The length of the new column; defaults to 0.</param>
    /// <returns>A new <see cref="DateTimeOffsetDataFrameColumn"/> built from the given name and length.</returns>
    protected override PrimitiveDataFrameColumn<DateTimeOffset> CreateNewColumn(string name, long length = 0)
        => new DateTimeOffsetDataFrameColumn(name, length);

    /// <summary>
    /// Overrides the base factory so that the column created around <paramref name="container"/>
    /// is a <see cref="DateTimeOffsetDataFrameColumn"/>.
    /// </summary>
    internal override PrimitiveDataFrameColumn<DateTimeOffset> CreateNewColumn(string name, PrimitiveColumnContainer<DateTimeOffset> container)
        => new DateTimeOffsetDataFrameColumn(name, container);
}
}
4 changes: 2 additions & 2 deletions src/Microsoft.Data.Analysis/Microsoft.Data.Analysis.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
<PackageReleaseNotes>Initial preview of robust and extensible types and algorithms for manipulating structured data that supports aggregations, statistical functions, sorting, grouping, joins, merges, handling missing values and more. </PackageReleaseNotes>
<PackageTags>ML.NET ML Machine Learning Data Science DataFrame Preparation DataView Analytics Exploration</PackageTags>
<GenerateDocumentationFile>true</GenerateDocumentationFile>
<!--
<!--
1591: Documentation warnings
NU5100: Warning that gets triggered because a .dll is not placed under lib folder on package. This is by design as we want MDAI to be under interactive-extensions folder.
-->
Expand Down Expand Up @@ -44,7 +44,7 @@
<DependentUpon>PrimitiveDataFrameColumn.BinaryOperators.tt</DependentUpon>
</None>
</ItemGroup>

<ItemGroup Condition="'$(TargetFramework)' == 'net8.0'">
<PackageReference Include="System.Numerics.Tensors" Version="9.0.0-preview.6.24327.7" />
</ItemGroup>
Expand Down
62 changes: 57 additions & 5 deletions src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,12 @@ private IArrowType GetArrowType()
return UInt16Type.Default;
else if (typeof(T) == typeof(DateTime))
return Date64Type.Default;
else if (typeof(T) == typeof(DateTimeOffset))
return TimestampType.Default;
#if NET8_0_OR_GREATER
else if (typeof(T) == typeof(DateOnly))
return Date32Type.Default;
#endif
else
throw new NotImplementedException(nameof(T));
}
Expand Down Expand Up @@ -138,19 +144,19 @@ protected internal override Apache.Arrow.Array ToArrowArray(long startIndex, int

int nullCount = GetNullCount(startIndex, numberOfRows);

//DateTime requires convertion
// DateTime requires conversion
if (this.DataType == typeof(DateTime))
{
if (numberOfRows == 0)
return new Date64Array(ArrowBuffer.Empty, ArrowBuffer.Empty, numberOfRows, nullCount, offset);

ReadOnlyDataFrameBuffer<T> valueBuffer = (numberOfRows == 0) ? null : _columnContainer.Buffers[bufferIndex];
ReadOnlyDataFrameBuffer<byte> nullBuffer = (numberOfRows == 0) ? null : _columnContainer.NullBitMapBuffers[bufferIndex];
ReadOnlyDataFrameBuffer<T> valueBuffer = _columnContainer.Buffers[bufferIndex];
ReadOnlyDataFrameBuffer<byte> nullBuffer = _columnContainer.NullBitMapBuffers[bufferIndex];

ReadOnlySpan<DateTime> valueSpan = MemoryMarshal.Cast<T, DateTime>(valueBuffer.ReadOnlySpan);
Date64Array.Builder builder = new Date64Array.Builder().Reserve(valueBuffer.Length);

for (int i = 0; i < valueBuffer.Length; i++)
for (int i = 0; i < valueBuffer.Length; ++i)
{
if (BitUtility.GetBit(nullBuffer.ReadOnlySpan, i))
builder.Append(valueSpan[i]);
Expand All @@ -161,7 +167,53 @@ protected internal override Apache.Arrow.Array ToArrowArray(long startIndex, int
return builder.Build();
}

//No convertion
if (this.DataType == typeof(DateTimeOffset))
{
if (numberOfRows == 0)
return new TimestampArray(TimestampType.Default, ArrowBuffer.Empty, ArrowBuffer.Empty, numberOfRows, nullCount, offset);

ReadOnlyDataFrameBuffer<T> valueBuffer = _columnContainer.Buffers[bufferIndex];
ReadOnlyDataFrameBuffer<byte> nullBuffer = _columnContainer.NullBitMapBuffers[bufferIndex];

ReadOnlySpan<DateTimeOffset> valueSpan = MemoryMarshal.Cast<T, DateTimeOffset>(valueBuffer.ReadOnlySpan);
TimestampArray.Builder builder = new TimestampArray.Builder().Reserve(valueBuffer.Length);

for (int i = 0; i < valueBuffer.Length; ++i)
{
if (BitUtility.GetBit(nullBuffer.ReadOnlySpan, i))
builder.Append(valueSpan[i]);
else
builder.AppendNull();
}

return builder.Build();
}

#if NET8_0_OR_GREATER
if (this.DataType == typeof(DateOnly))
{
if (numberOfRows == 0)
return new Date32Array(ArrowBuffer.Empty, ArrowBuffer.Empty, numberOfRows, nullCount, offset);

ReadOnlyDataFrameBuffer<T> valueBuffer = _columnContainer.Buffers[bufferIndex];
ReadOnlyDataFrameBuffer<byte> nullBuffer = _columnContainer.NullBitMapBuffers[bufferIndex];

ReadOnlySpan<DateOnly> valueSpan = MemoryMarshal.Cast<T, DateOnly>(valueBuffer.ReadOnlySpan);
Date32Array.Builder builder = new Date32Array.Builder().Reserve(valueBuffer.Length);

for (int i = 0; i < valueBuffer.Length; ++i)
{
if (BitUtility.GetBit(nullBuffer.ReadOnlySpan, i))
builder.Append(valueSpan[i]);
else
builder.AppendNull();
}

return builder.Build();
}
#endif

// No conversion
ArrowBuffer arrowValueBuffer = numberOfRows == 0 ? ArrowBuffer.Empty : new ArrowBuffer(_columnContainer.Buffers[bufferIndex].ReadOnlyBuffer);
ArrowBuffer arrowNullBuffer = numberOfRows == 0 ? ArrowBuffer.Empty : new ArrowBuffer(_columnContainer.NullBitMapBuffers[bufferIndex].ReadOnlyBuffer);

Expand Down
6 changes: 5 additions & 1 deletion test/Microsoft.Data.Analysis.Tests/ArrayComparer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -88,15 +88,19 @@ private void CompareArrays<T>(PrimitiveArray<T> actualArray)
}
else
{
// expectedArray may have passed in a null bitmap. DataFrame might have populated it with Length set bits
// expectedArray may have passed in a null bitmap. DataFrame might have populated it with Length set bits
Assert.Equal(0, expectedArray.NullCount);
Assert.Equal(0, actualArray.NullCount);
for (int i = 0; i < actualArray.Length; i++)
{
Assert.True(actualArray.IsValid(i));
}
}
#if NET8_0_OR_GREATER
Assert.Equal(expectedArray.Values[..expectedArray.Length], actualArray.Values[..actualArray.Length]);
#else
Assert.True(expectedArray.Values.Slice(0, expectedArray.Length).SequenceEqual(actualArray.Values.Slice(0, actualArray.Length)));
#endif
}

private void CompareArrays(BooleanArray actualArray)
Expand Down
4 changes: 4 additions & 0 deletions test/Microsoft.Data.Analysis.Tests/ArrowIntegrationTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ public void TestArrowIntegration()
.Append("ByteColumn", false, new Int8Array.Builder().AppendRange(Enumerable.Repeat((sbyte)1, 10)).Build())
.Append("UByteColumn", false, new UInt8Array.Builder().AppendRange(Enumerable.Repeat((byte)1, 10)).Build())
.Append("Date64Column", false, new Date64Array.Builder().AppendRange(Enumerable.Repeat(DateTime.Now, 10)).Build())
.Append("TimestampColumn", false, new TimestampArray.Builder().AppendRange(Enumerable.Repeat(DateTimeOffset.Now, 10)).Build())
#if NET8_0_OR_GREATER
.Append("DateColumn", false, new Date32Array.Builder().AppendRange(Enumerable.Repeat(DateTime.Now, 10)).Build())
#endif
.Build();

DataFrame df = DataFrame.FromArrowRecordBatch(originalBatch);
Expand Down
29 changes: 19 additions & 10 deletions test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
using System.Data.SQLite.EF6;
using Xunit;
using Microsoft.ML.TestFramework.Attributes;
using System.Threading;
using Microsoft.ML.Data;
using System.Threading.Tasks;

Expand Down Expand Up @@ -107,6 +106,16 @@ internal static void VerifyColumnTypes(DataFrame df, bool testArrowStringColumn
{
Assert.IsType<DateTimeDataFrameColumn>(column);
}
else if (dataType == typeof(DateTimeOffset))
{
Assert.IsType<DateTimeOffsetDataFrameColumn>(column);
}
#if NET8_0_OR_GREATER
else if (dataType == typeof(DateOnly))
{
Assert.IsType<DateOnlyDataFrameColumn>(column);
}
#endif
else
{
throw new NotImplementedException("Unit test has to be updated");
Expand Down Expand Up @@ -354,17 +363,17 @@ void VerifyDataFrameWithNamedColumnsAndDataTypes(DataFrame df, bool verifyColumn
public void TestLoadCsvWithTypesAndGuessRows(bool header, int guessRows)
{
/* Tests this matrix
*
header GuessRows DataTypes
True 0 NotNull
False 0 NotNull
True 10 NotNull
False 10 NotNull
*
header GuessRows DataTypes
True 0 NotNull
False 0 NotNull
True 10 NotNull
False 10 NotNull
True 0 Null -----> Throws an exception
False 0 Null -----> Throws an exception
True 10 Null
False 10 Null
*
True 10 Null
False 10 Null
*
*/
string headerLine = @"vendor_id,rate_code,passenger_count,trip_time_in_secs,trip_distance,payment_type,fare_amount
";
Expand Down
Loading