Skip to content

Commit 947b3f8

Browse files
authored
Updated CopyColumns, DropColumns and SelectColumns samples. (#3268)
1 parent e0c029c commit 947b3f8

File tree

3 files changed

+104
-113
lines changed

3 files changed

+104
-113
lines changed

docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CopyColumns.cs

Lines changed: 31 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -12,72 +12,58 @@ public static void Example()
1212
// as well as the source of randomness.
1313
var mlContext = new MLContext();
1414

15-
// Get a small dataset as an IEnumerable and them read it as ML.NET's data type.
16-
IEnumerable<Microsoft.ML.SamplesUtils.DatasetUtils.SampleInfertData> data = Microsoft.ML.SamplesUtils.DatasetUtils.GetInfertData();
17-
var trainData = mlContext.Data.LoadFromEnumerable(data);
15+
// Create a small dataset as an IEnumerable.
16+
var samples = new List<InputData>()
17+
{
18+
new InputData(){ ImageId = 1, Features = new [] { 1.0f, 1.0f, 1.0f} },
19+
new InputData(){ ImageId = 2, Features = new [] { 2.0f, 2.0f, 2.0f} },
20+
new InputData(){ ImageId = 3, Features = new [] { 3.0f, 3.0f, 3.0f} },
21+
new InputData(){ ImageId = 4, Features = new [] { 4.0f, 4.0f, 4.0f} },
22+
new InputData(){ ImageId = 5, Features = new [] { 5.0f, 5.0f, 5.0f} },
23+
new InputData(){ ImageId = 6, Features = new [] { 6.0f, 6.0f, 6.0f} },
24+
};
1825

19-
// Preview of the data.
20-
//
21-
// Age Case Education induced parity pooled.stratum row_num ...
22-
// 26.0 1.0 0-5yrs 1.0 6.0 3.0 1.0 ...
23-
// 42.0 1.0 0-5yrs 1.0 1.0 1.0 2.0 ...
24-
// 39.0 1.0 0-5yrs 2.0 6.0 4.0 3.0 ...
25-
// 34.0 1.0 0-5yrs 2.0 4.0 2.0 4.0 ...
26-
// 35.0 1.0 6-11yrs 1.0 3.0 32.0 5.0 ...
26+
// Convert training data to IDataView.
27+
var dataview = mlContext.Data.LoadFromEnumerable(samples);
2728

2829
// CopyColumns is commonly used to rename columns.
29-
// For example, if you want to train towards Age, and your learner expects a "Label" column, you can
30-
// use CopyColumns to rename Age to Label. Technically, the Age columns still exists, but it won't be
30+
// For example, if you want to train towards ImageId, and your trainer expects a "Label" column, you can
31+
// use CopyColumns to rename ImageId to Label. Technically, the ImageId column still exists, but it won't be
3132
// materialized unless you actually need it somewhere (e.g. if you were to save the transformed data
3233
// without explicitly dropping the column). This is a general property of IDataView's lazy evaluation.
33-
string labelColumnName = "Label";
34-
var pipeline = mlContext.Transforms.CopyColumns(labelColumnName, "Age") as IEstimator<ITransformer>;
35-
36-
// You also may want to copy a column to perform some hand-featurization using built-in transforms or
37-
// a CustomMapping transform. For example, we could make an indicator variable if a feature, such as Parity
38-
// goes above some threshold. We simply copy the Parity column to a new column, then pass it through a custom function.
39-
Action<InputRow, OutputRow> mapping = (input, output) =>output.CustomValue = input.CustomValue > 4 ? 1 : 0;
40-
pipeline = pipeline.Append(mlContext.Transforms.CopyColumns("CustomValue", "Parity"))
41-
.Append(mlContext.Transforms.CustomMapping(mapping, null));
34+
var pipeline = mlContext.Transforms.CopyColumns("Label", "ImageId");
4235

4336
// Now we can transform the data and look at the output to confirm the behavior of CopyColumns.
4437
// Don't forget that this operation doesn't actually evaluate data until we read the data below.
45-
var transformedData = pipeline.Fit(trainData).Transform(trainData);
38+
var transformedData = pipeline.Fit(dataview).Transform(dataview);
4639

4740
// We can extract the newly created column as an IEnumerable of TransformedData, the class we define below.
48-
var rowEnumerable = mlContext.Data.CreateEnumerable<SampleInfertDataTransformed>(transformedData, reuseRowObject: false);
41+
var rowEnumerable = mlContext.Data.CreateEnumerable<TransformedData>(transformedData, reuseRowObject: false);
4942

5043
// And finally, we can write out the rows of the dataset, looking at the columns of interest.
51-
Console.WriteLine($"Label, Parity, and CustomValue columns obtained post-transformation.");
44+
Console.WriteLine($"Label and ImageId columns obtained post-transformation.");
5245
foreach (var row in rowEnumerable)
53-
{
54-
Console.WriteLine($"Label: {row.Label} Parity: {row.Parity} CustomValue: {row.CustomValue}");
55-
}
46+
Console.WriteLine($"Label: {row.Label} ImageId: {row.ImageId}");
5647

5748
// Expected output:
58-
// Label, Parity, and CustomValue columns obtained post-transformation.
59-
// Label: 26 Parity: 6 CustomValue: 1
60-
// Label: 42 Parity: 1 CustomValue: 0
61-
// Label: 39 Parity: 6 CustomValue: 1
62-
// Label: 34 Parity: 4 CustomValue: 0
63-
// Label: 35 Parity: 3 CustomValue: 0
64-
}
65-
66-
private class SampleInfertDataTransformed
67-
{
68-
public float Label { get; set; }
69-
public float Parity { get; set; }
70-
public float CustomValue { get; set; }
49+
// Label and ImageId columns obtained post-transformation.
50+
// Label: 1 ImageId: 1
51+
// Label: 2 ImageId: 2
52+
// Label: 3 ImageId: 3
53+
// Label: 4 ImageId: 4
54+
// Label: 5 ImageId: 5
55+
// Label: 6 ImageId: 6
7156
}
7257

73-
private class OutputRow
58+
private class InputData
7459
{
75-
public float CustomValue { get; set; }
60+
public int ImageId { get; set; }
61+
public float[] Features { get; set; }
7662
}
7763

78-
private class InputRow
64+
private class TransformedData : InputData
7965
{
80-
public float CustomValue { get; set; }
66+
public int Label { get; set; }
8167
}
8268
}
8369
}

docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/DropColumns.cs

Lines changed: 39 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -12,71 +12,73 @@ public static void Example()
1212
// as well as the source of randomness.
1313
var mlContext = new MLContext();
1414

15-
// Get a small dataset as an IEnumerable and them read it as ML.NET's data type.
16-
IEnumerable<Microsoft.ML.SamplesUtils.DatasetUtils.SampleInfertData> data = Microsoft.ML.SamplesUtils.DatasetUtils.GetInfertData();
17-
var trainData = mlContext.Data.LoadFromEnumerable(data);
15+
// Create a small dataset as an IEnumerable.
16+
var samples = new List<InputData>()
17+
{
18+
new InputData(){ Age = 21, Gender = "Male", Education = "BS", ExtraColumn = 1 },
19+
new InputData(){ Age = 23, Gender = "Female", Education = "MBA", ExtraColumn = 2 },
20+
new InputData(){ Age = 28, Gender = "Male", Education = "PhD", ExtraColumn = 3 },
21+
new InputData(){ Age = 22, Gender = "Male", Education = "BS", ExtraColumn = 4 },
22+
new InputData(){ Age = 23, Gender = "Female", Education = "MS", ExtraColumn = 5 },
23+
new InputData(){ Age = 27, Gender = "Female", Education = "PhD", ExtraColumn = 6 },
24+
};
1825

19-
// Preview of the data.
20-
//
21-
// Age Case Education Induced Parity Pooled.stratum Row_num ...
22-
// 26.0 1.0 0-5yrs 1.0 6.0 3.0 1.0 ...
23-
// 42.0 1.0 0-5yrs 1.0 1.0 1.0 2.0 ...
24-
// 39.0 1.0 0-5yrs 2.0 6.0 4.0 3.0 ...
25-
// 34.0 1.0 0-5yrs 2.0 4.0 2.0 4.0 ...
26-
// 35.0 1.0 6-11yrs 1.0 3.0 32.0 5.0 ...
26+
// Convert training data to IDataView.
27+
var dataview = mlContext.Data.LoadFromEnumerable(samples);
2728

28-
// Drop the Age and Education columns from the dataset.
29-
var pipeline = mlContext.Transforms.DropColumns("Age", "Education");
29+
// Drop the ExtraColumn from the dataset.
30+
var pipeline = mlContext.Transforms.DropColumns("ExtraColumn");
3031

3132
// Now we can transform the data and look at the output.
3233
// Don't forget that this operation doesn't actually operate on data until we perform an action that requires
3334
// the data to be materialized.
34-
var transformedData = pipeline.Fit(trainData).Transform(trainData);
35+
var transformedData = pipeline.Fit(dataview).Transform(dataview);
3536

3637
// Now let's take a look at what the DropColumns operations did.
37-
// We can extract the transformed data as an IEnumerable of SampleInfertDataNonExistentColumns, the class we define below.
38-
// When we try to pull out the Age and Education columns, ML.NET will raise an exception on the first non-existent column
39-
// that it tries to access.
38+
// We can extract the transformed data as an IEnumerable of InputData, the class we define below.
39+
// When we try to pull out the Age, Gender, Education and ExtraColumn columns, ML.NET will raise an exception on ExtraColumn, because that column has been dropped.
4040
try
4141
{
42-
var failingRowEnumerable = mlContext.Data.CreateEnumerable<SampleInfertDataNonExistentColumns>(transformedData, reuseRowObject: false);
43-
} catch(ArgumentOutOfRangeException exception)
42+
var failingRowEnumerable = mlContext.Data.CreateEnumerable<InputData>(transformedData, reuseRowObject: false);
43+
}
44+
catch (ArgumentOutOfRangeException exception)
4445
{
45-
Console.WriteLine($"Age and Education were not available, so an exception was thrown: {exception.Message}.");
46+
Console.WriteLine($"ExtraColumn is not available, so an exception is thrown: {exception.Message}.");
4647
}
4748

4849
// Expected output:
49-
// Age and Education were not available, so an exception was thrown: Could not find column 'Age'.
50+
// ExtraColumn is not available, so an exception is thrown: Could not find column 'ExtraColumn'.
5051
// Parameter name: Schema
5152

5253
// And we can write a few columns out to see that the rest of the data is still available.
53-
var rowEnumerable = mlContext.Data.CreateEnumerable<SampleInfertDataTransformed>(transformedData, reuseRowObject: false);
54+
var rowEnumerable = mlContext.Data.CreateEnumerable<TransformedData>(transformedData, reuseRowObject: false);
5455
Console.WriteLine($"The columns we didn't drop are still available.");
5556
foreach (var row in rowEnumerable)
56-
{
57-
Console.WriteLine($"Case: {row.Case} Induced: {row.Induced} Parity: {row.Parity}");
58-
}
57+
Console.WriteLine($"Age: {row.Age} Gender: {row.Gender} Education: {row.Education}");
5958

6059
// Expected output:
6160
// The columns we didn't drop are still available.
62-
// Case: 1 Induced: 1 Parity: 6
63-
// Case: 1 Induced: 1 Parity: 1
64-
// Case: 1 Induced: 2 Parity: 6
65-
// Case: 1 Induced: 2 Parity: 4
66-
// Case: 1 Induced: 1 Parity: 3
61+
// Age: 21 Gender: Male Education: BS
62+
// Age: 23 Gender: Female Education: MBA
63+
// Age: 28 Gender: Male Education: PhD
64+
// Age: 22 Gender: Male Education: BS
65+
// Age: 23 Gender: Female Education: MS
66+
// Age: 27 Gender: Female Education: PhD
6767
}
6868

69-
private class SampleInfertDataNonExistentColumns
69+
private class InputData
7070
{
71-
public float Age { get; set; }
72-
public float Education { get; set; }
71+
public int Age { get; set; }
72+
public string Gender { get; set; }
73+
public string Education { get; set; }
74+
public float ExtraColumn { get; set; }
7375
}
7476

75-
private class SampleInfertDataTransformed
77+
private class TransformedData
7678
{
77-
public float Case { get; set; }
78-
public float Induced { get; set; }
79-
public float Parity { get; set; }
79+
public int Age { get; set; }
80+
public string Gender { get; set; }
81+
public string Education { get; set; }
8082
}
8183
}
8284
}

docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/SelectColumns.cs

Lines changed: 34 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
using System;
2+
using System.Collections.Generic;
23
using Microsoft.ML;
34

45
namespace Samples.Dynamic
@@ -11,61 +12,63 @@ public static void Example()
1112
// as well as the source of randomness.
1213
var mlContext = new MLContext();
1314

14-
// Get a small dataset as an IEnumerable and them read it as ML.NET's data type.
15-
var enumerableData = Microsoft.ML.SamplesUtils.DatasetUtils.GetInfertData();
16-
var data = mlContext.Data.LoadFromEnumerable(enumerableData);
17-
18-
// Before transformation, take a look at the dataset
19-
Console.WriteLine($"Age\tCase\tEducation\tInduced\tParity\tPooledStratum");
20-
foreach (var row in enumerableData)
15+
// Create a small dataset as an IEnumerable.
16+
var samples = new List<InputData>()
2117
{
22-
Console.WriteLine($"{row.Age}\t{row.Case}\t{row.Education}\t{row.Induced}\t{row.Parity}\t{row.PooledStratum}");
23-
}
24-
Console.WriteLine();
25-
// Expected output:
26-
// Age Case Education Induced Parity PooledStratum
27-
// 26 1 0 - 5yrs 1 6 3
28-
// 42 1 0 - 5yrs 1 1 1
29-
// 39 1 12 + yrs 2 6 4
30-
// 34 1 0 - 5yrs 2 4 2
31-
// 35 1 6 - 11yrs 1 3 32
18+
new InputData(){ Age = 21, Gender = "Male", Education = "BS", ExtraColumn = 1 },
19+
new InputData(){ Age = 23, Gender = "Female", Education = "MBA", ExtraColumn = 2 },
20+
new InputData(){ Age = 28, Gender = "Male", Education = "PhD", ExtraColumn = 3 },
21+
new InputData(){ Age = 22, Gender = "Male", Education = "BS", ExtraColumn = 4 },
22+
new InputData(){ Age = 23, Gender = "Female", Education = "MS", ExtraColumn = 5 },
23+
new InputData(){ Age = 27, Gender = "Female", Education = "PhD", ExtraColumn = 6 },
24+
};
25+
26+
// Convert training data to IDataView.
27+
var dataview = mlContext.Data.LoadFromEnumerable(samples);
3228

3329
// Select a subset of columns to keep.
3430
var pipeline = mlContext.Transforms.SelectColumns("Age", "Education");
3531

36-
// Now we can transform the data and look at the output to confirm the behavior of CopyColumns.
32+
// Now we can transform the data and look at the output to confirm the behavior of SelectColumns.
3733
// Don't forget that this operation doesn't actually evaluate data until we read the data below,
3834
// as transformations are lazy in ML.NET.
39-
var transformedData = pipeline.Fit(data).Transform(data);
35+
var transformedData = pipeline.Fit(dataview).Transform(dataview);
4036

4137
// Print the number of columns in the schema
4238
Console.WriteLine($"There are {transformedData.Schema.Count} columns in the dataset.");
4339

4440
// Expected output:
4541
// There are 2 columns in the dataset.
4642

47-
// We can extract the newly created column as an IEnumerable of SampleInfertDataTransformed, the class we define below.
48-
var rowEnumerable = mlContext.Data.CreateEnumerable<SampleInfertDataTransformed>(transformedData, reuseRowObject: false);
43+
// We can extract the selected columns as an IEnumerable of TransformedData, the class we define below.
44+
var rowEnumerable = mlContext.Data.CreateEnumerable<TransformedData>(transformedData, reuseRowObject: false);
4945

5046
// And finally, we can write out the rows of the dataset, looking at the columns of interest.
5147
Console.WriteLine($"Age and Educations columns obtained post-transformation.");
5248
foreach (var row in rowEnumerable)
53-
{
5449
Console.WriteLine($"Age: {row.Age} Education: {row.Education}");
55-
}
5650

5751
// Expected output:
58-
// Age and Education columns obtained post-transformation.
59-
// Age: 26 Education: 0-5yrs
60-
// Age: 42 Education: 0-5yrs
61-
// Age: 39 Education: 12+yrs
62-
// Age: 34 Education: 0-5yrs
63-
// Age: 35 Education: 6-11yrs
52+
// Age and Educations columns obtained post-transformation.
53+
// Age: 21 Education: BS
54+
// Age: 23 Education: MBA
55+
// Age: 28 Education: PhD
56+
// Age: 22 Education: BS
57+
// Age: 23 Education: MS
58+
// Age: 27 Education: PhD
59+
}
60+
61+
private class InputData
62+
{
63+
public int Age { get; set; }
64+
public string Gender { get; set; }
65+
public string Education { get; set; }
66+
public float ExtraColumn { get; set; }
6467
}
6568

66-
private class SampleInfertDataTransformed
69+
private class TransformedData
6770
{
68-
public float Age { get; set; }
71+
public int Age { get; set; }
6972
public string Education { get; set; }
7073
}
7174
}

0 commit comments

Comments
 (0)