Skip to content

Commit 856c7e8

Browse files
authored
Lockdown of Microsoft.ML.LightGBM public surface. (#2476)
1 parent b2127b2 commit 856c7e8

File tree

12 files changed

+523
-49
lines changed

12 files changed

+523
-49
lines changed
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
using Microsoft.ML.Transforms.Categorical;
2+
3+
namespace Microsoft.ML.Samples.Dynamic
4+
{
5+
public class LightGbmBinaryClassification
6+
{
7+
// This example requires installation of additional nuget package <a href="https://www.nuget.org/packages/Microsoft.ML.LightGBM/">Microsoft.ML.LightGBM</a>.
8+
public static void Example()
9+
{
10+
// Create the ML.NET IHostEnvironment object, needed for the pipeline.
11+
var mlContext = new MLContext();
12+
13+
// Download and featurize the dataset.
14+
var dataview = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
15+
16+
// Leave out 10% of data for testing.
17+
var split = mlContext.BinaryClassification.TrainTestSplit(dataview, testFraction: 0.1);
18+
19+
// Create the Estimator.
20+
var pipeline = mlContext.BinaryClassification.Trainers.LightGbm("IsOver50K", "Features");
21+
22+
// Fit this Pipeline to the Training Data.
23+
var model = pipeline.Fit(split.TrainSet);
24+
25+
// Evaluate how the model is doing on the test data.
26+
var dataWithPredictions = model.Transform(split.TestSet);
27+
28+
var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions, "IsOver50K");
29+
SamplesUtils.ConsoleUtils.PrintMetrics(metrics);
30+
31+
// Output:
32+
// Accuracy: 0.88
33+
// AUC: 0.93
34+
// F1 Score: 0.71
35+
// Negative Precision: 0.90
36+
// Negative Recall: 0.94
37+
// Positive Precision: 0.76
38+
// Positive Recall: 0.66
39+
}
40+
}
41+
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
using Microsoft.ML.LightGBM;
2+
using Microsoft.ML.Transforms.Categorical;
3+
using static Microsoft.ML.LightGBM.Options;
4+
5+
namespace Microsoft.ML.Samples.Dynamic
6+
{
7+
class LightGbmBinaryClassificationWithOptions
8+
{
9+
// This example requires installation of additional nuget package <a href="https://www.nuget.org/packages/Microsoft.ML.LightGBM/">Microsoft.ML.LightGBM</a>.
10+
public static void Example()
11+
{
12+
// Create the ML.NET IHostEnvironment object, needed for the pipeline.
13+
var mlContext = new MLContext();
14+
15+
// Download and featurize the dataset.
16+
var dataview = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
17+
18+
// Leave out 10% of data for testing.
19+
var split = mlContext.BinaryClassification.TrainTestSplit(dataview, testFraction: 0.1);
20+
21+
// Create the pipeline with LightGbm Estimator using advanced options.
22+
var pipeline = mlContext.BinaryClassification.Trainers.LightGbm(
23+
new Options
24+
{
25+
LabelColumn = "IsOver50K",
26+
FeatureColumn = "Features",
27+
Booster = new GossBooster.Options
28+
{
29+
TopRate = 0.3,
30+
OtherRate = 0.2
31+
}
32+
});
33+
34+
// Fit this Pipeline to the Training Data.
35+
var model = pipeline.Fit(split.TrainSet);
36+
37+
// Evaluate how the model is doing on the test data.
38+
var dataWithPredictions = model.Transform(split.TestSet);
39+
40+
var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions, "IsOver50K");
41+
SamplesUtils.ConsoleUtils.PrintMetrics(metrics);
42+
43+
// Output:
44+
// Accuracy: 0.88
45+
// AUC: 0.93
46+
// F1 Score: 0.71
47+
// Negative Precision: 0.90
48+
// Negative Recall: 0.94
49+
// Positive Precision: 0.76
50+
// Positive Recall: 0.67
51+
}
52+
}
53+
}
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
using System;
2+
using System.Linq;
3+
using Microsoft.ML.Data;
4+
using Microsoft.ML.SamplesUtils;
5+
6+
namespace Microsoft.ML.Samples.Dynamic
7+
{
8+
class LightGbmMulticlassClassification
9+
{
10+
// This example requires installation of additional nuget package <a href="https://www.nuget.org/packages/Microsoft.ML.LightGBM/">Microsoft.ML.LightGBM</a>.
11+
public static void Example()
12+
{
13+
// Create a general context for ML.NET operations. It can be used for exception tracking and logging,
14+
// as a catalog of available operations and as the source of randomness.
15+
var mlContext = new MLContext();
16+
17+
// Create in-memory examples as C# native class.
18+
var examples = DatasetUtils.GenerateRandomMulticlassClassificationExamples(1000);
19+
20+
// Convert the native C# class to an IDataView, a format consumable by ML.NET functions.
21+
var dataView = mlContext.Data.ReadFromEnumerable(examples);
22+
23+
//////////////////// Data Preview ////////////////////
24+
// Label Features
25+
// AA 0.7262433,0.8173254,0.7680227,0.5581612,0.2060332,0.5588848,0.9060271,0.4421779,0.9775497,0.2737045
26+
// BB 0.4919063,0.6673147,0.8326591,0.6695119,1.182151,0.230367,1.06237,1.195347,0.8771811,0.5145918
27+
// CC 1.216908,1.248052,1.391902,0.4326252,1.099942,0.9262842,1.334019,1.08762,0.9468155,0.4811099
28+
// DD 0.7871246,1.053327,0.8971719,1.588544,1.242697,1.362964,0.6303943,0.9810045,0.9431419,1.557455
29+
30+
// Create a pipeline.
31+
// - Convert the string labels into key types.
32+
// - Apply LightGbm multiclass trainer.
33+
var pipeline = mlContext.Transforms.Conversion.MapValueToKey("LabelIndex", "Label")
34+
.Append(mlContext.MulticlassClassification.Trainers.LightGbm(labelColumn: "LabelIndex"))
35+
.Append(mlContext.Transforms.Conversion.MapValueToKey("PredictedLabelIndex", "PredictedLabel"))
36+
.Append(mlContext.Transforms.CopyColumns("Scores", "Score"));
37+
38+
// Split the data into training and test sets. Only the training set is used in fitting
39+
// the created pipeline. Metrics are computed on the test set.
40+
var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.5);
41+
42+
// Train the model.
43+
var model = pipeline.Fit(split.TrainSet);
44+
45+
// Do prediction on the test set.
46+
var dataWithPredictions = model.Transform(split.TestSet);
47+
48+
// Evaluate the trained model using the test set.
49+
var metrics = mlContext.MulticlassClassification.Evaluate(dataWithPredictions, label: "LabelIndex");
50+
51+
// Check if metrics are reasonable.
52+
Console.WriteLine($"Macro accuracy: {metrics.AccuracyMacro:F4}, Micro accuracy: {metrics.AccuracyMicro:F4}.");
53+
// Console output:
54+
// Macro accuracy: 0.8655, Micro accuracy: 0.8651.
55+
56+
// Convert the IDataView with predictions to an IEnumerable<DatasetUtils.MulticlassClassificationExample>.
57+
var nativePredictions = mlContext.CreateEnumerable<DatasetUtils.MulticlassClassificationExample>(dataWithPredictions, false).ToList();
58+
59+
// Get schema object out of the prediction. It contains metadata such as the mapping from predicted label index
60+
// (e.g., 1) to its actual label (e.g., "AA").
61+
// The metadata can be used to get all the unique labels used during training.
62+
var labelBuffer = new VBuffer<ReadOnlyMemory<char>>();
63+
dataWithPredictions.Schema["PredictedLabelIndex"].GetKeyValues(ref labelBuffer);
64+
// nativeLabels is { "AA" , "BB", "CC", "DD" }
65+
var nativeLabels = labelBuffer.DenseValues().ToArray(); // nativeLabels[nativePrediction.PredictedLabelIndex - 1] is the original label indexed by nativePrediction.PredictedLabelIndex.
66+
67+
68+
// Show prediction result for the 3rd example.
69+
var nativePrediction = nativePredictions[2];
70+
// Console output:
71+
// Our predicted label to this example is AA with probability 0.9257.
72+
Console.WriteLine($"Our predicted label to this example is {nativeLabels[(int)nativePrediction.PredictedLabelIndex - 1]} " +
73+
$"with probability {nativePrediction.Scores[(int)nativePrediction.PredictedLabelIndex - 1]:F4}.");
74+
75+
// Scores and nativeLabels are two parallel attributes; that is, Scores[i] is the probability of being nativeLabels[i].
76+
// Console output:
77+
// The probability of being class AA is 0.9257.
78+
// The probability of being class BB is 0.0739.
79+
// The probability of being class CC is 0.0002.
80+
// The probability of being class DD is 0.0001.
81+
for (int i = 0; i < nativeLabels.Length; ++i)
82+
Console.WriteLine($"The probability of being class {nativeLabels[i]} is {nativePrediction.Scores[i]:F4}.");
83+
}
84+
}
85+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
using System;
2+
using System.Linq;
3+
using Microsoft.ML.Data;
4+
using Microsoft.ML.LightGBM;
5+
using Microsoft.ML.SamplesUtils;
6+
using static Microsoft.ML.LightGBM.Options;
7+
8+
namespace Microsoft.ML.Samples.Dynamic
9+
{
10+
class LightGbmMulticlassClassificationWithOptions
11+
{
12+
// This example requires installation of additional nuget package <a href="https://www.nuget.org/packages/Microsoft.ML.LightGBM/">Microsoft.ML.LightGBM</a>.
13+
public static void Example()
14+
{
15+
// Create a general context for ML.NET operations. It can be used for exception tracking and logging,
16+
// as a catalog of available operations and as the source of randomness.
17+
var mlContext = new MLContext(seed: 0);
18+
19+
// Create in-memory examples as C# native class.
20+
var examples = DatasetUtils.GenerateRandomMulticlassClassificationExamples(1000);
21+
22+
// Convert the native C# class to an IDataView, a format consumable by ML.NET functions.
23+
var dataView = mlContext.Data.ReadFromEnumerable(examples);
24+
25+
//////////////////// Data Preview ////////////////////
26+
// Label Features
27+
// AA 0.7262433,0.8173254,0.7680227,0.5581612,0.2060332,0.5588848,0.9060271,0.4421779,0.9775497,0.2737045
28+
// BB 0.4919063,0.6673147,0.8326591,0.6695119,1.182151,0.230367,1.06237,1.195347,0.8771811,0.5145918
29+
// CC 1.216908,1.248052,1.391902,0.4326252,1.099942,0.9262842,1.334019,1.08762,0.9468155,0.4811099
30+
// DD 0.7871246,1.053327,0.8971719,1.588544,1.242697,1.362964,0.6303943,0.9810045,0.9431419,1.557455
31+
32+
// Create a pipeline.
33+
// - Convert the string labels into key types.
34+
// - Apply LightGbm multiclass trainer with advanced options.
35+
var pipeline = mlContext.Transforms.Conversion.MapValueToKey("LabelIndex", "Label")
36+
.Append(mlContext.MulticlassClassification.Trainers.LightGbm(new Options
37+
{
38+
LabelColumn = "LabelIndex",
39+
FeatureColumn = "Features",
40+
Booster = new DartBooster.Options
41+
{
42+
DropRate = 0.15,
43+
XgboostDartMode = false
44+
}
45+
}))
46+
.Append(mlContext.Transforms.Conversion.MapValueToKey("PredictedLabelIndex", "PredictedLabel"))
47+
.Append(mlContext.Transforms.CopyColumns("Scores", "Score"));
48+
49+
// Split the data into training and test sets. Only the training set is used in fitting
50+
// the created pipeline. Metrics are computed on the test set.
51+
var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.5);
52+
53+
// Train the model.
54+
var model = pipeline.Fit(split.TrainSet);
55+
56+
// Do prediction on the test set.
57+
var dataWithPredictions = model.Transform(split.TestSet);
58+
59+
// Evaluate the trained model using the test set.
60+
var metrics = mlContext.MulticlassClassification.Evaluate(dataWithPredictions, label: "LabelIndex");
61+
62+
// Check if metrics are reasonable.
63+
Console.WriteLine($"Macro accuracy: {metrics.AccuracyMacro:F4}, Micro accuracy: {metrics.AccuracyMicro:F4}.");
64+
// Console output:
65+
// Macro accuracy: 0.8619, Micro accuracy: 0.8611.
66+
67+
// Convert the IDataView with predictions to an IEnumerable<DatasetUtils.MulticlassClassificationExample>.
68+
var nativePredictions = mlContext.CreateEnumerable<DatasetUtils.MulticlassClassificationExample>(dataWithPredictions, false).ToList();
69+
70+
// Get schema object out of the prediction. It contains metadata such as the mapping from predicted label index
71+
// (e.g., 1) to its actual label (e.g., "AA").
72+
// The metadata can be used to get all the unique labels used during training.
73+
var labelBuffer = new VBuffer<ReadOnlyMemory<char>>();
74+
dataWithPredictions.Schema["PredictedLabelIndex"].GetKeyValues(ref labelBuffer);
75+
// nativeLabels is { "AA" , "BB", "CC", "DD" }
76+
var nativeLabels = labelBuffer.DenseValues().ToArray(); // nativeLabels[nativePrediction.PredictedLabelIndex - 1] is the original label indexed by nativePrediction.PredictedLabelIndex.
77+
78+
79+
// Show prediction result for the 3rd example.
80+
var nativePrediction = nativePredictions[2];
81+
// Console output:
82+
// Our predicted label to this example is AA with probability 0.8986.
83+
Console.WriteLine($"Our predicted label to this example is {nativeLabels[(int)nativePrediction.PredictedLabelIndex - 1]} " +
84+
$"with probability {nativePrediction.Scores[(int)nativePrediction.PredictedLabelIndex - 1]:F4}.");
85+
86+
// Scores and nativeLabels are two parallel attributes; that is, Scores[i] is the probability of being nativeLabels[i].
87+
// Console output:
88+
// The probability of being class AA is 0.8986.
89+
// The probability of being class BB is 0.0961.
90+
// The probability of being class CC is 0.0050.
91+
// The probability of being class DD is 0.0003.
92+
for (int i = 0; i < nativeLabels.Length; ++i)
93+
Console.WriteLine($"The probability of being class {nativeLabels[i]} is {nativePrediction.Scores[i]:F4}.");
94+
}
95+
}
96+
}
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
using System;
2+
using System.Linq;
3+
using Microsoft.ML.Data;
4+
5+
namespace Microsoft.ML.Samples.Dynamic
6+
{
7+
class LightGbmRegression
8+
{
9+
// This example requires installation of additional nuget package <a href="https://www.nuget.org/packages/Microsoft.ML.LightGBM/">Microsoft.ML.LightGBM</a>.
10+
public static void Example()
11+
{
12+
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
13+
// as well as the source of randomness.
14+
var mlContext = new MLContext();
15+
16+
// Download and load the housing dataset into an IDataView.
17+
var dataView = SamplesUtils.DatasetUtils.LoadHousingRegressionDataset(mlContext);
18+
19+
//////////////////// Data Preview ////////////////////
20+
// Only 8 columns are displayed here.
21+
// MedianHomeValue CrimesPerCapita PercentResidental PercentNonRetail CharlesRiver NitricOxides RoomsPerDwelling PercentPre40s ...
22+
// 24.00 0.00632 18.00 2.310 0 0.5380 6.5750 65.20 ...
23+
// 21.60 0.02731 00.00 7.070 0 0.4690 6.4210 78.90 ...
24+
// 34.70 0.02729 00.00 7.070 0 0.4690 7.1850 61.10 ...
25+
26+
var split = mlContext.Regression.TrainTestSplit(dataView, testFraction: 0.1);
27+
28+
// Create the estimator: concatenate the feature columns into "Features" and append the LightGbm trainer.
29+
// No other preprocessing is needed, as the data is already in a form consumable by the trainer.
30+
var labelName = "MedianHomeValue";
31+
var featureNames = dataView.Schema
32+
.Select(column => column.Name) // Get the column names
33+
.Where(name => name != labelName) // Drop the Label
34+
.ToArray();
35+
var pipeline = mlContext.Transforms.Concatenate("Features", featureNames)
36+
.Append(mlContext.Regression.Trainers.LightGbm(
37+
labelColumn: labelName,
38+
numLeaves: 4,
39+
minDataPerLeaf: 6,
40+
learningRate: 0.001));
41+
42+
// Fit this pipeline to the training data.
43+
var model = pipeline.Fit(split.TrainSet);
44+
45+
// Get the feature importance based on the information gain used during training.
46+
VBuffer<float> weights = default;
47+
model.LastTransformer.Model.GetFeatureWeights(ref weights);
48+
var weightsValues = weights.DenseValues().ToArray();
49+
Console.WriteLine($"weight 0 - {weightsValues[0]}"); // CrimesPerCapita (weight 0) = 0.1898361
50+
Console.WriteLine($"weight 5 - {weightsValues[5]}"); // RoomsPerDwelling (weight 5) = 1
51+
52+
// Evaluate how the model is doing on the test data.
53+
var dataWithPredictions = model.Transform(split.TestSet);
54+
var metrics = mlContext.Regression.Evaluate(dataWithPredictions, label: labelName);
55+
SamplesUtils.ConsoleUtils.PrintMetrics(metrics);
56+
57+
// Output:
58+
// L1: 4.97
59+
// L2: 51.37
60+
// LossFunction: 51.37
61+
// RMS: 7.17
62+
// RSquared: 0.08
63+
}
64+
}
65+
}

0 commit comments

Comments
 (0)