Data files: download the two files issues_train.tsv and issues_test.tsv and place them in the Data folder.
GitHubIssueData.cs
// <SnippetAddUsings>
using Microsoft.ML.Data;
// </SnippetAddUsings>
namespace GitHubIssueClassification
{
// <SnippetDeclareTypes>
/// <summary>
/// One row of the GitHub issues data set. Each property is bound to a column
/// of the loaded text file by its zero-based <c>LoadColumn</c> index.
/// </summary>
public class GitHubIssue
{
    /// <summary>Value read from column 0.</summary>
    [LoadColumn(0)] public string ID { get; set; }

    /// <summary>Value read from column 1; the pipeline maps this column to the "Label" key.</summary>
    [LoadColumn(1)] public string Area { get; set; }

    /// <summary>Value read from column 2; featurized as text by the pipeline.</summary>
    [LoadColumn(2)] public string Title { get; set; }

    /// <summary>Value read from column 3; featurized as text by the pipeline.</summary>
    [LoadColumn(3)] public string Description { get; set; }
}
/// <summary>
/// Output type of the prediction engine. <see cref="Area"/> is bound to the
/// model's "PredictedLabel" output column.
/// </summary>
public class IssuePrediction
{
    /// <summary>Predicted area for the issue, read from the "PredictedLabel" column.</summary>
    [ColumnName("PredictedLabel")] public string Area;
}
// </SnippetDeclareTypes>
}
Program.cs
// <SnippetAddUsings>
using System;
using System.IO;
using System.Linq;
using Microsoft.ML;
// </SnippetAddUsings>
namespace GitHubIssueClassification
{
class Program
{
// <SnippetDeclareGlobalVariables>
// Folder of the running executable, taken from the first command-line argument.
private static string _appPath => Path.GetDirectoryName(Environment.GetCommandLineArgs()[0]);

// Data and model files live three directory levels above _appPath
// (presumably the project folder in a default bin/<config>/<tfm> build layout).
private static string _trainDataPath => CombineFromAppPath("Data", "issues_train.tsv");
private static string _testDataPath => CombineFromAppPath("Data", "issues_test.tsv");
private static string _modelPath => CombineFromAppPath("Models", "model.zip");

// ML.NET state shared by the static workflow methods below.
private static MLContext _mlContext;
private static ITransformer _trainedModel;
private static PredictionEngine<GitHubIssue, IssuePrediction> _predEngine;
private static IDataView _trainingDataView;

// Builds "<_appPath>/../../../<folder>/<fileName>".
private static string CombineFromAppPath(string folder, string fileName) =>
    Path.Combine(_appPath, "..", "..", "..", folder, fileName);
// </SnippetDeclareGlobalVariables>
/// <summary>
/// Entry point. Loads the training data, builds and trains the pipeline,
/// evaluates the trained model on the test file (Evaluate also saves the model),
/// and finally predicts a single issue with the model loaded back from disk.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Create the MLContext shared across the model creation workflow objects.
    // A fixed seed gives repeatable/deterministic results across multiple trainings.
    // <SnippetCreateMLContext>
    _mlContext = new MLContext(seed: 0);
    // </SnippetCreateMLContext>

    // STEP 1: Common data loading configuration
    // LoadFromTextFile<GitHubIssue>(_trainDataPath, hasHeader: true) loads the training file
    // (declared as having a header row) into an IDataView, using the [LoadColumn] attributes
    // on GitHubIssue to map file columns to IDataView columns.
    Console.WriteLine("=============== Loading Dataset ===============");
    // <SnippetLoadTrainData>
    _trainingDataView = _mlContext.Data.LoadFromTextFile<GitHubIssue>(_trainDataPath, hasHeader: true);
    // </SnippetLoadTrainData>
    Console.WriteLine("=============== Finished Loading Dataset ===============");

    // No train/test split is performed here: Evaluate() loads the separate test file instead.

    // <SnippetCallProcessData>
    var pipeline = ProcessData();
    // </SnippetCallProcessData>

    // The returned estimator chain is not needed here; the fitted model is kept in _trainedModel.
    // <SnippetCallBuildAndTrainModel>
    BuildAndTrainModel(_trainingDataView, pipeline);
    // </SnippetCallBuildAndTrainModel>

    // <SnippetCallEvaluate>
    Evaluate(_trainingDataView.Schema);
    // </SnippetCallEvaluate>

    // <SnippetCallPredictIssue>
    PredictIssue();
    // </SnippetCallPredictIssue>

    Console.WriteLine("Press enter to close...");
    Console.ReadLine();
}
/// <summary>
/// STEP 2: builds the data-preparation part of the pipeline: maps "Area" to the "Label" key,
/// featurizes "Title" and "Description", concatenates both into "Features",
/// and appends a cache checkpoint.
/// </summary>
/// <returns>The (unfitted) data-processing estimator chain.</returns>
public static IEstimator<ITransformer> ProcessData()
{
    Console.WriteLine("=============== Processing Data ===============");

    // Individual transforms, named so the chain below reads as a list of steps.
    var areaToLabel = _mlContext.Transforms.Conversion.MapValueToKey(
        outputColumnName: "Label",
        inputColumnName: "Area");
    var titleFeatures = _mlContext.Transforms.Text.FeaturizeText(
        outputColumnName: "TitleFeaturized",
        inputColumnName: "Title");
    var descriptionFeatures = _mlContext.Transforms.Text.FeaturizeText(
        outputColumnName: "DescriptionFeaturized",
        inputColumnName: "Description");
    var allFeatures = _mlContext.Transforms.Concatenate("Features", "TitleFeaturized", "DescriptionFeaturized");

    // The cache checkpoint lets estimators that iterate over the data several times
    // read from the cache instead of re-reading the file, which may improve performance.
    var pipeline = areaToLabel
        .Append(titleFeatures)
        .Append(descriptionFeatures)
        .Append(allFeatures)
        .AppendCacheCheckpoint(_mlContext);

    Console.WriteLine("=============== Finished Processing Data ===============");
    return pipeline;
}
/// <summary>
/// STEP 3/4: appends the SDCA maximum-entropy multiclass trainer and a key-to-value mapping of
/// "PredictedLabel" to <paramref name="pipeline"/>, fits the result on
/// <paramref name="trainingDataView"/>, stores the fitted model in _trainedModel,
/// and prints one sample prediction made with the just-trained model.
/// </summary>
/// <param name="trainingDataView">Data the pipeline is fitted on.</param>
/// <param name="pipeline">Data-processing estimator chain to extend with the trainer.</param>
/// <returns>The full (unfitted) training pipeline.</returns>
public static IEstimator<ITransformer> BuildAndTrainModel(IDataView trainingDataView, IEstimator<ITransformer> pipeline)
{
    // Trainer predicts "Label" from "Features"; the final step maps the predicted key
    // back to its original readable value.
    var sdcaTrainer = _mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy("Label", "Features");
    var predictedLabelToValue = _mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel");
    var trainingPipeline = pipeline.Append(sdcaTrainer).Append(predictedLabelToValue);

    // Fit the model to the data set.
    Console.WriteLine("=============== Training the model ===============");
    _trainedModel = trainingPipeline.Fit(trainingDataView);
    Console.WriteLine($"=============== Finished Training the model Ending time: {DateTime.Now} ===============");

    // (OPTIONAL) Try a single prediction with the just-trained model, before it is saved.
    Console.WriteLine("=============== Single Prediction just-trained-model ===============");
    _predEngine = _mlContext.Model.CreatePredictionEngine<GitHubIssue, IssuePrediction>(_trainedModel);

    var sampleIssue = new GitHubIssue
    {
        Title = "WebSockets communication is slow in my machine",
        Description = "The WebSockets communication used under the covers by SignalR looks like is going slow in my development machine.."
    };
    IssuePrediction samplePrediction = _predEngine.Predict(sampleIssue);
    Console.WriteLine($"=============== Single Prediction just-trained-model - Result: {samplePrediction.Area} ===============");

    return trainingPipeline;
}
/// <summary>
/// STEP 5: loads the test file, scores it with _trainedModel, prints the multiclass
/// metrics (micro/macro accuracy, log-loss, log-loss reduction), and then saves the model.
/// </summary>
/// <param name="trainingDataViewSchema">Input schema passed on to SaveModelAsFile.</param>
public static void Evaluate(DataViewSchema trainingDataViewSchema)
{
    Console.WriteLine($"=============== Evaluating to get model's accuracy metrics - Starting time: {DateTime.Now} ===============");

    // Load the test data set, run it through the trained model and compute the metrics.
    IDataView testSet = _mlContext.Data.LoadFromTextFile<GitHubIssue>(_testDataPath, hasHeader: true);
    IDataView scoredTestSet = _trainedModel.Transform(testSet);
    var metrics = _mlContext.MulticlassClassification.Evaluate(scoredTestSet);

    Console.WriteLine($"=============== Evaluating to get model's accuracy metrics - Ending time: {DateTime.Now} ===============");

    // Report lines, printed in order.
    string[] reportLines =
    {
        "*************************************************************************************************************",
        "* Metrics for Multi-class Classification model - Test Data ",
        "*------------------------------------------------------------------------------------------------------------",
        $"* MicroAccuracy: {metrics.MicroAccuracy:0.###}",
        $"* MacroAccuracy: {metrics.MacroAccuracy:0.###}",
        $"* LogLoss: {metrics.LogLoss:#.###}",
        $"* LogLossReduction: {metrics.LogLossReduction:#.###}",
        "*************************************************************************************************************"
    };
    foreach (string line in reportLines)
    {
        Console.WriteLine(line);
    }

    // Save the new model to a .ZIP file.
    SaveModelAsFile(_mlContext, trainingDataViewSchema, _trainedModel);
}
/// <summary>
/// Loads the saved model from _modelPath, creates a prediction engine for it
/// (replacing the one stored in _predEngine), and prints the predicted area
/// for one hard-coded issue.
/// </summary>
public static void PredictIssue()
{
    // The input schema returned by Load is not needed here, so it is discarded.
    var modelFromDisk = _mlContext.Model.Load(_modelPath, out _);
    _predEngine = _mlContext.Model.CreatePredictionEngine<GitHubIssue, IssuePrediction>(modelFromDisk);

    // Single hard-coded issue to classify.
    var hardCodedIssue = new GitHubIssue
    {
        Title = "Entity Framework crashes",
        Description = "When connecting to the database, EF is crashing"
    };

    IssuePrediction result = _predEngine.Predict(hardCodedIssue);
    Console.WriteLine($"=============== Single Prediction - Result: {result.Area} ===============");
}
/// <summary>
/// Saves <paramref name="model"/> together with its input schema to the file at _modelPath,
/// creating the target folder first if it does not exist yet, and reports the location.
/// </summary>
/// <param name="mlContext">Context whose model catalog performs the save.</param>
/// <param name="trainingDataViewSchema">Schema of the data the model was trained on.</param>
/// <param name="model">Trained model to persist.</param>
private static void SaveModelAsFile(MLContext mlContext, DataViewSchema trainingDataViewSchema, ITransformer model)
{
    string modelPath = _modelPath;

    // Make sure the target folder (e.g. "Models") exists; writing the file into a
    // missing folder would otherwise fail. CreateDirectory is a no-op when it already exists.
    string modelDirectory = Path.GetDirectoryName(modelPath);
    if (!string.IsNullOrEmpty(modelDirectory))
    {
        Directory.CreateDirectory(modelDirectory);
    }

    // <SnippetSaveModel>
    mlContext.Model.Save(model, trainingDataViewSchema, modelPath);
    // </SnippetSaveModel>
    Console.WriteLine("The model is saved to {0}", modelPath);
}
}
}
Output:
* Metrics for Multi-class Classification model - Test Data *------------------------------------------------------------------------------------------------------------ * MicroAccuracy: 0.739 * MacroAccuracy: 0.674 * LogLoss: .907 * LogLossReduction: .648
https://github.com/gantovnik/wordpress_examples/tree/main/ex307
Last Updated on 2022-10-29 by gantovnik
Recent Comments