Skip to content

Instantly share code, notes, and snippets.

@RyushiAok
Last active October 15, 2023 12:35
Show Gist options
  • Save RyushiAok/d46cc711e1c8f857bf23a7b77962249b to your computer and use it in GitHub Desktop.
Save RyushiAok/d46cc711e1c8f857bf23a7b77962249b to your computer and use it in GitHub Desktop.
fix_ml_pipline

fix: https://gist.github.com/jkone27/de477e362f3ee9f36069c0010e0a1f29

1. type

/// Prediction row: `Price` is bound to the data view column named "Score".
[<CLIMutable>]
type PricePrediction =
    { [<ColumnName("Score")>]
      Price: float32 }

/// Input row with two float32 (Single) fields, annotated with load column
/// indices 0 (`Price`) and 1 (`Year`).
[<CLIMutable>]
type HousePricePerYear =
    { [<LoadColumn(0)>]
      Price: float32
      [<LoadColumn(1)>]
      Year: float32 }

2. casting

/// Converts the CSV provider rows into `HousePricePerYear` records (both
/// fields cast to float32) and loads them into an ML.NET data view.
let house_price_data_view (row: PriceCsvProvider.Row seq) =
    let records: HousePricePerYear[] =
        [| for r in row ->
               { Price = float32 r.Price // decimal to f32
                 Year = float32 r.Year } |]

    mlContext.Data.LoadFromEnumerable<HousePricePerYear>(records)

3. pipeline (optional)

    // Copy the "Price" column into a new "Label" column.
    |> Estimator.append (
        mlContext.Transforms.CopyColumns(
            outputColumnName = "Label",
            inputColumnName = "Price"
        )
    )

    // Not necessary if the fields are already converted to float32 (Single) in steps 1 and 2.
    // Converts "Price" and "Year" in place (same input and output column names).
    |> Estimator.append (
        mlContext.Transforms.Conversion.ConvertType(
            [|
                InputOutputColumnPair("Price", "Price")
                InputOutputColumnPair("Year", "Year")
            |],
            DataKind.Single
        )
    )


    // Append additional encoded features here...
    // NOTE(review): "Price" is also the source of the "Label" column above, so the
    // feature vector contains the target value itself — confirm this is intended.
    |> Estimator.append (
        mlContext.Transforms.Concatenate(
            outputColumnName = "Features",
            inputColumnNames = [| "Price"; "Year" |]
        )
    )

reference

InputOutputColumnPair Type

https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.conversionsextensionscatalog.converttype?view=ml-dotnet

// fix: https://gist.github.com/jkone27/de477e362f3ee9f36069c0010e0a1f29
open Microsoft.ML
open Microsoft.ML.Data
open FSharp.Data
/// Path of the training CSV, located next to this script. It is a literal so
/// that it can be used as the static argument of the CSV type provider.
let [<Literal>] trainDataPath = __SOURCE_DIRECTORY__ + "/house-price-train.csv"

/// Path of the test CSV, located next to this script.
let [<Literal>] testDataPath = __SOURCE_DIRECTORY__ + "/house-price-test.csv"

/// CSV type provider that uses the training file as its sample.
type PriceCsvProvider = CsvProvider<trainDataPath>
/// Prediction row: `Price` is bound to the data view column named "Score".
[<CLIMutable>]
type PricePrediction =
    { [<ColumnName("Score")>]
      Price: float32 }
/// Input row with two float32 (Single) fields, annotated with load column
/// indices 0 (`Price`) and 1 (`Year`).
[<CLIMutable>]
type HousePricePerYear =
    { [<LoadColumn(0)>]
      Price: float32
      [<LoadColumn(1)>]
      Year: float32 }
/// Shared ML.NET context, created with seed 0.
let mlContext: MLContext = MLContext(seed = 0)

/// Training data: the sample file the CSV type provider was declared with.
let trainSample: PriceCsvProvider = PriceCsvProvider.GetSample()

/// Test data, loaded from `testDataPath`.
let testSample: PriceCsvProvider = PriceCsvProvider.Load(testDataPath)
/// Converts the CSV provider rows into `HousePricePerYear` records (both
/// fields cast to float32) and loads them into an ML.NET data view.
let house_price_data_view (row: PriceCsvProvider.Row seq) =
    let records: HousePricePerYear[] =
        [| for r in row ->
               { Price = float32 r.Price
                 Year = float32 r.Year } |]

    mlContext.Data.LoadFromEnumerable<HousePricePerYear>(records)
/// Data view built from the training sample rows.
let trainDataView = house_price_data_view trainSample.Rows

// Print the last value of every column in a one-row preview of the training data.
for column in trainDataView.Preview(1).ColumnView do
    printfn "%A" (Seq.last column.Values)

/// Data view built from the test sample rows.
let testDataView = house_price_data_view testSample.Rows

// Same one-row preview for the test data.
for column in testDataView.Preview(1).ColumnView do
    printfn "%A" (Seq.last column.Values)
/// https://github.com/CSBiology/FSharpML/tree/master/src/FSharpML
module Estimator =

    /// Returns the estimator typed as `IEstimator<ITransformer>`.
    /// Fails (via `failwith`) when the runtime type test does not succeed.
    let downcastEstimator (estimator: IEstimator<'a>) =
        match estimator with
        | :? IEstimator<ITransformer> as typed -> typed
        | _ -> failwith "The estimator has to be an instance of IEstimator<ITransformer>."

    /// Appends `source1` to `source2`. The chain being extended comes last so
    /// that the function can be used with `|>`.
    let append (source1: IEstimator<'a>) (source2: IEstimator<'b>) =
        let head = downcastEstimator source2
        head.Append(source1)

    /// Folds the estimators, in sequence order, onto an empty `EstimatorChain`.
    let createEstimatorChainOf (estimators: IEstimator<'a> seq) =
        (EstimatorChain(), estimators)
        ||> Seq.fold (fun chain next -> append next chain)

    /// Appends a cache checkpoint to the pipeline and downcasts the result.
    let appendCacheCheckpoint (mlContext: MLContext) (pipeline: IEstimator<'a>) =
        downcastEstimator (pipeline.AppendCacheCheckpoint mlContext)
/// https://github.com/CSBiology/FSharpML/tree/master/src/FSharpML
module Transformer =

    /// Returns the transformer typed as `IPredictionTransformer<_>`.
    /// Fails (via `failwith`) when the runtime type test does not succeed.
    let downcastTransformer (transformer: ITransformer) =
        match transformer with
        | :? IPredictionTransformer<_> as typed -> typed
        | _ ->
            failwith "The transformer has to be an instance of IPredictionTransformer<IPredictor>."

    /// Appends `source1` to `source2`. The chain being extended comes last so
    /// that the function can be used with `|>`.
    let append (source1: ITransformer) (source2: ITransformer) =
        let head = downcastTransformer source2
        head.Append(source1)

    /// Folds the transformers, in sequence order, onto an empty `TransformerChain`.
    let createTransformerChainOf (estimators: ITransformer seq) =
        (TransformerChain(), estimators)
        ||> Seq.fold (fun chain next -> append next chain)
/// Training pipeline: the label is copied from "Price", the feature vector is
/// built from "Year", and a FastTree regression trainer is appended last.
let pipeline =
    new EstimatorChain<ITransformer>()
    // The regression target: copy "Price" into the "Label" column.
    |> Estimator.append (
        mlContext.Transforms.CopyColumns(
            outputColumnName = "Label",
            inputColumnName = "Price"
        )
    )
    // Ensure types are correct (both columns are converted in place to Single).
    |> Estimator.append (
        mlContext.Transforms.Conversion.ConvertType(
            [|
                InputOutputColumnPair("Price", "Price")
                InputOutputColumnPair("Year", "Year")
            |],
            DataKind.Single
        )
    )
    // Build the feature vector; append additional encoded features here.
    // "Price" is deliberately NOT a feature: it is the source of "Label", and
    // feeding the target to the trainer leaks the answer and makes the
    // evaluation metrics meaningless.
    |> Estimator.append (
        mlContext.Transforms.Concatenate(
            outputColumnName = "Features",
            inputColumnNames = [| "Year" |]
        )
    )
    // Choose the regression algorithm for prediction; FastTree in this example.
    |> Estimator.append (mlContext.Regression.Trainers.FastTree())
/// Model obtained by fitting the pipeline on the training data view.
let model = pipeline.Fit trainDataView

/// Test data view transformed by the fitted model.
let predictions = model.Transform testDataView

/// Regression metrics computed from the "Label" and "Score" columns of the predictions.
let metrics =
    mlContext.Regression.Evaluate(
        predictions,
        labelColumnName = "Label",
        scoreColumnName = "Score"
    )

// Report the quality of the model (R-squared and root mean squared error).
printfn "%s" $"""
METRICS EVALUATE MODEL
RSquared ([0,1] close to 1 better): %.5f{metrics.RSquared}
RMS err (lowest the better): %.5f{metrics.RootMeanSquaredError}
"""
@jkone27
Copy link

jkone27 commented Oct 15, 2023

awesome! thanks a lot for your kindness, F#❤️
image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment