Skip to content

Instantly share code, notes, and snippets.

@nhirschey
Created May 5, 2022 14:47
Show Gist options
  • Save nhirschey/bbf4b7344f42921130b6848f3a6b0196 to your computer and use it in GitHub Desktop.
Save nhirschey/bbf4b7344f42921130b6848f3a6b0196 to your computer and use it in GitHub Desktop.
ML.NET CV repro
#r "nuget:FSharp.Stats"
#r "nuget: Microsoft.ML, 1.7.*"
#r "nuget: Microsoft.ML.FastTree"
#r "nuget: FSharp.Data"
#r "nuget: Plotly.NET, 2.0.*"
open System
open System.IO
open System.IO.Compression
open System.Text.Json
open System.Net
open System
open FSharp.Data
open FSharp.Stats
open Plotly.NET
open Microsoft.ML
open Microsoft.ML.Data
open Microsoft.ML.Transforms.Text
// Make the script's own directory the working directory so that relative
// paths used below (e.g. the "data" folder) resolve next to this script.
Environment.CurrentDirectory <- __SOURCE_DIRECTORY__
/// Downloads the content at `inputUrl` into `outputFile`.
///
/// The directory part of `outputFile` is created when it is non-empty.
/// If `outputFile` already exists, nothing is downloaded and a message is printed.
/// Any exception raised by the HTTP request or the file system propagates to the caller.
let download (inputUrl:string) (outputFile:string) =
    // Path.GetDirectoryName yields "" (or null) for a bare file name, and
    // Directory.CreateDirectory rejects an empty path, so only create when present.
    let outputDir = Path.GetDirectoryName(outputFile)
    if not (String.IsNullOrEmpty(outputDir)) then
        Directory.CreateDirectory(outputDir) |> ignore
    if IO.File.Exists(outputFile) then
        printfn $"The file {outputFile} already exists. Skipping download"
    else
        let web = Http.RequestStream(inputUrl)
        // Dispose the response stream too, not only the file stream.
        use responseStream = web.ResponseStream
        // Write to a temporary sibling first: an interrupted download must not
        // leave a truncated `outputFile` that later runs would treat as complete.
        let partialFile = outputFile + ".part"
        using (IO.File.Create(partialFile)) (fun fileStream ->
            responseStream.CopyTo(fileStream))
        IO.File.Move(partialFile, outputFile)
/// Decompresses the gzip file `inputFile` into `outputFile`.
///
/// The directory part of `outputFile` is created when it is non-empty, and an
/// existing `outputFile` is overwritten. Exceptions from opening or reading
/// `inputFile` (for example when it does not exist) propagate to the caller.
let gunzip (inputFile:string) (outputFile:string) =
    // Path.GetDirectoryName yields "" (or null) for a bare file name, and
    // Directory.CreateDirectory rejects an empty path, so only create when present.
    let outputDir = Path.GetDirectoryName(outputFile)
    if not (String.IsNullOrEmpty(outputDir)) then
        Directory.CreateDirectory(outputDir) |> ignore
    // Open the input before touching the output so that a missing input
    // does not destroy a previously decompressed file.
    use inputStream = File.OpenRead(inputFile)
    use gzipStream = new GZipStream(inputStream, CompressionMode.Decompress)
    // File.Create truncates an existing file, so no explicit delete is needed.
    use outputStream = File.Create(outputFile)
    gzipStream.CopyTo(outputStream)
// Remote location of the gzipped dataset.
let nq100FullUrl = "https://www.dropbox.com/s/izcsjp06lgwbauu/Nasdaq100CallFull.json.gz?dl=1"

// Local layout: the decompressed JSON and its ".gz" archive live side by side
// inside the data folder.
let dataFolder = "data"
let nqFullFile = Path.Combine(dataFolder, "Nasdaq100CallFull.json")
let nq100FullFileGz = nqFullFile + ".gz"

// Fetch the archive, then unpack it to the JSON file.
(nq100FullUrl, nq100FullFileGz) ||> download
(nq100FullFileGz, nqFullFile) ||> gunzip
/// Key identifying a single call: the company's ticker and exchange,
/// the fiscal quarter, and the date.
type CallId =
    {
        Ticker: string
        Exchange: string
        FiscalQuarter: int
        Date: DateTime
    }
/// One record of the dataset: the call's identifier, its text sections
/// (header, prepared remarks, questions and answers) and a numeric label.
type CallFull =
    {
        CallId: CallId
        Header: string
        PreparedRemarks: string
        QuestionsAndAnswers: string
        Label: float
    }
/// All calls parsed from the decompressed JSON file.
let nq100Full =
    let json = File.ReadAllText(nqFullFile)
    JsonSerializer.Deserialize<List<CallFull>>(json)
/// Training row for the binary classifier: a boolean label and the raw text.
/// CLIMutable gives the record a default constructor and settable properties.
[<CLIMutable>]
type BinarySentimentInput =
    {
        Label: bool
        Text: string
    }
/// Prediction row: predicted label together with a probability and a raw score.
/// NOTE(review): the field names presumably mirror the binary-classification
/// output columns of ML.NET; confirm before renaming.
[<CLIMutable>]
type BinarySentimentOutput =
    {
        PredictedLabel: bool
        Probability: single
        Score: single
    }
/// Shared ML.NET context, created with a fixed seed of 1.
let ctx = MLContext(seed = 1)
/// Data view for training: each call becomes a row whose label is true when
/// the call's numeric label is positive and whose text is the Q&A section.
let nq100FullSentiment =
    let rows =
        seq {
            for call in nq100Full ->
                { Label = call.Label > 0.0
                  Text = call.QuestionsAndAnswers }
        }
    ctx.Data.LoadFromEnumerable(rows)
/// Text featurization step: reads the "Text" column and writes "Features".
let featurizePipeline =
    let textTransforms = ctx.Transforms.Text
    textTransforms.FeaturizeText(outputColumnName = "Features", inputColumnName = "Text")
/// FastTree binary-classification trainer using the "Label" and "Features" columns.
let treeTrainer =
    let trainers = ctx.BinaryClassification.Trainers
    trainers.FastTree(labelColumnName = "Label", featureColumnName = "Features")
// Full training pipeline: the text featurization step followed by the FastTree trainer.
let treePipeline = featurizePipeline.Append(treeTrainer)
/// Views `pipeline` as an `IEstimator<ITransformer>`.
/// Fails with an explanatory message when the runtime value does not
/// implement that interface.
let downcastPipeline (pipeline : IEstimator<'a>) : IEstimator<ITransformer> =
    match box pipeline with
    | :? IEstimator<ITransformer> as estimator -> estimator
    | _ -> failwith "The pipeline has to be an instance of IEstimator<ITransformer>."
/// Results of 5-fold cross-validation of the tree pipeline on the sentiment
/// data, run with a seed of 1.
let cvResults =
    let pipeline = downcastPipeline treePipeline
    ctx.BinaryClassification.CrossValidate(
        data = nq100FullSentiment,
        estimator = pipeline,
        numberOfFolds = 5,
        seed = 1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment