Skip to content

Instantly share code, notes, and snippets.

@berewt
Created June 17, 2018 19:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save berewt/6c2d3bbf98a5df8fde546594d903581c to your computer and use it in GitHub Desktop.
Save berewt/6c2d3bbf98a5df8fde546594d903581c to your computer and use it in GitHub Desktop.
Data pre-processing in Idris with version 0.0.1 of Leon
module Audit
import Control.Monad.Syntax
import Data.Fin
import Data.String
import Data.Vect
import Data.Vect.Sub
import Leon.DataFrame.Columns
import Leon.DataFrame.Row
%default total
-- Initial type
-- Names of the twelve columns of the audit data, in column order.
public export
auditFields : Vect 12 String
auditFields =
  [ "ID", "Age", "Employment", "Education"
  , "Marital", "Occupation", "Gender", "Deductions"
  , "Hours", "IGNORE_Accounts", "RISK_Adjustment", "TARGET_Adjusted"
  ]
-- Type of a header whose columns are named by `auditFields`.
-- The name is written fully qualified (`Audit.auditFields`), as in the
-- original declaration.
public export
AuditHeader : Type
AuditHeader = Header Audit.auditFields
-- Starting header: each of the twelve audit columns has type String.
public export
InitialHeader : AuditHeader
InitialHeader =
  [ String, String, String, String
  , String, String, String, String
  , String, String, String, String
  ]
-- Step 1: the "ID" column should hold a natural number.
-- This intermediate header is spelled out only to show the steps one by one;
-- it could be left out if the transformations were chained directly.
-- NOTE(review): `set` comes from Leon; presumably it replaces the type
-- recorded for the named column and leaves the others alone -- confirm
-- against Leon.DataFrame.Columns.
public export
Step1Header : AuditHeader
Step1Header = set "ID" (Maybe Nat) InitialHeader
-- Step 1 on a single row: takes a row typed by InitialHeader and returns one
-- typed by Step1Header, where "ID" is a `Maybe Nat` instead of a String.
-- The conversion function handed to the row update is `parsePositive`.
-- `update` is written qualified as `Row.update`, exactly like step2 and step4,
-- so the definition does not depend on type-directed name disambiguation.
-- NOTE(review): presumably `Row.update` applies the function to the named
-- field only -- confirm against Leon.DataFrame.Row.
step1 : Record InitialHeader -> Record Step1Header
step1 = Row.update "ID" parsePositive
-- Step 2: the "Age" column should hold a natural number strictly below 100,
-- represented as a `Fin 100` wrapped in `Maybe`.
-- NOTE(review): `set` comes from Leon; presumably it replaces the type
-- recorded for the named column -- confirm against Leon.DataFrame.Columns.
public export
Step2Header : AuditHeader
Step2Header = set "Age" (Maybe (Fin 100)) Step1Header
-- Bounds-checked conversion of a natural number to `Fin 100`, delegated to
-- `natToFin` with the bound fixed at 100: numbers that fit in `Fin 100` come
-- back wrapped in `Just`, anything from 100 upwards gives `Nothing`.
lessThan100 : Nat -> Maybe (Fin 100)
lessThan100 candidate = candidate `natToFin` 100
-- Step 2 on a single row: takes a row typed by Step1Header and returns one
-- typed by Step2Header. The function given for the "Age" field first runs
-- `parsePositive` on the text and then feeds a successful result to
-- `lessThan100`; a `Nothing` from either stage is the final result.
step2 : Record Step1Header -> Record Step2Header
step2 = Row.update "Age" (\raw => parsePositive raw >>= lessThan100)
-- Step 3: keep only the "ID", "Gender" and "Age" columns, in that order
-- (the order is fixed by the list in the type below).
-- NOTE(review): `keepOnly` here is the header-level function from Leon;
-- presumably it projects the header onto the listed columns -- confirm
-- against Leon.DataFrame.Columns.
public export
Step3Header : Header ["ID", "Gender", "Age"]
Step3Header = keepOnly ["ID", "Gender", "Age"] Step2Header
-- Step 3 on a single row: takes a row typed by Step2Header and returns one
-- typed by Step3Header, i.e. with only the "ID", "Gender" and "Age" fields.
-- The row argument is named explicitly rather than written point-free,
-- as in the original definition.
step3 : Record Step2Header -> Record Step3Header
step3 row = Row.keepOnly ["ID", "Gender", "Age"] row
-- Step 4: only two values are accepted in the "Gender" column.
-- `Gender` enumerates them.
public export
data Gender : Type where
  Male   : Gender
  Female : Gender
-- Header after step 4: same three columns as Step3Header, with the "Gender"
-- column typed as `Maybe Gender`.
-- NOTE(review): `set` comes from Leon; presumably it replaces the type
-- recorded for the named column -- confirm against Leon.DataFrame.Columns.
public export
Step4Header : Header ["ID", "Gender", "Age"]
Step4Header = set "Gender" (Maybe Gender) Step3Header
-- Read a gender from its textual form. Only the exact strings "Male" and
-- "Female" are recognised; every other input yields `Nothing`.
parseGender : String -> Maybe Gender
parseGender raw =
  case raw of
    "Male"   => Just Male
    "Female" => Just Female
    _        => Nothing
-- Step 4 on a single row: takes a row typed by Step3Header and returns one
-- typed by Step4Header, passing `parseGender` as the function for the
-- "Gender" field.
step4 : Record Step3Header -> Record Step4Header
step4 row = Row.update "Gender" parseGender row
-- Final step: put everything together.
-- Header of the fully processed rows, written out explicitly: "ID" is a
-- `Maybe Nat`, "Gender" a `Maybe Gender` and "Age" a `Maybe (Fin 100)`.
-- Marked `public export` like every other header alias in this module, so
-- that it is visible (and reducible) from importing modules as well.
public export
FinalHeader : Header ["ID", "Gender", "Age"]
FinalHeader = [Maybe Nat, Maybe Gender, Maybe (Fin 100)]
-- Whole pre-processing of one row: step1, then step2, then step3, then step4,
-- taking a row typed by InitialHeader to a row typed by FinalHeader.
pipeline : Record InitialHeader -> Record FinalHeader
pipeline row = step4 (step3 (step2 (step1 row)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment