Skip to content

Instantly share code, notes, and snippets.

@derrickturk
Last active June 19, 2020 15:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save derrickturk/bb078cd650f5e7e36fe8792b89157576 to your computer and use it in GitHub Desktop.
Save derrickturk/bb078cd650f5e7e36fe8792b89157576 to your computer and use it in GitHub Desktop.
I love Idris - tafrae revisited
-- dependently-typed dataframes from scratch
module Tafra
%access public export
%hide Nat -- we'll roll our own, this is Carl Sagan's apple pie
-- natural numbers a la Peano
data Nat : Type where
Z : Nat
S : Nat -> Nat
-- addition, by recursion on the left
plus : Nat -> Nat -> Nat
plus Z m = m
plus (S n) m = S (plus n m)
-- "finite sets" - naturals less than a given number
data Fin : Nat -> Type where
FZ : Fin (S n) -- zero is less than any successor
FS : Fin n -> Fin (S n) -- by induction
-- "vectors" - known-length homogeneous sequences
namespace Vect
data Vect : Nat -> Type -> Type where
Nil : Vect Z a
(::) : a -> Vect n a -> Vect (S n) a
-- bounds-checked element access
index : Fin n -> Vect n a -> a
index FZ (x :: _) = x
index (FS i) (_ :: xs) = index i xs
-- length-tracking concatenation
(++) : Vect n a -> Vect m a -> Vect (plus n m) a
[] ++ ys = ys
(x :: xs) ++ ys = x :: xs ++ ys
-- we can map over vectors, preserving type
Functor (Vect n) where
map f [] = []
map f (x :: xs) = f x :: map f xs
-- we'll want this later to implement "slicing"
vslice : Vect k (Fin n) -> Vect n a -> Vect k a
vslice [] _ = []
vslice (i :: is) xs = index i xs :: vslice is xs
-- we'll want this to walk two parallel vectors in "lock-step"
zip : Vect n a -> Vect n b -> Vect n (a, b)
zip [] [] = []
zip (x::xs) (y::ys) = (x, y)::zip xs ys
-- the types of "row transformer" functions for a given Tafra
RowTransformer : Vect n Type -> Vect n Type -> Vect n Type
RowTransformer [] [] = []
RowTransformer (a :: as) (b :: bs) = (a -> b) :: RowTransformer as bs
-- heterogeneous vectors - we'll need these to represent "rows"
namespace HVect
data HVect : Vect n Type -> Type where
Nil : HVect []
(::) : a -> HVect as -> HVect (a::as)
-- bounds-checked element access, with appropriate return type
index : {n : Nat} ->
{as : Vect n Type} ->
(i : Fin n) ->
HVect as ->
index i as
index FZ [] impossible
index (FS _) [] impossible
index FZ (x :: _) = x
index (FS i) (_ :: xs) = index i xs
-- HVects can be compared for equality, when their components
-- are comparable - we'll need this for "group by"
Eq (HVect []) where
[] == [] = True
(Eq a, Eq (HVect as)) => Eq (HVect (a::as)) where
(x :: xs) == (y :: ys) = x == y && xs == ys
-- length- and type-tracking concatenation
(++) : HVect as -> HVect bs -> HVect (as ++ bs)
[] ++ ys = ys
(x :: xs) ++ ys = x :: xs ++ ys
-- first element only
head : HVect (a::as) -> a
head (x :: _) = x
-- all but first element
tail : HVect (a::as) -> HVect as
tail (_ :: xs) = xs
-- "ap"ply an HVect of functions
ap : {as : Vect n Type} ->
{bs : Vect n Type} ->
HVect (RowTransformer as bs) ->
HVect as ->
HVect bs
ap {as = []} {bs = []} [] [] = []
ap {as = _ :: _} {bs = _ :: _} (f :: fs) (x :: xs) = f x :: ap fs xs
namespace Tafra
-- a "tafra" is a list of same-length vectors of various types
-- the type is indexed by the row count and a vector of column types
data Tafra : (rows : Nat) -> Vect cols Type -> Type where
Nil : Tafra rows []
(::) : Vect rows a -> Tafra rows as -> Tafra rows (a::as)
-- we can concatenate tafras with the same column types and different
-- row counts "vertically"
(++) : Tafra n as -> Tafra m as -> Tafra (plus n m) as
[] ++ [] = []
(topFirst :: topRest) ++ (botFirst :: botRest) =
Vect.(++) topFirst botFirst :: topRest ++ botRest
-- we can concatenate tafras with different column types but the same row
-- counts "horizontally"
hcat : Tafra n as -> Tafra n bs -> Tafra n (as ++ bs)
hcat [] y = y
hcat (x :: xs) y = x :: hcat xs y
-- indexing by columns is simple, and returns a column vector
index : {cols: Nat} ->
{as : Vect cols Type} ->
(i : Fin cols) ->
Tafra rows as ->
Vect rows (index i as)
index FZ [] impossible
index (FS _) [] impossible
index FZ (x :: _) = x
index (FS i) (_ :: xs) = index i xs
-- indexing by rows is trickier - we have to return a heterogeneous "tuple"
rindex : Fin rows -> Tafra rows as -> HVect as
rindex _ [] = []
rindex i (x :: xs) = index i x :: rindex i xs
-- we can also "slice" by multiple row or column indices
-- we'll do this a little inefficiently
-- "column" slices
slice : {cols : Nat} ->
{as : Vect cols Type} ->
(ixs : Vect k (Fin cols)) ->
Tafra rows as ->
Tafra rows (vslice ixs as)
slice [] _ = []
slice (i :: is) xs = index i xs :: slice is xs
-- "row" slices
rslice : {rows : Nat} ->
(ixs : Vect k (Fin rows)) ->
Tafra rows as ->
Tafra k as
rslice _ [] = []
rslice [] (_ :: xs) = [] :: rslice [] xs
rslice is (x :: xs) = vslice is x :: rslice is xs
-- just the first row
rhead : Tafra (S n) as -> HVect as
rhead [] = []
rhead ((x :: _) :: ys) = x :: rhead ys
-- all rows but the first
rtail : Tafra (S n) as -> Tafra n as
rtail [] = []
rtail ((_ :: xs) :: ys) = xs :: rtail ys
-- combine the "head" and the "tail" (we'll need this to
-- correctly type "group by"
rcons : HVect as -> Tafra n as -> Tafra (S n) as
rcons [] [] = []
rcons (x :: xs) (r :: rs) = (x :: r) :: rcons xs rs
-- transpose to a row-oriented representation
transpose : {rows : Nat} -> Tafra rows as -> Vect rows (HVect as)
transpose {rows = Z} _ = []
transpose {rows = (S m)} xs = rhead xs :: transpose (rtail xs)
-- rebuild a tafra from a transposed representation
untranspose : {rows : Nat} ->
{cols : Nat} ->
{as : Vect cols Type} ->
Vect rows (HVect as) ->
Tafra rows as
untranspose {rows = Z} {as = []} [] = []
untranspose {rows = Z} {as = (t :: ts)} [] = [] :: untranspose []
untranspose {rows = (S m)} {as = []} _ = []
untranspose {rows = (S m)} {as = (t :: ts)} (xs :: yss) =
(head xs :: map head yss) :: untranspose (tail xs :: map tail yss)
-- a zero-row tafra of any type
empty : Tafra Z as
empty {as = []} = []
empty {as = (_ :: _)} = [] :: empty
-- "map" a set of row-transformer functions over a tafra
rmap : HVect (RowTransformer as bs) -> Tafra n as -> Tafra n bs
rmap fs = untranspose . map (ap fs) . transpose
-- let's create some example tafrae
someData : Tafra (S (S (S Z))) [Double, Bool, List String]
someData =
[ [23.4, 45.6, 78.9]
, [True, False, True]
, [ ["yo", "dawg"]
, ["i heard you", "like structures"]
, ["so i put structures in your structures"]
]
]
otherData : Tafra (S (S Z)) [Int, String]
otherData =
[ [99, 45]
, ["some", "text"]
]
-- instead of maps, we'll use "association lists" for associative lookup
-- the prelude helpfully provides
-- lookup : Eq a => a -> List (a, b) -> Maybe b
-- we'll define a helpful type synonym
Map : Type -> Type -> Type
Map k v = List (k, v)
-- and some additional helpers
insert : Eq k => k -> v -> Map k v -> Map k v
insert k v [] = [(k, v)]
insert k v (x@(k', _) :: xs) =
if k == k'
then (k, v) :: xs
else x :: insert k v xs
-- we're almost ready to write "group by"
-- first, let's prove we can collect rows by unique combination of group-by
-- column values
-- an "accumulating" function we'll use to collect rows by unique combination
accumRows : Eq k =>
Map k (List v) ->
Vect n (k, v) ->
Map k (List v)
accumRows map [] = map
accumRows map ((b, g) :: rest) = case lookup b map of
Nothing => accumRows (insert b [g] map) rest
-- for performance, we're going to build these "backward"
(Just xs) => accumRows (insert b (g::xs) map) rest
-- this is also the only example I have ever seen of explicitly passing
-- a "locally named" interface implementation
rowsByUnique : (is : Vect groups (Fin cols)) -> -- indices of group-by columns
(js : Vect aggs (Fin cols)) -> -- indices of aggregated-over columns
{auto prf: Eq (HVect (vslice is as))} ->
Tafra rows as ->
Map (HVect (vslice is as)) (List (HVect (vslice js as)))
rowsByUnique {prf} is js xs =
accumRows @{prf} [] (zip (transpose (slice is xs)) (transpose (slice js xs)))
-- ok, let's type group by
groupBy : {cols : Nat} ->
{rows : Nat} ->
{as : Vect cols Type} ->
{groups : Nat} ->
{aggs : Nat} ->
-- indices of group-by columns
(is : Vect groups (Fin cols)) ->
-- indices of aggregated-over columns
(js : Vect aggs (Fin cols)) ->
-- aggregation function
(aggFn : List (HVect (vslice js as)) -> HVect bs) ->
-- a proof that group-by columns can be compared for equality
-- (automatically found by compiler)
{auto prf: Eq (HVect (vslice is as))} ->
-- data frame to group-by
Tafra rows as ->
-- result type: an existentially quantified tafra of some
-- row count, with columns from the group-by columns and
-- the result of the aggregation function
(k : Nat ** Tafra k (vslice is as ++ bs))
groupBy is js fn xs = applyAggregation fn (rowsByUnique is js xs) where
applyAggregation : (List (HVect gs) -> HVect rs) ->
Map (HVect as) (List (HVect gs)) ->
(k : Nat ** Tafra k (as ++ rs))
applyAggregation f [] = (Z ** empty)
applyAggregation f ((k, rs) :: rest) with (applyAggregation f rest)
applyAggregation f ((k, rs) :: _) | (n ** restAgg) =
(S n ** rcons (k ++ f rs) restAgg)
-- select
-- col2,
-- join('\n', str(col1) & join(' ', col3))
-- from someData
-- group by col2
exampleGroupBy : (k : Nat ** Tafra k [Bool, String])
exampleGroupBy = groupBy
[FS FZ]
[FZ, FS (FS FZ)]
((::[]) . unlines . map (\[num, strs] => unwords (show num :: strs)))
someData
exampleMapped : Tafra (S (S (S Z))) [Double, Bool, String]
exampleMapped = rmap [(/ 3.0), not, unwords] someData
module TafraParser
import Tafra
%access public export
charDigit : Char -> Maybe Int
charDigit x = if isDigit x
then Just (ord x - ord '0')
else Nothing
interface ParseMaybe a where
parseMaybe : String -> Maybe a
ParseMaybe String where
parseMaybe = Just
ParseMaybe Bool where
parseMaybe "True" = Just True
parseMaybe "False" = Just False
parseMaybe _ = Nothing
ParseMaybe Int where
parseMaybe str with (strM str)
parseMaybe "" | StrNil = Nothing
parseMaybe (strCons x xs) | (StrCons x xs) = do
val <- charDigit x
go val xs
where
go val str with (strM str)
go val "" | StrNil = Just val
go val (strCons y ys) | (StrCons y ys) = do
next <- charDigit y
go (val * 10 + next) ys
ParseMaybe Integer where
parseMaybe str with (strM str)
parseMaybe "" | StrNil = Nothing
parseMaybe (strCons x xs) | (StrCons x xs) = do
val <- cast <$> charDigit x
go val xs
where
go : Integer -> String -> Maybe Integer
go val str with (strM str)
go val "" | StrNil = Just val
go val (strCons y ys) | (StrCons y ys) = do
next <- cast <$> charDigit y
go (val * 10 + next) ys
ParseMaybe Double where
parseMaybe str with (strM str)
parseMaybe "" | StrNil = Nothing
parseMaybe (strCons x xs) | (StrCons x xs) = do
val <- cast <$> charDigit x
go False val 0 xs
where
go : Bool -> Integer -> Integer -> String -> Maybe Double
go afterDecimal whole frac str with (strM str)
-- this is a little lazy...
go afterDecimal whole frac "" | StrNil =
Just (cast (show whole ++ "." ++ show frac))
go afterDecimal whole frac (strCons y ys) | (StrCons y ys) =
if y == '.'
then go True whole frac ys
else do
next <- cast <$> charDigit y
if afterDecimal
then go afterDecimal whole (frac * 10 + next) ys
else go afterDecimal (whole * 10 + next) frac ys
ParseMaybe (HVect []) where
parseMaybe "" = Just []
parseMaybe _ = Nothing
(ParseMaybe a, ParseMaybe (HVect as)) => ParseMaybe (HVect (a::as)) where
parseMaybe xs with (split (== ',') xs)
parseMaybe xs | [] = Nothing
parseMaybe xs | (y :: ys) = do
first <- parseMaybe (trim y)
rest <- parseMaybe (concat (intersperse "," ys))
pure (first::rest)
listToVect : List a -> (n : Nat ** Vect n a)
listToVect [] = (Z ** [])
listToVect (x :: xs) with (listToVect xs)
listToVect (x :: xs) | (n ** v) = (S n ** x :: v)
-- skip header and parse each row
parseCSV' : ParseMaybe (HVect as) => List String -> Maybe (n : Nat ** Tafra n as)
parseCSV' @{c} csvLines = case nonEmpty csvLines of
(Yes prf) => do
(n ** rows) <- listToVect <$> traverse (parseMaybe @{c}) (tail csvLines)
pure (n ** untranspose rows)
(No _) => Nothing
parseCSV : ParseMaybe (HVect as) => String -> Maybe (n : Nat ** Tafra n as)
parseCSV = parseCSV' . lines
exampleCSV : String
exampleCSV = "age,name\n123,bobby\n456,sally\n"
exampleFrame : Maybe (n : Nat ** Tafra n [Int, String])
exampleFrame = parseCSV exampleCSV
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment