I love Idris - tafrae revisited
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- dependently-typed dataframes from scratch | |
module Tafra | |
%access public export | |
%hide Nat -- we'll roll our own, this is Carl Sagan's apple pie | |
-- natural numbers a la Peano | |
data Nat : Type where | |
Z : Nat | |
S : Nat -> Nat | |
-- addition, by recursion on the left | |
plus : Nat -> Nat -> Nat | |
plus Z m = m | |
plus (S n) m = S (plus n m) | |
-- "finite sets" - naturals less than a given number | |
data Fin : Nat -> Type where | |
FZ : Fin (S n) -- zero is less than any successor | |
FS : Fin n -> Fin (S n) -- by induction | |
-- "vectors" - known-length homogeneous sequences | |
namespace Vect | |
data Vect : Nat -> Type -> Type where | |
Nil : Vect Z a | |
(::) : a -> Vect n a -> Vect (S n) a | |
-- bounds-checked element access | |
index : Fin n -> Vect n a -> a | |
index FZ (x :: _) = x | |
index (FS i) (_ :: xs) = index i xs | |
-- length-tracking concatenation | |
(++) : Vect n a -> Vect m a -> Vect (plus n m) a | |
[] ++ ys = ys | |
(x :: xs) ++ ys = x :: xs ++ ys | |
-- we can map over vectors, preserving type | |
Functor (Vect n) where | |
map f [] = [] | |
map f (x :: xs) = f x :: map f xs | |
-- we'll want this later to implement "slicing" | |
vslice : Vect k (Fin n) -> Vect n a -> Vect k a | |
vslice [] _ = [] | |
vslice (i :: is) xs = index i xs :: vslice is xs | |
-- we'll want this to walk two parallel vectors in "lock-step" | |
zip : Vect n a -> Vect n b -> Vect n (a, b) | |
zip [] [] = [] | |
zip (x::xs) (y::ys) = (x, y)::zip xs ys | |
-- the types of "row transformer" functions for a given Tafra | |
RowTransformer : Vect n Type -> Vect n Type -> Vect n Type | |
RowTransformer [] [] = [] | |
RowTransformer (a :: as) (b :: bs) = (a -> b) :: RowTransformer as bs | |
-- heterogeneous vectors - we'll need these to represent "rows" | |
namespace HVect | |
data HVect : Vect n Type -> Type where | |
Nil : HVect [] | |
(::) : a -> HVect as -> HVect (a::as) | |
-- bounds-checked element access, with appropriate return type | |
index : {n : Nat} -> | |
{as : Vect n Type} -> | |
(i : Fin n) -> | |
HVect as -> | |
index i as | |
index FZ [] impossible | |
index (FS _) [] impossible | |
index FZ (x :: _) = x | |
index (FS i) (_ :: xs) = index i xs | |
-- HVects can be compared for equality, when their components | |
-- are comparable - we'll need this for "group by" | |
Eq (HVect []) where | |
[] == [] = True | |
(Eq a, Eq (HVect as)) => Eq (HVect (a::as)) where | |
(x :: xs) == (y :: ys) = x == y && xs == ys | |
-- length- and type-tracking concatenation | |
(++) : HVect as -> HVect bs -> HVect (as ++ bs) | |
[] ++ ys = ys | |
(x :: xs) ++ ys = x :: xs ++ ys | |
-- first element only | |
head : HVect (a::as) -> a | |
head (x :: _) = x | |
-- all but first element | |
tail : HVect (a::as) -> HVect as | |
tail (_ :: xs) = xs | |
-- "ap"ply an HVect of functions | |
ap : {as : Vect n Type} -> | |
{bs : Vect n Type} -> | |
HVect (RowTransformer as bs) -> | |
HVect as -> | |
HVect bs | |
ap {as = []} {bs = []} [] [] = [] | |
ap {as = _ :: _} {bs = _ :: _} (f :: fs) (x :: xs) = f x :: ap fs xs | |
namespace Tafra | |
-- a "tafra" is a list of same-length vectors of various types | |
-- the type is indexed by the row count and a vector of column types | |
data Tafra : (rows : Nat) -> Vect cols Type -> Type where | |
Nil : Tafra rows [] | |
(::) : Vect rows a -> Tafra rows as -> Tafra rows (a::as) | |
-- we can concatenate tafras with the same column types and different | |
-- row counts "vertically" | |
(++) : Tafra n as -> Tafra m as -> Tafra (plus n m) as | |
[] ++ [] = [] | |
(topFirst :: topRest) ++ (botFirst :: botRest) = | |
Vect.(++) topFirst botFirst :: topRest ++ botRest | |
-- we can concatenate tafras with different column types but the same row | |
-- counts "horizontally" | |
hcat : Tafra n as -> Tafra n bs -> Tafra n (as ++ bs) | |
hcat [] y = y | |
hcat (x :: xs) y = x :: hcat xs y | |
-- indexing by columns is simple, and returns a column vector | |
index : {cols: Nat} -> | |
{as : Vect cols Type} -> | |
(i : Fin cols) -> | |
Tafra rows as -> | |
Vect rows (index i as) | |
index FZ [] impossible | |
index (FS _) [] impossible | |
index FZ (x :: _) = x | |
index (FS i) (_ :: xs) = index i xs | |
-- indexing by rows is trickier - we have to return a heterogeneous "tuple" | |
rindex : Fin rows -> Tafra rows as -> HVect as | |
rindex _ [] = [] | |
rindex i (x :: xs) = index i x :: rindex i xs | |
-- we can also "slice" by multiple row or column indices | |
-- we'll do this a little inefficiently | |
-- "column" slices | |
slice : {cols : Nat} -> | |
{as : Vect cols Type} -> | |
(ixs : Vect k (Fin cols)) -> | |
Tafra rows as -> | |
Tafra rows (vslice ixs as) | |
slice [] _ = [] | |
slice (i :: is) xs = index i xs :: slice is xs | |
-- "row" slices | |
rslice : {rows : Nat} -> | |
(ixs : Vect k (Fin rows)) -> | |
Tafra rows as -> | |
Tafra k as | |
rslice _ [] = [] | |
rslice [] (_ :: xs) = [] :: rslice [] xs | |
rslice is (x :: xs) = vslice is x :: rslice is xs | |
-- just the first row | |
rhead : Tafra (S n) as -> HVect as | |
rhead [] = [] | |
rhead ((x :: _) :: ys) = x :: rhead ys | |
-- all rows but the first | |
rtail : Tafra (S n) as -> Tafra n as | |
rtail [] = [] | |
rtail ((_ :: xs) :: ys) = xs :: rtail ys | |
-- combine the "head" and the "tail" (we'll need this to | |
-- correctly type "group by" | |
rcons : HVect as -> Tafra n as -> Tafra (S n) as | |
rcons [] [] = [] | |
rcons (x :: xs) (r :: rs) = (x :: r) :: rcons xs rs | |
-- transpose to a row-oriented representation | |
transpose : {rows : Nat} -> Tafra rows as -> Vect rows (HVect as) | |
transpose {rows = Z} _ = [] | |
transpose {rows = (S m)} xs = rhead xs :: transpose (rtail xs) | |
-- rebuild a tafra from a transposed representation | |
untranspose : {rows : Nat} -> | |
{cols : Nat} -> | |
{as : Vect cols Type} -> | |
Vect rows (HVect as) -> | |
Tafra rows as | |
untranspose {rows = Z} {as = []} [] = [] | |
untranspose {rows = Z} {as = (t :: ts)} [] = [] :: untranspose [] | |
untranspose {rows = (S m)} {as = []} _ = [] | |
untranspose {rows = (S m)} {as = (t :: ts)} (xs :: yss) = | |
(head xs :: map head yss) :: untranspose (tail xs :: map tail yss) | |
-- a zero-row tafra of any type | |
empty : Tafra Z as | |
empty {as = []} = [] | |
empty {as = (_ :: _)} = [] :: empty | |
-- "map" a set of row-transformer functions over a tafra | |
rmap : HVect (RowTransformer as bs) -> Tafra n as -> Tafra n bs | |
rmap fs = untranspose . map (ap fs) . transpose | |
-- let's create some example tafrae | |
someData : Tafra (S (S (S Z))) [Double, Bool, List String] | |
someData = | |
[ [23.4, 45.6, 78.9] | |
, [True, False, True] | |
, [ ["yo", "dawg"] | |
, ["i heard you", "like structures"] | |
, ["so i put structures in your structures"] | |
] | |
] | |
otherData : Tafra (S (S Z)) [Int, String] | |
otherData = | |
[ [99, 45] | |
, ["some", "text"] | |
] | |
-- instead of maps, we'll use "association lists" for associative lookup | |
-- the prelude helpfully provides | |
-- lookup : Eq a => a -> List (a, b) -> Maybe b | |
-- we'll define a helpful type synonym | |
Map : Type -> Type -> Type | |
Map k v = List (k, v) | |
-- and some additional helpers | |
insert : Eq k => k -> v -> Map k v -> Map k v | |
insert k v [] = [(k, v)] | |
insert k v (x@(k', _) :: xs) = | |
if k == k' | |
then (k, v) :: xs | |
else x :: insert k v xs | |
-- we're almost ready to write "group by" | |
-- first, let's prove we can collect rows by unique combination of group-by | |
-- column values | |
-- an "accumulating" function we'll use to collect rows by unique combination | |
accumRows : Eq k => | |
Map k (List v) -> | |
Vect n (k, v) -> | |
Map k (List v) | |
accumRows map [] = map | |
accumRows map ((b, g) :: rest) = case lookup b map of | |
Nothing => accumRows (insert b [g] map) rest | |
-- for performance, we're going to build these "backward" | |
(Just xs) => accumRows (insert b (g::xs) map) rest | |
-- this is also the only example I have ever seen of explicitly passing | |
-- a "locally named" interface implementation | |
rowsByUnique : (is : Vect groups (Fin cols)) -> -- indices of group-by columns | |
(js : Vect aggs (Fin cols)) -> -- indices of aggregated-over columns | |
{auto prf: Eq (HVect (vslice is as))} -> | |
Tafra rows as -> | |
Map (HVect (vslice is as)) (List (HVect (vslice js as))) | |
rowsByUnique {prf} is js xs = | |
accumRows @{prf} [] (zip (transpose (slice is xs)) (transpose (slice js xs))) | |
-- ok, let's type group by | |
groupBy : {cols : Nat} -> | |
{rows : Nat} -> | |
{as : Vect cols Type} -> | |
{groups : Nat} -> | |
{aggs : Nat} -> | |
-- indices of group-by columns | |
(is : Vect groups (Fin cols)) -> | |
-- indices of aggregated-over columns | |
(js : Vect aggs (Fin cols)) -> | |
-- aggregation function | |
(aggFn : List (HVect (vslice js as)) -> HVect bs) -> | |
-- a proof that group-by columns can be compared for equality | |
-- (automatically found by compiler) | |
{auto prf: Eq (HVect (vslice is as))} -> | |
-- data frame to group-by | |
Tafra rows as -> | |
-- result type: an existentially quantified tafra of some | |
-- row count, with columns from the group-by columns and | |
-- the result of the aggregation function | |
(k : Nat ** Tafra k (vslice is as ++ bs)) | |
groupBy is js fn xs = applyAggregation fn (rowsByUnique is js xs) where | |
applyAggregation : (List (HVect gs) -> HVect rs) -> | |
Map (HVect as) (List (HVect gs)) -> | |
(k : Nat ** Tafra k (as ++ rs)) | |
applyAggregation f [] = (Z ** empty) | |
applyAggregation f ((k, rs) :: rest) with (applyAggregation f rest) | |
applyAggregation f ((k, rs) :: _) | (n ** restAgg) = | |
(S n ** rcons (k ++ f rs) restAgg) | |
-- select | |
-- col2, | |
-- join('\n', str(col1) & join(' ', col3)) | |
-- from someData | |
-- group by col2 | |
exampleGroupBy : (k : Nat ** Tafra k [Bool, String]) | |
exampleGroupBy = groupBy | |
[FS FZ] | |
[FZ, FS (FS FZ)] | |
((::[]) . unlines . map (\[num, strs] => unwords (show num :: strs))) | |
someData | |
exampleMapped : Tafra (S (S (S Z))) [Double, Bool, String] | |
exampleMapped = rmap [(/ 3.0), not, unwords] someData |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module TafraParser | |
import Tafra | |
%access public export | |
charDigit : Char -> Maybe Int | |
charDigit x = if isDigit x | |
then Just (ord x - ord '0') | |
else Nothing | |
interface ParseMaybe a where | |
parseMaybe : String -> Maybe a | |
ParseMaybe String where | |
parseMaybe = Just | |
ParseMaybe Bool where | |
parseMaybe "True" = Just True | |
parseMaybe "False" = Just False | |
parseMaybe _ = Nothing | |
ParseMaybe Int where | |
parseMaybe str with (strM str) | |
parseMaybe "" | StrNil = Nothing | |
parseMaybe (strCons x xs) | (StrCons x xs) = do | |
val <- charDigit x | |
go val xs | |
where | |
go val str with (strM str) | |
go val "" | StrNil = Just val | |
go val (strCons y ys) | (StrCons y ys) = do | |
next <- charDigit y | |
go (val * 10 + next) ys | |
ParseMaybe Integer where | |
parseMaybe str with (strM str) | |
parseMaybe "" | StrNil = Nothing | |
parseMaybe (strCons x xs) | (StrCons x xs) = do | |
val <- cast <$> charDigit x | |
go val xs | |
where | |
go : Integer -> String -> Maybe Integer | |
go val str with (strM str) | |
go val "" | StrNil = Just val | |
go val (strCons y ys) | (StrCons y ys) = do | |
next <- cast <$> charDigit y | |
go (val * 10 + next) ys | |
ParseMaybe Double where | |
parseMaybe str with (strM str) | |
parseMaybe "" | StrNil = Nothing | |
parseMaybe (strCons x xs) | (StrCons x xs) = do | |
val <- cast <$> charDigit x | |
go False val 0 xs | |
where | |
go : Bool -> Integer -> Integer -> String -> Maybe Double | |
go afterDecimal whole frac str with (strM str) | |
-- this is a little lazy... | |
go afterDecimal whole frac "" | StrNil = | |
Just (cast (show whole ++ "." ++ show frac)) | |
go afterDecimal whole frac (strCons y ys) | (StrCons y ys) = | |
if y == '.' | |
then go True whole frac ys | |
else do | |
next <- cast <$> charDigit y | |
if afterDecimal | |
then go afterDecimal whole (frac * 10 + next) ys | |
else go afterDecimal (whole * 10 + next) frac ys | |
ParseMaybe (HVect []) where | |
parseMaybe "" = Just [] | |
parseMaybe _ = Nothing | |
(ParseMaybe a, ParseMaybe (HVect as)) => ParseMaybe (HVect (a::as)) where | |
parseMaybe xs with (split (== ',') xs) | |
parseMaybe xs | [] = Nothing | |
parseMaybe xs | (y :: ys) = do | |
first <- parseMaybe (trim y) | |
rest <- parseMaybe (concat (intersperse "," ys)) | |
pure (first::rest) | |
listToVect : List a -> (n : Nat ** Vect n a) | |
listToVect [] = (Z ** []) | |
listToVect (x :: xs) with (listToVect xs) | |
listToVect (x :: xs) | (n ** v) = (S n ** x :: v) | |
-- skip header and parse each row | |
parseCSV' : ParseMaybe (HVect as) => List String -> Maybe (n : Nat ** Tafra n as) | |
parseCSV' @{c} csvLines = case nonEmpty csvLines of | |
(Yes prf) => do | |
(n ** rows) <- listToVect <$> traverse (parseMaybe @{c}) (tail csvLines) | |
pure (n ** untranspose rows) | |
(No _) => Nothing | |
parseCSV : ParseMaybe (HVect as) => String -> Maybe (n : Nat ** Tafra n as) | |
parseCSV = parseCSV' . lines | |
exampleCSV : String | |
exampleCSV = "age,name\n123,bobby\n456,sally\n" | |
exampleFrame : Maybe (n : Nat ** Tafra n [Int, String]) | |
exampleFrame = parseCSV exampleCSV |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment