Skip to content
{{ message }}

Instantly share code, notes, and snippets.

# derrickturk/Tafra.idr

Last active Jun 19, 2020
I love Idris - tafrae revisited
 -- dependently-typed dataframes from scratch module Tafra %access public export %hide Nat -- we'll roll our own, this is Carl Sagan's apple pie -- natural numbers a la Peano data Nat : Type where Z : Nat S : Nat -> Nat -- addition, by recursion on the left plus : Nat -> Nat -> Nat plus Z m = m plus (S n) m = S (plus n m) -- "finite sets" - naturals less than a given number data Fin : Nat -> Type where FZ : Fin (S n) -- zero is less than any successor FS : Fin n -> Fin (S n) -- by induction -- "vectors" - known-length homogeneous sequences namespace Vect data Vect : Nat -> Type -> Type where Nil : Vect Z a (::) : a -> Vect n a -> Vect (S n) a -- bounds-checked element access index : Fin n -> Vect n a -> a index FZ (x :: _) = x index (FS i) (_ :: xs) = index i xs -- length-tracking concatenation (++) : Vect n a -> Vect m a -> Vect (plus n m) a [] ++ ys = ys (x :: xs) ++ ys = x :: xs ++ ys -- we can map over vectors, preserving type Functor (Vect n) where map f [] = [] map f (x :: xs) = f x :: map f xs -- we'll want this later to implement "slicing" vslice : Vect k (Fin n) -> Vect n a -> Vect k a vslice [] _ = [] vslice (i :: is) xs = index i xs :: vslice is xs -- we'll want this to walk two parallel vectors in "lock-step" zip : Vect n a -> Vect n b -> Vect n (a, b) zip [] [] = [] zip (x::xs) (y::ys) = (x, y)::zip xs ys -- the types of "row transformer" functions for a given Tafra RowTransformer : Vect n Type -> Vect n Type -> Vect n Type RowTransformer [] [] = [] RowTransformer (a :: as) (b :: bs) = (a -> b) :: RowTransformer as bs -- heterogeneous vectors - we'll need these to represent "rows" namespace HVect data HVect : Vect n Type -> Type where Nil : HVect [] (::) : a -> HVect as -> HVect (a::as) -- bounds-checked element access, with appropriate return type index : {n : Nat} -> {as : Vect n Type} -> (i : Fin n) -> HVect as -> index i as index FZ [] impossible index (FS _) [] impossible index FZ (x :: _) = x index (FS i) (_ :: xs) = index i xs -- HVects can be compared for equality, when their components -- are comparable - we'll need this for "group by" Eq (HVect []) where [] == [] = True (Eq a, Eq (HVect as)) => Eq (HVect (a::as)) where (x :: xs) == (y :: ys) = x == y && xs == ys -- length- and type-tracking concatenation (++) : HVect as -> HVect bs -> HVect (as ++ bs) [] ++ ys = ys (x :: xs) ++ ys = x :: xs ++ ys -- first element only head : HVect (a::as) -> a head (x :: _) = x -- all but first element tail : HVect (a::as) -> HVect as tail (_ :: xs) = xs -- "ap"ply an HVect of functions ap : {as : Vect n Type} -> {bs : Vect n Type} -> HVect (RowTransformer as bs) -> HVect as -> HVect bs ap {as = []} {bs = []} [] [] = [] ap {as = _ :: _} {bs = _ :: _} (f :: fs) (x :: xs) = f x :: ap fs xs namespace Tafra -- a "tafra" is a list of same-length vectors of various types -- the type is indexed by the row count and a vector of column types data Tafra : (rows : Nat) -> Vect cols Type -> Type where Nil : Tafra rows [] (::) : Vect rows a -> Tafra rows as -> Tafra rows (a::as) -- we can concatenate tafras with the same column types and different -- row counts "vertically" (++) : Tafra n as -> Tafra m as -> Tafra (plus n m) as [] ++ [] = [] (topFirst :: topRest) ++ (botFirst :: botRest) = Vect.(++) topFirst botFirst :: topRest ++ botRest -- we can concatenate tafras with different column types but the same row -- counts "horizontally" hcat : Tafra n as -> Tafra n bs -> Tafra n (as ++ bs) hcat [] y = y hcat (x :: xs) y = x :: hcat xs y -- indexing by columns is simple, and returns a column vector index : {cols: Nat} -> {as : Vect cols Type} -> (i : Fin cols) -> Tafra rows as -> Vect rows (index i as) index FZ [] impossible index (FS _) [] impossible index FZ (x :: _) = x index (FS i) (_ :: xs) = index i xs -- indexing by rows is trickier - we have to return a heterogeneous "tuple" rindex : Fin rows -> Tafra rows as -> HVect as rindex _ [] = [] rindex i (x :: xs) = index i x :: rindex i xs -- we can also "slice" by multiple row or column indices -- we'll do this a little inefficiently -- "column" slices slice : {cols : Nat} -> {as : Vect cols Type} -> (ixs : Vect k (Fin cols)) -> Tafra rows as -> Tafra rows (vslice ixs as) slice [] _ = [] slice (i :: is) xs = index i xs :: slice is xs -- "row" slices rslice : {rows : Nat} -> (ixs : Vect k (Fin rows)) -> Tafra rows as -> Tafra k as rslice _ [] = [] rslice [] (_ :: xs) = [] :: rslice [] xs rslice is (x :: xs) = vslice is x :: rslice is xs -- just the first row rhead : Tafra (S n) as -> HVect as rhead [] = [] rhead ((x :: _) :: ys) = x :: rhead ys -- all rows but the first rtail : Tafra (S n) as -> Tafra n as rtail [] = [] rtail ((_ :: xs) :: ys) = xs :: rtail ys -- combine the "head" and the "tail" (we'll need this to -- correctly type "group by" rcons : HVect as -> Tafra n as -> Tafra (S n) as rcons [] [] = [] rcons (x :: xs) (r :: rs) = (x :: r) :: rcons xs rs -- transpose to a row-oriented representation transpose : {rows : Nat} -> Tafra rows as -> Vect rows (HVect as) transpose {rows = Z} _ = [] transpose {rows = (S m)} xs = rhead xs :: transpose (rtail xs) -- rebuild a tafra from a transposed representation untranspose : {rows : Nat} -> {cols : Nat} -> {as : Vect cols Type} -> Vect rows (HVect as) -> Tafra rows as untranspose {rows = Z} {as = []} [] = [] untranspose {rows = Z} {as = (t :: ts)} [] = [] :: untranspose [] untranspose {rows = (S m)} {as = []} _ = [] untranspose {rows = (S m)} {as = (t :: ts)} (xs :: yss) = (head xs :: map head yss) :: untranspose (tail xs :: map tail yss) -- a zero-row tafra of any type empty : Tafra Z as empty {as = []} = [] empty {as = (_ :: _)} = [] :: empty -- "map" a set of row-transformer functions over a tafra rmap : HVect (RowTransformer as bs) -> Tafra n as -> Tafra n bs rmap fs = untranspose . map (ap fs) . transpose -- let's create some example tafrae someData : Tafra (S (S (S Z))) [Double, Bool, List String] someData = [ [23.4, 45.6, 78.9] , [True, False, True] , [ ["yo", "dawg"] , ["i heard you", "like structures"] , ["so i put structures in your structures"] ] ] otherData : Tafra (S (S Z)) [Int, String] otherData = [ [99, 45] , ["some", "text"] ] -- instead of maps, we'll use "association lists" for associative lookup -- the prelude helpfully provides -- lookup : Eq a => a -> List (a, b) -> Maybe b -- we'll define a helpful type synonym Map : Type -> Type -> Type Map k v = List (k, v) -- and some additional helpers insert : Eq k => k -> v -> Map k v -> Map k v insert k v [] = [(k, v)] insert k v (x@(k', _) :: xs) = if k == k' then (k, v) :: xs else x :: insert k v xs -- we're almost ready to write "group by" -- first, let's prove we can collect rows by unique combination of group-by -- column values -- an "accumulating" function we'll use to collect rows by unique combination accumRows : Eq k => Map k (List v) -> Vect n (k, v) -> Map k (List v) accumRows map [] = map accumRows map ((b, g) :: rest) = case lookup b map of Nothing => accumRows (insert b [g] map) rest -- for performance, we're going to build these "backward" (Just xs) => accumRows (insert b (g::xs) map) rest -- this is also the only example I have ever seen of explicitly passing -- a "locally named" interface implementation rowsByUnique : (is : Vect groups (Fin cols)) -> -- indices of group-by columns (js : Vect aggs (Fin cols)) -> -- indices of aggregated-over columns {auto prf: Eq (HVect (vslice is as))} -> Tafra rows as -> Map (HVect (vslice is as)) (List (HVect (vslice js as))) rowsByUnique {prf} is js xs = accumRows @{prf} [] (zip (transpose (slice is xs)) (transpose (slice js xs))) -- ok, let's type group by groupBy : {cols : Nat} -> {rows : Nat} -> {as : Vect cols Type} -> {groups : Nat} -> {aggs : Nat} -> -- indices of group-by columns (is : Vect groups (Fin cols)) -> -- indices of aggregated-over columns (js : Vect aggs (Fin cols)) -> -- aggregation function (aggFn : List (HVect (vslice js as)) -> HVect bs) -> -- a proof that group-by columns can be compared for equality -- (automatically found by compiler) {auto prf: Eq (HVect (vslice is as))} -> -- data frame to group-by Tafra rows as -> -- result type: an existentially quantified tafra of some -- row count, with columns from the group-by columns and -- the result of the aggregation function (k : Nat ** Tafra k (vslice is as ++ bs)) groupBy is js fn xs = applyAggregation fn (rowsByUnique is js xs) where applyAggregation : (List (HVect gs) -> HVect rs) -> Map (HVect as) (List (HVect gs)) -> (k : Nat ** Tafra k (as ++ rs)) applyAggregation f [] = (Z ** empty) applyAggregation f ((k, rs) :: rest) with (applyAggregation f rest) applyAggregation f ((k, rs) :: _) | (n ** restAgg) = (S n ** rcons (k ++ f rs) restAgg) -- select -- col2, -- join('\n', str(col1) & join(' ', col3)) -- from someData -- group by col2 exampleGroupBy : (k : Nat ** Tafra k [Bool, String]) exampleGroupBy = groupBy [FS FZ] [FZ, FS (FS FZ)] ((::[]) . unlines . map (\[num, strs] => unwords (show num :: strs))) someData exampleMapped : Tafra (S (S (S Z))) [Double, Bool, String] exampleMapped = rmap [(/ 3.0), not, unwords] someData
 module TafraParser import Tafra %access public export charDigit : Char -> Maybe Int charDigit x = if isDigit x then Just (ord x - ord '0') else Nothing interface ParseMaybe a where parseMaybe : String -> Maybe a ParseMaybe String where parseMaybe = Just ParseMaybe Bool where parseMaybe "True" = Just True parseMaybe "False" = Just False parseMaybe _ = Nothing ParseMaybe Int where parseMaybe str with (strM str) parseMaybe "" | StrNil = Nothing parseMaybe (strCons x xs) | (StrCons x xs) = do val <- charDigit x go val xs where go val str with (strM str) go val "" | StrNil = Just val go val (strCons y ys) | (StrCons y ys) = do next <- charDigit y go (val * 10 + next) ys ParseMaybe Integer where parseMaybe str with (strM str) parseMaybe "" | StrNil = Nothing parseMaybe (strCons x xs) | (StrCons x xs) = do val <- cast <\$> charDigit x go val xs where go : Integer -> String -> Maybe Integer go val str with (strM str) go val "" | StrNil = Just val go val (strCons y ys) | (StrCons y ys) = do next <- cast <\$> charDigit y go (val * 10 + next) ys ParseMaybe Double where parseMaybe str with (strM str) parseMaybe "" | StrNil = Nothing parseMaybe (strCons x xs) | (StrCons x xs) = do val <- cast <\$> charDigit x go False val 0 xs where go : Bool -> Integer -> Integer -> String -> Maybe Double go afterDecimal whole frac str with (strM str) -- this is a little lazy... go afterDecimal whole frac "" | StrNil = Just (cast (show whole ++ "." ++ show frac)) go afterDecimal whole frac (strCons y ys) | (StrCons y ys) = if y == '.' then go True whole frac ys else do next <- cast <\$> charDigit y if afterDecimal then go afterDecimal whole (frac * 10 + next) ys else go afterDecimal (whole * 10 + next) frac ys ParseMaybe (HVect []) where parseMaybe "" = Just [] parseMaybe _ = Nothing (ParseMaybe a, ParseMaybe (HVect as)) => ParseMaybe (HVect (a::as)) where parseMaybe xs with (split (== ',') xs) parseMaybe xs | [] = Nothing parseMaybe xs | (y :: ys) = do first <- parseMaybe (trim y) rest <- parseMaybe (concat (intersperse "," ys)) pure (first::rest) listToVect : List a -> (n : Nat ** Vect n a) listToVect [] = (Z ** []) listToVect (x :: xs) with (listToVect xs) listToVect (x :: xs) | (n ** v) = (S n ** x :: v) -- skip header and parse each row parseCSV' : ParseMaybe (HVect as) => List String -> Maybe (n : Nat ** Tafra n as) parseCSV' @{c} csvLines = case nonEmpty csvLines of (Yes prf) => do (n ** rows) <- listToVect <\$> traverse (parseMaybe @{c}) (tail csvLines) pure (n ** untranspose rows) (No _) => Nothing parseCSV : ParseMaybe (HVect as) => String -> Maybe (n : Nat ** Tafra n as) parseCSV = parseCSV' . lines exampleCSV : String exampleCSV = "age,name\n123,bobby\n456,sally\n" exampleFrame : Maybe (n : Nat ** Tafra n [Int, String]) exampleFrame = parseCSV exampleCSV
to join this conversation on GitHub. Already have an account? Sign in to comment