gelisam/Main.hs

## Main.hs
-- in response to https://www.reddit.com/r/haskell/comments/a50xpr/datahaskell_solve_this_small_problem_to_fill_some/
{-# LANGUAGE BangPatterns, OverloadedStrings, RecordWildCards, ScopedTypeVariables #-}
module Main where

import Control.Category ((>>>))
import Data.Function ((&))
import Data.Map.Strict (Map, (!))
import Data.Set (Set)
import Test.DocTest (doctest)

import qualified Data.List as List
import qualified Data.Map.Strict as Map
import qualified Data.Set as Set
import qualified Data.Text.Lazy as Lazy
import qualified Data.Text as Strict


-- This is the input given. I have hardcoded it into the file for completeness,
-- but you could also obtain it from a file using 'Data.Text.Lazy.IO.readFile'.
--
-- This looks like csv, so my first instict is to use the cassava library.
-- Unfortunately, due to the extra whitespace, this isn't _quite_ valid csv, so
-- we would need to do some pre-processing in order to transform the input into
-- a format which cassava can parse, and also cassava outputs a vector, which is
-- completely loaded into memory, not streamed. For those two reasons, I will
-- parse that format myself; it's just a simple comma-delimited format, after
-- all.
input :: Lazy.Text
input = Lazy.unlines
      [ "item                , price"
      , "computer            , 1000"
      , "car                 , 5000"
      , "legal fees (1 hour) , 400"
      , ""
      , "date , person , item-bought         , units-bought"
      , "7    , bob    , car                 , 1"
      , "5    , alice  , car                 , 1"
      , "4    , bob    , legal fees (1 hour) , 20"
      , "3    , alice  , computer            , 2"
      , "1    , bob    , computer            , 1"
      ]


-- My goal is to parse the above tables into the following more precise types.
--
-- Throughout this file, I will be careful to distinguish between 'Lazy.Text'
-- and 'Strict.Text'. I want the input to be lazy, so I can stream it, but once
-- an individual item name such as "computer" has been loaded, it is not useful
-- to be streaming the individual letters of the word "computer", it is better
-- to load the entire string into memory.

data PricingEntry = PricingEntry
  { item  :: Strict.Text
  , price :: Int
  }
  deriving Show

data Purchase = Purchase
  { date        :: Int
  , person      :: Strict.Text
  , itemBought  :: Strict.Text
  , unitsBought :: Int
  }
  deriving Show


-- |
-- The input contains two tables, which we will need to parse separately, so the
-- first step is to write a function which can split such an input into two
-- parts. Doing so will load the entirety of the first table into memory, which
-- is fine; it's the second table we want to stream.
--
-- >>> breakAtBlankLine "hello\nworld\n\nfoo\nbar\nbaz\n\nquux\n"
-- ("hello\nworld\n","foo\nbar\nbaz\n\nquux\n")
--
-- Note that I am being sloppy with error handling in this file, as I don't
-- specify what should happen if the input doesn't have the format we expect. In
-- this case, I get a pattern-matching exception if there is no blank line.
--
-- >>> breakAtBlankLine "hello"
-- ...Irrefutable pattern failed for pattern (before, "" : after)
-- ...
breakAtBlankLine :: Lazy.Text -> (Lazy.Text, Lazy.Text)
breakAtBlankLine text = (Lazy.unlines before, Lazy.unlines after)
  where
    -- ["hello","world","","foo","bar","baz","","quux"]
    allLines :: [Lazy.Text]
    allLines = Lazy.lines text

    -- ["hello","world"]
    -- ["foo","bar","baz","","quux"]
    before, after :: [Lazy.Text]
    (before, "":after) = List.break (== "") allLines

-- |
-- Each table cell is padded with extra whitespace, which we'll want to remove.
-- So the next step is to write a function which can trim the whitespace from
-- both sides.
--
-- My implementation compose two smaller functions, which respectively strip the
-- whitespace from the front and the end of the string. While in Haskell it is
-- more typical to compose those functions using '(.)' from right to left, I
-- will use '(>>>)' to compose them from left to right so the transformation
-- flows from top to bottom. In between each step, I will include a comment
-- demonstrating what the example input looks like in between those steps.
--
-- >>> trim "   foo bar  "
-- "foo bar"
trim :: Strict.Text -> Strict.Text
trim = Strict.dropWhile    (== ' ')
    -- "foo bar  "
   >>> Strict.dropWhileEnd (== ' ')

-- |
-- Next, let's parse an individual table.
--
-- >>> parseTable "item     , price\ncomputer , 1000\n" :: [PricingEntry]
-- [PricingEntry {item = "computer", price = 1000}]
parseTable :: forall a. FromRow a
           => Lazy.Text -> [a]
parseTable = Lazy.lines
          -- ["item     , price","computer , 1000"]
         >>> fmap (fmap trim . Strict.split (== ',') . Lazy.toStrict)
          -- [["item","price"],["computer","1000"]]
         >>> go
  where
    go :: [[Strict.Text]] -> [a]
    go (header:data_) = data_
                     -- [["computer","1000"]]
                      & fmap (zip header)
                     -- [[("item","computer"),("price","1000")]]
                      & fmap Map.fromList
                     -- [Map.fromList [("item","computer"),("price","1000")]]
                      & fmap parseRow
    go [] = error "no header"


-- The last step, 'parseRow', somehow converted a @Map Text Text@ into a
-- 'PricingEntry'. I of course need to specify how to do that for both
-- 'PricingEntry' and 'Purchase'.
--
-- I have written the 'Purchase' instance using the 'Applicative' style, and the
-- 'PricingEntry' instance more verbosely, without that style, in order to
-- demonstrate what that style desugars into.

-- |
-- >>> parseRow (Map.fromList [("item","computer"),("price","1000")]) :: PricingEntry
-- PricingEntry {item = "computer", price = 1000}
class FromRow a where
  parseRow :: Map Strict.Text Strict.Text -> a

instance FromRow PricingEntry where
  parseRow map_ = PricingEntry (cell "item"  map_)
                               (cell "price" map_)

instance FromRow Purchase where
  parseRow = Purchase <$> cell "date"
                      <*> cell "person"
                      <*> cell "item-bought"
                      <*> cell "units-bought"

-- |
-- >>> cell "price" (Map.fromList [("item","computer"),("price","1000")]) :: Int
-- 1000
cell :: FromCell a
     => Strict.Text -> Map Strict.Text Strict.Text -> a
cell k = (! k)
      -- "1000"
     >>> parseCell


-- Different calls to 'cell' return fields of different types, so I again need
-- to specify how to parse each type.

-- |
-- >>> parseCell "1000" :: Int
-- 1000
class FromCell a where
  parseCell :: Strict.Text -> a

instance FromCell Strict.Text where
  parseCell = id

instance FromCell Int where
  parseCell = read . Strict.unpack


-- |
-- All right, we have now parsed our two tables into two lists. A list of
-- 'Purchase's is good because we can stream lists, but for 'PricingEntry', a
-- 'Map' from each item to its price would be more convenient.
--
-- >>> toPriceMap [PricingEntry "computer" 1000]
-- fromList [("computer",1000)]
toPriceMap :: [PricingEntry] -> Map Strict.Text Int
toPriceMap = fmap (\(PricingEntry {..}) -> (item, price))
          -- [("computer",1000)]
         >>> Map.fromList


-- The question we want to answer is "Averaged across persons, excluding legal
-- fees, how much money had each person spent by time 6?", so let's define a
-- datatype which accumulates all the data we need in order to answer this.
--
-- I am using bang patterns because I want to make sure that as I go through the
-- 'Purchase's and update the 'Summary', I really update all of the fields.
-- Otherwise, it's easy for those fields to accidentally hold a larger and
-- larger thunk instead of holding e.g. a simple Int, thereby negating our
-- efforts to keep a constant memory while we stream the data.

data Summary = Summary
  { people     :: !(Set Strict.Text)
  , moneySpent :: !Int
  }
  deriving Show

initialSummary :: Summary
initialSummary = Summary mempty 0

averageSpending :: Summary -> Double
averageSpending (Summary {..}) = fromIntegral moneySpent
                               / max 1 (fromIntegral (length people))


-- |
-- All right, we can now write a computation which updates our 'Summary' after
-- looking at a single 'Purchase':
--
-- >>> :{
-- updateSummary (Map.fromList [("computer",1000)])
--               initialSummary
--               (Purchase 3 "alice" "computer" 2)
-- :}
-- Summary {people = fromList ["alice"], moneySpent = 2000}
updateSummary :: Map Strict.Text Int -> Summary -> Purchase -> Summary
updateSummary priceMap (Summary {..}) (Purchase {..})
  | date < 6 && itemBought /= "legal fees (1 hour)"
  = Summary
      { people     = Set.insert person people
      , moneySpent = moneySpent + (priceMap ! itemBought) * unitsBought
      }
updateSummary _ summary _ = summary


-- |
-- Tying everything all together, let's parse the two tables and stream our
-- partial answers as we get more data.
--
-- First with a small input:
--
-- >>> :{
-- streamAnswers ( Lazy.unlines
--               [ "item                , price"
--               , "computer            , 1000"
--               , ""
--               , "date , person , item-bought         , units-bought"
--               , "3    , alice  , computer            , 2"
--               ])
-- :}
-- [0.0,2000.0]
--
-- Then with the sample data:
--
-- >>> streamAnswers input
-- [0.0,0.0,5000.0,5000.0,7000.0,4000.0]
--
-- And finally, to prove that we really streaming and aren't reading the entire
-- second table into memory, with an input which throws an exception after we
-- touch the last line:
--
-- >>> streamAnswers (input <> error "touched the last line")
-- [0.0,0.0,5000.0,5000.0,7000.0,4000.0*** Exception: touched the last line
-- ...
streamAnswers :: Lazy.Text -> [Double]
streamAnswers text = purchases
                  -- [Purchase 3 "alice" "computer" 2]
                   & List.scanl' (updateSummary priceMap) initialSummary
                  -- [initialSummary, Summary (Set.fromList ["alice"]) 2000]
                   & fmap averageSpending
  where
    firstPart, secondPart :: Lazy.Text
    (firstPart, secondPart) = breakAtBlankLine text

    priceMap :: Map Strict.Text Int
    priceMap = toPriceMap (parseTable firstPart)

    purchases :: [Purchase]
    purchases = parseTable secondPart

-- |
-- Finally, if we only care about the final answer, we don't have to print the
-- intermediate results, we can just traverse the entirety of the second table
-- and then only print the final result.
--
-- >>> finalAnswer input
-- 4000.0
finalAnswer :: Lazy.Text -> Double
finalAnswer = last . streamAnswers


-- This last part runs all the ">>>" tests I wrote throughout this file.
main :: IO ()
main = doctest ["-XOverloadedStrings", "src/Main.hs"]
	-- in response to https://www.reddit.com/r/haskell/comments/a50xpr/datahaskell_solve_this_small_problem_to_fill_some/
	{-# LANGUAGE BangPatterns, OverloadedStrings, RecordWildCards, ScopedTypeVariables #-}
	module Main where

	import Control.Category ((>>>))
	import Data.Function ((&))
	import Data.Map.Strict (Map, (!))
	import Data.Set (Set)
	import Test.DocTest (doctest)

	import qualified Data.List as List
	import qualified Data.Map.Strict as Map
	import qualified Data.Set as Set
	import qualified Data.Text.Lazy as Lazy
	import qualified Data.Text as Strict


	-- This is the input given. I have hardcoded it into the file for completeness,
	-- but you could also obtain it from a file using 'Data.Text.Lazy.IO.readFile'.
	--
	-- This looks like csv, so my first instict is to use the cassava library.
	-- Unfortunately, due to the extra whitespace, this isn't _quite_ valid csv, so
	-- we would need to do some pre-processing in order to transform the input into
	-- a format which cassava can parse, and also cassava outputs a vector, which is
	-- completely loaded into memory, not streamed. For those two reasons, I will
	-- parse that format myself; it's just a simple comma-delimited format, after
	-- all.
	input :: Lazy.Text
	input = Lazy.unlines
	[ "item , price"
	, "computer , 1000"
	, "car , 5000"
	, "legal fees (1 hour) , 400"
	, ""
	, "date , person , item-bought , units-bought"
	, "7 , bob , car , 1"
	, "5 , alice , car , 1"
	, "4 , bob , legal fees (1 hour) , 20"
	, "3 , alice , computer , 2"
	, "1 , bob , computer , 1"
	]


	-- My goal is to parse the above tables into the following more precise types.
	--
	-- Throughout this file, I will be careful to distinguish between 'Lazy.Text'
	-- and 'Strict.Text'. I want the input to be lazy, so I can stream it, but once
	-- an individual item name such as "computer" has been loaded, it is not useful
	-- to be streaming the individual letters of the word "computer", it is better
	-- to load the entire string into memory.

	data PricingEntry = PricingEntry
	{ item :: Strict.Text
	, price :: Int
	}
	deriving Show

	data Purchase = Purchase
	{ date :: Int
	, person :: Strict.Text
	, itemBought :: Strict.Text
	, unitsBought :: Int
	}
	deriving Show


	-- \|
	-- The input contains two tables, which we will need to parse separately, so the
	-- first step is to write a function which can split such an input into two
	-- parts. Doing so will load the entirety of the first table into memory, which
	-- is fine; it's the second table we want to stream.
	--
	-- >>> breakAtBlankLine "hello\nworld\n\nfoo\nbar\nbaz\n\nquux\n"
	-- ("hello\nworld\n","foo\nbar\nbaz\n\nquux\n")
	--
	-- Note that I am being sloppy with error handling in this file, as I don't
	-- specify what should happen if the input doesn't have the format we expect. In
	-- this case, I get a pattern-matching exception if there is no blank line.
	--
	-- >>> breakAtBlankLine "hello"
	-- ...Irrefutable pattern failed for pattern (before, "" : after)
	-- ...
	breakAtBlankLine :: Lazy.Text -> (Lazy.Text, Lazy.Text)
	breakAtBlankLine text = (Lazy.unlines before, Lazy.unlines after)
	where
	-- ["hello","world","","foo","bar","baz","","quux"]
	allLines :: [Lazy.Text]
	allLines = Lazy.lines text

	-- ["hello","world"]
	-- ["foo","bar","baz","","quux"]
	before, after :: [Lazy.Text]
	(before, "":after) = List.break (== "") allLines

	-- \|
	-- Each table cell is padded with extra whitespace, which we'll want to remove.
	-- So the next step is to write a function which can trim the whitespace from
	-- both sides.
	--
	-- My implementation compose two smaller functions, which respectively strip the
	-- whitespace from the front and the end of the string. While in Haskell it is
	-- more typical to compose those functions using '(.)' from right to left, I
	-- will use '(>>>)' to compose them from left to right so the transformation
	-- flows from top to bottom. In between each step, I will include a comment
	-- demonstrating what the example input looks like in between those steps.
	--
	-- >>> trim " foo bar "
	-- "foo bar"
	trim :: Strict.Text -> Strict.Text
	trim = Strict.dropWhile (== ' ')
	-- "foo bar "
	>>> Strict.dropWhileEnd (== ' ')

	-- \|
	-- Next, let's parse an individual table.
	--
	-- >>> parseTable "item , price\ncomputer , 1000\n" :: [PricingEntry]
	-- [PricingEntry {item = "computer", price = 1000}]
	parseTable :: forall a. FromRow a
	=> Lazy.Text -> [a]
	parseTable = Lazy.lines
	-- ["item , price","computer , 1000"]
	>>> fmap (fmap trim . Strict.split (== ',') . Lazy.toStrict)
	-- [["item","price"],["computer","1000"]]
	>>> go
	where
	go :: [[Strict.Text]] -> [a]
	go (header:data_) = data_
	-- [["computer","1000"]]
	& fmap (zip header)
	-- [[("item","computer"),("price","1000")]]
	& fmap Map.fromList
	-- [Map.fromList [("item","computer"),("price","1000")]]
	& fmap parseRow
	go [] = error "no header"


	-- The last step, 'parseRow', somehow converted a @Map Text Text@ into a
	-- 'PricingEntry'. I of course need to specify how to do that for both
	-- 'PricingEntry' and 'Purchase'.
	--
	-- I have written the 'Purchase' instance using the 'Applicative' style, and the
	-- 'PricingEntry' instance more verbosely, without that style, in order to
	-- demonstrate what that style desugars into.

	-- \|
	-- >>> parseRow (Map.fromList [("item","computer"),("price","1000")]) :: PricingEntry
	-- PricingEntry {item = "computer", price = 1000}
	class FromRow a where
	parseRow :: Map Strict.Text Strict.Text -> a

	instance FromRow PricingEntry where
	parseRow map_ = PricingEntry (cell "item" map_)
	(cell "price" map_)

	instance FromRow Purchase where
	parseRow = Purchase <$> cell "date"
	<*> cell "person"
	<*> cell "item-bought"
	<*> cell "units-bought"

	-- \|
	-- >>> cell "price" (Map.fromList [("item","computer"),("price","1000")]) :: Int
	-- 1000
	cell :: FromCell a
	=> Strict.Text -> Map Strict.Text Strict.Text -> a
	cell k = (! k)
	-- "1000"
	>>> parseCell


	-- Different calls to 'cell' return fields of different types, so I again need
	-- to specify how to parse each type.

	-- \|
	-- >>> parseCell "1000" :: Int
	-- 1000
	class FromCell a where
	parseCell :: Strict.Text -> a

	instance FromCell Strict.Text where
	parseCell = id

	instance FromCell Int where
	parseCell = read . Strict.unpack


	-- \|
	-- All right, we have now parsed our two tables into two lists. A list of
	-- 'Purchase's is good because we can stream lists, but for 'PricingEntry', a
	-- 'Map' from each item to its price would be more convenient.
	--
	-- >>> toPriceMap [PricingEntry "computer" 1000]
	-- fromList [("computer",1000)]
	toPriceMap :: [PricingEntry] -> Map Strict.Text Int
	toPriceMap = fmap (\(PricingEntry {..}) -> (item, price))
	-- [("computer",1000)]
	>>> Map.fromList


	-- The question we want to answer is "Averaged across persons, excluding legal
	-- fees, how much money had each person spent by time 6?", so let's define a
	-- datatype which accumulates all the data we need in order to answer this.
	--
	-- I am using bang patterns because I want to make sure that as I go through the
	-- 'Purchase's and update the 'Summary', I really update all of the fields.
	-- Otherwise, it's easy for those fields to accidentally hold a larger and
	-- larger thunk instead of holding e.g. a simple Int, thereby negating our
	-- efforts to keep a constant memory while we stream the data.

	data Summary = Summary
	{ people :: !(Set Strict.Text)
	, moneySpent :: !Int
	}
	deriving Show

	initialSummary :: Summary
	initialSummary = Summary mempty 0

	averageSpending :: Summary -> Double
	averageSpending (Summary {..}) = fromIntegral moneySpent
	/ max 1 (fromIntegral (length people))


	-- \|
	-- All right, we can now write a computation which updates our 'Summary' after
	-- looking at a single 'Purchase':
	--
	-- >>> :{
	-- updateSummary (Map.fromList [("computer",1000)])
	-- initialSummary
	-- (Purchase 3 "alice" "computer" 2)
	-- :}
	-- Summary {people = fromList ["alice"], moneySpent = 2000}
	updateSummary :: Map Strict.Text Int -> Summary -> Purchase -> Summary
	updateSummary priceMap (Summary {..}) (Purchase {..})
	\| date < 6 && itemBought /= "legal fees (1 hour)"
	= Summary
	{ people = Set.insert person people
	, moneySpent = moneySpent + (priceMap ! itemBought) * unitsBought
	}
	updateSummary _ summary _ = summary


	-- \|
	-- Tying everything all together, let's parse the two tables and stream our
	-- partial answers as we get more data.
	--
	-- First with a small input:
	--
	-- >>> :{
	-- streamAnswers ( Lazy.unlines
	-- [ "item , price"
	-- , "computer , 1000"
	-- , ""
	-- , "date , person , item-bought , units-bought"
	-- , "3 , alice , computer , 2"
	-- ])
	-- :}
	-- [0.0,2000.0]
	--
	-- Then with the sample data:
	--
	-- >>> streamAnswers input
	-- [0.0,0.0,5000.0,5000.0,7000.0,4000.0]
	--
	-- And finally, to prove that we really streaming and aren't reading the entire
	-- second table into memory, with an input which throws an exception after we
	-- touch the last line:
	--
	-- >>> streamAnswers (input <> error "touched the last line")
	-- [0.0,0.0,5000.0,5000.0,7000.0,4000.0*** Exception: touched the last line
	-- ...
	streamAnswers :: Lazy.Text -> [Double]
	streamAnswers text = purchases
	-- [Purchase 3 "alice" "computer" 2]
	& List.scanl' (updateSummary priceMap) initialSummary
	-- [initialSummary, Summary (Set.fromList ["alice"]) 2000]
	& fmap averageSpending
	where
	firstPart, secondPart :: Lazy.Text
	(firstPart, secondPart) = breakAtBlankLine text

	priceMap :: Map Strict.Text Int
	priceMap = toPriceMap (parseTable firstPart)

	purchases :: [Purchase]
	purchases = parseTable secondPart

	-- \|
	-- Finally, if we only care about the final answer, we don't have to print the
	-- intermediate results, we can just traverse the entirety of the second table
	-- and then only print the final result.
	--
	-- >>> finalAnswer input
	-- 4000.0
	finalAnswer :: Lazy.Text -> Double
	finalAnswer = last . streamAnswers


	-- This last part runs all the ">>>" tests I wrote throughout this file.
	main :: IO ()
	main = doctest ["-XOverloadedStrings", "src/Main.hs"]