Skip to content

Instantly share code, notes, and snippets.

@robinp
Last active August 29, 2015 14:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save robinp/de550692c513f43f8d51 to your computer and use it in GitHub Desktop.
Save robinp/de550692c513f43f8d51 to your computer and use it in GitHub Desktop.
Throw-away script for text stats for rows where the rightmost column contains Hungarian text.
{-# LANGUAGE BangPatterns, OverloadedStrings #-}
import qualified Data.ByteString.Lazy as BL
import Data.Csv
import Data.Char (ord)
import Data.Monoid
import Data.Vector (Vector)
import qualified Data.Vector as V
import Data.Foldable (foldMap)
import qualified Data.Text.Lazy as T
import qualified Data.Text.Lazy.Encoding as T
import qualified Data.Set as S
import Control.Monad.Trans.State
import Debug.Trace
data Hun = H
{ hun :: !Int
, hunWords :: !Int
, hunChars :: !Int
} deriving (Eq, Ord, Show)
data Summary = S
{ nonEmpty :: !Int
, hunNoDedup :: !Hun
, hunDedup :: !Hun
} deriving (Eq, Ord, Show)
instance Monoid Summary where
mempty = S 0 mempty mempty
(S a b c) `mappend` (S e f g) = S (a+e) (b <> f) (c <> g)
instance Monoid Hun where
mempty = H 0 0 0
(H a b c) `mappend` (H e f g) = H (a+e) (b+f) (c+g)
type M = State (S.Set T.Text)
toStats :: Summary -> Vector BL.ByteString -> M Summary
toStats s fs0 = do
let fs = V.filter (not . BL.null) fs0
case V.toList fs of
[_, e] -> do
newS <- entryToStat e
return $ s `mappend` newS
_ -> return s
entryToStat :: BL.ByteString -> M Summary
entryToStat bs = case T.decodeUtf8' bs of
Left _ -> trace (show bs) . return $ mempty
Right t -> do
let is_hun = T.any (`elem` "öüóőúéáűíÖÜÓŐÚÉÁŰÍ") t
seen <- gets (S.member t)
modify (S.insert t)
return $ if is_hun then
let ws = length . T.words $ t
cs = fromIntegral . T.length $ t
h = H 1 ws cs
in S 1 h (if seen then mempty else h)
else S 1 mempty mempty
myopts = defaultDecodeOptions
{ decDelimiter = fromIntegral . ord $ '\t' }
main = do
input <- BL.readFile "utf8.txt"
let d = decodeWith myopts NoHeader input
:: Either String (Vector (Vector BL.ByteString))
case d of
Left err -> print err
Right vs -> do
print . flip evalState S.empty . V.foldM (toStats) mempty $ vs
BL.writeFile "cleaned.txt" . BL.intercalate "\r\n" . V.toList . V.map cleanup $ vs
cleanup :: Vector BL.ByteString -> BL.ByteString
cleanup fs0 =
let fs = V.filter (not . BL.null) fs0
in case V.toList fs of
[a, b] -> a <> ";" <> b
[a] -> a <> ";"
_ -> ""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment