Last active
August 29, 2015 14:09
-
-
Save robinp/de550692c513f43f8d51 to your computer and use it in GitHub Desktop.
Throw-away script for text stats for rows where the rightmost column contains Hungarian text.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE BangPatterns, OverloadedStrings #-} | |
import qualified Data.ByteString.Lazy as BL | |
import Data.Csv | |
import Data.Char (ord) | |
import Data.Monoid | |
import Data.Vector (Vector) | |
import qualified Data.Vector as V | |
import Data.Foldable (foldMap) | |
import qualified Data.Text.Lazy as T | |
import qualified Data.Text.Lazy.Encoding as T | |
import qualified Data.Set as S | |
import Control.Monad.Trans.State | |
import Debug.Trace | |
data Hun = H | |
{ hun :: !Int | |
, hunWords :: !Int | |
, hunChars :: !Int | |
} deriving (Eq, Ord, Show) | |
data Summary = S | |
{ nonEmpty :: !Int | |
, hunNoDedup :: !Hun | |
, hunDedup :: !Hun | |
} deriving (Eq, Ord, Show) | |
instance Monoid Summary where | |
mempty = S 0 mempty mempty | |
(S a b c) `mappend` (S e f g) = S (a+e) (b <> f) (c <> g) | |
instance Monoid Hun where | |
mempty = H 0 0 0 | |
(H a b c) `mappend` (H e f g) = H (a+e) (b+f) (c+g) | |
type M = State (S.Set T.Text) | |
toStats :: Summary -> Vector BL.ByteString -> M Summary | |
toStats s fs0 = do | |
let fs = V.filter (not . BL.null) fs0 | |
case V.toList fs of | |
[_, e] -> do | |
newS <- entryToStat e | |
return $ s `mappend` newS | |
_ -> return s | |
entryToStat :: BL.ByteString -> M Summary | |
entryToStat bs = case T.decodeUtf8' bs of | |
Left _ -> trace (show bs) . return $ mempty | |
Right t -> do | |
let is_hun = T.any (`elem` "öüóőúéáűíÖÜÓŐÚÉÁŰÍ") t | |
seen <- gets (S.member t) | |
modify (S.insert t) | |
return $ if is_hun then | |
let ws = length . T.words $ t | |
cs = fromIntegral . T.length $ t | |
h = H 1 ws cs | |
in S 1 h (if seen then mempty else h) | |
else S 1 mempty mempty | |
myopts = defaultDecodeOptions | |
{ decDelimiter = fromIntegral . ord $ '\t' } | |
main = do | |
input <- BL.readFile "utf8.txt" | |
let d = decodeWith myopts NoHeader input | |
:: Either String (Vector (Vector BL.ByteString)) | |
case d of | |
Left err -> print err | |
Right vs -> do | |
print . flip evalState S.empty . V.foldM (toStats) mempty $ vs | |
BL.writeFile "cleaned.txt" . BL.intercalate "\r\n" . V.toList . V.map cleanup $ vs | |
cleanup :: Vector BL.ByteString -> BL.ByteString | |
cleanup fs0 = | |
let fs = V.filter (not . BL.null) fs0 | |
in case V.toList fs of | |
[a, b] -> a <> ";" <> b | |
[a] -> a <> ";" | |
_ -> "" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment