Skip to content

Instantly share code, notes, and snippets.

@imeckler
Created December 7, 2012 20:47
Show Gist options
  • Save imeckler/4236381 to your computer and use it in GitHub Desktop.
Save imeckler/4236381 to your computer and use it in GitHub Desktop.
Booth_parse
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE RecordWildCards #-}
import System.Directory (getDirectoryContents)
import qualified System.IO.Strict as S
import Data.String.Utils (split, join)
import qualified Data.HashMap.Lazy as M
import qualified Data.HashSet as H
import qualified Data.ByteString.Lazy as B
import Control.Applicative
import Data.Char
import Data.List (genericLength)
import GHC.Exts (sortWith)
import System.Environment (getArgs)
import System.IO.Unsafe (unsafePerformIO)
data Transcript =
Transcript { idNum :: String
, speaker :: String
, wordCount :: Int
, charCount :: Int
, dictWordCount :: M.HashMap String Int
, fkScore :: Float
, smogScore :: Float
, fogScore :: Float
, num :: String
, name :: String
, path :: String
, execName :: String
}
deriving (Show) --, Generic)
toCSVString :: Transcript -> String
toCSVString Transcript {..} =
join "," $ [ idNum
, show speaker
, show wordCount
, show charCount
, show fkScore
, show smogScore
, show fogScore
, num
, show name
, show execName
, show path
] ++ alphabetizedWCs
where
alphabetizedWCs = map (show . snd) . sortWith fst $ M.toList dictWordCount
dropExtension :: String -> String
dropExtension s = case dropWhile (/= '.') . reverse $ s of
"" -> s
s' -> reverse . tail $ s'
parsePath :: String -> (String, String, String, String, String)
parsePath path = (kind, idNum, num, name, execName)
where
(idNum:kind:numName:execName:xs) = (split "_" (dropExtension $ baseName path)) ++ repeat ""
(num, name) = span isDigit numName
baseName :: FilePath -> FilePath
baseName = last . split "/"
words' :: String -> [String]
words' = words . map toUpper . filter (\x -> isLetter x || isSpace x)
fog wordCnt sentenceCnt polySylCount = 0.4 * (wordCnt / sentenceCnt + 100 * (polySylCount / wordCnt))
--smog :: (Num a, Ord a, Floating b) => [a] -> b -> b
smog numSentences polySylCount = 1.043 * (sqrt $ polySylCount * (30 / numSentences)) + 3.1291
fleschKincaid wordCnt sentenceCnt sylCounts = 206.835 - 1.015 * (wordCnt / sentenceCnt) - 84.6 * (numSylls / wordCnt)
where
numSylls = fromIntegral $ sum sylCounts
sylMap :: M.HashMap String Int
sylMap = unsafePerformIO getMap
where getMap = fmap (M.fromList . read) $ S.readFile "sylmap.txt"
sylCount :: String -> Int
sylCount w = case M.lookup w sylMap of
Just x -> x
Nothing -> 2
fromFile :: [(String, H.HashSet String)] -> String -> String -> Transcript
fromFile wordSetAL path text =
Transcript { idNum
, speaker = dropExtension speaker
, wordCount
, charCount = length text
, dictWordCount
, fkScore = fleschKincaid wc numSentences sylCounts
, smogScore = smog numSentences polySylCount
, fogScore = fog wc numSentences polySylCount
, num
, name
, path
, execName
}
where
(speaker, idNum, num, name, execName) = parsePath path
dictWordCount = wordCountMap wordSetAL text
ws = words' text
wordCount = length ws
wc = fromIntegral wordCount
numSentences = genericLength $ split "." text
sylCounts = map sylCount ws
polySylCount = genericLength $ filter (> 2) sylCounts
countUniqs :: (Eq a, Data.Hashable.Hashable a, Num v) => [a] -> HashMap a v
countUniqs = foldr inc M.empty
where inc w = M.insertWith (+) w 1
wordCountMap :: [(String, H.HashSet String)] -> String -> M.HashMap String Int
wordCountMap wordSetAL text = M.fromList $ map countOccurences wordSetAL
where
countOccurences (k, s) = (baseName k, sum . map snd . filter (flip H.member s . fst) $ counts)
counts = M.toList . countUniqs . words' $ text
listDirectory :: FilePath -> IO [FilePath]
listDirectory d = fmap (map (d ++) . filter ((/= '.') . head)) $ getDirectoryContents d
getWordSetAL :: String -> IO [(String, H.HashSet String)]
getWordSetAL wordlistDirectory = do
paths <- listDirectory wordlistDirectory
let getWordSet = fmap (H.fromList . words . map toUpper) . S.readFile
zip paths <$> (sequence $ map getWordSet paths)
main = do
fileDirectory <- fmap (!! 0) getArgs
wordlistDirectory <- fmap (!! 1) getArgs
filePaths <- listDirectory fileDirectory
wordSetAL <- getWordSetAL wordlistDirectory
texts <- sequence $ map S.readFile filePaths
let res = zipWith (fromFile wordSetAL) filePaths texts
putStrLn "idNum,speaker,wordCount,charCount,fkScore,smogScore,fogScore,num,name,execName,path,GI_NEGWORDS_LEONE_revised.txt,GI_POSWORDS_LEONE_revised.txt,LM_litigious.txt,LM_negative.txt,LM_positive.txt,LM_uncertainty.txt,diction_blame.txt,diction_denial.txt,diction_hardship_revised.txt,diction_inspiration.txt,diction_negwords_revised.txt,diction_numericterms.txt,diction_poswords_revised.txt,diction_praise.txt,diction_satisfaction.txt,eh_negwords.txt,eh_poswords.txt,gi_fail.txt,gi_if.txt,gi_ovrst.txt,gi_undrst.txt"
putStrLn . join "\n" $ map toCSVString res
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment