Created
December 7, 2012 20:47
-
-
Save imeckler/4236381 to your computer and use it in GitHub Desktop.
Booth_parse
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE DeriveGeneric #-} | |
{-# LANGUAGE NamedFieldPuns #-} | |
{-# LANGUAGE RecordWildCards #-} | |
import System.Directory (getDirectoryContents) | |
import qualified System.IO.Strict as S | |
import Data.String.Utils (split, join) | |
import qualified Data.HashMap.Lazy as M | |
import qualified Data.HashSet as H | |
import qualified Data.ByteString.Lazy as B | |
import Control.Applicative | |
import Data.Char | |
import Data.List (genericLength) | |
import GHC.Exts (sortWith) | |
import System.Environment (getArgs) | |
import System.IO.Unsafe (unsafePerformIO) | |
data Transcript = | |
Transcript { idNum :: String | |
, speaker :: String | |
, wordCount :: Int | |
, charCount :: Int | |
, dictWordCount :: M.HashMap String Int | |
, fkScore :: Float | |
, smogScore :: Float | |
, fogScore :: Float | |
, num :: String | |
, name :: String | |
, path :: String | |
, execName :: String | |
} | |
deriving (Show) --, Generic) | |
toCSVString :: Transcript -> String | |
toCSVString Transcript {..} = | |
join "," $ [ idNum | |
, show speaker | |
, show wordCount | |
, show charCount | |
, show fkScore | |
, show smogScore | |
, show fogScore | |
, num | |
, show name | |
, show execName | |
, show path | |
] ++ alphabetizedWCs | |
where | |
alphabetizedWCs = map (show . snd) . sortWith fst $ M.toList dictWordCount | |
dropExtension :: String -> String | |
dropExtension s = case dropWhile (/= '.') . reverse $ s of | |
"" -> s | |
s' -> reverse . tail $ s' | |
parsePath :: String -> (String, String, String, String, String) | |
parsePath path = (kind, idNum, num, name, execName) | |
where | |
(idNum:kind:numName:execName:xs) = (split "_" (dropExtension $ baseName path)) ++ repeat "" | |
(num, name) = span isDigit numName | |
baseName :: FilePath -> FilePath | |
baseName = last . split "/" | |
words' :: String -> [String] | |
words' = words . map toUpper . filter (\x -> isLetter x || isSpace x) | |
fog wordCnt sentenceCnt polySylCount = 0.4 * (wordCnt / sentenceCnt + 100 * (polySylCount / wordCnt)) | |
--smog :: (Num a, Ord a, Floating b) => [a] -> b -> b | |
smog numSentences polySylCount = 1.043 * (sqrt $ polySylCount * (30 / numSentences)) + 3.1291 | |
fleschKincaid wordCnt sentenceCnt sylCounts = 206.835 - 1.015 * (wordCnt / sentenceCnt) - 84.6 * (numSylls / wordCnt) | |
where | |
numSylls = fromIntegral $ sum sylCounts | |
sylMap :: M.HashMap String Int | |
sylMap = unsafePerformIO getMap | |
where getMap = fmap (M.fromList . read) $ S.readFile "sylmap.txt" | |
sylCount :: String -> Int | |
sylCount w = case M.lookup w sylMap of | |
Just x -> x | |
Nothing -> 2 | |
fromFile :: [(String, H.HashSet String)] -> String -> String -> Transcript | |
fromFile wordSetAL path text = | |
Transcript { idNum | |
, speaker = dropExtension speaker | |
, wordCount | |
, charCount = length text | |
, dictWordCount | |
, fkScore = fleschKincaid wc numSentences sylCounts | |
, smogScore = smog numSentences polySylCount | |
, fogScore = fog wc numSentences polySylCount | |
, num | |
, name | |
, path | |
, execName | |
} | |
where | |
(speaker, idNum, num, name, execName) = parsePath path | |
dictWordCount = wordCountMap wordSetAL text | |
ws = words' text | |
wordCount = length ws | |
wc = fromIntegral wordCount | |
numSentences = genericLength $ split "." text | |
sylCounts = map sylCount ws | |
polySylCount = genericLength $ filter (> 2) sylCounts | |
countUniqs :: (Eq a, Data.Hashable.Hashable a, Num v) => [a] -> HashMap a v | |
countUniqs = foldr inc M.empty | |
where inc w = M.insertWith (+) w 1 | |
wordCountMap :: [(String, H.HashSet String)] -> String -> M.HashMap String Int | |
wordCountMap wordSetAL text = M.fromList $ map countOccurences wordSetAL | |
where | |
countOccurences (k, s) = (baseName k, sum . map snd . filter (flip H.member s . fst) $ counts) | |
counts = M.toList . countUniqs . words' $ text | |
listDirectory :: FilePath -> IO [FilePath] | |
listDirectory d = fmap (map (d ++) . filter ((/= '.') . head)) $ getDirectoryContents d | |
getWordSetAL :: String -> IO [(String, H.HashSet String)] | |
getWordSetAL wordlistDirectory = do | |
paths <- listDirectory wordlistDirectory | |
let getWordSet = fmap (H.fromList . words . map toUpper) . S.readFile | |
zip paths <$> (sequence $ map getWordSet paths) | |
main = do | |
fileDirectory <- fmap (!! 0) getArgs | |
wordlistDirectory <- fmap (!! 1) getArgs | |
filePaths <- listDirectory fileDirectory | |
wordSetAL <- getWordSetAL wordlistDirectory | |
texts <- sequence $ map S.readFile filePaths | |
let res = zipWith (fromFile wordSetAL) filePaths texts | |
putStrLn "idNum,speaker,wordCount,charCount,fkScore,smogScore,fogScore,num,name,execName,path,GI_NEGWORDS_LEONE_revised.txt,GI_POSWORDS_LEONE_revised.txt,LM_litigious.txt,LM_negative.txt,LM_positive.txt,LM_uncertainty.txt,diction_blame.txt,diction_denial.txt,diction_hardship_revised.txt,diction_inspiration.txt,diction_negwords_revised.txt,diction_numericterms.txt,diction_poswords_revised.txt,diction_praise.txt,diction_satisfaction.txt,eh_negwords.txt,eh_poswords.txt,gi_fail.txt,gi_if.txt,gi_ovrst.txt,gi_undrst.txt" | |
putStrLn . join "\n" $ map toCSVString res |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment