Skip to content

@luisgabriel /txt-processing.hs
Created

Embed URL

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
import System.Directory ( getDirectoryContents, doesFileExist, doesDirectoryExist )
import System.Environment ( getArgs )
import System.FilePath
import Control.Monad ( forM_, filterM )
import Data.Char ( isAscii, isAlphaNum, toLower )
import qualified Data.Map as Map
-- | List the immediate children of a directory, split by kind.
--
-- The result is a pair @(files, dirs)@. Every returned path is the given
-- directory joined with the entry name via '</>'. The special entries @.@
-- and @..@ are never reported. An entry for which neither 'doesFileExist'
-- nor 'doesDirectoryExist' answers 'True' appears in neither list.
getDirectoryEntries :: FilePath -> IO ([FilePath], [FilePath])
getDirectoryEntries path = do
  names <- getDirectoryContents path
  let children = map (path </>) (filter (not . isSpecial) names)
  -- The file check runs over all children before the directory check starts.
  (,) <$> filterM doesFileExist children <*> filterM doesDirectoryExist children
  where
    -- The self and parent links that 'getDirectoryContents' includes.
    isSpecial name = name == "." || name == ".."
-- | Recursively collect every file located beneath a directory.
--
-- Files found inside subdirectories come first (subdirectories are visited
-- in the order 'getDirectoryEntries' reports them), followed by the files
-- that sit directly in @dirPath@.
--
-- NOTE(review): there is no visible guard against directory cycles (for
-- example via symbolic links); confirm whether the inputs can contain them.
getAllFiles :: FilePath -> IO [FilePath]
getAllFiles dirPath = do
  (ownFiles, subDirs) <- getDirectoryEntries dirPath
  nested <- mapM getAllFiles subDirs
  return (concat nested ++ ownFiles)
-- | Split text into lower-cased ASCII alphanumeric tokens.
--
-- ASCII whitespace (space, tab, newline, carriage return, form feed and
-- vertical tab) separates tokens. Any other character that is not an ASCII
-- letter or digit is removed /without/ splitting, so @"don't"@ yields
-- @"dont"@ and non-ASCII letters are dropped.
--
-- >>> tokenize "Hello, World!\nfoo\tbar"
-- ["hello","world","foo","bar"]
tokenize :: String -> [String]
tokenize content = words (map toLower (filter keep content))
  where
    -- Whitespace must survive the filter so that 'words' can split on it;
    -- discarding a tab or carriage return would fuse the neighbouring words.
    keep c = (isAscii c && isAlphaNum c) || c `elem` " \t\n\r\f\v"
-- | Build a positional index of the tokens of a text.
--
-- Each distinct token produced by 'tokenize' is mapped to the zero-based
-- positions at which it occurs in the token stream. Every new position is
-- prepended to the ones already recorded, so each list runs from the last
-- occurrence to the first.
processContent :: String -> Map.Map String [Int]
processContent content =
  Map.fromListWith (++) [(word, [pos]) | (pos, word) <- zip [0 ..] (tokenize content)]
-- | Entry point: index every file beneath the directory named by the first
-- command-line argument and print, for each file, the number of distinct
-- tokens found in it.
--
-- Arguments after the first are ignored. When no argument is supplied the
-- program fails with an explicit 'userError' instead of an opaque
-- pattern-match failure.
main :: IO ()
main = do
  args <- getArgs
  case args of
    [] ->
      ioError (userError "missing argument: expected the path of a directory to index")
    (path : _) -> do
      files <- getAllFiles path
      forM_ files $ \filePath -> do
        -- NOTE(review): 'readFile' decodes text; presumably a binary file in
        -- the tree can raise a decoding error here. Confirm the expected inputs.
        content <- readFile filePath
        let ocList = processContent content
        putStrLn $ "File: " ++ filePath ++ " | Words to be indexed: " ++ show (Map.size ocList)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.