public
Created

  • Download Gist
txt-processing.hs
Haskell
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
import System.Directory ( getDirectoryContents, doesFileExist, doesDirectoryExist )
import System.Environment ( getArgs )
import System.FilePath
import Control.Monad ( forM_, filterM )
import Data.Char ( isAscii, isAlphaNum, toLower )
import qualified Data.Map as Map
 
-- | List the immediate children of a directory, split into files and
-- sub-directories.
--
-- Every returned path is prefixed with @path@. The special entries @.@ and
-- @..@ are skipped. An entry for which neither 'doesFileExist' nor
-- 'doesDirectoryExist' holds ends up in neither list.
getDirectoryEntries :: FilePath -> IO ([FilePath], [FilePath])
getDirectoryEntries path = do
  names <- getDirectoryContents path
  let candidates = map (path </>) (filter isRealEntry names)
  regularFiles <- filterM doesFileExist candidates
  subDirs <- filterM doesDirectoryExist candidates
  return (regularFiles, subDirs)
  where
    -- Drop the self and parent pseudo-entries so callers never loop on them.
    isRealEntry name = name /= "." && name /= ".."
 
-- | Recursively collect every file found under a directory.
--
-- Files from nested directories come first (in the order the
-- sub-directories were listed), followed by the files that sit directly in
-- @dirPath@.
--
-- NOTE(review): there is no cycle detection here; a directory entry that
-- leads back up the tree would presumably recurse without end -- confirm
-- whether that can occur for the intended inputs.
getAllFiles :: FilePath -> IO [FilePath]
getAllFiles dirPath = do
  (here, subDirs) <- getDirectoryEntries dirPath
  nested <- mapM getAllFiles subDirs
  return (concat nested ++ here)
 
-- | Split text into lowercase tokens made of ASCII letters and digits.
--
-- Any ASCII whitespace character (space, tab, newline, carriage return,
-- form feed, vertical tab) separates tokens. Every other character that is
-- not an ASCII letter or digit is removed /without/ splitting, so
-- @"it's"@ becomes @"its"@ and @"foo,bar"@ becomes @"foobar"@.
--
-- >>> tokenize "Hello\tWorld"
-- ["hello","world"]
tokenize :: String -> [String]
tokenize content = words $ map toLower $ filter keep content
  where
    -- Whitespace must survive the filter; otherwise words on either side of
    -- a tab (or a lone carriage return) would be glued into one token.
    isSeparator c = c `elem` " \t\n\r\f\v"
    keep c = (isAscii c && isAlphaNum c) || isSeparator c
 
-- | Build a positional index of the tokens in a piece of text.
--
-- The text is split with 'tokenize'; tokens are numbered from 0 in the
-- order they appear. Each key maps to the list of positions at which that
-- token occurs. A newly seen position is prepended to the existing list, so
-- the positions for a token are ordered from last occurrence to first.
processContent :: String -> Map.Map String [Int]
processContent content =
  Map.fromListWith (++) [(token, [position]) | (position, token) <- numbered]
  where
    numbered = zip [0 ..] (tokenize content)
 
-- | Program entry point.
--
-- Takes the directory to scan as the first command-line argument (further
-- arguments are ignored), collects every file beneath it with
-- 'getAllFiles', and prints one line per file reporting how many distinct
-- tokens 'processContent' found in it.
--
-- Raises a user error with an explanatory message when no argument is
-- supplied, instead of an anonymous pattern-match failure.
main :: IO ()
main = do
  args <- getArgs
  case args of
    [] -> ioError (userError "missing argument: expected the path of a directory to index")
    (path : _) -> do
      files <- getAllFiles path
      forM_ files $ \filePath -> do
        content <- readFile filePath
        let ocList = processContent content
        -- Map.size needs every token, so the file is read to the end before
        -- the line is printed.
        putStrLn $ "File: " ++ filePath ++ " | Words to be indexed: " ++ show (Map.size ocList)

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.