Skip to content

Instantly share code, notes, and snippets.

@luisgabriel
Created April 27, 2013 20:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save luisgabriel/5474695 to your computer and use it in GitHub Desktop.
Save luisgabriel/5474695 to your computer and use it in GitHub Desktop.
import System.Directory ( getDirectoryContents, doesFileExist, doesDirectoryExist )
import System.Environment ( getArgs )
import System.FilePath
import Control.Monad ( forM_, filterM )
import Data.Char ( isAscii, isAlphaNum, toLower )
import qualified Data.Map as Map
-- | List the immediate children of a directory, split into files and
-- sub-directories.
--
-- Every returned path is prefixed with @path@. The special entries @.@ and
-- @..@ are skipped. The first component holds the entries accepted by
-- 'doesFileExist', the second those accepted by 'doesDirectoryExist'; an
-- entry accepted by neither test is left out of both lists.
getDirectoryEntries :: FilePath -> IO ([FilePath], [FilePath])
getDirectoryEntries path = do
  names <- getDirectoryContents path
  let candidates = map (path </>) (filter isRealEntry names)
  regularFiles <- filterM doesFileExist candidates
  subDirs <- filterM doesDirectoryExist candidates
  return (regularFiles, subDirs)
  where
    -- The self and parent links must not be walked.
    isRealEntry name = name /= "." && name /= ".."
-- | Recursively collect every file found beneath a directory.
--
-- Files discovered in sub-directories come first (in the order the
-- sub-directories were listed), followed by the files that sit directly in
-- @dirPath@.
getAllFiles :: FilePath -> IO [FilePath]
getAllFiles dirPath = do
  (ownFiles, subDirs) <- getDirectoryEntries dirPath
  nestedFiles <- mapM getAllFiles subDirs
  return (concat nestedFiles ++ ownFiles)
-- | Split text into lower-cased ASCII alphanumeric words.
--
-- ASCII letters and digits are kept (lower-cased). Any ASCII whitespace
-- character (space, tab, newline, carriage return, form feed, vertical tab)
-- acts as a word separator. Every other character (punctuation, non-ASCII
-- characters) is removed without splitting the word around it, so
-- @"don't"@ yields @"dont"@.
tokenize :: String -> [String]
tokenize content = words (concatMap normalize content)
  where
    -- Each character maps to zero or one output characters.
    normalize c
      | isAscii c && isAlphaNum c = [toLower c]
      -- Whitespace must become a separator rather than be dropped,
      -- otherwise the words on either side of a tab or CR get fused.
      | c `elem` separators = " "
      | otherwise = ""
    separators = " \t\n\r\f\v"
-- | Build a positional index of the words in a piece of text.
--
-- The text is split with 'tokenize' and the resulting words are numbered
-- from 0. Each word maps to the list of positions at which it occurs; the
-- positions of a word are listed most-recent-first (descending), because
-- every new occurrence is prepended to the ones already recorded.
processContent :: String -> Map.Map String [Int]
processContent content =
  -- 'Map.fromListWith' combines duplicates as @f new old@, which yields the
  -- same descending position lists as inserting word by word with
  -- 'Map.insertWith', without hand-rolled recursion or lazy accumulators.
  Map.fromListWith (++) [(word, [pos]) | (word, pos) <- zip (tokenize content) [0 ..]]
-- | Entry point: index every file found beneath the directory given as the
-- first command-line argument.
--
-- For each file, prints one line with its path and the number of distinct
-- words found by 'processContent'. Extra arguments are ignored. When no
-- argument is given, the program fails with a descriptive 'userError'
-- instead of an opaque pattern-match failure.
main :: IO ()
main = do
  args <- getArgs
  case args of
    [] -> ioError (userError "missing argument: expected the path of the directory to index")
    (path : _) -> do
      files <- getAllFiles path
      forM_ files $ \filePath -> do
        content <- readFile filePath
        let ocList = processContent content
        putStrLn $ "File: " ++ filePath ++ " | Words to be indexed: " ++ show (Map.size ocList)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment