jcbozonier/gist:33061

## gistfile1.ml
#light
namespace TextSearch
    // Values shared across the TextSearch namespace.
    module globals =
        // The platform's line terminator, taken from the runtime environment.
        let newLine : string = System.Environment.NewLine

    module fileProcessing =
        open System.IO
        open System.Collections.Generic
        open globals

        // Module-local binding of the platform line terminator; word_list uses it
        // when cleaning paragraph text.
        let newLine : string = System.Environment.NewLine

        // Reads the whole text file at `path` and returns its contents as one string.
        // The reader is bound with `use` so it is disposed (and the file handle
        // released) when the function exits, even if ReadToEnd throws. The previous
        // `let ... in` binding never closed the reader.
        let file(path:string) =
            use reader = new StreamReader(path)
            reader.ReadToEnd()

        // Returns the names (file name only, without the directory part) of the
        // files in directory `path` that match "*.txt".
        let filelisting(path:string) =
            (new DirectoryInfo(path)).GetFiles("*.txt")
            |> Seq.map (fun info -> info.Name)

        // Joins a directory path and a file name into a single path.
        // Path.Combine adds the directory separator only when `path` does not
        // already end with one, so a folder given without a trailing separator
        // still yields a valid file path (plain concatenation produced
        // "folderfile.txt" in that case).
        let fullpath (path:string) (filename:string) = Path.Combine(path, filename)

        // Splits `str` on every occurrence of the whole string `splitter`,
        // discarding empty pieces.
        let stringsplit (splitter:string) (str:string) =
            let separators = [| splitter |]
            str.Split(separators, System.StringSplitOptions.RemoveEmptyEntries)

        // Reads the file at `fullpath`, lower-cases its text and splits it into
        // paragraphs, where a paragraph break is two consecutive platform newlines.
        let loadfile fullpath =
            let paragraph_break = System.Environment.NewLine + System.Environment.NewLine
            let contents = file(fullpath)
            stringsplit paragraph_break (contents.ToLower())

        // Loads every file named in `file_names` from the directory `file_path`
        // and returns a map from file name to that file's paragraphs (as produced
        // by `loadfile`). An empty `file_names` list yields an empty map.
        let process_data file_path file_names =
            // Walks the list while accumulating into `loaded`. This replaces the
            // earlier null-sentinel recursion, which attempted to load a file
            // named null whenever the list was empty.
            let rec load_all (loaded:Map<string, string array>) (remaining:string list) =
                match remaining with
                | [] -> loaded
                | name :: rest ->
                    let paragraphs = loadfile(fullpath file_path name)
                    load_all (loaded.Add(name, paragraphs)) rest
            load_all Map.empty file_names

        // Builds the searchable corpus for `folder_path`: a map keyed by file name
        // whose value is that file's array of paragraphs (one entry per "*.txt" file).
        let searchable_data folder_path =
            // List the text files, then load each one into the map under its name.
            folder_path
            |> filelisting
            |> Seq.to_list
            |> process_data folder_path

        // Takes an array of (key, paragraphs) pairs, drops the keys and flattens
        // all the paragraph arrays into one array, preserving order.
        let extract_paragraphs data =
            Array.concat (Array.map (fun (_, paragraphs) -> paragraphs) data)

        // Builds the term space: every distinct word that occurs in `strings`
        // (a list of paragraphs), in order of first occurrence.
        let word_list strings =
            // Strip these characters: " , ; .  Line breaks become a space rather
            // than nothing, so the last word of one line is not glued to the
            // first word of the next.
            let clean (text:string) =
                text.Replace(newLine, " ").Replace(".", "").Replace(",", "").Replace(";", "").Replace("\"", "")

            // Split one cleaned paragraph on spaces; the empty entries left by
            // runs of spaces are filtered out in `dedupe`.
            let split_words (paragraph:string) = paragraph.Split([|' '|]) |> Array.to_list

            // Tracks the words already emitted; only the keys matter.
            let words_found = new Dictionary<string, bool>()

            // Walks the words once, keeping each non-empty word the first time it
            // is seen. The accumulator is built in reverse and flipped at the end.
            let rec dedupe (remaining:string list) (unique_words:string list) =
                match remaining with
                | [] -> List.rev unique_words
                | word :: rest ->
                    if word = "" || words_found.ContainsKey(word) then
                        dedupe rest unique_words
                    else
                        words_found.Add(word, true)
                        dedupe rest (word :: unique_words)

            let all_words = strings |> List.map clean |> List.map split_words |> List.concat
            dedupe all_words []


        // Module-level bindings: load every text file under the sample folder,
        // then derive the word list from all of their paragraphs.
        let mapped_data = (searchable_data @"C:\Code\SampleContent\")
        let data = Map.to_list mapped_data
        let results =
            // (name, paragraphs) list -> array -> flat paragraph array -> paragraph
            // list, which is the shape word_list consumes. The original expression
            // had an unbalanced closing parenthesis and did not parse.
            word_list (data |> Seq.to_array |> extract_paragraphs |> Array.to_list)

        print_any(mapped_data.ContainsKey("JaneEyre.txt"))
	#light
	namespace TextSearch
	// Values shared across the TextSearch namespace.
	module globals =
	    // The platform's line terminator, taken from the runtime environment.
	    // Indented so that it is nested inside the module again.
	    let newLine = System.Environment.NewLine

	// Loads the "*.txt" files of a folder, splits them into paragraphs and derives
	// the list of distinct words. Nesting is restored here (this copy of the
	// listing had been flattened to a single indent level with escaped pipes).
	module fileProcessing =
	    open System.IO
	    open System.Collections.Generic
	    open globals

	    // Module-local binding of the platform line terminator.
	    let newLine = System.Environment.NewLine

	    // Reads the whole text file at `path`. `use` disposes the reader (and so
	    // releases the file handle) when the function exits.
	    let file(path:string) =
	        use reader = new StreamReader(path)
	        reader.ReadToEnd()

	    // Names (without directory part) of the files in `path` matching "*.txt".
	    let filelisting(path:string) =
	        let di = new DirectoryInfo(path)
	        let files = di.GetFiles("*.txt")
	        files |> Seq.map (fun file->file.Name)

	    // Joins directory and file name; Path.Combine adds the separator only
	    // when `path` does not already end with one.
	    let fullpath (path:string) (filename:string) = Path.Combine(path, filename)

	    // Splits `str` on the whole string `splitter`, discarding empty pieces.
	    let stringsplit (splitter:string) (str:string) = str.Split([|splitter|], System.StringSplitOptions.RemoveEmptyEntries)

	    // Lower-cases the file's text and splits it into paragraphs, where a
	    // paragraph break is two consecutive platform newlines.
	    let loadfile fullpath =
	        file(fullpath).ToLower() |> stringsplit (System.Environment.NewLine + System.Environment.NewLine)

	    // Maps each name in `file_names` to the paragraphs of that file inside
	    // `file_path`. An empty list yields an empty map (the earlier
	    // null-sentinel recursion tried to load a file named null in that case).
	    let process_data file_path file_names =
	        let rec load_all (loaded:Map<string, string array>) (remaining:string list) =
	            match remaining with
	            | [] -> loaded
	            | name :: rest ->
	                let paragraphs = loadfile(fullpath file_path name)
	                load_all (loaded.Add(name, paragraphs)) rest
	        load_all Map.empty file_names

	    // Builds a map keyed by file name whose value is that file's paragraphs.
	    let searchable_data folder_path =
	        let file_names = filelisting(folder_path)
	        process_data folder_path (file_names |> Seq.to_list)

	    // Drops the keys of (key, paragraphs) pairs and flattens the paragraphs.
	    let extract_paragraphs data = data |> Array.map snd |> Array.concat

	    // Every distinct word occurring in `strings` (a list of paragraphs), in
	    // order of first occurrence.
	    let word_list strings =
	        // Strip these characters: " , ; .  Line breaks become a space so
	        // words on adjacent lines are not glued together.
	        let clean (text:string) =
	            text.Replace(newLine, " ").Replace(".", "").Replace(",", "").Replace(";", "").Replace("\"", "")

	        let split_words (paragraph:string) = paragraph.Split([|' '|]) |> Array.to_list

	        // Tracks the words already emitted; only the keys matter.
	        let words_found = new Dictionary<string, bool>()

	        // Keeps each non-empty word the first time it is seen; the
	        // accumulator is built in reverse and flipped at the end.
	        let rec dedupe (remaining:string list) (unique_words:string list) =
	            match remaining with
	            | [] -> List.rev unique_words
	            | word :: rest ->
	                if word = "" || words_found.ContainsKey(word) then
	                    dedupe rest unique_words
	                else
	                    words_found.Add(word, true)
	                    dedupe rest (word :: unique_words)

	        let all_words = strings |> List.map clean |> List.map split_words |> List.concat
	        dedupe all_words []

	    // Module-level bindings: load the sample folder and derive its word list.
	    let mapped_data = (searchable_data @"C:\Code\SampleContent\")
	    let data = Map.to_list mapped_data
	    let results =
	        // The original expression had an unbalanced closing parenthesis.
	        word_list (data |> Seq.to_array |> extract_paragraphs |> Array.to_list)

	    print_any(mapped_data.ContainsKey("JaneEyre.txt"))