Skip to content

Instantly share code, notes, and snippets.

@jcbozonier
Created December 7, 2008 08:27
Show Gist options
  • Save jcbozonier/33061 to your computer and use it in GitHub Desktop.
Save jcbozonier/33061 to your computer and use it in GitHub Desktop.
#light
namespace TextSearch
module globals =
    // Platform line terminator, shared by modules that open `globals`.
    let newLine : string = System.Environment.NewLine
module fileProcessing =
open System.IO
open System.Collections.Generic
open globals
// Platform line terminator; bound to the same expression as globals.newLine.
let newLine : string = System.Environment.NewLine
// Reads the entire contents of the text file at `path` and returns it as one string.
// Any exception raised by StreamReader (e.g. for a missing file) propagates to the caller.
let file(path:string) =
    // `use` disposes the reader when this function exits, so the underlying
    // file handle is released even if ReadToEnd throws.
    use reader = new StreamReader(path)
    reader.ReadToEnd()
// Returns the names (file name only, no directory part) of every "*.txt"
// file found directly inside the folder `path`.
let filelisting(path:string) =
    let text_files = (new DirectoryInfo(path)).GetFiles("*.txt")
    Seq.map (fun (entry:FileInfo) -> entry.Name) text_files
// Joins a folder path and a file name into a single path.
// Path.Combine inserts a directory separator only when `path` does not
// already end with one, so callers may pass the folder with or without a
// trailing separator (plain concatenation required the trailing separator).
let fullpath (path:string) (filename:string) = System.IO.Path.Combine(path, filename)
// Splits `str` on every occurrence of the string `splitter`,
// dropping empty pieces from the result.
let stringsplit (splitter:string) (str:string) =
    let separators = [| splitter |]
    str.Split(separators, System.StringSplitOptions.RemoveEmptyEntries)
// Loads the file at `fullpath`, lower-cases its text and splits it into
// pieces wherever two platform line terminators occur back to back
// (i.e. at blank lines); empty pieces are discarded by stringsplit.
let loadfile fullpath =
    let blank_line = System.Environment.NewLine + System.Environment.NewLine
    let contents : string = file(fullpath)
    stringsplit blank_line (contents.ToLower())
// Loads every file named in `file_names` from the folder `file_path` and
// returns a Map keyed by file name whose value is the array produced by
// `loadfile` for that file.
// An empty `file_names` list yields an empty map. The previous version
// seeded the recursion with a null file name, so an empty list fell into
// the "last element" case and tried to load the folder path as a file.
let process_data file_path file_names =
    let rec load_all (loaded:Map<string, string array>) remaining =
        match remaining with
        | [] -> loaded
        | name :: rest ->
            let paragraphs = loadfile(fullpath file_path name)
            load_all (loaded.Add(name, paragraphs)) rest
    load_all Map.empty file_names
// process_data gives us a map from each file name (a book) to the array of paragraphs loaded from that file
// Builds the searchable data set for `folder_path`: lists the folder's
// text files via filelisting, then hands the names to process_data, which
// loads each file and stores its paragraphs under the file name.
let searchable_data folder_path =
    filelisting(folder_path)
    |> Seq.to_list
    |> process_data folder_path
// Takes an array of (key, array) pairs, discards the keys and
// concatenates the arrays, in order, into one flat array.
let extract_paragraphs data =
    let arrays_only = Array.map (fun (_, paragraphs) -> paragraphs) data
    Array.concat arrays_only
// create a list of every word in all of the books to figure out what my term space is.
// Returns every distinct word found in `strings`, in order of first
// appearance.
//   strings : the paragraphs to scan.
//   returns : a string list with no duplicates and no empty entries.
// Before splitting, the characters . , ; and " are removed from each
// paragraph. Words are separated by spaces or line-break characters.
let word_list (strings:string list) =
    // ';' is stripped as well, as the original comment on this function
    // intended (the previous code left it in).
    let strip_punctuation (text:string) =
        text.Replace(".", "").Replace(",", "").Replace(";", "").Replace("\"", "")
    // Line breaks are treated as separators rather than deleted; deleting
    // them fused the last word of a line with the first word of the next.
    let separators = [|' '; '\r'; '\n'|]
    // Tracks words already emitted so each appears only once.
    let words_found = new Dictionary<string, bool>()
    // Accumulated newest-first; reversed once at the end.
    let mutable unique_words : string list = []
    for paragraph in strings do
        let words =
            (strip_punctuation paragraph).Split(separators, System.StringSplitOptions.RemoveEmptyEntries)
        for word in words do
            if not (words_found.ContainsKey(word)) then
                words_found.Add(word, true)
                unique_words <- word :: unique_words
    List.rev unique_words
// Script entry: load every text file from the sample folder into a map
// keyed by file name.
let mapped_data = (searchable_data @"C:\Code\SampleContent\")
// The same data as a list of (file name, paragraphs) pairs.
let data = Map.to_list mapped_data
// Flatten the paragraphs of all files and compute the word list over them.
// (The previous expression had an unmatched ')' and did not parse.)
let results =
    data
    |> Seq.to_array
    |> extract_paragraphs
    |> Array.to_list
    |> word_list
// Prints whether a file named "JaneEyre.txt" was among those loaded.
print_any(mapped_data.ContainsKey("JaneEyre.txt"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment