Created
December 7, 2008 08:27
-
-
Save jcbozonier/33061 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#light | |
namespace TextSearch | |
module globals =
    /// Line terminator of the platform the program is running on.
    let newLine : string = System.Environment.NewLine
module fileProcessing = | |
open System.IO | |
open System.Collections.Generic | |
open globals | |
/// Line terminator of the host platform, used by word_list when cleaning
/// paragraph text. Shadows the identical globals.newLine opened above.
let newLine : string = System.Environment.NewLine
/// Reads the whole text file at `path` and returns its contents as one string.
/// The reader is bound with `use` so it is disposed (and the file handle
/// released) when the function exits, including when reading throws.
let file(path:string) =
    use reader = new StreamReader(path)
    reader.ReadToEnd()
/// Returns the names (FileInfo.Name, i.e. without the directory part) of the
/// files that DirectoryInfo.GetFiles reports for the pattern "*.txt" in the
/// directory `path`.
let filelisting(path:string) =
    (new DirectoryInfo(path)).GetFiles("*.txt")
    |> Seq.map (fun info -> info.Name)
/// Joins the directory `path` and the file name `filename` into one path.
/// Path.Combine inserts the directory separator only when `path` does not
/// already end with one, so both "C:\dir\" and "C:\dir" give a valid result
/// (plain concatenation produced "C:\dirname.txt" for the latter).
let fullpath (path:string) (filename:string) = System.IO.Path.Combine(path, filename)
/// Splits `str` on every occurrence of the whole string `splitter`,
/// dropping empty pieces from the result.
let stringsplit (splitter:string) (str:string) =
    let separators = [| splitter |]
    str.Split(separators, System.StringSplitOptions.RemoveEmptyEntries)
/// Reads the file at `fullpath`, lower-cases its text and splits it into
/// paragraphs, where a paragraph break is two consecutive platform newlines.
let loadfile fullpath =
    let paragraph_break = System.Environment.NewLine + System.Environment.NewLine
    let lowered = file(fullpath).ToLower()
    stringsplit paragraph_break lowered
/// Loads every file named in `file_names` from the directory `file_path` and
/// returns a map from file name to that file's paragraphs (see loadfile).
/// An empty `file_names` list yields an empty map; the previous null-sentinel
/// recursion tried to load a file with a null name in that case.
let process_data file_path file_names =
    let rec load_all (loaded:Map<string, string array>) remaining =
        match remaining with
        | [] -> loaded
        // Null names were skipped by the original sentinel logic; keep doing so.
        | null :: rest -> load_all loaded rest
        | file_name :: rest ->
            let paragraphs = loadfile(fullpath file_path file_name)
            load_all (loaded.Add(file_name, paragraphs)) rest
    load_all Map.empty file_names
// now we have n lists of lists of strings (a list of books that contain a list of paragraphs) | |
/// Builds the searchable corpus for `folder_path`: lists the folder's text
/// files and loads each one, returning the map produced by process_data
/// (file name -> paragraphs of that file).
let searchable_data folder_path =
    folder_path
    |> filelisting
    |> Seq.to_list
    |> process_data folder_path
/// Takes an array of (key, paragraphs) pairs and returns all paragraph arrays
/// concatenated into one array, in the order of the pairs.
let extract_paragraphs data =
    Array.concat (Array.map (fun (_, paragraphs) -> paragraphs) data)
// create a list of every word in all of the books to figure out what my term space is. | |
/// Returns every distinct word that occurs in the paragraphs `strings`,
/// in order of first appearance. Words are the space-separated pieces of a
/// paragraph after the characters " , ; . have been stripped.
///
/// The previous remove_dupes helper never produced a list: it built a tuple
/// out of a partially applied List.map, its lambda returned null, and the
/// accumulator was never updated.
let word_list (strings:string list) =
    // Line breaks become spaces (not "") so the last word of one line is not
    // glued to the first word of the next line. ";" is stripped as well, as
    // the original comment intended.
    let clean (text:string) =
        text.Replace(newLine, " ").Replace(".", "").Replace(",", "").Replace(";", "").Replace("\"", "")
    let seen = new HashSet<string>()
    [ for paragraph in strings do
        for word in (clean paragraph).Split([|' '|], System.StringSplitOptions.RemoveEmptyEntries) do
            // HashSet.Add returns false when the word was already recorded.
            if seen.Add(word) then
                yield word ]
// Load every book in the sample-content folder, keyed by file name.
let mapped_data = (searchable_data @"C:\Code\SampleContent\")
let data = Map.to_list mapped_data
// Distinct words across all paragraphs of all loaded books.
// The inner parentheses matter: `<|` binds looser than `|>`, so without them
// Array.to_list would be applied to the wrong value (the original line also
// had an unbalanced closing parenthesis).
let results =
    word_list((extract_paragraphs <| Seq.to_array data) |> Array.to_list)
print_any(mapped_data.ContainsKey("JaneEyre.txt"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment