Skip to content

Instantly share code, notes, and snippets.

@jzstark
Last active July 31, 2017 22:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jzstark/21b2df84b76f7de02cdc919f23e26b9d to your computer and use it in GitHub Desktop.
Save jzstark/21b2df84b76f7de02cdc919f23e26b9d to your computer and use it in GitHub Desktop.
Load the IMDB sentiment dataset
(* source: http://ai.stanford.edu/~amaas/data/sentiment/ *)
(* Path of the vocabulary file that [vectorize_file] hands to [load_vocab];
   every line of that file is treated as one vocabulary entry.  The path is
   relative, so it is resolved against the current working directory. *)
let vocab_file = "imdb.vocab"
(** [load_file f] reads the whole of file [f] and returns its text with every
    character other than ASCII letters, hyphens and spaces removed, and the
    remainder converted to lower case.

    Line breaks are among the removed characters, so two words separated
    only by a newline end up joined together.

    @raise Sys_error if [f] cannot be opened or read. *)
let load_file f =
  (* Binary mode: [in_channel_length] reports the raw byte count, which only
     matches what can actually be read when no end-of-line translation is
     performed on the channel. *)
  let ic = open_in_bin f in
  let raw =
    try
      let n = in_channel_length ic in
      (* [really_input_string] yields an immutable [string], which is what
         [Str.global_replace] expects (a [bytes] buffer does not type-check
         once strings and bytes are distinct types). *)
      let contents = really_input_string ic n in
      close_in ic;
      contents
    with e ->
      (* Never leak the descriptor when reading fails. *)
      close_in_noerr ic;
      raise e
  in
  (* Remove every non-alphabetic character except for space and hyphen.
     Limitation noted by the original author: input containing multiple
     hyphens is not handled specially. *)
  Str.global_replace (Str.regexp "[^a-zA-Z- ]") "" raw
  |> String.lowercase_ascii
(** [string2array s] splits [s] on space characters and returns the words as
    an array, in their original order.

    A run of consecutive spaces counts as a single separator and leading or
    trailing spaces are ignored, so the result never contains an empty
    string.  (Splitting on exactly one space produced empty tokens wherever
    two spaces were adjacent, which happens as soon as punctuation standing
    between two spaces has been deleted.) *)
let string2array s =
  String.split_on_char ' ' s
  |> List.filter (fun word -> word <> "")
  |> Array.of_list
(* Attrib: https://stackoverflow.com/questions/5774934/how-do-i-read-in-lines-from-a-text-file-in-ocaml*)
(** [read_lines name] returns the lines of file [name], in file order and
    without their line terminators.  An empty file yields [[]].

    The channel is closed on every exit path, including when reading fails.

    @raise Sys_error if the file cannot be opened or read. *)
let read_lines name : string list =
  let ic = open_in name in
  (* Tail-recursive: the value case of [match ... with exception] lies
     outside the exception handler, so long files do not grow the stack. *)
  let rec loop acc =
    match input_line ic with
    | line -> loop (line :: acc)
    | exception End_of_file -> List.rev acc
  in
  match loop [] with
  | lines ->
    close_in ic;
    lines
  | exception e ->
    (* Any failure other than the expected end of file: release the
       descriptor, then let the caller see the original error. *)
    close_in_noerr ic;
    raise e
(** [load_vocab vocab_f] reads [vocab_f] with [read_lines] and returns a pair
    of hash tables [(v2i, i2v)]: [v2i] maps each line to its zero-based
    position in the file, and [i2v] maps that position back to the line. *)
let load_vocab vocab_f =
  let words = read_lines vocab_f in
  let capacity = List.length words in
  let word_to_index = Hashtbl.create capacity in
  let index_to_word = Hashtbl.create capacity in
  (* Walk the list once, numbering the entries from 0 upwards. *)
  let rec fill idx = function
    | [] -> ()
    | word :: rest ->
      Hashtbl.add word_to_index word idx;
      Hashtbl.add index_to_word idx word;
      fill (idx + 1) rest
  in
  fill 0 words;
  (word_to_index, index_to_word)
(** [vectorize_file fname] converts the text of [fname] into an array of
    vocabulary indices, one per recognised word, in reading order.

    The text is cleaned by [load_file] (which strips punctuation, so the
    dots of a word followed by an ellipsis or the bracket of an HTML tag are
    already gone) and tokenised by [string2array].  Each token is then looked
    up in the vocabulary stored in [vocab_file].

    Tokens that are not in the vocabulary are dropped instead of raising
    [Not_found], so a single unknown word no longer aborts the whole load.
    No placeholder is written for them: index 0 is a real vocabulary entry
    (the first line of the file) and therefore cannot double as a marker.
    The result may consequently be shorter than the number of tokens.

    @raise Sys_error if [fname] or [vocab_file] cannot be read. *)
let vectorize_file =
  (* The vocabulary is identical for every input file: parse it once, on
     first use, rather than re-reading it on each call. *)
  let vocab = lazy (fst (load_vocab vocab_file)) in
  fun fname ->
    let tokens = load_file fname |> string2array in
    let v2i = Lazy.force vocab in
    (* Accumulates indices in reverse; unknown words leave [acc] untouched. *)
    let keep_known acc word =
      match Hashtbl.find v2i word with
      | idx -> idx :: acc
      | exception Not_found -> acc
    in
    Array.fold_left keep_known [] tokens |> List.rev |> Array.of_list
(** [load_train_file dir] vectorises every entry of directory [dir] with
    [vectorize_file] and returns the resulting index arrays as a list, in
    the order in which [Sys.readdir] lists the entries (that order is not
    specified by the standard library).

    @raise Sys_error if [dir] or one of its entries cannot be read. *)
let load_train_file dir =
  Sys.readdir dir
  (* [Sys.readdir] yields bare entry names; they must be joined to [dir]
     before being opened, otherwise they are looked up in the current
     working directory.  Mapping over the array also avoids rebuilding the
     accumulator list with an append for every file. *)
  |> Array.map (fun name -> vectorize_file (Filename.concat dir name))
  |> Array.to_list
let dir = "./train/neg" in
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment