Skip to content

Instantly share code, notes, and snippets.

@mseri
Forked from anuragsoni/count_words.ml
Created March 19, 2021 10:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mseri/0cd0f8ed60a3ce0c24ba4c1e493eb29a to your computer and use it in GitHub Desktop.
Save mseri/0cd0f8ed60a3ce0c24ba4c1e493eb29a to your computer and use it in GitHub Desktop.
module Trie = struct
type t = { mutable count : int; children : t option Array.t }
let[@inline] empty () = { count = 0; children = Array.make 128 None }
let add buf pos len t =
let rec loop idx t =
if idx > pos + len - 1 then t.count <- t.count + 1
else
let c = Char.lowercase_ascii (Bytes.get buf idx) in
let t' =
match t.children.(Char.code c) with
| None ->
let t'' = empty () in
t.children.(Char.code c) <- Some t'';
t''
| Some c -> c
in
loop (idx + 1) t'
in
loop pos t
let to_assoc t =
let rec loop acc curr t =
let acc =
match (curr, t.count) with
| [], _ -> acc
| _, 0 -> acc
| curr, c -> (List.rev curr, c) :: acc
in
let r = ref acc in
Array.iteri
(fun idx t' ->
match t' with
| None -> ()
| Some t' -> r := loop !r (Char.chr idx :: curr) t')
t.children;
!r
in
loop [] [] t
end
let[@inline] is_whitespace c = c = ' ' || c = '\n'
let rec skip_whitespace countwords pad ~lim i =
if i >= lim then (read_more_and_white [@tailcall]) countwords pad i
else if is_whitespace @@ Bytes.unsafe_get pad i then
(skip_whitespace [@tailcall]) countwords pad ~lim (i + 1)
else (read_next_word [@tailcall]) countwords pad ~lim i i
and read_next_word countwords pad ~lim i j =
if j >= lim then (read_more_and_next [@tailcall]) countwords pad i j
else if is_whitespace @@ Bytes.unsafe_get pad j then (
Trie.add pad i (j - i) countwords;
(skip_whitespace [@tailcall]) countwords pad ~lim (j + 1))
else (read_next_word [@tailcall]) countwords pad ~lim i (j + 1)
and read_more_and_next countwords pad i j =
let l = j - i in
Bytes.blit pad i pad 0 l;
let read = input stdin pad l (Bytes.length pad - l) in
let lim = l + read in
if lim = 0 then () else (read_next_word [@tailcall]) countwords pad ~lim 0 l
and read_more_and_white countwords pad i =
let lim = input stdin pad 0 (Bytes.length pad) in
if lim = 0 then () else (skip_whitespace [@tailcall]) countwords pad ~lim 0
let[@inline] read_start countwords pad =
(read_more_and_next [@specialize]) countwords pad 0 0
let () =
let t = Trie.empty () in
let pad = Bytes.make (64 * 1024) ' ' in
read_start t pad;
let assoc = Array.of_list (Trie.to_assoc t) in
Array.sort (fun (_, a) (_, b) -> Int.compare b a) assoc;
Array.iter
(fun (word, count) ->
List.iter print_char word;
print_char ' ';
print_int count;
print_newline ())
assoc
(* Simpler IO doesn't make the timing worse *)
module Trie = struct
type t = { mutable count : int; children : t option Array.t }
let[@inline] empty () = { count = 0; children = Array.make 128 None }
let add s pos len t =
let rec loop idx t =
if idx > pos + len - 1 then t.count <- t.count + 1
else
let c = Char.lowercase_ascii (String.get s idx) in
let t' =
match t.children.(Char.code c) with
| None ->
let t'' = empty () in
t.children.(Char.code c) <- Some t'';
t''
| Some c -> c
in
loop (idx + 1) t'
in
loop pos t
let to_assoc t =
let rec loop acc curr t =
let acc =
match (curr, t.count) with
| [], _ -> acc
| _, 0 -> acc
| curr, c -> (List.rev curr, c) :: acc
in
let r = ref acc in
Array.iteri
(fun idx t' ->
match t' with
| None -> ()
| Some t' -> r := loop !r (Char.chr idx :: curr) t')
t.children;
!r
in
loop [] [] t
end
let read_input t =
try
let rec loop () =
let line = input_line stdin in
let start = ref 0 in
let len = ref 0 in
for i = 0 to String.length line - 1 do
if line.[i] <> ' ' then incr len
else (
Trie.add line !start !len t;
start := i + 1;
len := 0)
done;
Trie.add line !start !len t;
loop ()
in
loop ()
with End_of_file -> ()
let () =
let t = Trie.empty () in
read_input t;
let assoc = Array.of_list (Trie.to_assoc t) in
Array.sort (fun (_, a) (_, b) -> Int.compare b a) assoc;
Array.iter
(fun (word, count) ->
List.iter print_char word;
print_char ' ';
print_int count;
print_newline ())
assoc
module BytesHash = struct
include Bytes
let hash (k : t) = Hashtbl.hash k
end
module BytesHashtbl = Hashtbl.Make (BytesHash)
let lowercase_inplace pad i j =
for k = i to j - 1 do
Bytes.unsafe_set pad k @@ Char.lowercase_ascii @@ Bytes.unsafe_get pad k
done
let sub_and_add_word countwords source i len =
let word = Bytes.sub source i len in
match BytesHashtbl.find countwords word with
| v ->
BytesHashtbl.replace countwords word (v + 1)
| exception Not_found ->
BytesHashtbl.add countwords word 1
let[@inline] is_whitespace c = c = ' ' || c = '\n'
let rec skip_whitespace countwords pad ~lim i =
if i >= lim then
(read_more_and_white[@taillcall]) countwords pad i
else if is_whitespace @@ Bytes.unsafe_get pad i then
(skip_whitespace[@tailcall]) countwords pad ~lim (i + 1)
else
(read_next_word[@tailcall]) countwords pad ~lim i i
and read_next_word countwords pad ~lim i j =
if j >= lim then
(read_more_and_next[@taillcall]) countwords pad i j
else if is_whitespace @@ Bytes.unsafe_get pad j then begin
sub_and_add_word countwords pad i (j-i);
(skip_whitespace[@taillcall]) countwords pad ~lim (j+1)
end else
(read_next_word[@taillcall]) countwords pad ~lim i (j + 1)
and read_more_and_next countwords pad i j =
let l = j - i in
Bytes.blit pad i pad 0 l;
let read = input stdin pad l (Bytes.length pad - l) in
let lim = l+read in
lowercase_inplace pad l lim;
if lim = 0 then ()
else (read_next_word[@taillcall]) countwords pad ~lim 0 l
and read_more_and_white countwords pad i =
let lim = input stdin pad 0 (Bytes.length pad) in
lowercase_inplace pad 0 lim;
if lim = 0 then ()
else (skip_whitespace[@taillcall]) countwords pad ~lim 0
let[@inline] read_start countwords pad =
(read_more_and_next[@specialize]) countwords pad 0 0
let[@inline] print_slot (w, c) =
print_bytes w;
print_char ' ';
print_int c;
print_newline ()
let () =
let countwords = BytesHashtbl.create 33_000 in
let size = 64*1024 in
let pad = Bytes.make size ' ' in
read_start countwords pad ;
let a = Array.make (BytesHashtbl.length countwords) (Bytes.empty,0) in
let f k v i = a.(i) <- (k,v); i+1 in
let _ = BytesHashtbl.fold f countwords 0 in
Array.sort (fun (_,i) (_,j) -> Int.compare i j) a;
Array.iter print_slot a
module BytesHash = struct
include Bytes
let hash (k : bytes) = Hashtbl.hash k
end
module BytesHashtbl = Hashtbl.Make (BytesHash)
module Buf = struct
type t = { buf : Bytes.t; mutable pos : int; mutable len : int; size : int }
let create size = { buf = Bytes.create size; pos = 0; len = 0; size }
let[@inline] compress t =
if t.len <= 0 || t.pos = t.len then (
t.pos <- 0;
t.len <- 0)
else if t.pos > 0 then (
BytesLabels.unsafe_blit ~src:t.buf ~src_pos:t.pos ~dst:t.buf ~dst_pos:0
~len:t.len;
t.pos <- 0)
let read_into t fd =
compress t;
let count =
UnixLabels.read fd ~buf:t.buf ~pos:(t.pos + t.len) ~len:(t.size - t.len)
in
t.len <- t.len + count;
for i = t.pos to t.len - 1 do
Bytes.unsafe_set t.buf i (Char.lowercase_ascii (Bytes.unsafe_get t.buf i))
done;
count
let iter_words t tbl =
let prev = ref t.pos in
for i = t.pos to t.len - 1 do
(* We perform bounds check so its okay to call unsafe_get here *)
let c = Bytes.unsafe_get t.buf i in
if c = '\n' || c = ' ' then (
let pos = !prev in
let len = i - !prev in
(if len > 0 then
let word = Bytes.sub t.buf pos len in
try incr (BytesHashtbl.find tbl word)
with Not_found -> BytesHashtbl.add tbl word (ref 1));
prev := i + 1)
done;
t.len <- t.len - !prev;
t.pos <- !prev
end
let () =
let countwords = BytesHashtbl.create 33_000 in
let buf = Buf.create (64 * 1024) in
while Buf.read_into buf Unix.stdin > 0 do
Buf.iter_words buf countwords
done;
let arr = Array.make (BytesHashtbl.length countwords) (Bytes.empty, 0) in
let idx = ref 0 in
BytesHashtbl.iter
(fun word count ->
arr.(!idx) <- (word, !count);
incr idx)
countwords;
Array.sort (fun (_, x) (_, y) -> Int.compare y x) arr;
Array.iter
(fun (w, c) ->
print_bytes w;
print_char ' ';
print_int c;
print_newline ())
arr
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment