Skip to content

Instantly share code, notes, and snippets.

@copy
Created May 20, 2018 18:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save copy/45e413312966288884aee839b022caf5 to your computer and use it in GitHub Desktop.
Save copy/45e413312966288884aee839b022caf5 to your computer and use it in GitHub Desktop.
(*
* The Computer Language Benchmarks Game
* https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
*
* regex-dna program contributed by Paolo Ribeca, August 2011
*
* The regexp machinery comes from a previous OCaml version by
* Christophe TROESTLER and Mauricio Fernandez
*
* converted from regex-dna program
*)
let workers = 16
let variants, variants_to_string =
let variants = [|
"agggtaaa\\|tttaccct";
"[cgt]gggtaaa\\|tttaccc[acg]";
"a[act]ggtaaa\\|tttacc[agt]t";
"ag[act]gtaaa\\|tttac[agt]ct";
"agg[act]taaa\\|ttta[agt]cct";
"aggg[acg]aaa\\|ttt[cgt]ccct";
"agggt[cgt]aa\\|tt[acg]accct";
"agggta[cgt]a\\|t[acg]taccct";
"agggtaa[cgt]\\|[acg]ttaccct"
|]
(* Remove the "\\" which is mandatory in OCaml regex *)
and re_bs = Str.regexp_string "\\" in
Array.map Str.regexp_case_fold variants,
Array.map (Str.global_replace re_bs "") variants
let iupacs =
Array.map (fun (re, s) -> Str.regexp re, s) [|
"tHa[Nt]", "<4>";
"aND\\|caN\\|Ha[DS]\\|WaS", "<3>";
"a[NSt]\\|BY", "<2>";
"<[^>]*>", "\\|";
"\\|[^|][^|]*\\|", "-"
|]
(* Read all of a redirected FASTA format file from stdin *)
let file_data, file_length =
let b = Buffer.create 0xffffff and s = Bytes.create 0xfff and r = ref 1 in
while !r > 0 do
r := input stdin s 0 0xfff; Buffer.add_subbytes b s 0 !r
done;
Buffer.contents b, Buffer.length b
(* Remove FASTA sequence descriptions and all linefeed characters *)
let dna = Bytes.unsafe_of_string (Str.global_replace (Str.regexp "^>.*$\\|\n") "" file_data)
let code_length = Bytes.length dna
(* Count matches of [re] *)
let count re s =
let i = ref 0 and n = ref 0 in
try
while true do
i := 1 + Str.search_forward re (Bytes.unsafe_to_string s) !i;
incr n
done;
assert false
with Not_found -> !n
let () =
let chunk_size = code_length / workers
and rem = code_length mod workers in
assert (chunk_size >= 7);
let w = Array.make workers stdin
and red_workers = workers - 1 in
for i = 0 to red_workers do
let delta = if i > 0 then 7 else 0 in
let lo = i * chunk_size + min i rem - delta in
let len = chunk_size + (if i < rem then 1 else 0) + delta
and input, output = Unix.pipe () in
match Unix.fork () with
| 0 ->
Unix.close input;
let chunk = Bytes.sub dna lo len
and output = Unix.out_channel_of_descr output in
(* First all the regexps... *)
Array.iter
(fun re -> output_binary_int output (count re chunk))
variants;
(* ...and then all the IUPAC replacements *)
if i > 0 then
for j = 0 to 6 do
chunk.[j] <- ' '
done;
let b = ref chunk in
Array.iter
(fun (re, s) -> b := Bytes.unsafe_of_string (Str.global_replace re s (Bytes.unsafe_to_string !b)))
iupacs;
output_binary_int output (Bytes.length !b - delta);
exit 0
| _ ->
Unix.close output;
w.(i) <- Unix.in_channel_of_descr input
done;
let counts_variants = Array.init (Array.length variants) (fun _ -> ref 0)
and count_iupac = ref 0 in
Array.iter
(fun input ->
Array.iter
(fun count -> count := !count + input_binary_int input)
counts_variants;
count_iupac := !count_iupac + input_binary_int input)
w;
Array.iteri
(fun i re -> Printf.printf "%s %i\n" re !(counts_variants.(i)))
variants_to_string;
Printf.printf "\n%i\n%i\n%i\n%!" file_length code_length !count_iupac
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment