Skip to content

Instantly share code, notes, and snippets.

@philtomson
Last active December 28, 2015 07:59
Show Gist options
  • Save philtomson/7468849 to your computer and use it in GitHub Desktop.
Save philtomson/7468849 to your computer and use it in GitHub Desktop.
quad tree implementation in OCaml (not yet complete)
type base = A | C | G | T | ROOT ;;
let base_to_s b = match b with
| A -> "A"
| C -> "C"
| G -> "G"
| T -> "T"
| _ -> "ROOT";;
let char_to_base c = match c with
| 'a' | 'A' -> A
| 'c' | 'C' -> C
| 'g' | 'G' -> G
| 't' | 'T' -> T
| _ -> ROOT ;;
type quad_tree = Nd of base * quad_tree * quad_tree * quad_tree * quad_tree
| Empty
| Leaf of base * int ref ;;
let init_quad_tree = Nd(ROOT, Empty,Empty,Empty,Empty);;
let new_node' b = Nd(b,Empty,Empty,Empty,Empty);;
let new_node prev current b = if ((current = ROOT) || (prev = current)) then
new_node' b
else
Empty ;;
let new_leaf prev current b = if prev = current then
Leaf(b, ref 1)
else
Empty ;;
let add_new_node prev current b = new_node b;;
let rec add_node prev_b base k qtree =
let rec aux k' accum qtree' =
if k' = k then
(
match qtree' with
| Nd(bse, Empty, Empty, Empty, Empty) ->
(
match base with
| A -> Nd(bse, (new_leaf prev_b bse base),Empty,Empty,Empty)
| C -> Nd(bse, Empty,(new_leaf prev_b bse base),Empty,Empty)
| G -> Nd(bse, Empty,Empty,(new_leaf prev_b bse base),Empty)
| T -> Nd(bse, Empty,Empty,Empty,(new_leaf prev_b bse base))
| _ -> qtree'
)
| Nd(bse, (Leaf(_,count) as lfa), Empty, Empty, Empty) ->
(
match base with
| A -> count := !count + 1; qtree'
| C -> Nd(bse, lfa,(new_leaf prev_b bse base),Empty,Empty)
| G -> Nd(bse, lfa,Empty,(new_leaf prev_b bse base),Empty)
| T -> Nd(bse, lfa,Empty,Empty,(new_leaf prev_b bse base))
| _ -> qtree'
)
| Nd(bse, Empty,(Leaf(_,count) as lfc), Empty, Empty) ->
(
match base with
| A -> Nd(bse, (new_leaf prev_b bse base),lfc,Empty,Empty)
| C -> count := !count + 1; qtree'
| G -> Nd(bse, Empty,lfc,(new_leaf prev_b bse base),Empty)
| T -> Nd(bse, Empty,lfc,Empty,(new_leaf prev_b bse base))
| _ -> qtree'
)
| Nd(bse, Empty,(Leaf(_,counta) as lfc), Empty, (Leaf(_,countt) as lft)) ->
(
match base with
| A -> Nd(bse, (new_leaf prev_b bse base),lfc,Empty,lft)
| C -> counta := !counta + 1; qtree'
| G -> Nd(bse, Empty,lfc,(new_leaf prev_b bse base),lft)
| T -> countt := !countt + 1; qtree'
| _ -> qtree'
)
| Nd(bse, (Leaf(_,counta) as lfa),(Leaf(_,countc) as lfc), Empty, Empty) ->
(
match base with
| A -> counta := !counta + 1; qtree'
| C -> countc := !countc + 1; qtree'
| G -> Nd(bse, lfa,lfc,(new_leaf prev_b bse base),Empty)
| T -> Nd(bse, lfa,lfc,Empty,(new_leaf prev_b bse base))
| _ -> qtree'
)
| Nd(bse, (Leaf(_,counta) as lfa),(Leaf(_,countc) as lfc),Empty,(Leaf(_,countt) as lft)) ->
(
match base with
| A -> counta := !counta + 1; qtree'
| C -> countc := !countc + 1; qtree'
| G -> Nd(bse, lfa,lfc,(new_leaf prev_b bse base),lft)
| T -> countt := !countt + 1; qtree'
| _ -> qtree'
)
| Nd(bse, (Leaf(_,counta) as lfa),(Leaf(_,countc) as lfc),(Leaf(_,countg) as lfg), Empty) ->
(
match base with
| A -> counta := !counta + 1; qtree'
| C -> countc := !countc + 1; qtree'
| G -> countg := !countg + 1; qtree'
| T -> Nd(bse, lfa,lfc,lfg,(new_leaf prev_b bse base))
| _ -> qtree'
)
| Nd(bse, Empty, Empty, (Leaf(_,count) as lfg), Empty) ->
(
match base with
| A -> Nd(bse, (new_leaf prev_b bse base),Empty,lfg,Empty)
| C -> Nd(bse, Empty,(new_leaf prev_b bse base),lfg,Empty)
| G -> count := !count + 1; qtree'
| T -> Nd(bse, Empty,Empty,lfg,(new_leaf prev_b bse base))
| _ -> qtree'
)
| Nd(bse, (Leaf(_,counta) as lfa),Empty, (Leaf(_,countg) as lfg), Empty) ->
(
match base with
| A -> counta := !counta + 1; qtree'
| C -> Nd(bse, lfa,(new_leaf prev_b bse base),lfg,Empty)
| G -> countg := !countg + 1; qtree'
| T -> Nd(bse, lfa,Empty,lfg,(new_leaf prev_b bse base))
| _ -> qtree'
)
| Nd(bse, Empty,(Leaf(_,countc) as lfc), (Leaf(_,countg) as lfg), Empty) ->
(
match base with
| A -> Nd(bse, (new_leaf prev_b bse base),lfc,lfg,Empty)
| C -> countc := !countc + 1; qtree'
| G -> countg := !countg + 1; qtree'
| T -> Nd(bse, Empty,lfc,lfg,(new_leaf prev_b bse base))
| _ -> qtree'
)
| Nd(bse, Empty,(Leaf(_,countc) as lfc),(Leaf(_,countg) as lfg),(Leaf(_,countt) as lft)) ->
(
match base with
| A -> Nd(bse, (new_leaf prev_b bse base),lfc,lfg,lft)
| C -> countc := !countc + 1; qtree'
| G -> countg := !countg + 1; qtree'
| T -> countt := !countt + 1; qtree'
| _ -> qtree'
)
| Nd(bse, Leaf(_,counta),Leaf(_,countc),Leaf(_,countg),Leaf(_,countt) ) ->
(
match base with
| A -> counta := !counta + 1; qtree'
| C -> countc := !countc + 1; qtree'
| G -> countg := !countg + 1; qtree'
| T -> countt := !countt + 1; qtree'
| _ -> qtree'
)
| Nd(bse, Empty, Empty, Empty, (Leaf(_,count) as lft)) ->
(
match base with
| A -> Nd(bse, (new_leaf prev_b bse base),Empty,Empty,lft)
| C -> Nd(bse, Empty,(new_leaf prev_b bse base),Empty,lft)
| G -> Nd(bse, Empty,Empty,(new_leaf prev_b bse base),lft)
| T -> (*count := !count + 1;*) qtree'
| _ -> qtree'
)
| Nd(bse, (Leaf(_,counta) as lfa), Empty, Empty, (Leaf(_,countt) as lft)) ->
(
match base with
| A -> counta := !counta + 1; qtree'
| C -> Nd(bse, lfa,(new_leaf prev_b bse base),Empty,lft)
| G -> Nd(bse, lfa,Empty,(new_leaf prev_b bse base),lft)
| T -> countt := !countt + 1; qtree'
| _ -> qtree'
)
| Nd(bse, Empty, Empty,(Leaf(_,countg) as lfg),(Leaf(_,countt) as lft)) ->
(
match base with
| A -> Nd(bse, (new_leaf prev_b bse base),Empty,lfg,lft)
| C -> Nd(bse, Empty,(new_leaf prev_b bse base),lfg,lft)
| G -> countg := !countg + 1; qtree'
| T -> countt := !countt + 1; qtree'
| _ -> qtree'
)
| Nd(bse, (Leaf(_,counta) as lfa),Empty,(Leaf(_,countg) as lfg),(Leaf(_,countt) as lft)) ->
(
match base with
| A -> counta := !counta + 1; qtree'
| C -> Nd(bse, lfa,(new_leaf prev_b bse base),lfg,lft)
| G -> countg := !countg + 1; qtree'
| T -> countt := !countt + 1; qtree'
| _ -> qtree'
)
| Nd(bse, aa, Empty, gg, tt) -> Nd(bse, aa,(new_leaf prev_b bse base),gg,tt)
| Nd(bse, aa, cc, Empty, tt) -> Nd(bse, aa,cc,(new_leaf prev_b bse base),tt)
| Nd(bse, aa, cc, gg, Empty) -> Nd(bse, aa,cc,gg,(new_leaf prev_b bse base))
| Leaf(bse,count) -> count := !count + 1; qtree'
| Empty -> Leaf(base, ref accum)
| _ -> qtree'
)
else
match qtree' with
(*do we need this? *)
| Leaf(_,iref) -> iref := !iref + 1; qtree'
| Nd(bse,Empty,Empty,Empty,Empty)->
(
match base with
| A -> Nd(bse,(new_node prev_b bse base),Empty,Empty,Empty )
| C -> Nd(bse,Empty,(new_node prev_b bse base),Empty,Empty )
| G -> Nd(bse,Empty,Empty,(new_node prev_b bse base),Empty )
| T -> Nd(bse,Empty,Empty,Empty,(new_node prev_b bse base) )
| _ -> qtree'
)
| Nd(bse,(Nd(_,_,_,_,_) as a),Empty,Empty,Empty)->
(
match base with
| A -> Nd(bse,(aux (k'+1) (accum+1) a),Empty,Empty,Empty )
| C -> Nd(bse,(aux (k'+1) (accum+1) a),(new_node prev_b bse base),Empty,Empty )
| G -> Nd(bse,(aux (k'+1) (accum+1) a),Empty,(new_node prev_b bse base),Empty )
| T -> Nd(bse,(aux (k'+1) (accum+1) a),Empty,Empty,(new_node prev_b bse base) )
| _ -> qtree'
)
| Nd(bse,Empty,(Nd(_,_,_,_,_) as c),Empty,Empty)->
(
match base with
| A -> Nd(bse,(new_node prev_b bse base),(aux (k'+1) (accum+1) c),Empty,Empty )
| C -> Nd(bse,Empty,(aux (k'+1) (accum+1) c),Empty,Empty )
| G -> Nd(bse,Empty,(aux (k'+1) (accum+1) c),(new_node prev_b bse base),Empty )
| T -> Nd(bse,Empty,(aux (k'+1) (accum+1) c),Empty,(new_node prev_b bse base) )
| _ -> qtree'
)
| Nd(bse,(Nd(_,_,_,_,_) as a),(Nd(_,_,_,_,_) as c),Empty,Empty)->
(
match base with
| A -> Nd(bse,(aux (k'+1) (accum+1) a),(aux (k'+1) (accum+1) c),Empty,Empty )
| C -> Nd(bse,(aux (k'+1) (accum+1) a),(aux (k'+1) (accum+1) c),Empty,Empty )
| G -> Nd(bse,(aux (k'+1) (accum+1) a),(aux (k'+1) (accum+1) c),(new_node prev_b bse base),Empty )
| T -> Nd(bse,(aux (k'+1) (accum+1) a),(aux (k'+1) (accum+1) c),Empty,(new_node prev_b bse base) )
| _ -> qtree'
)
| Nd(bse,Empty,Empty,(Nd(_,_,_,_,_) as g),Empty)->
(
match base with
| A -> Nd(bse,(new_node prev_b bse base),Empty,(aux (k'+1) (accum+1) g),Empty )
| C -> Nd(bse,Empty,(new_node prev_b bse base),(aux (k'+1) (accum+1) g),Empty )
| G -> Nd(bse,Empty,Empty,(aux (k'+1) (accum+1) g),Empty )
| T -> Nd(bse,Empty,Empty,(aux (k'+1) (accum+1) g),(new_node prev_b bse base) )
| _ -> qtree'
)
| Nd(bse,(Nd(_,_,_,_,_) as a),Empty,(Nd(_,_,_,_,_) as g),Empty)->
(
match base with
| A -> Nd(bse,(aux (k'+1) (accum+1) a),Empty,(aux (k'+1) (accum+1) g),Empty )
| C -> Nd(bse,(aux (k'+1) (accum+1) a),(new_node prev_b bse base),(aux (k'+1) (accum+1) g),Empty )
| G -> Nd(bse,(aux (k'+1) (accum+1) a),Empty,(aux (k'+1) (accum+1) g),Empty )
| T -> Nd(bse,(aux (k'+1) (accum+1) a),Empty,(aux (k'+1) (accum+1) g),(new_node prev_b bse base) )
| _ -> qtree'
)
| Nd(bse,Empty,(Nd(_,_,_,_,_) as c),(Nd(_,_,_,_,_) as g),Empty)->
(
match base with
| A -> Nd(bse,(new_node prev_b bse base),(aux (k'+1) (accum+1) c),(aux (k'+1) (accum+1) g),Empty )
| C -> Nd(bse,Empty,(aux (k'+1) (accum+1) c),(aux (k'+1) (accum+1) g),Empty )
| G -> Nd(bse,Empty,(aux (k'+1) (accum+1) c),(aux (k'+1) (accum+1) g),Empty )
| T -> Nd(bse,Empty,(aux (k'+1) (accum+1) c),(aux (k'+1) (accum+1) g),(new_node prev_b bse base) )
| _ -> qtree'
)
| Nd(bse,(Nd(_,_,_,_,_) as a),(Nd(_,_,_,_,_) as c),(Nd(_,_,_,_,_) as g),Empty)->
(
match base with
| A -> Nd(bse,(aux (k'+1) (accum+1) a),(aux (k'+1) (accum+1) c),(aux (k'+1) (accum+1) g),Empty )
| C -> Nd(bse,(aux (k'+1) (accum+1) a),(aux (k'+1) (accum+1) c),(aux (k'+1) (accum+1) g),Empty )
| G -> Nd(bse,(aux (k'+1) (accum+1) a),(aux (k'+1) (accum+1) c),(aux (k'+1) (accum+1) g),Empty )
| T -> Nd(bse,(aux (k'+1) (accum+1) a),(aux (k'+1) (accum+1) c),(aux (k'+1) (accum+1) g),(new_node prev_b bse base) )
| _ -> qtree'
)
| Nd(bse,Empty,Empty,Empty,(Nd(_,_,_,_,_) as t))->
(
match base with
| A -> Nd(bse,(new_node prev_b bse base),Empty,Empty,(aux (k'+1) (accum+1) t) )
| C -> Nd(bse,Empty,(new_node prev_b bse base),Empty,(aux (k'+1) (accum+1) t) )
| G -> Nd(bse,Empty,Empty,(new_node prev_b bse base),(aux (k'+1) (accum+1) t) )
| T -> Nd(bse,Empty,Empty,Empty,(aux (k'+1) (accum+1) t) )
| _ -> qtree'
)
| Nd(bse,(Nd(_,_,_,_,_) as a),Empty,Empty,(Nd(_,_,_,_,_) as t))->
(
match base with
| A -> Nd(bse,(aux (k'+1) (accum+1) a),Empty,Empty,(aux (k'+1) (accum+1) t) )
| C -> Nd(bse,(aux (k'+1) (accum+1) a),(new_node prev_b bse base),Empty,(aux (k'+1) (accum+1) t) )
| G -> Nd(bse,(aux (k'+1) (accum+1) a),Empty,(new_node prev_b bse base),(aux (k'+1) (accum+1) t) )
| T -> Nd(bse,(aux (k'+1) (accum+1) a),Empty,Empty,(aux (k'+1) (accum+1) t) )
| _ -> qtree'
)
| Nd(bse,Empty,(Nd(_,_,_,_,_) as c),Empty,(Nd(_,_,_,_,_) as t))->
(
match base with
| A -> Nd(bse,(new_node prev_b bse base),(aux (k'+1) (accum+1) c),Empty,(aux (k'+1) (accum+1) t) )
| C -> Nd(bse,Empty,(aux (k'+1) (accum+1) c),Empty,(aux (k'+1) (accum+1) t) )
| G -> Nd(bse,Empty,(aux (k'+1) (accum+1) c),(new_node prev_b bse base),(aux (k'+1) (accum+1) t) )
| T -> Nd(bse,Empty,(aux (k'+1) (accum+1) c),Empty,(aux (k'+1) (accum+1) t) )
| _ -> qtree'
)
| Nd(bse,(Nd(_,_,_,_,_) as a),(Nd(_,_,_,_,_) as c),Empty,(Nd(_,_,_,_,_) as t))->
(
match base with
| A -> Nd(bse,(aux (k'+1) (accum+1) a),(aux (k'+1) (accum+1) c),Empty,(aux (k'+1) (accum+1) t) )
| C -> Nd(bse,(aux (k'+1) (accum+1) a),(aux (k'+1) (accum+1) c),Empty,(aux (k'+1) (accum+1) t) )
| G -> Nd(bse,(aux (k'+1) (accum+1) a),(aux (k'+1) (accum+1) c),(new_node prev_b bse base),(aux (k'+1) (accum+1) t) )
| T -> Nd(bse,(aux (k'+1) (accum+1) a),(aux (k'+1) (accum+1) c),Empty,(aux (k'+1) (accum+1) t) )
| _ -> qtree'
)
| Nd(bse,Empty,Empty,(Nd(_,_,_,_,_) as g),(Nd(_,_,_,_,_) as t))->
(
match base with
| A -> Nd(bse,(new_node prev_b bse base),Empty,(aux (k'+1) (accum+1) g),(aux (k'+1) (accum+1) t) )
| C -> Nd(bse,Empty,(new_node prev_b bse base),(aux (k'+1) (accum+1) g),(aux (k'+1) (accum+1) t) )
| G -> Nd(bse,Empty,Empty,(aux (k'+1) (accum+1) g),(aux (k'+1) (accum+1) t) )
| T -> Nd(bse,Empty,Empty,(aux (k'+1) (accum+1) g),(aux (k'+1) (accum+1) t) )
| _ -> qtree'
)
| Nd(bse,(Nd(_,_,_,_,_) as a),Empty,(Nd(_,_,_,_,_) as g),(Nd(_,_,_,_,_) as t))->
(
match base with
| A -> Nd(bse,(aux (k'+1) (accum+1) a),Empty,(aux (k'+1) (accum+1) g),(aux (k'+1) (accum+1) t) )
| C -> Nd(bse,(aux (k'+1) (accum+1) a),(new_node prev_b bse base),(aux (k'+1) (accum+1) g),(aux (k'+1) (accum+1) t) )
| G -> Nd(bse,(aux (k'+1) (accum+1) a),Empty,(aux (k'+1) (accum+1) g),(aux (k'+1) (accum+1) t) )
| T -> Nd(bse,(aux (k'+1) (accum+1) a),Empty,(aux (k'+1) (accum+1) g),(aux (k'+1) (accum+1) t) )
| _ -> qtree'
)
| Nd(bse,Empty,(Nd(_,_,_,_,_) as c),(Nd(_,_,_,_,_) as g),(Nd(_,_,_,_,_) as t))->
(
match base with
| A -> Nd(bse,(new_node prev_b bse base),(aux (k'+1) (accum+1) c),(aux (k'+1) (accum+1) g),(aux (k'+1) (accum+1) t) )
| C -> Nd(bse,Empty,(aux (k'+1) (accum+1) c),(aux (k'+1) (accum+1) g),(aux (k'+1) (accum+1) t) )
| G -> Nd(bse,Empty,(aux (k'+1) (accum+1) c),(aux (k'+1) (accum+1) g),(aux (k'+1) (accum+1) t) )
| T -> Nd(bse,Empty,(aux (k'+1) (accum+1) c),(aux (k'+1) (accum+1) g),(aux (k'+1) (accum+1) t) )
| _ -> qtree'
)
| Nd(bse,(Nd(_,_,_,_,_) as a),(Nd(_,_,_,_,_) as c),(Nd(_,_,_,_,_) as g),(Nd(_,_,_,_,_) as t))->
(
match base with
| A -> Nd(bse,(aux (k'+1) (accum+1) a),(aux (k'+1) (accum+1) c),(aux (k'+1) (accum+1) g),(aux (k'+1) (accum+1) t) )
| C -> Nd(bse,(aux (k'+1) (accum+1) a),(aux (k'+1) (accum+1) c),(aux (k'+1) (accum+1) g),(aux (k'+1) (accum+1) t) )
| G -> Nd(bse,(aux (k'+1) (accum+1) a),(aux (k'+1) (accum+1) c),(aux (k'+1) (accum+1) g),(aux (k'+1) (accum+1) t) )
| T -> Nd(bse,(aux (k'+1) (accum+1) a),(aux (k'+1) (accum+1) c),(aux (k'+1) (accum+1) g),(aux (k'+1) (accum+1) t) )
| _ -> qtree'
)
| Nd(bse, treea, treeb, treec, treed) -> Nd(bse, (aux (k'+1) (accum+1) treea),
(aux (k'+1) (accum+1) treeb),
(aux (k'+1) (accum+1) treec),
(aux (k'+1) (accum+1) treed))
| Empty -> (new_node' base)
in
aux 1 0 qtree ;;
let rec walk_string str =
let len = String.length str in
let rec aux idx = match idx with
| l when l = len -> ()
| _ -> (aux (idx+1))in
aux 0 ;;
let rec build_qtree_from_string str k =
let len = String.length str in
let rec aux prev idx qtree =
match idx with
| n when n >= len -> qtree
| _ -> let current = (char_to_base (String.get str idx)) in
aux (current) (idx+1)
(add_node prev current k qtree) in
aux ROOT 0 init_quad_tree ;;
let get_sub_qtree qtree base = match qtree with
| Nd(bse, (_ as a),(_ as c),(_ as g),(_ as t)) ->
(
match base with
| A -> a
| C -> c
| G -> g
| T -> t
| _ -> qtree
)
| _ -> qtree ;;
let rec find_seq_in_qtree qtree seq =
let rec aux qtree' seq' = match seq' with
[] -> qtree'
| x::xs -> aux (get_sub_qtree qtree' x) xs in
aux qtree seq ;;
(** walk_qtree qtree : returns a list of strings (base sequences) and counts *)
let rec walk_qtree qtree =
let lst = ref [] in
let rec aux qtree' str =
match qtree' with
| Empty | Nd(_, Empty, Empty, Empty, Empty) -> ()
| Leaf(base, nref) -> (
lst := ((str ^ base_to_s base), !nref)::!lst )
| Nd(base, a, c, g, t) -> let str' = (if base <> ROOT then (
(str ^ (base_to_s base))
)
else ( str )) in
aux a str' ;
aux c str' ;
aux g str' ;
aux t str' in
aux qtree "";
!lst ;;
let base_to_int b = match b with
| A -> 0
| C -> 1
| G -> 2
| T -> 3
| ROOT -> 4 ;;
let qtree_to_dot qtree =
let lst = ref [] in
let base_counts = Array.init 5 (fun _ -> 0) in
let base_count b = Array.set base_counts (base_to_int b) (base_counts.(base_to_int b) +1);
base_counts.(base_to_int b) in
let prev_node_to_str b = "{" ^ base_to_s b ^ string_of_int base_counts.(base_to_int b) ^ "[label=\"" ^ base_to_s b ^ "\"]}" in
let cur_node_to_str b = "{" ^ base_to_s b ^ string_of_int (base_count b) ^ "[label=\"" ^ base_to_s b ^ "\"]}" in
let leaf_node_to_str b n = "{" ^ base_to_s b ^ string_of_int (base_count b) ^ "[shape=box,label=\"" ^ base_to_s b ^ " "^ string_of_int n ^ "\"]}" in
let rec aux qtree' prev =
match qtree' with
| Empty -> ()
| Nd(base, Empty, Empty, Empty, Empty) ->
lst := (prev_node_to_str prev ^ "--" ^ cur_node_to_str base) :: !lst
| Leaf(base, nref) ->
lst := (prev_node_to_str prev ^ "--" ^ leaf_node_to_str base !nref)::!lst
| Nd(base, a, c, g, t) -> let _ = (
if base = ROOT && prev = ROOT then
()
else
lst := (prev_node_to_str prev ^ "--" ^ cur_node_to_str base)::!lst
) in
aux t base ;
aux g base ;
aux c base ;
aux a base in
aux qtree ROOT;
"graph actg{\n" ^ (String.concat "\n" !lst ) ^ "\n}" ;;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment