Skip to content

Instantly share code, notes, and snippets.

@jdh30
Created June 13, 2015 15:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jdh30/3907cb621c0be82c0d42 to your computer and use it in GitHub Desktop.
Save jdh30/3907cb621c0be82c0d42 to your computer and use it in GitHub Desktop.
This is a drop-in replacement for some of the very poor code given on The Computer Language Benchmarks Game
// F# code for the k-nucleotide task
// This is ~3.6x faster than the solution currently on The Computer Language Benchmarks Game.
// The optimisations are described in detail in the F# Journal article:
// "Optimising k-nucleotide" -- http://fsharpnews.blogspot.com/2013/08/optimizing-k-nucleotide.html
module Sub =
let maxLength = 29
let lengthPos = maxLength*2
let bits length = (1UL <<< (2*length)) - 1UL
let codeMask = bits maxLength
let hs =
Array.init 256 (fun i ->
match byte i with
| 'A'B | 'a'B -> 0uy
| 'C'B | 'c'B -> 1uy
| 'G'B | 'g'B -> 2uy
| 'T'B | 't'B | _ -> 3uy)
let create length code = code ||| (uint64 length <<< lengthPos)
let empty = 0UL
let length code = int(code >>> lengthPos)
let isFull code =
length code = maxLength
let enqueue (b: byte) code =
let length = length code
let code = code &&& codeMask
create (length+1) (code ||| (uint64 hs.[int b] <<< (2*length)))
let toString code =
String.init (length code) (fun i ->
match (code >>> (2*i)) &&& 3UL with
| 0UL -> "A" | 1UL -> "C" | 2UL -> "G" | _ -> "T")
let ofString (s: string) =
Seq.fold (fun c b -> enqueue (byte b) c) empty s
let sub start length (xs: uint64 []) =
let idx = start / maxLength
let off = start % maxLength
let length1 = min length (maxLength-off)
let length2 = max 0 (length-length1)
let code1 = ((xs.[idx] &&& codeMask) >>> (2*off)) &&& bits length1
if length1 = length then
create length code1
else
let code2 = (xs.[idx+1] &&& codeMask) &&& bits(length - length1)
create length (code1 ||| (code2 <<< (2*length1)))
let path = @"C:\Users\Jon\Documents\Visual Studio 2010\Projects\Benchmarks\Shootout\fasta25000000.txt"
let dna =
let prefix = "\n>THREE"B
let dna = ResizeArray()
use stream = System.IO.File.OpenRead path
let buf = Array.zeroCreate 4096
let emit b i n =
let mutable b = b
for i in i..n-1 do
match buf.[i] with
| ' 'B | '\r'B | '\n'B -> ()
| x ->
b <- Sub.enqueue x b
if Sub.isFull b then
dna.Add b
b <- Sub.empty
b
let read() = stream.Read(buf, 0, buf.Length)
let rec search i j n =
if i=n then search 0 j (read()) else
if j=prefix.Length then skip i n else
search (i+1) (if buf.[i] = prefix.[j] then j+1 else 0) n
and skip i n =
if i=n then skip 0 (read()) else
match buf.[i] with
| '\n'B -> copy Sub.empty (i+1) n
| _ -> skip (i+1) n
and copy b i n =
if n=0 then
if b <> Sub.empty then
dna.Add b
dna.ToArray()
else
copy (emit b i n) 0 (read())
search 0 0 (read())
open System.Collections.Generic
let add k dv (d: Dictionary<_, _>) =
let mutable v = Unchecked.defaultof<_>
if d.TryGetValue(k, &v) then v := !v + dv else d.[k] <- ref dv
let totalLength = Seq.sumBy Sub.length dna
let cmp =
{ new System.Collections.Generic.IEqualityComparer<uint64> with
member __.Equals(m, n) = m=n
member __.GetHashCode m = int m }
let maketable (length:int) =
let d = System.Collections.Concurrent.ConcurrentBag()
let init() = Dictionary<_, _>(cmp)
let body start _ d =
add (Sub.sub start length dna) 1 d
d
System.Threading.Tasks.Parallel.For(0, totalLength - length + 1, init, body, d.Add)
|> ignore
seq d
let frequencies (length:int) =
let d = Dictionary<_, _>(cmp)
for kvs in maketable length do
for KeyValue(k, dv) in kvs do
add k !dv d
let total = Seq.sumBy (!) d.Values
let kvs =
[|for pair in d do
yield
(Sub.toString pair.Key).ToUpper(),
float pair.Value.Value * 100.0 / float total|]
Array.sortInPlaceBy (snd >> (~-)) kvs
[|for s, c in kvs do
yield sprintf "%s %.3f" s c
yield ""|]
let countSubstring (substring: string) =
let pattern = Sub.ofString substring
let v =
maketable substring.Length
|> Seq.sumBy (fun d ->
let mutable v = Unchecked.defaultof<_>
if d.TryGetValue(pattern, &v) then !v else 0)
[|sprintf "%d\t%s" v (substring.ToUpper())|]
do
[|for len in [1; 2] do
yield fun () -> frequencies len
for str in ["ggt";"ggta";"ggtatt";"ggtattttaatt";"ggtattttaatttatagt"] do
yield fun () -> countSubstring str|]
|> Array.Parallel.map (fun f -> f())
|> Array.iter (Seq.iter (printfn "%s"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment