Created
June 13, 2015 15:45
-
-
Save jdh30/3907cb621c0be82c0d42 to your computer and use it in GitHub Desktop.
This is a drop-in replacement for some of the very poor code given on The Computer Language Benchmarks Game
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// F# code for the k-nucleotide task | |
// This is ~3.6x faster than the solution currently on The Computer Language Benchmarks Game. | |
// The optimisations are described in detail in the F# Journal article: | |
// "Optimising k-nucleotide" -- http://fsharpnews.blogspot.com/2013/08/optimizing-k-nucleotide.html | |
module Sub = | |
let maxLength = 29 | |
let lengthPos = maxLength*2 | |
let bits length = (1UL <<< (2*length)) - 1UL | |
let codeMask = bits maxLength | |
let hs = | |
Array.init 256 (fun i -> | |
match byte i with | |
| 'A'B | 'a'B -> 0uy | |
| 'C'B | 'c'B -> 1uy | |
| 'G'B | 'g'B -> 2uy | |
| 'T'B | 't'B | _ -> 3uy) | |
let create length code = code ||| (uint64 length <<< lengthPos) | |
let empty = 0UL | |
let length code = int(code >>> lengthPos) | |
let isFull code = | |
length code = maxLength | |
let enqueue (b: byte) code = | |
let length = length code | |
let code = code &&& codeMask | |
create (length+1) (code ||| (uint64 hs.[int b] <<< (2*length))) | |
let toString code = | |
String.init (length code) (fun i -> | |
match (code >>> (2*i)) &&& 3UL with | |
| 0UL -> "A" | 1UL -> "C" | 2UL -> "G" | _ -> "T") | |
let ofString (s: string) = | |
Seq.fold (fun c b -> enqueue (byte b) c) empty s | |
let sub start length (xs: uint64 []) = | |
let idx = start / maxLength | |
let off = start % maxLength | |
let length1 = min length (maxLength-off) | |
let length2 = max 0 (length-length1) | |
let code1 = ((xs.[idx] &&& codeMask) >>> (2*off)) &&& bits length1 | |
if length1 = length then | |
create length code1 | |
else | |
let code2 = (xs.[idx+1] &&& codeMask) &&& bits(length - length1) | |
create length (code1 ||| (code2 <<< (2*length1))) | |
let path = @"C:\Users\Jon\Documents\Visual Studio 2010\Projects\Benchmarks\Shootout\fasta25000000.txt" | |
let dna = | |
let prefix = "\n>THREE"B | |
let dna = ResizeArray() | |
use stream = System.IO.File.OpenRead path | |
let buf = Array.zeroCreate 4096 | |
let emit b i n = | |
let mutable b = b | |
for i in i..n-1 do | |
match buf.[i] with | |
| ' 'B | '\r'B | '\n'B -> () | |
| x -> | |
b <- Sub.enqueue x b | |
if Sub.isFull b then | |
dna.Add b | |
b <- Sub.empty | |
b | |
let read() = stream.Read(buf, 0, buf.Length) | |
let rec search i j n = | |
if i=n then search 0 j (read()) else | |
if j=prefix.Length then skip i n else | |
search (i+1) (if buf.[i] = prefix.[j] then j+1 else 0) n | |
and skip i n = | |
if i=n then skip 0 (read()) else | |
match buf.[i] with | |
| '\n'B -> copy Sub.empty (i+1) n | |
| _ -> skip (i+1) n | |
and copy b i n = | |
if n=0 then | |
if b <> Sub.empty then | |
dna.Add b | |
dna.ToArray() | |
else | |
copy (emit b i n) 0 (read()) | |
search 0 0 (read()) | |
open System.Collections.Generic | |
let add k dv (d: Dictionary<_, _>) = | |
let mutable v = Unchecked.defaultof<_> | |
if d.TryGetValue(k, &v) then v := !v + dv else d.[k] <- ref dv | |
let totalLength = Seq.sumBy Sub.length dna | |
let cmp = | |
{ new System.Collections.Generic.IEqualityComparer<uint64> with | |
member __.Equals(m, n) = m=n | |
member __.GetHashCode m = int m } | |
let maketable (length:int) = | |
let d = System.Collections.Concurrent.ConcurrentBag() | |
let init() = Dictionary<_, _>(cmp) | |
let body start _ d = | |
add (Sub.sub start length dna) 1 d | |
d | |
System.Threading.Tasks.Parallel.For(0, totalLength - length + 1, init, body, d.Add) | |
|> ignore | |
seq d | |
let frequencies (length:int) = | |
let d = Dictionary<_, _>(cmp) | |
for kvs in maketable length do | |
for KeyValue(k, dv) in kvs do | |
add k !dv d | |
let total = Seq.sumBy (!) d.Values | |
let kvs = | |
[|for pair in d do | |
yield | |
(Sub.toString pair.Key).ToUpper(), | |
float pair.Value.Value * 100.0 / float total|] | |
Array.sortInPlaceBy (snd >> (~-)) kvs | |
[|for s, c in kvs do | |
yield sprintf "%s %.3f" s c | |
yield ""|] | |
let countSubstring (substring: string) = | |
let pattern = Sub.ofString substring | |
let v = | |
maketable substring.Length | |
|> Seq.sumBy (fun d -> | |
let mutable v = Unchecked.defaultof<_> | |
if d.TryGetValue(pattern, &v) then !v else 0) | |
[|sprintf "%d\t%s" v (substring.ToUpper())|] | |
do | |
[|for len in [1; 2] do | |
yield fun () -> frequencies len | |
for str in ["ggt";"ggta";"ggtatt";"ggtattttaatt";"ggtattttaatttatagt"] do | |
yield fun () -> countSubstring str|] | |
|> Array.Parallel.map (fun f -> f()) | |
|> Array.iter (Seq.iter (printfn "%s")) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment