Skip to content

Instantly share code, notes, and snippets.

/EoSMWFvsSoA Secret

Created January 16, 2015 11:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/d17da4f1a8db2fadf10c to your computer and use it in GitHub Desktop.
Save anonymous/d17da4f1a8db2fadf10c to your computer and use it in GitHub Desktop.
module SoEMWF =
let getName = "\r\nUses the Sieve of Eratosthenes to calculate the primes to argument.\r\n"
let private WRDBTS, FRSTBP = 16, 23UL
let private BPRMS = [| 2UL; 3UL; 5UL; 7UL; 11UL; 13UL; 17UL; 19UL |]
let private BWPS = [| //base wheel positions + 23
23uy;29uy;31uy;37uy;41uy;43uy;47uy;53uy; 59uy;61uy;67uy;71uy;73uy;79uy;83uy;89uy;
97uy;101uy;103uy;107uy;109uy;113uy;121uy;127uy; 131uy;137uy;139uy;143uy;149uy;151uy;157uy;163uy;
167uy;169uy;173uy;179uy;181uy;187uy;191uy;193uy; 197uy;199uy;209uy;211uy;221uy;223uy;227uy;229uy |]
let private GAPS = [| //wheel position deltas for each computation of next incremental cull position
6uy; 2uy; 6uy; 4uy; 2uy; 4uy; 6uy; 6uy; 2uy; 6uy; 4uy; 2uy; 6uy; 4uy; 6uy; 8uy;
4uy; 2uy; 4uy; 2uy; 4uy; 8uy; 6uy; 4uy; 6uy; 2uy; 4uy; 6uy; 2uy; 6uy; 6uy; 4uy;
2uy; 4uy; 6uy; 2uy; 6uy; 4uy; 2uy; 4uy; 2uy; 10uy; 2uy; 10uy; 2uy; 4uy; 2uy; 4uy;
6uy; 2uy; 6uy; 4uy; 2uy; 4uy; 6uy; 6uy; 2uy; 6uy; 4uy; 2uy; 6uy; 4uy; 6uy; 8uy; //2 rounds for overflow
4uy; 2uy; 4uy; 2uy; 4uy; 8uy; 6uy; 4uy; 6uy; 2uy; 4uy; 6uy; 2uy; 6uy; 6uy; 4uy;
2uy; 4uy; 6uy; 2uy; 6uy; 4uy; 2uy; 4uy; 2uy; 10uy; 2uy; 10uy; 2uy; 4uy; 2uy; 4uy |]
let private WHTS, WCRC = BWPS.Length, (GAPS |> Seq.map uint64 |> Seq.sum) / 2UL //check: 48 and 210UL
let private WHLNDXS = //look up table for (position - 23) mod WCRC - 0 to WCRC - 1, to wheel index - 0 to WHTS - 1
(GAPS |> Seq.take WHTS |> Seq.mapi (fun i x -> i, x)
|> Seq.collect (fun (i, n) -> seq { for n = 1uy to n do yield byte i })) |> Seq.toArray
let primes topNum =
if topNum >= ((1UL <<< 30) / 3UL - 1UL) * uint64 WCRC + FRSTBP then failwith "Argument too large for algorithm!!!"
let BFSZ = if topNum < FRSTBP then 6 else ((int ((topNum - FRSTBP) / uint64 WCRC) + 2) * 3) //includes topNum
let cull (cmpsts: _ array) (bps: _ seq) =
let topNdx = uint64 (cmpsts.Length * WRDBTS - 1)
let topRng = topNdx / uint64 WHTS * WCRC + uint64 BWPS.[WHTS - 1] //from size of current array
let kSqrtLim = int (sqrt (float (topRng - WCRC))) / int WCRC //base prime test limit value comp for extra
bps |> Seq.takeWhile (((>=) kSqrtLim) << fst) |> Seq.iter (fun (k, i) -> //for each base prime less than limit
let x = uint64 BWPS.[int i] in let p = (uint64 k * WCRC) + x
let s = p * p - FRSTBP - x * p in let pm = 3 * int p
{ int i .. int i + WHTS - 1 } |> Seq.map (fun i ->
let adj, ni = if i >= WHTS then WCRC, i - WHTS else 0UL, i
let c0 = s + p * (uint64 BWPS.[ni] + adj)
let c0d = c0 / WCRC in let c0n = WHLNDXS.[int (c0 - c0d * WCRC)]
let c0dwtry = 3 * int c0d
if c0n < 16uy then c0dwtry, c0n else if c0n < 32uy then c0dwtry + 1, c0n - 16uy
else c0dwtry + 2, c0n - 32uy )
|> Seq.takeWhile (((>) cmpsts.Length) << fst) //loop maximum of wheel hits times; run tight cull loop for each
|> Seq.iter (fun (ccd, cm) -> //tight culling loops almost all the time, following faster when 32-bit mode
let cmsk = 1us <<< int cm
let inline setCmpst i = cmpsts.[i] <- cmpsts.[i] ||| cmsk
let rec cullp c =
if c < cmpsts.Length then setCmpst c; cullp (c + pm) in cullp ccd) ) //fast for 64 bit mode
let cmpsts = //first generate the small master pre-culled pattern array
let mstr = Array.zeroCreate (11 * 13 * 17 * 19 * WHTS / WRDBTS)
{ WHTS - 4 .. WHTS - 1 } |> Seq.map (fun m -> (-1, byte m)) |> cull mstr //cull 11/13/17/19
let arr = Array.zeroCreate BFSZ
{ 0 .. mstr.Length .. (arr.Length - 1) } //copy repetitions of pre-cull pattern array until full
|> Seq.iter (fun i -> let len = if i + mstr.Length <= arr.Length
then mstr.Length else arr.Length - i
System.Array.Copy(mstr, 0, arr, i, len))
arr //has an slight excess of up to two wheels to come out even in int's/wheels
let inline testCmptst ndx = cmpsts.[ndx >>> 4] &&& (1us <<< (ndx &&& 15)) <> 0us
Seq.initInfinite id |> Seq.collect (fun k -> { 0 .. WHTS - 1 } |> Seq.map (fun i -> k, byte i))
|> Seq.filter (fun (k, i) -> not (testCmptst (WHTS * k + int i))) |> cull cmpsts //cull all composites
//an extremely fast bit counting routine by 16-bit words...
//mask off all buffer composite bits above the top number:
let topd = (topNum - FRSTBP) / WCRC in let topn = int WHLNDXS.[int (topNum - FRSTBP - topd * WCRC)]
let trywrd = int topd * 3
let wrd, wrdn = if topn < 16 then trywrd, topn else if topn < 32 then trywrd + 1, topn - 16
else trywrd + 2, topn - 32
let awrd = if topNum < FRSTBP then -1 else cmpsts.[wrd] <- cmpsts.[wrd] ||| (uint16 -2 <<< wrdn); wrd
{ awrd + 1 .. cmpsts.Length - 1 } |> Seq.iter (fun w -> cmpsts.[w] <- uint16 -1)
let rec count w acc = //tight count loop
if w < cmpsts.Length then //magic int16 bit count algorithm - slightly slower than Look Up Table for int16's
let a = cmpsts.[w] in let b = a - ((a >>> 1) &&& 0x5555us)
let c = (b &&& 0x3333us) + ((b >>> 2) &&& 0x3333us)
count (w + 1) (acc - uint64 ((((c + (c >>> 4)) &&& 0x0F0Fus) * 0x0101us) >>> 8))
else acc in count 0 (uint64 (BPRMS |> Seq.takeWhile ((>=) topNum) |> Seq.length)) + //include base primes
uint64 cmpsts.Length * uint64 WRDBTS //counts bits not set
// seq { yield! BPRMS; |> Seq.takeWhile ((>=) topNum) //very slow but literal way of enumerating resulting primes:
// yield! Seq.initInfinite id
// |> Seq.collect (fun k -> BWPS |> Seq.mapi (fun i x -> WCRC * uint64 k + uint64 x, WHTS * k + i))
// |> Seq.takeWhile (((>=) topNum) << fst)
// |> Seq.filter (not << testCmptst << snd) |> Seq.map (fst) }
module SoA =
let getName = "\r\nUses the Sieve of Atkin to calculate the primes to argument.\r\n"
let private WRDBTS, FRSTBP = 16, 7UL
let private BPRMS = [| 2UL; 3UL; 5UL |]
let private BWPS = [| 7uy;11uy;13uy;17uy;19uy;23uy;29uy;31uy; //base wheel positions + 7
37uy;41uy;43uy;47uy;49uy;53uy;59uy;61uy |]
let private GAPS = [| //wheel position deltas for each computation of next incremental cull position
4uy; 2uy; 4uy; 2uy; 4uy; 6uy; 2uy; 6uy; 4uy; 2uy; 4uy; 2uy; 4uy; 6uy; 2uy; 6uy; //2 times for overflow
4uy; 2uy; 4uy; 2uy; 4uy; 6uy; 2uy; 6uy; 4uy; 2uy; 4uy; 2uy; 4uy; 6uy; 2uy; 6uy |]
let private WHTS, WCRC = BWPS.Length, (GAPS |> Seq.map uint64 |> Seq.sum) / 2UL //check: 16 and 60UL
let private WHLNDXS = //look up table for (position - 23) mod WCRC - 0 to WCRC - 1, to wheel index - 0 to WHTS - 1
(GAPS |> Seq.take WHTS |> Seq.mapi (fun i x -> i, x)
|> Seq.collect (fun (i, n) -> seq { for n = 1uy to n do yield byte i })) |> Seq.toArray
let primes topNum =
if topNum >= ((1UL <<< 30) - 1UL) * uint64 WCRC + FRSTBP then failwith "Argument too large for algorithm!!!"
let BFSZ = if topNum < FRSTBP then 2 else int ((topNum - FRSTBP) / WCRC) + 2 //includes topNum with spare
let is_prms = //initialize the prime sieving array
Array.zeroCreate BFSZ //has an slight excess of the rest of the last wheel to come out even in int's/wheels
let setprms (prms: _ array) (bps: _ seq) =
let topNdx = uint64 (prms.Length * WRDBTS - 1)
let topRng = topNdx / uint64 WHTS * WCRC + uint64 BWPS.[WHTS - 1] //maximum value of current array
let kSqrtLim = int (sqrt (float (topRng - WCRC))) / int WCRC //compensate for extra wheel
//run the 4 * x * x + y * y quadratic prime generating sequences = all x's and odd y's
Seq.initInfinite (uint64 << ((+) 1)) |> Seq.map (fun x -> 4UL * x * x + 1UL)
|> Seq.takeWhile ((>=) topRng)
|> Seq.collect (fun st -> { 1UL .. 2UL .. 29UL } |> Seq.map (fun midy -> st + midy * midy - 1UL, int midy + 15)
|> Seq.takeWhile (((>=) topRng) << fst) )
|> Seq.iter (fun (n, myp15) ->
if n < FRSTBP then () else //no action for n = 5 - not one of the modulos for this quadratic
let cbd = (n - FRSTBP) / WCRC in let cr = n - cbd * WCRC
let nm = int (if cr < WCRC then cr else cr - WCRC)
if 1UL <<< nm &&& 0x22022020022002UL = 0UL then () else //if nm in {1,13,17,29,37,41,49,53}
let cm = 1us <<< (int WHLNDXS.[int (cr - FRSTBP)]) //following is the tight loop that does it
let rec tgl c inc = if c < prms.Length then prms.[c] <- prms.[c] ^^^ cm; tgl (c + inc) (inc + 30)
tgl (int cbd) myp15)
//run the 3 * x * x + y * y quadratic prime generating sequences - odd x's and even y's
Seq.initInfinite (uint64 << ((+) 1) << ((*) 2)) |> Seq.map (fun x -> 3UL * x * x + 4UL)
|> Seq.takeWhile ((>=) topRng)
|> Seq.collect (fun st -> { 2UL .. 2UL .. 30UL } |> Seq.map (fun midy -> st + midy * midy - 4UL, int midy + 15)
|> Seq.takeWhile (((>=) topRng) << fst) )
|> Seq.iter (fun (n, myp15) ->
let cbd = (n - FRSTBP) / WCRC in let cr = n - cbd * WCRC
let nm = int (if cr < WCRC then cr else cr - WCRC)
if 1UL <<< nm &&& 0x80080080080UL = 0UL then () else //if cm in {7,19,31,43}
let cm = 1us <<< (int WHLNDXS.[int (cr - FRSTBP)]) //following is the tight loop that does it
let rec tgl c inc = if c < prms.Length then prms.[c] <- prms.[c] ^^^ cm; tgl (c + inc) (inc + 30)
tgl (int cbd) myp15)
//run the 3 * x * x - y * y quadratic prime generating sequences - both even/odd and odd/even combos in x's/y's
Seq.initInfinite (uint64 << ((+) 2)) |> Seq.map (fun x -> 2UL * x * x + 2UL * x - 1UL, x - 1UL)
|> Seq.takeWhile (((>=) topRng) << fst) //reverse order ok in the following
|> Seq.collect (fun (st, by) -> Seq.unfold (fun x -> if x + 28UL < by || x < 1UL then None
else if x < 2UL then Some(x, 0UL) else Some(x, x - 2UL)) by
|> Seq.map (fun midy -> st - midy * midy + by * by, int midy - 15)
|> Seq.takeWhile (((>=) topRng) << fst))
|> Seq.iter (fun (n, mym15) ->
let cbd = (n - FRSTBP) / WCRC in let cr = n - cbd * WCRC
let nm = int (if cr < WCRC then cr else cr - WCRC)
if 1UL <<< nm &&& 0x800800000800800UL = 0UL then () else //if cm in {11,23,47,59}
let cm = 1us <<< (int WHLNDXS.[int (cr - FRSTBP)]) //following is the tight loop that does it
let rec tgl c inc = if inc > -15 && c < prms.Length then
prms.[c] <- prms.[c] ^^^ cm; tgl (c + inc) (inc - 30) in tgl (int cbd) mym15)
//run the prime square-free culling operations:
bps |> Seq.takeWhile (((>=) kSqrtLim) << fst) |> Seq.iter (fun (k, i) -> //for each base prime less than limit
let x = uint64 BWPS.[int i] in let p = (uint64 k * WCRC) + x in let sndx = BWPS.Length - 1
let sqr = p * p in let s = sqr - FRSTBP - sqr * uint64 BWPS.[sndx]
{ sndx .. sndx + WHTS - 1 } |> Seq.map (fun i ->
let adj, ni = if i >= WHTS then WCRC, i - WHTS else 0UL, i
let c0 = s + sqr * (uint64 BWPS.[ni] + adj)
let c0d = c0 / WCRC in let c0n = WHLNDXS.[int (c0 - c0d * WCRC)]
(c0d, c0n) )
|> Seq.takeWhile (((>) (prms.Length - 1)) << int << fst) //loop maximum of wheel hits times; run tight cull loop for each
|> Seq.iter (fun (ccd, cm) -> //tight culling loops almost all the time, following faster when 32-bit mode
let cmsk = ~~~(1us <<< int cm)
let rec cullp c = //tight prime square culling loop
if c < uint64 prms.Length then prms.[int c] <- prms.[int c] &&& cmsk; cullp (c + sqr) in cullp ccd) )
let inline testPrm ndx = is_prms.[int (ndx >>> 4)] &&& (1us <<< (int ndx &&& 15)) <> 0us
Seq.initInfinite id |> Seq.collect (fun k -> { 0 .. WHTS - 1 } |> Seq.map (fun i -> k, byte i))
|> Seq.filter (fun (k, i) -> testPrm (WHTS * int k + int i)) |> setprms is_prms //find all primes
//an extremely fast bit counting routine by 16-bit words => number of primes...
//mask off all buffer prime bits above the top number:
let wrd = (topNum - FRSTBP) / WCRC in let wrdn = int WHLNDXS.[int (topNum - FRSTBP - wrd * WCRC)]
let awrd = if topNum < FRSTBP then -1 else
is_prms.[int wrd] <- is_prms.[int wrd] &&& ~~~(uint16 -2 <<< wrdn); int wrd
{ awrd + 1 .. is_prms.Length - 1 } |> Seq.iter (fun w -> is_prms.[w] <- 0us)
let rec count w acc = //tight count loop
if w < is_prms.Length then //magic int16 bit count algorithm - slightly slower than Look Up Table for int16's
let a = is_prms.[w] in let b = a - ((a >>> 1) &&& 0x5555us)
let c = (b &&& 0x3333us) + ((b >>> 2) &&& 0x3333us)
count (w + 1) (acc + uint64 ((((c + (c >>> 4)) &&& 0x0F0Fus) * 0x0101us) >>> 8))
else acc in count 0 (uint64 (BPRMS |> Seq.takeWhile ((>=) topNum) |> Seq.length)) //counts set bits incl. base
// seq { yield! BPRMS |> Seq.takeWhile ((>=) topNum); //very slow but literal way of enumerating resulting primes:
// yield! Seq.initInfinite id
// |> Seq.collect (fun k -> BWPS |> Seq.mapi (fun i x -> WCRC * uint64 k + uint64 x, WHTS * k + i))
// |> Seq.takeWhile (((>=) topNum) << fst)
// |> Seq.filter (testPrm << snd) |> Seq.map (fst) }
open SoA
[<EntryPoint>]
let main argv =
printfn "%s" getName
if argv = null || argv.Length = 0 then
failwith "\r\nno command line argument for limit!!!"
let test = primes 100UL // |> Seq.iter (printf "%A ")
let topNum = System.UInt64.Parse argv.[0]
let strt = System.DateTime.Now.Ticks
// let count = { 0 .. 3999 } |> Seq.map (fun i -> SoA.primes topNum) |> Seq.max // |> Seq.length
let count = primes topNum // |> Seq.length
let elpsd = System.DateTime.Now.Ticks - strt
printfn "Found %A primes up to %A in %A milliseconds." count topNum (elpsd / 10000L)
printf "\r\nPress any key to exit:"
System.Console.ReadKey(true) |> ignore
printfn ""
0 // return an integer exit code
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment