Created
July 23, 2018 05:12
-
-
Save mikedn/8dc555a867aba65b6e26f39cf3cb66eb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// The Computer Language Benchmarks Game | |
// https://benchmarksgame-team.pages.debian.net/benchmarksgame/ | |
// submitted by Josh Goldfoot | |
// Modified to reduce memory and do more in parallel by Anthony Lloyd | |
// Added dictionary incrementor by Anthony Lloyd | |
// Added custom UInt64UInt32Map hashtable by mikedn | |
using System; | |
using System.Collections.Generic; | |
using System.Diagnostics; | |
using System.IO; | |
using System.Linq; | |
using System.Runtime.CompilerServices; | |
using System.Text; | |
using System.Threading.Tasks; | |
sealed class UInt64UInt32Map : IEnumerable<(ulong Key, uint Value)> | |
{ | |
private uint[] buckets; | |
private Entry[] entries; | |
private uint count; | |
private int hashBits = 18; | |
private uint hashMask; | |
private struct Entry | |
{ | |
public ulong key; | |
public uint value; | |
public uint next; | |
} | |
public UInt64UInt32Map() | |
{ | |
buckets = new uint[1 << hashBits]; | |
entries = new Entry[1 << hashBits]; | |
hashMask = (1u << hashBits) - 1; | |
} | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
private uint GetHashCode(ulong key) | |
{ | |
uint hash = (uint)((key >> 32) ^ key); | |
hash = (hash >> 18) ^ hash; | |
return hash; | |
} | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
private uint GetBucketIndex(ulong key) | |
{ | |
return GetHashCode(key) & hashMask; | |
} | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
private uint GetEntryIndex(ulong key) | |
{ | |
return buckets[GetBucketIndex(key)] - 1; | |
} | |
public bool TryGetValue(ulong key, out uint value) | |
{ | |
Entry[] entries = this.entries; | |
uint entryIndex = GetEntryIndex(key); | |
while (true) | |
{ | |
if (entryIndex >= (uint)entries.Length) | |
{ | |
break; | |
} | |
if (entries[entryIndex].key == key) | |
{ | |
value = entries[entryIndex].value; | |
return true; | |
} | |
entryIndex = entries[entryIndex].next; | |
} | |
value = 0; | |
return false; | |
} | |
public ref uint GetRef(ulong key) | |
{ | |
Entry[] entries = this.entries; | |
uint entryIndex = GetEntryIndex(key); | |
while (true) | |
{ | |
if (entryIndex >= (uint)entries.Length) | |
{ | |
break; | |
} | |
if (entries[entryIndex].key == key) | |
{ | |
return ref entries[entryIndex].value; | |
} | |
entryIndex = entries[entryIndex].next; | |
} | |
return ref Create(key); | |
} | |
private ref uint Create(ulong key) | |
{ | |
if (count == entries.Length) | |
{ | |
Resize(); | |
} | |
uint entryIndex = count++; | |
entries[entryIndex].key = key; | |
entries[entryIndex].value = 0; | |
uint bucket = GetBucketIndex(key); | |
entries[entryIndex].next = buckets[bucket] - 1; | |
buckets[bucket] = entryIndex + 1; | |
return ref entries[entryIndex].value; | |
} | |
private void Resize() | |
{ | |
Entry[] oldEntries = entries; | |
uint[] oldBuckets = buckets; | |
hashBits++; | |
hashMask = (1u << hashBits) - 1; | |
Entry[] newEntries = new Entry[1 << hashBits]; | |
uint[] newBuckets = new uint[1 << hashBits]; | |
buckets = newBuckets; | |
entries = newEntries; | |
Array.Copy(oldEntries, 0, newEntries, 0, count); | |
for (uint i = 0; i < count; i++) | |
{ | |
uint bucket = GetBucketIndex(newEntries[i].key); | |
newEntries[i].next = newBuckets[bucket] - 1; | |
newBuckets[bucket] = i + 1; | |
} | |
} | |
public Enumerator GetEnumerator() => new Enumerator(this); | |
IEnumerator<(ulong Key, uint Value)> IEnumerable<(ulong Key, uint Value)>.GetEnumerator() => new Enumerator(this); | |
System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator() => new Enumerator(this); | |
public struct Enumerator : IEnumerator<(ulong Key, uint Value)> | |
{ | |
private readonly UInt64UInt32Map dictionary; | |
private uint index; | |
private (ulong, uint) current; | |
internal Enumerator(UInt64UInt32Map dictionary) | |
{ | |
this.dictionary = dictionary; | |
index = 0; | |
current = new ValueTuple<uint, uint>(); | |
} | |
public bool MoveNext() | |
{ | |
Entry[] entries = dictionary.entries; | |
uint count = dictionary.count; | |
uint index = this.index; | |
while (index < count) | |
{ | |
if (entries[index].next > 0) | |
{ | |
current = (entries[index].key, entries[index].value); | |
this.index = index + 1; | |
return true; | |
} | |
index++; | |
} | |
this.index = count + 1; | |
current = (0, 0); | |
return false; | |
} | |
public (ulong Key, uint Value) Current => current; | |
object System.Collections.IEnumerator.Current => current; | |
void System.Collections.IEnumerator.Reset() => throw new NotImplementedException(); | |
public void Dispose() | |
{ | |
} | |
} | |
} | |
public static class KNucleotide | |
{ | |
const int BLOCK_SIZE = 1024 * 1024 * 8; | |
public static List<byte[]> threeBlocks = new List<byte[]>(); | |
public static int threeStart, threeEnd; | |
static byte[] tonum = new byte[256]; | |
static char[] tochar = new char[] { 'A', 'C', 'G', 'T' }; | |
static int read(Stream stream, byte[] buffer, int offset, int count) | |
{ | |
var bytesRead = stream.Read(buffer, offset, count); | |
return bytesRead == count ? offset + count | |
: bytesRead == 0 ? offset | |
: read(stream, buffer, offset + bytesRead, count - bytesRead); | |
} | |
static int find(byte[] buffer, byte[] toFind, int i, ref int matchIndex) | |
{ | |
if (matchIndex == 0) | |
{ | |
i = Array.IndexOf(buffer, toFind[0], i); | |
if (i == -1) return -1; | |
matchIndex = 1; | |
return find(buffer, toFind, i + 1, ref matchIndex); | |
} | |
else | |
{ | |
int bl = buffer.Length, fl = toFind.Length; | |
while (i < bl && matchIndex < fl) | |
{ | |
if (buffer[i++] != toFind[matchIndex++]) | |
{ | |
matchIndex = 0; | |
return find(buffer, toFind, i, ref matchIndex); | |
} | |
} | |
return matchIndex == fl ? i : -1; | |
} | |
} | |
public static void LoadThreeData() | |
{ | |
var stream = File.OpenRead("D:\\projects\\knuc\\data.txt"); | |
//var stream = Console.OpenStandardInput(); | |
// find three sequence | |
int matchIndex = 0; | |
var toFind = new[] { (byte)'>', (byte)'T', (byte)'H', (byte)'R', (byte)'E', (byte)'E' }; | |
var buffer = new byte[BLOCK_SIZE]; | |
do | |
{ | |
threeEnd = read(stream, buffer, 0, BLOCK_SIZE); | |
threeStart = find(buffer, toFind, 0, ref matchIndex); | |
} while (threeStart == -1); | |
// Skip to end of line | |
matchIndex = 0; | |
toFind = new[] { (byte)'\n' }; | |
threeStart = find(buffer, toFind, threeStart, ref matchIndex); | |
while (threeStart == -1) | |
{ | |
threeEnd = read(stream, buffer, 0, BLOCK_SIZE); | |
threeStart = find(buffer, toFind, 0, ref matchIndex); | |
} | |
threeBlocks.Add(buffer); | |
if (threeEnd != BLOCK_SIZE) // Needs to be at least 2 blocks | |
{ | |
var bytes = threeBlocks[0]; | |
for (int i = threeEnd; i < bytes.Length; i++) | |
bytes[i] = 255; | |
threeEnd = 0; | |
threeBlocks.Add(Array.Empty<byte>()); | |
return; | |
} | |
// find next seq or end of input | |
matchIndex = 0; | |
toFind = new[] { (byte)'>' }; | |
threeEnd = find(buffer, toFind, threeStart, ref matchIndex); | |
while (threeEnd == -1) | |
{ | |
buffer = new byte[BLOCK_SIZE]; | |
var bytesRead = read(stream, buffer, 0, BLOCK_SIZE); | |
threeEnd = bytesRead == BLOCK_SIZE ? find(buffer, toFind, 0, ref matchIndex) | |
: bytesRead; | |
threeBlocks.Add(buffer); | |
} | |
if (threeStart + 18 > BLOCK_SIZE) // Key needs to be in the first block | |
{ | |
byte[] block0 = threeBlocks[0], block1 = threeBlocks[1]; | |
Buffer.BlockCopy(block0, threeStart, block0, threeStart - 18, BLOCK_SIZE - threeStart); | |
Buffer.BlockCopy(block1, 0, block0, BLOCK_SIZE - 18, 18); | |
for (int i = 0; i < 18; i++) block1[i] = 255; | |
} | |
} | |
[MethodImpl(MethodImplOptions.AggressiveInlining)] | |
static void check(UInt64UInt32Map inc, ref ulong rollingKey, byte nb, ulong mask) | |
{ | |
if (nb == 255) return; | |
rollingKey = ((rollingKey & mask) << 2) | nb; | |
inc.GetRef(rollingKey)++; | |
} | |
static Task<string> count(int l, ulong mask, Func<UInt64UInt32Map, string> summary) | |
{ | |
return Task.Run(() => | |
{ | |
ulong rollingKey = 0; | |
var firstBlock = threeBlocks[0]; | |
var start = threeStart; | |
while (--l > 0) rollingKey = (rollingKey << 2) | firstBlock[start++]; | |
var dict = new UInt64UInt32Map(); | |
for (int i = start; i < firstBlock.Length; i++) | |
check(dict, ref rollingKey, firstBlock[i], mask); | |
int lastBlockId = threeBlocks.Count - 1; | |
for (int bl = 1; bl < lastBlockId; bl++) | |
{ | |
var bytes = threeBlocks[bl]; | |
for (int i = 0; i < bytes.Length; i++) | |
check(dict, ref rollingKey, bytes[i], mask); | |
} | |
var lastBlock = threeBlocks[lastBlockId]; | |
for (int i = 0; i < threeEnd; i++) | |
check(dict, ref rollingKey, lastBlock[i], mask); | |
return summary(dict); | |
}); | |
} | |
static string writeFrequencies(UInt64UInt32Map freq, int fragmentLength) | |
{ | |
var sb = new StringBuilder(); | |
double percent = 100.0 / freq.Sum(p => p.Value); | |
foreach (var kv in freq.OrderByDescending(i => i.Value)) | |
{ | |
var keyChars = new char[fragmentLength]; | |
var key = kv.Key; | |
for (int i = keyChars.Length - 1; i >= 0; --i) | |
{ | |
keyChars[i] = tochar[key & 0x3]; | |
key >>= 2; | |
} | |
sb.Append(keyChars); | |
sb.Append(" "); | |
sb.AppendLine((kv.Value * percent).ToString("F3")); | |
} | |
return sb.ToString(); | |
} | |
static string writeCount(UInt64UInt32Map dictionary, string fragment) | |
{ | |
ulong key = 0; | |
for (int i = 0; i < fragment.Length; ++i) | |
key = (key << 2) | tonum[fragment[i]]; | |
var n = dictionary.TryGetValue(key, out var v) ? v : 0; | |
return string.Concat(n.ToString(), "\t", fragment); | |
} | |
public static void Main(string[] args) | |
{ | |
var sw = Stopwatch.StartNew(); | |
tonum['c'] = 1; tonum['C'] = 1; | |
tonum['g'] = 2; tonum['G'] = 2; | |
tonum['t'] = 3; tonum['T'] = 3; | |
tonum['\n'] = 255; tonum['>'] = 255; tonum[255] = 255; | |
LoadThreeData(); | |
Parallel.ForEach(threeBlocks, bytes => | |
{ | |
for (int i = 0; i < bytes.Length; i++) | |
bytes[i] = tonum[bytes[i]]; | |
}); | |
var task18 = count(18, 34359738367, d => writeCount(d, "GGTATTTTAATTTATAGT")); | |
var task12 = count(12, 8388607, d => writeCount(d, "GGTATTTTAATT")); | |
var task6 = count(6, 0b1111111111, d => writeCount(d, "GGTATT")); | |
var task1 = count(1, 0, d => writeFrequencies(d, 1)); | |
var task2 = count(2, 0b11, d => writeFrequencies(d, 2)); | |
var task3 = count(3, 0b1111, d => writeCount(d, "GGT")); | |
var task4 = count(4, 0b111111, d => writeCount(d, "GGTA")); | |
Console.WriteLine(task1.Result); | |
Console.WriteLine(task2.Result); | |
Console.WriteLine(task3.Result); | |
Console.WriteLine(task4.Result); | |
Console.WriteLine(task6.Result); | |
Console.WriteLine(task12.Result); | |
Console.WriteLine(task18.Result); | |
Console.WriteLine($"Took {sw.Elapsed}"); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hey 😄. What's the license of the
UInt64UInt32Map
hashtable code?