Skip to content

Instantly share code, notes, and snippets.

@mikedn
Created July 23, 2018 05:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mikedn/8dc555a867aba65b6e26f39cf3cb66eb to your computer and use it in GitHub Desktop.
Save mikedn/8dc555a867aba65b6e26f39cf3cb66eb to your computer and use it in GitHub Desktop.
// The Computer Language Benchmarks Game
// https://benchmarksgame-team.pages.debian.net/benchmarksgame/
// submitted by Josh Goldfoot
// Modified to reduce memory and do more in parallel by Anthony Lloyd
// Added dictionary incrementor by Anthony Lloyd
// Added custom UInt64UInt32Map hashtable by mikedn
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Text;
using System.Threading.Tasks;
sealed class UInt64UInt32Map : IEnumerable<(ulong Key, uint Value)>
{
private uint[] buckets;
private Entry[] entries;
private uint count;
private int hashBits = 18;
private uint hashMask;
private struct Entry
{
public ulong key;
public uint value;
public uint next;
}
public UInt64UInt32Map()
{
buckets = new uint[1 << hashBits];
entries = new Entry[1 << hashBits];
hashMask = (1u << hashBits) - 1;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private uint GetHashCode(ulong key)
{
uint hash = (uint)((key >> 32) ^ key);
hash = (hash >> 18) ^ hash;
return hash;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private uint GetBucketIndex(ulong key)
{
return GetHashCode(key) & hashMask;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private uint GetEntryIndex(ulong key)
{
return buckets[GetBucketIndex(key)] - 1;
}
public bool TryGetValue(ulong key, out uint value)
{
Entry[] entries = this.entries;
uint entryIndex = GetEntryIndex(key);
while (true)
{
if (entryIndex >= (uint)entries.Length)
{
break;
}
if (entries[entryIndex].key == key)
{
value = entries[entryIndex].value;
return true;
}
entryIndex = entries[entryIndex].next;
}
value = 0;
return false;
}
public ref uint GetRef(ulong key)
{
Entry[] entries = this.entries;
uint entryIndex = GetEntryIndex(key);
while (true)
{
if (entryIndex >= (uint)entries.Length)
{
break;
}
if (entries[entryIndex].key == key)
{
return ref entries[entryIndex].value;
}
entryIndex = entries[entryIndex].next;
}
return ref Create(key);
}
private ref uint Create(ulong key)
{
if (count == entries.Length)
{
Resize();
}
uint entryIndex = count++;
entries[entryIndex].key = key;
entries[entryIndex].value = 0;
uint bucket = GetBucketIndex(key);
entries[entryIndex].next = buckets[bucket] - 1;
buckets[bucket] = entryIndex + 1;
return ref entries[entryIndex].value;
}
private void Resize()
{
Entry[] oldEntries = entries;
uint[] oldBuckets = buckets;
hashBits++;
hashMask = (1u << hashBits) - 1;
Entry[] newEntries = new Entry[1 << hashBits];
uint[] newBuckets = new uint[1 << hashBits];
buckets = newBuckets;
entries = newEntries;
Array.Copy(oldEntries, 0, newEntries, 0, count);
for (uint i = 0; i < count; i++)
{
uint bucket = GetBucketIndex(newEntries[i].key);
newEntries[i].next = newBuckets[bucket] - 1;
newBuckets[bucket] = i + 1;
}
}
public Enumerator GetEnumerator() => new Enumerator(this);
IEnumerator<(ulong Key, uint Value)> IEnumerable<(ulong Key, uint Value)>.GetEnumerator() => new Enumerator(this);
System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator() => new Enumerator(this);
public struct Enumerator : IEnumerator<(ulong Key, uint Value)>
{
private readonly UInt64UInt32Map dictionary;
private uint index;
private (ulong, uint) current;
internal Enumerator(UInt64UInt32Map dictionary)
{
this.dictionary = dictionary;
index = 0;
current = new ValueTuple<uint, uint>();
}
public bool MoveNext()
{
Entry[] entries = dictionary.entries;
uint count = dictionary.count;
uint index = this.index;
while (index < count)
{
if (entries[index].next > 0)
{
current = (entries[index].key, entries[index].value);
this.index = index + 1;
return true;
}
index++;
}
this.index = count + 1;
current = (0, 0);
return false;
}
public (ulong Key, uint Value) Current => current;
object System.Collections.IEnumerator.Current => current;
void System.Collections.IEnumerator.Reset() => throw new NotImplementedException();
public void Dispose()
{
}
}
}
public static class KNucleotide
{
const int BLOCK_SIZE = 1024 * 1024 * 8;
public static List<byte[]> threeBlocks = new List<byte[]>();
public static int threeStart, threeEnd;
static byte[] tonum = new byte[256];
static char[] tochar = new char[] { 'A', 'C', 'G', 'T' };
static int read(Stream stream, byte[] buffer, int offset, int count)
{
var bytesRead = stream.Read(buffer, offset, count);
return bytesRead == count ? offset + count
: bytesRead == 0 ? offset
: read(stream, buffer, offset + bytesRead, count - bytesRead);
}
static int find(byte[] buffer, byte[] toFind, int i, ref int matchIndex)
{
if (matchIndex == 0)
{
i = Array.IndexOf(buffer, toFind[0], i);
if (i == -1) return -1;
matchIndex = 1;
return find(buffer, toFind, i + 1, ref matchIndex);
}
else
{
int bl = buffer.Length, fl = toFind.Length;
while (i < bl && matchIndex < fl)
{
if (buffer[i++] != toFind[matchIndex++])
{
matchIndex = 0;
return find(buffer, toFind, i, ref matchIndex);
}
}
return matchIndex == fl ? i : -1;
}
}
public static void LoadThreeData()
{
var stream = File.OpenRead("D:\\projects\\knuc\\data.txt");
//var stream = Console.OpenStandardInput();
// find three sequence
int matchIndex = 0;
var toFind = new[] { (byte)'>', (byte)'T', (byte)'H', (byte)'R', (byte)'E', (byte)'E' };
var buffer = new byte[BLOCK_SIZE];
do
{
threeEnd = read(stream, buffer, 0, BLOCK_SIZE);
threeStart = find(buffer, toFind, 0, ref matchIndex);
} while (threeStart == -1);
// Skip to end of line
matchIndex = 0;
toFind = new[] { (byte)'\n' };
threeStart = find(buffer, toFind, threeStart, ref matchIndex);
while (threeStart == -1)
{
threeEnd = read(stream, buffer, 0, BLOCK_SIZE);
threeStart = find(buffer, toFind, 0, ref matchIndex);
}
threeBlocks.Add(buffer);
if (threeEnd != BLOCK_SIZE) // Needs to be at least 2 blocks
{
var bytes = threeBlocks[0];
for (int i = threeEnd; i < bytes.Length; i++)
bytes[i] = 255;
threeEnd = 0;
threeBlocks.Add(Array.Empty<byte>());
return;
}
// find next seq or end of input
matchIndex = 0;
toFind = new[] { (byte)'>' };
threeEnd = find(buffer, toFind, threeStart, ref matchIndex);
while (threeEnd == -1)
{
buffer = new byte[BLOCK_SIZE];
var bytesRead = read(stream, buffer, 0, BLOCK_SIZE);
threeEnd = bytesRead == BLOCK_SIZE ? find(buffer, toFind, 0, ref matchIndex)
: bytesRead;
threeBlocks.Add(buffer);
}
if (threeStart + 18 > BLOCK_SIZE) // Key needs to be in the first block
{
byte[] block0 = threeBlocks[0], block1 = threeBlocks[1];
Buffer.BlockCopy(block0, threeStart, block0, threeStart - 18, BLOCK_SIZE - threeStart);
Buffer.BlockCopy(block1, 0, block0, BLOCK_SIZE - 18, 18);
for (int i = 0; i < 18; i++) block1[i] = 255;
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static void check(UInt64UInt32Map inc, ref ulong rollingKey, byte nb, ulong mask)
{
if (nb == 255) return;
rollingKey = ((rollingKey & mask) << 2) | nb;
inc.GetRef(rollingKey)++;
}
static Task<string> count(int l, ulong mask, Func<UInt64UInt32Map, string> summary)
{
return Task.Run(() =>
{
ulong rollingKey = 0;
var firstBlock = threeBlocks[0];
var start = threeStart;
while (--l > 0) rollingKey = (rollingKey << 2) | firstBlock[start++];
var dict = new UInt64UInt32Map();
for (int i = start; i < firstBlock.Length; i++)
check(dict, ref rollingKey, firstBlock[i], mask);
int lastBlockId = threeBlocks.Count - 1;
for (int bl = 1; bl < lastBlockId; bl++)
{
var bytes = threeBlocks[bl];
for (int i = 0; i < bytes.Length; i++)
check(dict, ref rollingKey, bytes[i], mask);
}
var lastBlock = threeBlocks[lastBlockId];
for (int i = 0; i < threeEnd; i++)
check(dict, ref rollingKey, lastBlock[i], mask);
return summary(dict);
});
}
static string writeFrequencies(UInt64UInt32Map freq, int fragmentLength)
{
var sb = new StringBuilder();
double percent = 100.0 / freq.Sum(p => p.Value);
foreach (var kv in freq.OrderByDescending(i => i.Value))
{
var keyChars = new char[fragmentLength];
var key = kv.Key;
for (int i = keyChars.Length - 1; i >= 0; --i)
{
keyChars[i] = tochar[key & 0x3];
key >>= 2;
}
sb.Append(keyChars);
sb.Append(" ");
sb.AppendLine((kv.Value * percent).ToString("F3"));
}
return sb.ToString();
}
static string writeCount(UInt64UInt32Map dictionary, string fragment)
{
ulong key = 0;
for (int i = 0; i < fragment.Length; ++i)
key = (key << 2) | tonum[fragment[i]];
var n = dictionary.TryGetValue(key, out var v) ? v : 0;
return string.Concat(n.ToString(), "\t", fragment);
}
public static void Main(string[] args)
{
var sw = Stopwatch.StartNew();
tonum['c'] = 1; tonum['C'] = 1;
tonum['g'] = 2; tonum['G'] = 2;
tonum['t'] = 3; tonum['T'] = 3;
tonum['\n'] = 255; tonum['>'] = 255; tonum[255] = 255;
LoadThreeData();
Parallel.ForEach(threeBlocks, bytes =>
{
for (int i = 0; i < bytes.Length; i++)
bytes[i] = tonum[bytes[i]];
});
var task18 = count(18, 34359738367, d => writeCount(d, "GGTATTTTAATTTATAGT"));
var task12 = count(12, 8388607, d => writeCount(d, "GGTATTTTAATT"));
var task6 = count(6, 0b1111111111, d => writeCount(d, "GGTATT"));
var task1 = count(1, 0, d => writeFrequencies(d, 1));
var task2 = count(2, 0b11, d => writeFrequencies(d, 2));
var task3 = count(3, 0b1111, d => writeCount(d, "GGT"));
var task4 = count(4, 0b111111, d => writeCount(d, "GGTA"));
Console.WriteLine(task1.Result);
Console.WriteLine(task2.Result);
Console.WriteLine(task3.Result);
Console.WriteLine(task4.Result);
Console.WriteLine(task6.Result);
Console.WriteLine(task12.Result);
Console.WriteLine(task18.Result);
Console.WriteLine($"Took {sw.Elapsed}");
}
}
@Alan-FGR
Copy link

Alan-FGR commented Aug 3, 2018

Hey 😄. What's the license of the UInt64UInt32Map hashtable code?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment