Skip to content

Instantly share code, notes, and snippets.

@fredyr
Last active August 16, 2023 05:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fredyr/d3309ccd9dcf4f985573feffb33b46ca to your computer and use it in GitHub Desktop.
Save fredyr/d3309ccd9dcf4f985573feffb33b46ca to your computer and use it in GitHub Desktop.
/// K-mer counting
/// Basic Zig port of kc-c1.c from https://github.com/lh3/kmer-cnt/blob/master/kc-c1.c
///
/// zig build -Doptimize=ReleaseFast
/// wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR174/003/ERR1741363/ERR1741363_1.fastq.gz
///
/// time ./kc-c1 ERR1741363_1.fastq.gz => real 0m20,494s
/// time pigz -dc ERR1741363_1.fastq.gz | zig-out/bin/kc-z1 => real 0m20,336s
///
const std = @import("std");
const seq_nt4_table = [_]u8{ // translate ACGT to 0123
0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
};
fn count_seq(h: anytype, comptime k: u32, seq: []u8) !void {
var x = [2]u64{ 0, 0 };
var l: u32 = 0;
const mask: u64 = (1 << k * 2) - 1;
const shift: u64 = (k - 1) * 2;
for (seq) |ch| {
const c: u64 = seq_nt4_table[ch];
if (c < 4) { // ATCG
x[0] = (x[0] << 2 | c) & mask; // forward strand
x[1] = x[1] >> 2 | (3 - c) << shift; // reverse strand
l += 1;
if (l >= k) { // we find a k-mer
const y = if (x[0] < x[1]) x[0] else x[1];
var v = try h.getOrPut(y);
if (!v.found_existing) {
v.value_ptr.* = 1;
} else {
v.value_ptr.* += 1;
}
}
} else { // Ambigous nucleotide, so reset
x[0] = 0;
x[1] = 0;
l = 0;
}
}
}
fn print_hist(h: anytype) !void {
var cnt = [_]u64{0} ** 256;
var it = h.iterator();
while (it.next()) |entry| {
var val = if (entry.value_ptr.* < 256) entry.value_ptr.* else 255;
cnt[val] += 1;
}
for (cnt, 0..) |c, i| {
if (i > 0) {
std.debug.print("{}\t{}\n", .{ i, c });
}
}
}
pub fn main() !void {
var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
defer arena.deinit();
const allocator = arena.allocator();
var map = std.AutoHashMap(u64, u32).init(allocator);
defer map.deinit();
var buffer: [350]u8 = undefined;
var lineno: u32 = 0;
const stdin = std.io.getStdIn();
var buf = std.io.bufferedReader(stdin.reader());
var r = buf.reader();
while (try r.readUntilDelimiterOrEof(buffer[0..], '\n')) |input| {
const FASTQ = enum { Header, Sequence, Plus, Quality };
const state: FASTQ = @enumFromInt(lineno % 4);
if (state == FASTQ.Sequence) {
try count_seq(&map, 31, input);
}
lineno += 1;
}
try print_hist(map);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment