Skip to content

Instantly share code, notes, and snippets.

@JervenBolleman
Created August 11, 2014 13:08
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save JervenBolleman/b7a572d83f3daa4cfe8a to your computer and use it in GitHub Desktop.
Java GC count trying to be branchless.
package gc;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
public class GC2
{
static final int GC = 0;
static final int AT = 1;
static final int N = 2;
static final int SECOND_RIGHTMOST_BITS_SET = 2;
static final int FOURTH_RIGHMOST_BIT_SET = 8;
public static void main(String[] args)
throws java.io.IOException
{
try (InputStream in = new FileInputStream("Homo_sapiens.GRCh37.67.dna_rm.chromosome.Y.fa"))
{
long start = System.currentTimeMillis();
int[] gcatn = readFastaAndCountGCATN(in);
gcatn[AT] = gcatn[AT] - gcatn[N];
System.out.printf("GC count %f\n", gcatn[GC] / (double) (gcatn[GC] + gcatn[AT]));
System.out.printf("Elapsed %d ms\n", System.currentTimeMillis() - start);
}
}
private static int[] readFastaAndCountGCATN(InputStream in)
throws IOException
{
int[] gcatn = new int[3];
int bufSize = 0;
boolean header = true;
byte[] read = new byte[4096];
while ((bufSize = in.read(read)) != -1)
{
for (int i = 0; i < bufSize; i++)
{
header = readChar(gcatn, header, read, i);
}
}
return gcatn;
}
private static boolean readChar(int[] gcatn, boolean header, byte[] read, int i)
{
char c = (char) read[i];
//While branches are bad these are not to bad as both are very rare so there will be few branch predictions.
if (c == '>')
header = true;
else if (c == '\n')
header = false; //Fasta header are always one line.
else if (!header)
{
//The idea here is to count without branching, so no stall and mispredictions.
//The trick depends on G and C in ascii/utf-8 encoding to share the second rightmost bit being set
//while A and T do not have that bit set. We then need to account for N as well.
//We push it into an array so that we can pass back all the values.
//Putting it all in a loop improves performance because the JIT is beter with on stack replacement like this
int lgc = ((c & SECOND_RIGHTMOST_BITS_SET) >> 1) - ((c & FOURTH_RIGHMOST_BIT_SET) >>> 3);
gcatn[GC] = gcatn[GC] + lgc;
gcatn[AT] = gcatn[AT] + (((~lgc) & 1));
gcatn[N] = gcatn[N] + ((c & FOURTH_RIGHMOST_BIT_SET) >>> 3);
}
return header;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment