Java GC count trying to be branchless.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package gc; | |
import java.io.FileInputStream; | |
import java.io.IOException; | |
import java.io.InputStream; | |
public class GC2 | |
{ | |
static final int GC = 0; | |
static final int AT = 1; | |
static final int N = 2; | |
static final int SECOND_RIGHTMOST_BITS_SET = 2; | |
static final int FOURTH_RIGHMOST_BIT_SET = 8; | |
public static void main(String[] args) | |
throws java.io.IOException | |
{ | |
try (InputStream in = new FileInputStream("Homo_sapiens.GRCh37.67.dna_rm.chromosome.Y.fa")) | |
{ | |
long start = System.currentTimeMillis(); | |
int[] gcatn = readFastaAndCountGCATN(in); | |
gcatn[AT] = gcatn[AT] - gcatn[N]; | |
System.out.printf("GC count %f\n", gcatn[GC] / (double) (gcatn[GC] + gcatn[AT])); | |
System.out.printf("Elapsed %d ms\n", System.currentTimeMillis() - start); | |
} | |
} | |
private static int[] readFastaAndCountGCATN(InputStream in) | |
throws IOException | |
{ | |
int[] gcatn = new int[3]; | |
int bufSize = 0; | |
boolean header = true; | |
byte[] read = new byte[4096]; | |
while ((bufSize = in.read(read)) != -1) | |
{ | |
for (int i = 0; i < bufSize; i++) | |
{ | |
header = readChar(gcatn, header, read, i); | |
} | |
} | |
return gcatn; | |
} | |
private static boolean readChar(int[] gcatn, boolean header, byte[] read, int i) | |
{ | |
char c = (char) read[i]; | |
//While branches are bad these are not to bad as both are very rare so there will be few branch predictions. | |
if (c == '>') | |
header = true; | |
else if (c == '\n') | |
header = false; //Fasta header are always one line. | |
else if (!header) | |
{ | |
//The idea here is to count without branching, so no stall and mispredictions. | |
//The trick depends on G and C in ascii/utf-8 encoding to share the second rightmost bit being set | |
//while A and T do not have that bit set. We then need to account for N as well. | |
//We push it into an array so that we can pass back all the values. | |
//Putting it all in a loop improves performance because the JIT is beter with on stack replacement like this | |
int lgc = ((c & SECOND_RIGHTMOST_BITS_SET) >> 1) - ((c & FOURTH_RIGHMOST_BIT_SET) >>> 3); | |
gcatn[GC] = gcatn[GC] + lgc; | |
gcatn[AT] = gcatn[AT] + (((~lgc) & 1)); | |
gcatn[N] = gcatn[N] + ((c & FOURTH_RIGHMOST_BIT_SET) >>> 3); | |
} | |
return header; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment