Skip to content

Instantly share code, notes, and snippets.

@JervenBolleman
Created August 11, 2014 13:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JervenBolleman/22b85419b1a1e29800c8 to your computer and use it in GitHub Desktop.
Save JervenBolleman/22b85419b1a1e29800c8 to your computer and use it in GitHub Desktop.
Mapped file, same branchless gc counting
package gc;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
public class GC3
{
static final int GC = 0;
static final int AT = 1;
static final int N = 2;
static final int SECOND_RIGHTMOST_BITS_SET = 2;
static final int FOURTH_RIGHMOST_BIT_SET = 8;
public static void main(String[] args)
throws java.io.IOException
{
long start = System.currentTimeMillis();
final RandomAccessFile randomAccessFile = new RandomAccessFile("Homo_sapiens.GRCh37.67.dna_rm.chromosome.Y.fa",
"r");
final MappedByteBuffer channel = randomAccessFile.getChannel().map(FileChannel.MapMode.READ_ONLY, 0
, randomAccessFile.length());
int[] gcatn = readFastaAndCountGCATN(channel);
gcatn[AT] = gcatn[AT] - gcatn[N];
System.out.printf("GC count %f\n", gcatn[GC] / (double) (gcatn[GC] + gcatn[AT]));
System.out.printf("Elapsed %d ms\n", System.currentTimeMillis() - start);
randomAccessFile.close();
}
private static int[] readFastaAndCountGCATN(ByteBuffer channel)
throws IOException
{
int[] gcatn = new int[3];
boolean header = true;
for (int i = 0; i < channel.limit(); i++)
{
char c = (char) channel.get(i);
header = readChar(gcatn, header, c);
}
return gcatn;
}
private static boolean readChar(int[] gcatn, boolean header, char c)
{
//While branches are bad these are not to bad as both are very rare so there will be few branch predictions.
if (c == '>')
header = true;
else if (c == '\n')
header = false; //Fasta header are always one line.
else if (!header)
{
//The idea here is to count without branching, so no stall and mispredictions.
//The trick depends on G and C in ascii/utf-8 encoding to share the second rightmost bit being set
//while A and T do not have that bit set. We then need to account for N as well.
//We push it into an array so that we can pass back all the values.
//Putting it all in a loop improves performance because the JIT is beter with on stack replacement like this
int lgc = ((c & SECOND_RIGHTMOST_BITS_SET) >> 1) - ((c & FOURTH_RIGHMOST_BIT_SET) >>> 3);
gcatn[GC] = gcatn[GC] + lgc;
gcatn[AT] = gcatn[AT] + (((~lgc) & 1));
gcatn[N] = gcatn[N] + ((c & FOURTH_RIGHMOST_BIT_SET) >>> 3);
}
return header;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment