Skip to content

Instantly share code, notes, and snippets.

@JervenBolleman
Created August 12, 2014 10:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JervenBolleman/c8e0861438e72cb7ae15 to your computer and use it in GitHub Desktop.
Save JervenBolleman/c8e0861438e72cb7ae15 to your computer and use it in GitHub Desktop.
GC6 threaded GC count using table (is faster than bit fiddling
package gc;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
public class GC6
{
static final int GC = 0;
static final int AT = 1;
static final int N = 2;
static final int SECOND_RIGHTMOST_BITS_SET = 2;
static final int FOURTH_RIGHMOST_BIT_SET = 8;
static final byte greaterthan = '>';
static final byte newline = '\n';
public static void main(String[] args)
throws java.io.IOException
{
final RandomAccessFile randomAccessFile = new RandomAccessFile("Homo_sapiens.GRCh37.67.dna_rm.chromosome.Y.fa",
"r");
final MappedByteBuffer channel = randomAccessFile.getChannel().map(FileChannel.MapMode.READ_ONLY, 0
, randomAccessFile.length());
new GC6(channel);
randomAccessFile.close();
}
private GC6(ByteBuffer channel) throws IOException
{
long start = System.currentTimeMillis();
int gc = 0;
int at = 0;
final int threads = Runtime.getRuntime().availableProcessors();
Counter next = null;
List<Future<int[]>> futures = new ArrayList<>(threads);
ExecutorService exec = Executors.newFixedThreadPool(threads);
for (int thread = threads; thread > 0; thread--)
{
next = new Counter((channel.limit() / threads) * (thread - 1), next, channel);
futures.add(exec.submit(next));
}
while (!futures.isEmpty())
{
Iterator<Future<int[]>> iter = futures.iterator();
while (iter.hasNext())
{
try
{
final int[] is = iter.next().get();
if (is != null)
{
gc += is['G'] + is['C'];
at += is['A'] + is['T'];
iter.remove();
}
} catch (InterruptedException | ExecutionException e)
{
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
exec.shutdown();
System.out.printf("GC6 count %f\n", gc / (double) (gc + (at)));
System.out.printf("Elapsed %d ms\n", System.currentTimeMillis() - start);
}
private static class Counter
implements Callable<int[]>
{
private final int start;
private final ByteBuffer channel;
private final int finish;
public Counter(int starts, Counter next, ByteBuffer channel)
{
super();
this.channel = channel;
starts = findCountingStartingPoint(starts, channel);
this.start = starts;
if (next == null)
finish = channel.limit();
else
finish = next.start - 1;
}
private int findCountingStartingPoint(int starts, ByteBuffer channel)
{
for (int i = starts; i < channel.limit(); i++)
{
//If we start on a new line we do not need to worry about missing counting things in a header
//We will either start with a '>' or on a line containing some DNA
if (channel.get(i) == newline)
{
starts = i + 1;//Don't need to retest the newline later
break;
}
}
return starts;
}
@Override
public int[] call()
throws Exception
{
int[] table = new int[85];//T (84) is the highest value if we get something higher we need to know!
boolean header = true;
for (int i = start; i < finish; i++)
{
int b = channel.get(i);
//While branches are bad these are not to bad as both are very rare so there will be few branch predictions.
if (b == greaterthan)
header = true;
else if (header && b == newline)
header = false; //Fasta header are always one line.
else if (!header)
table[b]++;
}
return table;
}
}
}
@JervenBolleman
Copy link
Author

You can replace the int[] table with char[] table, but the answer is incorrect on larger chromosomes.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment