Skip to content

Instantly share code, notes, and snippets.

@JervenBolleman
Created August 25, 2014 12:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JervenBolleman/d1430d0549028861504c to your computer and use it in GitHub Desktop.
Save JervenBolleman/d1430d0549028861504c to your computer and use it in GitHub Desktop.
GC7 GC count using binary encoded nucleotides for speed (requires converting a fasta file first)
package gc;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
public class GC7
{
private static final long GorC = 0b01100110_01100110_01100110_01100110_01100110_01100110_01100110_01100110L;
private static final long AorT = ~GorC;
public static void main(String[] args)
throws java.io.IOException
{
final String orig = "Homo_sapiens.GRCh37.67.dna_rm.chromosome.Y.fa";
final String re = orig+".re";
rewrite(orig, re);
final RandomAccessFile randomAccessFile = new RandomAccessFile(re,
"r");
final ByteBuffer channel = randomAccessFile.getChannel().map(FileChannel.MapMode.READ_ONLY, 0
, randomAccessFile.length());
new GC7(channel);
randomAccessFile.close();
}
/**
* This code rewrites the nucleotide fasta file in a way that allows later faster parsing
* and working with it. Using 4 bytes per nucleotide it is not as space efficient as possible.
* But the nice thing is that with a single binary & and a Long.bitCount() (popcount) we can count
* 16 nucleotides per instruction.
* @param string
* @param reName
* @throws IOException
*/
private static void rewrite(String string, String reName)
throws IOException
{
final File orig = new File(string);
final File re = new File(reName);
if (!re.exists())
{
final FileOutputStream fileWriter = new FileOutputStream(re);
int bufSize = 0;
StringBuilder header = new StringBuilder();
//We want to write bytes not chars. Having the first bit set in a UTF-8 string
//means we get two bytes output due to encoding expansion.
//This byte array means we keep on working on bytes, avoiding encoding crazyness.
ByteArrayOutputStream record = new ByteArrayOutputStream();
char prev = 'N';
boolean even = false;
try (InputStream in = new FileInputStream(orig))
{
byte[] read = new byte[4096];
while ((bufSize = in.read(read)) != -1)
{
for (int i = 0; i < bufSize; i++)
{
char c = (char) read[i];
if (c == '>')
{
if (record != null)
{
fileWriter.write(String.valueOf(record.size()).getBytes());
fileWriter.write(record.toByteArray());
}
header = new StringBuilder();
}
else if (c == '\n')
{
if (header != null)
{
fileWriter.write(String.valueOf(header.length()).getBytes());
fileWriter.write('>');
fileWriter.write(header.toString().getBytes());
}
header = null; //Fasta header are always one line.
}
else if (header != null)
header.append(c);
else if (header == null)
{
if (even)
{
final byte i2 = (byte) ((convert(prev) << 4) | convert(c));
record.write(i2);
even = false;
}
else
{
prev = c;
even = true;
}
}
}
}
}
if (!even)
record.write((convert(prev) << 4));
if (record != null)
{
for (int a=0;a < record.size() % 16;a+=2)
{
record.write((byte) 0);
}
fileWriter.write(String.valueOf(record.size() * 2).getBytes());
//The semi colon is the delimiter between the header and the sequence.
fileWriter.write('!');
fileWriter.write(record.toByteArray());
}
fileWriter.flush();
fileWriter.close();
}
}
private static byte convert(char c)
{
switch (c)
{
case 'A':
return 0b0001;
case 'C':
return 0b0010;
case 'G':
return 0b0100;
case 'T':
return 0b1000;
default://N is encoded as all zero making that easy to count as well if needed.
return 0b0000;
}
}
private GC7(ByteBuffer channel) throws IOException
{
long start = System.currentTimeMillis();
int at = 0, gc = 0;
StringBuilder digit = new StringBuilder();
for (int i = 0; i < channel.limit(); i++)
{
int c = channel.get();
if (Character.isDigit(c))
{
digit.append((char) c);
}
else if (c == '>')
{
final int skip = Integer.parseInt(digit.toString());
for (int a = 0; a < skip; a++)
{
channel.get();
}
i += skip;
digit = new StringBuilder(); //reset the collected digits, in case there is more than one header.
}
else if (c == '!')
{
long numberOfChars = Long.parseLong(digit.toString());
digit = new StringBuilder(); //reset the collected digits, in case there is more than one header.
for (long j = 0; j < numberOfChars; j+=16)
{
long l = channel.getLong();
gc += Long.bitCount(l & GorC);
at += Long.bitCount(l & AorT);
}
i += numberOfChars;
for (; i < channel.limit(); i++)
{
c = channel.get();
gc += Long.bitCount(c & GorC);
at += Long.bitCount(c & AorT);
}
}
}
System.out.println("GC7 count " + gc + " gc at " + at);
System.out.printf("GC7 count %f\n", gc / (double) (gc + at));
System.out.printf("Elapsed %d ms\n", System.currentTimeMillis() - start);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment