JervenBolleman/GC7.java

## GC7.java
package gc;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;


public class GC7
{
	private static final long GorC = 0b01100110_01100110_01100110_01100110_01100110_01100110_01100110_01100110L;
	private static final long AorT = ~GorC;

	public static void main(String[] args)
	    throws java.io.IOException
	{

		final String orig = "Homo_sapiens.GRCh37.67.dna_rm.chromosome.Y.fa";
		final String re = orig+".re";
		rewrite(orig, re);
		final RandomAccessFile randomAccessFile = new RandomAccessFile(re,
		    "r");
		final ByteBuffer channel = randomAccessFile.getChannel().map(FileChannel.MapMode.READ_ONLY, 0
		    , randomAccessFile.length());
		new GC7(channel);
		randomAccessFile.close();
	}

	/**
	 * This code rewrites the nucleotide fasta file in a way that allows later faster parsing
	 * and working with it. Using 4 bytes per nucleotide it is not as space efficient as possible.
	 * But the nice thing is that with a single binary & and a Long.bitCount() (popcount) we can count
	 * 16 nucleotides per instruction.
	 * @param string
	 * @param reName
	 * @throws IOException
	 */
	private static void rewrite(String string, String reName)
	    throws IOException
	{
		final File orig = new File(string);
		final File re = new File(reName);
		if (!re.exists())
		{
			final FileOutputStream fileWriter = new FileOutputStream(re);
			int bufSize = 0;
			StringBuilder header = new StringBuilder();
			//We want to write bytes not chars. Having the first bit set in a UTF-8 string
			//means we get two bytes output due to encoding expansion.
			//This byte array means we keep on working on bytes, avoiding encoding crazyness.
			ByteArrayOutputStream record = new ByteArrayOutputStream();
			char prev = 'N';
			boolean even = false;
			try (InputStream in = new FileInputStream(orig))
			{
				byte[] read = new byte[4096];
				while ((bufSize = in.read(read)) != -1)
				{
					for (int i = 0; i < bufSize; i++)
					{
						char c = (char) read[i];
						if (c == '>')
						{
							if (record != null)
							{
								fileWriter.write(String.valueOf(record.size()).getBytes());
								fileWriter.write(record.toByteArray());
							}
							header = new StringBuilder();
						}
						else if (c == '\n')
						{
							if (header != null)
							{
								fileWriter.write(String.valueOf(header.length()).getBytes());
								fileWriter.write('>');
								fileWriter.write(header.toString().getBytes());
							}
							header = null; //Fasta header are always one line.
						}
						else if (header != null)
							header.append(c);
						else if (header == null)
						{
							if (even)
							{
								final byte i2 = (byte) ((convert(prev) << 4) | convert(c));
								record.write(i2);
								even = false;
							}
							else
							{
								prev = c;
								even = true;
							}
						}
					}
				}
			}
			if (!even)
				record.write((convert(prev) << 4));
			if (record != null)
			{
				for (int a=0;a < record.size() % 16;a+=2)
				{
					record.write((byte) 0);
				}
				fileWriter.write(String.valueOf(record.size() * 2).getBytes());
				//The semi colon is the delimiter between the header and the sequence.
				fileWriter.write('!');
				fileWriter.write(record.toByteArray());
			}
			fileWriter.flush();
			fileWriter.close();
		}
	}

	private static byte convert(char c)
	{
		switch (c)
		{
			case 'A':
				return 0b0001;
			case 'C':
				return 0b0010;
			case 'G':
				return 0b0100;
			case 'T':
				return 0b1000;
			default://N is encoded as all zero making that easy to count as well if needed.
				return 0b0000;
		}
	}

	private GC7(ByteBuffer channel) throws IOException
	{
		long start = System.currentTimeMillis();
		int at = 0, gc = 0;
		StringBuilder digit = new StringBuilder();
		for (int i = 0; i < channel.limit(); i++)
		{
			int c = channel.get();
			if (Character.isDigit(c))
			{
				digit.append((char) c);
			}
			else if (c == '>')
			{
				final int skip = Integer.parseInt(digit.toString());
				for (int a = 0; a < skip; a++)
				{
					channel.get();
				}
				i += skip;
				digit = new StringBuilder(); //reset the collected digits, in case there is more than one header.
			}
			else if (c == '!')
			{
				long numberOfChars = Long.parseLong(digit.toString());
				digit = new StringBuilder(); //reset the collected digits, in case there is more than one header.
				for (long j = 0; j < numberOfChars; j+=16)
				{
						long l = channel.getLong();
						gc += Long.bitCount(l & GorC);
						at += Long.bitCount(l & AorT);
				}
				i += numberOfChars;
				for (; i < channel.limit(); i++)
				{
					c = channel.get();
					gc += Long.bitCount(c & GorC);
					at += Long.bitCount(c & AorT);
				}
			}
		}
		System.out.println("GC7 count " + gc + " gc at " + at);
		System.out.printf("GC7 count %f\n", gc / (double) (gc + at));
		System.out.printf("Elapsed %d ms\n", System.currentTimeMillis() - start);
	}
}
	package gc;

	import java.io.ByteArrayOutputStream;
	import java.io.File;
	import java.io.FileInputStream;
	import java.io.FileOutputStream;
	import java.io.IOException;
	import java.io.InputStream;
	import java.io.RandomAccessFile;
	import java.nio.ByteBuffer;
	import java.nio.channels.FileChannel;


	public class GC7
	{
	private static final long GorC = 0b01100110_01100110_01100110_01100110_01100110_01100110_01100110_01100110L;
	private static final long AorT = ~GorC;

	public static void main(String[] args)
	throws java.io.IOException
	{

	final String orig = "Homo_sapiens.GRCh37.67.dna_rm.chromosome.Y.fa";
	final String re = orig+".re";
	rewrite(orig, re);
	final RandomAccessFile randomAccessFile = new RandomAccessFile(re,
	"r");
	final ByteBuffer channel = randomAccessFile.getChannel().map(FileChannel.MapMode.READ_ONLY, 0
	, randomAccessFile.length());
	new GC7(channel);
	randomAccessFile.close();
	}

	/**
	* This code rewrites the nucleotide fasta file in a way that allows later faster parsing
	* and working with it. Using 4 bytes per nucleotide it is not as space efficient as possible.
	* But the nice thing is that with a single binary & and a Long.bitCount() (popcount) we can count
	* 16 nucleotides per instruction.
	* @param string
	* @param reName
	* @throws IOException
	*/
	private static void rewrite(String string, String reName)
	throws IOException
	{
	final File orig = new File(string);
	final File re = new File(reName);
	if (!re.exists())
	{
	final FileOutputStream fileWriter = new FileOutputStream(re);
	int bufSize = 0;
	StringBuilder header = new StringBuilder();
	//We want to write bytes not chars. Having the first bit set in a UTF-8 string
	//means we get two bytes output due to encoding expansion.
	//This byte array means we keep on working on bytes, avoiding encoding crazyness.
	ByteArrayOutputStream record = new ByteArrayOutputStream();
	char prev = 'N';
	boolean even = false;
	try (InputStream in = new FileInputStream(orig))
	{
	byte[] read = new byte[4096];
	while ((bufSize = in.read(read)) != -1)
	{
	for (int i = 0; i < bufSize; i++)
	{
	char c = (char) read[i];
	if (c == '>')
	{
	if (record != null)
	{
	fileWriter.write(String.valueOf(record.size()).getBytes());
	fileWriter.write(record.toByteArray());
	}
	header = new StringBuilder();
	}
	else if (c == '\n')
	{
	if (header != null)
	{
	fileWriter.write(String.valueOf(header.length()).getBytes());
	fileWriter.write('>');
	fileWriter.write(header.toString().getBytes());
	}
	header = null; //Fasta header are always one line.
	}
	else if (header != null)
	header.append(c);
	else if (header == null)
	{
	if (even)
	{
	final byte i2 = (byte) ((convert(prev) << 4) \| convert(c));
	record.write(i2);
	even = false;
	}
	else
	{
	prev = c;
	even = true;
	}
	}
	}
	}
	}
	if (!even)
	record.write((convert(prev) << 4));
	if (record != null)
	{
	for (int a=0;a < record.size() % 16;a+=2)
	{
	record.write((byte) 0);
	}
	fileWriter.write(String.valueOf(record.size() * 2).getBytes());
	//The semi colon is the delimiter between the header and the sequence.
	fileWriter.write('!');
	fileWriter.write(record.toByteArray());
	}
	fileWriter.flush();
	fileWriter.close();
	}
	}

	private static byte convert(char c)
	{
	switch (c)
	{
	case 'A':
	return 0b0001;
	case 'C':
	return 0b0010;
	case 'G':
	return 0b0100;
	case 'T':
	return 0b1000;
	default://N is encoded as all zero making that easy to count as well if needed.
	return 0b0000;
	}
	}

	private GC7(ByteBuffer channel) throws IOException
	{
	long start = System.currentTimeMillis();
	int at = 0, gc = 0;
	StringBuilder digit = new StringBuilder();
	for (int i = 0; i < channel.limit(); i++)
	{
	int c = channel.get();
	if (Character.isDigit(c))
	{
	digit.append((char) c);
	}
	else if (c == '>')
	{
	final int skip = Integer.parseInt(digit.toString());
	for (int a = 0; a < skip; a++)
	{
	channel.get();
	}
	i += skip;
	digit = new StringBuilder(); //reset the collected digits, in case there is more than one header.
	}
	else if (c == '!')
	{
	long numberOfChars = Long.parseLong(digit.toString());
	digit = new StringBuilder(); //reset the collected digits, in case there is more than one header.
	for (long j = 0; j < numberOfChars; j+=16)
	{
	long l = channel.getLong();
	gc += Long.bitCount(l & GorC);
	at += Long.bitCount(l & AorT);
	}
	i += numberOfChars;
	for (; i < channel.limit(); i++)
	{
	c = channel.get();
	gc += Long.bitCount(c & GorC);
	at += Long.bitCount(c & AorT);
	}
	}
	}
	System.out.println("GC7 count " + gc + " gc at " + at);
	System.out.printf("GC7 count %f\n", gc / (double) (gc + at));
	System.out.printf("Elapsed %d ms\n", System.currentTimeMillis() - start);
	}
	}