tgs/ncbi2na.py

## ncbi2na.py
#!/usr/bin/env python
"""
Translate compact hex sequence representation to standard bases.

If you have NCBI ASN.1 data, the easiest way to get a FASTA version is to
use NCBI's asn2fasta tool, which is available here:
https://ftp.ncbi.nlm.nih.gov/asn1-converters/by_program/asn2fasta/

NCBI2na is a compact representation of sequence data, where each base just
takes 2 bits.  It's commonly found in text ASN.1 files as hexadecimal strings.
In binary ASN.1, it is found as binary data - this script does not handle that.
In a string of hexadecimal digits, each digit represents two nucleotide bases.
The hex string encodes whole bytes, so only even numbers of hex digits are
valid, and each byte holds four bases.  The final byte may be padded with up
to three unused bases, so for any given hex string there are four different
numbers of nucleotides it could represent.  This means you need to provide the
length of the nucleotide sequence you expect.

    >>> import ncbi2na
    >>> ncbi2na.decode_ncbi2na(20, u'0123456789')
    'AAACAGATCACCCGCTGAGC'
    >>> ncbi2na.decode_ncbi2na(17, u'0123456789')
    'AAACAGATCACCCGCTG'

Since we have the expected length, we can sanity-check it against the input
string:

    >>> ncbi2na.decode_ncbi2na(16, u'0123456789')
    Traceback (most recent call last):
        ...
    ValueError: Found more nuc bases than expected
    >>> ncbi2na.decode_ncbi2na(21, u'0123456789')
    Traceback (most recent call last):
        ...
    ValueError: Found fewer nuc bases than expected

This should work from Python 2 or 3, but the hexadecimal string
must be unicode because the `translate` method works very differently
in byte strings.
"""
from itertools import product

__all__ = ['decode_ncbi2na']

_nucs = u'ACGT'
# The 16 ordered base pairs (AA, AC, ..., TT), in the same order as the hex
# digits 0..F, so the pair at index i is what hex digit i decodes to.
_replacements = list(map(u''.join, product(_nucs, _nucs)))
# Table for the text-string `translate` method: code point of a hex digit
# mapped to the two bases that digit encodes.
_hex2nuc = dict(zip(map(ord, u'0123456789ABCDEF'), _replacements))
# Hex digits are case-insensitive.  Without these entries `translate` would
# copy 'a'..'f' into the output unchanged instead of decoding them.
_hex2nuc.update(zip(map(ord, u'abcdef'), _replacements[10:]))

if bytes is str:  # python2
    _text_type = unicode
else:
    _text_type = str


def decode_ncbi2na(length, hex_str):
    """Decode an NCBI2na hexadecimal string into nucleotide bases.

    Each hex digit expands to two bases, and the result is trimmed to
    `length` bases.  Up to three trailing bases are treated as padding.

    Args:
        length: Number of bases the sequence is expected to contain.
        hex_str: Text (unicode) string of hex digits, upper or lower case.

    Returns:
        The first `length` decoded bases, as a text string.

    Raises:
        TypeError: If `hex_str` is not a text (unicode) string.
        ValueError: If `hex_str` contains a character that is not a hex
            digit, if `length` is larger than the number of decoded bases,
            or if `length` is more than three bases smaller than it.
    """
    if not isinstance(hex_str, _text_type):
        raise TypeError("hex_str must be unicode/text type")
    # `translate` copies unmapped characters through unchanged, so anything
    # that is not a hex digit has to be rejected explicitly; otherwise it
    # would end up in the returned sequence and skew the length checks.
    if not set(map(ord, hex_str)).issubset(_hex2nuc):
        raise ValueError("hex_str contains non-hexadecimal characters")
    seq_untrimmed = hex_str.translate(_hex2nuc)
    raw_len = len(seq_untrimmed)
    if length > raw_len:
        raise ValueError("Found fewer nuc bases than expected")
    # More than 3 surplus bases cannot be explained by padding alone.
    if length < raw_len - 3:
        raise ValueError("Found more nuc bases than expected")
    return seq_untrimmed[:length]
	#!/usr/bin/env python
	"""
	Translate compact hex sequence representation to standard bases.

	If you have NCBI ASN.1 data, the easiest way to get a FASTA version is to
	use NCBI's asn2fasta tool, which is available here:
	https://ftp.ncbi.nlm.nih.gov/asn1-converters/by_program/asn2fasta/

	NCBI2na is a compact representation of sequence data, where each base just
	takes 2 bits. It's commonly found in text ASN.1 files as hexadecimal strings.
	In binary ASN.1, it is found as binary data - this script does not handle that.
	In a string of hexadecimal digits, each digit represents two nucleotide bases.
	The hex string encodes whole bytes, so only even numbers of hex digits are
	valid, and each byte holds four bases. The final byte may be padded with up
	to three unused bases, so for any given hex string there are four different
	numbers of nucleotides it could represent. This means you need to provide the
	length of the nucleotide sequence you expect.

	>>> import ncbi2na
	>>> ncbi2na.decode_ncbi2na(20, u'0123456789')
	'AAACAGATCACCCGCTGAGC'
	>>> ncbi2na.decode_ncbi2na(17, u'0123456789')
	'AAACAGATCACCCGCTG'

	Since we have the expected length, we can sanity-check it against the input
	string:

	>>> ncbi2na.decode_ncbi2na(16, u'0123456789')
	Traceback (most recent call last):
	...
	ValueError: Found more nuc bases than expected
	>>> ncbi2na.decode_ncbi2na(21, u'0123456789')
	Traceback (most recent call last):
	...
	ValueError: Found fewer nuc bases than expected

	This should work from Python 2 or 3, but the hexadecimal string
	must be unicode because the `translate` method works very differently
	in byte strings.
	"""
	from itertools import product

	__all__ = ['decode_ncbi2na']

	_nucs = u'ACGT'
	# The 16 ordered base pairs (AA, AC, ..., TT), in the same order as the hex
	# digits 0..F, so the pair at index i is what hex digit i decodes to.
	_replacements = list(map(u''.join, product(_nucs, _nucs)))
	# Table for the text-string `translate` method: code point of a hex digit
	# mapped to the two bases that digit encodes.
	_hex2nuc = dict(zip(map(ord, u'0123456789ABCDEF'), _replacements))
	# Hex digits are case-insensitive.  Without these entries `translate` would
	# copy 'a'..'f' into the output unchanged instead of decoding them.
	_hex2nuc.update(zip(map(ord, u'abcdef'), _replacements[10:]))

	if bytes is str:  # python2
	    _text_type = unicode
	else:
	    _text_type = str


	def decode_ncbi2na(length, hex_str):
	    """Decode an NCBI2na hexadecimal string into nucleotide bases.

	    Each hex digit expands to two bases, and the result is trimmed to
	    `length` bases.  Up to three trailing bases are treated as padding.

	    Args:
	        length: Number of bases the sequence is expected to contain.
	        hex_str: Text (unicode) string of hex digits, upper or lower case.

	    Returns:
	        The first `length` decoded bases, as a text string.

	    Raises:
	        TypeError: If `hex_str` is not a text (unicode) string.
	        ValueError: If `hex_str` contains a character that is not a hex
	            digit, if `length` is larger than the number of decoded bases,
	            or if `length` is more than three bases smaller than it.
	    """
	    if not isinstance(hex_str, _text_type):
	        raise TypeError("hex_str must be unicode/text type")
	    # `translate` copies unmapped characters through unchanged, so anything
	    # that is not a hex digit has to be rejected explicitly; otherwise it
	    # would end up in the returned sequence and skew the length checks.
	    if not set(map(ord, hex_str)).issubset(_hex2nuc):
	        raise ValueError("hex_str contains non-hexadecimal characters")
	    seq_untrimmed = hex_str.translate(_hex2nuc)
	    raw_len = len(seq_untrimmed)
	    if length > raw_len:
	        raise ValueError("Found fewer nuc bases than expected")
	    # More than 3 surplus bases cannot be explained by padding alone.
	    if length < raw_len - 3:
	        raise ValueError("Found more nuc bases than expected")
	    return seq_untrimmed[:length]