# pansapiens/aaindex.py

## aaindex.py
import urllib.request as request
from collections import defaultdict

def parse_aaindex2(lines, default=None):
    """
    Parse the lines of an AAIndex2 substitution matrix file.

    Returns a dict of the entire database keyed by AAIndex identifier (the
    text of each record's ``H`` line). Every value is a ``defaultdict(str)``
    holding:

    * ``'D'``, ``'R'``, ``'A'``, ``'T'``, ``'J'``, ``'*'`` -- the text of the
      field with that tag; indented continuation lines are appended,
      separated by a single space.
    * ``'row_index'`` / ``'col_index'`` -- lists of the single-character
      labels read from the ``M rows = ..., cols = ...`` line.
    * ``'matrix'`` -- dict mapping ``(column label, row label)`` tuples to
      float values.

    The aaindex[id]['matrix'] dictionary is the same structure as Biopython's
    `Bio.SubsMat.MatrixInfo` substitution matrices.

    Parameters:
        lines: iterable of text lines (with or without trailing newlines).
        default: value stored for matrix cells written as '-' in the file.

    Raises:
        ValueError: if a matrix cell is neither '-' nor a valid float.
        IndexError: if a matrix has more rows or columns of data than labels
            declared on its ``M`` line (or the ``M`` line is malformed).

    Lines that appear before the first ``H`` record are ignored.

    Example:

    import urllib.request as request

    aaindex2_text = request.urlopen('ftp://ftp.genome.jp/pub/db/community/aaindex/aaindex2').read().decode().splitlines()
    aaindex2 = parse_aaindex2(aaindex2_text)
    grantham_matrix = aaindex2['GRAR740104']['matrix']
    """
    record_tags = ('H', 'D', 'R', 'A', 'T', '*', 'J', 'M')
    text_tags = ('D', 'R', 'A', 'T', '*', 'J')

    aaindex = {}
    current_id = None
    record_type = None
    matrix = {}
    rows = []
    cols = []
    i_row = 0

    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue

        # A tag character in column 0 opens a new field; any other line
        # (normally indented) continues whatever field is currently open.
        starts_field = raw_line[0] in record_tags
        if starts_field:
            record_type = raw_line[0]
        if line == '//':
            record_type = '//'

        if record_type == 'H' and starts_field:
            current_id = line[2:]
            aaindex[current_id] = defaultdict(str)
            # Attach the matrix dict right away (it is filled in place by the
            # data rows) so a record lacking its '//' terminator still keeps
            # the values parsed so far.
            matrix = {}
            aaindex[current_id]['matrix'] = matrix
            rows, cols = [], []
            i_row = 0
            continue

        if current_id is None:
            # No record has been opened yet; there is nothing to attach to.
            continue

        entry = aaindex[current_id]

        if record_type in text_tags:
            # Only the line carrying the tag has the two-character "X "
            # prefix; continuation lines are already bare after strip().
            text = line[2:] if starts_field else line
            previous = entry[record_type]
            if previous and text:
                entry[record_type] = previous + ' ' + text
            else:
                entry[record_type] = previous + text
        elif record_type == 'M':
            if starts_field:
                fields = line[2:].split(',')
                rows = list(fields[0].split('=')[1].strip())
                cols = list(fields[1].split('=')[1].strip())
                entry['row_index'] = rows
                entry['col_index'] = cols
            else:
                values = [default if token == '-' else float(token)
                          for token in line.split()]
                for i_col, value in enumerate(values):
                    matrix[(cols[i_col], rows[i_row])] = value
                i_row += 1
        elif record_type == '//':
            entry['matrix'] = matrix
            record_type = None

    return aaindex


def get_aaindex2(url='ftp://ftp.genome.jp/pub/db/community/aaindex/aaindex2'):
    """
    Download an AAIndex2 file and return it parsed by `parse_aaindex2`.

    Parameters:
        url: location of the AAIndex2 flat file; anything accepted by
            `urllib.request.urlopen`.

    Returns:
        The dict produced by `parse_aaindex2` for the downloaded text.
    """
    # The context manager closes the connection once the body has been read.
    with request.urlopen(url) as response:
        aaindex2_text = response.read().decode().splitlines()
    return parse_aaindex2(aaindex2_text)


if __name__ == '__main__':
    # Script entry point: fetch the AAindex2 database from the default URL of
    # get_aaindex2(), parse it, and pretty-print the resulting dict.
    from pprint import pprint

    aaindex2 = get_aaindex2()
    pprint(aaindex2)
	import urllib.request as request
	from collections import defaultdict

	def parse_aaindex2(lines, default=None):
		"""
		Parse the lines of an AAIndex2 substitution matrix file.

		Returns a dict of the entire database keyed by AAIndex identifier (the
		text of each record's ``H`` line). Every value is a ``defaultdict(str)``
		holding:

		* ``'D'``, ``'R'``, ``'A'``, ``'T'``, ``'J'``, ``'*'`` -- the text of the
		  field with that tag; indented continuation lines are appended,
		  separated by a single space.
		* ``'row_index'`` / ``'col_index'`` -- lists of the single-character
		  labels read from the ``M rows = ..., cols = ...`` line.
		* ``'matrix'`` -- dict mapping ``(column label, row label)`` tuples to
		  float values.

		The aaindex[id]['matrix'] dictionary is the same structure as Biopython's
		`Bio.SubsMat.MatrixInfo` substitution matrices.

		Parameters:
			lines: iterable of text lines (with or without trailing newlines).
			default: value stored for matrix cells written as '-' in the file.

		Raises:
			ValueError: if a matrix cell is neither '-' nor a valid float.
			IndexError: if a matrix has more rows or columns of data than labels
				declared on its ``M`` line (or the ``M`` line is malformed).

		Lines that appear before the first ``H`` record are ignored.

		Example:

		import urllib.request as request

		aaindex2_text = request.urlopen('ftp://ftp.genome.jp/pub/db/community/aaindex/aaindex2').read().decode().splitlines()
		aaindex2 = parse_aaindex2(aaindex2_text)
		grantham_matrix = aaindex2['GRAR740104']['matrix']
		"""
		record_tags = ('H', 'D', 'R', 'A', 'T', '*', 'J', 'M')
		text_tags = ('D', 'R', 'A', 'T', '*', 'J')

		aaindex = {}
		current_id = None
		record_type = None
		matrix = {}
		rows = []
		cols = []
		i_row = 0

		for raw_line in lines:
			line = raw_line.strip()
			if not line:
				continue

			# A tag character in column 0 opens a new field; any other line
			# (normally indented) continues whatever field is currently open.
			starts_field = raw_line[0] in record_tags
			if starts_field:
				record_type = raw_line[0]
			if line == '//':
				record_type = '//'

			if record_type == 'H' and starts_field:
				current_id = line[2:]
				aaindex[current_id] = defaultdict(str)
				# Attach the matrix dict right away (it is filled in place by
				# the data rows) so a record lacking its '//' terminator still
				# keeps the values parsed so far.
				matrix = {}
				aaindex[current_id]['matrix'] = matrix
				rows, cols = [], []
				i_row = 0
				continue

			if current_id is None:
				# No record has been opened yet; there is nothing to attach to.
				continue

			entry = aaindex[current_id]

			if record_type in text_tags:
				# Only the line carrying the tag has the two-character "X "
				# prefix; continuation lines are already bare after strip().
				text = line[2:] if starts_field else line
				previous = entry[record_type]
				if previous and text:
					entry[record_type] = previous + ' ' + text
				else:
					entry[record_type] = previous + text
			elif record_type == 'M':
				if starts_field:
					fields = line[2:].split(',')
					rows = list(fields[0].split('=')[1].strip())
					cols = list(fields[1].split('=')[1].strip())
					entry['row_index'] = rows
					entry['col_index'] = cols
				else:
					values = [default if token == '-' else float(token)
							  for token in line.split()]
					for i_col, value in enumerate(values):
						matrix[(cols[i_col], rows[i_row])] = value
					i_row += 1
			elif record_type == '//':
				entry['matrix'] = matrix
				record_type = None

		return aaindex


	def get_aaindex2(url='ftp://ftp.genome.jp/pub/db/community/aaindex/aaindex2'):
		"""
		Download an AAIndex2 file and return it parsed by `parse_aaindex2`.

		Parameters:
			url: location of the AAIndex2 flat file; anything accepted by
				`urllib.request.urlopen`.

		Returns:
			The dict produced by `parse_aaindex2` for the downloaded text.
		"""
		# The context manager closes the connection once the body has been read.
		with request.urlopen(url) as response:
			aaindex2_text = response.read().decode().splitlines()
		return parse_aaindex2(aaindex2_text)


	if __name__ == '__main__':
		# Script entry point: fetch the AAindex2 database from the default URL
		# of get_aaindex2(), parse it, and pretty-print the resulting dict.
		from pprint import pprint

		aaindex2 = get_aaindex2()
		pprint(aaindex2)