Skip to content

Instantly share code, notes, and snippets.

@pansapiens
Created March 13, 2020 01:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pansapiens/870908abc7d0292ace0654a036487303 to your computer and use it in GitHub Desktop.
Save pansapiens/870908abc7d0292ace0654a036487303 to your computer and use it in GitHub Desktop.
AAIndex parsing
import urllib.request as request
from collections import defaultdict
def parse_aaindex2(lines, default=None):
    """
    Parse the lines of an AAIndex2 substitution matrix file.

    Returns a dict of the entire database keyed by AAIndex identifier (the
    text of each ``H`` line). Every record is a ``defaultdict(str)`` holding:

    * the free-text fields ``'D'``, ``'R'``, ``'A'``, ``'T'``, ``'*'`` and
      ``'J'`` (only those present in the record), with continuation lines
      joined to the field by a single space;
    * ``'row_index'`` / ``'col_index'``: lists of the single-character row
      and column labels taken from the ``M`` header line;
    * ``'matrix'``: a dict mapping ``(column_label, row_label)`` to a float,
      or to ``default`` where the entry is missing.

    The aaindex[id]['matrix'] dictionary is the same structure as Biopython's
    `Bio.SubsMat.MatrixInfo` substitution matrices.

    Args:
        lines: iterable of text lines (trailing newlines are tolerated).
        default: value stored for missing matrix entries (``None`` unless
            overridden).

    Returns:
        dict mapping AAIndex identifier -> record, as described above.

    Raises:
        ValueError: if a matrix cell is neither a number nor a missing marker.
        IndexError: if a matrix has more rows/columns than declared labels,
            or the ``M`` header line is malformed.

    Example:
        import urllib.request as request
        aaindex2_text = request.urlopen('ftp://ftp.genome.jp/pub/db/community/aaindex/aaindex2').read().decode().splitlines()
        aaindex2 = parse_aaindex2(aaindex2_text)
        grantham_matrix = aaindex2['GRAR740104']['matrix']
    """
    tags = ('H', 'D', 'R', 'A', 'T', '*', 'J', 'M')
    text_tags = ('D', 'R', 'A', 'T', '*', 'J')
    # '-' marks a missing matrix entry.
    # NOTE(review): 'NA' is also treated as missing on the assumption that it
    # is an AAindex missing-value marker (float('NA') would raise) - confirm.
    missing = ('-', 'NA')

    aaindex = {}
    entry = None        # record currently being filled, None outside a record
    matrix = None
    rows = cols = None
    i_row = 0
    record_type = None

    for raw in lines:
        line = raw.strip()
        if not line:
            continue

        if line == '//':
            # End of record: publish the matrix and leave the record.
            if entry is not None:
                entry['matrix'] = matrix
            entry = None
            record_type = None
            continue

        # A field starts with its tag in the very first column; anything else
        # (indented text or matrix rows) continues the current field.
        is_tagged = raw[0] in tags
        if is_tagged:
            record_type = raw[0]

        if is_tagged and record_type == 'H':
            entry = aaindex[line[2:]] = defaultdict(str)
            matrix = {}
            rows = cols = None
            i_row = 0
            continue

        if entry is None:
            # Stray content outside any record (e.g. before the first 'H').
            continue

        if record_type in text_tags:
            # Only tagged lines carry the two-character "X " prefix; slicing a
            # stripped continuation line would eat real text.
            text = line[2:] if is_tagged else line
            previous = entry[record_type]
            if previous and text:
                entry[record_type] = previous + ' ' + text
            else:
                entry[record_type] = previous + text
        elif record_type == 'M':
            if is_tagged:
                # e.g. "M rows = ARNDC..., cols = ARNDC..."
                parts = line[2:].split(',')
                rows = list(parts[0].split('=')[1].strip())
                cols = list(parts[1].split('=')[1].strip())
                entry['row_index'] = rows
                entry['col_index'] = cols
            else:
                values = [
                    default if token in missing else float(token)
                    for token in line.split()
                ]
                for i_col, value in enumerate(values):
                    matrix[(cols[i_col], rows[i_row])] = value
                i_row += 1

    # Tolerate a final record that lacks its '//' terminator.
    if entry is not None:
        entry['matrix'] = matrix

    return aaindex
def get_aaindex2(url='ftp://ftp.genome.jp/pub/db/community/aaindex/aaindex2'):
    """
    Download an AAIndex2 file and parse it with `parse_aaindex2`.

    Args:
        url: location of the AAIndex2 flat file; anything accepted by
            `urllib.request.urlopen`.

    Returns:
        The dict returned by `parse_aaindex2` for the downloaded text.
    """
    # Use the response as a context manager so the connection is always closed.
    with request.urlopen(url) as response:
        aaindex2_text = response.read().decode().splitlines()
    return parse_aaindex2(aaindex2_text)
if __name__ == '__main__':
    # Run as a script: fetch the AAIndex2 database from its default URL and
    # pretty-print the parsed result.
    from pprint import pprint

    pprint(get_aaindex2())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment