latticetower/aa_convertor.py

## aa_convertor.py
"""Alternative aminoacid to SMILES convertor

If you want to check how it works, simply run
```
python aa_convertor.py
```
When run this way, the module expects rdkit to be installed; however, simply
importing the module and calling its functions shouldn't fail when it is not.

This module should be compatible with python 2 & 3.
"""

# One-letter codes of the aminoacids this module works with; the self-check
# at the bottom of the module builds its test sequences from this alphabet.
aminoacids = (
    "GASRDVTELC"
    "HKIYNMPWFQ"
)

def parseSmilesFragments(filename):
    """Read a table of per-aminoacid SMILES fragments from a text file.

    Every non-blank line is expected to hold two whitespace-separated fields:
        <1letter aminoacid representation> <SMILES fragment>
    where the fragment is laid out as
        <COOH><alpha-carbon>(<sidechain starting from beta-carbon>)<N>

    Blank lines (for example a trailing empty line) are skipped instead of
    aborting the whole load.

    :param filename: path of the fragments file.
    :return: dict mapping each 1-letter code to its SMILES fragment; when a
        code appears more than once, the last occurrence wins.
    :raises ValueError: if a non-blank line does not consist of exactly two
        fields; the message names the file and the line number.
    """
    result = dict()
    with open(filename) as f:
        for lineno, line in enumerate(f, 1):
            # split() with no argument also discards the newline and any
            # surrounding whitespace, so no separate strip() is needed.
            fields = line.split()
            if not fields:
                continue
            if len(fields) != 2:
                raise ValueError(
                    "%s:%d: expected '<aminoacid> <SMILES>', got %r"
                    % (filename, lineno, line.rstrip("\r\n"))
                )
            aa, smiles = fields
            result[aa] = smiles
    return result

# Fragment table, loaded once when the module is imported. The path is
# relative, so the file is looked up in the current working directory.
aaToSmiles = parseSmilesFragments("smiles2.txt")


def proteinToSmiles(protein):
    """Yield the SMILES fragment of every residue of a 1-letter sequence.

    Fragments are looked up in the module-level ``aaToSmiles`` table and
    yielded one per residue, in sequence order; callers typically glue them
    together with ``"".join(...)``.

    :param protein: iterable of 1-letter aminoacid codes (e.g. a string).
    :return: generator of SMILES fragment strings.
    :raises ValueError: when a code has no entry in ``aaToSmiles``. Because
        this is a generator, the error surfaces while iterating over the
        result, not when the function is called.
    """
    for aa in protein:
        smiles = aaToSmiles.get(aa)
        if smiles is None:
            # Fail loudly here: yielding None would only blow up later,
            # inside "".join, with an unrelated-looking TypeError.
            raise ValueError("unknown aminoacid type: %r" % (aa,))
        # TODO: the original author noted that proline ("P") is an exception
        # that may need to be processed differently; it is not special-cased.
        yield smiles


if __name__ == "__main__":
    # Self-check: build sequences of 1 to 4 distinct aminoacids, convert each
    # one to SMILES and make sure rdkit can parse the result.
    # rdkit is imported here, not at module level, so that plain imports of
    # this module keep working when rdkit is not installed.
    from rdkit import Chem
    from itertools import combinations
    for length in range(1, 5):
        # The length must be passed on to combinations(); previously the loop
        # variable was unused and the same pairs were checked four times.
        for x in combinations(aminoacids, length):
            seq = "".join(x)
            p = "".join(proteinToSmiles(seq))
            molecule = Chem.MolFromSmiles(p)
            # Explicit raise instead of `assert` so the check is not stripped
            # when Python runs with -O.
            if molecule is None:
                raise AssertionError(
                    "SMILES string for '%s' couldn't be parsed" % seq
                )
    print("If you see this line, everything went fine")

## smiles2.txt
G OC(=O)CN
A OC(=O)[C@H](C)N
S OC(=O)[C@@H](CO)N
D OC(=O)[C@@H](CC(=O)O)N
R OC(=O)C(CCCNC(=N)N)N
V OC(=O)[C@@H](C(C)(C))N
T OC(=O)[C@@H]([C@H](O)C)N
E OC(=O)[C@@H](CCC(=O)O)N
L OC(=O)[C@@H](CC(C)C)N
C OC(=O)[C@@H](CS)N
H OC(=O)[C@@H](CC1[N]=CNC=1)N
K OC(=O)[C@@H](CCCC(N))N
I OC(=O)[C@@H]([C@H](C)CC)N
Y OC(=O)[C@@H](CC1=CC=C(O)C=C1)N
N OC(=O)[C@@H](CC(=O)(N))N
M OC(=O)C(CCSC)N
P OC(=O)C1CCCN1
W OC(=O)[C@@H](Cc1c2c(cccc2)[nH]c1)N
F OC(=O)[C@@H](CC1=CC=CC=C1)N
Q OC(=O)[C@@H](CCC(=O)(N))N
	"""Alternative aminoacid to SMILES convertor

	If you want to check how it works, simply run
	```
	python aa_convertor.py
	```
	When run this way, the module expects rdkit to be installed; however, simply
	importing the module and calling its functions shouldn't fail when it is not.

	This module should be compatible with python 2 & 3.
	"""

	# One-letter codes of the aminoacids this module works with; the self-check
	# further down builds its test sequences from this alphabet.
	aminoacids = (
	    "GASRDVTELC"
	    "HKIYNMPWFQ"
	)

	def parseSmilesFragments(filename):
	    """Read a table of per-aminoacid SMILES fragments from a text file.

	    Every non-blank line is expected to hold two whitespace-separated fields:
	        <1letter aminoacid representation> <SMILES fragment>
	    where the fragment is laid out as
	        <COOH><alpha-carbon>(<sidechain starting from beta-carbon>)<N>

	    Blank lines (for example a trailing empty line) are skipped instead of
	    aborting the whole load.

	    :param filename: path of the fragments file.
	    :return: dict mapping each 1-letter code to its SMILES fragment; when a
	        code appears more than once, the last occurrence wins.
	    :raises ValueError: if a non-blank line does not consist of exactly two
	        fields; the message names the file and the line number.
	    """
	    result = dict()
	    with open(filename) as f:
	        for lineno, line in enumerate(f, 1):
	            # split() with no argument also discards the newline and any
	            # surrounding whitespace, so no separate strip() is needed.
	            fields = line.split()
	            if not fields:
	                continue
	            if len(fields) != 2:
	                raise ValueError(
	                    "%s:%d: expected '<aminoacid> <SMILES>', got %r"
	                    % (filename, lineno, line.rstrip("\r\n"))
	                )
	            aa, smiles = fields
	            result[aa] = smiles
	    return result

	# Fragment table, loaded once when the module is imported. The path is
	# relative, so the file is looked up in the current working directory.
	aaToSmiles = parseSmilesFragments("smiles2.txt")


	def proteinToSmiles(protein):
	    """Yield the SMILES fragment of every residue of a 1-letter sequence.

	    Fragments are looked up in the module-level ``aaToSmiles`` table and
	    yielded one per residue, in sequence order; callers typically glue them
	    together with ``"".join(...)``.

	    :param protein: iterable of 1-letter aminoacid codes (e.g. a string).
	    :return: generator of SMILES fragment strings.
	    :raises ValueError: when a code has no entry in ``aaToSmiles``. Because
	        this is a generator, the error surfaces while iterating over the
	        result, not when the function is called.
	    """
	    for aa in protein:
	        smiles = aaToSmiles.get(aa)
	        if smiles is None:
	            # Fail loudly here: yielding None would only blow up later,
	            # inside "".join, with an unrelated-looking TypeError.
	            raise ValueError("unknown aminoacid type: %r" % (aa,))
	        # TODO: the original author noted that proline ("P") is an exception
	        # that may need to be processed differently; it is not special-cased.
	        yield smiles


	if __name__ == "__main__":
	    # Self-check: build sequences of 1 to 4 distinct aminoacids, convert each
	    # one to SMILES and make sure rdkit can parse the result.
	    # rdkit is imported here, not at module level, so that plain imports of
	    # this module keep working when rdkit is not installed.
	    from rdkit import Chem
	    from itertools import combinations
	    for length in range(1, 5):
	        # The length must be passed on to combinations(); previously the loop
	        # variable was unused and the same pairs were checked four times.
	        for x in combinations(aminoacids, length):
	            seq = "".join(x)
	            p = "".join(proteinToSmiles(seq))
	            molecule = Chem.MolFromSmiles(p)
	            # Explicit raise instead of `assert` so the check is not stripped
	            # when Python runs with -O.
	            if molecule is None:
	                raise AssertionError(
	                    "SMILES string for '%s' couldn't be parsed" % seq
	                )
	    print("If you see this line, everything went fine")
	G OC(=O)CN
	A OC(=O)[C@H](C)N
	S OC(=O)[C@@H](CO)N
	D OC(=O)[C@@H](CC(=O)O)N
	R OC(=O)C(CCCNC(=N)N)N
	V OC(=O)[C@@H](C(C)(C))N
	T OC(=O)[C@@H]([C@H](O)C)N
	E OC(=O)[C@@H](CCC(=O)O)N
	L OC(=O)[C@@H](CC(C)C)N
	C OC(=O)[C@@H](CS)N
	H OC(=O)[C@@H](CC1[N]=CNC=1)N
	K OC(=O)[C@@H](CCCC(N))N
	I OC(=O)[C@@H]([C@H](C)CC)N
	Y OC(=O)[C@@H](CC1=CC=C(O)C=C1)N
	N OC(=O)[C@@H](CC(=O)(N))N
	M OC(=O)C(CCSC)N
	P OC(=O)C1CCCN1
	W OC(=O)[C@@H](Cc1c2c(cccc2)[nH]c1)N
	F OC(=O)[C@@H](CC1=CC=CC=C1)N
	Q OC(=O)[C@@H](CCC(=O)(N))N