Created
May 19, 2017 13:12
-
-
Save latticetower/8f26514ca355254fbe2de3060645c692 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Alternative aminoacid to SMILES convertor | |
If you want to check how it works, simply run | |
``` | |
python aa_convertor.py | |
``` | |
When run this way, the module expects rdkit to be installed; however, simply
importing the module and calling its functions should not fail when it is not.
This module should be compatible with Python 2 and 3.
""" | |
# One-letter codes of the 20 amino acids used by the self-check at the
# bottom of this module.
aminoacids = "GASRDVTELCHKIYNMPWFQ"
def parseSmilesFragments(filename):
    """Read a table of amino-acid SMILES fragments from a text file.

    Every non-blank line must hold exactly two whitespace-separated fields:

        <1letter aminoacid representation> <SMILES fragment>

    where the fragment is laid out as

        <COOH><alpha-carbon>(<sidechain starting from beta-carbon>)<N>

    Blank (or whitespace-only) lines are skipped, so e.g. a trailing empty
    line in the file does not break parsing.

    Returns:
        dict mapping the one-letter code to its SMILES fragment. If a code
        occurs more than once, the last occurrence wins.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields.
    """
    result = {}
    with open(filename) as f:
        for lineno, line in enumerate(f, 1):
            fields = line.split()
            if not fields:
                # Nothing but whitespace on this line.
                continue
            if len(fields) != 2:
                # Same exception type as a failed tuple unpacking, but the
                # message points at the offending line.
                raise ValueError(
                    "%s:%d: expected '<aminoacid> <SMILES>', got %r"
                    % (filename, lineno, line.rstrip("\n")))
            aa, smiles = fields
            result[aa] = smiles
    return result
import os

# Locate the fragment table. A "smiles2.txt" in the current working directory
# is used when present; otherwise fall back to the copy that sits next to
# this module, so that importing the module works from any directory.
_smilesPath = "smiles2.txt"
if not os.path.isfile(_smilesPath):
    _smilesPath = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "smiles2.txt")

# One-letter amino-acid code -> SMILES fragment, loaded once at import time.
aaToSmiles = parseSmilesFragments(_smilesPath)
def proteinToSmiles(protein):
    """Yield the SMILES fragment of each residue in a 1-letter sequence.

    `protein` is an iterable of one-letter amino-acid codes (typically a
    string). Each code is looked up in the module-level `aaToSmiles` table
    and the fragments are yielded in sequence order; join them with
    "".join(...) to obtain a single SMILES string.

    Raises:
        ValueError: when a code is not present in `aaToSmiles`. Because this
            is a generator, the error is raised when iteration reaches the
            offending residue, not when the function is called.
    """
    for position, aa in enumerate(protein):
        smiles = aaToSmiles.get(aa)
        if smiles is None:
            # Fail here with a clear message instead of yielding None, which
            # would only surface later as a TypeError inside "".join(...).
            raise ValueError(
                "unknown aminoacid %r at position %d" % (aa, position))
        # TODO: proline ("P") was flagged by the original author as an
        # exception that may need different processing; not implemented.
        yield smiles
if __name__ == "__main__":
    # Self-check: build SMILES strings for short residue combinations and
    # make sure rdkit can parse every one of them.
    # rdkit is imported here (not at module level) so that merely importing
    # this module does not require rdkit.
    from rdkit import Chem
    from itertools import combinations

    # Try combinations of 1 to 4 distinct residues. The combination length
    # follows the loop variable, so each length is checked exactly once.
    for length in range(1, 5):
        for residues in combinations(aminoacids, length):
            seq = "".join(residues)
            smiles = "".join(proteinToSmiles(seq))
            # MolFromSmiles returns None when the string cannot be parsed.
            molecule = Chem.MolFromSmiles(smiles)
            if molecule is None:
                # Explicit raise rather than `assert`, so the check is not
                # stripped when running with `python -O`.
                raise AssertionError(
                    "SMILES string for '%s' couldn't be parsed" % seq)
    print("If you see this line, everything went fine")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
G OC(=O)CN | |
A OC(=O)[C@H](C)N | |
S OC(=O)[C@@H](CO)N | |
D OC(=O)[C@@H](CC(=O)O)N | |
R OC(=O)C(CCCNC(=N)N)N | |
V OC(=O)[C@@H](C(C)(C))N | |
T OC(=O)[C@@H]([C@H](O)C)N | |
E OC(=O)[C@@H](CCC(=O)O)N | |
L OC(=O)[C@@H](CC(C)C)N | |
C OC(=O)[C@@H](CS)N | |
H OC(=O)[C@@H](CC1[N]=CNC=1)N | |
K OC(=O)[C@@H](CCCC(N))N | |
I OC(=O)[C@@H]([C@H](C)CC)N | |
Y OC(=O)[C@@H](CC1=CC=C(O)C=C1)N | |
N OC(=O)[C@@H](CC(=O)(N))N | |
M OC(=O)C(CCSC)N | |
P OC(=O)C1CCCN1 | |
W OC(=O)[C@@H](Cc1c2c(cccc2)[nH]c1)N | |
F OC(=O)[C@@H](CC1=CC=CC=C1)N | |
Q OC(=O)[C@@H](CCC(=O)(N))N |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment