Last active
September 26, 2019 14:43
-
-
Save atc3/f0260a99c5328a12f4b6fbbe65beec83 to your computer and use it in GitHub Desktop.
Valdar Sequence Conservation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sequence conservation score function, from Valdar 2001 - https://doi.org/10.1002/1097-0134\(20010101\)42:1<108::AID-PROT110>3.0.CO;2-O | |
import pandas as pd | |
import numpy as np | |
# Load substitution matrix (PET91) | |
# https://doi.org/10.1093/bioinformatics/8.3.275 | |
pet91 = pd.read_csv('pet91.csv', comment='!', index_col=0) | |
m = pet91 | |
# Get minimum and maximum values | |
m_min = np.min(m.stack().values) | |
m_max = np.max(m.stack().values) | |
# Define amino acid similarity function | |
def Mut(a, b): | |
if a == '-' or b == '-': | |
return 0 | |
return (m[a][b] - m_min) / (m_max - m_min) | |
# Define function for evolutionary distance between sequences | |
def Dist(s_j, s_k): | |
assert len(s_j) == len(s_k) | |
# Convert strings to lists, if not already | |
s_j = list(s_j) | |
s_k = list(s_k) | |
# Flag aligned positions (no gaps) | |
non_gap_positions = [] | |
for i in range(0, len(s_j)): | |
if s_j[i] == '-' or s_k[i] == '-': | |
continue | |
non_gap_positions.append(i) | |
# Count number of non-gap positions | |
n_aligned_jk = len(non_gap_positions) | |
# Calculate and sum, for each non-gap position, | |
# the similarity between the two amino acids | |
sum_similarity = 0. | |
for i in non_gap_positions: | |
sum_similarity = sum_similarity + Mut(s_j[i], s_k[i]) | |
return 1 - (sum_similarity / n_aligned_jk) | |
# Define sequence weighting function | |
def Weight(s_j, all_sequences): | |
assert s_j in all_sequences | |
sum_distances = 0. | |
for s_k in all_sequences: | |
# Need to cast to a string in order to compare equality | |
if ''.join(s_j) == ''.join(s_k): | |
continue | |
sum_distances = sum_distances + Dist(s_j, s_k) | |
return sum_distances / (len(all_sequences) - 1) | |
#Weight(sequence_mat[10], sequence_mat) | |
# Define conservation function | |
# Return a vector with the conservation score for each position in the sequence | |
def Cons(all_sequences): | |
sequence_length = len(all_sequences[0]) | |
sum_numerator = [0.,] * sequence_length | |
sum_denominator = 0. | |
for j in range(0, len(all_sequences)): | |
s_j = all_sequences[j] | |
W_j = Weight(s_j, all_sequences) | |
for k in range(j + 1, len(all_sequences)): | |
s_k = all_sequences[k] | |
W_k = Weight(s_k, all_sequences) | |
# For each AA position | |
for i in range(0, sequence_length): | |
sum_numerator[i] = sum_numerator[i] + (W_j * W_k * Mut(s_j[i], s_k[i])) | |
sum_denominator = sum_denominator + (W_j * W_k) | |
result = sum_numerator | |
result = [x / sum_denominator for x in result] | |
return result | |
# Example: | |
sequence_mat = [ | |
list('GADGVGNASGNWHCDSTWLGDRVITTSTRTWALPTYNNHLYKQISSAST-GASNDNHYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKRLNFKLFNIQVKEVTTNDGVTTIANNLTSTVQVFSDSEYQLPYVLGSAHQGCLPPFPADVFMIPQYGYLTLNNG---SQAVGRSSFYCLEYFPSQMLRTGNNFTFSYTFEEVPFHSSYAHSQSLDRLMNPLIDQYLYYLNRTQN-QSGSAQNKDLLFSRGSPAGMSVQPKNWLPGPCYRQQRVSKTKTDNNNSN-----FTWTGASKYNLNGRESIINPGTAMASHKDDEDKFFPMSGVMIFGKESAGA---SNTALDNVMITDEEEIKATNPVATERFGTVAVNFQSSSTDPATGDVHAMGALPGMVWQDRDVYLQGPIWAKIPHTDGHFHPSPLMGGFGLKNPPPQILIKNTPVPANPPAEFSATKFASFITQYSTGQVSVEIEWELQKENSKRWNPEVQYTSNYAKSANVDFTVDNNGLYTEPRPIGTRYLTRPL'), | |
list('--DGVGNSSGNWHCDSTWMGDRVITTSTRTWALPTYNNHLYKQISSQ-S-GASNDNHYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKRLNFKLFNIQVKEVTQNDGTTTIANNLTSTVQVFTDSEYQLPYVLGSAHQGCLPPFPADVFMVPQYGYLTLNNG---SQAVGRSSFYCLEYFPSQMLRTGNNFTFSYTFEDVPFHSSYAHSQSLDRLMNPLIDQYLYYLSRTN-TPSGTTTQSRLQFSQAGASDIRDQSRNWLPGPCYRQQRVSKTSADNNNSE-----YSWTGATKYHLNGRDSLVNPGPAMASHKDDEEKFFPQSGVLIFGKQGSEK---TNVDIEKVMITDEEEIRTTNPVATEQYGSVSTNLQRGNRQAATADVNTQGVLPGMVWQDRDVYLQGPIWAKIPHTDGHFHPSPLMGGFGLKHPPPQILIKNTPVPANPSTTFSAAKFASFITQYSTGQVSVEIEWELQKENSKRWNPEIQYTSNYNKSVNVDFTVDTNGVYSEPRPIGTRYLTRNL'), | |
list('GADGVGNSSGNWHCDSTWMGDRVITTSTRTWALPTYNNHLYKQISSQ-S-GASNDNHYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKRLNFKLFNIQVKEVTQNDGTTTIANNLTSTVQVFTDSEYQLPYVLGSAHQGCLPPFPADVFMVPQYGYLTLNNG---SQAVGRSSFYCLEYFPSQMLRTGNNFTFSYTFEDVPFHSSYAHSQSLDRLMNPLIDQYLYYLSRTN-TPSGTTTQSRLQFSQAGASDIRDQSRNWLPGPCYRQQRVSKTSADNNNSE-----YSWTGATKYHLNGRDSLVNPGPAMASHKDDEEKFFPQSGVLIFGKQGSEK---TNVDIEKVMITDEEEIRTTNPVATEQYGSVSTNLQRGNRQAATADVNTQGVLPGMVWQDRDVYLQGPIWAKIPHTDGHFHPSPLMGGFGLKHPPPQILIKNTPVPANPSTTFSAAKFASFITQYSTGQVSVEIEWELQKENSKRWNPEIQYTSNYNKSVNVDFTVDTNGVYSEPRPIGTRYLTRNL'), | |
list('GADGVGNSSGNWHCDSQWLGDRVITTSTRTWALPTYNNHLYKQISSQ-S-GASNDNHYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKKLSFKLFNIQVKEVTQNDGTTTIANNLTSTVQVFTDSEYQLPYVLGSAHQGCLPPFPADVFMVPQYGYLTLNNG---SQAVGRSSFYCLEYFPSQMLRTGNNFQFSYTFEDVPFHSSYAHSQSLDRLMNPLIDQYLYYLNRTQGTTSGTTNQSRLLFSQAGPQSMSLQARNWLPGPCYRQQRLSKTANDNNNSN-----FPWTAASKYHLNGRDSLVNPGPAMASHKDDEEKFFPMHGNLIFGKEGTTA---SNAELDNVMITDEEEIRTTNPVATEQYGTVANNLQSSNTAPTTRTVNDQGALPGMVWQDRDVYLQGPIWAKIPHTDGHFHPSPLMGGFGLKHPPPQIMIKNTPVPANPPTTFSPAKFASFITQYSTGQVSVEIEWELQKENSKRWNPEIQYTSNYNKSVNVDFTVDTNGVYSEPRPIGTRYLTRNL'), | |
list('GADGVGNASGDWHCDSTWSEGHVTTTSTRTWVLPTYNNHLYKRLG-----ESLQSNTYNGFSTPWGYFDFNRFHCHFSPRDWQRLINNNWGMRPKAMRVKIFNIQVKEVTTSNGETTVANNLTSTVQIFADSSYELPYVMDAGQEGSLPPFPNDVFMVPQYGYCGLVTGNTSQQQTDRNAFYCLEYFPSQMLRTGNNFEITYSFEKVPFHSMYAHSQSLDRLMNPLIDQYLWGLQSTTTGTTLNAGTATTNFTKLRPTNFSNFKKNWLPGPSIKQQGFSKTANQNYKIPATGSDSLIKYETHSTLDGRWSALTPGPPMATAGPADSKFS-NSQLIFAGPKQNGN---TATVPGTLIFTSEEELAATNATDTDMWGNLPGGDQSNSNLPTVDRLTALGAVPGMVWQNRDIYYQGPIWAKIPHTDGHFHPSPLIGGFGLKHPPPQIFIKNTPVPANPATTFSSTPVNSFITQYSTGQVSVQIDWEIQKERSKRWNPEVQFTSNYGQQNSLLWAPDAAGKYTEPRAIGTRYLTHHL'), | |
list('--DGVGNASGDWHCDSTWMGDRVVTKSTRTWVLPSYNNHQYREIKSGSV-DGSNANAYFGYSTPWGYFDFNRFHSHWSPRDWQRLINNYWGFRPRSLRVKIFNIQVKEVTVQDSTTTIANNLTSTVQVFTDDDYQLPYVVGNGTEGCLPAFPPQVFTLPQYGYATLNRDNT-ENPTERSSFFCLEYFPSKMLRTGNNFEFTYNFEEVPFHSSFAPSQNLFKLANPLVDQYLYRFVSTNNT-------GGVQFNKNLAGRYANTYKNWFPGPMGRTQGWNLGSGVNRASV-----SAFATTNRMELEGASYQVPPQPNGMTNNLQGSNTYALENTMIFNSQPANPGTTATYLEGNMLITSESETQPVNRVAYNVGGQMATNNQSSTTAPATGTYNLQEIVPGSVWMERDVYLQGPIWAKIPETGAHFHPSPAMGGFGLKHPPPMMLIKNTPVPGNI-TSFSDVPVSSFITQYSTGQVTVEMEWELKKENSKRWNPEIQYTNNYNDPQFVDFAPDSTGEYRTTRPIGTRYLTRPL'), | |
list('-ADGVGNASGNWHCDSTWLGDRVITTSTRTWALPTYNNHLYKQISSAST-GASNDNHYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKRLNFKLFNIQVKEVTTNDGVTTIANNLTSTVQVFSDSEYQLPYVLGSAHQGCLPPFPADVFMIPQYGYLTLNNG---SQAVGRSSFYCLEYFPSQMLRTGNNFTFSYTFEDVPFHSSYAHSQSLDRLMNPLIDQYLYYLNRTQN-QSGSAQNKDLLFSRGSPAGMSVQPKNWLPGPCYRQQRVSKTKTDNNNSN-----FTWTGASKYNLNGRESIINPGTAMASHKDDKDKFFPMSGVMIFGKESAGA---SNTALDNVMITDEEEIKATNPVATERFGTVAVNLQSSSTDPATGDVHVMGALPGMVWQDRDVYLQGPIWAKIPHTDGHFHPSPLMGGFGLKHPPPQILIKNTPVPANPPAEFSATKFASFITQYSTGQVSVEIEWELQKENSKRWNPEVQYTSNYAKSANVDFTVDNNGLYTEPRPIGTRYLTRPL'), | |
list('GADGVGNASGNWHCDSTWLGDRVITTSTRTWALPTYNNHLYKQISSAST-GASNDNHYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKRLNFKLFNIQVKEVTTNDGVTTIANNLTSTVQVFSDSEYQLPYVLGSAHQGCLPPFPADVFMIPQYGYLTLNNG---SQAVGRSSFYCLEYFPSQMLRTGNNFTFSYTFEDVPFHSSYAHSQSLDRLMNPLIDQYLYYLNRTQN-QSGSAQNKDLLFSRGSPAGMSVQPKNWLPGPCYRQQRVSKTKTDNNNSN-----FTWTGASKYNLNGRESIINPGTAMASHKDDKDKFFPMSGVMIFGKESAGA---SNTALDNVMITDEEEIKATNPVATERFGTVAVNLQSSSTDPATGDVHVMGALPGMVWQDRDVYLQGPIWAKIPHTDGHFHPSPLMGGFGLKHPPPQILIKNTPVPANPPAEFSATKFASFITQYSTGQVSVEIEWELQKENSKRWNPEVQYTSNYAKSANVDFTVDNNGLYTEPRPIGTRYLTRPL'), | |
list('--DGVGSSSGNWHCDSQWLGDRVITTSTRTWALPTYNNHLYKQISNSTSGGSSNDNAYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKRLNFKLFNIQVKEVTDNNGVKTIANNLTSTVQVFTDSDYQLPYVLGSAHEGCLPPFPADVFMIPQYGYLTLNDG---SQAVGRSSFYCLEYFPSQMLRTGNNFQFSYEFENVPFHSSYAHSQSLDRLMNPLIDQYLYYLSKTING--SGQNQQTLKFSVAGPSNMAVQGRNYIPGPSYRQQRVSTTVTQNNNSE-----FAWPGASSWALNGRNSLMNPGPAMASHKEGEDRFFPLSGSLIFGKQGTGR---DNVDADKVMITNEEEIKTTNPVATESYGQVATNHQSAQAQAQTGWVQNQGILPGMVWQDRDVYLQGPIWAKIPHTDGNFHPSPLMGGFGMKHPPPQILIKNTPVPADPPTAFNKDKLNSFITQYSTGQVSVEIEWELQKENSKRWNPEIQYTSNYYKSNNVEFAVNTEGVYSEPRPIGTRYLTRNL'), | |
list('GADGVGNSSGNWHCDSTWLGDRVITTSTRTWALPTYNNHLYKQISNGTSGGSTNDNTYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKRLNFKLFNIQVKEVTTNEGTKTIANNLTSTVQVFTDSEYQLPYVLGSAHQGCLPPFPADVFMVPQYGYLTLNNG---SQALGRSSFYCLEYFPSQMLRTGNNFQFSYTFEDVPFHSSYAHSQSLDRLMNPLIDQYLYYLVRTQTT--GTGGTQTLAFSQAGPSSMANQARNWVPGPCYRQQRVSTTTNQNNNSN-----FAWTGAAKFKLNGRDSLMNPGVAMASHKDDDDRFFPSSGVLIFGKQGAGN---DGVDYSQVLITDEEEIKATNPVATEEYGAVAINNQAANTQAQTGLVHNQGVIPGMVWQNRDVYLQGPIWAKIPHTDGNFHPSPLMGGFGLKHPPPQILIKNTPVPADPPLTFNQAKLNSFITQYSTGQVSVEIEWELQKENSKRWNPEIQYTSNYYKSTNVDFAVNTEGVYSEPRPIGTRYLTRNL'), | |
list('---GVGNASGDWHCDSTWSEGKVTTTSTRTWVLPTYNNHLYLRLG-----TTSNSNTYNGFSTPWGYFDFNRFHCHFSPRDWQRLINNNWGLRPKAMRVKIFNIQVKEVTTSNGETTVANNLTSTVQIFADSSYELPYVMDAGQEGSLPPFPNDVFMVPQYGYCGIVTGEN-QNQTDRNAFYCLEYFPSQMLRTGNNFEMAYNFEKVPFHSMYAHSQSLDRLMNPLLDQYLWHLQSTTSGETLNQGNAATTFGKIRSGDFAFYRKNWLPGPCVKQQRFSKTASQNYKIPASGGNALLKYDTHYTLNNRWSNIAPGPPMATAGPSDGDFS-NAQLIFPGPSVTGN---TTTSANNLLFTSEEEIAATNPRDTDMFGQIADNNQNATTAPITGNVTAMGVLPGMVWQNRDIYYQGPIWAKIPHADGHFHPSPLIGGFGLKHPPPQIFIKNTPVPANPATTFTAARVDSFITQYSTGQVAVQIEWEIEKERSKRWNPEVQFTSNYGNQSSMLWAPDTTGKYTEPRVIGSRYLTNHL'), | |
list('--DGVGNSSGNWHCDSTWMGDRVITTSTRTWALPTYNNHLYKQISSQ-S-GASNDNHYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKRLNFKLFNIQVKEVTQNDGTTTIANNLTSTVQVFTDSEYQLPYVLGSAHQGCLPPFPADVFMVPQYGYLTLNNG---SQAVGRSSFYCLEYFPSQMLRTGNNFTFSYTFEDVPFHSSYAHSQSLDRLMNPLIDQYLYYLSRTN-TPSGTTTQSRLQFSQAGASDIRDQSRNWLPGPCYRQQRVSKTSADNNNSE-----YSWTGATKYHLNGRDSLVNPGPAMASHKDDEEKFFPQSGVLIFGKQGSEK---TNVDIEKVMITDEEEIRTTNPVATEQYGSVSTNLQRGNRQAATADVNTQGVLPGMVWQDRDVYLQGPIWAKIPHTDGHFHPSPLMGGFGLKHPPPQILIKNTPVPANPSTTFSAAKFASFITQYSTGQVSVEIEWELQKENSKRWNPEIQYTSNYNKSVNVDFTVDTNGVYSEPRPIGTRYLTRNL') | |
] | |
Cons(sequence_mat) |
We can make this file beautiful and searchable if this error is corrected: It looks like row 2 should actually have 1 column, instead of 2 in line 1.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
! Taken from: https://github.com/ACRMGroup/bioplib/blob/master/data/pet91.mat | |
! PET91 Matrix - Jones, Taylor and Thornton 1991 | |
! This is an update of the MDM78 Dayhoff matrix normalised such that | |
! all maxima are on the diagonal with a score of 10 | |
,A,R,N,D,C,Q,E,G,H,I,L,K,M,F,P,S,T,W,Y,V,B,Z,X,- | |
A,10,-1,0,-1,-1,-1,-1,1,-2,0,-1,-1,-1,-3,1,1,2,-4,-3,1,0,-1,0,0 | |
R,-1,10,0,-1,-1,2,0,0,2,-3,-3,4,-2,-4,-1,-1,-1,0,-2,-3,0,1,0,0 | |
N,0,0,10,2,-1,0,1,0,1,-2,-3,1,-2,-3,-1,1,1,-4,-1,-2,2,0,0,0 | |
D,-1,-1,2,10,-3,0,4,1,0,-3,-4,0,-3,-5,-2,0,-1,-5,-2,-3,3,2,0,0 | |
C,-1,-1,-1,-3,10,-3,-4,-1,0,-2,-3,-3,-2,0,-2,1,-1,1,2,-2,-2,-3,0,0 | |
Q,-1,2,0,0,-3,10,2,-1,3,-3,-2,2,-2,-4,0,-1,-1,-3,-1,-3,0,3,0,0 | |
E,-1,0,1,4,-4,2,10,1,0,-3,-4,1,-3,-5,-2,-1,-1,-5,-4,-2,2,3,0,0 | |
G,1,0,0,1,-1,-1,1,10,-2,-3,-4,-1,-3,-5,-1,1,0,-2,-4,-2,0,0,0,0 | |
H,-2,2,1,0,0,3,0,-2,10,-3,-2,1,-2,0,0,-1,-1,-3,4,-3,0,1,0,0 | |
I,0,-3,-2,-3,-2,-3,-3,-3,-3,10,2,-3,3,0,-2,-1,1,-4,-2,4,-2,-3,0,0 | |
L,-1,-3,-3,-4,-3,-2,-4,-4,-2,2,10,-3,3,2,0,-2,-1,-2,-1,2,-3,-3,0,0 | |
K,-1,4,1,0,-3,2,1,-1,1,-3,-3,10,-2,-5,-2,-1,-1,-3,-3,-3,0,1,0,0 | |
M,-1,-2,-2,-3,-2,-2,-3,-3,-2,3,3,-2,10,0,-2,-1,0,-3,-3,2,-2,-2,0,0 | |
F,-3,-4,-3,-5,0,-4,-5,-5,0,0,2,-5,0,10,-2,-2,-2,-1,5,0,-4,-4,0,0 | |
P,1,-1,-1,-2,-2,0,-2,-1,0,-2,0,-2,-2,-2,10,1,1,-5,-3,-1,-1,-1,0,0 | |
S,1,-1,1,0,1,-1,-1,1,-1,-1,-2,-1,-1,-2,1,10,1,-3,-1,-1,0,-1,0,0 | |
T,2,-1,1,-1,-1,-1,-1,0,-1,1,-1,-1,0,-2,1,1,10,-4,-3,0,0,-1,0,0 | |
W,-4,0,-4,-5,1,-3,-5,-2,-3,-4,-2,-3,-3,-1,-5,-3,-4,10,0,-4,-4,-4,0,0 | |
Y,-3,-2,-1,-2,2,-1,-4,-4,4,-2,-1,-3,-3,5,-3,-1,-3,0,10,-3,-1,-2,0,0 | |
V,1,-3,-2,-3,-2,-3,-2,-2,-3,4,2,-3,2,0,-1,-1,0,-4,-3,10,-2,-2,0,0 | |
B,0,0,2,3,-2,0,2,0,0,-2,-3,0,-2,-4,-1,0,0,-4,-1,-2,10,1,0,0 | |
Z,-1,1,0,2,-3,3,3,0,1,-3,-3,1,-2,-4,-1,-1,-1,-4,-2,-2,1,10,0,0 | |
X,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0 | |
-,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment