Skip to content

Instantly share code, notes, and snippets.

@atc3
Last active September 26, 2019 14:43
Show Gist options
  • Save atc3/f0260a99c5328a12f4b6fbbe65beec83 to your computer and use it in GitHub Desktop.
Save atc3/f0260a99c5328a12f4b6fbbe65beec83 to your computer and use it in GitHub Desktop.
Valdar Sequence Conservation
# Sequence conservation score function, from Valdar 2001 - https://doi.org/10.1002/1097-0134\(20010101\)42:1<108::AID-PROT110>3.0.CO;2-O
import pandas as pd
import numpy as np
# Load substitution matrix (PET91)
# https://doi.org/10.1093/bioinformatics/8.3.275
pet91 = pd.read_csv('pet91.csv', comment='!', index_col=0)
m = pet91
# Get minimum and maximum values
m_min = np.min(m.stack().values)
m_max = np.max(m.stack().values)
# Define amino acid similarity function
def Mut(a, b):
if a == '-' or b == '-':
return 0
return (m[a][b] - m_min) / (m_max - m_min)
# Define function for evolutionary distance between sequences
def Dist(s_j, s_k):
assert len(s_j) == len(s_k)
# Convert strings to lists, if not already
s_j = list(s_j)
s_k = list(s_k)
# Flag aligned positions (no gaps)
non_gap_positions = []
for i in range(0, len(s_j)):
if s_j[i] == '-' or s_k[i] == '-':
continue
non_gap_positions.append(i)
# Count number of non-gap positions
n_aligned_jk = len(non_gap_positions)
# Calculate and sum, for each non-gap position,
# the similarity between the two amino acids
sum_similarity = 0.
for i in non_gap_positions:
sum_similarity = sum_similarity + Mut(s_j[i], s_k[i])
return 1 - (sum_similarity / n_aligned_jk)
# Define sequence weighting function
def Weight(s_j, all_sequences):
assert s_j in all_sequences
sum_distances = 0.
for s_k in all_sequences:
# Need to cast to a string in order to compare equality
if ''.join(s_j) == ''.join(s_k):
continue
sum_distances = sum_distances + Dist(s_j, s_k)
return sum_distances / (len(all_sequences) - 1)
#Weight(sequence_mat[10], sequence_mat)
# Define conservation function
# Return a vector with the conservation score for each position in the sequence
def Cons(all_sequences):
sequence_length = len(all_sequences[0])
sum_numerator = [0.,] * sequence_length
sum_denominator = 0.
for j in range(0, len(all_sequences)):
s_j = all_sequences[j]
W_j = Weight(s_j, all_sequences)
for k in range(j + 1, len(all_sequences)):
s_k = all_sequences[k]
W_k = Weight(s_k, all_sequences)
# For each AA position
for i in range(0, sequence_length):
sum_numerator[i] = sum_numerator[i] + (W_j * W_k * Mut(s_j[i], s_k[i]))
sum_denominator = sum_denominator + (W_j * W_k)
result = sum_numerator
result = [x / sum_denominator for x in result]
return result
# Example:
sequence_mat = [
list('GADGVGNASGNWHCDSTWLGDRVITTSTRTWALPTYNNHLYKQISSAST-GASNDNHYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKRLNFKLFNIQVKEVTTNDGVTTIANNLTSTVQVFSDSEYQLPYVLGSAHQGCLPPFPADVFMIPQYGYLTLNNG---SQAVGRSSFYCLEYFPSQMLRTGNNFTFSYTFEEVPFHSSYAHSQSLDRLMNPLIDQYLYYLNRTQN-QSGSAQNKDLLFSRGSPAGMSVQPKNWLPGPCYRQQRVSKTKTDNNNSN-----FTWTGASKYNLNGRESIINPGTAMASHKDDEDKFFPMSGVMIFGKESAGA---SNTALDNVMITDEEEIKATNPVATERFGTVAVNFQSSSTDPATGDVHAMGALPGMVWQDRDVYLQGPIWAKIPHTDGHFHPSPLMGGFGLKNPPPQILIKNTPVPANPPAEFSATKFASFITQYSTGQVSVEIEWELQKENSKRWNPEVQYTSNYAKSANVDFTVDNNGLYTEPRPIGTRYLTRPL'),
list('--DGVGNSSGNWHCDSTWMGDRVITTSTRTWALPTYNNHLYKQISSQ-S-GASNDNHYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKRLNFKLFNIQVKEVTQNDGTTTIANNLTSTVQVFTDSEYQLPYVLGSAHQGCLPPFPADVFMVPQYGYLTLNNG---SQAVGRSSFYCLEYFPSQMLRTGNNFTFSYTFEDVPFHSSYAHSQSLDRLMNPLIDQYLYYLSRTN-TPSGTTTQSRLQFSQAGASDIRDQSRNWLPGPCYRQQRVSKTSADNNNSE-----YSWTGATKYHLNGRDSLVNPGPAMASHKDDEEKFFPQSGVLIFGKQGSEK---TNVDIEKVMITDEEEIRTTNPVATEQYGSVSTNLQRGNRQAATADVNTQGVLPGMVWQDRDVYLQGPIWAKIPHTDGHFHPSPLMGGFGLKHPPPQILIKNTPVPANPSTTFSAAKFASFITQYSTGQVSVEIEWELQKENSKRWNPEIQYTSNYNKSVNVDFTVDTNGVYSEPRPIGTRYLTRNL'),
list('GADGVGNSSGNWHCDSTWMGDRVITTSTRTWALPTYNNHLYKQISSQ-S-GASNDNHYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKRLNFKLFNIQVKEVTQNDGTTTIANNLTSTVQVFTDSEYQLPYVLGSAHQGCLPPFPADVFMVPQYGYLTLNNG---SQAVGRSSFYCLEYFPSQMLRTGNNFTFSYTFEDVPFHSSYAHSQSLDRLMNPLIDQYLYYLSRTN-TPSGTTTQSRLQFSQAGASDIRDQSRNWLPGPCYRQQRVSKTSADNNNSE-----YSWTGATKYHLNGRDSLVNPGPAMASHKDDEEKFFPQSGVLIFGKQGSEK---TNVDIEKVMITDEEEIRTTNPVATEQYGSVSTNLQRGNRQAATADVNTQGVLPGMVWQDRDVYLQGPIWAKIPHTDGHFHPSPLMGGFGLKHPPPQILIKNTPVPANPSTTFSAAKFASFITQYSTGQVSVEIEWELQKENSKRWNPEIQYTSNYNKSVNVDFTVDTNGVYSEPRPIGTRYLTRNL'),
list('GADGVGNSSGNWHCDSQWLGDRVITTSTRTWALPTYNNHLYKQISSQ-S-GASNDNHYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKKLSFKLFNIQVKEVTQNDGTTTIANNLTSTVQVFTDSEYQLPYVLGSAHQGCLPPFPADVFMVPQYGYLTLNNG---SQAVGRSSFYCLEYFPSQMLRTGNNFQFSYTFEDVPFHSSYAHSQSLDRLMNPLIDQYLYYLNRTQGTTSGTTNQSRLLFSQAGPQSMSLQARNWLPGPCYRQQRLSKTANDNNNSN-----FPWTAASKYHLNGRDSLVNPGPAMASHKDDEEKFFPMHGNLIFGKEGTTA---SNAELDNVMITDEEEIRTTNPVATEQYGTVANNLQSSNTAPTTRTVNDQGALPGMVWQDRDVYLQGPIWAKIPHTDGHFHPSPLMGGFGLKHPPPQIMIKNTPVPANPPTTFSPAKFASFITQYSTGQVSVEIEWELQKENSKRWNPEIQYTSNYNKSVNVDFTVDTNGVYSEPRPIGTRYLTRNL'),
list('GADGVGNASGDWHCDSTWSEGHVTTTSTRTWVLPTYNNHLYKRLG-----ESLQSNTYNGFSTPWGYFDFNRFHCHFSPRDWQRLINNNWGMRPKAMRVKIFNIQVKEVTTSNGETTVANNLTSTVQIFADSSYELPYVMDAGQEGSLPPFPNDVFMVPQYGYCGLVTGNTSQQQTDRNAFYCLEYFPSQMLRTGNNFEITYSFEKVPFHSMYAHSQSLDRLMNPLIDQYLWGLQSTTTGTTLNAGTATTNFTKLRPTNFSNFKKNWLPGPSIKQQGFSKTANQNYKIPATGSDSLIKYETHSTLDGRWSALTPGPPMATAGPADSKFS-NSQLIFAGPKQNGN---TATVPGTLIFTSEEELAATNATDTDMWGNLPGGDQSNSNLPTVDRLTALGAVPGMVWQNRDIYYQGPIWAKIPHTDGHFHPSPLIGGFGLKHPPPQIFIKNTPVPANPATTFSSTPVNSFITQYSTGQVSVQIDWEIQKERSKRWNPEVQFTSNYGQQNSLLWAPDAAGKYTEPRAIGTRYLTHHL'),
list('--DGVGNASGDWHCDSTWMGDRVVTKSTRTWVLPSYNNHQYREIKSGSV-DGSNANAYFGYSTPWGYFDFNRFHSHWSPRDWQRLINNYWGFRPRSLRVKIFNIQVKEVTVQDSTTTIANNLTSTVQVFTDDDYQLPYVVGNGTEGCLPAFPPQVFTLPQYGYATLNRDNT-ENPTERSSFFCLEYFPSKMLRTGNNFEFTYNFEEVPFHSSFAPSQNLFKLANPLVDQYLYRFVSTNNT-------GGVQFNKNLAGRYANTYKNWFPGPMGRTQGWNLGSGVNRASV-----SAFATTNRMELEGASYQVPPQPNGMTNNLQGSNTYALENTMIFNSQPANPGTTATYLEGNMLITSESETQPVNRVAYNVGGQMATNNQSSTTAPATGTYNLQEIVPGSVWMERDVYLQGPIWAKIPETGAHFHPSPAMGGFGLKHPPPMMLIKNTPVPGNI-TSFSDVPVSSFITQYSTGQVTVEMEWELKKENSKRWNPEIQYTNNYNDPQFVDFAPDSTGEYRTTRPIGTRYLTRPL'),
list('-ADGVGNASGNWHCDSTWLGDRVITTSTRTWALPTYNNHLYKQISSAST-GASNDNHYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKRLNFKLFNIQVKEVTTNDGVTTIANNLTSTVQVFSDSEYQLPYVLGSAHQGCLPPFPADVFMIPQYGYLTLNNG---SQAVGRSSFYCLEYFPSQMLRTGNNFTFSYTFEDVPFHSSYAHSQSLDRLMNPLIDQYLYYLNRTQN-QSGSAQNKDLLFSRGSPAGMSVQPKNWLPGPCYRQQRVSKTKTDNNNSN-----FTWTGASKYNLNGRESIINPGTAMASHKDDKDKFFPMSGVMIFGKESAGA---SNTALDNVMITDEEEIKATNPVATERFGTVAVNLQSSSTDPATGDVHVMGALPGMVWQDRDVYLQGPIWAKIPHTDGHFHPSPLMGGFGLKHPPPQILIKNTPVPANPPAEFSATKFASFITQYSTGQVSVEIEWELQKENSKRWNPEVQYTSNYAKSANVDFTVDNNGLYTEPRPIGTRYLTRPL'),
list('GADGVGNASGNWHCDSTWLGDRVITTSTRTWALPTYNNHLYKQISSAST-GASNDNHYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKRLNFKLFNIQVKEVTTNDGVTTIANNLTSTVQVFSDSEYQLPYVLGSAHQGCLPPFPADVFMIPQYGYLTLNNG---SQAVGRSSFYCLEYFPSQMLRTGNNFTFSYTFEDVPFHSSYAHSQSLDRLMNPLIDQYLYYLNRTQN-QSGSAQNKDLLFSRGSPAGMSVQPKNWLPGPCYRQQRVSKTKTDNNNSN-----FTWTGASKYNLNGRESIINPGTAMASHKDDKDKFFPMSGVMIFGKESAGA---SNTALDNVMITDEEEIKATNPVATERFGTVAVNLQSSSTDPATGDVHVMGALPGMVWQDRDVYLQGPIWAKIPHTDGHFHPSPLMGGFGLKHPPPQILIKNTPVPANPPAEFSATKFASFITQYSTGQVSVEIEWELQKENSKRWNPEVQYTSNYAKSANVDFTVDNNGLYTEPRPIGTRYLTRPL'),
list('--DGVGSSSGNWHCDSQWLGDRVITTSTRTWALPTYNNHLYKQISNSTSGGSSNDNAYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKRLNFKLFNIQVKEVTDNNGVKTIANNLTSTVQVFTDSDYQLPYVLGSAHEGCLPPFPADVFMIPQYGYLTLNDG---SQAVGRSSFYCLEYFPSQMLRTGNNFQFSYEFENVPFHSSYAHSQSLDRLMNPLIDQYLYYLSKTING--SGQNQQTLKFSVAGPSNMAVQGRNYIPGPSYRQQRVSTTVTQNNNSE-----FAWPGASSWALNGRNSLMNPGPAMASHKEGEDRFFPLSGSLIFGKQGTGR---DNVDADKVMITNEEEIKTTNPVATESYGQVATNHQSAQAQAQTGWVQNQGILPGMVWQDRDVYLQGPIWAKIPHTDGNFHPSPLMGGFGMKHPPPQILIKNTPVPADPPTAFNKDKLNSFITQYSTGQVSVEIEWELQKENSKRWNPEIQYTSNYYKSNNVEFAVNTEGVYSEPRPIGTRYLTRNL'),
list('GADGVGNSSGNWHCDSTWLGDRVITTSTRTWALPTYNNHLYKQISNGTSGGSTNDNTYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKRLNFKLFNIQVKEVTTNEGTKTIANNLTSTVQVFTDSEYQLPYVLGSAHQGCLPPFPADVFMVPQYGYLTLNNG---SQALGRSSFYCLEYFPSQMLRTGNNFQFSYTFEDVPFHSSYAHSQSLDRLMNPLIDQYLYYLVRTQTT--GTGGTQTLAFSQAGPSSMANQARNWVPGPCYRQQRVSTTTNQNNNSN-----FAWTGAAKFKLNGRDSLMNPGVAMASHKDDDDRFFPSSGVLIFGKQGAGN---DGVDYSQVLITDEEEIKATNPVATEEYGAVAINNQAANTQAQTGLVHNQGVIPGMVWQNRDVYLQGPIWAKIPHTDGNFHPSPLMGGFGLKHPPPQILIKNTPVPADPPLTFNQAKLNSFITQYSTGQVSVEIEWELQKENSKRWNPEIQYTSNYYKSTNVDFAVNTEGVYSEPRPIGTRYLTRNL'),
list('---GVGNASGDWHCDSTWSEGKVTTTSTRTWVLPTYNNHLYLRLG-----TTSNSNTYNGFSTPWGYFDFNRFHCHFSPRDWQRLINNNWGLRPKAMRVKIFNIQVKEVTTSNGETTVANNLTSTVQIFADSSYELPYVMDAGQEGSLPPFPNDVFMVPQYGYCGIVTGEN-QNQTDRNAFYCLEYFPSQMLRTGNNFEMAYNFEKVPFHSMYAHSQSLDRLMNPLLDQYLWHLQSTTSGETLNQGNAATTFGKIRSGDFAFYRKNWLPGPCVKQQRFSKTASQNYKIPASGGNALLKYDTHYTLNNRWSNIAPGPPMATAGPSDGDFS-NAQLIFPGPSVTGN---TTTSANNLLFTSEEEIAATNPRDTDMFGQIADNNQNATTAPITGNVTAMGVLPGMVWQNRDIYYQGPIWAKIPHADGHFHPSPLIGGFGLKHPPPQIFIKNTPVPANPATTFTAARVDSFITQYSTGQVAVQIEWEIEKERSKRWNPEVQFTSNYGNQSSMLWAPDTTGKYTEPRVIGSRYLTNHL'),
list('--DGVGNSSGNWHCDSTWMGDRVITTSTRTWALPTYNNHLYKQISSQ-S-GASNDNHYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKRLNFKLFNIQVKEVTQNDGTTTIANNLTSTVQVFTDSEYQLPYVLGSAHQGCLPPFPADVFMVPQYGYLTLNNG---SQAVGRSSFYCLEYFPSQMLRTGNNFTFSYTFEDVPFHSSYAHSQSLDRLMNPLIDQYLYYLSRTN-TPSGTTTQSRLQFSQAGASDIRDQSRNWLPGPCYRQQRVSKTSADNNNSE-----YSWTGATKYHLNGRDSLVNPGPAMASHKDDEEKFFPQSGVLIFGKQGSEK---TNVDIEKVMITDEEEIRTTNPVATEQYGSVSTNLQRGNRQAATADVNTQGVLPGMVWQDRDVYLQGPIWAKIPHTDGHFHPSPLMGGFGLKHPPPQILIKNTPVPANPSTTFSAAKFASFITQYSTGQVSVEIEWELQKENSKRWNPEIQYTSNYNKSVNVDFTVDTNGVYSEPRPIGTRYLTRNL')
]
Cons(sequence_mat)
We can make this file beautiful and searchable if this error is corrected: It looks like row 2 should actually have 1 column, instead of 2 in line 1.
! Taken from: https://github.com/ACRMGroup/bioplib/blob/master/data/pet91.mat
! PET91 Matrix - Jones, Taylor and Thornton 1991
! This is an update of the MDM78 Dayhoff matrix normalised such that
! all maxima are on the diagonal with a score of 10
,A,R,N,D,C,Q,E,G,H,I,L,K,M,F,P,S,T,W,Y,V,B,Z,X,-
A,10,-1,0,-1,-1,-1,-1,1,-2,0,-1,-1,-1,-3,1,1,2,-4,-3,1,0,-1,0,0
R,-1,10,0,-1,-1,2,0,0,2,-3,-3,4,-2,-4,-1,-1,-1,0,-2,-3,0,1,0,0
N,0,0,10,2,-1,0,1,0,1,-2,-3,1,-2,-3,-1,1,1,-4,-1,-2,2,0,0,0
D,-1,-1,2,10,-3,0,4,1,0,-3,-4,0,-3,-5,-2,0,-1,-5,-2,-3,3,2,0,0
C,-1,-1,-1,-3,10,-3,-4,-1,0,-2,-3,-3,-2,0,-2,1,-1,1,2,-2,-2,-3,0,0
Q,-1,2,0,0,-3,10,2,-1,3,-3,-2,2,-2,-4,0,-1,-1,-3,-1,-3,0,3,0,0
E,-1,0,1,4,-4,2,10,1,0,-3,-4,1,-3,-5,-2,-1,-1,-5,-4,-2,2,3,0,0
G,1,0,0,1,-1,-1,1,10,-2,-3,-4,-1,-3,-5,-1,1,0,-2,-4,-2,0,0,0,0
H,-2,2,1,0,0,3,0,-2,10,-3,-2,1,-2,0,0,-1,-1,-3,4,-3,0,1,0,0
I,0,-3,-2,-3,-2,-3,-3,-3,-3,10,2,-3,3,0,-2,-1,1,-4,-2,4,-2,-3,0,0
L,-1,-3,-3,-4,-3,-2,-4,-4,-2,2,10,-3,3,2,0,-2,-1,-2,-1,2,-3,-3,0,0
K,-1,4,1,0,-3,2,1,-1,1,-3,-3,10,-2,-5,-2,-1,-1,-3,-3,-3,0,1,0,0
M,-1,-2,-2,-3,-2,-2,-3,-3,-2,3,3,-2,10,0,-2,-1,0,-3,-3,2,-2,-2,0,0
F,-3,-4,-3,-5,0,-4,-5,-5,0,0,2,-5,0,10,-2,-2,-2,-1,5,0,-4,-4,0,0
P,1,-1,-1,-2,-2,0,-2,-1,0,-2,0,-2,-2,-2,10,1,1,-5,-3,-1,-1,-1,0,0
S,1,-1,1,0,1,-1,-1,1,-1,-1,-2,-1,-1,-2,1,10,1,-3,-1,-1,0,-1,0,0
T,2,-1,1,-1,-1,-1,-1,0,-1,1,-1,-1,0,-2,1,1,10,-4,-3,0,0,-1,0,0
W,-4,0,-4,-5,1,-3,-5,-2,-3,-4,-2,-3,-3,-1,-5,-3,-4,10,0,-4,-4,-4,0,0
Y,-3,-2,-1,-2,2,-1,-4,-4,4,-2,-1,-3,-3,5,-3,-1,-3,0,10,-3,-1,-2,0,0
V,1,-3,-2,-3,-2,-3,-2,-2,-3,4,2,-3,2,0,-1,-1,0,-4,-3,10,-2,-2,0,0
B,0,0,2,3,-2,0,2,0,0,-2,-3,0,-2,-4,-1,0,0,-4,-1,-2,10,1,0,0
Z,-1,1,0,2,-3,3,3,0,1,-3,-3,1,-2,-4,-1,-1,-1,-4,-2,-2,1,10,0,0
X,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
-,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment