tinvernizzi/penalite_affine.py

## penalite_affine.py
import numpy
import sys

numpy.set_printoptions(threshold=numpy.nan)

seq1 ='MGEIGFTEKQEALVKESWEILKQDIPKYSLHFFSQILEIAPAAKGLFSFLRDSDEVPHNNPKLKAHAVKVFKMTCETAIQLREEGKVVVADTTLQYLGSIHLKSGVIDPHFEVVKEALLRTLKEGGEKYNEEVEGAWSQAYDHLALAIKTEMKQEES'

seq2 ='MEKVPGEMEIERRERSEELSEAERKAVQATWARLYANCEDVGVAILVRFFVNFPSAKQYFSQFKHMEEPLEMERSPQLRKHACRVMGALNTVVENLHDPEKVSSVLSLVGKAHALKHKVEPVYFKLSGVILEVIAEEFANDFPPETQRAWAKLRGLIYSHVTAAYKEVGWVQQVPNATTPPATLPSSGP'

seq3 ='MVLSAADKNNVKGIFTKIAGHAEEYGAETLERMFTTYPPTKTYFPHFDLSHGSAQIKGHGKKVVAALIEAANHIDDIAGTLSKLSDLHAHKLRVDPVNFKLLGQCFLVVVAIHHPAALTPEVHASLDKFLCAVGTVLTAKYR'

seq4 ='MGLSDGEWQLVLNVWGKVEADIPGHGQEVLIRLFKGHPETLEKFDKFKHLKSEDEMKASEDLKKHGATVLTALGGILKKKGHHEAEIKPLAQSHATKHKIPVKYLEFISECIIQVLQSKHPGDFGADAQGAMNKALELFRKDMASNYKELGFQG'

seq5 ='MERLESELIRQSWRAVSRSPLEHGTVLFSRLFALEPSLLPLFQYNGRQFSSPEDCLSSPEFLDHIRKVMLVIDAAVTNVEDLSSLEEYLATLGRKHRAVGVRLSSFSTVGESLLYMLEKCLGPDFTPATRTAWSQLYGAVVQAMSRGWDGE'

def read_blosum():
    with open('BLOSUM62.txt') as f:
        fileContent = f.readlines()
    global alignmentScore
    alignmentScore = []
    for i in range (1, len(fileContent)):
        alignmentScore = alignmentScore + [ fileContent[i].split() ]
    alignmentScore = [line[1:] for line in alignmentScore]
    global alignmentIndices
    alignmentIndices = fileContent[0].split()

def get_score(a, b):
    indice_a = -1
    indice_b = -1
    for i in range(0, len(alignmentIndices)):
        if (a == alignmentIndices[i]):
            indice_a = i
        if (b == alignmentIndices[i]):
            indice_b = i
    if (a == b):
        return 4
    else:
        return 1
    #return int(alignmentScore[indice_a][indice_b])

def penalite_affine(mot1, mot2, a=10, b=1):
    v = numpy.zeros((len(mot1) + 1, len(mot2) + 1)).astype(int) # Matrice des couts des meilleurs alignements qui depend des 3 autres matrices suivantes :
    e = numpy.zeros((len(mot1) + 1, len(mot2) + 1)).astype(int) # Matrice des couts des meilleurs alignements entre xi et un gap
    f = numpy.zeros((len(mot1) + 1, len(mot2) + 1)).astype(int) # Matrice des couts des meilleurs alignements entre yj et un gap
    g = numpy.zeros((len(mot1) + 1, len(mot2) + 1)).astype(int) # Matrice des couts des meilleurs alignements entre xi et yj
    directions = numpy.zeros((len(mot1) + 1, len(mot2) + 1, 3)).astype(int) # Stockage des fleches de directions
    #Initialisation des matrices
    for i in range(1, (len(mot1) + 1)):
        e[i][0] = - 99
        f[i][0] = - a - i * b
        g[i][0] = - 99
        v[i][0] = - a - i * b
        directions[i][0] = [0, 0, 1]
    for j in range(1, (len(mot2) + 1)):
        e[0][j] = - a - j * b
        f[0][j] = - 99
        g[0][j] = - 99
        v[0][j] = - a - j * b
        directions[0][j] = [1, 0, 0]
    for i in range(1, (len(mot1) + 1)):
        for j in range(1, (len(mot2) + 1)):
            g[i][j] = v[i - 1][j - 1] + get_score(mot1[i - 1], mot2[j - 1])
            f[i][j] = max(f[i - 1][j] - (i * b), v[i - 1][j] - (a + (i * b)) )
            e[i][j] = max(e[i][j - 1] - (j * b), v[i][j - 1] - (a + (j * b)) )
            maxi = max(g[i][j], f[i][j], e[i][j])
            if (maxi == g[i][j]):
                directions[i][j] = [0, 1, 0]
            elif (maxi == f[i][j]):
                directions[i][j] = [1, 0, 0]
            elif (maxi == e[i][j]):
                directions[i][j] = [0, 0, 1]
            v[i][j] = maxi
    str1 = ''
    str2 = ''
    x = len(mot1) - 1
    y = len(mot2) - 1
    while (x != -1 and y != -1):
        if numpy.array_equal(directions[x][y], [0, 0, 1]):
            str1 = str1 + mot1[x]
            str2 = str2 + "_"
            x = x - 1
        elif numpy.array_equal(directions[x][y], [0, 1, 0]): #diagonale
            str1 = str1 + mot1[x]
            str2 = str2 + mot2[y]
            x = x - 1
            y = y - 1
        elif numpy.array_equal(directions[x][y], [1, 0, 0]): #haut
            str1 = str1 + "_"
            str2 = str2 + mot2[y]
            y = y - 1
        if numpy.array_equal(directions[x][y], [0, 0, 0]):
            x = x - 1
            y = y - 1
    while (x >= 0):
        str1 = str1 + mot1[x]
        str2 = str2 + "_"
        x = x - 1
    while (y >= 0):
        str1 = str1 + "_"
        str2 = str2 + mot2[y]
        y = y - 1
    print v
    print directions
    print g
    print f
    print e
    print 'str1 : ' + str1[::-1]
    print 'str2 : ' + str2[::-1]
    print 'Score : ' + str(v[len(mot1)][len(mot2)])
    print

read_blosum()
if (len(sys.argv) == 3):
    penalite_affine(sys.argv[1], sys.argv[2])
elif (len(sys.argv) == 1):
    try:
        with open('sequences.fasta') as f:
            sequenceFileContent = f.readlines()
    except:
        print 'Problem with file. Is the file sequences.fasta in the same folder as this python file ?'
    sequences = []
    sequenceNumber = 0
    seq = ''
    for i in range(len(sequenceFileContent)):
        if (sequenceFileContent[i][0] == '>'):
            if (sequenceNumber != 0):
                sequences.append(seq)
            sequenceNumber = sequenceNumber + 1
            seq = ''
        else:
            seq = seq + sequenceFileContent[i].replace('\n','')
    if (seq != ''):
        sequences.append(seq)
    for i in range (len(sequences)):
        for j in range (len(sequences)):
            penalite_affine(sequences[i], sequences[j])
else:
    print 'Usage :\n' + '       python _.py <seq1> <seq2>\n' + '       python _.py <filename>'
	import numpy
	import sys

	numpy.set_printoptions(threshold=numpy.nan)

	seq1 ='MGEIGFTEKQEALVKESWEILKQDIPKYSLHFFSQILEIAPAAKGLFSFLRDSDEVPHNNPKLKAHAVKVFKMTCETAIQLREEGKVVVADTTLQYLGSIHLKSGVIDPHFEVVKEALLRTLKEGGEKYNEEVEGAWSQAYDHLALAIKTEMKQEES'

	seq2 ='MEKVPGEMEIERRERSEELSEAERKAVQATWARLYANCEDVGVAILVRFFVNFPSAKQYFSQFKHMEEPLEMERSPQLRKHACRVMGALNTVVENLHDPEKVSSVLSLVGKAHALKHKVEPVYFKLSGVILEVIAEEFANDFPPETQRAWAKLRGLIYSHVTAAYKEVGWVQQVPNATTPPATLPSSGP'

	seq3 ='MVLSAADKNNVKGIFTKIAGHAEEYGAETLERMFTTYPPTKTYFPHFDLSHGSAQIKGHGKKVVAALIEAANHIDDIAGTLSKLSDLHAHKLRVDPVNFKLLGQCFLVVVAIHHPAALTPEVHASLDKFLCAVGTVLTAKYR'

	seq4 ='MGLSDGEWQLVLNVWGKVEADIPGHGQEVLIRLFKGHPETLEKFDKFKHLKSEDEMKASEDLKKHGATVLTALGGILKKKGHHEAEIKPLAQSHATKHKIPVKYLEFISECIIQVLQSKHPGDFGADAQGAMNKALELFRKDMASNYKELGFQG'

	seq5 ='MERLESELIRQSWRAVSRSPLEHGTVLFSRLFALEPSLLPLFQYNGRQFSSPEDCLSSPEFLDHIRKVMLVIDAAVTNVEDLSSLEEYLATLGRKHRAVGVRLSSFSTVGESLLYMLEKCLGPDFTPATRTAWSQLYGAVVQAMSRGWDGE'

	def read_blosum():
	with open('BLOSUM62.txt') as f:
	fileContent = f.readlines()
	global alignmentScore
	alignmentScore = []
	for i in range (1, len(fileContent)):
	alignmentScore = alignmentScore + [ fileContent[i].split() ]
	alignmentScore = [line[1:] for line in alignmentScore]
	global alignmentIndices
	alignmentIndices = fileContent[0].split()

	def get_score(a, b):
	indice_a = -1
	indice_b = -1
	for i in range(0, len(alignmentIndices)):
	if (a == alignmentIndices[i]):
	indice_a = i
	if (b == alignmentIndices[i]):
	indice_b = i
	if (a == b):
	return 4
	else:
	return 1
	#return int(alignmentScore[indice_a][indice_b])

	def penalite_affine(mot1, mot2, a=10, b=1):
	v = numpy.zeros((len(mot1) + 1, len(mot2) + 1)).astype(int) # Matrice des couts des meilleurs alignements qui depend des 3 autres matrices suivantes :
	e = numpy.zeros((len(mot1) + 1, len(mot2) + 1)).astype(int) # Matrice des couts des meilleurs alignements entre xi et un gap
	f = numpy.zeros((len(mot1) + 1, len(mot2) + 1)).astype(int) # Matrice des couts des meilleurs alignements entre yj et un gap
	g = numpy.zeros((len(mot1) + 1, len(mot2) + 1)).astype(int) # Matrice des couts des meilleurs alignements entre xi et yj
	directions = numpy.zeros((len(mot1) + 1, len(mot2) + 1, 3)).astype(int) # Stockage des fleches de directions
	#Initialisation des matrices
	for i in range(1, (len(mot1) + 1)):
	e[i][0] = - 99
	f[i][0] = - a - i * b
	g[i][0] = - 99
	v[i][0] = - a - i * b
	directions[i][0] = [0, 0, 1]
	for j in range(1, (len(mot2) + 1)):
	e[0][j] = - a - j * b
	f[0][j] = - 99
	g[0][j] = - 99
	v[0][j] = - a - j * b
	directions[0][j] = [1, 0, 0]
	for i in range(1, (len(mot1) + 1)):
	for j in range(1, (len(mot2) + 1)):
	g[i][j] = v[i - 1][j - 1] + get_score(mot1[i - 1], mot2[j - 1])
	f[i][j] = max(f[i - 1][j] - (i * b), v[i - 1][j] - (a + (i * b)) )
	e[i][j] = max(e[i][j - 1] - (j * b), v[i][j - 1] - (a + (j * b)) )
	maxi = max(g[i][j], f[i][j], e[i][j])
	if (maxi == g[i][j]):
	directions[i][j] = [0, 1, 0]
	elif (maxi == f[i][j]):
	directions[i][j] = [1, 0, 0]
	elif (maxi == e[i][j]):
	directions[i][j] = [0, 0, 1]
	v[i][j] = maxi
	str1 = ''
	str2 = ''
	x = len(mot1) - 1
	y = len(mot2) - 1
	while (x != -1 and y != -1):
	if numpy.array_equal(directions[x][y], [0, 0, 1]):
	str1 = str1 + mot1[x]
	str2 = str2 + "_"
	x = x - 1
	elif numpy.array_equal(directions[x][y], [0, 1, 0]): #diagonale
	str1 = str1 + mot1[x]
	str2 = str2 + mot2[y]
	x = x - 1
	y = y - 1
	elif numpy.array_equal(directions[x][y], [1, 0, 0]): #haut
	str1 = str1 + "_"
	str2 = str2 + mot2[y]
	y = y - 1
	if numpy.array_equal(directions[x][y], [0, 0, 0]):
	x = x - 1
	y = y - 1
	while (x >= 0):
	str1 = str1 + mot1[x]
	str2 = str2 + "_"
	x = x - 1
	while (y >= 0):
	str1 = str1 + "_"
	str2 = str2 + mot2[y]
	y = y - 1
	print v
	print directions
	print g
	print f
	print e
	print 'str1 : ' + str1[::-1]
	print 'str2 : ' + str2[::-1]
	print 'Score : ' + str(v[len(mot1)][len(mot2)])
	print

	read_blosum()
	if (len(sys.argv) == 3):
	penalite_affine(sys.argv[1], sys.argv[2])
	elif (len(sys.argv) == 1):
	try:
	with open('sequences.fasta') as f:
	sequenceFileContent = f.readlines()
	except:
	print 'Problem with file. Is the file sequences.fasta in the same folder as this python file ?'
	sequences = []
	sequenceNumber = 0
	seq = ''
	for i in range(len(sequenceFileContent)):
	if (sequenceFileContent[i][0] == '>'):
	if (sequenceNumber != 0):
	sequences.append(seq)
	sequenceNumber = sequenceNumber + 1
	seq = ''
	else:
	seq = seq + sequenceFileContent[i].replace('\n','')
	if (seq != ''):
	sequences.append(seq)
	for i in range (len(sequences)):
	for j in range (len(sequences)):
	penalite_affine(sequences[i], sequences[j])
	else:
	print 'Usage :\n' + ' python _.py <seq1> <seq2>\n' + ' python _.py <filename>'