Skip to content

Instantly share code, notes, and snippets.

@coleoguy
Last active January 3, 2016 22:09
Show Gist options
  • Save coleoguy/8526218 to your computer and use it in GitHub Desktop.
Save coleoguy/8526218 to your computer and use it in GitHub Desktop.
This python script takes a list of exons from multiple exon genes as well as fast files for each chromosome in a genome and it constructs a fasta file where each sequence is 60bp in length (last 30bp of one exon and the first 30bp of the next). These can then be used to search the genome for retroduplication events of genes. By limiting our selv…
from Bio import SeqIO # to deal with the fast files
# lets pull in our exon table
datafile = open('exons.csv', 'r')
data = []
for row in datafile:
data.append(row.strip().split(','))
# now lets start the process of creating all of our exon-exon sequences
# by first creating a list that has the name we want to assign and the LG and
# coordinates for it we will store this data in variable called targets
targets = []
for i in range(0, len(data)-1):
currentTC = data[i][3]
nextTC = data[i+1][3]
if currentTC == nextTC:
fooA = int(data[i][1])
fooB = int(data[i][2])
fooC = int(data[i+1][1])
fooD = int(data[i+1][2])
if fooB-fooA>29 and fooD-fooC>29:
line = []
B=int(data[i][2])
A=B-30
C=int(data[i+1][1])-1
D=C+30
names = data[i][0] + "|" + data[i+1][3] + "|" + str(A) + ":" + str(B) + "|" + str(C) + ":" + str(D)
line.append(names)
line.append(A)
line.append(B)
line.append(C)
line.append(D)
targets.append(line)
# at this point we have a list of lists in the variable "targets"
# each element is an exon-exon junction
# lets check it
#print targets[0]
# no need to create this file each time as i work so I will comment out for now
import csv
with open('targets.csv', 'wb') as f:
wr = csv.writer(f)
wr.writerow(targets)
# testing the interaction of this file with the chromosome files that I have is a bit odd
# I am seeing lots of rows that are solid Ns not something that I believe is correct
# I am going to try and download everything from ensembl and see if this fixes the problem
# I believe that I currently may be using ensembl's gff3 with beetlebases chromosome files
# I would like to make sure that I am using all ensembl data as well because beetle base
# keeps being unaccessible for long periods of time (about a week currently)
# now we can start building a final list: we want to have the first element
# be the name ie "targets[i][0]" and the second element the DNA sequence it specifies
def format_fasta(name, sequence):
fasta_string = '>' + name + "\n" + sequence + "\n"
return fasta_string
fastaLines = []
LGHandle = 0
for i in range(0, len(targets)): #len(targets)
currentLG = targets[i][0][0:5]
if LGHandle != currentLG: # the next 8 lines just deal with the fact that file
fullname = currentLG # names are a bit wonky the important line is 81 where
if currentLG == 'ChLG1': # where we open the appropriate fasta file
fullname = 'ChLG10'
if currentLG == 'Unkno':
fullname = 'Unknown'
LGHandle = currentLG
fileName = './chromosomes/' + fullname + '-2.fa'
record = SeqIO.read(open(fileName), "fasta")
A=targets[i][1]
B=targets[i][2]
C=targets[i][3]
D=targets[i][4]
fastaLines.append(format_fasta(targets[i][0], str(record.seq[A:B]) + str(record.seq[C:D])))
FASTA = ''.join(fastaLines)
text_file = open("EEJunctions.fa", "w")
text_file.write(FASTA)
text_file.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment