coleoguy/retro-EEfinder.py

## retro-EEfinder.py
from Bio import SeqIO  # to deal with the fast files

# lets pull in our exon table
# Read the exon table: one comma-separated record per line of exons.csv.
# Later code reads column 0 as the name prefix, columns 1 and 2 as integer
# coordinates, and column 3 as the key compared between consecutive rows.
data = []
with open('exons.csv', 'r') as datafile:  # closed automatically, even on error
    for row in datafile:
        stripped = row.strip()
        if not stripped:
            # a blank (e.g. trailing) line would become [''] and break the
            # column lookups further down, so skip it
            continue
        data.append(stripped.split(','))

# Now build the exon-exon junctions. First create a list holding, for each
# junction, the name we want to assign (which includes the LG) and its
# coordinates; this list is stored in the variable "targets".
targets = []
# Walk over consecutive row pairs of the exon table.
for upstream, downstream in zip(data, data[1:]):
    # a junction is only formed by neighbouring rows sharing the same column-3 key
    if upstream[3] != downstream[3]:
        continue
    up_start = int(upstream[1])
    up_end = int(upstream[2])
    down_start = int(downstream[1])
    down_end = int(downstream[2])
    # both exons must span more than 29 so a 30-wide window fits in each
    if up_end - up_start <= 29 or down_end - down_start <= 29:
        continue
    # A:B is the 30-wide window ending at the upstream end coordinate;
    # C:D is the 30-wide window starting one before the downstream start.
    B = up_end
    A = B - 30
    C = down_start - 1
    D = C + 30
    label = "|".join([
        upstream[0],
        downstream[3],
        "%d:%d" % (A, B),
        "%d:%d" % (C, D),
    ])
    # each entry is [name, A, B, C, D]
    targets.append([label, A, B, C, D])
# at this point we have a list of lists in the variable "targets"
# each element is an exon-exon junction

# lets check it
#print targets[0]

# Dump the junction list to targets.csv. The file does not need to be recreated on every run, so this block can be commented out while working on the rest of the script.
import csv

# csv.writer emits text, so on Python 3 the file must be opened in text mode
# (binary mode 'wb' raises TypeError there). newline='' lets the csv module
# control the line endings itself.
with open('targets.csv', 'w', newline='') as f:
    wr = csv.writer(f)
    # NOTE(review): writerow() emits the whole junction list as ONE row, each
    # field being the string form of a [name, A, B, C, D] list. writerows()
    # (one junction per row) may have been intended -- confirm before changing.
    wr.writerow(targets)

# Testing this file against the chromosome files that I have gives odd results:
# I am seeing lots of rows that are solid Ns, which I do not believe is correct.
# I am going to try downloading everything from Ensembl to see if this fixes the problem.
# I believe that I may currently be using Ensembl's GFF3 with BeetleBase's chromosome files.
# I would like to make sure that I am using all Ensembl data anyway, because BeetleBase
# keeps being inaccessible for long periods of time (about a week currently).

# now we can start building a final list: we want to have the first element
# be the name ie "targets[i][0]" and the second element the DNA sequence it specifies

def format_fasta(name, sequence):
    """Return one FASTA record as text.

    The record is a '>' header line carrying *name*, followed by a single
    line holding *sequence*; both lines end with a newline.
    """
    return "".join(('>', name, "\n", sequence, "\n"))

# Build one FASTA record per junction and write them all to EEJunctions.fa.
fastaLines = []
LGHandle = None  # five-character prefix of the chromosome currently loaded

for target in targets:
    name, A, B, C, D = target
    currentLG = name[0:5]
    if LGHandle != currentLG:
        # Only (re)load a chromosome when the prefix changes. The chromosome
        # file names use the full name, so the prefixes that the five-character
        # cut truncates are expanded back by hand.
        # NOTE(review): this assumes no name is literally 'ChLG1' followed by
        # something other than '0' -- confirm against the exon table.
        if currentLG == 'ChLG1':
            fullname = 'ChLG10'
        elif currentLG == 'Unkno':
            fullname = 'Unknown'
        else:
            fullname = currentLG
        LGHandle = currentLG
        fileName = './chromosomes/' + fullname + '-2.fa'
        # SeqIO.read parses the single record into memory, so the handle can
        # be closed as soon as it returns
        with open(fileName) as handle:
            record = SeqIO.read(handle, "fasta")
    # junction sequence = upstream window [A:B] followed by downstream window [C:D]
    junction = str(record.seq[A:B]) + str(record.seq[C:D])
    fastaLines.append(format_fasta(name, junction))

# Join once, after the loop: avoids rebuilding the string on every iteration
# and keeps FASTA defined (as '') when there are no junctions at all.
FASTA = ''.join(fastaLines)

with open("EEJunctions.fa", "w") as text_file:
    text_file.write(FASTA)
	from Bio import SeqIO # to deal with the fast files

	# lets pull in our exon table
	# Read the exon table: one comma-separated record per line of exons.csv.
	# Later code reads column 0 as the name prefix, columns 1 and 2 as integer
	# coordinates, and column 3 as the key compared between consecutive rows.
	# (The loop body had lost its indentation; it is restored here.)
	data = []
	with open('exons.csv', 'r') as datafile:  # closed automatically, even on error
		for row in datafile:
			stripped = row.strip()
			if not stripped:
				# a blank (e.g. trailing) line would become [''] and break the
				# column lookups further down, so skip it
				continue
			data.append(stripped.split(','))

	# Now build the exon-exon junctions. First create a list holding, for each
	# junction, the name we want to assign (which includes the LG) and its
	# coordinates; this list is stored in the variable "targets".
	# Nesting restored: the loop and both conditions had been flattened to one
	# level, which is not valid Python.
	targets = []
	# Walk over consecutive row pairs of the exon table.
	for upstream, downstream in zip(data, data[1:]):
		# a junction is only formed by neighbouring rows sharing the same column-3 key
		if upstream[3] != downstream[3]:
			continue
		up_start = int(upstream[1])
		up_end = int(upstream[2])
		down_start = int(downstream[1])
		down_end = int(downstream[2])
		# both exons must span more than 29 so a 30-wide window fits in each
		if up_end - up_start <= 29 or down_end - down_start <= 29:
			continue
		# A:B is the 30-wide window ending at the upstream end coordinate;
		# C:D is the 30-wide window starting one before the downstream start.
		B = up_end
		A = B - 30
		C = down_start - 1
		D = C + 30
		# fields are separated by a plain "|"; the escaped "\|" form put a
		# literal backslash into every junction name
		names = "|".join([
			upstream[0],
			downstream[3],
			str(A) + ":" + str(B),
			str(C) + ":" + str(D),
		])
		# each entry is [name, A, B, C, D]
		targets.append([names, A, B, C, D])
	# at this point we have a list of lists in the variable "targets"
	# each element is an exon-exon junction

	# lets check it
	#print targets[0]

	# Dump the junction list to targets.csv. The file does not need to be recreated on every run, so this block can be commented out while working on the rest of the script.
	import csv

	# csv.writer emits text, so on Python 3 the file must be opened in text mode
	# (binary mode 'wb' raises TypeError there). newline='' lets the csv module
	# control the line endings itself. The body of the with-statement had lost
	# its indentation; it is restored here.
	with open('targets.csv', 'w', newline='') as f:
		wr = csv.writer(f)
		# NOTE(review): writerow() emits the whole junction list as ONE row, each
		# field being the string form of a [name, A, B, C, D] list. writerows()
		# (one junction per row) may have been intended -- confirm before changing.
		wr.writerow(targets)

	# Testing this file against the chromosome files that I have gives odd results:
	# I am seeing lots of rows that are solid Ns, which I do not believe is correct.
	# I am going to try downloading everything from Ensembl to see if this fixes the problem.
	# I believe that I may currently be using Ensembl's GFF3 with BeetleBase's chromosome files.
	# I would like to make sure that I am using all Ensembl data anyway, because BeetleBase
	# keeps being inaccessible for long periods of time (about a week currently).

	# now we can start building a final list: we want to have the first element
	# be the name ie "targets[i][0]" and the second element the DNA sequence it specifies

	def format_fasta(name, sequence):
		"""Return one FASTA record as text.

		The record is a '>' header line carrying *name*, followed by a single
		line holding *sequence*; both lines end with a newline.
		"""
		# the function body had lost its indentation, which is a syntax error
		fasta_string = '>' + name + "\n" + sequence + "\n"
		return fasta_string

	# Build one FASTA record per junction and write them all to EEJunctions.fa.
	# The loop/if nesting had been flattened to a single level; it is restored here.
	fastaLines = []
	LGHandle = None  # five-character prefix of the chromosome currently loaded

	for target in targets:
		name, A, B, C, D = target
		currentLG = name[0:5]
		if LGHandle != currentLG:
			# Only (re)load a chromosome when the prefix changes. The chromosome
			# file names use the full name, so the prefixes that the five-character
			# cut truncates are expanded back by hand.
			# NOTE(review): this assumes no name is literally 'ChLG1' followed by
			# something other than '0' -- confirm against the exon table.
			if currentLG == 'ChLG1':
				fullname = 'ChLG10'
			elif currentLG == 'Unkno':
				fullname = 'Unknown'
			else:
				fullname = currentLG
			LGHandle = currentLG
			fileName = './chromosomes/' + fullname + '-2.fa'
			# SeqIO.read parses the single record into memory, so the handle can
			# be closed as soon as it returns
			with open(fileName) as handle:
				record = SeqIO.read(handle, "fasta")
		# junction sequence = upstream window [A:B] followed by downstream window [C:D]
		junction = str(record.seq[A:B]) + str(record.seq[C:D])
		fastaLines.append(format_fasta(name, junction))

	# Join once, after the loop: avoids rebuilding the string on every iteration
	# and keeps FASTA defined (as '') when there are no junctions at all.
	FASTA = ''.join(fastaLines)

	with open("EEJunctions.fa", "w") as text_file:
		text_file.write(FASTA)