Skip to content

Instantly share code, notes, and snippets.

@ethanagb
Created May 30, 2014 18:01
Show Gist options
  • Save ethanagb/2dcd2f38a614ada09e85 to your computer and use it in GitHub Desktop.
Save ethanagb/2dcd2f38a614ada09e85 to your computer and use it in GitHub Desktop.
Pull longest sequence per identifier out of Trinity assembly
from __future__ import print_function
import sys
from Bio import SeqIO
records = dict()
for record in SeqIO.parse('Trinity.fasta',"fasta"):
seq_id = record.id.split('_')[0]
if seq_id not in records: records[seq_id] = list()
records[seq_id].append(record)
for seq_id in records:
records[seq_id].sort(key=lambda a: len(a.seq), reverse=True)
output = open("Trinity.fasta.uniques", "w")
###Prints sequences/headers of all ID numbers that have only 1 associated sequence
for seq_id in records:
if len(records[seq_id]) == 1:
for record in records[seq_id]:
print('>' + record.description, file = output)
print(record.seq, file=output)
###Print only longest of IDs with duplicates
for seq_id in records:
if len(records[seq_id]) > 1 :
for record in records[seq_id]:
if len(records[seq_id]) < len(record):
records[seq_id] = record
print('>' + record.description, file = output)
print(record.seq, file = output)
output.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment