Created
May 30, 2014 18:01
-
-
Save ethanagb/2dcd2f38a614ada09e85 to your computer and use it in GitHub Desktop.
Pull longest sequence per identifier out of Trinity assembly
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import print_function | |
import sys | |
from Bio import SeqIO | |
records = dict() | |
for record in SeqIO.parse('Trinity.fasta',"fasta"): | |
seq_id = record.id.split('_')[0] | |
if seq_id not in records: records[seq_id] = list() | |
records[seq_id].append(record) | |
for seq_id in records: | |
records[seq_id].sort(key=lambda a: len(a.seq), reverse=True) | |
output = open("Trinity.fasta.uniques", "w") | |
###Prints sequences/headers of all ID numbers that have only 1 associated sequence | |
for seq_id in records: | |
if len(records[seq_id]) == 1: | |
for record in records[seq_id]: | |
print('>' + record.description, file = output) | |
print(record.seq, file=output) | |
###Print only longest of IDs with duplicates | |
for seq_id in records: | |
if len(records[seq_id]) > 1 : | |
for record in records[seq_id]: | |
if len(records[seq_id]) < len(record): | |
records[seq_id] = record | |
print('>' + record.description, file = output) | |
print(record.seq, file = output) | |
output.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment