Created
June 29, 2011 17:17
-
-
Save bow/1054348 to your computer and use it in GitHub Desktop.
Script for counting AA triplet occurence in a fasta file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import random | |
from Bio import SeqIO | |
from Bio.Seq import Seq | |
from Bio.SeqRecord import SeqRecord | |
from Bio.Alphabet import IUPAC | |
# function to generate random protein sequence | |
def random_prot(length): | |
seq = '' | |
for i in xrange(length): | |
seq += random.choice(IUPAC.protein.letters) | |
return seq | |
# function to generate list of SeqRecord objects | |
def seqrecord_gen(num, length): | |
seqs = [] | |
for i in xrange(num): | |
seqs.append(SeqRecord(Seq(random_prot(length)), | |
id='fasta'+str(i+1), | |
name='', | |
description='')) | |
SeqIO.write(seqs, 'random_proteins.fa', 'fasta') | |
# function to read fasta file and count triplets | |
def count_triplet(source): | |
triplets = {} | |
seqs = SeqIO.parse(source, 'fasta') | |
for rec in seqs: | |
step = 0 | |
while step + 3 <= len(rec.seq.tostring()): | |
tri = rec.seq.tostring()[0+step:3+step] | |
if tri not in triplets: | |
triplets[tri] = 1 | |
else: | |
triplets[tri] += 1 | |
step += 1 | |
with open('results', 'w') as output: | |
for key in sorted(triplets, key=triplets.get, reverse=True): | |
output.writelines("{0}: {1}\n".format(key, triplets[key])) | |
# generate mock file | |
seqrecord_gen(100, 30) | |
# count the triplets | |
count_triplet('random_proteins.fa') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment