Skip to content

Instantly share code, notes, and snippets.

@bow
Created June 29, 2011 17:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bow/1054348 to your computer and use it in GitHub Desktop.
Save bow/1054348 to your computer and use it in GitHub Desktop.
Script for counting AA triplet occurence in a fasta file.
#!/usr/bin/env python
import random
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
# function to generate random protein sequence
def random_prot(length):
seq = ''
for i in xrange(length):
seq += random.choice(IUPAC.protein.letters)
return seq
# function to generate list of SeqRecord objects
def seqrecord_gen(num, length):
seqs = []
for i in xrange(num):
seqs.append(SeqRecord(Seq(random_prot(length)),
id='fasta'+str(i+1),
name='',
description=''))
SeqIO.write(seqs, 'random_proteins.fa', 'fasta')
# function to read fasta file and count triplets
def count_triplet(source):
triplets = {}
seqs = SeqIO.parse(source, 'fasta')
for rec in seqs:
step = 0
while step + 3 <= len(rec.seq.tostring()):
tri = rec.seq.tostring()[0+step:3+step]
if tri not in triplets:
triplets[tri] = 1
else:
triplets[tri] += 1
step += 1
with open('results', 'w') as output:
for key in sorted(triplets, key=triplets.get, reverse=True):
output.writelines("{0}: {1}\n".format(key, triplets[key]))
# generate mock file
seqrecord_gen(100, 30)
# count the triplets
count_triplet('random_proteins.fa')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment