Skip to content

Instantly share code, notes, and snippets.

@d53dave
Last active December 20, 2015 12:19
Show Gist options
  • Save d53dave/6130254 to your computer and use it in GitHub Desktop.
Save d53dave/6130254 to your computer and use it in GitHub Desktop.
This tool finds stretches of equal characters in a given fasta file. It can be used to compute homopolymer Stretches for nucleotide data
#!/usr/bin/env python
import optparse, re
from Bio import SeqIO
from itertools import groupby
def findHomopolymers(seqFile, more=False, all=False, length=6):
cumlen = 0
num = 0
results = []
resulthist = {}
max = 0
maxresult = ""
with open(seqFile, "rU") as infile:
for record in SeqIO.parse(infile, "fasta"):
if more:
print "Processing " + str(record.id)
lists = [list(g) for k, g in groupby(str(record.seq))]
for x in lists:
hplen = len(x)
if(hplen>=int(length)):
num += 1
results.append(str("Hp"+str(num)+": "+x[0]+" stretch on "+record.id+" at start="+str(cumlen)\
+" and end="+str(cumlen+len(x))+" with length="+str(len(x))))
if resulthist.has_key(hplen):
resulthist[hplen] += 1
else:
resulthist[hplen] = 1
if(hplen > max): #new max found
max = hplen
maxresult = results[num-1]
cumlen += len(x)
if more:
print "\nDone!"
print "Longest stretch: "+maxresult
print "\nDistribution of homopolymer lengths:"
print "Length\tCount"
print "-------------"
keys = resulthist.keys()
keys.sort()
for key in keys:
print str(key)+"\t"+str(resulthist[key])
if all:
for result in results:
print result
if not all or more:
print "\nFound "+str(num)+" homopolymer streches longer or equal "+str(length)
def printHelp():
print "This script finds homopolymer stretches in a given fasta file."
print "This script expects the filename (or path) to the file to analyze. Currently, only fasta format is supported."
print "As an optional parameter, you can specify the desired homopolymer length\n" \
"after the file name, \"python findHpStretches.py mySeq.fa 8\" would find homopolymers longer or equal to 8bp."
print "If no value is specified, the a 6bp cutoff will be used for computations."
print "To print additional information, specify --more or -m"
print "To print all found streches to stdout, specify --all or -a"
def main():
usage = "Usage: %prog [options] arg1 (arg2)\narg1 is expected to be the input file in fasta format"\
+"\narg2 is optional, specifies the desires homopolymer length (default = 6)"
parser = optparse.OptionParser(usage=usage)
parser.set_defaults(more=False)
parser.set_defaults(all=False)
parser.add_option('--more', '-m', action='store_true', dest='more', help="show additional information")
parser.add_option('--all', '-a', action='store_true', dest='all', help="print all found stretches")
#parser.add_option('-m', action='store_true', dest='more')
(options, args) = parser.parse_args()
if len(args) == 1:
#run with n=50
findHomopolymers(args[0], options.more, options.all)
elif len(args) == 2:
#run with provided number
findHomopolymers(args[0], options.more, options.all, int(args[1]))
else:
print ("Problem with command line arguments!\n")
printHelp()
exit(2)
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment