Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Last active October 22, 2020 01:43
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save avrilcoghlan/6308594 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/6308594 to your computer and use it in GitHub Desktop.
Python module to carry out simple bioinformatics analyses
import os
import sys
from Bio import Entrez
from Bio import SeqIO
import matplotlib.pyplot as plt
#====================================================================#
# Checked and seems to work fine.
# Takes the GenBank accession number (GI number) as input.
# First checks to see if the sequence has been stored locally as a file.
# If it hasn't, the sequence is downloaded and stored locally as a file.
def getgenbank(accession):
"""Gets a sequence for a particular accession (GI number) from GenBank, and
saves it in a file gi_<accession>
>>> getgenbank("226880621")
Parsing filename gi_226880621...
SeqRecord(seq=Seq('CGGAAAGATGATGGAACGTT', SingleLetterAlphabet()), id='gi|226880621|emb|GN082338.1|', name='gi|226880621|emb|GN082338.1|', description='gi|226880621|emb|GN082338.1| Sequence 6 from Patent WO2009030679', dbxrefs=[])
"""
Entrez.email = "alc@sanger.ac.uk"
filename = "gi_%s"% accession
if not os.path.isfile(filename):
print("Downloading %s..." % filename)
net_handle = Entrez.efetch(db="nucleotide", id=accession, rettype="fasta")
out_handle = open(filename, "w")
out_handle.write(net_handle.read())
out_handle.close()
net_handle.close()
print("Saved %s" % filename)
print("Parsing filename %s..." % filename)
record = SeqIO.read(open(filename),"fasta")
return record
#====================================================================#
# Agrees with output from Matlab at http://www.computational-genomics.net/case_studies/haemophilus_demo.html
# Checked and seems to work fine.
def basecount(seq,useall=False,calcfreqs=False,verbose=False):
"""Prints out the number or frequency of each symbol in a DNA sequence
# Followed ideas at http://bugs.python.org/issue3332 and
# http://stackoverflow.com/questions/364519/in-python-how-to-i-iterate-over-a-dictionary-in-sorted-order
# for testing the hash contains what it should:
>>> basecount("CT") == {'A': 0, 'C': 1, 'G': 0, 'T': 1}
True
>>> basecount("") == {'A': 0, 'C': 0, 'G': 0, 'T': 0}
True
>>> basecount("CGGAAAGATGATGGAACGTT") == {'A': 7, 'C': 2, 'G': 7, 'T': 4}
True
>>> basecount("CGGAAAGATGATGGAACGTT",calcfreqs=True) == { 'A': 0.35, 'C': 0.1, 'G': 0.35, 'T': 0.2}
True
"""
length = len(seq)
if verbose:
print("The sequence is %d base-pairs long" % length)
if calcfreqs:
# Make a dictionary "myfreq" to contain the frequency of each base.
myfreq = {}
else:
# Make a dictionary "mynum" to contain the number of occurrences of each base.
mynum = {}
# If we want to look at every letter that appears in the sequence:
if useall:
myset = set(seq)
# If we just want to look at the four bases A,C,G,T:
else:
myset = ("A", "C", "T", "G")
for letter in myset:
num = seq.count(letter)
if calcfreqs:
# Note that the frequency is calculated out of the total
# sequence length, even though some bases are not A/G/C/T
freq = num/length
myfreq[letter] = freq
if verbose:
print("The frequency of %s is %.2f" % (letter, freq))
else:
mynum[letter] = num
if verbose:
print("There are %d %s" % (num, letter))
if calcfreqs:
return myfreq
else:
return mynum
#====================================================================#
# Checked and seems to work fine.
# Agrees with result at http://www.computational-genomics.net/case_studies/haemophilus_demo.html
def dimercount(myseq,verbose=False,calcfreqs=False):
"""Prints out the frequency of dimers in a long DNA sequence
>>> dimercount("CGGAAAGATGATGGAACGTT") == {'AA': 3, 'AC': 1, 'AG': 1, 'AT': 2, 'GA': 4, 'GG': 2, 'GT': 1, 'CG': 2, 'TG': 2, 'TT': 1}
True
>>> dimercount("CGGAAAGATGATGGAACGTT",calcfreqs=True) == {'GG': 0.10526315789473684, 'CG': 0.10526315789473684, 'AG': 0.05263157894736842, 'TT': 0.05263157894736842, 'AA': 0.15789473684210525, 'AC': 0.05263157894736842, 'GT': 0.05263157894736842, 'AT': 0.10526315789473684, 'GA': 0.21052631578947367, 'TG': 0.10526315789473684}
True
"""
# Normally we pass in a sequence object 'myseq', but in the doc-test
# above, I just pass in a string:
try:
seq = myseq.tostring()
except AttributeError:
seq = myseq
length = len(seq)
# Define the set of dimers we are interested in
myset = ("AA", "AC", "AG", "AT", "GA", "GC", "GG", "GT",
"CA", "CC", "CG", "CT", "TA", "TC", "TG", "TT")
# Make a dictionary "mynum" to contain the number of occurrences of each dimer.
mynum = {}
if calcfreqs:
# Make a dictionary "myfreq" to contain the frequency of each dimer.
myfreq = {}
# Move the window by jumpsize bp at a time
windowsize = 2
jumpsize = 1
for i in range (0, length-windowsize+1, jumpsize):
subseq = seq[i:i+windowsize]
# if subseq is a member of set 'myset':
if subseq in myset:
if subseq in mynum: # if subseq is already in dictionary 'mynum':
mynum[subseq] += 1
else:
mynum[subseq] = 1
for dimer in myset:
# The total number of dimers in a sequence is length-1.
# Note that the dimer frequency is given out of length-1, even
# though some of the dimers in the sequence are not those in
# "myset" above, because of non-A/G/C/T bases.
if dimer in mynum: # if dimer is already in dictionary 'mynum':
pc = mynum[dimer]/(length-1)
if calcfreqs:
myfreq[dimer] = pc
if verbose:
print("The frequency of %s is %d (fraction %f)" % (dimer, mynum[dimer], pc))
if calcfreqs:
return myfreq
else:
return mynum
#====================================================================#
# Checked this works on a very short sequence.
# Agrees with result at http://www.computational-genomics.net/case_studies/haemophilus_demo.html
def ntdensity1(seq,windowsize,verbose=False,jumpsize=1000,makePlot=False):
"""Plots the G+C content along a sequence using a sliding window
>>> ntdensity1("CGGAAAGATGATGGAACGTT",4,verbose=False,jumpsize=1) == ([2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0], {'G+C': [75.0, 50.0, 25.0, 25.0, 25.0, 25.0, 50.0, 25.0, 25.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0], 'A+T': [25.0, 50.0, 75.0, 75.0, 75.0, 75.0, 50.0, 75.0, 75.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0]})
True
"""
length = len(seq)
# Make a dictionary to contain two empty lists
freqs = { "G+C": [], "A+T": [] }
myset = ("A+T", "G+C")
# Note: instead of storing G+C, A+T in a hash, could just have coded G+C=0, A+T=1, and stored these values in arrays.
midpoints = []
# Move the window by jumpsize bp at a time.
# The first window we look at is from i=0 to i=windowsize.
# For example, if the sequence is 30000 bases long, windowsize=10000.
# In the first loop, we look at i=0 to i=10000.
# In the second loop, we look at i=1000 to i=11000. ...
# In the last loop, we look at i=29000 to i=30000.
# Note: for i = range(0,10) goes from i=0...9.
for i in range(0,length-windowsize+1,jumpsize):
subseq = seq[i:i+windowsize]
if verbose:
start = i
end = i+windowsize
print("start %d end %d subseq is %s length %d windowsize %d" % (start,end,subseq,length,windowsize))
assert len(subseq)==windowsize, "ERROR: ntdensity1: length of subseq is not windowsize"
for dimer in myset:
letter1 = dimer[0:1]
letter2 = dimer[2:3]
num1 = subseq.count(letter1)
num2 = subseq.count(letter2)
num = num1 + num2
pc = (100 * num)/windowsize
freqs[dimer].append(pc)
# Find the mid-point of the window:
# For example, if the window is from i=1000 to i=11000,
# midpoint = 12000/2 = 6000
midpoint = (i + i + windowsize)/2
midpoints.append(midpoint)
if makePlot:
# Call the plotting function
midpoints2 = [x/1000 for x in midpoints] # list comprehension
myscatterplot(midpoints2,freqs,'Base-pair position (kb)','%') # Convert to kb for plotting
# Return the results
return(midpoints,freqs)
#====================================================================#
def myscatterplot(xvar,ydict,xlab,ylab):
"""Makes a scatterplot of y-variable(s) against an x-variable"""
# See http://matplotlib.sourceforge.net/examples/api/date_demo.html
# for where I got the plotting code from
fig = plt.figure()
ax = fig.add_subplot(111)
yvarnames = []
for yvar in ydict:
yvarnames.append(yvar)
ax.plot(xvar,ydict[yvar])
ax.grid(True)
plt.xlabel(xlab)
plt.ylabel(ylab)
ax.legend(yvarnames)
plt.grid(True)
plt.show()
#====================================================================#
# Agrees with result at http://www.computational-genomics.net/case_studies/haemophilus_demo.html
# Checked this works on a very short sequence.
def ntdensity2(seq,windowsize,verbose=False,jumpsize=1000,makePlot=False):
"""Plots the base frequency along a sequence using a sliding window
>>> ntdensity2("CGGAAAGATGATGGAACGTT",4,jumpsize=1) == ([2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0], {'A': [25.0, 50.0, 75.0, 75.0, 75.0, 50.0, 25.0, 50.0, 25.0, 25.0, 25.0, 25.0, 50.0, 50.0, 50.0, 25.0, 0.0], 'C': [25.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 25.0, 25.0, 25.0, 25.0], 'T': [0.0, 0.0, 0.0, 0.0, 0.0, 25.0, 25.0, 25.0, 50.0, 25.0, 25.0, 25.0, 0.0, 0.0, 0.0, 25.0, 50.0], 'G': [50.0, 50.0, 25.0, 25.0, 25.0, 25.0, 50.0, 25.0, 25.0, 50.0, 50.0, 50.0, 50.0, 25.0, 25.0, 25.0, 25.0]})
True
"""
length = len(seq)
# make a dictionary to contain four empty lists
freqs = { "A": [], "T": [], "G": [], "C": [] }
myset = ("A", "C", "T", "G")
midpoints = []
# Move the window by jumpsize bp at a time.
for i in range(0,length-windowsize+1,jumpsize):
subseq = seq[i:i+windowsize]
if verbose:
start = i
end = i+windowsize
print("start %d end %d subseq is %s length %d windowsize %d" % (start,end,subseq,length,windowsize))
assert len(subseq)==windowsize, "ERROR: ntdensity2: length of subseq is not windowsize"
for letter in myset:
num = subseq.count(letter)
pc = 100 * num/windowsize
freqs[letter].append(pc)
# Find the mid-point of the window:
# For example, if the window is from i=1000 to i=11000,
# midpoint = 12000/2 = 6000
midpoint = (i + i + windowsize)/2
midpoints.append(midpoint)
if makePlot:
# Call the plotting function
midpoints2 = [x/1000 for x in midpoints] # list comprehension
myscatterplot(midpoints2,freqs,'Base-pair position (kb)','%') # Convert to kb for plotting
# Return the results:
return(midpoints,freqs)
#====================================================================#
def main():
Hflu = getgenbank("16271976")
dimersignature(Hflu.seq)
#gag = getgenbank("223601064")
#dimergensign(gag.seq)
#====================================================================#
def runtests():
import doctest
doctest.testmod()
#====================================================================#
if __name__=="__main__":
if len(sys.argv) == 2 and sys.argv[1] == "test":
runtests()
sys.exit(0)
main()
#====================================================================#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment