avrilcoghlan/haemophilus1.py

## haemophilus1.py
import os
import sys
from Bio import Entrez
from Bio import SeqIO
import matplotlib.pyplot as plt

#====================================================================#
# Checked and seems to work fine.
# Takes the GenBank accession number (GI number) as input.
# First checks to see if the sequence has been stored locally as a file.
# If it hasn't, the sequence is downloaded and stored locally as a file.
def getgenbank(accession):
    """Gets a sequence for a particular accession (GI number) from GenBank, and
    saves it in a file gi_<accession>

    >>> getgenbank("226880621")
    Parsing filename gi_226880621...
    SeqRecord(seq=Seq('CGGAAAGATGATGGAACGTT', SingleLetterAlphabet()), id='gi|226880621|emb|GN082338.1|', name='gi|226880621|emb|GN082338.1|', description='gi|226880621|emb|GN082338.1| Sequence 6 from Patent WO2009030679', dbxrefs=[])

    """
    Entrez.email = "alc@sanger.ac.uk"
    filename = "gi_%s"% accession
    if not os.path.isfile(filename):
        print("Downloading %s..." % filename)
        net_handle = Entrez.efetch(db="nucleotide", id=accession, rettype="fasta")
        out_handle = open(filename, "w")
        out_handle.write(net_handle.read())
        out_handle.close()
        net_handle.close()
        print("Saved %s" % filename)
    print("Parsing filename %s..." % filename)
    record = SeqIO.read(open(filename),"fasta")
    return record

#====================================================================#
# Agrees with output from Matlab at http://www.computational-genomics.net/case_studies/haemophilus_demo.html
# Checked and seems to work fine.
def basecount(seq,useall=False,calcfreqs=False,verbose=False):
    """Prints out the number or frequency of each symbol in a DNA sequence

    # Followed ideas at http://bugs.python.org/issue3332 and
    # http://stackoverflow.com/questions/364519/in-python-how-to-i-iterate-over-a-dictionary-in-sorted-order
    # for testing the hash contains what it should:
    >>> basecount("CT") == {'A': 0, 'C': 1, 'G': 0, 'T': 1}
    True
    >>> basecount("") == {'A': 0, 'C': 0, 'G': 0, 'T': 0}
    True
    >>> basecount("CGGAAAGATGATGGAACGTT") == {'A': 7, 'C': 2, 'G': 7, 'T': 4}
    True
    >>> basecount("CGGAAAGATGATGGAACGTT",calcfreqs=True) == { 'A': 0.35, 'C': 0.1, 'G': 0.35, 'T': 0.2}
    True

    """
    length = len(seq)
    if verbose:
        print("The sequence is %d base-pairs long" % length)
    if calcfreqs:
        # Make a dictionary "myfreq" to contain the frequency of each base.
        myfreq = {}
    else:
        # Make a dictionary "mynum" to contain the number of occurrences of each base.
        mynum = {}
    # If we want to look at every letter that appears in the sequence:
    if useall:
        myset = set(seq)
    # If we just want to look at the four bases A,C,G,T:
    else:
        myset = ("A", "C", "T", "G")
    for letter in myset:
        num = seq.count(letter)
        if calcfreqs:
            # Note that the frequency is calculated out of the total
            # sequence length, even though some bases are not A/G/C/T
            freq = num/length
            myfreq[letter] = freq
            if verbose:
                print("The frequency of %s is %.2f" % (letter, freq))
        else:
            mynum[letter] = num
            if verbose:
                print("There are %d %s" % (num, letter))
    if calcfreqs:
        return myfreq
    else:
        return mynum

#====================================================================#
# Checked and seems to work fine.
# Agrees with result at http://www.computational-genomics.net/case_studies/haemophilus_demo.html
def dimercount(myseq,verbose=False,calcfreqs=False):
    """Prints out the frequency of dimers in a long DNA sequence

    >>> dimercount("CGGAAAGATGATGGAACGTT") == {'AA': 3, 'AC': 1, 'AG': 1, 'AT': 2, 'GA': 4, 'GG': 2, 'GT': 1, 'CG': 2, 'TG': 2, 'TT': 1}
    True
    >>> dimercount("CGGAAAGATGATGGAACGTT",calcfreqs=True) == {'GG': 0.10526315789473684, 'CG': 0.10526315789473684, 'AG': 0.05263157894736842, 'TT': 0.05263157894736842, 'AA': 0.15789473684210525, 'AC': 0.05263157894736842, 'GT': 0.05263157894736842, 'AT': 0.10526315789473684, 'GA': 0.21052631578947367, 'TG': 0.10526315789473684}
    True

    """
    # Normally we pass in a sequence object 'myseq', but in the doc-test
    # above, I just pass in a string:
    try:
        seq = myseq.tostring()
    except AttributeError:
        seq = myseq
    length = len(seq)
    # Define the set of dimers we are interested in
    myset = ("AA", "AC", "AG", "AT", "GA", "GC", "GG", "GT",
             "CA", "CC", "CG", "CT", "TA", "TC", "TG", "TT")
    # Make a dictionary "mynum" to contain the number of occurrences of each dimer.
    mynum = {}
    if calcfreqs:
        # Make a dictionary "myfreq" to contain the frequency of each dimer.
        myfreq = {}
    # Move the window by jumpsize bp at a time
    windowsize = 2
    jumpsize = 1
    for i in range (0, length-windowsize+1, jumpsize):
        subseq = seq[i:i+windowsize]
        # if subseq is a member of set 'myset':
        if subseq in myset:
            if subseq in mynum: # if subseq is already in dictionary 'mynum':
                mynum[subseq] += 1
            else:
                mynum[subseq] = 1
    for dimer in myset:
    # The total number of dimers in a sequence is length-1.
    # Note that the dimer frequency is given out of length-1, even
    # though some of the dimers in the sequence are not those in
    # "myset" above, because of non-A/G/C/T bases.
        if dimer in mynum: # if dimer is already in dictionary 'mynum':
            pc = mynum[dimer]/(length-1)
            if calcfreqs:
                myfreq[dimer] = pc
            if verbose:
                print("The frequency of %s is %d (fraction %f)" % (dimer, mynum[dimer], pc))
    if calcfreqs:
        return myfreq
    else:
        return mynum

#====================================================================#
# Checked this works on a very short sequence.
# Agrees with result at http://www.computational-genomics.net/case_studies/haemophilus_demo.html
def ntdensity1(seq,windowsize,verbose=False,jumpsize=1000,makePlot=False):
    """Plots the G+C content along a sequence using a sliding window

    >>> ntdensity1("CGGAAAGATGATGGAACGTT",4,verbose=False,jumpsize=1) == ([2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0], {'G+C': [75.0, 50.0, 25.0, 25.0, 25.0, 25.0, 50.0, 25.0, 25.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0], 'A+T': [25.0, 50.0, 75.0, 75.0, 75.0, 75.0, 50.0, 75.0, 75.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0]})
    True

    """
    length = len(seq)
    # Make a dictionary to contain two empty lists
    freqs = { "G+C": [], "A+T": [] }
    myset = ("A+T", "G+C")
    # Note: instead of storing G+C, A+T in a hash, could just have coded G+C=0, A+T=1, and stored these values in arrays.
    midpoints = []
    # Move the window by jumpsize bp at a time.
    # The first window we look at is from i=0 to i=windowsize.
    # For example, if the sequence is 30000 bases long, windowsize=10000.
    # In the first loop, we look at i=0 to i=10000.
    # In the second loop, we look at i=1000 to i=11000. ...
    # In the last loop, we look at i=29000 to i=30000.
    # Note: for i = range(0,10) goes from i=0...9.
    for i in range(0,length-windowsize+1,jumpsize):
        subseq = seq[i:i+windowsize]
        if verbose:
            start = i
            end = i+windowsize
            print("start %d end %d subseq is %s length %d windowsize %d" % (start,end,subseq,length,windowsize))
        assert len(subseq)==windowsize, "ERROR: ntdensity1: length of subseq is not windowsize"
        for dimer in myset:
            letter1 = dimer[0:1]
            letter2 = dimer[2:3]
            num1 = subseq.count(letter1)
            num2 = subseq.count(letter2)
            num = num1 + num2
            pc = (100 * num)/windowsize
            freqs[dimer].append(pc)
        # Find the mid-point of the window:
        # For example, if the window is from i=1000 to i=11000,
        # midpoint = 12000/2 = 6000
        midpoint = (i + i + windowsize)/2
        midpoints.append(midpoint)
    if makePlot:
        # Call the plotting function
    	midpoints2 = [x/1000 for x in midpoints] # list comprehension
    	myscatterplot(midpoints2,freqs,'Base-pair position (kb)','%') # Convert to kb for plotting
    # Return the results
    return(midpoints,freqs)

#====================================================================#
def myscatterplot(xvar,ydict,xlab,ylab):
    """Makes a scatterplot of y-variable(s) against an x-variable"""
    # See http://matplotlib.sourceforge.net/examples/api/date_demo.html
    # for where I got the plotting code from
    fig = plt.figure()
    ax = fig.add_subplot(111)
    yvarnames = []
    for yvar in ydict:
        yvarnames.append(yvar)
        ax.plot(xvar,ydict[yvar])
    ax.grid(True)
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    ax.legend(yvarnames)
    plt.grid(True)
    plt.show()

#====================================================================#
# Agrees with result at http://www.computational-genomics.net/case_studies/haemophilus_demo.html
# Checked this works on a very short sequence.
def ntdensity2(seq,windowsize,verbose=False,jumpsize=1000,makePlot=False):
    """Plots the base frequency along a sequence using a sliding window

    >>> ntdensity2("CGGAAAGATGATGGAACGTT",4,jumpsize=1) == ([2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0], {'A': [25.0, 50.0, 75.0, 75.0, 75.0, 50.0, 25.0, 50.0, 25.0, 25.0, 25.0, 25.0, 50.0, 50.0, 50.0, 25.0, 0.0], 'C': [25.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 25.0, 25.0, 25.0, 25.0], 'T': [0.0, 0.0, 0.0, 0.0, 0.0, 25.0, 25.0, 25.0, 50.0, 25.0, 25.0, 25.0, 0.0, 0.0, 0.0, 25.0, 50.0], 'G': [50.0, 50.0, 25.0, 25.0, 25.0, 25.0, 50.0, 25.0, 25.0, 50.0, 50.0, 50.0, 50.0, 25.0, 25.0, 25.0, 25.0]})
    True

    """
    length = len(seq)
    # make a dictionary to contain four empty lists
    freqs = { "A": [], "T": [], "G": [], "C": [] }
    myset = ("A", "C", "T", "G")
    midpoints = []
    # Move the window by jumpsize bp at a time.
    for i in range(0,length-windowsize+1,jumpsize):
        subseq = seq[i:i+windowsize]
        if verbose:
            start = i
            end = i+windowsize
            print("start %d end %d subseq is %s length %d windowsize %d" % (start,end,subseq,length,windowsize))
        assert len(subseq)==windowsize, "ERROR: ntdensity2: length of subseq is not windowsize"
        for letter in myset:
            num = subseq.count(letter)
            pc = 100 * num/windowsize
            freqs[letter].append(pc)
        # Find the mid-point of the window:
        # For example, if the window is from i=1000 to i=11000,
        # midpoint = 12000/2 = 6000
        midpoint = (i + i + windowsize)/2
        midpoints.append(midpoint)
    if makePlot:
        # Call the plotting function
        midpoints2 = [x/1000 for x in midpoints] # list comprehension
        myscatterplot(midpoints2,freqs,'Base-pair position (kb)','%') # Convert to kb for plotting
    # Return the results:
    return(midpoints,freqs)

#====================================================================#

def main():
    Hflu = getgenbank("16271976")
    dimersignature(Hflu.seq)
    #gag = getgenbank("223601064")
    #dimergensign(gag.seq)

#====================================================================#

def runtests():
    import doctest
    doctest.testmod()

#====================================================================#

if __name__=="__main__":
    if len(sys.argv) == 2 and sys.argv[1] == "test":
        runtests()
        sys.exit(0)
    main()

#====================================================================#
	import os
	import sys
	from Bio import Entrez
	from Bio import SeqIO
	import matplotlib.pyplot as plt

	#====================================================================#
	# Checked and seems to work fine.
	# Takes the GenBank accession number (GI number) as input.
	# First checks to see if the sequence has been stored locally as a file.
	# If it hasn't, the sequence is downloaded and stored locally as a file.
	def getgenbank(accession):
	"""Gets a sequence for a particular accession (GI number) from GenBank, and
	saves it in a file gi_<accession>

	>>> getgenbank("226880621")
	Parsing filename gi_226880621...
	SeqRecord(seq=Seq('CGGAAAGATGATGGAACGTT', SingleLetterAlphabet()), id='gi\|226880621\|emb\|GN082338.1\|', name='gi\|226880621\|emb\|GN082338.1\|', description='gi\|226880621\|emb\|GN082338.1\| Sequence 6 from Patent WO2009030679', dbxrefs=[])

	"""
	Entrez.email = "alc@sanger.ac.uk"
	filename = "gi_%s"% accession
	if not os.path.isfile(filename):
	print("Downloading %s..." % filename)
	net_handle = Entrez.efetch(db="nucleotide", id=accession, rettype="fasta")
	out_handle = open(filename, "w")
	out_handle.write(net_handle.read())
	out_handle.close()
	net_handle.close()
	print("Saved %s" % filename)
	print("Parsing filename %s..." % filename)
	record = SeqIO.read(open(filename),"fasta")
	return record

	#====================================================================#
	# Agrees with output from Matlab at http://www.computational-genomics.net/case_studies/haemophilus_demo.html
	# Checked and seems to work fine.
	def basecount(seq,useall=False,calcfreqs=False,verbose=False):
	"""Prints out the number or frequency of each symbol in a DNA sequence

	# Followed ideas at http://bugs.python.org/issue3332 and
	# http://stackoverflow.com/questions/364519/in-python-how-to-i-iterate-over-a-dictionary-in-sorted-order
	# for testing the hash contains what it should:
	>>> basecount("CT") == {'A': 0, 'C': 1, 'G': 0, 'T': 1}
	True
	>>> basecount("") == {'A': 0, 'C': 0, 'G': 0, 'T': 0}
	True
	>>> basecount("CGGAAAGATGATGGAACGTT") == {'A': 7, 'C': 2, 'G': 7, 'T': 4}
	True
	>>> basecount("CGGAAAGATGATGGAACGTT",calcfreqs=True) == { 'A': 0.35, 'C': 0.1, 'G': 0.35, 'T': 0.2}
	True

	"""
	length = len(seq)
	if verbose:
	print("The sequence is %d base-pairs long" % length)
	if calcfreqs:
	# Make a dictionary "myfreq" to contain the frequency of each base.
	myfreq = {}
	else:
	# Make a dictionary "mynum" to contain the number of occurrences of each base.
	mynum = {}
	# If we want to look at every letter that appears in the sequence:
	if useall:
	myset = set(seq)
	# If we just want to look at the four bases A,C,G,T:
	else:
	myset = ("A", "C", "T", "G")
	for letter in myset:
	num = seq.count(letter)
	if calcfreqs:
	# Note that the frequency is calculated out of the total
	# sequence length, even though some bases are not A/G/C/T
	freq = num/length
	myfreq[letter] = freq
	if verbose:
	print("The frequency of %s is %.2f" % (letter, freq))
	else:
	mynum[letter] = num
	if verbose:
	print("There are %d %s" % (num, letter))
	if calcfreqs:
	return myfreq
	else:
	return mynum

	#====================================================================#
	# Checked and seems to work fine.
	# Agrees with result at http://www.computational-genomics.net/case_studies/haemophilus_demo.html
	def dimercount(myseq,verbose=False,calcfreqs=False):
	"""Prints out the frequency of dimers in a long DNA sequence

	>>> dimercount("CGGAAAGATGATGGAACGTT") == {'AA': 3, 'AC': 1, 'AG': 1, 'AT': 2, 'GA': 4, 'GG': 2, 'GT': 1, 'CG': 2, 'TG': 2, 'TT': 1}
	True
	>>> dimercount("CGGAAAGATGATGGAACGTT",calcfreqs=True) == {'GG': 0.10526315789473684, 'CG': 0.10526315789473684, 'AG': 0.05263157894736842, 'TT': 0.05263157894736842, 'AA': 0.15789473684210525, 'AC': 0.05263157894736842, 'GT': 0.05263157894736842, 'AT': 0.10526315789473684, 'GA': 0.21052631578947367, 'TG': 0.10526315789473684}
	True

	"""
	# Normally we pass in a sequence object 'myseq', but in the doc-test
	# above, I just pass in a string:
	try:
	seq = myseq.tostring()
	except AttributeError:
	seq = myseq
	length = len(seq)
	# Define the set of dimers we are interested in
	myset = ("AA", "AC", "AG", "AT", "GA", "GC", "GG", "GT",
	"CA", "CC", "CG", "CT", "TA", "TC", "TG", "TT")
	# Make a dictionary "mynum" to contain the number of occurrences of each dimer.
	mynum = {}
	if calcfreqs:
	# Make a dictionary "myfreq" to contain the frequency of each dimer.
	myfreq = {}
	# Move the window by jumpsize bp at a time
	windowsize = 2
	jumpsize = 1
	for i in range (0, length-windowsize+1, jumpsize):
	subseq = seq[i:i+windowsize]
	# if subseq is a member of set 'myset':
	if subseq in myset:
	if subseq in mynum: # if subseq is already in dictionary 'mynum':
	mynum[subseq] += 1
	else:
	mynum[subseq] = 1
	for dimer in myset:
	# The total number of dimers in a sequence is length-1.
	# Note that the dimer frequency is given out of length-1, even
	# though some of the dimers in the sequence are not those in
	# "myset" above, because of non-A/G/C/T bases.
	if dimer in mynum: # if dimer is already in dictionary 'mynum':
	pc = mynum[dimer]/(length-1)
	if calcfreqs:
	myfreq[dimer] = pc
	if verbose:
	print("The frequency of %s is %d (fraction %f)" % (dimer, mynum[dimer], pc))
	if calcfreqs:
	return myfreq
	else:
	return mynum

	#====================================================================#
	# Checked this works on a very short sequence.
	# Agrees with result at http://www.computational-genomics.net/case_studies/haemophilus_demo.html
	def ntdensity1(seq,windowsize,verbose=False,jumpsize=1000,makePlot=False):
	"""Plots the G+C content along a sequence using a sliding window

	>>> ntdensity1("CGGAAAGATGATGGAACGTT",4,verbose=False,jumpsize=1) == ([2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0], {'G+C': [75.0, 50.0, 25.0, 25.0, 25.0, 25.0, 50.0, 25.0, 25.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0], 'A+T': [25.0, 50.0, 75.0, 75.0, 75.0, 75.0, 50.0, 75.0, 75.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0]})
	True

	"""
	length = len(seq)
	# Make a dictionary to contain two empty lists
	freqs = { "G+C": [], "A+T": [] }
	myset = ("A+T", "G+C")
	# Note: instead of storing G+C, A+T in a hash, could just have coded G+C=0, A+T=1, and stored these values in arrays.
	midpoints = []
	# Move the window by jumpsize bp at a time.
	# The first window we look at is from i=0 to i=windowsize.
	# For example, if the sequence is 30000 bases long, windowsize=10000.
	# In the first loop, we look at i=0 to i=10000.
	# In the second loop, we look at i=1000 to i=11000. ...
	# In the last loop, we look at i=29000 to i=30000.
	# Note: for i = range(0,10) goes from i=0...9.
	for i in range(0,length-windowsize+1,jumpsize):
	subseq = seq[i:i+windowsize]
	if verbose:
	start = i
	end = i+windowsize
	print("start %d end %d subseq is %s length %d windowsize %d" % (start,end,subseq,length,windowsize))
	assert len(subseq)==windowsize, "ERROR: ntdensity1: length of subseq is not windowsize"
	for dimer in myset:
	letter1 = dimer[0:1]
	letter2 = dimer[2:3]
	num1 = subseq.count(letter1)
	num2 = subseq.count(letter2)
	num = num1 + num2
	pc = (100 * num)/windowsize
	freqs[dimer].append(pc)
	# Find the mid-point of the window:
	# For example, if the window is from i=1000 to i=11000,
	# midpoint = 12000/2 = 6000
	midpoint = (i + i + windowsize)/2
	midpoints.append(midpoint)
	if makePlot:
	# Call the plotting function
	midpoints2 = [x/1000 for x in midpoints] # list comprehension
	myscatterplot(midpoints2,freqs,'Base-pair position (kb)','%') # Convert to kb for plotting
	# Return the results
	return(midpoints,freqs)

	#====================================================================#
	def myscatterplot(xvar,ydict,xlab,ylab):
	"""Makes a scatterplot of y-variable(s) against an x-variable"""
	# See http://matplotlib.sourceforge.net/examples/api/date_demo.html
	# for where I got the plotting code from
	fig = plt.figure()
	ax = fig.add_subplot(111)
	yvarnames = []
	for yvar in ydict:
	yvarnames.append(yvar)
	ax.plot(xvar,ydict[yvar])
	ax.grid(True)
	plt.xlabel(xlab)
	plt.ylabel(ylab)
	ax.legend(yvarnames)
	plt.grid(True)
	plt.show()

	#====================================================================#
	# Agrees with result at http://www.computational-genomics.net/case_studies/haemophilus_demo.html
	# Checked this works on a very short sequence.
	def ntdensity2(seq,windowsize,verbose=False,jumpsize=1000,makePlot=False):
	"""Plots the base frequency along a sequence using a sliding window

	>>> ntdensity2("CGGAAAGATGATGGAACGTT",4,jumpsize=1) == ([2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0], {'A': [25.0, 50.0, 75.0, 75.0, 75.0, 50.0, 25.0, 50.0, 25.0, 25.0, 25.0, 25.0, 50.0, 50.0, 50.0, 25.0, 0.0], 'C': [25.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 25.0, 25.0, 25.0, 25.0], 'T': [0.0, 0.0, 0.0, 0.0, 0.0, 25.0, 25.0, 25.0, 50.0, 25.0, 25.0, 25.0, 0.0, 0.0, 0.0, 25.0, 50.0], 'G': [50.0, 50.0, 25.0, 25.0, 25.0, 25.0, 50.0, 25.0, 25.0, 50.0, 50.0, 50.0, 50.0, 25.0, 25.0, 25.0, 25.0]})
	True

	"""
	length = len(seq)
	# make a dictionary to contain four empty lists
	freqs = { "A": [], "T": [], "G": [], "C": [] }
	myset = ("A", "C", "T", "G")
	midpoints = []
	# Move the window by jumpsize bp at a time.
	for i in range(0,length-windowsize+1,jumpsize):
	subseq = seq[i:i+windowsize]
	if verbose:
	start = i
	end = i+windowsize
	print("start %d end %d subseq is %s length %d windowsize %d" % (start,end,subseq,length,windowsize))
	assert len(subseq)==windowsize, "ERROR: ntdensity2: length of subseq is not windowsize"
	for letter in myset:
	num = subseq.count(letter)
	pc = 100 * num/windowsize
	freqs[letter].append(pc)
	# Find the mid-point of the window:
	# For example, if the window is from i=1000 to i=11000,
	# midpoint = 12000/2 = 6000
	midpoint = (i + i + windowsize)/2
	midpoints.append(midpoint)
	if makePlot:
	# Call the plotting function
	midpoints2 = [x/1000 for x in midpoints] # list comprehension
	myscatterplot(midpoints2,freqs,'Base-pair position (kb)','%') # Convert to kb for plotting
	# Return the results:
	return(midpoints,freqs)

	#====================================================================#

	def main():
	Hflu = getgenbank("16271976")
	dimersignature(Hflu.seq)
	#gag = getgenbank("223601064")
	#dimergensign(gag.seq)

	#====================================================================#

	def runtests():
	import doctest
	doctest.testmod()

	#====================================================================#

	if __name__=="__main__":
	if len(sys.argv) == 2 and sys.argv[1] == "test":
	runtests()
	sys.exit(0)
	main()

	#====================================================================#