Last active
July 6, 2017 13:45
-
-
Save fransua/d6b8720e1dd74ed4c06c6d8134a2ab78 to your computer and use it in GitHub Desktop.
Sequence logo drawer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from matplotlib import pyplot as plt | |
import numpy as np | |
from itertools import izip_longest | |
def seq_logo(seqs, title='', axe=None, quality=1, plot=True, ylim=(0, 2), lwmodif=1.25, | |
errorbar=True, savefig=None): | |
""" | |
Draw a sequence motif according to an input list of sequences, | |
https://en.wikipedia.org/wiki/Sequence_logo | |
:params seqs: list of sequences | |
:params '' title: title of the plot | |
:params None axe: matplotlib axe object | |
:params 1 quality: quality modifier | |
:params None savefig: if set save the plot to the file at the specified path | |
(the format of the file "pdf", "png"... will be infered from the file extension) | |
:params 1.25 lwmodif: modifier for linewith of the drown letters. | |
:params True plot: draw plot, otherwise only returns proportions. | |
:params True errorbar: draw error bar, that are equal to the correction for | |
small samples | |
:returns: a table (list of list) with the proportions of each nucleotide at | |
each position | |
""" | |
xoffset = 0.003 * quality | |
yoffset = 0.010 * quality | |
lwmodif = 1.25 | |
letterw = 0.75 | |
qlmodif = lwmodif * quality / 5. | |
halfw = letterw / 2. | |
letters = ['A', 'T', 'G', 'C', 'N'] | |
nseq = float(len(seqs)) | |
seq_count = [dict([(let, col.count(let) / nseq) for let in letters]) | |
for col in izip_longest(*seqs, fillvalue='N')] | |
if not plot: | |
return seq_count | |
def lw(val): | |
return ((val + np.log(val) + 7) * qlmodif) if val else 0. | |
def A(x, y, h): | |
if yoffset < h: | |
y += yoffset | |
h -= yoffset | |
axe.plot([x, x + halfw, x + letterw ], [y, y + h, y], color='green', lw=lw(h)) | |
axe.plot([x + letterw * 1 / 4, x + letterw * 3 / 4], [y + h * 1 / 3] * 2, | |
color='green', lw=lw(h)) | |
def T(x, y, h): | |
if yoffset < h: | |
y += yoffset | |
h -= yoffset | |
axe.plot([x + halfw ] * 2, [y, y + h] , color='red', lw=lw(h)) | |
axe.plot([x, x + letterw] , [y + h] * 2, color='red', lw=lw(h)) | |
Gcircle = np.linspace(np.pi * 0.25, np.pi * 1.9, 20 * quality) | |
xG = np.cos(Gcircle) | |
yG = np.sin(Gcircle) | |
def G(x, y, h): | |
if yoffset < h: | |
y += yoffset | |
h -= yoffset | |
halfh = h / 2. | |
xGw = x + xG * halfw + halfw | |
yGh = y + yG * halfh + halfh | |
axe.plot(xGw, yGh, color='orange', lw=lw(h)) | |
axe.plot([xGw[-1] , xGw[-1] , xGw[-1] - letterw * 0.4 ], | |
[yGh[-1] - h * 0.3, yGh[-1] + h * 0.05, yGh[-1] + h * 0.05], | |
color='orange', lw=lw(h)) | |
Ccircle = np.linspace(np.pi * 0.2, np.pi * 1.8, 20 * quality) | |
xC = np.cos(Ccircle) | |
yC = np.sin(Ccircle) | |
def C(x, y, h): | |
if yoffset < h: | |
y += yoffset | |
h -= yoffset | |
axe.plot(x + xC * halfw + halfw, | |
y + yC * h / 2. + h / 2., color='blue', lw=lw(h)) | |
def N(x, y, h): | |
if yoffset < h: | |
y += yoffset | |
h -= yoffset | |
axe.plot([x, x, x + letterw, x + letterw], [y, y + h, y, y + h], color='grey', lw=lw(h)) | |
if not axe: | |
fig = plt.figure(figsize=(len(seq_count) * 0.15 * quality, 1.75 * quality)) | |
axe = fig.add_subplot(111) | |
draw_letter_functions = {'A':A, 'T': T, 'G': G, 'C': C, 'N': N} | |
en = 1. / np.log(2) * (4 - 1.) / (2. * nseq) | |
for npos, position in enumerate(seq_count): | |
offset = 0 | |
ri = sum(fi * np.log2(fi) for fi in position.values() if fi) - en | |
for letter, f in sorted(position.iteritems(), key=lambda x: x[1]): | |
h = f * (2. + ri) | |
x = npos + xoffset | |
y = offset | |
draw_letter_functions[letter](x, y, h) | |
if errorbar: | |
plt.plot([x + halfw, x + halfw], [y + h + en, max(y + h - en, 0)], | |
color='grey') | |
plt.plot([x + halfw - 0.1, x + halfw + 0.1], [y + h + en, y + h + en], | |
color='grey') | |
if y + h - en > 0: | |
plt.plot([x + halfw - 0.1, x + halfw + 0.1], [y + h - en] * 2, | |
color='grey') | |
offset += h | |
# axes | |
axe.axis('off') | |
axe.plot([-0.4, -0.1, -0.1, -0.4], [ylim[0], ylim[0], ylim[1], ylim[1]], color='k') | |
for i in range(3): | |
axe.text(-0.5, ylim[0] + (ylim[1] - ylim[0]) / 2. * i, | |
str(ylim[0] + (ylim[1] - ylim[0]) / 2. * i), | |
va='center', ha='right', size=4.5 * quality) | |
axe.text(ylim[0] - 1.5, 1, 'bits', va='center', ha='right', rotation=90, size=7 * quality) | |
axe.plot([-0.4, -0.1], [(ylim[1] - ylim[0]) / 2.] * 2, color='k') | |
for i in xrange(1, npos + 2): | |
axe.text(i - 0.5, ylim[0] - 0.05, str(i), va='top', ha='center', | |
size=4.5 * quality, rotation=90) | |
_ = axe.set_xlim(-1, npos + 1.) | |
_ = axe.set_ylim(ylim[0] - .5, ylim[1] + 3 * yoffset + 0.5) | |
# title | |
axe.set_title(title) | |
if savefig: | |
plt.savefig(savefig, format=savefig.split('.')[-1]) | |
return seq_count | |
def test_it(): | |
from random import random | |
seqs = [] | |
for i in xrange(15): | |
rnd_nts = ['ATGCATGCATGCN'[int(random()*13)] for _ in xrange(3)] | |
seqs.append([rnd_nts[int(random()*3)] for _ in xrange(40)] + ['ATGC'[int(random()*4)] for _ in xrange(10)]) | |
seqs = [''.join(s) for s in zip(*seqs)] | |
prop_matrix = seq_logo(seqs, quality=4, savefig='random_logo.png', errorbar=True) |
Author
fransua
commented
Sep 5, 2016
•
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment