Skip to content

Instantly share code, notes, and snippets.

@matsen
Last active February 9, 2016 04:11
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save matsen/a28a737269552a2a022a to your computer and use it in GitHub Desktop.
Save matsen/a28a737269552a2a022a to your computer and use it in GitHub Desktop.
A simple script to render a molecular sequence file as a PNG, with a specified number of pixels per symbol.
#!/usr/bin/env python
"""
Turn a sequence file into a PNG, with a specified number of pixels per symbol.
Gaps are shown in light gray.
Unknown symbols, including N and X, are shown in pink.
Requires matplotlib, seqmagick, and their dependencies.
"""
import argparse
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import numpy as np
import os
import seqmagick
from Bio import SeqIO
from matplotlib import colors
from seqmagick.fileformat import from_filename
aa_symbols = list('-ACDEFGHIKLMNPQRSTVWY')
old_paper = '#F8F4DA'
# Default colors from
# http://www.bioinformatics.org/sms/multi_align.html
aa_colors = [
old_paper,
'#999999',
'#FFFF00',
'#33CCFF',
'#33CCFF',
'#FF9900',
'#999999',
'#CC0000',
'#FF6666',
'#CC0000',
'#FF6666',
'#3366FF',
'#3366FF',
'#CC33CC',
'#3366FF',
'#CC0000',
'#999999',
'#3366FF',
'#FF6666',
'#FF9900',
'#FF9900',
'pink',
'white'
]
nt_symbols = list('-AGCT')
nt_colors = [old_paper, 'red', 'yellow', 'blue', 'green', 'pink', 'white']
hipster_colors = \
[old_paper, '#e7298a', '#d95f02','#7570b3', '#1b9e77', 'pink', 'white']
def array_of_sequence_file(path, symbol_d):
unknown_symbol_index = max(symbol_d.values())+1 # Unknown is pink.
aln_ll = [[symbol_d.get(b, unknown_symbol_index) for b in list(str(s.seq))]
for s in SeqIO.parse(path, from_filename(path))]
m = np.zeros((len(aln_ll), max(len(r) for r in aln_ll)), dtype=np.int)
m[:,:] = unknown_symbol_index+1 # Default is white.
for i, r in enumerate(aln_ll):
m[i, 0:len(r)] = r
return m
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Make a PNG of a sequence file.")
parser.add_argument('in_path', type=str,
help="Sequence file name.")
parser.add_argument(
'-o', type=str, default=None,
help="Output file name. Default is the same name, with .png")
parser.add_argument(
'-p', type=int, default=3, help="Pixels on a side per symbol.")
parser.add_argument(
'--aa', action='store_true', default=False, help="Amino acid sequences.")
parser.add_argument(
'--hipster', action='store_true', default=False,
help="Use hipster colors for nucleotides.")
args = parser.parse_args()
pixels_per_symbol = args.p
nominal_dpi = 100
factor = float(pixels_per_symbol) / nominal_dpi
if args.o is None:
base, _ = os.path.splitext(args.in_path)
args.o = base + '.png'
if args.aa:
symbols, symbol_colors = aa_symbols, aa_colors
elif args.hipster:
symbols, symbol_colors = nt_symbols, hipster_colors
else:
symbols, symbol_colors = nt_symbols, nt_colors
symbol_d = {c: np.int(n) for n, c in enumerate(symbols)}
symbol_d['.'] = symbol_d['-']
if not args.aa:
symbol_d['U'] = symbol_d['T']
for s in symbols:
symbol_d[s.lower()] = symbol_d[s]
# make a color map of fixed colors
# http://stackoverflow.com/questions/9707676/defining-a-discrete-colormap-for-imshow-in-matplotlib
cmap = colors.ListedColormap(symbol_colors)
bounds = np.array(range(len(symbol_colors)+1))-0.5
norm = colors.BoundaryNorm(bounds, cmap.N)
m = array_of_sequence_file(args.in_path, symbol_d)
fig, ax = plt.subplots(
figsize=(m.shape[1] * factor, m.shape[0] * factor), dpi=nominal_dpi)
ax.imshow(m, aspect='equal', interpolation='nearest', origin='upper',
cmap=cmap, norm=norm)
for x in 'top bottom left right'.split():
ax.spines[x].set_visible(False)
fig.savefig(args.o, dpi=nominal_dpi, bbox_inches='tight')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment