Skip to content

Instantly share code, notes, and snippets.

@joefutrelle
Last active September 30, 2017 14:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joefutrelle/c4c84c1b5cdda585bc1b611642b26142 to your computer and use it in GitHub Desktop.
Save joefutrelle/c4c84c1b5cdda585bc1b611642b26142 to your computer and use it in GitHub Desktop.
bokeh plotting of proteomics data (checkpointing)
# coding: utf-8
import pandas as pd
import numpy as np
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.io import curdoc
from bokeh.layouts import row, widgetbox
from bokeh.models.widgets import Select, Slider, TextInput
from bokeh.models import HoverTool, CustomJS
import os
# at the top of your file
import logging
logger = logging.getLogger(__name__)
MAX_PROFILES = 200
# proteomz_v0-1 needs to be transposed for this
df = pd.read_csv('proteomz_V0-3_stnmetadata.csv',low_memory=False)
# extract metadata section of spreadsheet containing station and depth information
cruise_metadata = df[df.columns[:11]][:103]
# extract metadata section of spreadsheet containing prot id information
data = df[103:]
data.index=data.pop('ID')
for col in data.columns[:10]:
data.pop(col)
prot_metadata = data.transpose()
###For taxonomy information
taxa_df = pd.read_csv('Taxa_keys.csv')
prot_metadata.best_hit_taxon_id = pd.to_numeric(prot_metadata['best_hit_taxon_id'], errors='coerce')
taxa_df.best_hit_taxon_id = pd.to_numeric(taxa_df['best_hit_taxon_id'], errors='coerce')
prot_metadata_taxa = pd.merge(prot_metadata, taxa_df, how='left')
prot_metadata_taxa.index = prot_metadata.index
prot_metadata = prot_metadata_taxa
# get ec group from ec column using substring
ec_group = prot_metadata['EC'].dropna().str.slice(0,1).astype(int)
unique_ec_groups = sorted(ec_group.unique())
# extract counts section of spreadsheet
all_counts = df[df.columns[11:]][:103].transpose()
all_counts = all_counts.dropna().astype(int)
# stations are in that column of cruise_metadata
stations = cruise_metadata.Station.unique().astype(int)
init_station = stations[0]
init_pctile = 99.25
init_ec_group = 'ALL'
# the following depends on the fact that columns in counts are numbered the same as rows in cruise_metadata
def select_data(station=init_station, percentile=init_pctile, selected_ec_group=init_ec_group):
# depth are the real depths of this station
z = cruise_metadata[cruise_metadata.Station==station]['Real Depth']
# counts for this station are the slice of columns corresponding
# to the associated columns in the cruise_metadata df
station_counts = all_counts[list(z.index)]
if selected_ec_group != 'ALL':
station_ec_group = ec_group[station_counts.index].dropna().astype(int)
selected_prots = station_ec_group[station_ec_group == int(selected_ec_group)]
station_counts = station_counts.loc[selected_prots.index]
# filter all proteins above certain count sum threshold
s = station_counts.sum(axis=1)
args = s.sort_values(ascending=False).index
station_counts = station_counts.loc[args]
station_counts = station_counts[s > s.quantile(percentile/100.)]
return z, station_counts
def bin_taxa(station_counts):
station_sums = station_counts.sum(axis=1)
select_group = prot_metadata_taxa.SelectGroup
sum_group = pd.DataFrame({
'spec_sum': station_sums,
'select_group': select_group.fillna('Unknown'),
})
return sum_group.groupby(sum_group.select_group).sum()
def get_profile_multiline(z, station_counts):
# construct the multiline
zs, cs = [], []
# add annotation and taxon for hover
ann, tax = [], []
sc_trunc = station_counts[:MAX_PROFILES]
for prot in sc_trunc.index:
cs.append(list(sc_trunc.loc[prot]))
zs.append(list(z))
ann.append(prot_metadata.best_hit_annotation[prot])
tax.append(prot_metadata.best_hit_species[prot])
return dict(zs=zs, cs=cs, ann=ann, tax=tax)
# select initial data
z, station_counts = select_data(init_station, init_pctile, init_ec_group)
# turn that into a cds
cds = ColumnDataSource(get_profile_multiline(z, station_counts))
def compute_axis_ranges(z, station_counts):
# compute plot axis ranges
max_z, min_z = z.max(), z.min()
min_c, max_c = 0, station_counts.max().max()
return (max_z, min_z), (min_c, max_c)
(max_z, min_z), (min_c, max_c) = compute_axis_ranges(z, station_counts)
TITLE = 'Put title here'
X_AXIS_LABEL = 'count'
Y_AXIS_LABEL = 'depth (m)'
# figure with multiline
p = figure(plot_width=400, plot_height=400, x_range=(min_c,max_c), y_range=(max_z,min_z),
x_axis_label=X_AXIS_LABEL, y_axis_label=Y_AXIS_LABEL, title=TITLE)
p.multi_line(source=cds, xs='cs', ys='zs', line_width=2, line_alpha=0.5,
hover_line_alpha=1, hover_line_color='red')
# station selector
options = list(stations.astype(str))
st_select = Select(title="Station", options=options, value=options[0])
options = ['ALL'] + [str(e) for e in unique_ec_groups]
ec_select = Select(title='EC group', options=options, value=init_ec_group)
hover = HoverTool(tooltips=[
('depth', '$y m'),
('count', '$x'),
('species', '@tax[$index]'),
('annotation', '@ann[$index]'),
])
p.add_tools(hover)
# percentile slider
percentile = Slider(title='Percentile', value=init_pctile, start=90, end=99.9, step=0.05)
# select data based on values of selected station and
# percentile slider
def update_selected(attr, old, new):
station = int(st_select.value)
pct = percentile.value
selected_ec_group = ec_select.value
z, station_counts = select_data(station, pct, selected_ec_group)
cds_dict = get_profile_multiline(z, station_counts)
(max_z, min_z), (min_c, max_c) = compute_axis_ranges(z, station_counts)
# adjust plot axis ranges
p.y_range.start = max_z
p.y_range.end = min_z
p.x_range.start = min_c
p.x_range.end = max_c
# change the data
cds.data = cds_dict
# attach callbacks to widgets
st_select.on_change('value', update_selected)
percentile.on_change('value', update_selected)
ec_select.on_change('value', update_selected)
# layout and output
layout = row(widgetbox(st_select,ec_select,percentile),p)
curdoc().add_root(layout)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment