Last active
September 30, 2017 14:43
-
-
Save joefutrelle/c4c84c1b5cdda585bc1b611642b26142 to your computer and use it in GitHub Desktop.
bokeh plotting of proteomics data (checkpointing)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
import pandas as pd | |
import numpy as np | |
from bokeh.plotting import figure | |
from bokeh.models import ColumnDataSource | |
from bokeh.io import curdoc | |
from bokeh.layouts import row, widgetbox | |
from bokeh.models.widgets import Select, Slider, TextInput | |
from bokeh.models import HoverTool, CustomJS | |
import os | |
# at the top of your file | |
import logging | |
logger = logging.getLogger(__name__) | |
MAX_PROFILES = 200 | |
# proteomz_v0-1 needs to be transposed for this | |
df = pd.read_csv('proteomz_V0-3_stnmetadata.csv',low_memory=False) | |
# extract metadata section of spreadsheet containing station and depth information | |
cruise_metadata = df[df.columns[:11]][:103] | |
# extract metadata section of spreadsheet containing prot id information | |
data = df[103:] | |
data.index=data.pop('ID') | |
for col in data.columns[:10]: | |
data.pop(col) | |
prot_metadata = data.transpose() | |
###For taxonomy information | |
taxa_df = pd.read_csv('Taxa_keys.csv') | |
prot_metadata.best_hit_taxon_id = pd.to_numeric(prot_metadata['best_hit_taxon_id'], errors='coerce') | |
taxa_df.best_hit_taxon_id = pd.to_numeric(taxa_df['best_hit_taxon_id'], errors='coerce') | |
prot_metadata_taxa = pd.merge(prot_metadata, taxa_df, how='left') | |
prot_metadata_taxa.index = prot_metadata.index | |
prot_metadata = prot_metadata_taxa | |
# get ec group from ec column using substring | |
ec_group = prot_metadata['EC'].dropna().str.slice(0,1).astype(int) | |
unique_ec_groups = sorted(ec_group.unique()) | |
# extract counts section of spreadsheet | |
all_counts = df[df.columns[11:]][:103].transpose() | |
all_counts = all_counts.dropna().astype(int) | |
# stations are in that column of cruise_metadata | |
stations = cruise_metadata.Station.unique().astype(int) | |
init_station = stations[0] | |
init_pctile = 99.25 | |
init_ec_group = 'ALL' | |
# the following depends on the fact that columns in counts are numbered the same as rows in cruise_metadata | |
def select_data(station=init_station, percentile=init_pctile, selected_ec_group=init_ec_group): | |
# depth are the real depths of this station | |
z = cruise_metadata[cruise_metadata.Station==station]['Real Depth'] | |
# counts for this station are the slice of columns corresponding | |
# to the associated columns in the cruise_metadata df | |
station_counts = all_counts[list(z.index)] | |
if selected_ec_group != 'ALL': | |
station_ec_group = ec_group[station_counts.index].dropna().astype(int) | |
selected_prots = station_ec_group[station_ec_group == int(selected_ec_group)] | |
station_counts = station_counts.loc[selected_prots.index] | |
# filter all proteins above certain count sum threshold | |
s = station_counts.sum(axis=1) | |
args = s.sort_values(ascending=False).index | |
station_counts = station_counts.loc[args] | |
station_counts = station_counts[s > s.quantile(percentile/100.)] | |
return z, station_counts | |
def bin_taxa(station_counts): | |
station_sums = station_counts.sum(axis=1) | |
select_group = prot_metadata_taxa.SelectGroup | |
sum_group = pd.DataFrame({ | |
'spec_sum': station_sums, | |
'select_group': select_group.fillna('Unknown'), | |
}) | |
return sum_group.groupby(sum_group.select_group).sum() | |
def get_profile_multiline(z, station_counts): | |
# construct the multiline | |
zs, cs = [], [] | |
# add annotation and taxon for hover | |
ann, tax = [], [] | |
sc_trunc = station_counts[:MAX_PROFILES] | |
for prot in sc_trunc.index: | |
cs.append(list(sc_trunc.loc[prot])) | |
zs.append(list(z)) | |
ann.append(prot_metadata.best_hit_annotation[prot]) | |
tax.append(prot_metadata.best_hit_species[prot]) | |
return dict(zs=zs, cs=cs, ann=ann, tax=tax) | |
# select initial data | |
z, station_counts = select_data(init_station, init_pctile, init_ec_group) | |
# turn that into a cds | |
cds = ColumnDataSource(get_profile_multiline(z, station_counts)) | |
def compute_axis_ranges(z, station_counts): | |
# compute plot axis ranges | |
max_z, min_z = z.max(), z.min() | |
min_c, max_c = 0, station_counts.max().max() | |
return (max_z, min_z), (min_c, max_c) | |
(max_z, min_z), (min_c, max_c) = compute_axis_ranges(z, station_counts) | |
TITLE = 'Put title here' | |
X_AXIS_LABEL = 'count' | |
Y_AXIS_LABEL = 'depth (m)' | |
# figure with multiline | |
p = figure(plot_width=400, plot_height=400, x_range=(min_c,max_c), y_range=(max_z,min_z), | |
x_axis_label=X_AXIS_LABEL, y_axis_label=Y_AXIS_LABEL, title=TITLE) | |
p.multi_line(source=cds, xs='cs', ys='zs', line_width=2, line_alpha=0.5, | |
hover_line_alpha=1, hover_line_color='red') | |
# station selector | |
options = list(stations.astype(str)) | |
st_select = Select(title="Station", options=options, value=options[0]) | |
options = ['ALL'] + [str(e) for e in unique_ec_groups] | |
ec_select = Select(title='EC group', options=options, value=init_ec_group) | |
hover = HoverTool(tooltips=[ | |
('depth', '$y m'), | |
('count', '$x'), | |
('species', '@tax[$index]'), | |
('annotation', '@ann[$index]'), | |
]) | |
p.add_tools(hover) | |
# percentile slider | |
percentile = Slider(title='Percentile', value=init_pctile, start=90, end=99.9, step=0.05) | |
# select data based on values of selected station and | |
# percentile slider | |
def update_selected(attr, old, new): | |
station = int(st_select.value) | |
pct = percentile.value | |
selected_ec_group = ec_select.value | |
z, station_counts = select_data(station, pct, selected_ec_group) | |
cds_dict = get_profile_multiline(z, station_counts) | |
(max_z, min_z), (min_c, max_c) = compute_axis_ranges(z, station_counts) | |
# adjust plot axis ranges | |
p.y_range.start = max_z | |
p.y_range.end = min_z | |
p.x_range.start = min_c | |
p.x_range.end = max_c | |
# change the data | |
cds.data = cds_dict | |
# attach callbacks to widgets | |
st_select.on_change('value', update_selected) | |
percentile.on_change('value', update_selected) | |
ec_select.on_change('value', update_selected) | |
# layout and output | |
layout = row(widgetbox(st_select,ec_select,percentile),p) | |
curdoc().add_root(layout) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment