joefutrelle/proteomz.py

## proteomz.py
# coding: utf-8

import pandas as pd
import numpy as np
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.io import curdoc
from bokeh.layouts import row, widgetbox
from bokeh.models.widgets import Select, Slider, TextInput
from bokeh.models import HoverTool, CustomJS
import os

# at the top of your file
import logging
logger = logging.getLogger(__name__)

MAX_PROFILES = 200

# proteomz_v0-1 needs to be transposed for this
df = pd.read_csv('proteomz_V0-3_stnmetadata.csv',low_memory=False)

# extract metadata section of spreadsheet containing station and depth information
cruise_metadata = df[df.columns[:11]][:103]

# extract metadata section of spreadsheet containing prot id information
data = df[103:]
data.index=data.pop('ID')
for col in data.columns[:10]:
    data.pop(col)
prot_metadata = data.transpose()

###For taxonomy information
taxa_df = pd.read_csv('Taxa_keys.csv')
prot_metadata.best_hit_taxon_id = pd.to_numeric(prot_metadata['best_hit_taxon_id'], errors='coerce')
taxa_df.best_hit_taxon_id = pd.to_numeric(taxa_df['best_hit_taxon_id'], errors='coerce')
prot_metadata_taxa = pd.merge(prot_metadata, taxa_df, how='left')
prot_metadata_taxa.index = prot_metadata.index
prot_metadata = prot_metadata_taxa

# get ec group from ec column using substring
ec_group = prot_metadata['EC'].dropna().str.slice(0,1).astype(int)
unique_ec_groups = sorted(ec_group.unique())

# extract counts section of spreadsheet
all_counts = df[df.columns[11:]][:103].transpose()
all_counts = all_counts.dropna().astype(int)

# stations are in that column of cruise_metadata
stations = cruise_metadata.Station.unique().astype(int)

init_station = stations[0]
init_pctile = 99.25
init_ec_group = 'ALL'

# the following depends on the fact that columns in counts are numbered the same as rows in cruise_metadata

def select_data(station=init_station, percentile=init_pctile, selected_ec_group=init_ec_group):
    # depth are the real depths of this station
    z = cruise_metadata[cruise_metadata.Station==station]['Real Depth']

    # counts for this station are the slice of columns corresponding
    # to the associated columns in the cruise_metadata df
    station_counts = all_counts[list(z.index)]

    if selected_ec_group != 'ALL':
        station_ec_group = ec_group[station_counts.index].dropna().astype(int)
        selected_prots = station_ec_group[station_ec_group == int(selected_ec_group)]
        station_counts = station_counts.loc[selected_prots.index]

    # filter all proteins above certain count sum threshold
    s = station_counts.sum(axis=1)
    args = s.sort_values(ascending=False).index
    station_counts = station_counts.loc[args]
    station_counts = station_counts[s > s.quantile(percentile/100.)]

    return z, station_counts

def bin_taxa(station_counts):
    station_sums = station_counts.sum(axis=1)
    select_group = prot_metadata_taxa.SelectGroup
    sum_group = pd.DataFrame({
        'spec_sum': station_sums,
        'select_group': select_group.fillna('Unknown'),
    })
    return sum_group.groupby(sum_group.select_group).sum()

def get_profile_multiline(z, station_counts):
    # construct the multiline
    zs, cs = [], []

    # add annotation and taxon for hover
    ann, tax = [], []

    sc_trunc = station_counts[:MAX_PROFILES]

    for prot in sc_trunc.index:
        cs.append(list(sc_trunc.loc[prot]))
        zs.append(list(z))
        ann.append(prot_metadata.best_hit_annotation[prot])
        tax.append(prot_metadata.best_hit_species[prot])

    return dict(zs=zs, cs=cs, ann=ann, tax=tax)

# select initial data
z, station_counts = select_data(init_station, init_pctile, init_ec_group)
# turn that into a cds
cds = ColumnDataSource(get_profile_multiline(z, station_counts))

def compute_axis_ranges(z, station_counts):
    # compute plot axis ranges
    max_z, min_z = z.max(), z.min()
    min_c, max_c = 0, station_counts.max().max()
    return (max_z, min_z), (min_c, max_c)

(max_z, min_z), (min_c, max_c) = compute_axis_ranges(z, station_counts)

TITLE = 'Put title here'
X_AXIS_LABEL = 'count'
Y_AXIS_LABEL = 'depth (m)'

# figure with multiline
p = figure(plot_width=400, plot_height=400, x_range=(min_c,max_c), y_range=(max_z,min_z),
    x_axis_label=X_AXIS_LABEL, y_axis_label=Y_AXIS_LABEL, title=TITLE)

p.multi_line(source=cds, xs='cs', ys='zs', line_width=2, line_alpha=0.5,
    hover_line_alpha=1, hover_line_color='red')

# station selector
options = list(stations.astype(str))
st_select = Select(title="Station", options=options, value=options[0])

options = ['ALL'] + [str(e) for e in unique_ec_groups]
ec_select = Select(title='EC group', options=options, value=init_ec_group)

hover = HoverTool(tooltips=[
        ('depth', '$y m'),
        ('count', '$x'),
        ('species', '@tax[$index]'),
        ('annotation', '@ann[$index]'),
    ])

p.add_tools(hover)

# percentile slider
percentile = Slider(title='Percentile', value=init_pctile, start=90, end=99.9, step=0.05)

# select data based on values of selected station and
# percentile slider
def update_selected(attr, old, new):
    station = int(st_select.value)
    pct = percentile.value
    selected_ec_group = ec_select.value

    z, station_counts = select_data(station, pct, selected_ec_group)
    cds_dict = get_profile_multiline(z, station_counts)

    (max_z, min_z), (min_c, max_c) = compute_axis_ranges(z, station_counts)
    # adjust plot axis ranges
    p.y_range.start = max_z
    p.y_range.end = min_z
    p.x_range.start = min_c
    p.x_range.end = max_c
    # change the data
    cds.data = cds_dict

# attach callbacks to widgets
st_select.on_change('value', update_selected)
percentile.on_change('value', update_selected)
ec_select.on_change('value', update_selected)

# layout and output
layout = row(widgetbox(st_select,ec_select,percentile),p)

curdoc().add_root(layout)
	# coding: utf-8

	import pandas as pd
	import numpy as np
	from bokeh.plotting import figure
	from bokeh.models import ColumnDataSource
	from bokeh.io import curdoc
	from bokeh.layouts import row, widgetbox
	from bokeh.models.widgets import Select, Slider, TextInput
	from bokeh.models import HoverTool, CustomJS
	import os

	# at the top of your file
	import logging
	logger = logging.getLogger(__name__)

	MAX_PROFILES = 200

	# proteomz_v0-1 needs to be transposed for this
	df = pd.read_csv('proteomz_V0-3_stnmetadata.csv',low_memory=False)

	# extract metadata section of spreadsheet containing station and depth information
	cruise_metadata = df[df.columns[:11]][:103]

	# extract metadata section of spreadsheet containing prot id information
	data = df[103:]
	data.index=data.pop('ID')
	for col in data.columns[:10]:
	data.pop(col)
	prot_metadata = data.transpose()

	###For taxonomy information
	taxa_df = pd.read_csv('Taxa_keys.csv')
	prot_metadata.best_hit_taxon_id = pd.to_numeric(prot_metadata['best_hit_taxon_id'], errors='coerce')
	taxa_df.best_hit_taxon_id = pd.to_numeric(taxa_df['best_hit_taxon_id'], errors='coerce')
	prot_metadata_taxa = pd.merge(prot_metadata, taxa_df, how='left')
	prot_metadata_taxa.index = prot_metadata.index
	prot_metadata = prot_metadata_taxa

	# get ec group from ec column using substring
	ec_group = prot_metadata['EC'].dropna().str.slice(0,1).astype(int)
	unique_ec_groups = sorted(ec_group.unique())

	# extract counts section of spreadsheet
	all_counts = df[df.columns[11:]][:103].transpose()
	all_counts = all_counts.dropna().astype(int)

	# stations are in that column of cruise_metadata
	stations = cruise_metadata.Station.unique().astype(int)

	init_station = stations[0]
	init_pctile = 99.25
	init_ec_group = 'ALL'

	# the following depends on the fact that columns in counts are numbered the same as rows in cruise_metadata

	def select_data(station=init_station, percentile=init_pctile, selected_ec_group=init_ec_group):
	# depth are the real depths of this station
	z = cruise_metadata[cruise_metadata.Station==station]['Real Depth']

	# counts for this station are the slice of columns corresponding
	# to the associated columns in the cruise_metadata df
	station_counts = all_counts[list(z.index)]

	if selected_ec_group != 'ALL':
	station_ec_group = ec_group[station_counts.index].dropna().astype(int)
	selected_prots = station_ec_group[station_ec_group == int(selected_ec_group)]
	station_counts = station_counts.loc[selected_prots.index]

	# filter all proteins above certain count sum threshold
	s = station_counts.sum(axis=1)
	args = s.sort_values(ascending=False).index
	station_counts = station_counts.loc[args]
	station_counts = station_counts[s > s.quantile(percentile/100.)]

	return z, station_counts

	def bin_taxa(station_counts):
	station_sums = station_counts.sum(axis=1)
	select_group = prot_metadata_taxa.SelectGroup
	sum_group = pd.DataFrame({
	'spec_sum': station_sums,
	'select_group': select_group.fillna('Unknown'),
	})
	return sum_group.groupby(sum_group.select_group).sum()

	def get_profile_multiline(z, station_counts):
	# construct the multiline
	zs, cs = [], []

	# add annotation and taxon for hover
	ann, tax = [], []

	sc_trunc = station_counts[:MAX_PROFILES]

	for prot in sc_trunc.index:
	cs.append(list(sc_trunc.loc[prot]))
	zs.append(list(z))
	ann.append(prot_metadata.best_hit_annotation[prot])
	tax.append(prot_metadata.best_hit_species[prot])

	return dict(zs=zs, cs=cs, ann=ann, tax=tax)

	# select initial data
	z, station_counts = select_data(init_station, init_pctile, init_ec_group)
	# turn that into a cds
	cds = ColumnDataSource(get_profile_multiline(z, station_counts))

	def compute_axis_ranges(z, station_counts):
	# compute plot axis ranges
	max_z, min_z = z.max(), z.min()
	min_c, max_c = 0, station_counts.max().max()
	return (max_z, min_z), (min_c, max_c)

	(max_z, min_z), (min_c, max_c) = compute_axis_ranges(z, station_counts)

	TITLE = 'Put title here'
	X_AXIS_LABEL = 'count'
	Y_AXIS_LABEL = 'depth (m)'

	# figure with multiline
	p = figure(plot_width=400, plot_height=400, x_range=(min_c,max_c), y_range=(max_z,min_z),
	x_axis_label=X_AXIS_LABEL, y_axis_label=Y_AXIS_LABEL, title=TITLE)

	p.multi_line(source=cds, xs='cs', ys='zs', line_width=2, line_alpha=0.5,
	hover_line_alpha=1, hover_line_color='red')

	# station selector
	options = list(stations.astype(str))
	st_select = Select(title="Station", options=options, value=options[0])

	options = ['ALL'] + [str(e) for e in unique_ec_groups]
	ec_select = Select(title='EC group', options=options, value=init_ec_group)

	hover = HoverTool(tooltips=[
	('depth', '$y m'),
	('count', '$x'),
	('species', '@tax[$index]'),
	('annotation', '@ann[$index]'),
	])

	p.add_tools(hover)

	# percentile slider
	percentile = Slider(title='Percentile', value=init_pctile, start=90, end=99.9, step=0.05)

	# select data based on values of selected station and
	# percentile slider
	def update_selected(attr, old, new):
	station = int(st_select.value)
	pct = percentile.value
	selected_ec_group = ec_select.value

	z, station_counts = select_data(station, pct, selected_ec_group)
	cds_dict = get_profile_multiline(z, station_counts)

	(max_z, min_z), (min_c, max_c) = compute_axis_ranges(z, station_counts)
	# adjust plot axis ranges
	p.y_range.start = max_z
	p.y_range.end = min_z
	p.x_range.start = min_c
	p.x_range.end = max_c
	# change the data
	cds.data = cds_dict

	# attach callbacks to widgets
	st_select.on_change('value', update_selected)
	percentile.on_change('value', update_selected)
	ec_select.on_change('value', update_selected)

	# layout and output
	layout = row(widgetbox(st_select,ec_select,percentile),p)

	curdoc().add_root(layout)