ezralanglois/plot_q30.py

## plot_q30.py
# Tested with
#  - Anaconda Python 3.7
#  - Illumina InterOp 1.1.12

import interop.py_interop_plot as interop_plot
import interop.py_interop_run as interop_run
import interop.py_interop_run_metrics as interop_run_metrics
import pandas as pd


run_folder = ""
metric = "Q30Percent"
# List of metrics can be found with
#   metrics = interop_run.string_vector()
#   interop_run.list_metric_type(metrics)
#   [metrics[m] for m in range(metrics.size())]
#
# However only a subset can be plotted by cycle; this can determined with the following
#
#   feature = interop_run.parse_metric_feature_type('CycleFeature')
#   metrics = interop_run.string_vector()
#   interop_run.list_metric_type(metrics)
#   [metrics[m] for m in range(metrics.size()) if (interop_run_metrics.to_feature(interop_run.parse_metric_type(metrics[m])) & feature) == feature ]
#
# List can be found here:
#   https://github.com/Illumina/interop/blob/master/interop/constants/enums.h#L41
#
# Caveat 1: not all metrics are available on every platform. You will get NaN if the metric is not available.
# For example, Signal To Noise is only available on MiSeq
#
# Caveat 2: Certain plots are box plots/candle stick plots. Any plot that returns multiple curves is an average, so only
# x/y are defined; e.g. Intensity returns a plot per channel. The interface for these plots is the same.

# Load only the metrics necessary to plot % >= Q30
metric = interop_run.parse_metric_type(metric)
valid_to_load = interop_run.uchar_vector(interop_run.MetricCount, 0)
interop_run_metrics.list_metrics_to_load_by_type(metric, valid_to_load)
run_metrics = interop_run_metrics.run_metrics()
run_metrics.read(run_folder, valid_to_load)

# Plot everything
options = interop_plot.filter_options(run_metrics.run_info().flowcell().naming_method())
plot_data = interop_plot.candle_stick_plot_data()
interop_plot.plot_by_cycle(run_metrics, metric, options, plot_data)


if plot_data.size() == 1:
    # Convert candlestick (kind of box plot) to a pandas DataFrame
    # https://en.wikipedia.org/wiki/Box_plot
    # P25 = 25th percentile
    # P50 = median
    # P75 = 75th percentile
    # Lower Whisker = lowest data point excluding any outliers or p25 - 1.5 * (p75-p25)
    # Upper Whisker = highest data point excluding any outliers or p75 + 1.5 * (p75-p25)
    columns = ["x", "lower", "p25", "p50", "p75", "upper"]
    data = []
    for j in range(plot_data.at(0).size()):
        data_point = plot_data.at(0).at(j)
        data.append([data_point.x()
                        , data_point.lower()
                        , data_point.p25()
                        , data_point.p50()
                        , data_point.p75()
                        , data_point.upper()])
elif plot_data.size() > 1:
    # Convert several x/y curves where the tiles are averaged to a pandas DataFrame
    columns = ["x", "y", "label"]
    data = []
    for i in range(plot_data.size()):
        title = plot_data.at(i).title()
        for j in range(plot_data.at(i).size()):
            data_point = plot_data.at(i).at(j)
            data.append([data_point.x(), data_point.y(), title])
df = pd.DataFrame(data=data, columns=columns)
	# Tested with
	# - Anaconda Python 3.7
	# - Illumina InterOp 1.1.12

	import interop.py_interop_plot as interop_plot
	import interop.py_interop_run as interop_run
	import interop.py_interop_run_metrics as interop_run_metrics
	import pandas as pd


	run_folder = ""
	metric = "Q30Percent"
	# List of metrics can be found with
	# metrics = interop_run.string_vector()
	# interop_run.list_metric_type(metrics)
	# [metrics[m] for m in range(metrics.size())]
	#
	# However only a subset can be plotted by cycle; this can determined with the following
	#
	# feature = interop_run.parse_metric_feature_type('CycleFeature')
	# metrics = interop_run.string_vector()
	# interop_run.list_metric_type(metrics)
	# [metrics[m] for m in range(metrics.size()) if (interop_run_metrics.to_feature(interop_run.parse_metric_type(metrics[m])) & feature) == feature ]
	#
	# List can be found here:
	# https://github.com/Illumina/interop/blob/master/interop/constants/enums.h#L41
	#
	# Caveat 1: not all metrics are available on every platform. You will get NaN if the metric is not available.
	# For example, Signal To Noise is only available on MiSeq
	#
	# Caveat 2: Certain plots are box plots/candle stick plots. Any plot that returns multiple curves is an average, so only
	# x/y are defined; e.g. Intensity returns a plot per channel. The interface for these plots is the same.

	# Load only the metrics necessary to plot % >= Q30
	metric = interop_run.parse_metric_type(metric)
	valid_to_load = interop_run.uchar_vector(interop_run.MetricCount, 0)
	interop_run_metrics.list_metrics_to_load_by_type(metric, valid_to_load)
	run_metrics = interop_run_metrics.run_metrics()
	run_metrics.read(run_folder, valid_to_load)

	# Plot everything
	options = interop_plot.filter_options(run_metrics.run_info().flowcell().naming_method())
	plot_data = interop_plot.candle_stick_plot_data()
	interop_plot.plot_by_cycle(run_metrics, metric, options, plot_data)


	if plot_data.size() == 1:
	# Convert candlestick (kind of box plot) to a pandas DataFrame
	# https://en.wikipedia.org/wiki/Box_plot
	# P25 = 25th percentile
	# P50 = median
	# P75 = 75th percentile
	# Lower Whisker = lowest data point excluding any outliers or p25 - 1.5 * (p75-p25)
	# Upper Whisker = highest data point excluding any outliers or p75 + 1.5 * (p75-p25)
	columns = ["x", "lower", "p25", "p50", "p75", "upper"]
	data = []
	for j in range(plot_data.at(0).size()):
	data_point = plot_data.at(0).at(j)
	data.append([data_point.x()
	, data_point.lower()
	, data_point.p25()
	, data_point.p50()
	, data_point.p75()
	, data_point.upper()])
	elif plot_data.size() > 1:
	# Convert several x/y curves where the tiles are averaged to a pandas DataFrame
	columns = ["x", "y", "label"]
	data = []
	for i in range(plot_data.size()):
	title = plot_data.at(i).title()
	for j in range(plot_data.at(i).size()):
	data_point = plot_data.at(i).at(j)
	data.append([data_point.x(), data_point.y(), title])
	df = pd.DataFrame(data=data, columns=columns)