Skip to content

Instantly share code, notes, and snippets.

@fedarko
Last active October 27, 2019 23:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fedarko/06d432ca873355046738822692937095 to your computer and use it in GitHub Desktop.
Save fedarko/06d432ca873355046738822692937095 to your computer and use it in GitHub Desktop.
Search taxonomies of negative controls
#! /usr/bin/env python3
"""
This is a small script that looks through the annotated taxonomies of all
features present in a dataset's negative control samples. It's handy for
checking that certain features are (for the most part) absent from these
samples.
This obviously isn't a very formal way of accounting for contamination,
but it is useful for quickly verifying that certain taxa are probably not
the product of contamination. (Better approaches include e.g. the decontam
package, the KatharoSeq protocol, ...)
I originally made this for an analysis located at
https://github.com/knightlab-analyses/qurro-mackerel-analysis, so a lot of
formatting things here assume formatting in the metadata,
directory structure, etc. that matches that analysis' data.
"""
import sys
import os
import biom
import pandas as pd
from qiime2 import Artifact, Metadata
from qurro._df_utils import biom_table_to_sparse_df
OUT_DIR = "AnalysisOutput"
TABLE_QZA_FILEPATH = os.path.join(OUT_DIR, "table-unfiltered.qza")
TAXONOMY_QZA_FILEPATH = os.path.join(OUT_DIR, "taxonomy.qza")
METADATA_FILEPATH = os.path.join(OUT_DIR, "metadata.tsv")
try:
tax_query = sys.argv[1]
except IndexError:
raise ValueError(
"You need to specify some text to use for searching taxonomy!"
)
# Load the table
tbl_qza = Artifact.load(TABLE_QZA_FILEPATH)
tbl = tbl_qza.view(biom.Table)
# Load the taxonomy
tax_qza = Artifact.load(TAXONOMY_QZA_FILEPATH)
tax = tax_qza.view(pd.DataFrame)
# Load the sample metadata
md = Metadata.load(METADATA_FILEPATH).to_dataframe()
# Identify negative control samples: that is, all samples where
# empo_1 == "control" and empo_2 == "negative".
c_sample_md = md[md["empo_1"] == "control"]
n_c_samples = c_sample_md[c_sample_md["empo_2"] == "negative"].index
# Identify all features whose taxonomy annotations contain tax_query
query_features = tax[
tax["Taxon"].str.lower().str.find(tax_query.lower()) != -1
].index
matching_feature_ct = len(query_features)
# Yeah, I know this will say "1 features'" if only 1 feature matched, but I
# don't think it's worth making the code a whole lot uglier to address that
print(
'Identified "{}" in {} features\' taxonomy annotations.'.format(
tax_query, matching_feature_ct
)
)
# If no features matched the query, no reason to go any further down
if matching_feature_ct == 0:
quit()
# Make a copy of the table filtered to just the features matching tax_query
tbl_query = tbl.filter(query_features, axis="observation", inplace=False)
# Load this filtered table as a pandas SparseDataFrame.
df_tbl_query = biom_table_to_sparse_df(tbl_query, min_row_ct=1, min_col_ct=1)
# Find all samples containing at least one count of a matching feature
samples_with_query = df_tbl_query.columns[df_tbl_query.sum(axis="index") > 0]
print(
"Identified {} samples containing at least one of those {} "
"features.".format(len(samples_with_query), matching_feature_ct)
)
# ...Filter this to just *negative control* samples
n_c_samples_with_query = set(samples_with_query) & set(n_c_samples)
print(
"Of the {} negative control samples, {} contain(s) at least one "
'"{}"-matching feature.'.format(
len(n_c_samples), len(n_c_samples_with_query), tax_query
)
)
if len(n_c_samples_with_query) > 0:
print(
'Negative control samples containing at least one "{}"-matching feature:'.format(
tax_query
)
)
ncswq_list = list(n_c_samples_with_query)
for i in range(len(ncswq_list)):
print("{}) {}".format(i + 1, ncswq_list[i]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment