Last active
October 27, 2019 23:55
-
-
Save fedarko/06d432ca873355046738822692937095 to your computer and use it in GitHub Desktop.
Search taxonomies of negative controls
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3 | |
""" | |
This is a small script that looks through the annotated taxonomies of all | |
features present in a dataset's negative control samples. It's handy for | |
checking that certain features are (for the most part) absent from these | |
samples. | |
This obviously isn't a very formal way of accounting for contamination, | |
but it is useful for quickly verifying that certain taxa are probably not | |
the product of contamination. (Better approaches include e.g. the decontam | |
package, the KatharoSeq protocol, ...) | |
I originally made this for an analysis located at | |
https://github.com/knightlab-analyses/qurro-mackerel-analysis, so a lot of | |
formatting things here assume formatting in the metadata, | |
directory structure, etc. that matches that analysis' data. | |
""" | |
import sys | |
import os | |
import biom | |
import pandas as pd | |
from qiime2 import Artifact, Metadata | |
from qurro._df_utils import biom_table_to_sparse_df | |
OUT_DIR = "AnalysisOutput" | |
TABLE_QZA_FILEPATH = os.path.join(OUT_DIR, "table-unfiltered.qza") | |
TAXONOMY_QZA_FILEPATH = os.path.join(OUT_DIR, "taxonomy.qza") | |
METADATA_FILEPATH = os.path.join(OUT_DIR, "metadata.tsv") | |
try: | |
tax_query = sys.argv[1] | |
except IndexError: | |
raise ValueError( | |
"You need to specify some text to use for searching taxonomy!" | |
) | |
# Load the table | |
tbl_qza = Artifact.load(TABLE_QZA_FILEPATH) | |
tbl = tbl_qza.view(biom.Table) | |
# Load the taxonomy | |
tax_qza = Artifact.load(TAXONOMY_QZA_FILEPATH) | |
tax = tax_qza.view(pd.DataFrame) | |
# Load the sample metadata | |
md = Metadata.load(METADATA_FILEPATH).to_dataframe() | |
# Identify negative control samples: that is, all samples where | |
# empo_1 == "control" and empo_2 == "negative". | |
c_sample_md = md[md["empo_1"] == "control"] | |
n_c_samples = c_sample_md[c_sample_md["empo_2"] == "negative"].index | |
# Identify all features whose taxonomy annotations contain tax_query | |
query_features = tax[ | |
tax["Taxon"].str.lower().str.find(tax_query.lower()) != -1 | |
].index | |
matching_feature_ct = len(query_features) | |
# Yeah, I know this will say "1 features'" if only 1 feature matched, but I | |
# don't think it's worth making the code a whole lot uglier to address that | |
print( | |
'Identified "{}" in {} features\' taxonomy annotations.'.format( | |
tax_query, matching_feature_ct | |
) | |
) | |
# If no features matched the query, no reason to go any further down | |
if matching_feature_ct == 0: | |
quit() | |
# Make a copy of the table filtered to just the features matching tax_query | |
tbl_query = tbl.filter(query_features, axis="observation", inplace=False) | |
# Load this filtered table as a pandas SparseDataFrame. | |
df_tbl_query = biom_table_to_sparse_df(tbl_query, min_row_ct=1, min_col_ct=1) | |
# Find all samples containing at least one count of a matching feature | |
samples_with_query = df_tbl_query.columns[df_tbl_query.sum(axis="index") > 0] | |
print( | |
"Identified {} samples containing at least one of those {} " | |
"features.".format(len(samples_with_query), matching_feature_ct) | |
) | |
# ...Filter this to just *negative control* samples | |
n_c_samples_with_query = set(samples_with_query) & set(n_c_samples) | |
print( | |
"Of the {} negative control samples, {} contain(s) at least one " | |
'"{}"-matching feature.'.format( | |
len(n_c_samples), len(n_c_samples_with_query), tax_query | |
) | |
) | |
if len(n_c_samples_with_query) > 0: | |
print( | |
'Negative control samples containing at least one "{}"-matching feature:'.format( | |
tax_query | |
) | |
) | |
ncswq_list = list(n_c_samples_with_query) | |
for i in range(len(ncswq_list)): | |
print("{}) {}".format(i + 1, ncswq_list[i])) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment