@fedarko
Last active December 16, 2019 22:12
Script that reports on duplicate IDs in a plate map spreadsheet (and modifies certain duplicate IDs, in one very specific case); it also attempts to update Qiita prep files accordingly. Warning: this code is untested and fairly rough.
#! /usr/bin/env python3
import os
from collections import Counter
from math import ceil
import re
from numpy import argmax
import pandas as pd
from qiime2 import Metadata
# "Parameters" of this script
EXPECTED_PLATE_COUNT = 174
PLATE_COL_COUNT = 12
PLATE_ROW_NAMES = ["a", "b", "c", "d", "e", "f", "g", "h"]
# Qiita study ID goes here: used when updating sample IDs
QIITA_STUDY_ID = ""
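# (A rough sketch of the inputs/outputs this script assumes, based purely on
# how the code below uses them -- these are the hardcoded filenames, not a
# general API:
#
#   list-no-4055-4057-4065-4087.tsv   association list, indexed by Qiita
#                                     sample ID, with at least "sample_plate"
#                                     and "prep_id" columns
#   abtx-big.tsv                      the plate map spreadsheet
#   preps/                            prep files, with "_<prep ID>_" somewhere
#                                     in each filename
#   preps_automodified/               where modified preps get written
#
# Outputs are a handful of *.txt report files plus
# abtx_with_simple_2014_duplicates_fixed.tsv. QIITA_STUDY_ID should be the
# numeric study ID as a string, e.g. "12345" (hypothetical value).)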
def write_list(namelist, filename):
    with open(filename, "w") as fobj:
        fobj.write("\n".join(sorted(namelist)))
def modify_prep(prep_id, sample_id, new_sample_id):
    """Modifies a single index in a prep."""
    # Prefer an already-auto-modified copy of this prep, so that multiple
    # changes to the same prep accumulate instead of clobbering each other
    prep_dir = "preps_automodified"
    matching_preps = list(
        filter(
            lambda fn: str(prep_id) in fn,
            os.listdir("preps_automodified"),
        )
    )
    if len(matching_preps) == 0:
        prep_dir = "preps"
        matching_preps = list(
            filter(
                lambda fn: "_" + str(prep_id) + "_" in fn,
                os.listdir("preps"),
            )
        )
    if len(matching_preps) > 1:
        raise ValueError(
            "multiple possible preps for '{}': {}".format(
                prep_id, matching_preps
            )
        )
    prep_md = Metadata.load(
        os.path.join(prep_dir, matching_preps[0])
    ).to_dataframe()
    matching_indices = list(filter(lambda i: sample_id in i, prep_md.index))
    if len(matching_indices) == 0:
        print("Couldn't find {} in prep {}.".format(sample_id, prep_id))
    elif len(matching_indices) > 1:
        # Multiple instances of this sample ID in this prep.
        # We could resolve it automatically, but simplest for now to just
        # require manual resolution.
        print("MULTIPLE INSTANCES OF SAMPLE ID ON PREP???", sample_id)
    else:
        idx_list = list(prep_md.index)
        # The prep index may contain extra text around the sample ID itself,
        # so replace the sample ID within the matching index
        idx_list[idx_list.index(matching_indices[0])] = matching_indices[
            0
        ].replace(sample_id, new_sample_id)
        prep_md.index = idx_list
        prep_md.to_csv(
            "preps_automodified/" + matching_preps[0], sep="\t"
        )
        print("prep {}: {} -> {}".format(prep_id, sample_id, new_sample_id))
assoc_list = pd.read_csv(
    "list-no-4055-4057-4065-4087.tsv", sep="\t", index_col=0
)
# This is a spreadsheet of plate mappings. Each plate should have 8 rows, all
# labeled A through H.
sheet_df = pd.read_csv(
    "abtx-big.tsv", sep="\t", index_col=0, keep_default_na=False, dtype=str
)
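# As a hypothetical illustration of the layout assumed here, each plate in the
# spreadsheet is expected to look roughly like this (IDs made up):
#
#           1                2                  ...   12
#   A       12.10.14.ab.st   1.5.14.cd.st.0930  ...   BLANK
#   B       ...
#   ...
#   H       ...
#
# with possibly-empty rows between plates; anything whose row index isn't in
# the A-H range gets skipped below.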
# Remove extraneous columns
if len(sheet_df.columns) < PLATE_COL_COUNT:
    raise ValueError(
        "There are fewer than {} columns.".format(PLATE_COL_COUNT)
    )
# Drop all columns after the PLATE_COL_COUNT-th (remember that the index isn't
# treated as a column by pandas since we used index_col=0 above)
sheet_df_colsubset = sheet_df.drop(
    sheet_df.columns[PLATE_COL_COUNT:], axis="columns"
)
# We have the columns filtered, but we need to filter the rows (since there are
# empty rows in the sheet DataFrame between plate maps, etc.)
# The rows that actually 'belong' to plates will be stored in this DF:
plate_df = pd.DataFrame(columns=sheet_df_colsubset.columns)
weird_stuff = []
# Also, to make life easier, we'll store all cell text in an 'unrolled' list:
well_list = []
well_list_notimes_ifposs = []
# Filter to rows that are 'within' plates (I know this solution is slow, but
# this way we can maintain order between plates instead of using
# .loc[["A", "B", ...]] or something where order is lost because the indices
# are sorted).
# Use of iterrows() based on https://stackoverflow.com/a/16476974/10730311.
for index, row in sheet_df_colsubset.iterrows():
    # Only pay attention to rows that are putatively in a plate (i.e. their
    # indices are in the A-H range for normal 96-well plates)
    if index.strip().lower() in PLATE_ROW_NAMES:
        plate_df = plate_df.append(row, sort=False)
        for w in row:
            well_list.append(w)
            # ignore timestamp; just extract mm.dd.yy.hs.bs from the ID
            # (see the example right after this loop)
            match = re.match(r"(\d\d?\.\d\d?\.\d\d\.\w\w\.\w\w?).*", w)
            if match:
                well_list_notimes_ifposs.append(match.group(1))
            else:
                # if that pattern didn't match (e.g. this well ID is just
                # BLANK or whatever), just add the entire thing
                if not w.startswith("BLANK") and len(w) > 0:
                    weird_stuff.append(w)
                well_list_notimes_ifposs.append(w)
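# As a hypothetical example of the regex above: a well ID like
# "12.10.14.ab.st.0930" gets trimmed to "12.10.14.ab.st" in
# well_list_notimes_ifposs, while well_list keeps the full string; a BLANK
# well doesn't match the pattern and is added to the list unchanged.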
# Change the columns in plate_df to be normal 1-indexed numbers
plate_df.columns = range(1, PLATE_COL_COUNT + 1)
# Validate that the number of rows detected in plates is what we expected
expected_row_ct = EXPECTED_PLATE_COUNT * len(PLATE_ROW_NAMES)
if len(plate_df.index) != expected_row_ct:
    raise ValueError(
        (
            "Found {} rows 'within' plates, but expected to find {} rows. "
            "This is because we expected the spreadsheet to contain {} plates."
        ).format(len(plate_df.index), expected_row_ct, EXPECTED_PLATE_COUNT)
    )
# OK, now we have a sane DataFrame (plate_df) where each cell is a
# sample ID, BLANK, etc.
#
# We *also* now have well_list, which is a list that contains the same
# information but "unrolled" (so it goes row1 col1 row1 col2 ... etc, all the
# way through all of the plates on the input spreadsheet).
#
# Now we can find duplicates, etc. by just searching through these data
# structures (well_list will be useful for searching and counting, and plate_df
# will be useful for determining which samples occur on "earlier"/later plates)
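# As a quick example using the default plate dimensions: well_list[0:12] is
# row A of the first plate, well_list[12:24] is row B of the first plate, and
# well_list[96:192] covers the whole second plate; plate_df.iloc[0] holds that
# same first row A.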
duplicate_text_to_frequency = {}
def is_duplicate_within_or_across_plates(duplicate_text):
    """Since each plate has exactly 96* wells, we can just iterate over
    well_list and update a "curr_plate_num" with some frequency.

    This will let us determine if a duplicate ID (i.e. a well/sample ID that
    occurs exactly twice) is duplicated on a plate or across two plates.

    Returns ("within", None) if the duplication is on a single plate.
    Returns ("across", i) if the duplication is across two plates.
    In this case, i is the integer position (1-indexed!!!) of the second
    occurrence of duplicate_text in well_list.

    * or PLATE_COL_COUNT * len(PLATE_ROW_NAMES), if you've modified the
      stuff at the top of this file.
    """
    wells_per_plate = PLATE_COL_COUNT * len(PLATE_ROW_NAMES)
    first_occurrence_plate_num = None
    curr_plate_num = 1
    i = 1
    for w in well_list:
        if w == duplicate_text:
            if first_occurrence_plate_num is None:
                first_occurrence_plate_num = curr_plate_num
            else:
                # Ok, now we know the plate numbers of both occurrences of
                # duplicate_text. Are these the same?
                if first_occurrence_plate_num == curr_plate_num:
                    return "within", None
                else:
                    return "across", i
        # i is the 1-indexed position of the well we just looked at, so bump
        # the plate number once we've hit the last well of a plate (checking
        # this before incrementing i avoids an off-by-one at wells 96, 192,
        # etc.)
        if i % wells_per_plate == 0:
            curr_plate_num += 1
        i += 1
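# For example (hypothetical), if a duplicate ID's two occurrences sit at
# 1-indexed positions 5 and 200 in well_list, they're on plates 1 and 3
# (since each plate covers 96 consecutive positions), so this returns
# ("across", 200); if they sat at positions 5 and 90 it would return
# ("within", None).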
counts_by_text = Counter(well_list)
for t in counts_by_text.keys():
    c = counts_by_text[t]
    if t != "BLANK" and c > 1:
        duplicate_text_to_frequency[t] = c

duplicate_text_to_frequency_notimes = {}
counts_by_text_notimes = Counter(well_list_notimes_ifposs)
for t in counts_by_text_notimes.keys():
    c = counts_by_text_notimes[t]
    if t != "BLANK" and c > 1:
        duplicate_text_to_frequency_notimes[t] = c
with open("extra_duplicates_if_you_try_to_ignore_times.txt", "w") as tfile:
s1 = set(duplicate_text_to_frequency.keys())
s2 = set(duplicate_text_to_frequency_notimes.keys())
# Basically, there's probably going to be stuff in s2 but not in s1. This
# is because these should correspond to IDs with just date/etc. info,
# whereas the IDs in s1 for these dates also have time info (we chopped
# that off when we applied the regex above). TLDR, this is as expected.
# (i know this comment is kind of badly explained but i'm tired)
intersection = s1 & s2
if len(intersection) != len(s1):
missing_ids = s1 - intersection
print(
"Duplicates you get if you ignore times that are not exactly "
"observed in normal duplicates:",
missing_ids,
)
# sort to preserve determinism
unique_dups = sorted(s2 - s1)
tfile.write("\n".join(unique_dups))
with open(
"extra_duplicates_if_you_try_to_ignore_times_no_stool.txt", "w"
) as tfile:
unique_dups_no_stool = [
i for i in unique_dups if not i.lower().endswith(".st")
]
tfile.write("\n".join(unique_dups_no_stool))
with open("weird_stuff.txt", "w") as wfile:
wfile.write("\n".join(weird_stuff))
with open("all_duplicates.txt", "w") as dup_file:
dup_file.write("\n".join(duplicate_text_to_frequency.keys()))
impossible_to_auto_resolve_2014_crossplate_dups = []
in2014_duplicates_across_plates = []
in2014_duplicates_within_plate = []
non2014_duplicates = []
duplicates_with_3_or_more_frequency = []
for w in duplicate_text_to_frequency.keys():
    # Find duplicates that occur exactly twice...
    if duplicate_text_to_frequency[w] == 2:
        # ...and have a "year" of 2014.
        # (this regex looks for stuff of the form "12.10.14.", on the
        # assumption that these dates represent years -- if your wells have a
        # different date format this will need to be adjusted)
        #
        # (also, note that we 'capture' some stuff from these IDs. This is
        # useful when reconstructing IDs later, but using 2015 instead of
        # 2014. See the worked example after this loop.)
        match = re.match(r"(\d\d?\.\d\d?)\.14\.(.*)", w)
        if match:
            # Ok, now we need to determine if the duplicates occur on
            # separate plates.
            # If yes, change the later occurrence in plate_df to 2015 instead.
            # If no, then report this as a within-plate duplicate.
            results = is_duplicate_within_or_across_plates(w)
            if results[0] == "across":
                w_but_with_2015 = match.group(1) + ".15." + match.group(2)
                if w_but_with_2015 in well_list:
                    impossible_to_auto_resolve_2014_crossplate_dups.append(
                        w_but_with_2015
                    )
                    print(
                        "WARNING: {} can't be auto-resolved, since {} is "
                        "already in the dataset.".format(w, w_but_with_2015)
                    )
                    # raise ValueError(
                    #     "{} is already in the sheet?".format(w_but_with_2015)
                    # )
                else:
                    # If we're here, we can auto-resolve this sample ID!
                    # Hopefully.
                    # Determine plate IDs of these samples. (Escape w, since
                    # the "."s in it would otherwise act as regex wildcards
                    # in str.contains().)
                    matching_ids = assoc_list.index.str.contains(
                        "{}\\.".format(QIITA_STUDY_ID)
                        + re.escape(w)
                        + r"[A-Za-z\.]*"
                    )
                    if sum(matching_ids) > 2:
                        print("> 2 matching IDs for {}???".format(w))
                        continue
                    elif sum(matching_ids) < 2:
                        print(
                            "Couldn't find {} twice in association "
                            "list.".format(w)
                        )
                        continue
                    else:
                        matching_samples = assoc_list.loc[matching_ids]
                        # Ok, so we know there are two matching IDs in
                        # matching_ids. What we can do is determine which is
                        # on the later plate, then get the prep of that, then
                        # change that prep's instance of this ID to use 2015
                        # in the ID instead of 2014. That is doable with
                        # modify_prep()
                        if len(set(matching_samples["sample_plate"])) == 1:
                            print(
                                "WARNING: Same plate in assoc_list "
                                "for {}.".format(w)
                            )
                            continue
                        else:
                            platelist = list(matching_samples["sample_plate"])
                            # Plate names end in "_<number>", so take the bit
                            # after the last underscore as the plate number
                            platenums = [
                                int(x.split("_")[-1]) for x in platelist
                            ]
                            # Get index of later plate
                            later_plate_idx = argmax(platenums)
                            # Later sample is the one with that later plate
                            sample_to_change = matching_samples[
                                matching_samples["sample_plate"]
                                == platelist[later_plate_idx]
                            ]
                            modify_prep(
                                sample_to_change["prep_id"].iloc[0],
                                sample_to_change.index[0],
                                sample_to_change.index[0].replace(
                                    w, w_but_with_2015
                                ),
                            )
                    # Use the 1-indexed position of the second occurrence of
                    # the duplicate to modify the plate (converting back to
                    # 0-indexing for .iat)
                    row = ceil(results[1] / PLATE_COL_COUNT) - 1
                    col = (results[1] - 1) % PLATE_COL_COUNT
                    plate_df.iat[row, col] = w_but_with_2015
                    # print("Changed second {} to {}".format(w, w_but_with_2015))
                    in2014_duplicates_across_plates.append(w)
            else:
                in2014_duplicates_within_plate.append(w)
        else:
            non2014_duplicates.append(w)
    else:
        duplicates_with_3_or_more_frequency.append(w)
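# A worked (hypothetical) example of the loop above: suppose "1.5.14.ab.st"
# shows up exactly twice, once on plate 3 and once on plate 7, and
# "1.5.15.ab.st" isn't already in the sheet. The regex rebuilds the ID as
# "1.5.15.ab.st"; if the association list also shows the two samples on two
# different plates, the occurrence on the later plate gets renamed to the
# 2015 ID both in its prep (via modify_prep()) and in plate_df.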
write_list(
    impossible_to_auto_resolve_2014_crossplate_dups,
    "impossible_to_autoresolve_duplicates.txt",
)
write_list(
    in2014_duplicates_across_plates, "2_freq_2014_duplicates_across_plates.txt"
)
write_list(
    in2014_duplicates_within_plate, "2_freq_2014_duplicates_within_plate.txt"
)
write_list(non2014_duplicates, "2_freq_non2014_duplicates.txt")
write_list(duplicates_with_3_or_more_frequency, "3ormore_freq_duplicates.txt")
plate_df.to_csv("abtx_with_simple_2014_duplicates_fixed.tsv", sep="\t")