Script to report on duplicate IDs in a plate map spreadsheet (and to modify certain duplicate IDs, in one very specific case); it also attempts to update Qiita prep files accordingly. As a warning, the code is untested and pretty gross.
#! /usr/bin/env python3
import os
import re
from collections import Counter

from numpy import argmax
import pandas as pd
from qiime2 import Metadata
# "Parameters" of this script | |
EXPECTED_PLATE_COUNT = 174 | |
PLATE_COL_COUNT = 12 | |
PLATE_ROW_NAMES = ["a", "b", "c", "d", "e", "f", "g", "h"] | |
# Qiita study ID goes here: used when updating sample IDs | |
QIITA_STUDY_ID = "" | |
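
# (With the default values above, each plate has 12 * 8 = 96 wells, and the
# spreadsheet should contain 174 * 8 = 1,392 rows that belong to plates --
# this is exactly what the expected_row_ct validation below checks.)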

def write_list(namelist, filename):
    """Writes the given list of strings to a file, one per line, sorted."""
    with open(filename, "w") as fobj:
        fobj.write("\n".join(sorted(namelist)))

# modify_prep() below both lists files in and writes files to this directory,
# so make sure it exists up front
os.makedirs("preps_automodified", exist_ok=True)


def modify_prep(prep_id, sample_id, new_sample_id):
    """Renames a single sample ID in the prep file for the given prep ID.

    Prefers an already-modified copy of the prep in preps_automodified/, if
    one exists, so that multiple modifications to the same prep accumulate
    instead of overwriting each other.
    """
    prep_dir = "preps_automodified"
    matching_preps = list(
        filter(
            lambda fn: "_" + str(prep_id) + "_" in fn,
            os.listdir(prep_dir),
        )
    )
    if len(matching_preps) == 0:
        prep_dir = "preps"
        matching_preps = list(
            filter(
                lambda fn: "_" + str(prep_id) + "_" in fn,
                os.listdir(prep_dir),
            )
        )
    if len(matching_preps) == 0:
        raise ValueError("no prep files found for '{}'".format(prep_id))
    if len(matching_preps) > 1:
        raise ValueError(
            "multiple possible preps for '{}': {}".format(
                prep_id, matching_preps
            )
        )
    prep_md = Metadata.load(
        os.path.join(prep_dir, matching_preps[0])
    ).to_dataframe()
    matching_indices = list(filter(lambda i: sample_id in i, prep_md.index))
    if len(matching_indices) > 1:
        # Multiple instances of this sample ID in this prep. We could resolve
        # this automatically, but it's simplest for now to just require
        # manual resolution.
        print("MULTIPLE INSTANCES OF SAMPLE ID ON PREP???", sample_id)
    else:
        idx_list = list(prep_md.index)
        idx_list[idx_list.index(sample_id)] = new_sample_id
        prep_md.index = idx_list
        prep_md.to_csv(
            os.path.join("preps_automodified", matching_preps[0]), sep="\t"
        )
        print("prep {}: {} -> {}".format(prep_id, sample_id, new_sample_id))

# Association list: indexed by full sample ID, with (at least) "prep_id" and
# "sample_plate" columns. Used below to figure out which prep each duplicate
# sample shows up in.
assoc_list = pd.read_csv(
    "list-no-4055-4057-4065-4087.tsv", sep="\t", index_col=0
)
# This is a spreadsheet of plate mappings. Each plate should have 8 rows, all
# labeled A through H.
sheet_df = pd.read_csv(
    "abtx-big.tsv", sep="\t", index_col=0, keep_default_na=False, dtype=str
)

# Remove extraneous columns
if len(sheet_df.columns) < PLATE_COL_COUNT:
    raise ValueError(
        "There are fewer than {} columns.".format(PLATE_COL_COUNT)
    )
# Drop all columns after the PLATE_COL_COUNT-th (remember that the index
# isn't treated as a column by pandas, since we used index_col=0 above)
sheet_df_colsubset = sheet_df.drop(
    sheet_df.columns[PLATE_COL_COUNT:], axis="columns"
)
# We have the columns filtered, but we still need to filter the rows (since
# there are empty rows in the sheet DataFrame between plate maps, etc.)
# The rows that actually 'belong' to plates will be collected in this list
# and assembled into a DataFrame just after the loop. (Collecting rows and
# constructing the DataFrame once also avoids DataFrame.append(), which has
# been removed from modern pandas.)
plate_rows = []
weird_stuff = []
# Also, to make life easier, we'll store all cell text in an 'unrolled' list:
well_list = []
well_list_notimes_ifposs = []
# Filter to rows that are 'within' plates (I know this solution is slow, but
# this way we can maintain order between plates instead of using
# .loc[["A", "B", ...]] or something where order is lost because the indices
# are sorted).
# Use of iterrows() based on https://stackoverflow.com/a/16476974/10730311.
for index, row in sheet_df_colsubset.iterrows():
    # Only pay attention to rows that are putatively in a plate (i.e. their
    # indices are in the A-H range for normal 96-well plates)
    if index.strip().lower() in PLATE_ROW_NAMES:
        plate_rows.append(row)
        for w in row:
            well_list.append(w)
            # Ignore the timestamp; just extract mm.dd.yy.hs.bs from the ID
            # (see the quick illustration right after this loop)
            match = re.match(r"(\d\d?\.\d\d?\.\d\d\.\w\w\.\w\w?).*", w)
            if match:
                well_list_notimes_ifposs.append(match.group(1))
            else:
                # If that pattern didn't match (e.g. this well ID is just
                # BLANK or whatever), just add the entire thing
                if not w.startswith("BLANK") and len(w) > 0:
                    weird_stuff.append(w)
                well_list_notimes_ifposs.append(w)

plate_df = pd.DataFrame(plate_rows)
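
# A quick, made-up illustration of the prefix regex used in the loop above
# (the ID "11.24.14.LH.St.08.30" is a hypothetical example, not a real sample
# ID): the capture keeps the mm.dd.yy.hs.bs prefix and drops the time suffix.
_demo_match = re.match(
    r"(\d\d?\.\d\d?\.\d\d\.\w\w\.\w\w?).*", "11.24.14.LH.St.08.30"
)
assert _demo_match.group(1) == "11.24.14.LH.St"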

# Change the columns in plate_df to be normal 1-indexed numbers
plate_df.columns = range(1, PLATE_COL_COUNT + 1)

# Validate that the number of rows detected in plates is what we expected
expected_row_ct = EXPECTED_PLATE_COUNT * len(PLATE_ROW_NAMES)
if len(plate_df.index) != expected_row_ct:
    raise ValueError(
        (
            "Found {} rows 'within' plates, but expected to find {} rows. "
            "This is because we expected the spreadsheet to contain {} plates."
        ).format(len(plate_df.index), expected_row_ct, EXPECTED_PLATE_COUNT)
    )

# OK, now we have a sane DataFrame (plate_df) where each cell is a
# sample ID, BLANK, etc.
#
# We *also* now have well_list, which contains the same information but
# "unrolled" (so it goes row 1 col 1, row 1 col 2, ... etc., all the way
# through all of the plates on the input spreadsheet).
#
# Now we can find duplicates, etc. by just searching through these data
# structures (well_list will be useful for searching and counting, and
# plate_df will be useful for determining which samples occur on
# "earlier"/"later" plates).
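# (Purely as an illustration of the 'unrolling': with the default 12-column
# layout, positions 1-12 of well_list are plate 1's row A, positions 13-24
# are its row B, and so on; plate 1 as a whole occupies positions 1-96.)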
duplicate_text_to_frequency = {}

def is_duplicate_within_or_across_plates(duplicate_text):
    """Determines whether a duplicated ID occurs within one plate or across two.

    Since each plate has exactly 96* wells, we can just iterate over
    well_list and keep a running "curr_plate_num" that is bumped every 96
    wells. This lets us determine if a duplicate ID (i.e. a well/sample ID
    that occurs exactly twice) is duplicated on a single plate or across two
    plates.

    Returns ("within", None) if the duplication is on a single plate.
    Returns ("across", i) if the duplication is across two plates. In this
    case, i is the integer position (1-indexed!!!) of the second occurrence
    of duplicate_text in well_list.

    * ...or PLATE_COL_COUNT * len(PLATE_ROW_NAMES) wells, if you've modified
      the stuff at the top of this file.
    """
    wells_per_plate = PLATE_COL_COUNT * len(PLATE_ROW_NAMES)
    first_occurrence_plate_num = None
    curr_plate_num = 1
    i = 1
    for w in well_list:
        if w == duplicate_text:
            if first_occurrence_plate_num is None:
                first_occurrence_plate_num = curr_plate_num
            else:
                # Ok, now we know the plate numbers of both occurrences of
                # duplicate_text. Are these the same?
                if first_occurrence_plate_num == curr_plate_num:
                    return "within", None
                else:
                    return "across", i
        # If this was the last well on a plate, the next well starts a new
        # plate. (Checking before incrementing i avoids an off-by-one where
        # the final well of each plate got attributed to the next plate.)
        if i % wells_per_plate == 0:
            curr_plate_num += 1
        i += 1
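
# A tiny sanity check of the plate geometry logic above, restated with the
# closed-form plate number ((position - 1) // wells_per_plate) + 1 (an
# equivalent formulation, not part of the loop itself): with the default
# 96-well layout, well 96 is the last well of plate 1 and well 97 is the
# first well of plate 2.
_wpp = PLATE_COL_COUNT * len(PLATE_ROW_NAMES)
assert ((96 - 1) // _wpp) + 1 == 1
assert ((97 - 1) // _wpp) + 1 == 2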

counts_by_text = Counter(well_list)
for t, c in counts_by_text.items():
    if t != "BLANK" and c > 1:
        duplicate_text_to_frequency[t] = c

duplicate_text_to_frequency_notimes = {}
counts_by_text_notimes = Counter(well_list_notimes_ifposs)
for t, c in counts_by_text_notimes.items():
    if t != "BLANK" and c > 1:
        duplicate_text_to_frequency_notimes[t] = c
with open("extra_duplicates_if_you_try_to_ignore_times.txt", "w") as tfile: | |
s1 = set(duplicate_text_to_frequency.keys()) | |
s2 = set(duplicate_text_to_frequency_notimes.keys()) | |
# Basically, there's probably going to be stuff in s2 but not in s1. This | |
# is because these should correspond to IDs with just date/etc. info, | |
# whereas the IDs in s1 for these dates also have time info (we chopped | |
# that off when we applied the regex above). TLDR, this is as expected. | |
# (i know this comment is kind of badly explained but i'm tired) | |
intersection = s1 & s2 | |
if len(intersection) != len(s1): | |
missing_ids = s1 - intersection | |
print( | |
"Duplicates you get if you ignore times that are not exactly " | |
"observed in normal duplicates:", | |
missing_ids, | |
) | |
# sort to preserve determinism | |
unique_dups = sorted(s2 - s1) | |
tfile.write("\n".join(unique_dups)) | |
with open( | |
"extra_duplicates_if_you_try_to_ignore_times_no_stool.txt", "w" | |
) as tfile: | |
unique_dups_no_stool = [ | |
i for i in unique_dups if not i.lower().endswith(".st") | |
] | |
tfile.write("\n".join(unique_dups_no_stool)) | |
with open("weird_stuff.txt", "w") as wfile: | |
wfile.write("\n".join(weird_stuff)) | |
with open("all_duplicates.txt", "w") as dup_file: | |
dup_file.write("\n".join(duplicate_text_to_frequency.keys())) | |

impossible_to_auto_resolve_2014_crossplate_dups = []
in2014_duplicates_across_plates = []
in2014_duplicates_within_plate = []
non2014_duplicates = []
duplicates_with_3_or_more_frequency = []
for w in duplicate_text_to_frequency.keys():
    # Find duplicates that occur exactly twice...
    if duplicate_text_to_frequency[w] == 2:
        # ...and have a "year" of 2014.
        # (This regex looks for stuff of the form "12.10.14.", on the
        # assumption that these dates represent years -- if your wells have
        # a different date format this will need to be adjusted.)
        #
        # (Also, note that we 'capture' some stuff from these IDs. This is
        # useful when reconstructing IDs later on, using 2015 instead of
        # 2014.)
        match = re.match(r"(\d\d?\.\d\d?)\.14\.(.*)", w)
        if match:
            # Ok, now we need to determine if the duplicates occur on
            # separate plates.
            # If yes, change the later occurrence in plate_df to 2015.
            # If no, then report this as a within-plate duplicate.
            results = is_duplicate_within_or_across_plates(w)
            if results[0] == "across":
                w_but_with_2015 = match.group(1) + ".15." + match.group(2)
                if w_but_with_2015 in well_list:
                    impossible_to_auto_resolve_2014_crossplate_dups.append(
                        w_but_with_2015
                    )
                    print(
                        "WARNING: {} can't be auto-resolved, since {} is "
                        "already in the dataset.".format(w, w_but_with_2015)
                    )
                else:
                    # If we're here, we can auto-resolve this sample ID!
                    # Hopefully.
                    # Determine plate IDs of these samples. (re.escape()
                    # keeps the dots in the IDs from being treated as regex
                    # wildcards.)
                    matching_ids = assoc_list.index.str.contains(
                        re.escape("{}.".format(QIITA_STUDY_ID) + w)
                        + r"[A-Za-z.]*"
                    )
                    if sum(matching_ids) > 2:
                        print("> 2 matching IDs for {}???".format(w))
                        continue
                    elif sum(matching_ids) < 2:
                        print(
                            "Couldn't find {} twice in association "
                            "list.".format(w)
                        )
                        continue
                    else:
                        matching_samples = assoc_list.loc[matching_ids]
                        # Ok, so we know there are two matching IDs in
                        # matching_ids. What we can do is determine which is
                        # on the later plate, then get the prep of that,
                        # then change that prep's instance of this ID to use
                        # 2015 instead of 2014. That is doable with
                        # modify_prep().
                        if len(set(matching_samples["sample_plate"])) == 1:
                            print(
                                "WARNING: Same plate in assoc_list for "
                                "{}.".format(w)
                            )
                            continue
                        else:
                            platelist = list(
                                matching_samples["sample_plate"]
                            )
                            # Plate names end in "_<number>"; pull those
                            # numbers out so the plates can be compared
                            platenums = [
                                int(x.split("_")[-1]) for x in platelist
                            ]
                            # Get index of the later plate
                            later_plate_idx = argmax(platenums)
                            # The later sample is the one on that later plate
                            sample_to_change = matching_samples[
                                matching_samples["sample_plate"]
                                == platelist[later_plate_idx]
                            ]
                            modify_prep(
                                sample_to_change["prep_id"].iloc[0],
                                sample_to_change.index[0],
                                sample_to_change.index[0].replace(
                                    w, w_but_with_2015
                                ),
                            )
                    # Use the 1-indexed position of the second occurrence of
                    # the duplicate to modify the plate; subtracting 1 first
                    # converts back to 0-indexing for both the row and the
                    # column. (See the sanity check right after this loop.)
                    row = (results[1] - 1) // PLATE_COL_COUNT
                    col = (results[1] - 1) % PLATE_COL_COUNT
                    plate_df.iat[row, col] = w_but_with_2015
                    # print("Changed second {} to {}".format(w, w_but_with_2015))
                    in2014_duplicates_across_plates.append(w)
            else:
                in2014_duplicates_within_plate.append(w)
        else:
            non2014_duplicates.append(w)
    else:
        duplicates_with_3_or_more_frequency.append(w)
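
# Sanity check of the position -> (row, col) conversion used in the loop
# above (position 25 here is arbitrary and purely illustrative): with the
# default 12-column layout, 1-indexed position 25 lands in 0-indexed row 2,
# column 0.
assert (25 - 1) // PLATE_COL_COUNT == 2
assert (25 - 1) % PLATE_COL_COUNT == 0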

write_list(
    impossible_to_auto_resolve_2014_crossplate_dups,
    "impossible_to_autoresolve_duplicates.txt",
)
write_list(
    in2014_duplicates_across_plates, "2_freq_2014_duplicates_across_plates.txt"
)
write_list(
    in2014_duplicates_within_plate, "2_freq_2014_duplicates_within_plate.txt"
)
write_list(non2014_duplicates, "2_freq_non2014_duplicates.txt")
write_list(duplicates_with_3_or_more_frequency, "3ormore_freq_duplicates.txt")

plate_df.to_csv("abtx_with_simple_2014_duplicates_fixed.tsv", sep="\t")