Script to report on duplicate IDs in a plate map spreadsheet (and to modify certain duplicate IDs, in one very specific case); it also attempts to update Qiita prep files accordingly. As a warning, the code is untested and pretty gross.
#! /usr/bin/env python3
import os
import re
from collections import Counter

from numpy import argmax
import pandas as pd
from qiime2 import Metadata
# "Parameters" of this script | |
EXPECTED_PLATE_COUNT = 174 | |
PLATE_COL_COUNT = 12 | |
PLATE_ROW_NAMES = ["a", "b", "c", "d", "e", "f", "g", "h"] | |
# Qiita study ID goes here: used when updating sample IDs | |
QIITA_STUDY_ID = "" | |
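
# (With the default values above, each plate has 12 * 8 = 96 wells, and the
# spreadsheet should contain 174 * 8 = 1,392 rows that belong to plates --
# this is exactly what the expected_row_ct validation below checks.)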

def write_list(namelist, filename):
    """Writes the given list of strings to a file, one per line, sorted."""
    with open(filename, "w") as fobj:
        fobj.write("\n".join(sorted(namelist)))

# modify_prep() below both lists files in and writes files to this directory,
# so make sure it exists up front
os.makedirs("preps_automodified", exist_ok=True)


def modify_prep(prep_id, sample_id, new_sample_id):
    """Renames a single sample ID in the prep file for the given prep ID.

    Prefers an already-modified copy of the prep in preps_automodified/, if
    one exists, so that multiple modifications to the same prep accumulate
    instead of overwriting each other.
    """
    prep_dir = "preps_automodified"
    matching_preps = list(
        filter(
            lambda fn: "_" + str(prep_id) + "_" in fn,
            os.listdir(prep_dir),
        )
    )
    if len(matching_preps) == 0:
        prep_dir = "preps"
        matching_preps = list(
            filter(
                lambda fn: "_" + str(prep_id) + "_" in fn,
                os.listdir(prep_dir),
            )
        )
    if len(matching_preps) == 0:
        raise ValueError("no prep files found for '{}'".format(prep_id))
    if len(matching_preps) > 1:
        raise ValueError(
            "multiple possible preps for '{}': {}".format(
                prep_id, matching_preps
            )
        )
    prep_md = Metadata.load(
        os.path.join(prep_dir, matching_preps[0])
    ).to_dataframe()
    matching_indices = list(filter(lambda i: sample_id in i, prep_md.index))
    if len(matching_indices) > 1:
        # Multiple instances of this sample ID in this prep. We could resolve
        # this automatically, but it's simplest for now to just require
        # manual resolution.
        print("MULTIPLE INSTANCES OF SAMPLE ID ON PREP???", sample_id)
    else:
        idx_list = list(prep_md.index)
        idx_list[idx_list.index(sample_id)] = new_sample_id
        prep_md.index = idx_list
        prep_md.to_csv(
            os.path.join("preps_automodified", matching_preps[0]), sep="\t"
        )
        print("prep {}: {} -> {}".format(prep_id, sample_id, new_sample_id))

# Association list: indexed by full sample ID, with (at least) "prep_id" and
# "sample_plate" columns. Used below to figure out which prep each duplicate
# sample shows up in.
assoc_list = pd.read_csv(
    "list-no-4055-4057-4065-4087.tsv", sep="\t", index_col=0
)
# This is a spreadsheet of plate mappings. Each plate should have 8 rows, all
# labeled A through H.
sheet_df = pd.read_csv(
    "abtx-big.tsv", sep="\t", index_col=0, keep_default_na=False, dtype=str
)

# Remove extraneous columns
if len(sheet_df.columns) < PLATE_COL_COUNT:
    raise ValueError(
        "There are fewer than {} columns.".format(PLATE_COL_COUNT)
    )
# Drop all columns after the PLATE_COL_COUNT-th (remember that the index
# isn't treated as a column by pandas, since we used index_col=0 above)
sheet_df_colsubset = sheet_df.drop(
    sheet_df.columns[PLATE_COL_COUNT:], axis="columns"
)
# We have the columns filtered, but we still need to filter the rows (since
# there are empty rows in the sheet DataFrame between plate maps, etc.)
# The rows that actually 'belong' to plates will be collected in this list
# and assembled into a DataFrame just after the loop. (Collecting rows and
# constructing the DataFrame once also avoids DataFrame.append(), which has
# been removed from modern pandas.)
plate_rows = []
weird_stuff = []
# Also, to make life easier, we'll store all cell text in an 'unrolled' list:
well_list = []
well_list_notimes_ifposs = []
# Filter to rows that are 'within' plates (I know this solution is slow, but
# this way we can maintain order between plates instead of using
# .loc[["A", "B", ...]] or something where order is lost because the indices
# are sorted).
# Use of iterrows() based on https://stackoverflow.com/a/16476974/10730311.
for index, row in sheet_df_colsubset.iterrows():
    # Only pay attention to rows that are putatively in a plate (i.e. their
    # indices are in the A-H range for normal 96-well plates)
    if index.strip().lower() in PLATE_ROW_NAMES:
        plate_rows.append(row)
        for w in row:
            well_list.append(w)
            # Ignore the timestamp; just extract mm.dd.yy.hs.bs from the ID
            # (see the quick illustration right after this loop)
            match = re.match(r"(\d\d?\.\d\d?\.\d\d\.\w\w\.\w\w?).*", w)
            if match:
                well_list_notimes_ifposs.append(match.group(1))
            else:
                # If that pattern didn't match (e.g. this well ID is just
                # BLANK or whatever), just add the entire thing
                if not w.startswith("BLANK") and len(w) > 0:
                    weird_stuff.append(w)
                well_list_notimes_ifposs.append(w)

plate_df = pd.DataFrame(plate_rows)
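
# A quick, made-up illustration of the prefix regex used in the loop above
# (the ID "11.24.14.LH.St.08.30" is a hypothetical example, not a real sample
# ID): the capture keeps the mm.dd.yy.hs.bs prefix and drops the time suffix.
_demo_match = re.match(
    r"(\d\d?\.\d\d?\.\d\d\.\w\w\.\w\w?).*", "11.24.14.LH.St.08.30"
)
assert _demo_match.group(1) == "11.24.14.LH.St"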

# Change the columns in plate_df to be normal 1-indexed numbers
plate_df.columns = range(1, PLATE_COL_COUNT + 1)

# Validate that the number of rows detected in plates is what we expected
expected_row_ct = EXPECTED_PLATE_COUNT * len(PLATE_ROW_NAMES)
if len(plate_df.index) != expected_row_ct:
    raise ValueError(
        (
            "Found {} rows 'within' plates, but expected to find {} rows. "
            "This is because we expected the spreadsheet to contain {} plates."
        ).format(len(plate_df.index), expected_row_ct, EXPECTED_PLATE_COUNT)
    )

# OK, now we have a sane DataFrame (plate_df) where each cell is a
# sample ID, BLANK, etc.
#
# We *also* now have well_list, which contains the same information but
# "unrolled" (so it goes row 1 col 1, row 1 col 2, ... etc., all the way
# through all of the plates on the input spreadsheet).
#
# Now we can find duplicates, etc. by just searching through these data
# structures (well_list will be useful for searching and counting, and
# plate_df will be useful for determining which samples occur on
# "earlier"/"later" plates).
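# (Purely as an illustration of the 'unrolling': with the default 12-column
# layout, positions 1-12 of well_list are plate 1's row A, positions 13-24
# are its row B, and so on; plate 1 as a whole occupies positions 1-96.)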
duplicate_text_to_frequency = {}

def is_duplicate_within_or_across_plates(duplicate_text):
    """Determines whether a duplicated ID occurs within one plate or across two.

    Since each plate has exactly 96* wells, we can just iterate over
    well_list and keep a running "curr_plate_num" that is bumped every 96
    wells. This lets us determine if a duplicate ID (i.e. a well/sample ID
    that occurs exactly twice) is duplicated on a single plate or across two
    plates.

    Returns ("within", None) if the duplication is on a single plate.
    Returns ("across", i) if the duplication is across two plates. In this
    case, i is the integer position (1-indexed!!!) of the second occurrence
    of duplicate_text in well_list.

    * ...or PLATE_COL_COUNT * len(PLATE_ROW_NAMES) wells, if you've modified
      the stuff at the top of this file.
    """
    wells_per_plate = PLATE_COL_COUNT * len(PLATE_ROW_NAMES)
    first_occurrence_plate_num = None
    curr_plate_num = 1
    i = 1
    for w in well_list:
        if w == duplicate_text:
            if first_occurrence_plate_num is None:
                first_occurrence_plate_num = curr_plate_num
            else:
                # Ok, now we know the plate numbers of both occurrences of
                # duplicate_text. Are these the same?
                if first_occurrence_plate_num == curr_plate_num:
                    return "within", None
                else:
                    return "across", i
        # If this was the last well on a plate, the next well starts a new
        # plate. (Checking before incrementing i avoids an off-by-one where
        # the final well of each plate got attributed to the next plate.)
        if i % wells_per_plate == 0:
            curr_plate_num += 1
        i += 1
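
# A tiny sanity check of the plate geometry logic above, restated with the
# closed-form plate number ((position - 1) // wells_per_plate) + 1 (an
# equivalent formulation, not part of the loop itself): with the default
# 96-well layout, well 96 is the last well of plate 1 and well 97 is the
# first well of plate 2.
_wpp = PLATE_COL_COUNT * len(PLATE_ROW_NAMES)
assert ((96 - 1) // _wpp) + 1 == 1
assert ((97 - 1) // _wpp) + 1 == 2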

counts_by_text = Counter(well_list)
for t, c in counts_by_text.items():
    if t != "BLANK" and c > 1:
        duplicate_text_to_frequency[t] = c

duplicate_text_to_frequency_notimes = {}
counts_by_text_notimes = Counter(well_list_notimes_ifposs)
for t, c in counts_by_text_notimes.items():
    if t != "BLANK" and c > 1:
        duplicate_text_to_frequency_notimes[t] = c
with open("extra_duplicates_if_you_try_to_ignore_times.txt", "w") as tfile: | |
s1 = set(duplicate_text_to_frequency.keys()) | |
s2 = set(duplicate_text_to_frequency_notimes.keys()) | |
# Basically, there's probably going to be stuff in s2 but not in s1. This | |
# is because these should correspond to IDs with just date/etc. info, | |
# whereas the IDs in s1 for these dates also have time info (we chopped | |
# that off when we applied the regex above). TLDR, this is as expected. | |
# (i know this comment is kind of badly explained but i'm tired) | |
intersection = s1 & s2 | |
if len(intersection) != len(s1): | |
missing_ids = s1 - intersection | |
print( | |
"Duplicates you get if you ignore times that are not exactly " | |
"observed in normal duplicates:", | |
missing_ids, | |
) | |
# sort to preserve determinism | |
unique_dups = sorted(s2 - s1) | |
tfile.write("\n".join(unique_dups)) | |
with open( | |
"extra_duplicates_if_you_try_to_ignore_times_no_stool.txt", "w" | |
) as tfile: | |
unique_dups_no_stool = [ | |
i for i in unique_dups if not i.lower().endswith(".st") | |
] | |
tfile.write("\n".join(unique_dups_no_stool)) | |
with open("weird_stuff.txt", "w") as wfile: | |
wfile.write("\n".join(weird_stuff)) | |
with open("all_duplicates.txt", "w") as dup_file: | |
dup_file.write("\n".join(duplicate_text_to_frequency.keys())) | |

impossible_to_auto_resolve_2014_crossplate_dups = []
in2014_duplicates_across_plates = []
in2014_duplicates_within_plate = []
non2014_duplicates = []
duplicates_with_3_or_more_frequency = []
for w in duplicate_text_to_frequency.keys():
    # Find duplicates that occur exactly twice...
    if duplicate_text_to_frequency[w] == 2:
        # ...and have a "year" of 2014.
        # (This regex looks for stuff of the form "12.10.14.", on the
        # assumption that these dates represent years -- if your wells have
        # a different date format this will need to be adjusted.)
        #
        # (Also, note that we 'capture' some stuff from these IDs. This is
        # useful when reconstructing IDs later on, using 2015 instead of
        # 2014.)
        match = re.match(r"(\d\d?\.\d\d?)\.14\.(.*)", w)
        if match:
            # Ok, now we need to determine if the duplicates occur on
            # separate plates.
            # If yes, change the later occurrence in plate_df to 2015.
            # If no, then report this as a within-plate duplicate.
            results = is_duplicate_within_or_across_plates(w)
            if results[0] == "across":
                w_but_with_2015 = match.group(1) + ".15." + match.group(2)
                if w_but_with_2015 in well_list:
                    impossible_to_auto_resolve_2014_crossplate_dups.append(
                        w_but_with_2015
                    )
                    print(
                        "WARNING: {} can't be auto-resolved, since {} is "
                        "already in the dataset.".format(w, w_but_with_2015)
                    )
                else:
                    # If we're here, we can auto-resolve this sample ID!
                    # Hopefully.
                    # Determine plate IDs of these samples. (re.escape()
                    # keeps the dots in the IDs from being treated as regex
                    # wildcards.)
                    matching_ids = assoc_list.index.str.contains(
                        re.escape("{}.".format(QIITA_STUDY_ID) + w)
                        + r"[A-Za-z.]*"
                    )
                    if sum(matching_ids) > 2:
                        print("> 2 matching IDs for {}???".format(w))
                        continue
                    elif sum(matching_ids) < 2:
                        print(
                            "Couldn't find {} twice in association "
                            "list.".format(w)
                        )
                        continue
                    else:
                        matching_samples = assoc_list.loc[matching_ids]
                        # Ok, so we know there are two matching IDs in
                        # matching_ids. What we can do is determine which is
                        # on the later plate, then get the prep of that,
                        # then change that prep's instance of this ID to use
                        # 2015 instead of 2014. That is doable with
                        # modify_prep().
                        if len(set(matching_samples["sample_plate"])) == 1:
                            print(
                                "WARNING: Same plate in assoc_list for "
                                "{}.".format(w)
                            )
                            continue
                        else:
                            platelist = list(
                                matching_samples["sample_plate"]
                            )
                            # Plate names end in "_<number>"; pull those
                            # numbers out so the plates can be compared
                            platenums = [
                                int(x.split("_")[-1]) for x in platelist
                            ]
                            # Get index of the later plate
                            later_plate_idx = argmax(platenums)
                            # The later sample is the one on that later plate
                            sample_to_change = matching_samples[
                                matching_samples["sample_plate"]
                                == platelist[later_plate_idx]
                            ]
                            modify_prep(
                                sample_to_change["prep_id"].iloc[0],
                                sample_to_change.index[0],
                                sample_to_change.index[0].replace(
                                    w, w_but_with_2015
                                ),
                            )
                    # Use the 1-indexed position of the second occurrence of
                    # the duplicate to modify the plate; subtracting 1 first
                    # converts back to 0-indexing for both the row and the
                    # column. (See the sanity check right after this loop.)
                    row = (results[1] - 1) // PLATE_COL_COUNT
                    col = (results[1] - 1) % PLATE_COL_COUNT
                    plate_df.iat[row, col] = w_but_with_2015
                    # print("Changed second {} to {}".format(w, w_but_with_2015))
                    in2014_duplicates_across_plates.append(w)
            else:
                in2014_duplicates_within_plate.append(w)
        else:
            non2014_duplicates.append(w)
    else:
        duplicates_with_3_or_more_frequency.append(w)
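
# Sanity check of the position -> (row, col) conversion used in the loop
# above (position 25 here is arbitrary and purely illustrative): with the
# default 12-column layout, 1-indexed position 25 lands in 0-indexed row 2,
# column 0.
assert (25 - 1) // PLATE_COL_COUNT == 2
assert (25 - 1) % PLATE_COL_COUNT == 0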

write_list(
    impossible_to_auto_resolve_2014_crossplate_dups,
    "impossible_to_autoresolve_duplicates.txt",
)
write_list(
    in2014_duplicates_across_plates, "2_freq_2014_duplicates_across_plates.txt"
)
write_list(
    in2014_duplicates_within_plate, "2_freq_2014_duplicates_within_plate.txt"
)
write_list(non2014_duplicates, "2_freq_non2014_duplicates.txt")
write_list(duplicates_with_3_or_more_frequency, "3ormore_freq_duplicates.txt")

plate_df.to_csv("abtx_with_simple_2014_duplicates_fixed.tsv", sep="\t")