# fedarko/validate_collection_date_and_timestamp.py

## validate_collection_date_and_timestamp.py
#! /usr/bin/env python3
# note that this is very preliminary + untested code
#
# Checks a tab-separated metadata file and prints every sample whose
# collection_timestamp does not begin with its collection_date.
#
# Usage: validate_collection_date_and_timestamp.py METADATA_FILE

# We use pd.read_csv() because, unlike QIIME 2's Metadata object, it allows
# duplicate sample IDs.
import pandas as pd
import sys

DATE_COL = "collection_date"
TIMESTAMP_COL = "collection_timestamp"


def _as_text(value):
    """Return a cell as a string, mapping missing values (None/NaN) to ""."""
    return "" if pd.isna(value) else str(value)


def find_mismatches(df):
    """Find samples whose timestamp does not start with their date.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata indexed by sample ID. Must contain the columns named by
        DATE_COL and TIMESTAMP_COL. Duplicate sample IDs are fine.

    Returns
    -------
    list of (sample_id, date, timestamp) tuples
        One entry per offending row, in the order the rows appear in df.
        date and timestamp are strings; a missing cell is reported as "".

    Raises
    ------
    ValueError
        If either required column is absent from df.
    """
    absent = [col for col in (DATE_COL, TIMESTAMP_COL) if col not in df.columns]
    if absent:
        raise ValueError(
            "Metadata file is missing required column(s): " + ", ".join(absent)
        )

    mismatches = []
    # Walking the two columns in lockstep avoids df.itertuples() and the
    # positional-offset bookkeeping it needs.
    for sample_id, date, timestamp in zip(
        df.index, df[DATE_COL], df[TIMESTAMP_COL]
    ):
        # Normalize to text so an empty or numeric-looking cell can't make
        # .startswith() blow up on a float/int.
        date, timestamp = _as_text(date), _as_text(timestamp)
        if not timestamp.startswith(date):
            mismatches.append((sample_id, date, timestamp))
    return mismatches


def main(argv=None):
    """Read the metadata file named in argv[1] and print each mismatch.

    argv defaults to sys.argv. Raises ValueError if no file was given.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        raise ValueError("You need to specify a metadata file to check.")
    # dtype=str + keep_default_na=False keep every cell exactly as written in
    # the file: dates are never coerced to numbers, and blanks stay "" instead
    # of becoming float NaN.
    df = pd.read_csv(
        args[1], sep='\t', index_col=0, dtype=str, keep_default_na=False
    )
    for sample_id, date, timestamp in find_mismatches(df):
        # If the date and timestamp don't match up, print the Sample ID and
        # offending date + timestamp.
        print("Sample ID:", sample_id, "| collection_date:", date, "| collection_timestamp:", timestamp)


if __name__ == "__main__":
    main()
	#! /usr/bin/env python3
# note that this is very preliminary + untested code
#
# Checks a tab-separated metadata file and prints every sample whose
# collection_timestamp does not begin with its collection_date.
#
# Usage: validate_collection_date_and_timestamp.py METADATA_FILE

# We use pd.read_csv() because, unlike QIIME 2's Metadata object, it allows
# duplicate sample IDs.
import pandas as pd
import sys

DATE_COL = "collection_date"
TIMESTAMP_COL = "collection_timestamp"


def _as_text(value):
    """Return a cell as a string, mapping missing values (None/NaN) to ""."""
    return "" if pd.isna(value) else str(value)


def find_mismatches(df):
    """Find samples whose timestamp does not start with their date.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata indexed by sample ID. Must contain the columns named by
        DATE_COL and TIMESTAMP_COL. Duplicate sample IDs are fine.

    Returns
    -------
    list of (sample_id, date, timestamp) tuples
        One entry per offending row, in the order the rows appear in df.
        date and timestamp are strings; a missing cell is reported as "".

    Raises
    ------
    ValueError
        If either required column is absent from df.
    """
    absent = [col for col in (DATE_COL, TIMESTAMP_COL) if col not in df.columns]
    if absent:
        raise ValueError(
            "Metadata file is missing required column(s): " + ", ".join(absent)
        )

    mismatches = []
    # Walking the two columns in lockstep avoids df.itertuples() and the
    # positional-offset bookkeeping it needs.
    for sample_id, date, timestamp in zip(
        df.index, df[DATE_COL], df[TIMESTAMP_COL]
    ):
        # Normalize to text so an empty or numeric-looking cell can't make
        # .startswith() blow up on a float/int.
        date, timestamp = _as_text(date), _as_text(timestamp)
        if not timestamp.startswith(date):
            mismatches.append((sample_id, date, timestamp))
    return mismatches


def main(argv=None):
    """Read the metadata file named in argv[1] and print each mismatch.

    argv defaults to sys.argv. Raises ValueError if no file was given.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        raise ValueError("You need to specify a metadata file to check.")
    # dtype=str + keep_default_na=False keep every cell exactly as written in
    # the file: dates are never coerced to numbers, and blanks stay "" instead
    # of becoming float NaN.
    df = pd.read_csv(
        args[1], sep='\t', index_col=0, dtype=str, keep_default_na=False
    )
    for sample_id, date, timestamp in find_mismatches(df):
        # If the date and timestamp don't match up, print the Sample ID and
        # offending date + timestamp.
        print("Sample ID:", sample_id, "| collection_date:", date, "| collection_timestamp:", timestamp)


if __name__ == "__main__":
    main()