fedarko/validate_sample_ids_and_timestamps.py

## validate_sample_ids_and_timestamps.py
#! /usr/bin/env python3
import re
import sys

import pandas as pd
from dateutil.parser import parse

# Load the sample metadata; the first column (the sample ID) becomes the index.
m = pd.read_csv("metadata.tsv", sep='\t', index_col=0)
# Find all sample IDs in the metadata that include a "date"
# We assume that sample IDs that start with a 5-digit Qiita ID, then a period,
# then a two-character host ID string, then another period, will follow this
# convention.
#
# NOTE that this will result in some samples being excluded (e.g. those that
# follow the convention of "12345.T.MF.8.20.2010").
#
# An example sample ID matching our expectations is "12345.MF.8.20.2010" -- we
# detect the match by looking at the first 9 characters ("12345.MF."), and
# later on we'll slice off the remaining characters as the date ("8.20.2010").
ID_PREFIX_LENGTH = 9
pattern = re.compile(r"\d{5}\.[A-Z]{2}\.")
# Non-string index values (e.g. the NaN pandas produces for an empty ID cell)
# cannot follow the convention, so they are skipped instead of being sliced.
date_sample_ids = [
    sid
    for sid in m.index
    if isinstance(sid, str) and pattern.match(sid[:ID_PREFIX_LENGTH])
]

# For all "date" sample IDs in the metadata file, try to parse their "date"
# component along with their collection_timestamp field value. See if there's a
# difference. If so, print information about these discordant IDs.
#
# A sample whose embedded date or collection_timestamp cannot be parsed (e.g.
# "8.20.2010A", or a missing timestamp) is reported on stderr and skipped, so
# one bad row does not hide the discordant IDs that come after it.
m_subset = m.loc[date_sample_ids]
for dsid in m_subset.index:
    try:
        # Everything after the "12345.MF." prefix is treated as the date.
        id_datetime = parse(dsid[ID_PREFIX_LENGTH:])
        explicit_datetime = parse(m["collection_timestamp"][dsid])
    except (TypeError, ValueError, OverflowError) as err:
        # dateutil raises ValueError for text it cannot interpret,
        # OverflowError for out-of-range dates, and TypeError for non-string
        # input (such as a NaN timestamp).
        print(dsid, "could not be checked:", err, file=sys.stderr)
        continue
    if id_datetime != explicit_datetime:
        print(dsid, id_datetime, explicit_datetime)
	#! /usr/bin/env python3
	import re
	import pandas as pd
	from dateutil.parser import parse

	# Load the sample metadata; the first column (the sample ID) becomes the index.
	m = pd.read_csv("metadata.tsv", sep='\t', index_col=0)
	# Find all sample IDs in the metadata that include a "date"
	# We assume that sample IDs that start with a 5-digit Qiita ID, then a period,
	# then a two-character host ID string, then another period, will follow this
	# convention.
	#
	# NOTE that this will result in some samples being excluded (e.g. those that
	# follow the convention of "12345.T.MF.8.20.2010").
	#
	# An example sample ID matching our expectations is "12345.MF.8.20.2010" -- we
	# detect the match by looking at the first 9 characters ("12345.MF."), and
	# later on we'll slice off the remaining characters as the date ("8.20.2010").
	ID_PREFIX_LENGTH = 9
	pattern = re.compile(r"\d{5}\.[A-Z]{2}\.")
	# Non-string index values (e.g. the NaN pandas produces for an empty ID cell)
	# cannot follow the convention, so they are skipped instead of being sliced.
	date_sample_ids = [
	    sid
	    for sid in m.index
	    if isinstance(sid, str) and pattern.match(sid[:ID_PREFIX_LENGTH])
	]

	# For all "date" sample IDs in the metadata file, try to parse their "date"
	# component along with their collection_timestamp field value. See if there's a
	# difference. If so, print information about these discordant IDs.
	#
	# A sample whose embedded date or collection_timestamp cannot be parsed (e.g.
	# "8.20.2010A", or a missing timestamp) is reported on stderr and skipped, so
	# one bad row does not hide the discordant IDs that come after it.
	m_subset = m.loc[date_sample_ids]
	for dsid in m_subset.index:
	    try:
	        # Everything after the "12345.MF." prefix is treated as the date.
	        id_datetime = parse(dsid[ID_PREFIX_LENGTH:])
	        explicit_datetime = parse(m["collection_timestamp"][dsid])
	    except (TypeError, ValueError, OverflowError) as err:
	        # dateutil raises ValueError for text it cannot interpret,
	        # OverflowError for out-of-range dates, and TypeError for non-string
	        # input (such as a NaN timestamp).
	        print(dsid, "could not be checked:", err, file=sys.stderr)
	        continue
	    if id_datetime != explicit_datetime:
	        print(dsid, id_datetime, explicit_datetime)