# fedarko/find_missing_dates.py

## find_missing_dates.py
#! /usr/bin/env python3
"""Report calendar days with no samples inside a subject's sampling window.

Reads a tab-separated sample metadata table, optionally keeps only the rows
of one host subject, and compares the days covered by the parseable
"collection_timestamp" values against every day between the earliest and the
latest of them. Summary statistics are printed, and the missing days are
written (one YYYY-MM-DD date per line) to an output file.
"""
from dateutil.parser import parse
import pandas as pd


def parse_timestamps(raw_timestamps):
    """Parse each raw timestamp, silently skipping unusable values.

    Parameters:
        raw_timestamps: iterable of values taken from the metadata column.

    Returns:
        A list of datetimes, in input order, for the values that parsed.
    """
    parsed = []
    for raw in raw_timestamps:
        try:
            parsed.append(parse(raw))
        except (TypeError, ValueError, OverflowError):
            # ValueError: malformed date string. TypeError: not a string at
            # all, e.g. the NaN float pandas uses for an empty cell.
            # OverflowError: a numeric field too large for a datetime.
            continue
    return parsed


def days_in_range(first, last):
    """Return every calendar day from first's day through last's day.

    Both endpoints are truncated to their date before the range is built.
    Building the range from the raw timestamps would step in 24-hour
    increments from first's time of day, which drops the final day whenever
    last falls earlier in the day than first.

    Parameters:
        first: datetime of the earliest sample.
        last: datetime of the latest sample.

    Returns:
        A list of datetime.date objects, ascending and inclusive.
    """
    return [
        day.date()
        for day in pd.date_range(first.date(), last.date(), freq="D")
    ]


def count_by_year(dates):
    """Return a {year: number of dates} dict with years in ascending order."""
    counts = {}
    # Sorting first makes the dict's insertion (and so printed) order stable
    # even when the input is an unordered set.
    for d in sorted(dates):
        counts[d.year] = counts.get(d.year, 0) + 1
    return counts


def main(
    metadata_path="20191209_metadata.txt",
    host_subject_id="M03",
    output_path="missing_dates.txt",
):
    """Print missing-date statistics and write the missing dates to a file.

    Parameters:
        metadata_path: tab-separated metadata file; its first column is used
            as the index.
        host_subject_id: only rows whose "host_subject_id" equals this value
            are considered. Pass None to use every row.
        output_path: file that receives the sorted missing dates.

    Raises:
        SystemExit: if no "collection_timestamp" value could be parsed.
    """
    df = pd.read_csv(metadata_path, sep="\t", index_col=0)

    # Subset to a certain host subject ID, if desired
    if host_subject_id is not None:
        df = df[df["host_subject_id"] == host_subject_id]

    # Parse every timestamp once; all later steps reuse this list.
    timestamps = parse_timestamps(df["collection_timestamp"])
    if not timestamps:
        raise SystemExit(
            "No parseable collection_timestamp values were found, so there "
            "is no date range to check."
        )

    # Identify earliest and latest timestamps
    min_date = min(timestamps)
    max_date = max(timestamps)

    # Get (inclusive) range of all days in [min_date, max_date]
    date_range = days_in_range(min_date, max_date)

    # Print information about data
    print("Earliest date is {}. Latest date is {}.".format(min_date, max_date))
    print("In total, this range spans {} days.".format(len(date_range)))

    # Get all dates in the metadata (just down to the date level)
    sample_dates = {ts.date() for ts in timestamps}
    print(
        "There are {} unique dates (down to the day level) in the "
        "metadata.".format(len(sample_dates))
    )

    # See which dates in the range are not represented in the sample metadata
    missing_dates = set(date_range) - sample_dates
    print(
        "There are {} dates in the date range that are not represented in the "
        "sample metadata.".format(len(missing_dates))
    )

    # Finish up by outputting missing dates to a file
    with open(output_path, "w") as md_fobj:
        md_fobj.write("\n".join(str(d) for d in sorted(missing_dates)))

    print("Wrote out a list of missing dates to {}.".format(output_path))

    # Get stats about missing dates by year
    print("Number of missing dates stratified by year:")
    print(count_by_year(missing_dates))


if __name__ == "__main__":
    main()
	#! /usr/bin/env python3
	"""Report calendar days with no samples inside a subject's sampling window.

	Reads a tab-separated sample metadata table, optionally keeps only the rows
	of one host subject, and compares the days covered by the parseable
	"collection_timestamp" values against every day between the earliest and the
	latest of them. Summary statistics are printed, and the missing days are
	written (one YYYY-MM-DD date per line) to an output file.
	"""
	from dateutil.parser import parse
	import pandas as pd


	def parse_timestamps(raw_timestamps):
	    """Parse each raw timestamp, silently skipping unusable values.

	    Parameters:
	        raw_timestamps: iterable of values taken from the metadata column.

	    Returns:
	        A list of datetimes, in input order, for the values that parsed.
	    """
	    parsed = []
	    for raw in raw_timestamps:
	        try:
	            parsed.append(parse(raw))
	        except (TypeError, ValueError, OverflowError):
	            # ValueError: malformed date string. TypeError: not a string at
	            # all, e.g. the NaN float pandas uses for an empty cell.
	            # OverflowError: a numeric field too large for a datetime.
	            continue
	    return parsed


	def days_in_range(first, last):
	    """Return every calendar day from first's day through last's day.

	    Both endpoints are truncated to their date before the range is built.
	    Building the range from the raw timestamps would step in 24-hour
	    increments from first's time of day, which drops the final day whenever
	    last falls earlier in the day than first.

	    Parameters:
	        first: datetime of the earliest sample.
	        last: datetime of the latest sample.

	    Returns:
	        A list of datetime.date objects, ascending and inclusive.
	    """
	    return [
	        day.date()
	        for day in pd.date_range(first.date(), last.date(), freq="D")
	    ]


	def count_by_year(dates):
	    """Return a {year: number of dates} dict with years in ascending order."""
	    counts = {}
	    # Sorting first makes the dict's insertion (and so printed) order stable
	    # even when the input is an unordered set.
	    for d in sorted(dates):
	        counts[d.year] = counts.get(d.year, 0) + 1
	    return counts


	def main(
	    metadata_path="20191209_metadata.txt",
	    host_subject_id="M03",
	    output_path="missing_dates.txt",
	):
	    """Print missing-date statistics and write the missing dates to a file.

	    Parameters:
	        metadata_path: tab-separated metadata file; its first column is used
	            as the index.
	        host_subject_id: only rows whose "host_subject_id" equals this value
	            are considered. Pass None to use every row.
	        output_path: file that receives the sorted missing dates.

	    Raises:
	        SystemExit: if no "collection_timestamp" value could be parsed.
	    """
	    df = pd.read_csv(metadata_path, sep="\t", index_col=0)

	    # Subset to a certain host subject ID, if desired
	    if host_subject_id is not None:
	        df = df[df["host_subject_id"] == host_subject_id]

	    # Parse every timestamp once; all later steps reuse this list.
	    timestamps = parse_timestamps(df["collection_timestamp"])
	    if not timestamps:
	        raise SystemExit(
	            "No parseable collection_timestamp values were found, so there "
	            "is no date range to check."
	        )

	    # Identify earliest and latest timestamps
	    min_date = min(timestamps)
	    max_date = max(timestamps)

	    # Get (inclusive) range of all days in [min_date, max_date]
	    date_range = days_in_range(min_date, max_date)

	    # Print information about data
	    print("Earliest date is {}. Latest date is {}.".format(min_date, max_date))
	    print("In total, this range spans {} days.".format(len(date_range)))

	    # Get all dates in the metadata (just down to the date level)
	    sample_dates = {ts.date() for ts in timestamps}
	    print(
	        "There are {} unique dates (down to the day level) in the "
	        "metadata.".format(len(sample_dates))
	    )

	    # See which dates in the range are not represented in the sample metadata
	    missing_dates = set(date_range) - sample_dates
	    print(
	        "There are {} dates in the date range that are not represented in the "
	        "sample metadata.".format(len(missing_dates))
	    )

	    # Finish up by outputting missing dates to a file
	    with open(output_path, "w") as md_fobj:
	        md_fobj.write("\n".join(str(d) for d in sorted(missing_dates)))

	    print("Wrote out a list of missing dates to {}.".format(output_path))

	    # Get stats about missing dates by year
	    print("Number of missing dates stratified by year:")
	    print(count_by_year(missing_dates))


	if __name__ == "__main__":
	    main()