Last active
December 10, 2019 00:54
-
-
Save fedarko/877fe48a42f948704532378b3b889c0c to your computer and use it in GitHub Desktop.
In a timeseries metadata file, finds all days that are not "represented" by at least one sample in the metadata
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3
"""Report calendar days in a timeseries metadata file that have no sample.

Reads a tab-separated metadata file, keeps the rows for one host subject,
parses each row's ``collection_timestamp``, and reports every calendar day
between the earliest and latest sample (inclusive) on which no sample was
collected. The missing days are written to ``missing_dates.txt`` and a
per-year tally is printed.
"""
from collections import Counter
from datetime import timedelta

from dateutil.parser import parse
import pandas as pd


def parse_timestamps(raw_timestamps):
    """Return the datetimes parsed from *raw_timestamps*, skipping bad values.

    Each value is handed to ``dateutil.parser.parse``. Values that cannot be
    parsed are ignored rather than aborting the run (this skip-malformed-dates
    approach is based on the author's prior code at
    https://github.com/fedarko/qeeseburger/blob/506bcad5c5eb696eba78700e1c4807996e1831dd/add_age_and_extra_times.py#L105).
    """
    parsed = []
    for raw in raw_timestamps:
        try:
            parsed.append(parse(raw))
        except (ValueError, OverflowError, TypeError):
            # ValueError/OverflowError: malformed or out-of-range date text.
            # TypeError: the cell is not a string at all, e.g. the NaN that
            # pandas stores for an empty field.
            continue
    return parsed


def inclusive_day_range(first_day, last_day):
    """Return every ``date`` from *first_day* through *last_day*, inclusive.

    Works on calendar dates rather than datetimes so that the time of day of
    the earliest and latest samples cannot cause the final day to be dropped.
    Returns an empty list if *last_day* is before *first_day*.
    """
    num_days = (last_day - first_day).days + 1
    return [first_day + timedelta(days=offset) for offset in range(num_days)]


def count_by_year(dates):
    """Return a dict mapping each year to the number of *dates* in that year.

    Keys are inserted in ascending year order so printed output is stable.
    """
    return dict(sorted(Counter(d.year for d in dates).items()))


def main():
    """Run the missing-dates report on the hard-coded metadata file."""
    df = pd.read_csv("20191209_metadata.txt", sep="\t", index_col=0)
    # Subset to a certain host subject ID, if desired
    df = df[df["host_subject_id"] == "M03"]

    # Parse every timestamp exactly once. Iterating the column (instead of
    # looking rows up by index label) also works when sample IDs repeat.
    parsed_dates = parse_timestamps(df["collection_timestamp"])
    if not parsed_dates:
        raise SystemExit(
            "No parseable collection_timestamp values were found; "
            "cannot determine a date range."
        )

    # Identify earliest and latest dates
    min_date = min(parsed_dates)
    max_date = max(parsed_dates)

    # Get (inclusive) range of all dates in [min_date, max_date]
    date_range = inclusive_day_range(min_date.date(), max_date.date())

    # Print information about data
    print("Earliest date is {}. Latest date is {}.".format(min_date, max_date))
    print("In total, this range spans {} days.".format(len(date_range)))

    # Get set of all dates in the metadata (just down to the date level)
    sample_dates = {dt.date() for dt in parsed_dates}
    print(
        "There are {} unique dates (down to the day level) in the "
        "metadata.".format(len(sample_dates))
    )

    # See which dates in the date range are not represented in the sample
    # metadata
    missing_dates = set(date_range) - sample_dates
    print(
        "There are {} dates in the date range that are not represented in the "
        "sample metadata.".format(len(missing_dates))
    )

    # Finish up by outputting missing dates to a file (ISO-formatted dates
    # sort chronologically as strings)
    with open("missing_dates.txt", "w") as md_fobj:
        md_fobj.write("\n".join(sorted(str(d) for d in missing_dates)))
    print("Wrote out a list of missing dates to missing_dates.txt.")

    # Get stats about missing dates by year
    print("Number of missing dates stratified by year:")
    print(count_by_year(missing_dates))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment