Skip to content

Instantly share code, notes, and snippets.

@walterst
Last active March 15, 2019 10:11
Show Gist options
  • Save walterst/ca4a41d32cceba809c77b55fc2c068cc to your computer and use it in GitHub Desktop.
Custom script used to parse tab-delimited iPod survey data, match up dates from tab-delimited QIIME mapping data, and write averages of the data from the sample day and the days prior as metadata columns. This script uses a QIIME 1.9X environment for the parse_mapping_file function.
#!/usr/bin/env python
from __future__ import division
# USAGE: python parse_ipod_to_metadata.py mapping_file days_to_consider ipod_tab_delim_file raw_output_file qiime_compatible_output_file
# where days_to_consider counts the same day as one of the days, and the comma-separated columns need to be
# an exact match to the field label in the ipod data file, e.g. Gastrointestinal_issues
# All dates must be in the format of DD/MM/YY in the ipod source tab delimited data.
from sys import argv
from operator import itemgetter
from datetime import datetime, date
from numpy import array, mean
from qiime.parse import parse_mapping_file
# For now, a subset of the data, until all data can be quantified for parsing
# Exact column headers to pull from the iPod tab-delimited survey file.
# Order is significant: target_fields_lookup is defined positionally against
# this list, and the parsing loop stores each row's values in this order.
# "Time_Slept" is appended to this list later, after the raw fields are read.
target_fields = [
    # wake/bed times are kept as raw military-time strings
    "What time did you wake up today? (please use military time)",
    # meal portion questions (coded via portions_numeric)
    "How much of your breakfast did you eat?",
    "How much of your lunch did you eat?",
    "How much of it did you eat?",
    # beverage checkboxes (coded via presence_absence)
    "Bottled water",
    "Base purified water",
    "Tap water",
    "Soda",
    "Sports energy drink (e.g. Gatorade)",
    "Coffee or tea",
    "Milk",
    "Fruit juice",
    "Yogurt",
    "Beer, wine, or spirits",
    # wellness rating (coded via portions_numeric)
    "How are you feeling today?",
    # symptom checkboxes (coded via presence_absence)
    "Fever",
    "Gastrointestinal issues",
    "Headache",
    "Tiredness",
    "Runny or blocked nose",
    "Rash",
    "Muscle strain",
    "Cramp",
    # remaining quantified questions (coded via portions_numeric)
    "How much exercise did you do?",
    "How many times did you urinate today?",
    "How many times did you have a bowel movement today?",
    "Please describe the consistency of your stool.",
    # raw military-time string, like the wake-up time
    "What time did you get into bed before going to sleep today? (please use military time)"
]
# Survey answer string -> numeric code (as a string) used when averaging.
# A single shared dict serves several distinct questions because their answer
# vocabularies do not overlap.  All empty fields become "NA".
portions_numeric = {"": "NA"}
# What fraction was eaten at breakfast, lunch, or dinner
portions_numeric.update({
    "All of it": "4",
    "3/4 of it": "3",
    "1/2 of it": "2",
    "1/4 of it": "1",
})
# Reported wellness, from "How are you feeling today?"
portions_numeric.update({
    "Good": "3",
    "Ill": "2",
    "Very Ill": "1",
})
# Exercise duration
portions_numeric.update({
    "Less than 30 mins": "1",
    "30 mins to 1 hour": "2",
    "1 to 2 hours": "3",
    "2+ hours": "4",
})
# Urination frequency, coded roughly as the midpoint of each range
portions_numeric.update({
    "1-2": "1.5",
    "3-5": "4",
    "6-9": "7.5",
    "9+": "9",
})
# Bowel movement frequency: "0".."5" map to themselves, "6+" caps at "6"
portions_numeric.update((str(count), str(count)) for count in range(6))
portions_numeric["6+"] = "6"
# Stool consistency
portions_numeric.update({
    "Hard and formed (like a cigar)": "1",
    "Soft and formed (like peanut butter)": "2",
    "Loose and unformed (like a thick milkshake)": "3",
    "Liquid (like water)": "4",
})
# For categories with only one possible answer, code presence/absence:
# the answer string itself means "1", an empty field means "0".
_checkbox_items = (
    "Bottled water",
    "Base purified water",
    "Tap water",
    "Soda",
    "Sports energy drink (e.g. Gatorade)",
    "Coffee or tea",
    "Milk",
    "Fruit juice",
    "Yogurt",
    "Beer, wine, or spirits",
    "Fever",
    "Gastrointestinal issues",
    "Headache",
    "Tiredness",
    "Runny or blocked nose",
    "Rash",
    "Muscle strain",
    "Cramp",
)
presence_absence = dict((item, "1") for item in _checkbox_items)
presence_absence[""] = "0"
# Positional conversion table: entry i is the value-conversion dict applied
# to target_fields[i].  None means the raw string is kept (the wake-up and
# bedtime military-time fields).
target_fields_lookup = (
    [None]                      # wake-up time, raw
    + [portions_numeric] * 3    # breakfast / lunch / "How much of it" portions
    + [presence_absence] * 10   # beverage checkboxes
    + [portions_numeric]        # "How are you feeling today?"
    + [presence_absence] * 8    # symptom checkboxes
    + [portions_numeric] * 4    # exercise, urination, bowel, stool fields
    + [None]                    # bedtime, raw
)
# ---- Script setup: command-line arguments, parsed inputs, output handles ----
mapping_f = argv[1]
# parse_mapping_file comes from QIIME 1.9.x; the third returned element
# (comment lines) is unused here.
mapping_data, mapping_headers, _ = parse_mapping_file(open(mapping_f, 'U'))
# Number of days to average over; the sample day itself counts as one.
days_considered = int(argv[2])
ipod_f = open(argv[3], "U")
# Human-readable log of raw and averaged values.
raw_data_outf = open(argv[4], "w")
# QIIME-compatible mapping file; its header line starts with '#'.
qiime_data_outf = open(argv[5], "w")
qiime_data_outf.write("#")
# Column indices of target_fields in the ipod file, filled while parsing.
target_field_ixs = []
# Data will be loaded from the ipod touch form as a dictionary with tuple
# keys of (4 digit ID code, ordinal date int value) mapping to the list of
# strip()ed data from the tsv ipod columns named in target_fields.  The
# mapping-file loop below queries this dict for each id/date and treats
# KeyError as missing data.
ipod_data = {}
# 2 samples from Turkey in ipod survey, also extra PHR sample, ignoring these for now
ignore_ids = ["2002", "2004", "2005", "PHR"]
# (an unused `counter = 0` left over from debugging was removed here)
# ---- Parse the iPod survey file into ipod_data ----
# Keys are (user id, ordinal date); values are the converted field values in
# target_fields order.
for line in ipod_f:
    # Skip genuinely blank lines.  The original `len(curr_line) == 0` check
    # was dead: split("\t") never returns an empty list, so a blank line
    # would instead have crashed on the column indexing below.
    if not line.strip():
        continue
    curr_line = line.replace("\n", "").split("\t")
    if line.startswith("#"):
        # Header row: locate the user, corrected-date, and target columns.
        # (Assumes the header appears before any data rows -- TODO confirm.)
        User_ix = curr_line.index("User")
        date_ipod_ix = curr_line.index("CorrectedDateDDMMYY")
        for curr_field in target_fields:
            target_field_ixs.append(curr_line.index(curr_field))
        continue
    curr_id = curr_line[User_ix].strip()
    # If date is NA, or the id is in the skip list above, skip this row.
    if curr_line[date_ipod_ix].strip() == "NA" or curr_id in ignore_ids:
        continue
    curr_date = datetime.strptime(curr_line[date_ipod_ix].strip(),
        '%d/%m/%y').date().toordinal()
    curr_added_data = []
    # Pair each column index with its conversion dict; the two lists are
    # built in the same (target_fields) order.  This replaces the per-field
    # O(n) target_field_ixs.index() lookup of the original.
    for curr_lookup, field_ix in zip(target_fields_lookup, target_field_ixs):
        if curr_lookup:
            # Raises KeyError on an unrecognized survey answer (intentional).
            curr_val = curr_lookup[curr_line[field_ix].strip()]
        else:
            curr_val = curr_line[field_ix].strip()
        # Set empty fields to NA
        if len(curr_val) == 0:
            curr_val = "NA"
        curr_added_data.append(curr_val)
    ipod_data[(curr_id, curr_date)] = curr_added_data
date_mapping_ix = mapping_headers.index("SampleDate")
numeric_id_ix = mapping_headers.index("Numeric_ID")
# Keyed by (numeric id, ordinal date); values are filled below with the list
# of (id, date) tuples to query from the ipod data.  Rows with NA in either
# position are skipped.
id_dates_in_mapping = {}
# Same keys, but for the hours-slept calculation (needs one extra prior day).
id_dates_in_mapping_sleep = {}
# Keep each full metadata row so it can be written out with the ipod data.
metadata_line_in_mapping = {}
for row in mapping_data:
    if row[date_mapping_ix] == "NA" or row[numeric_id_ix] == "NA":
        continue
    sample_id = row[numeric_id_ix].strip()
    sample_day = datetime.strptime(row[date_mapping_ix].strip(),
        '%d/%m/%y').date().toordinal()
    map_key = (sample_id, sample_day)
    id_dates_in_mapping[map_key] = []
    id_dates_in_mapping_sleep[map_key] = []
    metadata_line_in_mapping[map_key] = row
# For every sample in the mapping file, list the (id, ordinal date) tuples to
# pull from the ipod data: the sample day itself plus the prior days, i.e.
# ordinals from curr_date down to curr_date - (days_considered - 1).
for map_key, query_list in id_dates_in_mapping.items():
    sample_id, sample_day = map_key
    for offset in range(days_considered):
        query_list.append((sample_id, sample_day - offset))
# Same windows but one day longer: the hours-slept value for a day needs the
# bedtime reported on the day before it.
for map_key, query_list in id_dates_in_mapping_sleep.items():
    sample_id, sample_day = map_key
    for offset in range(days_considered + 1):
        query_list.append((sample_id, sample_day - offset))
# Insert the new metadata headers into the mapping headers, just before the
# final (Description) column, which stays last.
target_fields.append("Time_Slept")
# Copy the header list: the original aliased mapping_headers and mutated it
# in place.  (The original also created a `raw_data` list sharing this same
# object that was never used anywhere; it has been dropped.)
corrected_headers = mapping_headers[:]
for curr_header in target_fields:
    corrected_headers.insert(-1, curr_header)
qiime_data = [corrected_headers]
# Try to calculate the hours slept for each day (based upon prior day's time
# to sleep), add to data as additional field.
# These index into each ipod_data value list: the wake-up time is the first
# target field and the bedtime is the last.
awake_ix = 0
asleep_ix = -1
# (id, ordinal date) -> hours slept formatted "%2.2f", or "NA" when either
# endpoint is missing.
ipod_sleeping_hours = {}
# NOTE(review): these two counters are never incremented or read anywhere in
# this script -- they look like leftover debugging state.
count_missing_firstix = 0
count_missing_secondix = 0
for curr_key in id_dates_in_mapping_sleep:
    """ Explanation here for this-the data are sorted backwards, with the newest day
    first in the list being indexed. To do the time-time comparison for sleeping and
    awakening, each day is going to be queried, along with the next one in the list,
    and if both data exist, then convert the datetime object with year/month/date/hour,
    using the caveat that if the value of the hour is after midnight but before 11:30,
    increment the day by 1"""
    # The oldest (last) entry has no "next day" to pair with, hence the -1.
    for curr_id_date_ix in range(len(id_dates_in_mapping_sleep[curr_key]) - 1):
        # Will often be empty, so have to do try/except commands
        try:
            # Wake-up time on the day being scored.
            curr_awake = ipod_data[id_dates_in_mapping_sleep[curr_key][curr_id_date_ix]][awake_ix]
        except KeyError:
            ipod_sleeping_hours[id_dates_in_mapping_sleep[curr_key][curr_id_date_ix]] = "NA"
            continue
        try:
            # Bedtime reported on the previous day (the next entry, since the
            # list runs newest-to-oldest).
            curr_asleep = ipod_data[id_dates_in_mapping_sleep[curr_key][curr_id_date_ix+1]][asleep_ix]
        except KeyError:
            ipod_sleeping_hours[id_dates_in_mapping_sleep[curr_key][curr_id_date_ix]] = "NA"
            continue
        # If either field has NA value, continue, should not be many of these
        if curr_awake == "NA" or curr_asleep == "NA":
            ipod_sleeping_hours[id_dates_in_mapping_sleep[curr_key][curr_id_date_ix]] = "NA"
            continue
        # Military-time bedtimes from 0000 through 1130 are treated as after
        # midnight, so the sleep date is advanced by one day below.
        if int(curr_asleep) >= 0 and int(curr_asleep) <= 1130:
            inc_date = 1
        else:
            inc_date = 0
        # Get the ordinal values back into year, month, day, correct the date if went to sleep in the AM
        datetime_awake = date.fromordinal(id_dates_in_mapping_sleep[curr_key][curr_id_date_ix][1])
        datetime_asleep = date.fromordinal(id_dates_in_mapping_sleep[curr_key][curr_id_date_ix+1][1] + inc_date)
        # Might be a more elegant way to handle this, but going to slice the time
        # based upon the number of digits present to get hours and minutes.
        # e.g. "2330" -> 23h 30m, "730" -> 7h 30m, "45" -> 0h 45m.
        if len(curr_awake) == 4:
            awake_hours = int(curr_awake[0:2])
            awake_mins = int(curr_awake[2:])
        elif len(curr_awake) == 3:
            awake_hours = int(curr_awake[0:1])
            awake_mins = int(curr_awake[1:])
        else:
            awake_hours = 0
            awake_mins = int(curr_awake)
        if len(curr_asleep) == 4:
            asleep_hours = int(curr_asleep[0:2])
            asleep_mins = int(curr_asleep[2:])
        elif len(curr_asleep) == 3:
            asleep_hours = int(curr_asleep[0:1])
            asleep_mins = int(curr_asleep[1:])
        else:
            asleep_hours = 0
            asleep_mins = int(curr_asleep)
        converted_awake = datetime(datetime_awake.year, datetime_awake.month,
            datetime_awake.day, awake_hours, awake_mins)
        converted_asleep = datetime(datetime_asleep.year, datetime_asleep.month,
            datetime_asleep.day, asleep_hours, asleep_mins)
        time_diff = converted_awake-converted_asleep
        # NOTE(review): timedelta.seconds ignores the .days component, so a
        # negative interval or one over 24 hours would silently yield a wrong
        # value -- presumably the inputs make that impossible; confirm.
        time_slept_hours = time_diff.seconds/3600
        # add the slept hours to the ipod data as another field, but, need to build up
        # this data and add it outside of this loop, since we're still reading in the
        # ipod touch date during this loop
        ipod_sleeping_hours[id_dates_in_mapping_sleep[curr_key][curr_id_date_ix]] = "%2.2f" % time_slept_hours
ipod_data_keys = set(ipod_data.keys())
# Attach the computed hours-slept value (or its "NA" placeholder) as one
# extra trailing field on every ipod_data record that actually exists; this
# is what the later averaging/transposition step expects.
for sleep_key, slept in ipod_sleeping_hours.items():
    if sleep_key in ipod_data_keys:
        ipod_data[sleep_key].append(slept)
# Now to query ipod data for each target id:date combo, average the numeric
# codes over the window, and append the averages to the metadata row.
for curr_key in id_dates_in_mapping:
    # Rows of ipod data found inside this sample's date window.
    target_vals = []
    # Human-readable dates actually found, for the raw log.
    queried_dates = []
    average_vals = []
    fill_empty_fields = True # If no data found in date range, use this to fill empty data in final mapping
    for curr_id_date in id_dates_in_mapping[curr_key]:
        # Will often be empty, so have to do try/except commands
        try:
            target_vals.append(ipod_data[curr_id_date])
        except KeyError:
            continue
        queried_dates.append("%s" % date.fromordinal(curr_id_date[1]))
        fill_empty_fields = False
    # Transpose data, needed to step through values, ignore NA or empty fields
    # NOTE: relies on Python 2 map() returning a list -- transposed_vals is
    # iterated here and then reused for the raw log write below.
    transposed_vals = map(list, zip(*target_vals))
    for n in transposed_vals:
        curr_vals = []
        for x in n:
            if x == "NA" or x == '':
                continue
            curr_vals.append(float(x))
        # If empty, put "NA" in the field, else put average of values
        if len(curr_vals) == 0:
            average_vals.append("NA")
        else:
            average_vals.append("%4.2f" % (mean(array(curr_vals))))
    if fill_empty_fields:
        # target_fields includes the appended "Time_Slept" here, matching the
        # per-record width after the sleep-hours merge above.
        average_vals = ["NA"] * len(target_fields)
    # Write out raw data to log file
    raw_data_outf.write("****************************\n")
    raw_data_outf.write("4 digit ID and date for current date: %s,%s\n" % (curr_key[0], date.fromordinal(curr_key[1])))
    raw_data_outf.write("Dates from IPOD data queried: %s \n" % queried_dates)
    raw_data_outf.write("Headers queried: %s \n" % ",".join(target_fields))
    raw_data_outf.write("Raw values for each category: %s \n" % transposed_vals)
    raw_data_outf.write("Averaged values for each category: %s \n" % average_vals)
    raw_data_outf.write("Mapping metadata line associated with the above values: %s \n" % "\t".join(metadata_line_in_mapping[curr_key]))
    # Add data to metadata lines, write out to qiime-formatted file.
    # insert(-1, ...) keeps the Description field last.
    curr_metadata_line = metadata_line_in_mapping[curr_key]
    for curr_average in average_vals:
        curr_metadata_line.insert(-1, curr_average)
    qiime_data.append(curr_metadata_line)
# Write the QIIME-compatible mapping file: one tab-joined line per row,
# following the "#" already written ahead of the header row.
for out_fields in qiime_data:
    qiime_data_outf.write("\t".join(out_fields))
    qiime_data_outf.write('\n')
# Close the handles explicitly so all output is flushed to disk (the original
# script relied on interpreter exit to do this).
qiime_data_outf.close()
raw_data_outf.close()
ipod_f.close()
# Note: list.insert(-1, value) above inserts each value before the final
# (Description) field; missing data were written as "NA".
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment