Skip to content

Instantly share code, notes, and snippets.

@czue
Last active December 12, 2015 00:38
Show Gist options
  • Save czue/4684953 to your computer and use it in GitHub Desktop.
Save czue/4684953 to your computer and use it in GitHub Desktop.
Script to remove potentially duplicate counts from wisepill data.
"""
Script usage:
The script currently assumes the existence of two input files in specific formats.
dataforwisepillcalculation.csv - a list of patient ids and visit times
wisepill.csv - the full log of wisepill openings
It outputs a single file:
adherence.csv - the patients' adherence scores over each of the visit periods
These are assumed to be in the same directory as the script. The filenames can
be changed as variables below.
The script will count adherence per day with the following rules:
- A patient can only have adherence up to their dosage.
- The first pill of the day always counts.
- After the first pill, any pill within the THRESHOLD (in hours) from the first
pill does not count.
- The next pill outside of the threshold counts again.
- After this pill the threshold resets, and the same rules apply.
The THRESHOLDS can be configured, via the variable DEFAULT_THRESHOLD_MAP, which
maps frequency to thresholds. The defaults are 12 hours for once-a-day patients
and 8 hours for twice-a-day patients.
Original Problem statement:
The wisepill devices for ARemind were sending in a lot of repeat messages
within a 24 hour window - sometimes a duplicate with the exact same timestamp,
which is easy to rule out, but sometimes within 5, 10, 30 minutes or sometimes
in 6-12 hours. This is a bit confusing for patients that are supposed to be
opening the device 1x or 2x per day. So the data team has some concerns about
the confidence in the data.
The data team has decided that multiple readings for patients that take 2 doses
per day should be considered a single event if they occur within 8 hours, and
if they take a single dose then within 12 hours should be considered a single
dose.
Output should be:
Columns: patient ID, visit (e.g. T1, T2...), wisepill adherence % (a single
adherence score for the time window from the previous appointment to this
appointment, which is # of actual pills / expected pills), dosage (1 or 2)
They also will likely want to tweak the 8 hours for 2x day dose and 12 hours to
1x per day dose to see how the data looks: I hope it's not too hard to build
those in as variables for the script.
"""
import csv
from datetime import datetime
from collections import defaultdict
# CONFIGURATION PARAMETERS

# Input and output file names. They are relative paths, so the files are
# expected to sit in the directory the script is run from; edit as needed.
USER_FILE = 'dataforwisepillcalculation.csv'
WISEPILL_FILE = 'wisepill.csv'
OUTPUT_FILE = 'adherence.csv'

# Placeholder written to the export when adherence cannot be calculated.
NO_DATA = '.'

# Doses per day -> hours that must separate two openings for both to count.
DEFAULT_THRESHOLD_MAP = {1: 12, 2: 8}

# Column headers read from the patient file.
PAT_COLUMN_ID = "Study ID"
PAT_COLUMN_ENROLL_DATE = "Enrollment Date"
PAT_COLUMN_COMPLETION_DATE = "Complete T%s"
PAT_COLUMN_LAST_VISIT = "Last Visit T8"
PAT_COLUMN_ANCHOR_MED = "Anchor Medication"
PAT_COLUMN_FREQUENCY = "Once a day or twice a day"

# Visit-date columns in chronological order: enrollment, "Complete T1"
# through "Complete T6", then the last visit.
# NOTE(review): no T7 column is listed here -- confirm against the patient file.
PAT_VISIT_COLUMNS = (
    [PAT_COLUMN_ENROLL_DATE]
    + [PAT_COLUMN_COMPLETION_DATE % i for i in range(1, 7)]
    + [PAT_COLUMN_LAST_VISIT]
)

# Column headers read from the wisepill log file.
WISE_COLUMN_PATIENT = 'patient'
WISE_COLUMN_TIMESTAMP = 'timestamp'
class Patient(object):
    """
    A patient row from the patient file, plus that patient's wisepill logs.

    The constructor only parses the row. ``wisepill_logs`` starts out empty
    and is expected to be assigned by the caller before adherence is
    computed.
    """

    def __init__(self, data):
        """
        Build a patient from a dict-like row keyed by the PAT_COLUMN_* headers.

        Raises ValueError if the enrollment date is blank or if any date does
        not match the m/d/Y format, and KeyError if a column is missing.
        """
        def _to_date(datestring, required=False):
            # Blank cells mean "no visit"; only enrollment is mandatory.
            if not datestring:
                if required:
                    raise ValueError("Field is required!")
                return None
            return datetime.strptime(datestring, "%m/%d/%Y")

        self.id = data[PAT_COLUMN_ID]
        self.enrollment_date = _to_date(data[PAT_COLUMN_ENROLL_DATE], True)
        self.last_visit_date = _to_date(data[PAT_COLUMN_LAST_VISIT])
        # One entry per PAT_VISIT_COLUMNS column; None where the cell is blank.
        self.visit_dates = [_to_date(data[c]) for c in PAT_VISIT_COLUMNS]
        self.anchor_medication = data[PAT_COLUMN_ANCHOR_MED]
        self.frequency = int(data[PAT_COLUMN_FREQUENCY])
        # Per-instance list: a class-level list would be shared by (and could
        # be mutated through) every patient.
        self.wisepill_logs = []

    def __repr__(self):
        return "{id}: {start}-{end} ({med} {freq}/day)".format(
            id=self.id, start=self.enrollment_date, end=self.last_visit_date,
            med=self.anchor_medication, freq=self.frequency
        )

    def get_adherence_data(self, threshold_map=None):
        """
        Return one Adherence per entry in ``visit_dates``, in order.

        Each entry covers the window from the previous visit column
        (inclusive) to this one (exclusive). When either end of the window is
        missing -- always the case for the first column -- the entry carries
        None for both the expected and the actual pill counts.

        ``threshold_map`` maps frequency to the de-duplication threshold in
        hours and defaults to DEFAULT_THRESHOLD_MAP.

        Raises AssertionError if two consecutive visit dates are not strictly
        increasing.
        """
        threshold_map = threshold_map or DEFAULT_THRESHOLD_MAP
        adherence_data = []
        last_visit = None
        for this_visit in self.visit_dates:
            expected_pills = actual_pills = None
            if last_visit and this_visit:
                # Raised explicitly (rather than via ``assert``) so the check
                # still runs when Python is started with -O.
                if not this_visit > last_visit:
                    raise AssertionError(
                        "patient %s visits aren't in order please fix in the source file! %s -> %s" %
                        (self.id, last_visit, this_visit))
                logs = [log for log in self.wisepill_logs
                        if last_visit <= log.timestamp < this_visit]
                expected_pills = (this_visit.date() - last_visit.date()).days * self.frequency
                actual_pills = _get_pillcount(logs, self.frequency,
                                              threshold_map[self.frequency])
            adherence_data.append(Adherence(last_visit, this_visit,
                                            expected_pills, actual_pills))
            # A blank visit resets the window, so the following column has no
            # start date and therefore no data either.
            last_visit = this_visit
        return adherence_data

    def as_export_row(self, threshold_map=None):
        """
        Return the export row: the patient id followed by the adherence
        display value for each visit column.
        """
        return [self.id] + [adh.adherence_display for adh in
                            self.get_adherence_data(threshold_map)]
class WisePillLog(object):
    """A single row of the wisepill file: which patient, and when."""

    def __init__(self, data):
        """
        Build a log entry from a dict-like row.

        Only the patient and timestamp columns are read; any other columns in
        the row are ignored for now.
        """
        self.patient_id = data[WISE_COLUMN_PATIENT]
        raw_timestamp = data[WISE_COLUMN_TIMESTAMP]
        self.timestamp = datetime.strptime(raw_timestamp, "%Y-%m-%dT%H:%M:%S")

    def __str__(self):
        return "%s: %s" % (self.patient_id, self.timestamp)
class Adherence(object):
    """
    Expected versus actual pill counts for the window between two visits.

    ``expected`` and ``actual`` may both be None when the window could not be
    evaluated.
    """

    def __init__(self, start, end, expected, actual):
        self.start, self.end = start, end
        self.expected, self.actual = expected, actual

    def has_data(self):
        """Return True when an actual pill count was recorded."""
        return not (self.actual is None)

    @property
    def adherence_display(self):
        """
        Adherence as a percentage string with four decimal places, or the
        NO_DATA placeholder when there is nothing to report.
        """
        if not self.has_data():
            return NO_DATA
        percentage = 100 * float(self.actual) / float(self.expected)
        return "%.4f" % percentage

    def __repr__(self):
        return "%s - %s: %s" % (self.start, self.end, self.adherence_display)
def _get_pillcount(pills, frequency, threshold):
"""
Given a list of wisepill logs and a threshold, returns the number
of unique entries that should be counted (records on the same day
that fall within the threshold are ignored)
"""
# assumes pills are sorted by timestamp
pills_that_count = []
for wp in pills:
if not pills_that_count:
# no prior record, it counts
pills_that_count.append(wp)
else:
last_record = pills_that_count[-1]
if (last_record.timestamp.date() != wp.timestamp.date() or \
(wp.timestamp - last_record.timestamp).seconds > threshold * 60 * 60) and \
len(filter(lambda wpd: wpd.timestamp.date() == wp.timestamp.date(), pills_that_count)) < frequency:
# was either a new day or outside the threshold
# as a final check make sure we haven't exceeded our count for the day
pills_that_count.append(wp)
return len(pills_that_count)
def _open_csv_for_writing(path):
    """Open ``path`` in the mode the csv module needs on this interpreter."""
    import sys
    if sys.version_info[0] >= 3:
        # Python 3: csv writes text and handles line endings itself.
        return open(path, 'w', newline='')
    # Python 2: csv writes byte strings and wants a binary-mode file.
    return open(path, 'wb')


def process_data(user_file=USER_FILE,
                 wisepill_file=WISEPILL_FILE,
                 output_file=OUTPUT_FILE):
    """
    Read the patient and wisepill files and write the adherence export.

    user_file     -- patient csv, read via build_user_db
    wisepill_file -- wisepill log csv, read via get_wisepill_logs
    output_file   -- csv to write: a header row of the patient id column
                     followed by the visit columns, then one row per patient
    """
    users = build_user_db(user_file)
    wisepill_logs = get_wisepill_logs(wisepill_file)
    for u in users:
        # The pill count assumes chronological order, so sort once here.
        u.wisepill_logs = sorted(wisepill_logs[u.id], key=lambda wp: wp.timestamp)
    with _open_csv_for_writing(output_file) as f:
        writer = csv.writer(f)
        writer.writerow([PAT_COLUMN_ID] + PAT_VISIT_COLUMNS)  # headers
        for u in users:
            writer.writerow(u.as_export_row())
def build_user_db(filename):
    """Read the patient csv and return a list with one Patient per data row."""
    patients = []
    with open(filename, 'r') as patient_file:
        for row in csv.DictReader(patient_file):
            patients.append(Patient(row))
    return patients
def get_wisepill_logs(filename):
    """
    Read the wisepill csv and group its rows by patient.

    Returns a defaultdict mapping patient id to a list of WisePillLog
    objects in file order; looking up an unknown id yields an empty list.
    """
    grouped = defaultdict(list)
    with open(filename, 'r') as log_file:
        for row in csv.DictReader(log_file):
            entry = WisePillLog(row)
            grouped[entry.patient_id].append(entry)
    return grouped
# Run the export with the default file names when executed as a script.
if __name__ == '__main__':
    process_data()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment