Last active
December 12, 2015 00:38
-
-
Save czue/4684953 to your computer and use it in GitHub Desktop.
Script to remove potentially duplicate counts from wisepill data.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Script usage: | |
The script currently assumes the existence of two files and of specific formats. | |
dataforwisepillcalculation.csv - a list of patient ids and visit times | |
wisepill.csv - the full log of wisepill openings | |
It outputs a single file: | |
adherence.csv - the patients' adherence scores over each of the visit periods | |
These are assumed to be in the same directory as the script. The filenames can | |
be changed as variables below. | |
The script will count adherence per day with the following rules: | |
- A patient can only have adherence up to their dosage. | |
- The first pill of the day always counts. | |
- After the first pill, any pill within the THRESHOLD (in hours) from the first | |
pill does not count. | |
- The next pill outside of the threshold counts again. | |
- After this pill the threshold resets, and the rules are the same. | |
The THRESHOLDS can be configured, via the variable DEFAULT_THRESHOLD_MAP, which | |
maps frequency to thresholds. The defaults are 12 hours for once-a-day patients | |
and 8 hours for twice-a-day patients. | |
Original Problem statement: | |
The wisepill devices for ARemind were sending in a lot of repeat messages | |
within a 24 hour window - sometimes a duplicate with the exact same timestamp, | |
which is easy to rule out, but sometimes within 5, 10, 30 minutes or sometimes | |
in 6-12 hours. This is a bit confusing for patients that are supposed to be | |
opening the device 1x or 2x per day. So the data team has some concerns about | |
the confidence in the data. | |
The data team has decided that multiple readings for patients that take 2 doses | |
per day should be considered a single event if they occur within 8 hours, and | |
if they take a single dose then within 12 hours should be considered a single | |
dose. | |
Output should be: | |
Columns: patient ID, visit (e.g. T1, T2...), wisepill adherence % (single | |
adherence score for this time window from the previous appointment to this | |
appointment, which is # of actual pills / expected pills), dosage (1 or 2) | |
They also will likely want to tweak the 8 hours for 2x day dose and 12 hours to | |
1x per day dose to see how the data looks: I hope it's not too hard to build | |
those in as variables for the script. | |
""" | |
import csv | |
from datetime import datetime | |
from collections import defaultdict | |
# CONFIGURATION PARAMETERS

# Input/output filenames. Change as appropriate; they are assumed to live in
# the same directory as the script.
USER_FILE = 'dataforwisepillcalculation.csv'
WISEPILL_FILE = 'wisepill.csv'
OUTPUT_FILE = 'adherence.csv'

# Placeholder written to the export when adherence cannot be calculated.
NO_DATA = '.'

# Doses per day -> hours that must separate two openings on the same day for
# both of them to count as distinct doses.
DEFAULT_THRESHOLD_MAP = {1: 12, 2: 8}

# Column headers of the patient file.
PAT_COLUMN_ID = "Study ID"
PAT_COLUMN_ENROLL_DATE = "Enrollment Date"
PAT_COLUMN_COMPLETION_DATE = "Complete T%s"  # template, filled with the visit number
PAT_COLUMN_LAST_VISIT = "Last Visit T8"
PAT_COLUMN_ANCHOR_MED = "Anchor Medication"
PAT_COLUMN_FREQUENCY = "Once a day or twice a day"

# Visit columns in chronological order: enrollment, the numbered completion
# visits, then the last visit.
# NOTE(review): range(1, 7) produces "Complete T1" .. "Complete T6" and the
# final column is "Last Visit T8", so no T7 column is read -- confirm this
# matches the input file.
PAT_VISIT_COLUMNS = (
    [PAT_COLUMN_ENROLL_DATE]
    + list(PAT_COLUMN_COMPLETION_DATE % number for number in range(1, 7))
    + [PAT_COLUMN_LAST_VISIT]
)

# Column headers of the wisepill log file.
WISE_COLUMN_PATIENT = 'patient'
WISE_COLUMN_TIMESTAMP = 'timestamp'
class Patient(object):
    """A study patient: identity, visit schedule, medication and dose frequency.

    Instances are built from one row of the patient CSV (a mapping keyed by
    the ``PAT_COLUMN_*`` headers). ``wisepill_logs`` is expected to be filled
    in afterwards with that patient's log entries, sorted by timestamp.
    """

    # Immutable class-level fallback. The real per-patient list is created in
    # __init__, so one patient's logs can never leak into another through a
    # shared mutable class attribute.
    wisepill_logs = ()

    def __init__(self, data):
        """Parse one patient row.

        Raises:
            KeyError: if an expected column is missing from ``data``.
            ValueError: if the enrollment date is blank, if any date is not
                in MM/DD/YYYY format, or if the frequency is not an integer.
        """
        def _to_date(datestring, required=False):
            # Blank cells mean "no visit" unless the field is mandatory.
            if not datestring:
                if required:
                    raise ValueError("Field is required!")
                return None
            return datetime.strptime(datestring, "%m/%d/%Y")

        self.wisepill_logs = []
        self.id = data[PAT_COLUMN_ID]
        self.enrollment_date = _to_date(data[PAT_COLUMN_ENROLL_DATE], True)
        self.last_visit_date = _to_date(data[PAT_COLUMN_LAST_VISIT])
        # One entry per column in PAT_VISIT_COLUMNS; None where the cell is blank.
        self.visit_dates = [_to_date(data[c]) for c in PAT_VISIT_COLUMNS]
        self.anchor_medication = data[PAT_COLUMN_ANCHOR_MED]
        self.frequency = int(data[PAT_COLUMN_FREQUENCY])

    def __repr__(self):
        return "{id}: {start}-{end} ({med} {freq}/day)".format(
            id=self.id, start=self.enrollment_date, end=self.last_visit_date,
            med=self.anchor_medication, freq=self.frequency
        )

    def get_adherence_data(self, threshold_map=None):
        """Return one Adherence per entry in ``visit_dates``.

        Each entry covers the window from the previous visit column
        (inclusive) to this one (exclusive). When either end of the window is
        missing, the entry carries ``None`` for both expected and actual
        pills. The first entry therefore never has data.

        Args:
            threshold_map: optional mapping of doses-per-day to threshold
                hours; falls back to DEFAULT_THRESHOLD_MAP when omitted or
                empty.

        Raises:
            AssertionError: if two consecutive visit dates are not strictly
                increasing.
            KeyError: if the patient's frequency is not in the threshold map.
        """
        threshold_map = threshold_map or DEFAULT_THRESHOLD_MAP
        adherence_data = []
        last_visit = None
        for this_visit in self.visit_dates:
            expected_pills = actual_pills = None
            if last_visit is not None and this_visit is not None:
                # Raised explicitly (rather than via `assert`) so the check
                # still runs under `python -O`; the exception type is kept.
                if not this_visit > last_visit:
                    raise AssertionError(
                        "patient %s visits aren't in order please fix in the source file! %s -> %s" %
                        (self.id, last_visit, this_visit))
                logs = [log for log in self.wisepill_logs
                        if last_visit <= log.timestamp < this_visit]
                expected_pills = (this_visit.date() - last_visit.date()).days * self.frequency
                actual_pills = _get_pillcount(logs, self.frequency,
                                              threshold_map[self.frequency])
            adherence_data.append(Adherence(last_visit, this_visit,
                                            expected_pills, actual_pills))
            # A missing visit resets the window: the following entry will
            # have no start date and therefore no data either.
            last_visit = this_visit
        return adherence_data

    def as_export_row(self, threshold_map=None):
        """Return the export row: patient id followed by one adherence
        display value per visit column (same order as PAT_VISIT_COLUMNS)."""
        return [self.id] + [adh.adherence_display
                            for adh in self.get_adherence_data(threshold_map)]
class WisePillLog(object):
    """A single wisepill device event: which patient, and when."""

    def __init__(self, data):
        """Build a log entry from one row of the wisepill CSV.

        Only the patient and timestamp columns are read; any other columns
        in the row are ignored for now. The timestamp must look like
        ``2013-01-31T08:15:00``.
        """
        raw_timestamp = data[WISE_COLUMN_TIMESTAMP]
        self.patient_id = data[WISE_COLUMN_PATIENT]
        self.timestamp = datetime.strptime(raw_timestamp, "%Y-%m-%dT%H:%M:%S")

    def __str__(self):
        return "{0!s}: {1!s}".format(self.patient_id, self.timestamp)
class Adherence(object):
    """Adherence figures for one window between two visits.

    ``expected`` and ``actual`` are pill counts; ``actual`` is None when the
    window could not be evaluated.
    """

    def __init__(self, start, end, expected, actual):
        self.start, self.end = start, end
        self.expected, self.actual = expected, actual

    def has_data(self):
        """True when an actual pill count was recorded for this window."""
        return self.actual is not None

    @property
    def adherence_display(self):
        """Adherence as a percentage string with four decimals, or the
        NO_DATA placeholder when there is nothing to report."""
        if not self.has_data():
            return NO_DATA
        percentage = 100 * float(self.actual) / float(self.expected)
        return "%.4f" % percentage

    def __repr__(self):
        window = (self.start, self.end, self.adherence_display)
        return "%s - %s: %s" % window
def _get_pillcount(pills, frequency, threshold): | |
""" | |
Given a list of wisepill logs and a threshold, returns the number | |
of unique entries that should be counted (records on the same day | |
that fall within the threshold are ignored) | |
""" | |
# assumes pills are sorted by timestamp | |
pills_that_count = [] | |
for wp in pills: | |
if not pills_that_count: | |
# no prior record, it counts | |
pills_that_count.append(wp) | |
else: | |
last_record = pills_that_count[-1] | |
if (last_record.timestamp.date() != wp.timestamp.date() or \ | |
(wp.timestamp - last_record.timestamp).seconds > threshold * 60 * 60) and \ | |
len(filter(lambda wpd: wpd.timestamp.date() == wp.timestamp.date(), pills_that_count)) < frequency: | |
# was either a new day or outside the threshold | |
# as a final check make sure we haven't exceeded our count for the day | |
pills_that_count.append(wp) | |
return len(pills_that_count) | |
def process_data(user_file=USER_FILE,
                 wisepill_file=WISEPILL_FILE,
                 output_file=OUTPUT_FILE):
    """Read the patient and wisepill files and write the adherence CSV.

    Each patient gets their own wisepill logs attached, sorted by timestamp.
    A header row (patient id plus the visit columns) and then one row per
    patient are written to ``output_file``.

    Args:
        user_file: path of the patient CSV.
        wisepill_file: path of the wisepill log CSV.
        output_file: path of the CSV to create (overwritten if present).
    """
    import sys  # local import: only needed to pick the output file mode

    users = build_user_db(user_file)
    wisepill_logs = get_wisepill_logs(wisepill_file)
    for u in users:
        u.wisepill_logs = sorted(wisepill_logs[u.id], key=lambda wp: wp.timestamp)

    # csv.writer needs a text-mode file opened with newline='' on Python 3,
    # but a binary-mode file on Python 2. Using 'wb' unconditionally makes
    # every writerow() fail with TypeError on Python 3.
    if sys.version_info[0] >= 3:
        out = open(output_file, 'w', newline='')
    else:
        out = open(output_file, 'wb')

    with out as f:
        writer = csv.writer(f)
        writer.writerow([PAT_COLUMN_ID] + PAT_VISIT_COLUMNS)  # headers
        for u in users:
            writer.writerow(u.as_export_row())
def build_user_db(filename):
    """Load the patient CSV at ``filename`` and return a list holding one
    Patient per data row, in file order."""
    patients = []
    with open(filename, 'r') as handle:
        for row in csv.DictReader(handle):
            patients.append(Patient(row))
    return patients
def get_wisepill_logs(filename):
    """Load the wisepill CSV at ``filename`` and group its entries by patient.

    Returns a defaultdict mapping patient id to the list of that patient's
    WisePillLog entries in file order; unknown ids yield an empty list.
    """
    grouped = defaultdict(list)
    with open(filename, 'r') as handle:
        for row in csv.DictReader(handle):
            entry = WisePillLog(row)
            grouped[entry.patient_id].append(entry)
    return grouped
# Script entry point: run the whole pipeline with the default filenames when
# the file is executed directly; importing the module does nothing.
if __name__ == "__main__":
    process_data()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment