# czue/aremind_wisepill_calc.py

## aremind_wisepill_calc.py
"""
Script usage:

The script currently assumes the existence of two files and of specific formats.
 dataforwisepillcalculation.csv - a list of patient ids and visit times
 wisepill.csv - the full log of wisepill openings

It outputs a single file:
 adherence.csv - the patients' adherence scores over each of the visit periods

These are assumed to be in the same directory as the script. The filenames can
be changed as variables below.

The script will count adherence per day with the following rules:
 - A patient can only have adherence up to their dosage.
 - The first pill of the day always counts.
 - After the first pill, any pill within the THRESHOLD (in hours) from the first
   pill does not count.
 - The next pill outside of the threshold counts again.
 - After this pill the threshold resets, and the rules are the same.

The THRESHOLDS can be configured, via the variable DEFAULT_THRESHOLD_MAP, which
maps frequency to thresholds. The defaults are 12 hours for once-a-day patients
and 8 hours for twice-a-day patients.

Original Problem statement:

The wisepill devices for ARemind were sending in a lot of repeat messages
within a 24 hour window - sometimes a duplicate with the exact same timestamp,
which is easy to rule out, but sometimes within 5, 10, 30 minutes or sometimes
in 6-12 hours.  This is a bit confusing for patients that are supposed to be
opening the device 1x or 2x per day.  So the data team has some concerns about
the confidence in the data.

The data team has decided that multiple readings for patients that take 2 doses
per day should be considered a single event if they occur within 8 hours, and
if they take a single dose then within 12 hours should be considered a single
dose.

Output should be:
Columns: patient ID, visit (e.g. T1, T2...), wisepill adherence % (single
adherence score for this time window from the previous appointment to this
appointment , which is # of actual pills / expected pills), dosage (1 or 2)

They also will likely want to tweak the 8 hours for 2x day dose and 12 hours to
1x per day dose to see how the data looks: I hope it's not too hard to build
those in as variables for the script.

"""

import csv
from datetime import datetime
from collections import defaultdict

# CONFIGURATION PARAMETERS
# Filenames are assumed to be in the same directory as the script; change
# them here as appropriate.
USER_FILE = 'dataforwisepillcalculation.csv'
WISEPILL_FILE = 'wisepill.csv'
OUTPUT_FILE = 'adherence.csv'

# Placeholder that shows up in the export when there is no data to calculate
# adherence from.
NO_DATA = '.'
# frequency (doses per day) -> hours apart for doses to be unique
DEFAULT_THRESHOLD_MAP = {1: 12, 2: 8}


# Column headers looked up in each row of the patient file.
PAT_COLUMN_ID = "Study ID"
PAT_COLUMN_ENROLL_DATE = "Enrollment Date"
# Template: "%s" is filled with the visit number when building the visit list.
PAT_COLUMN_COMPLETION_DATE = "Complete T%s"
PAT_COLUMN_LAST_VISIT = "Last Visit T8"
PAT_COLUMN_ANCHOR_MED = "Anchor Medication"
PAT_COLUMN_FREQUENCY = "Once a day or twice a day"
# Enrollment, then "Complete T1".."Complete T6", then the last visit.
# NOTE(review): there is no T7 column in this list -- confirm that is intended.
PAT_VISIT_COLUMNS = (
    [PAT_COLUMN_ENROLL_DATE]
    + [PAT_COLUMN_COMPLETION_DATE % i for i in range(1, 7)]
    + [PAT_COLUMN_LAST_VISIT]
)


# Column headers looked up in each row of the wisepill log file.
WISE_COLUMN_PATIENT = 'patient'
WISE_COLUMN_TIMESTAMP = 'timestamp'

class Patient(object):
    """
    A study participant: id, visit schedule, medication and dosing frequency.

    Instances are built from one row of the patient file (a mapping keyed by
    column header). ``wisepill_logs`` is not read from that row; it starts
    empty and is expected to be assigned afterwards with the patient's
    wisepill records sorted by timestamp.
    """
    # Read-only fallback for instances created without __init__. A tuple is
    # used so that no mutable list can be shared between all patients
    # through the class attribute.
    wisepill_logs = ()

    def __init__(self, data):
        """
        Parse one patient row.

        data: mapping of column header -> string. Dates must be formatted
            MM/DD/YYYY and the frequency column must parse as an int.

        Raises ValueError when the enrollment date is blank, or when a
        non-blank date or the frequency cannot be parsed.
        """
        def _to_date(datestring, required=False):
            # Blank cells mean "no visit" (None) unless the field is required.
            if not datestring:
                if required:
                    raise ValueError("Field is required!")
                return None
            return datetime.strptime(datestring, "%m/%d/%Y")

        self.id = data[PAT_COLUMN_ID]
        self.enrollment_date = _to_date(data[PAT_COLUMN_ENROLL_DATE], True)
        self.last_visit_date = _to_date(data[PAT_COLUMN_LAST_VISIT])
        # One entry per PAT_VISIT_COLUMNS column; None where the cell is blank.
        self.visit_dates = [_to_date(data[c]) for c in PAT_VISIT_COLUMNS]
        self.anchor_medication = data[PAT_COLUMN_ANCHOR_MED]
        self.frequency = int(data[PAT_COLUMN_FREQUENCY])
        # Every patient owns its own list of logs.
        self.wisepill_logs = []

    def __repr__(self):
        return "{id}: {start}-{end} ({med} {freq}/day)".format(
            id=self.id, start=self.enrollment_date, end=self.last_visit_date,
            med=self.anchor_medication, freq=self.frequency
        )

    def get_adherence_data(self, threshold_map=None):
        """
        Build one Adherence record per entry of ``self.visit_dates``.

        threshold_map: optional {frequency: hours} mapping; when omitted (or
            empty) DEFAULT_THRESHOLD_MAP is used.

        A record only carries numbers when both this visit and the visit
        column immediately before it have a date; otherwise expected and
        actual are None. The first column therefore never has data.

        Raises AssertionError when two consecutive dated visits are not
        strictly increasing, and KeyError when the patient's frequency is
        not a key of the threshold map.
        """
        threshold_map = threshold_map or DEFAULT_THRESHOLD_MAP
        adherence_data = []
        last_visit = None
        for this_visit in self.visit_dates:
            expected_pills = actual_pills = None
            if last_visit and this_visit:
                if not this_visit > last_visit:
                    # Raised explicitly instead of via `assert` so the check
                    # is not stripped under `python -O`; the exception type
                    # is unchanged for existing callers.
                    raise AssertionError(
                        "patient %s visits aren't in order please fix in the source file! %s -> %s" %
                        (self.id, last_visit, this_visit))
                # Half-open window: an opening exactly at this_visit belongs
                # to the next period.
                logs = [l for l in self.wisepill_logs
                        if last_visit <= l.timestamp < this_visit]
                expected_pills = (this_visit.date() - last_visit.date()).days * self.frequency
                actual_pills = _get_pillcount(logs, self.frequency,
                                              threshold_map[self.frequency])
            adherence_data.append(Adherence(last_visit, this_visit,
                                            expected_pills, actual_pills))
            # A blank visit also becomes "last_visit", so the visit after a
            # gap reports no data rather than spanning the gap.
            last_visit = this_visit
        return adherence_data

    def as_export_row(self, threshold_map=None):
        """
        Return the output row for this patient: the patient id followed by
        the adherence display string of every visit column, in order.
        """
        # Columns: patient ID, visit (e.g. T1, T2...), wisepill adherence % (single
        # adherence score for this time window from the previous appointment to this
        # appointment , which is # of actual pills / expected pills), dosage (1 or 2)
        return [self.id] + [adh.adherence_display for adh in
                            self.get_adherence_data(threshold_map)]

class WisePillLog(object):
    """A single wisepill record: the patient it belongs to and its timestamp."""

    def __init__(self, data):
        """
        data: one row of the wisepill file as a mapping of column header ->
            string. The timestamp must be formatted like 2013-01-31T08:15:00.
        """
        # there are other columns but for now we don't need them.
        self.patient_id = data[WISE_COLUMN_PATIENT]
        raw_timestamp = data[WISE_COLUMN_TIMESTAMP]
        self.timestamp = datetime.strptime(raw_timestamp, "%Y-%m-%dT%H:%M:%S")

    def __str__(self):
        fields = (self.patient_id, self.timestamp)
        return "%s: %s" % fields

class Adherence(object):
    """
    Adherence figures for one period between two visits.

    start / end: the bounds of the period as passed in by the caller.
    expected / actual: pill counts for the period, or None when there is
        nothing to calculate.
    """

    def __init__(self, start, end, expected, actual):
        self.start, self.end = start, end
        self.expected, self.actual = expected, actual

    def has_data(self):
        """True when an actual pill count was recorded for this period."""
        return self.actual is not None

    @property
    def adherence_display(self):
        """
        Adherence as a percentage string with four decimals, or NO_DATA when
        there is no actual count. Raises ZeroDivisionError if there is data
        but expected is 0.
        """
        if not self.has_data():
            return NO_DATA
        percentage = 100 * float(self.actual) / float(self.expected)
        return "%.4f" % percentage

    def __repr__(self):
        parts = (self.start, self.end, self.adherence_display)
        return "%s - %s: %s" % parts

def _get_pillcount(pills, frequency, threshold):
    """
    Given a list of wisepill logs and a threshold, returns the number
    of unique entries that should be counted (records on the same day
    that fall within the threshold are ignored)
    """
    # assumes pills are sorted by timestamp
    pills_that_count = []
    for wp in pills:
        if not pills_that_count:
            # no prior record, it counts
            pills_that_count.append(wp)
        else:
            last_record = pills_that_count[-1]
            if (last_record.timestamp.date() != wp.timestamp.date() or \
                (wp.timestamp - last_record.timestamp).seconds > threshold * 60 * 60) and \
                len(filter(lambda wpd: wpd.timestamp.date() == wp.timestamp.date(), pills_that_count)) < frequency:
                # was either a new day or outside the threshold
                # as a final check make sure we haven't exceeded our count for the day
                pills_that_count.append(wp)
    return len(pills_that_count)

def process_data(user_file=USER_FILE,
                 wisepill_file=WISEPILL_FILE,
                 output_file=OUTPUT_FILE):
    """
    Read the patient and wisepill files and write the adherence export.

    user_file: path of the patient/visit CSV.
    wisepill_file: path of the wisepill log CSV.
    output_file: path of the CSV to write; it gets a header row (the patient
        id column followed by the visit columns) and one row per patient.

    Each patient is given their own wisepill logs, sorted by timestamp,
    before the rows are computed with the default thresholds.
    """
    import sys  # local import: only needed to choose the output file mode

    users = build_user_db(user_file)
    wisepill_logs = get_wisepill_logs(wisepill_file)
    for u in users:
        u.wisepill_logs = sorted(wisepill_logs[u.id], key=lambda wp: wp.timestamp)

    # The csv module writes to a binary file on Python 2, but needs a text
    # file opened with newline='' on Python 3 (a 'wb' file raises TypeError
    # there on the first writerow).
    if sys.version_info[0] >= 3:
        out = open(output_file, 'w', newline='')
    else:
        out = open(output_file, 'wb')
    with out as f:
        writer = csv.writer(f)
        writer.writerow([PAT_COLUMN_ID] + PAT_VISIT_COLUMNS) # headers
        for u in users:
            writer.writerow(u.as_export_row())

def build_user_db(filename):
    """
    Read the patient CSV at ``filename`` and return a list with one Patient
    per data row, in file order. The first row is used as column headers.
    """
    patients = []
    with open(filename, 'r') as f:
        for row in csv.DictReader(f):
            patients.append(Patient(row))
    return patients

def get_wisepill_logs(filename):
    """
    Read the wisepill CSV at ``filename`` and group its records by patient.

    Returns a defaultdict mapping patient id -> list of WisePillLog objects
    in file order; looking up an unknown patient id yields an empty list.
    """
    logs_by_patient_id = defaultdict(list)
    with open(filename, 'r') as f:
        for row in csv.DictReader(f):
            entry = WisePillLog(row)
            logs_by_patient_id[entry.patient_id].append(entry)

    return logs_by_patient_id


# When run as a script, do the whole calculation with the default filenames.
if __name__ == '__main__':
    process_data()
	"""
	Script usage:

	The script currently assumes the existence of two files and of specific formats.
	dataforwisepillcalculation.csv - a list of patient ids and visit times
	wisepill.csv - the full log of wisepill openings

	It outputs a single file:
	adherence.csv - the patients' adherence scores over each of the visit periods

	These are assumed to be in the same directory as the script. The filenames can
	be changed as variables below.

	The script will count adherence per day with the following rules:
	- A patient can only have adherence up to their dosage.
	- The first pill of the day always counts.
	- After the first pill, any pill within the THRESHOLD (in hours) from the first
	pill does not count.
	- The next pill outside of the threshold counts again.
	- After this pill the threshold resets, and the rules are the same.

	The THRESHOLDS can be configured, via the variable DEFAULT_THRESHOLD_MAP, which
	maps frequency to thresholds. The defaults are 12 hours for once-a-day patients
	and 8 hours for twice-a-day patients.

	Original Problem statement:

	The wisepill devices for ARemind were sending in a lot of repeat messages
	within a 24 hour window - sometimes a duplicate with the exact same timestamp,
	which is easy to rule out, but sometimes within 5, 10, 30 minutes or sometimes
	in 6-12 hours. This is a bit confusing for patients that are supposed to be
	opening the device 1x or 2x per day. So the data team has some concerns about
	the confidence in the data.

	The data team has decided that multiple readings for patients that take 2 doses
	per day should be considered a single event if they occur within 8 hours, and
	if they take a single dose then within 12 hours should be considered a single
	dose.

	Output should be:
	Columns: patient ID, visit (e.g. T1, T2...), wisepill adherence % (single
	adherence score for this time window from the previous appointment to this
	appointment , which is # of actual pills / expected pills), dosage (1 or 2)

	They also will likely want to tweak the 8 hours for 2x day dose and 12 hours to
	1x per day dose to see how the data looks: I hope it's not too hard to build
	those in as variables for the script.

	"""

	import csv
	from datetime import datetime
	from collections import defaultdict

	# CONFIGURATION PARAMETERS
	# Filenames are assumed to be in the same directory as the script; change
	# them here as appropriate.
	USER_FILE = 'dataforwisepillcalculation.csv'
	WISEPILL_FILE = 'wisepill.csv'
	OUTPUT_FILE = 'adherence.csv'

	# Placeholder that shows up in the export when there is no data to calculate
	# adherence from.
	NO_DATA = '.'
	# frequency (doses per day) -> hours apart for doses to be unique
	DEFAULT_THRESHOLD_MAP = {1: 12, 2: 8}


	# Column headers looked up in each row of the patient file.
	PAT_COLUMN_ID = "Study ID"
	PAT_COLUMN_ENROLL_DATE = "Enrollment Date"
	# Template: "%s" is filled with the visit number when building the visit list.
	PAT_COLUMN_COMPLETION_DATE = "Complete T%s"
	PAT_COLUMN_LAST_VISIT = "Last Visit T8"
	PAT_COLUMN_ANCHOR_MED = "Anchor Medication"
	PAT_COLUMN_FREQUENCY = "Once a day or twice a day"
	# Enrollment, then "Complete T1".."Complete T6", then the last visit.
	# NOTE(review): there is no T7 column in this list -- confirm that is intended.
	PAT_VISIT_COLUMNS = (
	    [PAT_COLUMN_ENROLL_DATE]
	    + [PAT_COLUMN_COMPLETION_DATE % i for i in range(1, 7)]
	    + [PAT_COLUMN_LAST_VISIT]
	)


	# Column headers looked up in each row of the wisepill log file.
	WISE_COLUMN_PATIENT = 'patient'
	WISE_COLUMN_TIMESTAMP = 'timestamp'

	class Patient(object):
	    """
	    A study participant: id, visit schedule, medication and dosing frequency.

	    Instances are built from one row of the patient file (a mapping keyed by
	    column header). ``wisepill_logs`` is not read from that row; it starts
	    empty and is expected to be assigned afterwards with the patient's
	    wisepill records sorted by timestamp.
	    """
	    # Read-only fallback for instances created without __init__. A tuple is
	    # used so that no mutable list can be shared between all patients
	    # through the class attribute.
	    wisepill_logs = ()

	    def __init__(self, data):
	        """
	        Parse one patient row.

	        data: mapping of column header -> string. Dates must be formatted
	            MM/DD/YYYY and the frequency column must parse as an int.

	        Raises ValueError when the enrollment date is blank, or when a
	        non-blank date or the frequency cannot be parsed.
	        """
	        def _to_date(datestring, required=False):
	            # Blank cells mean "no visit" (None) unless the field is required.
	            if not datestring:
	                if required:
	                    raise ValueError("Field is required!")
	                return None
	            return datetime.strptime(datestring, "%m/%d/%Y")

	        self.id = data[PAT_COLUMN_ID]
	        self.enrollment_date = _to_date(data[PAT_COLUMN_ENROLL_DATE], True)
	        self.last_visit_date = _to_date(data[PAT_COLUMN_LAST_VISIT])
	        # One entry per PAT_VISIT_COLUMNS column; None where the cell is blank.
	        self.visit_dates = [_to_date(data[c]) for c in PAT_VISIT_COLUMNS]
	        self.anchor_medication = data[PAT_COLUMN_ANCHOR_MED]
	        self.frequency = int(data[PAT_COLUMN_FREQUENCY])
	        # Every patient owns its own list of logs.
	        self.wisepill_logs = []

	    def __repr__(self):
	        return "{id}: {start}-{end} ({med} {freq}/day)".format(
	            id=self.id, start=self.enrollment_date, end=self.last_visit_date,
	            med=self.anchor_medication, freq=self.frequency
	        )

	    def get_adherence_data(self, threshold_map=None):
	        """
	        Build one Adherence record per entry of ``self.visit_dates``.

	        threshold_map: optional {frequency: hours} mapping; when omitted (or
	            empty) DEFAULT_THRESHOLD_MAP is used.

	        A record only carries numbers when both this visit and the visit
	        column immediately before it have a date; otherwise expected and
	        actual are None. The first column therefore never has data.

	        Raises AssertionError when two consecutive dated visits are not
	        strictly increasing, and KeyError when the patient's frequency is
	        not a key of the threshold map.
	        """
	        threshold_map = threshold_map or DEFAULT_THRESHOLD_MAP
	        adherence_data = []
	        last_visit = None
	        for this_visit in self.visit_dates:
	            expected_pills = actual_pills = None
	            if last_visit and this_visit:
	                if not this_visit > last_visit:
	                    # Raised explicitly instead of via `assert` so the check
	                    # is not stripped under `python -O`; the exception type
	                    # is unchanged for existing callers.
	                    raise AssertionError(
	                        "patient %s visits aren't in order please fix in the source file! %s -> %s" %
	                        (self.id, last_visit, this_visit))
	                # Half-open window: an opening exactly at this_visit belongs
	                # to the next period.
	                logs = [l for l in self.wisepill_logs
	                        if last_visit <= l.timestamp < this_visit]
	                expected_pills = (this_visit.date() - last_visit.date()).days * self.frequency
	                actual_pills = _get_pillcount(logs, self.frequency,
	                                              threshold_map[self.frequency])
	            adherence_data.append(Adherence(last_visit, this_visit,
	                                            expected_pills, actual_pills))
	            # A blank visit also becomes "last_visit", so the visit after a
	            # gap reports no data rather than spanning the gap.
	            last_visit = this_visit
	        return adherence_data

	    def as_export_row(self, threshold_map=None):
	        """
	        Return the output row for this patient: the patient id followed by
	        the adherence display string of every visit column, in order.
	        """
	        # Columns: patient ID, visit (e.g. T1, T2...), wisepill adherence % (single
	        # adherence score for this time window from the previous appointment to this
	        # appointment , which is # of actual pills / expected pills), dosage (1 or 2)
	        return [self.id] + [adh.adherence_display for adh in
	                            self.get_adherence_data(threshold_map)]

	class WisePillLog(object):
	    """A single wisepill record: the patient it belongs to and its timestamp."""

	    def __init__(self, data):
	        """
	        data: one row of the wisepill file as a mapping of column header ->
	            string. The timestamp must be formatted like 2013-01-31T08:15:00.
	        """
	        def _to_datetime(datestring):
	            return datetime.strptime(datestring, "%Y-%m-%dT%H:%M:%S")
	        # there are other columns but for now we don't need them.
	        self.patient_id = data[WISE_COLUMN_PATIENT]
	        self.timestamp = _to_datetime(data[WISE_COLUMN_TIMESTAMP])

	    def __str__(self):
	        return "%s: %s" % (self.patient_id, self.timestamp)

	class Adherence(object):
	    """
	    Adherence figures for one period between two visits.

	    start / end: the bounds of the period as passed in by the caller.
	    expected / actual: pill counts for the period, or None when there is
	        nothing to calculate.
	    """

	    def __init__(self, start, end, expected, actual):
	        self.start = start
	        self.end = end
	        self.expected = expected
	        self.actual = actual

	    def has_data(self):
	        """True when an actual pill count was recorded for this period."""
	        return self.actual is not None

	    @property
	    def adherence_display(self):
	        """
	        Adherence as a percentage string with four decimals, or NO_DATA when
	        there is no actual count. Raises ZeroDivisionError if there is data
	        but expected is 0.
	        """
	        if self.has_data():
	            return "%.4f" % (100 * float(self.actual) / float(self.expected))
	        else:
	            return NO_DATA

	    def __repr__(self):
	        return "%s - %s: %s" % (self.start, self.end, self.adherence_display)

	def _get_pillcount(pills, frequency, threshold):
	"""
	Given a list of wisepill logs and a threshold, returns the number
	of unique entries that should be counted (records on the same day
	that fall within the threshold are ignored)
	"""
	# assumes pills are sorted by timestamp
	pills_that_count = []
	for wp in pills:
	if not pills_that_count:
	# no prior record, it counts
	pills_that_count.append(wp)
	else:
	last_record = pills_that_count[-1]
	if (last_record.timestamp.date() != wp.timestamp.date() or \
	(wp.timestamp - last_record.timestamp).seconds > threshold * 60 * 60) and \
	len(filter(lambda wpd: wpd.timestamp.date() == wp.timestamp.date(), pills_that_count)) < frequency:
	# was either a new day or outside the threshold
	# as a final check make sure we haven't exceeded our count for the day
	pills_that_count.append(wp)
	return len(pills_that_count)

	def process_data(user_file=USER_FILE,
	                 wisepill_file=WISEPILL_FILE,
	                 output_file=OUTPUT_FILE):
	    """
	    Read the patient and wisepill files and write the adherence export.

	    user_file: path of the patient/visit CSV.
	    wisepill_file: path of the wisepill log CSV.
	    output_file: path of the CSV to write; it gets a header row (the patient
	        id column followed by the visit columns) and one row per patient.

	    Each patient is given their own wisepill logs, sorted by timestamp,
	    before the rows are computed with the default thresholds.
	    """
	    import sys  # local import: only needed to choose the output file mode

	    users = build_user_db(user_file)
	    wisepill_logs = get_wisepill_logs(wisepill_file)
	    for u in users:
	        u.wisepill_logs = sorted(wisepill_logs[u.id], key=lambda wp: wp.timestamp)

	    # The csv module writes to a binary file on Python 2, but needs a text
	    # file opened with newline='' on Python 3 (a 'wb' file raises TypeError
	    # there on the first writerow).
	    if sys.version_info[0] >= 3:
	        out = open(output_file, 'w', newline='')
	    else:
	        out = open(output_file, 'wb')
	    with out as f:
	        writer = csv.writer(f)
	        writer.writerow([PAT_COLUMN_ID] + PAT_VISIT_COLUMNS) # headers
	        for u in users:
	            writer.writerow(u.as_export_row())

	def build_user_db(filename):
	    """
	    Read the patient CSV at ``filename`` and return a list with one Patient
	    per data row, in file order. The first row is used as column headers.
	    """
	    with open(filename, 'r') as f:
	        reader = csv.DictReader(f)
	        return [Patient(line) for line in reader]

	def get_wisepill_logs(filename):
	    """
	    Read the wisepill CSV at ``filename`` and group its records by patient.

	    Returns a defaultdict mapping patient id -> list of WisePillLog objects
	    in file order; looking up an unknown patient id yields an empty list.
	    """
	    logs_by_patient_id = defaultdict(list)
	    with open(filename, 'r') as f:
	        reader = csv.DictReader(f)
	        for line in reader:
	            wp = WisePillLog(line)
	            logs_by_patient_id[wp.patient_id].append(wp)

	    return logs_by_patient_id


	# When run as a script, do the whole calculation with the default filenames.
	if __name__ == '__main__':
	    process_data()