Last active
March 14, 2020 00:48
-
-
Save philerooski/e357930a484c3b1190a58c0a75192daf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Script setup ----
# The original `rm(list = ls())` call was removed: it wipes the caller's
# workspace when the script is sourced, and nothing in this file depends on a
# clean global environment.
# Keep strings as character vectors in base data-frame constructors
# (already the default since R 4.0, kept for older R versions).
options(stringsAsFactors = FALSE)
library(synapser)
library(tidyverse)
library(lubridate)
# Download the mPower engagement table from Synapse and prepare `createdOn`.
#
# Reads the TSV file behind Synapse entity syn21213531, converts `createdOn`
# to a date-time (the stored number is divided by 1000 before being handed to
# `as_datetime()`, i.e. it is treated as milliseconds since the Unix epoch),
# and keeps only rows created before the moment this function runs. Rows with
# a missing `createdOn` are dropped too, because `filter()` discards rows whose
# condition evaluates to NA.
#
# Returns the prepared table. The original ended on an assignment, which
# returned the same value invisibly; the result is now returned explicitly.
fetch_and_prep <- function() {
  mpower_file <- synGet("syn21213531") # previously "syn11488492"
  mpower <- read_tsv(mpower_file$path)
  run_time <- lubridate::now()
  mpower %>%
    mutate(createdOn = as_datetime(createdOn / 1000)) %>%
    filter(createdOn < run_time)
}
# Add each record's position on the participant's own timeline.
#
# For every `healthCode`, the earliest non-missing `createdOn` is taken as the
# participant's first activity. Two integer columns are then added:
#   participantWeek - whole weeks elapsed since that first activity (0-based)
#   participantDay  - whole days elapsed since that first activity (0-based)
#
# `engagement` must contain `healthCode` and a date-time `createdOn` column.
# Returns `engagement` with the two new columns; the helper columns used for
# the computation are removed again.
mutate_participant_week_day <- function(engagement) {
  first_activity <- engagement %>%
    group_by(healthCode) %>%
    summarise(first_activity_time = min(createdOn, na.rm = TRUE))

  engagement %>%
    # Join explicitly on healthCode instead of relying on a natural join.
    inner_join(first_activity, by = "healthCode") %>%
    mutate(
      # A difftime whose unit is chosen automatically; as.duration() below
      # normalises it before converting to weeks or days.
      seconds_since_first_activity = createdOn - first_activity_time,
      participantWeek = as.integer(floor(as.numeric(
        as.duration(seconds_since_first_activity), "weeks"
      ))),
      participantDay = as.integer(floor(as.numeric(
        as.duration(seconds_since_first_activity), "days"
      )))
    ) %>%
    select(-first_activity_time, -seconds_since_first_activity)
}
# Classify every record by the kind of task that produced it.
#
# Adds a character column `taskType` derived from `simpleName`:
#   "active-sensor"  - sensor-based activities
#   "survey"         - questionnaires and self-reports
#   "passive-sensor" - background collectors
# Any `simpleName` not listed below (including NA) gets `taskType = NA`.
#
# `engagement` must contain a `simpleName` column. Returns `engagement` with
# the additional column.
mutate_task_type <- function(engagement) {
  active_sensor_tasks <- c(
    "WalkingActivity", "TappingActivity", "MemoryActivity",
    "VoiceActivity", "TremorActivity"
  )
  survey_tasks <- c(
    "PDQ8", "MDSUPDRS", "study_feedback", "my_thoughts",
    "MedicationsSurvey", "MedicationTracker", "PHQ8",
    "MoodSurvey", "EnrollmentSurvey",
    "NonIdentifiableDemographicsTask", "mythoughts"
  )
  passive_sensor_tasks <- c(
    "HealthKitDataCollector", "HealthKitSleepCollector",
    "HealthKitWorkoutCollector", "locationTracker",
    "displacement", "displacementCollector",
    "motionActivityCollector"
  )

  engagement %>%
    mutate(taskType = case_when(
      simpleName %in% active_sensor_tasks ~ "active-sensor",
      simpleName %in% survey_tasks ~ "survey",
      simpleName %in% passive_sensor_tasks ~ "passive-sensor"
    ))
}
# Label every record with a frequency category for its task.
#
# Adds a character column `taskFrequency` derived from `simpleName`, one of
# "daily", "monthly", "continuous", "intermittent" or "baseline".
# Any `simpleName` not listed below (including NA) gets `taskFrequency = NA`.
#
# `engagement` must contain a `simpleName` column. Returns `engagement` with
# the additional column.
mutate_task_frequency <- function(engagement) {
  daily_tasks <- c(
    "WalkingActivity", "TappingActivity", "MemoryActivity",
    "VoiceActivity", "TremorActivity"
  )
  monthly_tasks <- c("PDQ8", "MDSUPDRS")
  continuous_tasks <- c(
    "HealthKitDataCollector", "HealthKitSleepCollector",
    "HealthKitWorkoutCollector", "locationTracker",
    "displacement", "displacementCollector",
    "motionActivityCollector"
  )
  intermittent_tasks <- c(
    "study_feedback", "my_thoughts", "MedicationsSurvey",
    "MedicationTracker", "PHQ8", "MoodSurvey",
    "mythoughts"
  )
  baseline_tasks <- c("EnrollmentSurvey", "NonIdentifiableDemographicsTask")

  engagement %>%
    mutate(taskFrequency = case_when(
      simpleName %in% daily_tasks ~ "daily",
      simpleName %in% monthly_tasks ~ "monthly",
      simpleName %in% continuous_tasks ~ "continuous",
      simpleName %in% intermittent_tasks ~ "intermittent",
      simpleName %in% baseline_tasks ~ "baseline"
    ))
}
# Attach local-time information to every record.
#
# Downloads the healthCode -> UTC offset table (Synapse entity syn12635875),
# left-joins it onto `engagement` by `healthCode`, and derives:
#   createdOnLocalTime - `createdOn` shifted by `UTC_offset` hours, as character
#   createdOnTimeZone  - `UTC_offset * 100` (e.g. an offset of -5 becomes -500)
# `createdOn` itself is converted to character as well. Records without a
# matching offset keep their row; their derived columns are NA.
#
# The joined helper columns `actual_state` and `UTC_offset` are dropped from
# the result.
#
# NOTE(review): if the offset table holds more than one row per healthCode the
# join will duplicate records - confirm the table is unique per healthCode.
# NOTE(review): `UTC_offset * 100` only yields a conventional +hhmm value for
# whole-hour offsets - confirm no fractional offsets occur.
mutate_local_time <- function(engagement) {
  offset_file <- synGet("syn12635875")
  healthcode_to_utc <- read_tsv(offset_file$path)

  with_offsets <- left_join(engagement, healthcode_to_utc, by = "healthCode")

  localized <- mutate(
    with_offsets,
    # Must be computed before createdOn is turned into a character column.
    createdOnLocalTime = as.character(createdOn + lubridate::hours(UTC_offset)),
    createdOnTimeZone = UTC_offset * 100,
    createdOn = as.character(createdOn)
  )

  select(localized, -actual_state, -UTC_offset)
}
# Build the curated mPower participant metadata table.
#
# Steps, in order:
#   1. Read the raw metadata (syn15624955), add `age_group`, and drop columns
#      that are not carried into the curated table.
#   2. Remove participants whose `dataGroups` marks them as test users or as
#      both parkinson and control.
#   3. Flag clinically referred participants (`clinicalReferral`) from two
#      cohorts and, for the first cohort, overwrite `inferred_diagnosis` with
#      the curated clinical diagnosis.
#   4. Harmonise `state`, `race`, `employment` and `education` values.
#   5. Rename `inferred_diagnosis` to `caseStatus`, add `study = "mPower"`,
#      and keep only rows whose `gender` is Female, Male or
#      "Prefer not to answer".
#
# Takes no arguments; all inputs are fetched from Synapse.
# Returns the curated data frame.
curate_mpower_metadata <- function() {
  # Replace every value matching `pattern` with `replacement`. NA values never
  # match and are therefore left untouched.
  recode_matching <- function(values, pattern, replacement, perl = FALSE) {
    values[grepl(pattern, values, perl = perl)] <- replacement
    values
  }

  df <- data.table::fread(synGet("syn15624955")$path, data.table = FALSE) %>%
    dplyr::mutate(age_group = cut(age, breaks = c(17, 29, 39, 49, 59, 120))) %>%
    dplyr::select(
      -health_history, -healthcare_provider, -last_smoked_timezone,
      -SourceID, -diagnosis_year, -medication_start_year, -onset_year,
      -professional_diagnosis, -when_deep_brain_stimulation, -packs_per_day,
      -years_smoking, -are_caretaker, -deep_brain_stimulation, -medical_usage,
      -medical_usage_yesterday, -home_usage, -smoked, -surgery,
      -living_alone_status, -medication_bool, -past_participation,
      -video_usage, -phone_usage, -recordId
    )

  # Remove test users (and users tagged as both parkinson and control).
  testHealthCodes <- df %>%
    dplyr::filter(
      str_detect(dataGroups, "test|parkinson;control|control;parkinson")
    ) %>%
    dplyr::pull(healthCode) %>%
    unique()
  df <- df %>% dplyr::filter(!healthCode %in% testHealthCodes)

  ## Create a column for individuals that are clinically referred
  ## Cohort 1 - ObjectivePD
  # Read as a data.table (fread default), exactly as in the original code.
  objectivePD_cohort <- data.table::fread(synGet("syn8533708")$path) %>%
    dplyr::rename(healthCode = healthcode)
  objectivePD_cohort_mdata <-
    data.table::fread(synGet("syn12555309")$path, data.table = FALSE) %>%
    dplyr::select(
      healthCode,
      clinicalDiagnosis = `Do you have Parkinson disease?`
    ) %>%
    dplyr::distinct()

  # Clinical referral status + curated case status.
  objectivePD_cohort <- merge(
    objectivePD_cohort, objectivePD_cohort_mdata,
    all = TRUE, by = "healthCode"
  ) %>%
    dplyr::filter(!is.na(clinicalDiagnosis)) %>%
    dplyr::select(-id) %>%
    dplyr::mutate(
      # NA answers were filtered out above, so this is strictly TRUE/FALSE.
      clinicalDiagnosis = clinicalDiagnosis == "Yes",
      clinicalReferral = TRUE
    )

  # Merge with metadata. No `by` is given, so the merge keys are all columns
  # the two tables share.
  # NOTE(review): presumably only healthCode is shared - confirm.
  df <- merge(df, objectivePD_cohort, all = TRUE)

  # For the ObjectivePD cohort, replace inferred_diagnosis with the
  # clinicalDiagnosis taken from the curated data.
  to.replace <- !is.na(df$clinicalDiagnosis)
  df$inferred_diagnosis[to.replace] <- df$clinicalDiagnosis[to.replace]
  df$clinicalDiagnosis <- NULL

  ## Cohort 2
  ## The clinical referrals are the intersection between that file and the
  ## query below.
  ### Github ref - https://github.com/Sage-Bionetworks/mhealth-engagement-analysis/issues/6
  external_id_query <- synTableQuery("SELECT * FROM syn3420237 where externalId<>'' and dataGroups not like '%test%'")
  external_id_health_codes <- external_id_query$asDataFrame() %>%
    dplyr::pull(healthCode) %>%
    unique()
  referral_candidates <- setdiff(
    external_id_health_codes, objectivePD_cohort$healthCode
  )
  # Select only those that have inferred diagnosis = TRUE.
  otherClinicalReferrals <- df %>%
    dplyr::filter(
      healthCode %in% referral_candidates & inferred_diagnosis == TRUE
    ) %>%
    dplyr::pull(healthCode)
  df$clinicalReferral[df$healthCode %in% otherClinicalReferrals] <- TRUE
  ### Fill FALSE for all others who are not clinical referrals.
  df$clinicalReferral[is.na(df$clinicalReferral)] <- FALSE

  df <- df %>%
    dplyr::rename(state = Enter_State) %>%
    dplyr::mutate(state = stringr::str_to_title(state)) %>%
    # Remove dataGroups - a confusing column at this point, with much missing.
    dplyr::select(-dataGroups)

  # Fix race. The rules are applied one after another, so each rule sees the
  # result of the previous ones; the original author flagged this order as
  # important - do not reorder.
  df$race <- recode_matching(df$race, "Latino/Hispanic", "Hispanic/Latinos")
  df$race <- recode_matching(df$race, "Black or African", "African-American/Black")
  df$race <- recode_matching(df$race, "Native American", "AIAN")
  df$race <- recode_matching(df$race, "Asian", "Asian")
  df$race <- recode_matching(df$race, "White or Caucasian", "Non-Hispanic White")
  df$race <- recode_matching(
    df$race, "Other|Middle Eastern|Caribbean", "Other",
    perl = TRUE
  )
  df$race <- recode_matching(df$race, "Mixed", "More than one", perl = TRUE)
  df$race <- recode_matching(
    df$race, "Pacific Islander", "Native Hawaiian or other Pacific Islander",
    perl = TRUE
  )

  #### Employment (also applied sequentially, in the original order)
  df$employment <- recode_matching(
    df$employment,
    "Self-employed|full time|Employment for wages|Military", "employed"
  )
  df$employment <- recode_matching(df$employment, "Unable to work", "Unable to work")
  df$employment <- recode_matching(df$employment, "Retired", "Retired")
  df$employment <- recode_matching(df$employment, "Out of work|no work", "unemployed")
  df$employment <- recode_matching(df$employment, "student", "student")
  df$employment <- recode_matching(df$employment, "occasional|part time", "part-time")

  df <- df %>%
    dplyr::rename(caseStatus = inferred_diagnosis) %>%
    dplyr::mutate(study = "mPower")

  # Update education. case_when() uses the first matching rule, so the exact
  # lists take precedence over the broader pattern fallbacks below them.
  df <- df %>%
    dplyr::mutate(education = case_when(
      grepl("Post graduate|Doctoral Degree|Master", education,
            perl = TRUE, ignore.case = TRUE) ~ "Post graduate",
      education %in% c(
        "8th grade or less",
        "More than 8th grade but did not graduate high school"
      ) ~ "Below High School",
      education %in% c(
        "Some college", "Graduate of Four Year College",
        "Graduate of Two Year College or Technical School",
        "Some graduate school"
      ) ~ "College",
      education %in% c(
        "High school graduate or equivalent", "Some high school"
      ) ~ "High School",
      grepl("College", education, perl = TRUE, ignore.case = TRUE) ~ "College",
      grepl("high school", education,
            perl = TRUE, ignore.case = TRUE) ~ "High School",
      TRUE ~ education
    ))

  # Filter out unclear gender.
  df %>%
    dplyr::filter(gender %in% c("Female", "Male", "Prefer not to answer"))
}
# Entry point: build the curated mPower engagement table and upload it to
# Synapse.
#
# Logs in to Synapse, runs the engagement pipeline (fetch, participant
# week/day, task type, task frequency, local time), writes the result to a
# local TSV, stores that file under Synapse parent syn21054482, and removes
# the local TSV afterwards.
#
# Fix: the original removed "mpower_engagement.tsv", a file that is never
# written, so the real output file was left behind. The output path is now
# defined once and reused for writing, uploading and cleanup.
#
# Returns NULL invisibly; called for its side effects.
main <- function() {
  synLogin()

  # Metadata curation is currently disabled; kept for reference.
  # mpower_metadata <- curate_mpower_metadata()
  # write_csv(mpower_metadata, "mpower_metadata.csv")
  # f_meta <- synapser::File("mpower_metadata.csv", parent = "syn19520775")
  # synStore(f_meta, used = list("syn15624955"))
  # unlink("mpower_metadata.csv")

  # mPower engagement
  mpower <- fetch_and_prep() %>%
    mutate_participant_week_day() %>%
    mutate_task_type() %>%
    mutate_task_frequency() %>%
    mutate_local_time()

  ### Keep only those healthCodes that have an entry in the metadata file
  # mpower <- mpower %>%
  #   dplyr::filter(healthCode %in% mpower_metadata$healthCode)

  output_path <- "mpower_engagement_curated.tsv"
  # Remove the local copy when main() exits, even if the upload fails.
  on.exit(unlink(output_path), add = TRUE)
  write_tsv(mpower, output_path)

  f <- synapser::File(output_path, parent = "syn21054482") # previously "syn18936398"
  # NOTE(review): `used` lists the same id as the upload parent, while the
  # input data is read from syn21213531 in fetch_and_prep() - confirm the
  # intended provenance (the previous value was "syn11488492").
  synStore(
    f,
    used = list("syn21054482"),
    executed = paste0(
      "https://gist.github.com/philerooski/",
      "e357930a484c3b1190a58c0a75192daf"
    )
  )

  invisible(NULL)
}

main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment