## philerooski/curate_mpower.R

## curate_mpower.R
# Remove every (non-hidden) object from the current environment before the
# script defines its own functions.
rm(list = ls())
# Set the global stringsAsFactors option to FALSE for the rest of the session.
options(stringsAsFactors = FALSE)
library(synapser)
library(tidyverse)
library(lubridate)

# Download the raw mPower engagement table and normalise its timestamps.
#
# Reads Synapse entity syn21213531 as TSV, divides `createdOn` by 1000 and
# converts it with as_datetime() (i.e. the raw value is treated as
# milliseconds since the Unix epoch), then keeps only the rows stamped before
# the moment the function runs.
#
# Returns the filtered table (visibly). Rows whose `createdOn` is NA are
# dropped too, because filter() only keeps rows where the condition is TRUE.
fetch_and_prep <- function() {
  engagement_file <- synGet("syn21213531") # previously "syn11488492"
  engagement <- read_tsv(engagement_file$path)
  cutoff <- lubridate::now()
  # Return the pipeline result directly; ending on an assignment would only
  # hand the value back invisibly.
  engagement %>%
    mutate(createdOn = as_datetime(createdOn / 1000)) %>%
    filter(createdOn < cutoff)
}

# Add participantWeek and participantDay: the number of whole weeks / days
# between a record's `createdOn` and the earliest `createdOn` recorded for the
# same `healthCode`.
#
# engagement: data frame with a `healthCode` column and a date-time
#   `createdOn` column.
# Returns `engagement` with the integer columns `participantWeek` and
#   `participantDay` appended; the helper columns used to compute them are
#   removed again.
mutate_participant_week_day <- function(engagement) {
  first_activity <- engagement %>%
    group_by(healthCode) %>%
    summarise(first_activity_time = min(createdOn, na.rm = TRUE))
  engagement <- engagement %>%
    # Name the key explicitly: without `by`, the join would silently use every
    # column name the two tables happen to share.
    inner_join(first_activity, by = "healthCode") %>%
    mutate(
      # Despite the name this is a difftime whose unit R picks automatically;
      # as.duration() below normalises it before the unit conversion.
      seconds_since_first_activity = createdOn - first_activity_time,
      participantWeek = as.integer(
        floor(as.numeric(
          as.duration(seconds_since_first_activity), "weeks"))),
      participantDay = as.integer(
        floor(as.numeric(
          as.duration(seconds_since_first_activity), "days")))
    ) %>%
    select(-first_activity_time, -seconds_since_first_activity)
  return(engagement)
}

# Add a `taskType` column derived from `simpleName`.
#
# engagement: data frame with a `simpleName` column.
# Returns `engagement` with `taskType` set to "active-sensor", "survey" or
#   "passive-sensor"; a `simpleName` that appears in none of the lists below
#   gets NA.
mutate_task_type <- function(engagement) {
  mutate(
    engagement,
    taskType = case_when(
      simpleName %in% c(
        "WalkingActivity", "TappingActivity", "MemoryActivity",
        "VoiceActivity", "TremorActivity"
      ) ~ "active-sensor",
      simpleName %in% c(
        "PDQ8", "MDSUPDRS", "study_feedback", "my_thoughts",
        "MedicationsSurvey", "MedicationTracker", "PHQ8", "MoodSurvey",
        "EnrollmentSurvey", "NonIdentifiableDemographicsTask", "mythoughts"
      ) ~ "survey",
      simpleName %in% c(
        "HealthKitDataCollector", "HealthKitSleepCollector",
        "HealthKitWorkoutCollector", "locationTracker", "displacement",
        "displacementCollector", "motionActivityCollector"
      ) ~ "passive-sensor"
    )
  )
}

# Add a `taskFrequency` column derived from `simpleName`.
#
# engagement: data frame with a `simpleName` column.
# Returns `engagement` with `taskFrequency` set to "daily", "monthly",
#   "continuous", "intermittent" or "baseline"; a `simpleName` that appears in
#   none of the lists below gets NA.
mutate_task_frequency <- function(engagement) {
  mutate(
    engagement,
    taskFrequency = case_when(
      simpleName %in% c(
        "WalkingActivity", "TappingActivity", "MemoryActivity",
        "VoiceActivity", "TremorActivity"
      ) ~ "daily",
      simpleName %in% c("PDQ8", "MDSUPDRS") ~ "monthly",
      simpleName %in% c(
        "HealthKitDataCollector", "HealthKitSleepCollector",
        "HealthKitWorkoutCollector", "locationTracker", "displacement",
        "displacementCollector", "motionActivityCollector"
      ) ~ "continuous",
      simpleName %in% c(
        "study_feedback", "my_thoughts", "MedicationsSurvey",
        "MedicationTracker", "PHQ8", "MoodSurvey", "mythoughts"
      ) ~ "intermittent",
      simpleName %in% c(
        "EnrollmentSurvey", "NonIdentifiableDemographicsTask"
      ) ~ "baseline"
    )
  )
}

# Attach each participant's UTC offset and derive local-time columns.
#
# Downloads the healthCode -> UTC offset table (syn12635875), left-joins it on
# `healthCode`, and then:
#   * createdOnLocalTime = createdOn shifted by `UTC_offset` hours, as text;
#   * createdOnTimeZone  = UTC_offset * 100 (e.g. -8 becomes -800);
#   * createdOn          = converted to text (done last, after the shift).
# The joined `actual_state` and `UTC_offset` columns are dropped afterwards.
#
# engagement: data frame with `healthCode` and a date-time `createdOn`.
# Returns the augmented data frame; records without a matching healthCode in
#   the offset table keep their row but get NA in the derived columns.
mutate_local_time <- function(engagement) {
  offset_file <- synGet("syn12635875")
  utc_offsets <- read_tsv(offset_file$path)
  with_offsets <- left_join(engagement, utc_offsets, by = "healthCode")
  localized <- mutate(
    with_offsets,
    createdOnLocalTime = as.character(createdOn + lubridate::hours(UTC_offset)),
    createdOnTimeZone = UTC_offset * 100,
    createdOn = as.character(createdOn)
  )
  select(localized, -actual_state, -UTC_offset)
}

# Build the curated mPower participant-metadata table.
#
# Steps, in order:
#   1. read the demographics file (syn15624955), add `age_group`, drop the
#      columns that are not needed downstream;
#   2. remove healthCodes whose `dataGroups` mark them as test accounts or as
#      belonging to both the parkinson and control groups;
#   3. flag clinically referred participants (`clinicalReferral`): the
#      ObjectivePD cohort, plus other externally enrolled participants with
#      inferred_diagnosis == TRUE;
#   4. collapse the race, employment and education answers into coarser labels;
#   5. keep only rows whose `gender` is 'Female', 'Male' or
#      'Prefer not to answer'.
#
# Takes no arguments. Downloads several Synapse entities, so it presumably
# needs synLogin() to have been called first -- TODO confirm.
#
# Returns the curated data frame, with `inferred_diagnosis` renamed to
# `caseStatus`, `Enter_State` renamed to `state`, and a constant `study`
# column set to 'mPower'.
curate_mpower_metadata <- function() {

  # cut() with these breaks yields the right-closed bins (17,29], (29,39],
  # (39,49], (49,59], (59,120]; ages outside (17,120] become NA.
  df <- data.table::fread(synGet("syn15624955")$path, data.table = F) %>%
    dplyr::mutate(age_group = cut(age, breaks=c(17,29,39,49, 59, 120)))
  df <- df %>% dplyr::select(-health_history, -healthcare_provider, -last_smoked_timezone,
                      -SourceID, -diagnosis_year, -medication_start_year, -onset_year, -professional_diagnosis,
                      -when_deep_brain_stimulation, -packs_per_day, -years_smoking,
                      -are_caretaker, -deep_brain_stimulation, -medical_usage, -medical_usage_yesterday,
                      -home_usage, -smoked, -surgery, -living_alone_status, -medication_bool,
                      -past_participation, -video_usage, -phone_usage, -recordId)

  # Remove test users: collect every healthCode that has at least one row whose
  # dataGroups contains 'test' or lists both parkinson and control ...
  testHealthCodes <- df %>% dplyr::filter(str_detect(dataGroups, 'test|parkinson;control|control;parkinson')) %>% .$healthCode %>% unique()
  # ... and drop all rows belonging to those healthCodes.
  df <- df %>% dplyr::filter(! healthCode %in% testHealthCodes)

  ## Create a column for individuals that are clinically referred

  ## Cohort 1 - ObjectivePD
  objectivePD_cohort <- data.table::fread(synGet("syn8533708")$path) %>% dplyr::rename(healthCode = healthcode)
  objectivePD_cohort_mdata <- data.table::fread(synGet("syn12555309")$path, data.table = F)
  # Keep only the diagnosis answer per healthCode and rename it to
  # `clinicalDiagnosis`.
  objectivePD_cohort_mdata <- objectivePD_cohort_mdata[, c('healthCode', "Do you have Parkinson disease?")]
  objectivePD_cohort_mdata <- objectivePD_cohort_mdata %>% distinct()
  objectivePD_cohort_mdata['clinicalDiagnosis'] = objectivePD_cohort_mdata$`Do you have Parkinson disease?`
  objectivePD_cohort_mdata$`Do you have Parkinson disease?`  <- NULL
  # Clinical referral status + curated case status. The full outer merge is
  # followed by a filter, so only healthCodes with a recorded diagnosis answer
  # remain; 'Yes' maps to TRUE, any other answer to FALSE.
  objectivePD_cohort <-  merge(objectivePD_cohort, objectivePD_cohort_mdata, all=T, by='healthCode') %>%
    dplyr::filter(!is.na(clinicalDiagnosis)) %>%
    dplyr::select(-id) %>%
    dplyr::mutate(clinicalDiagnosis = ifelse(clinicalDiagnosis == 'Yes', T, F),
                  clinicalReferral = T)
  # Merge with metadata. No `by` is given, so merge() joins on every column
  # name the two tables share; all=T keeps unmatched rows from both sides.
  df <- merge(df,objectivePD_cohort, all=T)
  # For the ObjectivePD cohort, replace inferred_diagnosis with the
  # clinicalDiagnosis taken from the curated data.
  to.replace = !is.na(df$clinicalDiagnosis)
  df$inferred_diagnosis[to.replace] = df$clinicalDiagnosis[to.replace]
  df$clinicalDiagnosis <- NULL


  ## Cohort 2
  ## The clinical referrals is the intersection between that file and the query:
  ###Github ref - https://github.com/Sage-Bionetworks/mhealth-engagement-analysis/issues/6
  otherClinicalReferrals <- synTableQuery("SELECT * FROM syn3420237 where externalId<>'' and dataGroups not like '%test%'")
  otherClinicalReferrals <- otherClinicalReferrals$asDataFrame() %>% .$healthCode %>% unique()
  # Exclude healthCodes already handled as part of the ObjectivePD cohort.
  otherClinicalReferrals <- setdiff(otherClinicalReferrals, objectivePD_cohort$healthCode)
  # Select only those that have inferred diagnosis = T
  otherClinicalReferrals <- df %>%
    filter(healthCode %in% otherClinicalReferrals & inferred_diagnosis == T ) %>% .$healthCode
  df$clinicalReferral[df$healthCode %in% otherClinicalReferrals] = T

  ### Fill F for all others who are not clinical
  df$clinicalReferral[is.na(df$clinicalReferral)] = F

  df <- df %>% dplyr::rename(state = Enter_State) %>%
    dplyr::mutate(state = stringr::str_to_title(state)) %>%
    dplyr::select(-dataGroups) #remove dataGroups - pretty confusing col at this point with so much missing

  # Fix Race. The order is important: each step overwrites the whole value, so
  # an answer that matches several patterns ends up with the label of the
  # earliest matching step.
  torep <- grepl('Latino/Hispanic', df$race)
  df$race[torep] = 'Hispanic/Latinos'
  torep <- grepl('Black or African', df$race)
  df$race[torep] = 'African-American/Black'
  torep <- grepl('Native American', df$race)
  df$race[torep] = 'AIAN'
  torep <- grepl('Asian', df$race)
  df$race[torep] = 'Asian'
  torep <- grepl('White or Caucasian', df$race)
  df$race[torep] = 'Non-Hispanic White'
  torep <- grepl('Other|Middle Eastern|Caribbean', df$race, perl=T)
  df$race[torep] = 'Other'
  torep <- grepl('Mixed', df$race, perl=T)
  df$race[torep] = 'More than one'
  torep <- grepl('Pacific Islander', df$race, perl=T)
  df$race[torep] = 'Native Hawaiian or other Pacific Islander'


  #### Employment (same sequential-overwrite scheme as race; matching is
  #### case-sensitive)
  torep <- grepl('Self-employed|full time|Employment for wages|Military', df$employment)
  df$employment[torep] = 'employed'
  torep <- grepl('Unable to work', df$employment)
  df$employment[torep] = 'Unable to work'
  torep <- grepl('Retired', df$employment)
  df$employment[torep] = 'Retired'
  torep <- grepl('Out of work|no work', df$employment)
  df$employment[torep] = 'unemployed'
  torep <- grepl('student', df$employment)
  df$employment[torep] = 'student'
  torep <- grepl('occasional|part time', df$employment)
  df$employment[torep] = 'part-time'
  df <- df  %>% dplyr::rename(caseStatus = inferred_diagnosis) %>%
    dplyr::mutate(study = 'mPower')

  # Update Education. case_when() uses the first branch that matches; values
  # matching no branch are left unchanged by the final TRUE branch.
  df <- df %>%
    dplyr::mutate(education = case_when(
      grepl('Post graduate|Doctoral Degree|Master', education, perl=T, ignore.case = T)  ~ 'Post graduate',
      education %in%  c('8th grade or less', 'More than 8th grade but did not graduate high school') ~ 'Below High School',
      education %in% c('Some college', 'Graduate of Four Year College', 'Graduate of Two Year College or Technical School', 'Some graduate school') ~ 'College',
      education %in% c('High school graduate or equivalent', 'Some high school') ~ 'High School',
      # Case-insensitive fallbacks for answers not listed verbatim above.
      grepl('College', education, perl=T, ignore.case = T)  ~ 'College',

      grepl('high school', education, perl=T, ignore.case = T)  ~ 'High School',
      TRUE ~ education))


  # Filter out unclear gender (rows with any other value, including NA, are
  # dropped).
  df <- df %>% filter(gender %in% c('Female', 'Male', 'Prefer not to answer'))
  return(df)
}

# Entry point: log in to Synapse, curate the mPower engagement table, write it
# to mpower_engagement_curated.tsv, upload that file and remove the local copy.
#
# The metadata curation (curate_mpower_metadata) and the filter that depends
# on it are commented out below, so only the engagement table is produced.
main <- function() {

  synLogin()

  #Meta Data
  #mpower_metadata <- curate_mpower_metadata()
  #write_csv(mpower_metadata, "mpower_metadata.csv")
  #f_meta <- synapser::File("mpower_metadata.csv", parent="syn19520775")
  #synStore(f_meta, used=list("syn15624955"))
  #unlink("mpower_metadata.csv")

  #mPower engagement
  mpower <- fetch_and_prep() %>%
    mutate_participant_week_day() %>%
    mutate_task_type() %>%
    mutate_task_frequency() %>%
    mutate_local_time()

  ###Keep only those healthCode that have entry in the metaData file
  #mpower <- mpower %>%
  #  dplyr::filter(healthCode %in% .data = mpower_metadata$healthCode)

  # A single name for the local copy, so the write, upload and cleanup steps
  # always refer to the same file.
  output_path <- "mpower_engagement_curated.tsv"
  write_tsv(mpower, output_path)
  f <- synapser::File(output_path, parent = "syn21054482") # previously syn18936398
  # NOTE(review): `used` names syn21054482, which is also the upload parent
  # above, while the data itself is read from syn21213531 (and syn12635875 for
  # the UTC offsets) -- confirm which entities provenance should reference.
  synStore(f, used = list("syn21054482"), # previously syn11488492
           executed = paste0("https://gist.github.com/philerooski/",
                             "e357930a484c3b1190a58c0a75192daf"))
  # Remove the file that was actually written; the earlier cleanup targeted
  # "mpower_engagement.tsv", which is never created, so the output lingered.
  unlink(output_path)

}

main()
	# Remove every (non-hidden) object from the current environment before the
	# definitions below are (re)created.
	rm(list = ls())
	# Set the global stringsAsFactors option to FALSE for the rest of the session.
	options(stringsAsFactors = FALSE)
	library(synapser)
	library(tidyverse)
	library(lubridate)

	# Download the raw mPower engagement table and normalise its timestamps.
	#
	# Reads Synapse entity syn21213531 as TSV, divides `createdOn` by 1000 and
	# converts it with as_datetime() (i.e. the raw value is treated as
	# milliseconds since the Unix epoch), then keeps only the rows stamped before
	# the moment the function runs.
	#
	# Returns the filtered table (visibly). Rows whose `createdOn` is NA are
	# dropped too, because filter() only keeps rows where the condition is TRUE.
	fetch_and_prep <- function() {
	  engagement_file <- synGet("syn21213531") # previously "syn11488492"
	  engagement <- read_tsv(engagement_file$path)
	  cutoff <- lubridate::now()
	  # Return the pipeline result directly; ending on an assignment would only
	  # hand the value back invisibly.
	  engagement %>%
	    mutate(createdOn = as_datetime(createdOn / 1000)) %>%
	    filter(createdOn < cutoff)
	}

	# Add participantWeek and participantDay: the number of whole weeks / days
	# between a record's `createdOn` and the earliest `createdOn` recorded for the
	# same `healthCode`.
	#
	# engagement: data frame with a `healthCode` column and a date-time
	#   `createdOn` column.
	# Returns `engagement` with the integer columns `participantWeek` and
	#   `participantDay` appended; the helper columns used to compute them are
	#   removed again.
	mutate_participant_week_day <- function(engagement) {
	  first_activity <- engagement %>%
	    group_by(healthCode) %>%
	    summarise(first_activity_time = min(createdOn, na.rm = TRUE))
	  engagement <- engagement %>%
	    # Name the key explicitly: without `by`, the join would silently use every
	    # column name the two tables happen to share.
	    inner_join(first_activity, by = "healthCode") %>%
	    mutate(
	      # Despite the name this is a difftime whose unit R picks automatically;
	      # as.duration() below normalises it before the unit conversion.
	      seconds_since_first_activity = createdOn - first_activity_time,
	      participantWeek = as.integer(
	        floor(as.numeric(
	          as.duration(seconds_since_first_activity), "weeks"))),
	      participantDay = as.integer(
	        floor(as.numeric(
	          as.duration(seconds_since_first_activity), "days")))
	    ) %>%
	    select(-first_activity_time, -seconds_since_first_activity)
	  return(engagement)
	}

	# Add a `taskType` column derived from `simpleName`.
	#
	# engagement: data frame with a `simpleName` column.
	# Returns `engagement` with `taskType` set to "active-sensor", "survey" or
	#   "passive-sensor"; a `simpleName` that appears in none of the lists below
	#   gets NA.
	mutate_task_type <- function(engagement) {
	  mutate(
	    engagement,
	    taskType = case_when(
	      simpleName %in% c(
	        "WalkingActivity", "TappingActivity", "MemoryActivity",
	        "VoiceActivity", "TremorActivity"
	      ) ~ "active-sensor",
	      simpleName %in% c(
	        "PDQ8", "MDSUPDRS", "study_feedback", "my_thoughts",
	        "MedicationsSurvey", "MedicationTracker", "PHQ8", "MoodSurvey",
	        "EnrollmentSurvey", "NonIdentifiableDemographicsTask", "mythoughts"
	      ) ~ "survey",
	      simpleName %in% c(
	        "HealthKitDataCollector", "HealthKitSleepCollector",
	        "HealthKitWorkoutCollector", "locationTracker", "displacement",
	        "displacementCollector", "motionActivityCollector"
	      ) ~ "passive-sensor"
	    )
	  )
	}

	# Add a `taskFrequency` column derived from `simpleName`.
	#
	# engagement: data frame with a `simpleName` column.
	# Returns `engagement` with `taskFrequency` set to "daily", "monthly",
	#   "continuous", "intermittent" or "baseline"; a `simpleName` that appears in
	#   none of the lists below gets NA.
	mutate_task_frequency <- function(engagement) {
	  mutate(
	    engagement,
	    taskFrequency = case_when(
	      simpleName %in% c(
	        "WalkingActivity", "TappingActivity", "MemoryActivity",
	        "VoiceActivity", "TremorActivity"
	      ) ~ "daily",
	      simpleName %in% c("PDQ8", "MDSUPDRS") ~ "monthly",
	      simpleName %in% c(
	        "HealthKitDataCollector", "HealthKitSleepCollector",
	        "HealthKitWorkoutCollector", "locationTracker", "displacement",
	        "displacementCollector", "motionActivityCollector"
	      ) ~ "continuous",
	      simpleName %in% c(
	        "study_feedback", "my_thoughts", "MedicationsSurvey",
	        "MedicationTracker", "PHQ8", "MoodSurvey", "mythoughts"
	      ) ~ "intermittent",
	      simpleName %in% c(
	        "EnrollmentSurvey", "NonIdentifiableDemographicsTask"
	      ) ~ "baseline"
	    )
	  )
	}

	# Attach each participant's UTC offset and derive local-time columns.
	#
	# Downloads the healthCode -> UTC offset table (syn12635875), left-joins it on
	# `healthCode`, and then:
	#   * createdOnLocalTime = createdOn shifted by `UTC_offset` hours, as text;
	#   * createdOnTimeZone  = UTC_offset * 100 (e.g. -8 becomes -800);
	#   * createdOn          = converted to text (done last, after the shift).
	# The joined `actual_state` and `UTC_offset` columns are dropped afterwards.
	#
	# engagement: data frame with `healthCode` and a date-time `createdOn`.
	# Returns the augmented data frame; records without a matching healthCode in
	#   the offset table keep their row but get NA in the derived columns.
	mutate_local_time <- function(engagement) {
	  offset_file <- synGet("syn12635875")
	  utc_offsets <- read_tsv(offset_file$path)
	  with_offsets <- left_join(engagement, utc_offsets, by = "healthCode")
	  localized <- mutate(
	    with_offsets,
	    createdOnLocalTime = as.character(createdOn + lubridate::hours(UTC_offset)),
	    createdOnTimeZone = UTC_offset * 100,
	    createdOn = as.character(createdOn)
	  )
	  select(localized, -actual_state, -UTC_offset)
	}

	# Build the curated mPower participant-metadata table.
	#
	# Steps, in order:
	#   1. read the demographics file (syn15624955), add `age_group`, drop the
	#      columns that are not needed downstream;
	#   2. remove healthCodes whose `dataGroups` mark them as test accounts or as
	#      belonging to both the parkinson and control groups;
	#   3. flag clinically referred participants (`clinicalReferral`): the
	#      ObjectivePD cohort, plus other externally enrolled participants with
	#      inferred_diagnosis == TRUE;
	#   4. collapse the race, employment and education answers into coarser labels;
	#   5. keep only rows whose `gender` is 'Female', 'Male' or
	#      'Prefer not to answer'.
	#
	# Takes no arguments. Downloads several Synapse entities, so it presumably
	# needs synLogin() to have been called first -- TODO confirm.
	#
	# Returns the curated data frame, with `inferred_diagnosis` renamed to
	# `caseStatus`, `Enter_State` renamed to `state`, and a constant `study`
	# column set to 'mPower'.
	#
	# All regex alternations below use a plain `|`: a backslash before it is not
	# a valid escape inside an R string literal and stops the file from parsing.
	curate_mpower_metadata <- function() {

	  # cut() with these breaks yields the right-closed bins (17,29], (29,39],
	  # (39,49], (49,59], (59,120]; ages outside (17,120] become NA.
	  df <- data.table::fread(synGet("syn15624955")$path, data.table = F) %>%
	    dplyr::mutate(age_group = cut(age, breaks=c(17,29,39,49, 59, 120)))
	  df <- df %>% dplyr::select(-health_history, -healthcare_provider, -last_smoked_timezone,
	                      -SourceID, -diagnosis_year, -medication_start_year, -onset_year, -professional_diagnosis,
	                      -when_deep_brain_stimulation, -packs_per_day, -years_smoking,
	                      -are_caretaker, -deep_brain_stimulation, -medical_usage, -medical_usage_yesterday,
	                      -home_usage, -smoked, -surgery, -living_alone_status, -medication_bool,
	                      -past_participation, -video_usage, -phone_usage, -recordId)

	  # Remove test users: collect every healthCode that has at least one row whose
	  # dataGroups contains 'test' or lists both parkinson and control ...
	  testHealthCodes <- df %>% dplyr::filter(str_detect(dataGroups, 'test|parkinson;control|control;parkinson')) %>% .$healthCode %>% unique()
	  # ... and drop all rows belonging to those healthCodes.
	  df <- df %>% dplyr::filter(! healthCode %in% testHealthCodes)

	  ## Create a column for individuals that are clinically referred

	  ## Cohort 1 - ObjectivePD
	  objectivePD_cohort <- data.table::fread(synGet("syn8533708")$path) %>% dplyr::rename(healthCode = healthcode)
	  objectivePD_cohort_mdata <- data.table::fread(synGet("syn12555309")$path, data.table = F)
	  # Keep only the diagnosis answer per healthCode and rename it to
	  # `clinicalDiagnosis`.
	  objectivePD_cohort_mdata <- objectivePD_cohort_mdata[, c('healthCode', "Do you have Parkinson disease?")]
	  objectivePD_cohort_mdata <- objectivePD_cohort_mdata %>% distinct()
	  objectivePD_cohort_mdata['clinicalDiagnosis'] = objectivePD_cohort_mdata$`Do you have Parkinson disease?`
	  objectivePD_cohort_mdata$`Do you have Parkinson disease?` <- NULL
	  # Clinical referral status + curated case status. The full outer merge is
	  # followed by a filter, so only healthCodes with a recorded diagnosis answer
	  # remain; 'Yes' maps to TRUE, any other answer to FALSE.
	  objectivePD_cohort <- merge(objectivePD_cohort, objectivePD_cohort_mdata, all=T, by='healthCode') %>%
	    dplyr::filter(!is.na(clinicalDiagnosis)) %>%
	    dplyr::select(-id) %>%
	    dplyr::mutate(clinicalDiagnosis = ifelse(clinicalDiagnosis == 'Yes', T, F),
	                  clinicalReferral = T)
	  # Merge with metadata. No `by` is given, so merge() joins on every column
	  # name the two tables share; all=T keeps unmatched rows from both sides.
	  df <- merge(df,objectivePD_cohort, all=T)
	  # For the ObjectivePD cohort, replace inferred_diagnosis with the
	  # clinicalDiagnosis taken from the curated data.
	  to.replace = !is.na(df$clinicalDiagnosis)
	  df$inferred_diagnosis[to.replace] = df$clinicalDiagnosis[to.replace]
	  df$clinicalDiagnosis <- NULL


	  ## Cohort 2
	  ## The clinical referrals is the intersection between that file and the query:
	  ###Github ref - https://github.com/Sage-Bionetworks/mhealth-engagement-analysis/issues/6
	  otherClinicalReferrals <- synTableQuery("SELECT * FROM syn3420237 where externalId<>'' and dataGroups not like '%test%'")
	  otherClinicalReferrals <- otherClinicalReferrals$asDataFrame() %>% .$healthCode %>% unique()
	  # Exclude healthCodes already handled as part of the ObjectivePD cohort.
	  otherClinicalReferrals <- setdiff(otherClinicalReferrals, objectivePD_cohort$healthCode)
	  # Select only those that have inferred diagnosis = T
	  otherClinicalReferrals <- df %>%
	    filter(healthCode %in% otherClinicalReferrals & inferred_diagnosis == T ) %>% .$healthCode
	  df$clinicalReferral[df$healthCode %in% otherClinicalReferrals] = T

	  ### Fill F for all others who are not clinical
	  df$clinicalReferral[is.na(df$clinicalReferral)] = F

	  df <- df %>% dplyr::rename(state = Enter_State) %>%
	    dplyr::mutate(state = stringr::str_to_title(state)) %>%
	    dplyr::select(-dataGroups) #remove dataGroups - pretty confusing col at this point with so much missing

	  # Fix Race. The order is important: each step overwrites the whole value, so
	  # an answer that matches several patterns ends up with the label of the
	  # earliest matching step.
	  torep <- grepl('Latino/Hispanic', df$race)
	  df$race[torep] = 'Hispanic/Latinos'
	  torep <- grepl('Black or African', df$race)
	  df$race[torep] = 'African-American/Black'
	  torep <- grepl('Native American', df$race)
	  df$race[torep] = 'AIAN'
	  torep <- grepl('Asian', df$race)
	  df$race[torep] = 'Asian'
	  torep <- grepl('White or Caucasian', df$race)
	  df$race[torep] = 'Non-Hispanic White'
	  torep <- grepl('Other|Middle Eastern|Caribbean', df$race, perl=T)
	  df$race[torep] = 'Other'
	  torep <- grepl('Mixed', df$race, perl=T)
	  df$race[torep] = 'More than one'
	  torep <- grepl('Pacific Islander', df$race, perl=T)
	  df$race[torep] = 'Native Hawaiian or other Pacific Islander'


	  #### Employment (same sequential-overwrite scheme as race; matching is
	  #### case-sensitive)
	  torep <- grepl('Self-employed|full time|Employment for wages|Military', df$employment)
	  df$employment[torep] = 'employed'
	  torep <- grepl('Unable to work', df$employment)
	  df$employment[torep] = 'Unable to work'
	  torep <- grepl('Retired', df$employment)
	  df$employment[torep] = 'Retired'
	  torep <- grepl('Out of work|no work', df$employment)
	  df$employment[torep] = 'unemployed'
	  torep <- grepl('student', df$employment)
	  df$employment[torep] = 'student'
	  torep <- grepl('occasional|part time', df$employment)
	  df$employment[torep] = 'part-time'
	  df <- df %>% dplyr::rename(caseStatus = inferred_diagnosis) %>%
	    dplyr::mutate(study = 'mPower')

	  # Update Education. case_when() uses the first branch that matches; values
	  # matching no branch are left unchanged by the final TRUE branch.
	  df <- df %>%
	    dplyr::mutate(education = case_when(
	      grepl('Post graduate|Doctoral Degree|Master', education, perl=T, ignore.case = T) ~ 'Post graduate',
	      education %in% c('8th grade or less', 'More than 8th grade but did not graduate high school') ~ 'Below High School',
	      education %in% c('Some college', 'Graduate of Four Year College', 'Graduate of Two Year College or Technical School', 'Some graduate school') ~ 'College',
	      education %in% c('High school graduate or equivalent', 'Some high school') ~ 'High School',
	      # Case-insensitive fallbacks for answers not listed verbatim above.
	      grepl('College', education, perl=T, ignore.case = T) ~ 'College',

	      grepl('high school', education, perl=T, ignore.case = T) ~ 'High School',
	      TRUE ~ education))


	  # Filter out unclear gender (rows with any other value, including NA, are
	  # dropped).
	  df <- df %>% filter(gender %in% c('Female', 'Male', 'Prefer not to answer'))
	  return(df)
	}

	# Entry point: log in to Synapse, curate the mPower engagement table, write it
	# to mpower_engagement_curated.tsv, upload that file and remove the local copy.
	#
	# The metadata curation (curate_mpower_metadata) and the filter that depends
	# on it are commented out below, so only the engagement table is produced.
	main <- function() {

	  synLogin()

	  #Meta Data
	  #mpower_metadata <- curate_mpower_metadata()
	  #write_csv(mpower_metadata, "mpower_metadata.csv")
	  #f_meta <- synapser::File("mpower_metadata.csv", parent="syn19520775")
	  #synStore(f_meta, used=list("syn15624955"))
	  #unlink("mpower_metadata.csv")

	  #mPower engagement
	  mpower <- fetch_and_prep() %>%
	    mutate_participant_week_day() %>%
	    mutate_task_type() %>%
	    mutate_task_frequency() %>%
	    mutate_local_time()

	  ###Keep only those healthCode that have entry in the metaData file
	  #mpower <- mpower %>%
	  # dplyr::filter(healthCode %in% .data = mpower_metadata$healthCode)

	  # A single name for the local copy, so the write, upload and cleanup steps
	  # always refer to the same file.
	  output_path <- "mpower_engagement_curated.tsv"
	  write_tsv(mpower, output_path)
	  f <- synapser::File(output_path, parent = "syn21054482") # previously syn18936398
	  # NOTE(review): `used` names syn21054482, which is also the upload parent
	  # above, while the data itself is read from syn21213531 (and syn12635875 for
	  # the UTC offsets) -- confirm which entities provenance should reference.
	  synStore(f, used = list("syn21054482"), # previously syn11488492
	           executed = paste0("https://gist.github.com/philerooski/",
	                             "e357930a484c3b1190a58c0a75192daf"))
	  # Remove the file that was actually written; the earlier cleanup targeted
	  # "mpower_engagement.tsv", which is never created, so the output lingered.
	  unlink(output_path)

	}

	main()