Last active
February 5, 2018 19:03
-
-
Save philerooski/dfb733cb2b02c1b34ee8a251309846e9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "JourneyPRO - on-going summary" | |
output: | |
html_document: default | |
html_notebook: default | |
pdf_document: default | |
date: "`r format(Sys.time(), '%d %B, %Y')`" | |
--- | |
```{r global_options, include=FALSE} | |
rm(list=ls()) | |
knitr::opts_chunk$set(fig.path='Figs/', echo=FALSE, warning=FALSE, message=FALSE) | |
``` | |
```{r} | |
Sys.setenv(TZ='GMT') | |
library(synapseClient) | |
synapseLogin() | |
library(install.load) | |
install_load(c('plyr', 'tidyverse', 'data.table', 'lubridate', 'ggpubr', 'zoo')) | |
install_load('ggthemes', 'gridExtra', 'devtools') | |
``` | |
```{r} | |
stringfy <- function(x){ | |
gsub('[\\[\"\\]]','',x, perl=T) | |
} | |
# TODO: When is journeyPRO start date? | |
START_DATE = lubridate::ymd("2017-07-28") | |
DAYS_SINCE_LAUNCH = as.numeric(Sys.Date() - START_DATE) | |
WEEKS_SINCE_LAUNCH = ( (DAYS_SINCE_LAUNCH - 1) %/% 7) + 1 | |
``` | |
```{r} | |
masterTable <- "syn10232190" #poorly named journeyPRO-appVersion | |
#remove all testing users and records created before launch date (see `DAYS_SINCE_LAUNCH`) 12:00:00 AM (GMT) | |
userActivity <- synTableQuery(paste("select * from", masterTable, "WHERE dataGroups NOT LIKE '%test%' AND createdOn >=", as.integer(as.POSIXct(START_DATE), tz="UTC"))) | |
userActivity <- userActivity@values | |
healthSurveyId <- "syn10278768" | |
healthSurvey <- synTableQuery(paste("select * from", healthSurveyId))@values | |
#Data Manipulation | |
userActivity <- userActivity %>% dplyr::mutate(createdOnTimeZone = as.numeric(createdOnTimeZone)/100, | |
createdOn = lubridate::ymd_hms(createdOn, tz='UTC')) | |
healthSurvey <- healthSurvey %>% dplyr::mutate(createdOnTimeZone = as.numeric(createdOnTimeZone)/100, | |
createdOn = lubridate::ymd_hms(createdOn, tz='UTC')) | |
#get start dates for people | |
userStartDates <- userActivity %>% dplyr::group_by(healthCode) %>% | |
dplyr::summarise(participantStartDate = min(lubridate::date(createdOn))) | |
userActivity <- userActivity %>% inner_join(userStartDates) | |
healthSurvey <- healthSurvey %>% inner_join(userStartDates) | |
userActivity <- userActivity %>% | |
dplyr::mutate(createdOn_localTime = createdOn + lubridate::hours(createdOnTimeZone), | |
participant_day = as.numeric(lubridate::date(createdOn) - participantStartDate ) + 1, | |
participant_week = ((participant_day - 1) %/% 7 ) + 1, | |
study_day = as.numeric(lubridate::date(createdOn) - START_DATE) + 1, | |
study_week = ((study_day - 1) %/% 7 ) + 1, | |
currentDate = Sys.Date() | |
) | |
healthSurvey <- healthSurvey %>% | |
dplyr::mutate(createdOn_localTime = createdOn + lubridate::hours(createdOnTimeZone), | |
participant_day = as.numeric(lubridate::date(createdOn) - participantStartDate ) + 1, | |
participant_week = ((participant_day - 1) %/% 7 ) + 1, | |
study_day = as.numeric(lubridate::date(createdOn) - START_DATE) + 1, | |
study_week = ((study_day - 1) %/% 7 ) + 1, | |
currentDate = Sys.Date() | |
) | |
``` | |
```{r} | |
STUDY_HEALTHCODES <- unique(userActivity$healthCode) | |
total_journeyPRO_users <- n_distinct(userActivity$healthCode) | |
num_BT_patients <- userActivity %>% dplyr::filter(dataGroups %like% "beta_thalassemia") %>% .$healthCode %>% n_distinct() | |
num_MP_patients <- userActivity %>% dplyr::filter(dataGroups %like% "myelodysplastic_syndrome") %>% .$healthCode %>% n_distinct() | |
num_MF_patients <- userActivity %>% dplyr::filter(dataGroups %like% "myelofibrosis") %>% .$healthCode %>% n_distinct() | |
#patients_dataContrib <- round(table(userActivity$dataGroups) / nrow(userActivity) * 100, digits=2) | |
``` | |
---------- | |
#### Summary stats | |
* Weeks since launch (Jul 29, 2017): `r WEEKS_SINCE_LAUNCH` | |
* Total users: `r total_journeyPRO_users` | |
* Beta Thalassemia patients: `r num_BT_patients` | |
* Myelodysplastic Syndrome patients: `r num_MP_patients` | |
* Myelofibrosis patients: `r num_MF_patients` | |
--------- | |
#### Errors Detected | |
```{r} | |
records_with_wrong_createdOn_dates <- sum(lubridate::date(lubridate::ymd_hms(userActivity$createdOn)) > Sys.Date()) | |
``` | |
```{r} | |
#filter bad data | |
userActivity <- userActivity %>% dplyr::filter(lubridate::ymd_hms(createdOn) <= Sys.Date()) | |
``` | |
* Records with malformed timestamp - `r records_with_wrong_createdOn_dates` | |
------- | |
### Enrollment summary | |
```{r} | |
enrollment <- userActivity %>% select(healthCode, study_week, dataGroups) | |
enrollment <- enrollment[!duplicated(enrollment),] | |
enrollment <- plyr::ldply(1:max(enrollment$study_week), function(x){ | |
untilNow <- enrollment %>% filter(study_week < x) %>% .$healthCode %>% unique() | |
thisWeek <- enrollment %>% filter(study_week == x) %>% .$healthCode %>% unique() | |
newUsers_thisweek <- setdiff(thisWeek, untilNow) | |
res <- enrollment %>% select(healthCode, dataGroups) | |
res <- res[!duplicated(res),] | |
res <- res %>% filter(healthCode %in% newUsers_thisweek) | |
data.frame('week'=x, | |
"beta_thalassemia_patients" = res %>% | |
filter(dataGroups == "beta_thalassemia") %>% nrow(), | |
"myelodysplastic_syndrome_patients" = res %>% | |
filter(dataGroups == "myelodysplastic_syndrome") %>% nrow(), | |
"myelofibrosis_patients" = res %>% | |
filter(dataGroups == "myelofibrosis") %>% nrow()) %>% | |
gather(type, num, 2:4) | |
}) | |
enrollment <- enrollment %>% mutate(type = factor(type)) | |
p1 <- ggplot(data=enrollment, aes(x=week, y=num, fill=type )) + geom_bar(stat="identity", position="dodge", width = .7) + theme_bw() + scale_x_discrete(limits=seq(1,max(enrollment$week),2)) + ylab('People enrolled') + xlab('Weeks since launch') | |
p1 <- p1 + theme_few() + scale_fill_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"), labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis")) + theme(legend.position = c(0.82, 0.9)) | |
p1 <- p1 + theme(text = element_text(size=15)) + ggtitle('Users joining per week') | |
p1 <- p1 + geom_vline(xintercept=WEEKS_SINCE_LAUNCH, colour="grey40", size=1, linetype=3) | |
``` | |
```{r} | |
#People active each week since study start | |
activeEachWeek <- userActivity %>% dplyr::group_by(dataGroups, study_week) %>% | |
dplyr::summarise(uniqUsers = n_distinct(healthCode)) | |
p2 <- ggplot(data=activeEachWeek, aes(x=study_week, y=uniqUsers, fill=dataGroups )) + geom_bar(stat="identity", width = .5) + theme_bw() + scale_x_discrete(limits=seq(1,max(activeEachWeek$study_week),2)) | |
p2 <- p2 + labs(col="User type\n") + scale_fill_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"), labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis")) | |
p2 <- p2 + theme_few() + theme(legend.position = c(0.2, 0.82)) | |
p2 <- p2 + theme(text = element_text(size=15)) + ggtitle('Users active each week') + ylab('People') + xlab('Weeks since launch') | |
p2 <- p2 + geom_vline(xintercept=WEEKS_SINCE_LAUNCH, colour="grey40", size=1, linetype=3) | |
``` | |
```{r,fig.width=15, fig.height=4} | |
grid.arrange(p1,p2, ncol=2) | |
``` | |
------------- | |
### Compliance | |
```{r, fig.width=7, fig.height=5, fig.align='center'} | |
### This is mainly to figure out the #users that have been in study for atleast #weeks irrespective of the data contributed | |
user_time_in_study <- userActivity %>% dplyr::group_by(healthCode) %>% | |
dplyr::mutate(currentDate = Sys.Date(), | |
days_in_study = as.numeric(currentDate - participantStartDate) + 1, | |
weeks_in_study = ((days_in_study - 1 ) %/% 7 ) + 1, | |
current_days_in_study = as.numeric(lubridate::as_date(createdOn) - participantStartDate) + 1, | |
current_weeks_in_study = ((current_days_in_study - 1 ) %/% 7 ) + 1) | |
userWeeks <- user_time_in_study %>% dplyr::group_by(weeks_in_study) %>% | |
dplyr::summarise(numUsers = n_distinct(healthCode)) | |
userWeeks <- userWeeks %>% ddply(.variables='weeks_in_study', | |
.fun = function(x){ | |
tmp <- userWeeks %>% dplyr::filter(weeks_in_study >= unique(x$weeks_in_study)) | |
data.frame('totalUsersReachedThisWeek' = sum(tmp$numUsers)) | |
}) | |
allWeeks <- data.frame(weeks_in_study = 1:max(userActivity$study_week)) | |
userWeeks <- join(allWeeks, userWeeks) | |
userWeeks <- na.locf(userWeeks, fromLast = TRUE) | |
usersWhoMadeItThisFar <- user_time_in_study %>% select(healthCode, study_week) | |
usersWhoMadeItThisFar <- plyr::ldply(1:max(usersWhoMadeItThisFar$study_week), function(w) { | |
untilNow <- usersWhoMadeItThisFar %>% filter(study_week >= w) %>% .$healthCode %>% unique() %>% length() | |
data.frame('current_weeks_in_study' = w, 'numUsersWhoMadeItThisFar' = untilNow)}) | |
usersWhoActuallyDidSomethingThisWeek <- user_time_in_study %>% group_by(current_weeks_in_study) %>% | |
summarise(numUsersWhoDidSomething = n_distinct(healthCode)) | |
usersWhoActuallyDidSomethingThisWeek <- plyr::rename(usersWhoActuallyDidSomethingThisWeek, replace=c("current_weeks_in_study" = "weeks_in_study")) | |
specificUserWeeks <- user_time_in_study %>% select(healthCode, weeks_in_study) %>% unique() | |
specificUserWeeks <- ddply(specificUserWeeks, .variables=.(healthCode), .fun=function(r) { | |
data.frame("healthCode" = r$healthCode, "participant_week" = 1:r$weeks_in_study) | |
}) | |
overallCompliance <- join(usersWhoActuallyDidSomethingThisWeek, userWeeks) | |
overallCompliance <- overallCompliance %>% mutate(percent = numUsersWhoDidSomething / totalUsersReachedThisWeek) | |
overallCompliance$percent <- round(overallCompliance$percent * 100, digits=2) | |
#Since the study is still going on we are not dividing by the all users rather number of unique users in each week of the study | |
p3 <- ggplot(data=overallCompliance, aes(x=weeks_in_study, y=percent )) + geom_bar(stat="identity", position="dodge", width = .4) + theme_bw() + scale_x_discrete(limits=seq(1,max(overallCompliance$weeks_in_study),2)) + ggtitle('Percent users active by time in study') | |
p3 <- p3 + theme_few() + scale_fill_manual(values=c('#00AFBB')) + theme(text = element_text(size=15)) | |
p3 <- p3 + xlab('Weeks enrolled') + theme(text = element_text(size=15)) | |
``` | |
```{r} | |
### CURRENTLY NOT PLOTTED ### | |
# pro_surveys <- userActivity %>% filter( ( originalTable %like% 'Schedule' | originalTable %like% 'Challenge-v6')) | |
# pro_surveys_compliance <- pro_surveys %>% group_by(participant_week, originalTable, healthCode) %>% | |
# summarize(numRecords = n())#dplyr::summarise(num = n_distinct(healthCode)) | |
# pro_surveys_compliance <- merge(pro_surveys_compliance, userWeeks, by.x='participant_week', by.y='weeks_in_study') | |
# pro_surveys_compliance <- pro_surveys_compliance %>% dplyr::mutate(percent = round((num/totalUsersReachedThisWeek) * 100, digits=2)) | |
# pro_surveys_compliance['Survey'] = pro_surveys_compliance$originalTable | |
# pro_surveys_compliance$originalTable <- NULL | |
# p4 <- ggplot(data=pro_surveys_compliance, aes(x=participant_week, y=percent, color=Survey)) + geom_point() + geom_line(size=1.5, alpha=0.6) | |
# p4 <- p4 + theme_few() + scale_color_manual(values=c( "#2EC4B6", "#E71D36", "#FF9F1C", '#95C623', '#42c5f4')) | |
# p4 <- p4 + ggtitle('Percent Surveys Completed') + xlab('Weeks enrolled') + theme(text = element_text(size=15)) | |
``` | |
```{r} | |
activeSensor_tasks <- userActivity %>% | |
filter( (originalTable %like% 'Visual' | originalTable %like% 'Reaction' | originalTable %like% 'Attention' | originalTable %like% 'Challenge-v6')) | |
allSensorCombinations <- expand.grid( | |
healthCode = unique(userActivity$healthCode), | |
originalTable = unique(activeSensor_tasks$originalTable)) | |
allValidSensorCombinations <- join(specificUserWeeks, allSensorCombinations) | |
activeSensor_tasks_compliance <- activeSensor_tasks %>% group_by(participant_week, originalTable, healthCode) %>% | |
dplyr::summarise(numRecords = n()) | |
activeSensor_tasks_compliance <- join(allValidSensorCombinations, activeSensor_tasks_compliance) | |
activeSensor_tasks_compliance$numRecords[is.na(activeSensor_tasks_compliance$numRecords)] <- 0 | |
activeSensor_tasks_compliance <- activeSensor_tasks_compliance %>% group_by(participant_week, originalTable) %>% | |
summarise(percent = round((sum(numRecords) / n()) * 100, digits=2)) | |
#activeSensor_tasks_compliance <- activeSensor_tasks_compliance %>% dplyr::mutate(percent = round((numRecords / 7) * 100, digits=2)) | |
activeSensor_tasks_compliance['sensorActivity'] = activeSensor_tasks_compliance$originalTable | |
activeSensor_tasks_compliance$originalTable <- NULL | |
activeSensor_tasks_compliance['sensorActivity'] = gsub(' Activity.*', '',activeSensor_tasks_compliance$`sensorActivity`, perl=T) | |
activeSensor_tasks_compliance['sensorActivity'] = gsub('-.*', '',activeSensor_tasks_compliance$`sensorActivity`, perl=T) | |
p5 <- ggplot(data=activeSensor_tasks_compliance, aes(x=participant_week, y=percent, color=sensorActivity )) + geom_point() + geom_line(size=1.5, alpha=0.6) | |
p5 <- p5 + theme_few() + scale_color_manual(values=c( "#2EC4B6", "#E71D36", "#FF9F1C", "#011627")) | |
p5 <- p5 + ggtitle('Percent active (non-survey) tasks completed') + xlab('Weeks enrolled') + theme(text = element_text(size=15)) | |
``` | |
```{r} | |
otherShortSurveys <- userActivity %>% | |
filter( (originalTable == 'Check-In-v2' | originalTable %like% 'Lab Details' | originalTable %like% "Transfusion Details")) | |
allSurveyCombinations <- expand.grid( | |
healthCode = unique(userActivity$healthCode), | |
originalTable = unique(otherShortSurveys$originalTable)) | |
allValidSurveyCombinations <- join(specificUserWeeks, allSurveyCombinations) | |
otherShortSurveys_compliance <- otherShortSurveys %>% group_by(participant_week, originalTable, healthCode) %>% | |
dplyr::summarise(numRecords = n()) | |
otherShortSurveys_compliance <- join(allValidSurveyCombinations, otherShortSurveys_compliance) | |
otherShortSurveys_compliance$numRecords[is.na(otherShortSurveys_compliance$numRecords)] <- 0 | |
otherShortSurveys_compliance <- otherShortSurveys_compliance %>% group_by(participant_week, originalTable) %>% | |
dplyr::summarise(percent = round((sum(numRecords)/(7*n())) * 100, digits=2)) | |
otherShortSurveys_compliance['otherSurveys'] = otherShortSurveys_compliance$originalTable | |
otherShortSurveys_compliance$originalTable <- NULL | |
otherShortSurveys_compliance['otherSurveys'] = gsub('-v\\d', '',otherShortSurveys_compliance$`otherSurveys`, perl=T) | |
p6 <- ggplot(data=otherShortSurveys_compliance, aes(x=participant_week, y=percent, color=otherSurveys )) + geom_point() + geom_line(size=1.5, alpha=0.6) | |
p6 <- p6 + theme_few() + scale_color_manual(values=c( "#2EC4B6", "#E71D36", "#FF9F1C", '#011627')) | |
p6 <- p6 + ggtitle('Other Short Surveys Completed') + xlab('Weeks enrolled') + theme(text = element_text(size=15)) | |
``` | |
```{r} | |
sf12Compliance <- healthSurvey %>% group_by(participant_week) %>% dplyr::summarise(num = n_distinct(healthCode)) | |
sf12Compliance <- merge(sf12Compliance, userWeeks, by.x = 'participant_week', by.y='weeks_in_study') | |
sf12Compliance <- sf12Compliance %>% dplyr::mutate(percent = round((num/totalUsersReachedThisWeek) * 100, digits=2)) | |
p7 <- ggplot(data=sf12Compliance, aes(x=participant_week, y=percent)) + geom_point() + geom_line(size=1.5) + theme_few() + | |
ggtitle('Weekly SF-12 Surveys Completed') + xlab('Weeks enrolled') + theme(text = element_text(size=15)) | |
``` | |
```{r,fig.width=15, fig.height=8} | |
grid.arrange(p3,p5,p6,p7, ncol=2) | |
``` | |
------------- | |
```{r} | |
checkInId <- "syn10232192" | |
checkIn <- synTableQuery(paste("select dataGroups, mood_sleep, mood_pain, mood_health, mood_tired from", checkInId))@values | |
checkIn <- checkIn %>% na.omit() %>% filter(dataGroups %in% unique(userActivity$dataGroups)) | |
checkInGraph <- checkIn %>% gather(Category, Score, -dataGroups) %>% group_by(Category, dataGroups) %>% | |
summarise(val=mean(as.numeric(Score))) %>% | |
ggplot(aes(Category, val, fill=dataGroups)) + | |
geom_bar(stat="identity", position="dodge", width = .7) + | |
theme_few() + scale_fill_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"), | |
labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis")) + | |
ylab("Mean Score (out of 5)") + ggtitle("Daily Check-In") | |
``` | |
```{r} | |
labDetailsId <- "syn10245386" | |
labDetails <- synTableQuery(paste("select dataGroups, hgb, rbc, wbc, hct, neutrophils, platelets, ferritin from", labDetailsId))@values | |
labDetails <- labDetails %>% filter(dataGroups %in% unique(userActivity$dataGroups)) | |
labDetails2 <- labDetails %>% select(dataGroups, hgb, rbc, wbc, hct, neutrophils) | |
labDetails <- labDetails %>% select(dataGroups, platelets, ferritin) | |
labDetails2 <- labDetails2[rowSums(is.na(labDetails2 %>% select(-dataGroups))) != ncol(labDetails2)-1,] | |
labDetails <- labDetails[rowSums(is.na(labDetails %>% select(-dataGroups))) != ncol(labDetails)-1,] | |
labDetailsGraph <- labDetails %>% gather(Category, Value, -dataGroups) %>% | |
group_by(Category) %>% | |
ggplot(aes(Category, Value, color=dataGroups)) + geom_crossbar(ymin=0, ymax=2000) + | |
theme_few() + scale_color_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"), | |
labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis")) + | |
ggtitle("Lab Details (2)") + ylim(NA, 2000) | |
labDetailsGraph2 <- labDetails2 %>% gather(Category, Value, -dataGroups) %>% | |
group_by(Category) %>% | |
ggplot(aes(Category, Value, color=dataGroups)) + geom_crossbar(ymin=0, ymax=55) + | |
theme_few() + scale_color_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"), | |
labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis")) + | |
ggtitle("Lab Details (1)") + ylim(NA, 55) | |
``` | |
```{r} | |
transfusionId <- "syn10245387" | |
transfusion <- synTableQuery(paste("select dataGroups, unitsBlood, infusionRate from", transfusionId))@values | |
transfusion <- transfusion %>% filter(dataGroups %in% unique(userActivity$dataGroups)) %>% na.omit() | |
transfusion <- transfusion[2:nrow(transfusion),] # drop outlier | |
transfusionGraph <- transfusion %>% ggplot(aes(unitsBlood, infusionRate)) + | |
geom_point(alpha=0.33, size=8, color="#ff5e5e") + theme_few() + | |
ggtitle("Transfusion Details") | |
``` | |
```{r,fig.width=15, fig.height=10} | |
grid.arrange(checkInGraph,transfusionGraph,labDetailsGraph2,labDetailsGraph, ncol=2) | |
``` | |
------------- | |
### Demographics Summary | |
```{r} | |
dem_survey_id <- "syn10232193" | |
demog <- synTableQuery(paste("select * from", dem_survey_id))@values | |
demog$metadata.json.dataGroups <- NULL | |
colnames(demog) <- gsub('.json.answer', '',colnames(demog)) | |
colnames(demog) <- gsub('metadata.json.', '',colnames(demog)) | |
demog <- demog %>% dplyr::select(-appVersion, -phoneInfo, -validationErrors, -createdOn, -createdOnTimeZone, -recordId, -uploadDate, -externalId, -birthYear_unit, -ethnicity) %>% | |
filter(dataGroups %in% c("beta_thalassemia", "myelodysplastic_syndrome", "myelofibrosis") & healthCode %in% STUDY_HEALTHCODES) #%>% | |
#dplyr::mutate(createdOn = as.Date(lubridate::ymd_hms(startDate))) | |
demog <- ddply(.data = demog, .variables = c('healthCode')) | |
#race <- demog %>% tidyr::gather(race, value, 2:4) %>% dplyr::filter(value == 'TRUE') %>% | |
# select(healthCode, race) %>% group_by(healthCode) %>% | |
# dplyr::summarise(race = paste(race, collapse=',')) | |
#demog <- merge(demog, race) #%>% select(-c(7:16)) | |
profile_id <- "syn10235462" | |
profiles <- synTableQuery(paste("select * from", profile_id))@values | |
profiles <- profiles %>% dplyr::filter(healthCode %in% STUDY_HEALTHCODES) %>% | |
dplyr::transmute(healthCode = healthCode, | |
age = demographics.age, | |
gender = demographics.gender, height = demographics.height, | |
weight = demographics.weight, ethnicity = demographics.ethnicity, | |
yearDiagnosed = demographics.yearDiagnosed, | |
userSharingScope=NA) %>% | |
dplyr::group_by(healthCode) %>% | |
dplyr::summarise( | |
age = unique(age)[1], gender = unique(gender)[1], height = unique(height)[1], | |
weight = unique(weight)[1], | |
userSharingScope=unique(userSharingScope)[1]) | |
#Add age to demog | |
demog <- merge(demog, profiles %>% dplyr::select(healthCode, age, gender, height, weight), all.x=T) | |
demog_derived_from_profiles <- profiles %>% dplyr::filter(!healthCode %in% demog$healthCode) | |
dataGroups <- userActivity %>% select(healthCode, dataGroups) | |
dataGroups <- dataGroups[!duplicated(dataGroups),] | |
demog_derived_from_profiles <- merge(demog_derived_from_profiles , dataGroups, all.x=T) | |
demog_derived_from_profiles <- demog_derived_from_profiles %>% | |
dplyr::select(healthCode, age, dataGroups, userSharingScope, gender, height, | |
weight) | |
tmp_df <- data.frame(dataGroups = c("beta_thalassemia", "myelodysplastic_syndrome", "myelofibrosis"), | |
num = c(num_BT_patients, num_MP_patients, num_MF_patients), stringsAsFactors = F) | |
percent_demog_data_compliance = round( (n_distinct(demog$healthCode)/total_journeyPRO_users) *100, digits=2) | |
percent_demog_data_from_profiles_compliance = round( (n_distinct(demog_derived_from_profiles$healthCode)/total_journeyPRO_users) *100, digits=2) | |
demog_total <- plyr::rbind.fill(demog, demog_derived_from_profiles) | |
``` | |
* Demographics data available for : `r n_distinct(demog_total$healthCode)` individuals | `r percent_demog_data_compliance` % | |
* Demographics data obtained from profiles : `r n_distinct(demog_derived_from_profiles$healthCode)` individuals | `r percent_demog_data_from_profiles_compliance` % | |
```{r, align='center', fig.width=5, fig.height=5} | |
missingData <- apply(demog_total,2, function(x){ | |
round( (sum(is.na(x) | x == "") / nrow(demog_total)) *100, digits=2) | |
})[c("age", "gender", "race")] | |
tmp_df <- data.frame(col = names(missingData), percentMissing = as.numeric(missingData)) %>% arrange(percentMissing) | |
missingGraph <- ggplot(data=tmp_df, aes(x=col, y=percentMissing)) + geom_bar(stat="identity") + theme_few() + coord_flip() + scale_y_continuous(limits=c(0,100)) + xlab('Demographic Variable') + theme(text = element_text(size=15)) + ylab('Percent') + ggtitle("Percent Missing") | |
``` | |
-------- | |
```{r} | |
tmp_df <- data.frame(dataGroups = c("beta_thalassemia", "myelodysplastic_syndrome", "myelofibrosis"), | |
num = c(num_BT_patients, num_MP_patients, num_MF_patients), stringsAsFactors = F) | |
gender = demog_total %>% filter(gender %in% c('Female', 'Male')) %>% inner_join(tmp_df) %>% dplyr::group_by(dataGroups, gender, num) %>% dplyr::summarise(prop =n_distinct(healthCode)) %>% mutate(prop = round((prop / num)*100, digits=2)) | |
p2 <- ggplot(data=gender, aes(x=gender, y=prop, fill=dataGroups )) + geom_bar(stat="identity", position="dodge", width = .7) + theme_bw() + ylab("Percent") | |
p2 <- p2 + theme_few() + scale_fill_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"), labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis")) + theme(legend.position = c(0.3, 0.85)) | |
genderGraph <- p2 + theme(text = element_text(size=15)) + ggtitle('Gender') + xlab("Gender") | |
``` | |
```{r} | |
p3 <- ggplot(data=demog_total %>% filter(!is.na(gender)), aes(x=gender, y=age, fill=dataGroups )) + geom_boxplot(width = .7) + theme_bw() + ylab('Age') + xlab('Gender') | |
p3 <- p3 + theme_few() + scale_fill_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"), labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis")) + theme(legend.position = 'none') | |
ageGraph <- p3 + theme(text = element_text(size=15)) + ggtitle('Age') | |
``` | |
```{r} | |
race = demog_total %>% filter(!is.na(race)) %>% dplyr::group_by(dataGroups, race) %>% dplyr::summarise(prop =n_distinct(healthCode)) | |
#levels <- race %>% filter(dataGroups %in% unique(userActivity$dataGroups)) %>% arrange(desc(prop)) %>% .$race | |
#race$race <- rev(levels) | |
p5 <- ggplot(data=race, aes(x=race, y=prop, fill=dataGroups)) + geom_bar(stat="identity", width = .7) + theme_bw() + ylab('Number') | |
p5 <- p5 + theme_few() + scale_fill_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"), labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis")) + theme(legend.position = c(0.6, 0.5)) | |
raceGraph <- p5 + theme(text = element_text(size=15)) + ggtitle('Race') + coord_flip() + xlab('') | |
``` | |
```{r,fig.width=15, fig.height=10} | |
grid.arrange(genderGraph,ageGraph,raceGraph,missingGraph, ncol=2) | |
``` | |
```{r, fig.width=15, fig.height=10} | |
if (!require("lbscorer",character.only = TRUE)) { | |
install_github("philerooski/lbscorer", silent=TRUE) | |
} | |
library(lbscorer) | |
questionCols <- names(healthSurvey)[12:23] | |
sf <- healthSurvey[c("recordId", "healthCode", "dataGroups", questionCols)] %>% filter(dataGroups %in% unique(userActivity$dataGroups)) %>% na.omit() | |
sfScores <- sf12(sf[questionCols]) | |
names(sfScores) <- c("Physical Function", "Role Physical", "Bodily Pain", "General Health", "Vitality", "Social Function", "Role Emotional", "Mental Health", "Physical Summary", "Mental Summary") | |
df <- cbind(sf[c("recordId", "healthCode", "dataGroups")], sfScores) | |
dfG <- df %>% gather(cols, value, -dataGroups, -recordId, -healthCode) | |
dfGBS <- dfG %>% filter(dataGroups == "beta_thalassemia") | |
dfGMS <- dfG %>% filter(dataGroups == "myelodysplastic_syndrome") | |
dfGM <- dfG %>% filter(dataGroups == "myelofibrosis") | |
par(cex.axis=0.8) | |
boxplot(sfScores, main="SF12-v2 Scores", xlab="Category", ylab="Score") | |
stripchart(value ~ cols, vertical = TRUE, data = dfGBS, | |
method = "jitter", add = TRUE, pch = 20, col = "#A5BE0050") | |
stripchart(value ~ cols, vertical = TRUE, data = dfGMS, | |
method = "jitter", add = TRUE, pch = 20, col = "#4286f450") | |
stripchart(value ~ cols, vertical = TRUE, data = dfGMS, | |
method = "jitter", add = TRUE, pch = 20, col = "#FC4E0750") | |
``` | |
-------- |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment