Skip to content

Instantly share code, notes, and snippets.

@philerooski
Last active February 5, 2018 19:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save philerooski/dfb733cb2b02c1b34ee8a251309846e9 to your computer and use it in GitHub Desktop.
Save philerooski/dfb733cb2b02c1b34ee8a251309846e9 to your computer and use it in GitHub Desktop.
---
title: "JourneyPRO - on-going summary"
output:
html_document: default
html_notebook: default
pdf_document: default
date: "`r format(Sys.time(), '%d %B, %Y')`"
---
```{r global_options, include=FALSE}
rm(list=ls())
knitr::opts_chunk$set(fig.path='Figs/', echo=FALSE, warning=FALSE, message=FALSE)
```
```{r}
Sys.setenv(TZ='GMT')
library(synapseClient)
synapseLogin()
library(install.load)
install_load(c('plyr', 'tidyverse', 'data.table', 'lubridate', 'ggpubr', 'zoo'))
install_load('ggthemes', 'gridExtra', 'devtools')
```
```{r}
stringfy <- function(x){
gsub('[\\[\"\\]]','',x, perl=T)
}
# TODO: When is journeyPRO start date?
START_DATE = lubridate::ymd("2017-07-28")
DAYS_SINCE_LAUNCH = as.numeric(Sys.Date() - START_DATE)
WEEKS_SINCE_LAUNCH = ( (DAYS_SINCE_LAUNCH - 1) %/% 7) + 1
```
```{r}
masterTable <- "syn10232190" #poorly named journeyPRO-appVersion
#remove all testing users and records created before launch date (see `DAYS_SINCE_LAUNCH`) 12:00:00 AM (GMT)
userActivity <- synTableQuery(paste("select * from", masterTable, "WHERE dataGroups NOT LIKE '%test%' AND createdOn >=", as.integer(as.POSIXct(START_DATE), tz="UTC")))
userActivity <- userActivity@values
healthSurveyId <- "syn10278768"
healthSurvey <- synTableQuery(paste("select * from", healthSurveyId))@values
#Data Manipulation
userActivity <- userActivity %>% dplyr::mutate(createdOnTimeZone = as.numeric(createdOnTimeZone)/100,
createdOn = lubridate::ymd_hms(createdOn, tz='UTC'))
healthSurvey <- healthSurvey %>% dplyr::mutate(createdOnTimeZone = as.numeric(createdOnTimeZone)/100,
createdOn = lubridate::ymd_hms(createdOn, tz='UTC'))
#get start dates for people
userStartDates <- userActivity %>% dplyr::group_by(healthCode) %>%
dplyr::summarise(participantStartDate = min(lubridate::date(createdOn)))
userActivity <- userActivity %>% inner_join(userStartDates)
healthSurvey <- healthSurvey %>% inner_join(userStartDates)
userActivity <- userActivity %>%
dplyr::mutate(createdOn_localTime = createdOn + lubridate::hours(createdOnTimeZone),
participant_day = as.numeric(lubridate::date(createdOn) - participantStartDate ) + 1,
participant_week = ((participant_day - 1) %/% 7 ) + 1,
study_day = as.numeric(lubridate::date(createdOn) - START_DATE) + 1,
study_week = ((study_day - 1) %/% 7 ) + 1,
currentDate = Sys.Date()
)
healthSurvey <- healthSurvey %>%
dplyr::mutate(createdOn_localTime = createdOn + lubridate::hours(createdOnTimeZone),
participant_day = as.numeric(lubridate::date(createdOn) - participantStartDate ) + 1,
participant_week = ((participant_day - 1) %/% 7 ) + 1,
study_day = as.numeric(lubridate::date(createdOn) - START_DATE) + 1,
study_week = ((study_day - 1) %/% 7 ) + 1,
currentDate = Sys.Date()
)
```
```{r}
STUDY_HEALTHCODES <- unique(userActivity$healthCode)
total_journeyPRO_users <- n_distinct(userActivity$healthCode)
num_BT_patients <- userActivity %>% dplyr::filter(dataGroups %like% "beta_thalassemia") %>% .$healthCode %>% n_distinct()
num_MP_patients <- userActivity %>% dplyr::filter(dataGroups %like% "myelodysplastic_syndrome") %>% .$healthCode %>% n_distinct()
num_MF_patients <- userActivity %>% dplyr::filter(dataGroups %like% "myelofibrosis") %>% .$healthCode %>% n_distinct()
#patients_dataContrib <- round(table(userActivity$dataGroups) / nrow(userActivity) * 100, digits=2)
```
----------
#### Summary stats
* Weeks since launch (Jul 29, 2017): `r WEEKS_SINCE_LAUNCH`
* Total users: `r total_journeyPRO_users`
* Beta Thalassemia patients: `r num_BT_patients`
* Myelodysplastic Syndrome patients: `r num_MP_patients`
* Myelofibrosis patients: `r num_MF_patients`
---------
#### Errors Detected
```{r}
records_with_wrong_createdOn_dates <- sum(lubridate::date(lubridate::ymd_hms(userActivity$createdOn)) > Sys.Date())
```
```{r}
#filter bad data
userActivity <- userActivity %>% dplyr::filter(lubridate::ymd_hms(createdOn) <= Sys.Date())
```
* Records with malformed timestamp - `r records_with_wrong_createdOn_dates`
-------
### Enrollment summary
```{r}
enrollment <- userActivity %>% select(healthCode, study_week, dataGroups)
enrollment <- enrollment[!duplicated(enrollment),]
enrollment <- plyr::ldply(1:max(enrollment$study_week), function(x){
untilNow <- enrollment %>% filter(study_week < x) %>% .$healthCode %>% unique()
thisWeek <- enrollment %>% filter(study_week == x) %>% .$healthCode %>% unique()
newUsers_thisweek <- setdiff(thisWeek, untilNow)
res <- enrollment %>% select(healthCode, dataGroups)
res <- res[!duplicated(res),]
res <- res %>% filter(healthCode %in% newUsers_thisweek)
data.frame('week'=x,
"beta_thalassemia_patients" = res %>%
filter(dataGroups == "beta_thalassemia") %>% nrow(),
"myelodysplastic_syndrome_patients" = res %>%
filter(dataGroups == "myelodysplastic_syndrome") %>% nrow(),
"myelofibrosis_patients" = res %>%
filter(dataGroups == "myelofibrosis") %>% nrow()) %>%
gather(type, num, 2:4)
})
enrollment <- enrollment %>% mutate(type = factor(type))
p1 <- ggplot(data=enrollment, aes(x=week, y=num, fill=type )) + geom_bar(stat="identity", position="dodge", width = .7) + theme_bw() + scale_x_discrete(limits=seq(1,max(enrollment$week),2)) + ylab('People enrolled') + xlab('Weeks since launch')
p1 <- p1 + theme_few() + scale_fill_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"), labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis")) + theme(legend.position = c(0.82, 0.9))
p1 <- p1 + theme(text = element_text(size=15)) + ggtitle('Users joining per week')
p1 <- p1 + geom_vline(xintercept=WEEKS_SINCE_LAUNCH, colour="grey40", size=1, linetype=3)
```
```{r}
#People active each week since study start
activeEachWeek <- userActivity %>% dplyr::group_by(dataGroups, study_week) %>%
dplyr::summarise(uniqUsers = n_distinct(healthCode))
p2 <- ggplot(data=activeEachWeek, aes(x=study_week, y=uniqUsers, fill=dataGroups )) + geom_bar(stat="identity", width = .5) + theme_bw() + scale_x_discrete(limits=seq(1,max(activeEachWeek$study_week),2))
p2 <- p2 + labs(col="User type\n") + scale_fill_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"), labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis"))
p2 <- p2 + theme_few() + theme(legend.position = c(0.2, 0.82))
p2 <- p2 + theme(text = element_text(size=15)) + ggtitle('Users active each week') + ylab('People') + xlab('Weeks since launch')
p2 <- p2 + geom_vline(xintercept=WEEKS_SINCE_LAUNCH, colour="grey40", size=1, linetype=3)
```
```{r,fig.width=15, fig.height=4}
grid.arrange(p1,p2, ncol=2)
```
-------------
### Compliance
```{r, fig.width=7, fig.height=5, fig.align='center'}
### This is mainly to figure out the #users that have been in study for atleast #weeks irrespective of the data contributed
user_time_in_study <- userActivity %>% dplyr::group_by(healthCode) %>%
dplyr::mutate(currentDate = Sys.Date(),
days_in_study = as.numeric(currentDate - participantStartDate) + 1,
weeks_in_study = ((days_in_study - 1 ) %/% 7 ) + 1,
current_days_in_study = as.numeric(lubridate::as_date(createdOn) - participantStartDate) + 1,
current_weeks_in_study = ((current_days_in_study - 1 ) %/% 7 ) + 1)
userWeeks <- user_time_in_study %>% dplyr::group_by(weeks_in_study) %>%
dplyr::summarise(numUsers = n_distinct(healthCode))
userWeeks <- userWeeks %>% ddply(.variables='weeks_in_study',
.fun = function(x){
tmp <- userWeeks %>% dplyr::filter(weeks_in_study >= unique(x$weeks_in_study))
data.frame('totalUsersReachedThisWeek' = sum(tmp$numUsers))
})
allWeeks <- data.frame(weeks_in_study = 1:max(userActivity$study_week))
userWeeks <- join(allWeeks, userWeeks)
userWeeks <- na.locf(userWeeks, fromLast = TRUE)
usersWhoMadeItThisFar <- user_time_in_study %>% select(healthCode, study_week)
usersWhoMadeItThisFar <- plyr::ldply(1:max(usersWhoMadeItThisFar$study_week), function(w) {
untilNow <- usersWhoMadeItThisFar %>% filter(study_week >= w) %>% .$healthCode %>% unique() %>% length()
data.frame('current_weeks_in_study' = w, 'numUsersWhoMadeItThisFar' = untilNow)})
usersWhoActuallyDidSomethingThisWeek <- user_time_in_study %>% group_by(current_weeks_in_study) %>%
summarise(numUsersWhoDidSomething = n_distinct(healthCode))
usersWhoActuallyDidSomethingThisWeek <- plyr::rename(usersWhoActuallyDidSomethingThisWeek, replace=c("current_weeks_in_study" = "weeks_in_study"))
specificUserWeeks <- user_time_in_study %>% select(healthCode, weeks_in_study) %>% unique()
specificUserWeeks <- ddply(specificUserWeeks, .variables=.(healthCode), .fun=function(r) {
data.frame("healthCode" = r$healthCode, "participant_week" = 1:r$weeks_in_study)
})
overallCompliance <- join(usersWhoActuallyDidSomethingThisWeek, userWeeks)
overallCompliance <- overallCompliance %>% mutate(percent = numUsersWhoDidSomething / totalUsersReachedThisWeek)
overallCompliance$percent <- round(overallCompliance$percent * 100, digits=2)
#Since the study is still going on we are not dividing by the all users rather number of unique users in each week of the study
p3 <- ggplot(data=overallCompliance, aes(x=weeks_in_study, y=percent )) + geom_bar(stat="identity", position="dodge", width = .4) + theme_bw() + scale_x_discrete(limits=seq(1,max(overallCompliance$weeks_in_study),2)) + ggtitle('Percent users active by time in study')
p3 <- p3 + theme_few() + scale_fill_manual(values=c('#00AFBB')) + theme(text = element_text(size=15))
p3 <- p3 + xlab('Weeks enrolled') + theme(text = element_text(size=15))
```
```{r}
### CURRENTLY NOT PLOTTED ###
# pro_surveys <- userActivity %>% filter( ( originalTable %like% 'Schedule' | originalTable %like% 'Challenge-v6'))
# pro_surveys_compliance <- pro_surveys %>% group_by(participant_week, originalTable, healthCode) %>%
# summarize(numRecords = n())#dplyr::summarise(num = n_distinct(healthCode))
# pro_surveys_compliance <- merge(pro_surveys_compliance, userWeeks, by.x='participant_week', by.y='weeks_in_study')
# pro_surveys_compliance <- pro_surveys_compliance %>% dplyr::mutate(percent = round((num/totalUsersReachedThisWeek) * 100, digits=2))
# pro_surveys_compliance['Survey'] = pro_surveys_compliance$originalTable
# pro_surveys_compliance$originalTable <- NULL
# p4 <- ggplot(data=pro_surveys_compliance, aes(x=participant_week, y=percent, color=Survey)) + geom_point() + geom_line(size=1.5, alpha=0.6)
# p4 <- p4 + theme_few() + scale_color_manual(values=c( "#2EC4B6", "#E71D36", "#FF9F1C", '#95C623', '#42c5f4'))
# p4 <- p4 + ggtitle('Percent Surveys Completed') + xlab('Weeks enrolled') + theme(text = element_text(size=15))
```
```{r}
activeSensor_tasks <- userActivity %>%
filter( (originalTable %like% 'Visual' | originalTable %like% 'Reaction' | originalTable %like% 'Attention' | originalTable %like% 'Challenge-v6'))
allSensorCombinations <- expand.grid(
healthCode = unique(userActivity$healthCode),
originalTable = unique(activeSensor_tasks$originalTable))
allValidSensorCombinations <- join(specificUserWeeks, allSensorCombinations)
activeSensor_tasks_compliance <- activeSensor_tasks %>% group_by(participant_week, originalTable, healthCode) %>%
dplyr::summarise(numRecords = n())
activeSensor_tasks_compliance <- join(allValidSensorCombinations, activeSensor_tasks_compliance)
activeSensor_tasks_compliance$numRecords[is.na(activeSensor_tasks_compliance$numRecords)] <- 0
activeSensor_tasks_compliance <- activeSensor_tasks_compliance %>% group_by(participant_week, originalTable) %>%
summarise(percent = round((sum(numRecords) / n()) * 100, digits=2))
#activeSensor_tasks_compliance <- activeSensor_tasks_compliance %>% dplyr::mutate(percent = round((numRecords / 7) * 100, digits=2))
activeSensor_tasks_compliance['sensorActivity'] = activeSensor_tasks_compliance$originalTable
activeSensor_tasks_compliance$originalTable <- NULL
activeSensor_tasks_compliance['sensorActivity'] = gsub(' Activity.*', '',activeSensor_tasks_compliance$`sensorActivity`, perl=T)
activeSensor_tasks_compliance['sensorActivity'] = gsub('-.*', '',activeSensor_tasks_compliance$`sensorActivity`, perl=T)
p5 <- ggplot(data=activeSensor_tasks_compliance, aes(x=participant_week, y=percent, color=sensorActivity )) + geom_point() + geom_line(size=1.5, alpha=0.6)
p5 <- p5 + theme_few() + scale_color_manual(values=c( "#2EC4B6", "#E71D36", "#FF9F1C", "#011627"))
p5 <- p5 + ggtitle('Percent active (non-survey) tasks completed') + xlab('Weeks enrolled') + theme(text = element_text(size=15))
```
```{r}
otherShortSurveys <- userActivity %>%
filter( (originalTable == 'Check-In-v2' | originalTable %like% 'Lab Details' | originalTable %like% "Transfusion Details"))
allSurveyCombinations <- expand.grid(
healthCode = unique(userActivity$healthCode),
originalTable = unique(otherShortSurveys$originalTable))
allValidSurveyCombinations <- join(specificUserWeeks, allSurveyCombinations)
otherShortSurveys_compliance <- otherShortSurveys %>% group_by(participant_week, originalTable, healthCode) %>%
dplyr::summarise(numRecords = n())
otherShortSurveys_compliance <- join(allValidSurveyCombinations, otherShortSurveys_compliance)
otherShortSurveys_compliance$numRecords[is.na(otherShortSurveys_compliance$numRecords)] <- 0
otherShortSurveys_compliance <- otherShortSurveys_compliance %>% group_by(participant_week, originalTable) %>%
dplyr::summarise(percent = round((sum(numRecords)/(7*n())) * 100, digits=2))
otherShortSurveys_compliance['otherSurveys'] = otherShortSurveys_compliance$originalTable
otherShortSurveys_compliance$originalTable <- NULL
otherShortSurveys_compliance['otherSurveys'] = gsub('-v\\d', '',otherShortSurveys_compliance$`otherSurveys`, perl=T)
p6 <- ggplot(data=otherShortSurveys_compliance, aes(x=participant_week, y=percent, color=otherSurveys )) + geom_point() + geom_line(size=1.5, alpha=0.6)
p6 <- p6 + theme_few() + scale_color_manual(values=c( "#2EC4B6", "#E71D36", "#FF9F1C", '#011627'))
p6 <- p6 + ggtitle('Other Short Surveys Completed') + xlab('Weeks enrolled') + theme(text = element_text(size=15))
```
```{r}
sf12Compliance <- healthSurvey %>% group_by(participant_week) %>% dplyr::summarise(num = n_distinct(healthCode))
sf12Compliance <- merge(sf12Compliance, userWeeks, by.x = 'participant_week', by.y='weeks_in_study')
sf12Compliance <- sf12Compliance %>% dplyr::mutate(percent = round((num/totalUsersReachedThisWeek) * 100, digits=2))
p7 <- ggplot(data=sf12Compliance, aes(x=participant_week, y=percent)) + geom_point() + geom_line(size=1.5) + theme_few() +
ggtitle('Weekly SF-12 Surveys Completed') + xlab('Weeks enrolled') + theme(text = element_text(size=15))
```
```{r,fig.width=15, fig.height=8}
grid.arrange(p3,p5,p6,p7, ncol=2)
```
-------------
```{r}
checkInId <- "syn10232192"
checkIn <- synTableQuery(paste("select dataGroups, mood_sleep, mood_pain, mood_health, mood_tired from", checkInId))@values
checkIn <- checkIn %>% na.omit() %>% filter(dataGroups %in% unique(userActivity$dataGroups))
checkInGraph <- checkIn %>% gather(Category, Score, -dataGroups) %>% group_by(Category, dataGroups) %>%
summarise(val=mean(as.numeric(Score))) %>%
ggplot(aes(Category, val, fill=dataGroups)) +
geom_bar(stat="identity", position="dodge", width = .7) +
theme_few() + scale_fill_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"),
labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis")) +
ylab("Mean Score (out of 5)") + ggtitle("Daily Check-In")
```
```{r}
labDetailsId <- "syn10245386"
labDetails <- synTableQuery(paste("select dataGroups, hgb, rbc, wbc, hct, neutrophils, platelets, ferritin from", labDetailsId))@values
labDetails <- labDetails %>% filter(dataGroups %in% unique(userActivity$dataGroups))
labDetails2 <- labDetails %>% select(dataGroups, hgb, rbc, wbc, hct, neutrophils)
labDetails <- labDetails %>% select(dataGroups, platelets, ferritin)
labDetails2 <- labDetails2[rowSums(is.na(labDetails2 %>% select(-dataGroups))) != ncol(labDetails2)-1,]
labDetails <- labDetails[rowSums(is.na(labDetails %>% select(-dataGroups))) != ncol(labDetails)-1,]
labDetailsGraph <- labDetails %>% gather(Category, Value, -dataGroups) %>%
group_by(Category) %>%
ggplot(aes(Category, Value, color=dataGroups)) + geom_crossbar(ymin=0, ymax=2000) +
theme_few() + scale_color_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"),
labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis")) +
ggtitle("Lab Details (2)") + ylim(NA, 2000)
labDetailsGraph2 <- labDetails2 %>% gather(Category, Value, -dataGroups) %>%
group_by(Category) %>%
ggplot(aes(Category, Value, color=dataGroups)) + geom_crossbar(ymin=0, ymax=55) +
theme_few() + scale_color_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"),
labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis")) +
ggtitle("Lab Details (1)") + ylim(NA, 55)
```
```{r}
transfusionId <- "syn10245387"
transfusion <- synTableQuery(paste("select dataGroups, unitsBlood, infusionRate from", transfusionId))@values
transfusion <- transfusion %>% filter(dataGroups %in% unique(userActivity$dataGroups)) %>% na.omit()
transfusion <- transfusion[2:nrow(transfusion),] # drop outlier
transfusionGraph <- transfusion %>% ggplot(aes(unitsBlood, infusionRate)) +
geom_point(alpha=0.33, size=8, color="#ff5e5e") + theme_few() +
ggtitle("Transfusion Details")
```
```{r,fig.width=15, fig.height=10}
grid.arrange(checkInGraph,transfusionGraph,labDetailsGraph2,labDetailsGraph, ncol=2)
```
-------------
### Demographics Summary
```{r}
dem_survey_id <- "syn10232193"
demog <- synTableQuery(paste("select * from", dem_survey_id))@values
demog$metadata.json.dataGroups <- NULL
colnames(demog) <- gsub('.json.answer', '',colnames(demog))
colnames(demog) <- gsub('metadata.json.', '',colnames(demog))
demog <- demog %>% dplyr::select(-appVersion, -phoneInfo, -validationErrors, -createdOn, -createdOnTimeZone, -recordId, -uploadDate, -externalId, -birthYear_unit, -ethnicity) %>%
filter(dataGroups %in% c("beta_thalassemia", "myelodysplastic_syndrome", "myelofibrosis") & healthCode %in% STUDY_HEALTHCODES) #%>%
#dplyr::mutate(createdOn = as.Date(lubridate::ymd_hms(startDate)))
demog <- ddply(.data = demog, .variables = c('healthCode'))
#race <- demog %>% tidyr::gather(race, value, 2:4) %>% dplyr::filter(value == 'TRUE') %>%
# select(healthCode, race) %>% group_by(healthCode) %>%
# dplyr::summarise(race = paste(race, collapse=','))
#demog <- merge(demog, race) #%>% select(-c(7:16))
profile_id <- "syn10235462"
profiles <- synTableQuery(paste("select * from", profile_id))@values
profiles <- profiles %>% dplyr::filter(healthCode %in% STUDY_HEALTHCODES) %>%
dplyr::transmute(healthCode = healthCode,
age = demographics.age,
gender = demographics.gender, height = demographics.height,
weight = demographics.weight, ethnicity = demographics.ethnicity,
yearDiagnosed = demographics.yearDiagnosed,
userSharingScope=NA) %>%
dplyr::group_by(healthCode) %>%
dplyr::summarise(
age = unique(age)[1], gender = unique(gender)[1], height = unique(height)[1],
weight = unique(weight)[1],
userSharingScope=unique(userSharingScope)[1])
#Add age to demog
demog <- merge(demog, profiles %>% dplyr::select(healthCode, age, gender, height, weight), all.x=T)
demog_derived_from_profiles <- profiles %>% dplyr::filter(!healthCode %in% demog$healthCode)
dataGroups <- userActivity %>% select(healthCode, dataGroups)
dataGroups <- dataGroups[!duplicated(dataGroups),]
demog_derived_from_profiles <- merge(demog_derived_from_profiles , dataGroups, all.x=T)
demog_derived_from_profiles <- demog_derived_from_profiles %>%
dplyr::select(healthCode, age, dataGroups, userSharingScope, gender, height,
weight)
tmp_df <- data.frame(dataGroups = c("beta_thalassemia", "myelodysplastic_syndrome", "myelofibrosis"),
num = c(num_BT_patients, num_MP_patients, num_MF_patients), stringsAsFactors = F)
percent_demog_data_compliance = round( (n_distinct(demog$healthCode)/total_journeyPRO_users) *100, digits=2)
percent_demog_data_from_profiles_compliance = round( (n_distinct(demog_derived_from_profiles$healthCode)/total_journeyPRO_users) *100, digits=2)
demog_total <- plyr::rbind.fill(demog, demog_derived_from_profiles)
```
* Demographics data available for : `r n_distinct(demog_total$healthCode)` individuals | `r percent_demog_data_compliance` %
* Demographics data obtained from profiles : `r n_distinct(demog_derived_from_profiles$healthCode)` individuals | `r percent_demog_data_from_profiles_compliance` %
```{r, align='center', fig.width=5, fig.height=5}
missingData <- apply(demog_total,2, function(x){
round( (sum(is.na(x) | x == "") / nrow(demog_total)) *100, digits=2)
})[c("age", "gender", "race")]
tmp_df <- data.frame(col = names(missingData), percentMissing = as.numeric(missingData)) %>% arrange(percentMissing)
missingGraph <- ggplot(data=tmp_df, aes(x=col, y=percentMissing)) + geom_bar(stat="identity") + theme_few() + coord_flip() + scale_y_continuous(limits=c(0,100)) + xlab('Demographic Variable') + theme(text = element_text(size=15)) + ylab('Percent') + ggtitle("Percent Missing")
```
--------
```{r}
tmp_df <- data.frame(dataGroups = c("beta_thalassemia", "myelodysplastic_syndrome", "myelofibrosis"),
num = c(num_BT_patients, num_MP_patients, num_MF_patients), stringsAsFactors = F)
gender = demog_total %>% filter(gender %in% c('Female', 'Male')) %>% inner_join(tmp_df) %>% dplyr::group_by(dataGroups, gender, num) %>% dplyr::summarise(prop =n_distinct(healthCode)) %>% mutate(prop = round((prop / num)*100, digits=2))
p2 <- ggplot(data=gender, aes(x=gender, y=prop, fill=dataGroups )) + geom_bar(stat="identity", position="dodge", width = .7) + theme_bw() + ylab("Percent")
p2 <- p2 + theme_few() + scale_fill_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"), labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis")) + theme(legend.position = c(0.3, 0.85))
genderGraph <- p2 + theme(text = element_text(size=15)) + ggtitle('Gender') + xlab("Gender")
```
```{r}
p3 <- ggplot(data=demog_total %>% filter(!is.na(gender)), aes(x=gender, y=age, fill=dataGroups )) + geom_boxplot(width = .7) + theme_bw() + ylab('Age') + xlab('Gender')
p3 <- p3 + theme_few() + scale_fill_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"), labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis")) + theme(legend.position = 'none')
ageGraph <- p3 + theme(text = element_text(size=15)) + ggtitle('Age')
```
```{r}
race = demog_total %>% filter(!is.na(race)) %>% dplyr::group_by(dataGroups, race) %>% dplyr::summarise(prop =n_distinct(healthCode))
#levels <- race %>% filter(dataGroups %in% unique(userActivity$dataGroups)) %>% arrange(desc(prop)) %>% .$race
#race$race <- rev(levels)
p5 <- ggplot(data=race, aes(x=race, y=prop, fill=dataGroups)) + geom_bar(stat="identity", width = .7) + theme_bw() + ylab('Number')
p5 <- p5 + theme_few() + scale_fill_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"), labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis")) + theme(legend.position = c(0.6, 0.5))
raceGraph <- p5 + theme(text = element_text(size=15)) + ggtitle('Race') + coord_flip() + xlab('')
```
```{r,fig.width=15, fig.height=10}
grid.arrange(genderGraph,ageGraph,raceGraph,missingGraph, ncol=2)
```
```{r, fig.width=15, fig.height=10}
if (!require("lbscorer",character.only = TRUE)) {
install_github("philerooski/lbscorer", silent=TRUE)
}
library(lbscorer)
questionCols <- names(healthSurvey)[12:23]
sf <- healthSurvey[c("recordId", "healthCode", "dataGroups", questionCols)] %>% filter(dataGroups %in% unique(userActivity$dataGroups)) %>% na.omit()
sfScores <- sf12(sf[questionCols])
names(sfScores) <- c("Physical Function", "Role Physical", "Bodily Pain", "General Health", "Vitality", "Social Function", "Role Emotional", "Mental Health", "Physical Summary", "Mental Summary")
df <- cbind(sf[c("recordId", "healthCode", "dataGroups")], sfScores)
dfG <- df %>% gather(cols, value, -dataGroups, -recordId, -healthCode)
dfGBS <- dfG %>% filter(dataGroups == "beta_thalassemia")
dfGMS <- dfG %>% filter(dataGroups == "myelodysplastic_syndrome")
dfGM <- dfG %>% filter(dataGroups == "myelofibrosis")
par(cex.axis=0.8)
boxplot(sfScores, main="SF12-v2 Scores", xlab="Category", ylab="Score")
stripchart(value ~ cols, vertical = TRUE, data = dfGBS,
method = "jitter", add = TRUE, pch = 20, col = "#A5BE0050")
stripchart(value ~ cols, vertical = TRUE, data = dfGMS,
method = "jitter", add = TRUE, pch = 20, col = "#4286f450")
stripchart(value ~ cols, vertical = TRUE, data = dfGMS,
method = "jitter", add = TRUE, pch = 20, col = "#FC4E0750")
```
--------
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment