philerooski/QC.Rmd

## QC.Rmd
---
title: "JourneyPRO - on-going summary"
output:
  html_document: default
  html_notebook: default
  pdf_document: default
date: "`r format(Sys.time(), '%d %B, %Y')`"
---

```{r global_options, include=FALSE}
rm(list=ls())
knitr::opts_chunk$set(fig.path='Figs/', echo=FALSE, warning=FALSE, message=FALSE)
```

```{r}
Sys.setenv(TZ='GMT')
library(synapseClient)
synapseLogin()
library(install.load)
install_load(c('plyr', 'tidyverse', 'data.table', 'lubridate', 'ggpubr', 'zoo'))
install_load('ggthemes', 'gridExtra', 'devtools')
```

```{r}
stringfy <- function(x){
  gsub('[\\[\"\\]]','',x, perl=T)
}
# TODO: When is journeyPRO start date?
START_DATE = lubridate::ymd("2017-07-28")
DAYS_SINCE_LAUNCH = as.numeric(Sys.Date() - START_DATE)
WEEKS_SINCE_LAUNCH = ( (DAYS_SINCE_LAUNCH - 1) %/% 7) + 1
```


```{r}
masterTable <- "syn10232190"  #poorly named journeyPRO-appVersion
#remove all testing users and records created before launch date (see `DAYS_SINCE_LAUNCH`) 12:00:00 AM (GMT)
userActivity <- synTableQuery(paste("select * from", masterTable, "WHERE dataGroups NOT LIKE '%test%' AND createdOn >=", as.integer(as.POSIXct(START_DATE), tz="UTC")))
userActivity <- userActivity@values
healthSurveyId <- "syn10278768"
healthSurvey <- synTableQuery(paste("select * from", healthSurveyId))@values

#Data Manipulation
userActivity <- userActivity %>% dplyr::mutate(createdOnTimeZone = as.numeric(createdOnTimeZone)/100,
                                               createdOn = lubridate::ymd_hms(createdOn, tz='UTC'))
healthSurvey <- healthSurvey %>% dplyr::mutate(createdOnTimeZone = as.numeric(createdOnTimeZone)/100,
                                               createdOn = lubridate::ymd_hms(createdOn, tz='UTC'))
#get start dates for people
userStartDates <- userActivity %>% dplyr::group_by(healthCode) %>%
  dplyr::summarise(participantStartDate = min(lubridate::date(createdOn)))
userActivity <- userActivity %>% inner_join(userStartDates)
healthSurvey <- healthSurvey %>% inner_join(userStartDates)

userActivity <- userActivity %>%
  dplyr::mutate(createdOn_localTime = createdOn + lubridate::hours(createdOnTimeZone),
                participant_day = as.numeric(lubridate::date(createdOn) - participantStartDate ) + 1,
                participant_week = ((participant_day - 1) %/% 7 ) + 1,
                study_day = as.numeric(lubridate::date(createdOn) - START_DATE) + 1,
                study_week = ((study_day - 1) %/% 7 ) + 1,
                currentDate = Sys.Date()
                )
healthSurvey <- healthSurvey %>%
  dplyr::mutate(createdOn_localTime = createdOn + lubridate::hours(createdOnTimeZone),
                participant_day = as.numeric(lubridate::date(createdOn) - participantStartDate ) + 1,
                participant_week = ((participant_day - 1) %/% 7 ) + 1,
                study_day = as.numeric(lubridate::date(createdOn) - START_DATE) + 1,
                study_week = ((study_day - 1) %/% 7 ) + 1,
                currentDate = Sys.Date()
                )
```


```{r}
STUDY_HEALTHCODES <- unique(userActivity$healthCode)
total_journeyPRO_users <- n_distinct(userActivity$healthCode)
num_BT_patients <- userActivity %>% dplyr::filter(dataGroups %like% "beta_thalassemia") %>% .$healthCode %>% n_distinct()
num_MP_patients <- userActivity %>% dplyr::filter(dataGroups %like% "myelodysplastic_syndrome") %>% .$healthCode %>% n_distinct()
num_MF_patients <- userActivity %>% dplyr::filter(dataGroups %like% "myelofibrosis") %>% .$healthCode %>% n_distinct()
#patients_dataContrib <- round(table(userActivity$dataGroups) / nrow(userActivity) * 100, digits=2)
```


----------


#### Summary stats
* Weeks since launch (Jul 29, 2017): `r  WEEKS_SINCE_LAUNCH`
* Total users:   `r total_journeyPRO_users`
* Beta Thalassemia patients:   `r num_BT_patients`
* Myelodysplastic Syndrome patients:   `r num_MP_patients`
* Myelofibrosis patients:   `r num_MF_patients`


---------


#### Errors Detected

```{r}
records_with_wrong_createdOn_dates <- sum(lubridate::date(lubridate::ymd_hms(userActivity$createdOn)) > Sys.Date())
```


```{r}
#filter bad data
userActivity <- userActivity %>% dplyr::filter(lubridate::ymd_hms(createdOn)  <= Sys.Date())
```


* Records with malformed timestamp -  `r records_with_wrong_createdOn_dates`

-------

### Enrollment summary

```{r}
enrollment <- userActivity %>% select(healthCode, study_week, dataGroups)
enrollment <- enrollment[!duplicated(enrollment),]
enrollment <- plyr::ldply(1:max(enrollment$study_week), function(x){
  untilNow <- enrollment %>% filter(study_week < x) %>% .$healthCode %>% unique()
  thisWeek <- enrollment %>% filter(study_week == x) %>% .$healthCode %>% unique()
  newUsers_thisweek <- setdiff(thisWeek, untilNow)
  res <- enrollment %>% select(healthCode, dataGroups)
  res <- res[!duplicated(res),]
  res <- res %>% filter(healthCode %in% newUsers_thisweek)
  data.frame('week'=x,
             "beta_thalassemia_patients" = res %>%
                filter(dataGroups == "beta_thalassemia") %>% nrow(),
             "myelodysplastic_syndrome_patients" = res %>%
                filter(dataGroups == "myelodysplastic_syndrome") %>% nrow(),
             "myelofibrosis_patients" = res %>%
                filter(dataGroups == "myelofibrosis") %>% nrow()) %>%
    gather(type, num, 2:4)
})

enrollment <- enrollment %>% mutate(type = factor(type))
p1 <- ggplot(data=enrollment, aes(x=week, y=num, fill=type )) + geom_bar(stat="identity", position="dodge", width = .7) + theme_bw() + scale_x_discrete(limits=seq(1,max(enrollment$week),2))  + ylab('People enrolled') + xlab('Weeks since launch')
p1 <- p1 + theme_few() + scale_fill_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"),  labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis")) + theme(legend.position = c(0.82, 0.9))
p1 <- p1 + theme(text = element_text(size=15)) + ggtitle('Users joining per week')
p1 <- p1 + geom_vline(xintercept=WEEKS_SINCE_LAUNCH, colour="grey40", size=1, linetype=3)
```

```{r}
#People active each week since study start
activeEachWeek <- userActivity %>% dplyr::group_by(dataGroups, study_week) %>%
  dplyr::summarise(uniqUsers = n_distinct(healthCode))

p2 <- ggplot(data=activeEachWeek, aes(x=study_week, y=uniqUsers, fill=dataGroups )) + geom_bar(stat="identity", width = .5) + theme_bw() + scale_x_discrete(limits=seq(1,max(activeEachWeek$study_week),2))
p2 <- p2 +  labs(col="User type\n") + scale_fill_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"),  labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis"))
p2 <- p2 + theme_few() + theme(legend.position = c(0.2, 0.82))
p2 <- p2 + theme(text = element_text(size=15)) + ggtitle('Users active each week') + ylab('People') + xlab('Weeks since launch')
p2 <- p2 + geom_vline(xintercept=WEEKS_SINCE_LAUNCH, colour="grey40", size=1, linetype=3)
```

```{r,fig.width=15, fig.height=4}
grid.arrange(p1,p2, ncol=2)
```

-------------

### Compliance

```{r, fig.width=7, fig.height=5, fig.align='center'}
### This is mainly to figure out the #users that have been in study for atleast #weeks irrespective of the data contributed
user_time_in_study <- userActivity %>% dplyr::group_by(healthCode) %>%
              dplyr::mutate(currentDate = Sys.Date(),
                days_in_study =  as.numeric(currentDate - participantStartDate) + 1,
                weeks_in_study =  ((days_in_study - 1 ) %/% 7 ) + 1,
                current_days_in_study = as.numeric(lubridate::as_date(createdOn) - participantStartDate) + 1,
                current_weeks_in_study = ((current_days_in_study - 1 ) %/% 7 ) + 1)

userWeeks <- user_time_in_study %>% dplyr::group_by(weeks_in_study) %>%
  dplyr::summarise(numUsers = n_distinct(healthCode))
userWeeks <- userWeeks %>% ddply(.variables='weeks_in_study',
                    .fun = function(x){
                      tmp <- userWeeks %>% dplyr::filter(weeks_in_study >=  unique(x$weeks_in_study))
                      data.frame('totalUsersReachedThisWeek' = sum(tmp$numUsers))
                    })
allWeeks <- data.frame(weeks_in_study = 1:max(userActivity$study_week))
userWeeks <- join(allWeeks, userWeeks)
userWeeks <- na.locf(userWeeks, fromLast = TRUE)

usersWhoMadeItThisFar <- user_time_in_study %>% select(healthCode, study_week)
usersWhoMadeItThisFar <- plyr::ldply(1:max(usersWhoMadeItThisFar$study_week), function(w) {
    untilNow <- usersWhoMadeItThisFar %>% filter(study_week >= w) %>% .$healthCode %>% unique() %>% length()
    data.frame('current_weeks_in_study' = w, 'numUsersWhoMadeItThisFar' = untilNow)})

usersWhoActuallyDidSomethingThisWeek <- user_time_in_study %>% group_by(current_weeks_in_study) %>%
  summarise(numUsersWhoDidSomething = n_distinct(healthCode))
usersWhoActuallyDidSomethingThisWeek <- plyr::rename(usersWhoActuallyDidSomethingThisWeek, replace=c("current_weeks_in_study" = "weeks_in_study"))

specificUserWeeks <- user_time_in_study %>% select(healthCode, weeks_in_study) %>% unique()
specificUserWeeks <- ddply(specificUserWeeks, .variables=.(healthCode), .fun=function(r) {
    data.frame("healthCode" = r$healthCode, "participant_week" = 1:r$weeks_in_study)
})

overallCompliance <- join(usersWhoActuallyDidSomethingThisWeek, userWeeks)
overallCompliance <- overallCompliance %>% mutate(percent = numUsersWhoDidSomething / totalUsersReachedThisWeek)
overallCompliance$percent <- round(overallCompliance$percent * 100, digits=2)
#Since the study is still going on we are not dividing by the all users rather number of unique users in each week of the study
p3 <- ggplot(data=overallCompliance, aes(x=weeks_in_study, y=percent )) + geom_bar(stat="identity", position="dodge", width = .4) + theme_bw() + scale_x_discrete(limits=seq(1,max(overallCompliance$weeks_in_study),2)) + ggtitle('Percent users active by time in study')
p3 <- p3 + theme_few() + scale_fill_manual(values=c('#00AFBB')) +  theme(text = element_text(size=15))
p3 <- p3 + xlab('Weeks enrolled') +  theme(text = element_text(size=15))
```

```{r}
### CURRENTLY NOT PLOTTED ###
# pro_surveys <- userActivity %>% filter( ( originalTable %like% 'Schedule' | originalTable %like% 'Challenge-v6'))
# pro_surveys_compliance <- pro_surveys %>% group_by(participant_week, originalTable, healthCode) %>%
#   summarize(numRecords = n())#dplyr::summarise(num = n_distinct(healthCode))
# pro_surveys_compliance <- merge(pro_surveys_compliance, userWeeks, by.x='participant_week', by.y='weeks_in_study')
# pro_surveys_compliance <- pro_surveys_compliance %>% dplyr::mutate(percent = round((num/totalUsersReachedThisWeek) * 100, digits=2))
#  pro_surveys_compliance['Survey'] = pro_surveys_compliance$originalTable
#  pro_surveys_compliance$originalTable <- NULL
#  p4 <- ggplot(data=pro_surveys_compliance, aes(x=participant_week, y=percent, color=Survey))  + geom_point() + geom_line(size=1.5, alpha=0.6)
# p4 <- p4 + theme_few() +  scale_color_manual(values=c( "#2EC4B6", "#E71D36", "#FF9F1C", '#95C623', '#42c5f4'))
# p4 <- p4 +  ggtitle('Percent Surveys Completed')  + xlab('Weeks enrolled') +  theme(text = element_text(size=15))
```

```{r}
activeSensor_tasks <- userActivity %>%
  filter( (originalTable %like% 'Visual' | originalTable %like% 'Reaction' | originalTable %like%  'Attention' | originalTable %like% 'Challenge-v6'))
allSensorCombinations <- expand.grid(
  healthCode = unique(userActivity$healthCode),
  originalTable = unique(activeSensor_tasks$originalTable))
allValidSensorCombinations <- join(specificUserWeeks, allSensorCombinations)
activeSensor_tasks_compliance <- activeSensor_tasks %>% group_by(participant_week, originalTable, healthCode) %>%
  dplyr::summarise(numRecords = n())
activeSensor_tasks_compliance <- join(allValidSensorCombinations, activeSensor_tasks_compliance)
activeSensor_tasks_compliance$numRecords[is.na(activeSensor_tasks_compliance$numRecords)] <- 0

activeSensor_tasks_compliance <- activeSensor_tasks_compliance %>% group_by(participant_week, originalTable) %>%
   summarise(percent = round((sum(numRecords) / n()) * 100, digits=2))
#activeSensor_tasks_compliance <- activeSensor_tasks_compliance %>% dplyr::mutate(percent = round((numRecords / 7) * 100, digits=2))
activeSensor_tasks_compliance['sensorActivity'] = activeSensor_tasks_compliance$originalTable
activeSensor_tasks_compliance$originalTable <- NULL
activeSensor_tasks_compliance['sensorActivity'] = gsub(' Activity.*', '',activeSensor_tasks_compliance$`sensorActivity`, perl=T)
activeSensor_tasks_compliance['sensorActivity'] = gsub('-.*', '',activeSensor_tasks_compliance$`sensorActivity`, perl=T)

p5 <- ggplot(data=activeSensor_tasks_compliance, aes(x=participant_week, y=percent, color=sensorActivity )) + geom_point()  + geom_line(size=1.5, alpha=0.6)
p5 <- p5 + theme_few() + scale_color_manual(values=c( "#2EC4B6", "#E71D36", "#FF9F1C", "#011627"))
p5 <- p5 +  ggtitle('Percent active (non-survey) tasks completed')  + xlab('Weeks enrolled') +  theme(text = element_text(size=15))
```

```{r}
otherShortSurveys <-  userActivity %>%
  filter( (originalTable == 'Check-In-v2' | originalTable %like%  'Lab Details' | originalTable %like% "Transfusion Details"))
allSurveyCombinations <- expand.grid(
  healthCode = unique(userActivity$healthCode),
  originalTable = unique(otherShortSurveys$originalTable))
allValidSurveyCombinations <- join(specificUserWeeks, allSurveyCombinations)
otherShortSurveys_compliance <- otherShortSurveys %>% group_by(participant_week, originalTable, healthCode) %>%
  dplyr::summarise(numRecords = n())
otherShortSurveys_compliance <- join(allValidSurveyCombinations, otherShortSurveys_compliance)
otherShortSurveys_compliance$numRecords[is.na(otherShortSurveys_compliance$numRecords)] <- 0

otherShortSurveys_compliance <- otherShortSurveys_compliance %>% group_by(participant_week, originalTable) %>%
  dplyr::summarise(percent = round((sum(numRecords)/(7*n())) * 100, digits=2))
otherShortSurveys_compliance['otherSurveys'] = otherShortSurveys_compliance$originalTable
otherShortSurveys_compliance$originalTable <- NULL
otherShortSurveys_compliance['otherSurveys'] = gsub('-v\\d', '',otherShortSurveys_compliance$`otherSurveys`, perl=T)

p6 <- ggplot(data=otherShortSurveys_compliance, aes(x=participant_week, y=percent, color=otherSurveys ))  + geom_point() + geom_line(size=1.5, alpha=0.6)
p6 <- p6 + theme_few() +  scale_color_manual(values=c( "#2EC4B6", "#E71D36", "#FF9F1C", '#011627'))
p6 <- p6 +  ggtitle('Other Short Surveys Completed')  + xlab('Weeks enrolled') +  theme(text = element_text(size=15))
```

```{r}
sf12Compliance <- healthSurvey %>% group_by(participant_week) %>% dplyr::summarise(num = n_distinct(healthCode))
sf12Compliance <- merge(sf12Compliance, userWeeks, by.x = 'participant_week', by.y='weeks_in_study')
sf12Compliance <- sf12Compliance %>% dplyr::mutate(percent = round((num/totalUsersReachedThisWeek) * 100, digits=2))
p7 <- ggplot(data=sf12Compliance, aes(x=participant_week, y=percent)) + geom_point() + geom_line(size=1.5) + theme_few() +
        ggtitle('Weekly SF-12 Surveys Completed') + xlab('Weeks enrolled') +  theme(text = element_text(size=15))
```

```{r,fig.width=15, fig.height=8}
grid.arrange(p3,p5,p6,p7, ncol=2)
```

-------------

```{r}
checkInId <- "syn10232192"
checkIn <- synTableQuery(paste("select dataGroups, mood_sleep, mood_pain, mood_health, mood_tired from", checkInId))@values
checkIn <- checkIn %>% na.omit() %>% filter(dataGroups %in% unique(userActivity$dataGroups))
checkInGraph <- checkIn %>% gather(Category, Score, -dataGroups) %>% group_by(Category, dataGroups) %>%
  summarise(val=mean(as.numeric(Score))) %>%
  ggplot(aes(Category, val, fill=dataGroups)) +
  geom_bar(stat="identity", position="dodge", width = .7) +
  theme_few() + scale_fill_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"),
                                  labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis")) +
  ylab("Mean Score (out of 5)") + ggtitle("Daily Check-In")
```

```{r}
labDetailsId <- "syn10245386"
labDetails <- synTableQuery(paste("select dataGroups, hgb, rbc, wbc, hct, neutrophils, platelets, ferritin from", labDetailsId))@values
labDetails <- labDetails %>% filter(dataGroups %in% unique(userActivity$dataGroups))
labDetails2 <- labDetails %>% select(dataGroups, hgb, rbc, wbc, hct, neutrophils)
labDetails <- labDetails %>% select(dataGroups, platelets, ferritin)
labDetails2 <- labDetails2[rowSums(is.na(labDetails2 %>% select(-dataGroups))) != ncol(labDetails2)-1,]
labDetails <- labDetails[rowSums(is.na(labDetails %>% select(-dataGroups))) != ncol(labDetails)-1,]
labDetailsGraph <- labDetails %>% gather(Category, Value, -dataGroups) %>%
  group_by(Category) %>%
  ggplot(aes(Category, Value, color=dataGroups)) + geom_crossbar(ymin=0, ymax=2000) +
  theme_few() + scale_color_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"),
                                  labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis")) +
                                  ggtitle("Lab Details (2)") + ylim(NA, 2000)
labDetailsGraph2 <- labDetails2 %>% gather(Category, Value, -dataGroups) %>%
  group_by(Category) %>%
  ggplot(aes(Category, Value, color=dataGroups)) + geom_crossbar(ymin=0, ymax=55) +
  theme_few() + scale_color_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"),
                                  labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis")) +
                                  ggtitle("Lab Details (1)") + ylim(NA, 55)
```

```{r}
transfusionId <- "syn10245387"
transfusion <- synTableQuery(paste("select dataGroups, unitsBlood, infusionRate from", transfusionId))@values
transfusion <- transfusion %>% filter(dataGroups %in% unique(userActivity$dataGroups)) %>% na.omit()
transfusion <- transfusion[2:nrow(transfusion),] # drop outlier
transfusionGraph <- transfusion %>% ggplot(aes(unitsBlood, infusionRate)) +
  geom_point(alpha=0.33, size=8, color="#ff5e5e") + theme_few() +
  ggtitle("Transfusion Details")
```

```{r,fig.width=15, fig.height=10}
grid.arrange(checkInGraph,transfusionGraph,labDetailsGraph2,labDetailsGraph, ncol=2)
```
-------------
### Demographics Summary

```{r}
dem_survey_id <- "syn10232193"
demog <-  synTableQuery(paste("select * from", dem_survey_id))@values
demog$metadata.json.dataGroups <- NULL
colnames(demog) <- gsub('.json.answer', '',colnames(demog))
colnames(demog)  <- gsub('metadata.json.', '',colnames(demog))
demog <- demog %>% dplyr::select(-appVersion, -phoneInfo, -validationErrors, -createdOn, -createdOnTimeZone, -recordId, -uploadDate, -externalId, -birthYear_unit, -ethnicity) %>%
  filter(dataGroups %in% c("beta_thalassemia", "myelodysplastic_syndrome", "myelofibrosis") & healthCode %in% STUDY_HEALTHCODES) #%>%
  #dplyr::mutate(createdOn = as.Date(lubridate::ymd_hms(startDate)))

demog <- ddply(.data = demog, .variables = c('healthCode'))
#race <- demog %>% tidyr::gather(race, value, 2:4) %>% dplyr::filter(value == 'TRUE') %>%
#  select(healthCode, race) %>% group_by(healthCode) %>%
#  dplyr::summarise(race = paste(race, collapse=','))
#demog <- merge(demog, race) #%>% select(-c(7:16))

profile_id <- "syn10235462"
profiles <- synTableQuery(paste("select * from", profile_id))@values
profiles <- profiles %>% dplyr::filter(healthCode %in% STUDY_HEALTHCODES) %>%
  dplyr::transmute(healthCode = healthCode,
                   age = demographics.age,
                   gender = demographics.gender, height = demographics.height,
                   weight = demographics.weight, ethnicity = demographics.ethnicity,
                   yearDiagnosed = demographics.yearDiagnosed,
                   userSharingScope=NA) %>%
  dplyr::group_by(healthCode) %>%
  dplyr::summarise(
                    age = unique(age)[1], gender = unique(gender)[1], height = unique(height)[1],
                    weight = unique(weight)[1],
                    userSharingScope=unique(userSharingScope)[1])

#Add age to demog
demog <- merge(demog, profiles %>% dplyr::select(healthCode, age, gender, height, weight), all.x=T)

demog_derived_from_profiles <- profiles %>% dplyr::filter(!healthCode %in% demog$healthCode)
dataGroups <- userActivity %>% select(healthCode, dataGroups)
dataGroups <- dataGroups[!duplicated(dataGroups),]
demog_derived_from_profiles <- merge(demog_derived_from_profiles , dataGroups, all.x=T)


demog_derived_from_profiles <- demog_derived_from_profiles %>%
  dplyr::select(healthCode, age, dataGroups, userSharingScope, gender, height,
                weight)


tmp_df <- data.frame(dataGroups = c("beta_thalassemia", "myelodysplastic_syndrome", "myelofibrosis"),
                     num = c(num_BT_patients, num_MP_patients, num_MF_patients), stringsAsFactors = F)

percent_demog_data_compliance =  round( (n_distinct(demog$healthCode)/total_journeyPRO_users) *100, digits=2)

percent_demog_data_from_profiles_compliance =  round( (n_distinct(demog_derived_from_profiles$healthCode)/total_journeyPRO_users) *100, digits=2)

demog_total <- plyr::rbind.fill(demog, demog_derived_from_profiles)
```

* Demographics data available for : `r  n_distinct(demog_total$healthCode)` individuals | `r percent_demog_data_compliance` %

* Demographics data obtained from profiles : `r  n_distinct(demog_derived_from_profiles$healthCode)` individuals | `r percent_demog_data_from_profiles_compliance` %

```{r, align='center', fig.width=5, fig.height=5}
missingData <- apply(demog_total,2, function(x){
  round( (sum(is.na(x) | x == "") / nrow(demog_total)) *100, digits=2)
})[c("age", "gender", "race")]
tmp_df <- data.frame(col = names(missingData), percentMissing = as.numeric(missingData)) %>% arrange(percentMissing)

missingGraph <- ggplot(data=tmp_df, aes(x=col, y=percentMissing)) + geom_bar(stat="identity") + theme_few() + coord_flip() + scale_y_continuous(limits=c(0,100)) + xlab('Demographic Variable') + theme(text = element_text(size=15)) + ylab('Percent') + ggtitle("Percent Missing")
```

--------

```{r}
tmp_df <- data.frame(dataGroups = c("beta_thalassemia", "myelodysplastic_syndrome", "myelofibrosis"),
                     num = c(num_BT_patients, num_MP_patients, num_MF_patients), stringsAsFactors = F)

gender = demog_total %>% filter(gender %in% c('Female', 'Male')) %>% inner_join(tmp_df) %>% dplyr::group_by(dataGroups, gender, num) %>% dplyr::summarise(prop =n_distinct(healthCode)) %>% mutate(prop = round((prop / num)*100, digits=2))
p2 <- ggplot(data=gender, aes(x=gender, y=prop, fill=dataGroups )) + geom_bar(stat="identity", position="dodge", width = .7) + theme_bw()  + ylab("Percent")
p2 <- p2 + theme_few() + scale_fill_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"),  labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis")) + theme(legend.position = c(0.3, 0.85))
genderGraph <- p2 + theme(text = element_text(size=15)) + ggtitle('Gender') + xlab("Gender")
```

```{r}
p3 <- ggplot(data=demog_total %>% filter(!is.na(gender)), aes(x=gender, y=age, fill=dataGroups )) + geom_boxplot(width = .7) + theme_bw()  + ylab('Age') + xlab('Gender')
p3 <- p3 + theme_few() + scale_fill_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"),  labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis")) + theme(legend.position = 'none')
ageGraph <- p3 + theme(text = element_text(size=15)) + ggtitle('Age')
```

```{r}
race = demog_total %>% filter(!is.na(race)) %>% dplyr::group_by(dataGroups, race) %>% dplyr::summarise(prop =n_distinct(healthCode))
#levels <- race %>% filter(dataGroups %in% unique(userActivity$dataGroups)) %>% arrange(desc(prop)) %>% .$race
#race$race <- rev(levels)
p5 <- ggplot(data=race, aes(x=race, y=prop, fill=dataGroups)) + geom_bar(stat="identity", width = .7) + theme_bw()  + ylab('Number')
p5 <- p5 + theme_few() + scale_fill_manual(values=c("#A5BE00", "#4286f4", "#FC4E07"),  labels=c('Beta Thalassemia', 'Myelodysplastic Syndrome', "Myelofibrosis")) + theme(legend.position = c(0.6, 0.5))
raceGraph <- p5 + theme(text = element_text(size=15)) + ggtitle('Race') + coord_flip() + xlab('')
```


```{r,fig.width=15, fig.height=10}
grid.arrange(genderGraph,ageGraph,raceGraph,missingGraph, ncol=2)
```

```{r, fig.width=15, fig.height=10}
if (!require("lbscorer",character.only = TRUE)) {
  install_github("philerooski/lbscorer", silent=TRUE)
}
library(lbscorer)
questionCols <- names(healthSurvey)[12:23]
sf <- healthSurvey[c("recordId", "healthCode", "dataGroups", questionCols)] %>% filter(dataGroups %in% unique(userActivity$dataGroups)) %>% na.omit()
sfScores <- sf12(sf[questionCols])
names(sfScores) <- c("Physical Function", "Role Physical", "Bodily Pain", "General Health", "Vitality", "Social Function", "Role Emotional", "Mental Health", "Physical Summary", "Mental Summary")
df <- cbind(sf[c("recordId", "healthCode", "dataGroups")], sfScores)
dfG <- df %>% gather(cols, value, -dataGroups, -recordId, -healthCode)
dfGBS <- dfG %>% filter(dataGroups == "beta_thalassemia")
dfGMS <- dfG %>% filter(dataGroups == "myelodysplastic_syndrome")
dfGM <- dfG %>% filter(dataGroups == "myelofibrosis")
par(cex.axis=0.8)
boxplot(sfScores, main="SF12-v2 Scores", xlab="Category", ylab="Score")
stripchart(value ~ cols, vertical = TRUE, data = dfGBS,
    method = "jitter", add = TRUE, pch = 20, col = "#A5BE0050")
stripchart(value ~ cols, vertical = TRUE, data = dfGMS,
    method = "jitter", add = TRUE, pch = 20, col = "#4286f450")
stripchart(value ~ cols, vertical = TRUE, data = dfGMS,
    method = "jitter", add = TRUE, pch = 20, col = "#FC4E0750")
```
--------