# dantalus/Morning2

## Morning2


  urlfile <- "https://raw.githubusercontent.com/dantalus/intro_workshop/master/plot.csv"
# Base R reader: returns a plain data.frame.
  plotDf <- read.csv(urlfile)

# install.packages(c("ggthemes", "ggbeeswarm"))
  library(ggthemes)
  library(ggbeeswarm)
  library(ggplot2)
  library(dplyr)

# What is a tibble?
  class(plotDf)

# as_tibble() replaces the deprecated as.tbl(). The result is only printed
# here; plotDf itself is not modified.
  as_tibble(plotDf)

  plotDf <- as_tibble(plotDf)

# Go back to a plain data.frame with the supported coercion function rather
# than overwriting the class attribute by hand.
  plotDf <- as.data.frame(plotDf)

# readr reader: returns a tibble directly.
  library(readr)
  plotDf <- read_csv(urlfile)

# Two equivalent spellings of "read with base R, then convert": a nested call
# and a pipe. (as_tbl() is not a dplyr function, so as_tibble() is used.)
  plotDf <- as_tibble(read.csv(urlfile))
  plotDf <- read.csv(urlfile) %>% as_tibble()

# Rename variables
# Base R
# Copy demo_age into a new column called age (the original column remains).
  plotDf$age <- plotDf$demo_age
# Print the data without column 5 (dropped by position) ...
  plotDf[, -5]
# ... or without any column whose name matches "demo_age" (dropped by name).
  plotDf[, !grepl("demo_age", names(plotDf))]

# Rename the fifth column in place.
# NOTE(review): presumably column 5 is demo_age -- confirm against the CSV.
# An `age` column was already added above, so this may duplicate the name.
  names(plotDf)[5] <- "age"

# dplyr

# Re-read the file so the dplyr example starts from the original names.
  plotDf <-read_csv(urlfile)

# rename(new = old); the result is printed, not assigned.
  rename(plotDf, age = demo_age)


# Select rows ####
# Base R: index with [rows, ] using positions or a logical vector
  plotDf[seq_len(5), ]

  plotDf[plotDf[["id"]] > 199, ]
  plotDf[plotDf[["id"]] > 199 & plotDf[["demo_gender"]] == "Male", ]
  plotDf[grepl(" A", plotDf[["arm"]]), ]  # character matching

# The logical selector can be stored first and then reused.
  r <- plotDf[["id"]] > 199
  plotDf[r, ]

# Rows whose id is any of the listed values.
  r <- is.element(plotDf[["id"]], c(201, 202, 101))
  plotDf[r, ]

# subset() evaluates the condition inside the data frame.
  subset(plotDf, id > 199)

# dplyr: filter() keeps rows where the condition holds; comma-separated
# conditions are combined with AND.

  plotDf %>% filter(id > 199, demo_gender == "Male")
  plotDf %>% filter(id > 199 | demo_gender == "Male")
  plotDf %>%
    filter((id > 199 & demo_gender == "Male") |
             (id < 199 & demo_gender == "Female"))

# Putting rows back together

# Split the data into two pieces by id ...
  a <- filter(plotDf, id > 199)
  b <- filter(plotDf, id <= 199)

# ... and stack them again; both pieces still share the same column names.
  rbind(a, b)

# Same split, but this time demo_age is renamed to age in one piece only.
  a <- filter(plotDf, id > 199) %>%
       rename(age = demo_age)
  b <- filter(plotDf, id <= 199)

# The column names of a and b no longer match, so stacking fails.
  rbind(a, b) #error

# Copy a's column names onto b so the two pieces line up again.
  names(b) <- names(a)

# select columns ####

# Base R

# Pull out a single column by name with `$` or by position with `[, 1]`,
# piping into class() to compare what each form returns.
  plotDf$glvef %>% class()
  plotDf[, 1] %>% class()
# `[[1]]` extracts the first column itself; `[1]` keeps it inside a
# one-column table.
  plotDf[[1]]
  plotDf[1]


# Two ways of reaching the value in row 1 of column 1.
  plotDf[1, 1]
  plotDf[[1]][1]


# Select by column name, or by a pattern matched against the column names.
  plotDf[, "glvef"]
  plotDf[, grepl("demo", names(plotDf))]

# dplyr

# Pick columns by name, or a range of adjacent columns with `:`.
  select(plotDf, id, glvef)
  select(plotDf, glvef:id)

# Helper that matches every column whose name starts with "demo".
  select(plotDf, starts_with("demo"))

# Select and rename in one step (new_name = old_name).
  select(plotDf, id, GLVEF = glvef)

# Reorder: arm, id, time first, then every remaining column.
  plotDf <- select(plotDf, arm, id, time, everything() )

# Putting columns back together

# Split the columns into two tables.
  a <- select(plotDf, id, glvef)
  b <-select(plotDf, starts_with("demo"))

# cbind() pastes the tables side by side purely by row position; nothing
# checks that row i of a and row i of b describe the same observation.
  cbind(a, b) # danger

# Keep the identifying columns (id, time) in both pieces ...
  a <- select(plotDf, id, time, glvef)
  b <- select(plotDf, id, time, starts_with("demo"))

# ... so the pieces can be matched on those keys instead of on row order.
  g <- full_join(a, b, by = c("id", "time"))

# cbind() still ignores the keys, and id and time now appear in both inputs.
  cbind(a, b) # danger


# New/modify variables
# Base R
# BMI = weight in kg / (height in m)^2; demo_ht_cm is divided by 100 first.
  plotDf$bmi <- plotDf$demo_wt_kg / (plotDf$demo_ht_cm / 100)^2

# Set demo_ht_cm to NA for the rows with id 107, then inspect the result.
# The bmi column computed above is not recalculated by this assignment.
  plotDf[plotDf$id == 107, ]$demo_ht_cm <- NA
  View(plotDf)

# Set demo_age to NA for every row where demo_gender is "Male".
  plotDf[plotDf$demo_gender == "Male", ]$demo_age <- NA

# Re-read the file to discard the modifications made above.
  plotDf <-read_csv(urlfile)

# dplyr

# mutate() adds (or overwrites) columns; other columns can be used by name.
  plotDf <- mutate(plotDf, bmi = demo_wt_kg / (demo_ht_cm / 100)^2)

# Add a percentile rank of bmi and view it without saving the result.
  mutate(plotDf, bmi_rank = percent_rank(bmi)) %>% View()

# Both steps chained with the pipe, this time keeping the result.
  plotDf <- mutate(plotDf, bmi = demo_wt_kg / (demo_ht_cm / 100)^2) %>%
            mutate(bmi_rank = percent_rank(bmi))

# Arrange by rows

# Base R: order() returns the row permutation, evaluated inside the data

  plotDf[with(plotDf, order(demo_age)), ]
  plotDf[with(plotDf, order(arm, id, time)), ]

# dplyr

  plotDf %>% arrange(arm, id, time)

# Unique rows

# Keep the first row seen for each id.
  plotDf[!duplicated(plotDf[["id"]]), ]

# One-column table of the distinct ids.
  unique(plotDf["id"])

# dplyr

# First row per id with all columns kept, then the distinct ids flattened
# to a plain numeric vector.
  plotDf %>% distinct(id, .keep_all = TRUE)
  plotDf %>% distinct(id) %>% unlist() %>% as.numeric()


# First row for each id/time combination.
  plotDf %>% distinct(id, time, .keep_all = TRUE)

# Summarizing

# Whole-table summary: a single row with the row count and the mean and
# minimum of glvef (missing values ignored).
  summarise(plotDf,
            n = n(),
            mean_glvef = mean(glvef, na.rm = TRUE),
            min = min(glvef, na.rm = TRUE))

# One summary row per arm x gender combination; ungroup() so the result does
# not carry leftover grouping.
  group_by(plotDf, arm, demo_gender) %>%
    summarise(n = n(),
              mean_glvef = mean(glvef, na.rm = TRUE)) %>%
    ungroup()

# Group, summarise, join
# The summary has one row per arm, so joining it back on "arm" attaches the
# per-arm statistics to every original row.
  group_by(plotDf, arm) %>%
    summarise(n = n(),
              mean_glvef = mean(glvef, na.rm = TRUE)) %>%
    full_join(plotDf, by = "arm") %>% View()

# Group, mutate
# A grouped mutate() already returns one row per original row, so no join is
# needed. Joining back on demo_gender would pair every row with every other
# row of the same gender.
# NOTE(review): the new column is called bmi_rank but ranks demo_ht_cm --
# confirm which variable was intended.
  group_by(plotDf, demo_gender) %>%
    mutate(bmi_rank = percent_rank(demo_ht_cm)) %>%
    ungroup() %>%
    View()


# Back to our plot ####

  plotDf <- read.csv(urlfile)

# Turn arm and time into factors so facets and the x axis have a fixed order.
# NOTE(review): factor(arm, labels = ...) assigns the labels to the sorted
# unique values of arm -- confirm that order is placebo, low dose, high dose.
  plotDf <- plotDf %>%
    mutate(arm = factor(arm, labels = c("Placebo", "Low Dose", "High Dose")),
           time = factor(time, levels = c("Baseline", "8 weeks")))

# Mean and SD of glvef for every arm x time cell, plus the x position where
# the summary is drawn: left of Baseline (0.5) or right of 8 weeks (2.5).
# Rows with a missing arm or time are excluded from the summaries.
  glvefSummary <- plotDf %>%
    filter(!is.na(arm), !is.na(time)) %>%
    group_by(arm, time) %>%
    summarise(glvef_mean = mean(glvef, na.rm = TRUE),
              glvef_sd = sd(glvef, na.rm = TRUE)) %>%
    ungroup() %>%
    mutate(x = ifelse(time == "Baseline", 0.5, 2.5))

# time is mapped through as.numeric() (Baseline = 1, 8 weeks = 2) so the
# summaries can be placed at non-integer positions on a continuous axis.
  ggplot(plotDf, aes(y = glvef, shape = arm)) +
    geom_violin(aes(x = as.numeric(time), group = time),
                color = "grey90", fill = "grey90", width = .5) +
    # One dashed line per participant connecting their two visits.
    geom_line(aes(group = id, x = as.numeric(time)),
              alpha = 0.7, linetype = "dashed") +
    facet_wrap(~arm) +
    theme_base() +
    scale_x_continuous(breaks = c(1, 2),
                       labels = c("Baseline", "8 weeks")) +
    geom_point(aes(group = id, x = as.numeric(time))) +
    # Line thickness is set with `linewidth` (ggplot2 >= 3.4.0); `size` is
    # deprecated for line geoms.
    geom_smooth(aes(x = as.numeric(time)),
                method = "lm", se = FALSE, linewidth = 2, color = "black") +
    # Mean +/- 1 SD per arm and time point, drawn from the summary table;
    # the arm column routes each summary to its facet.
    geom_pointrange(data = glvefSummary,
                    aes(x = x,
                        y = glvef_mean,
                        ymin = glvef_mean - glvef_sd,
                        ymax = glvef_mean + glvef_sd)) +
    # scale_color_brewer(guide = FALSE, palette = "Set1") +
    # scale_fill_manual(guide  = FALSE, palette = "Set1") +
    # guide = "none" replaces the deprecated guide = FALSE.
    scale_shape(guide = "none") +
    xlab("") +
    ylab("GLVEF (%)") +
    ylim(0, 75) +
    theme(panel.spacing = unit(2, "lines"),
          panel.border = element_rect(color = "white"),
          axis.text.y = element_text(size = 16),
          strip.text = element_text(size = 16))


	urlfile <- "https://raw.githubusercontent.com/dantalus/intro_workshop/master/plot.csv"
	# Base R reader: returns a plain data.frame.
	plotDf <- read.csv(urlfile)

	# install.packages(c("ggthemes", "ggbeeswarm"))
	library(ggthemes)
	library(ggbeeswarm)
	library(ggplot2)
	library(dplyr)

	# What is a tibble?
	class(plotDf)

	# as_tibble() replaces the deprecated as.tbl(). The result is only printed
	# here; plotDf itself is not modified.
	as_tibble(plotDf)

	plotDf <- as_tibble(plotDf)

	# Go back to a plain data.frame with the supported coercion function rather
	# than overwriting the class attribute by hand.
	plotDf <- as.data.frame(plotDf)

	# readr reader: returns a tibble directly.
	library(readr)
	plotDf <- read_csv(urlfile)

	# Two equivalent spellings of "read with base R, then convert": a nested
	# call and a pipe. (as_tbl() is not a dplyr function, so as_tibble() is used.)
	plotDf <- as_tibble(read.csv(urlfile))
	plotDf <- read.csv(urlfile) %>% as_tibble()

	# Rename variables
	# Base R
	# Copy demo_age into a new column called age (the original column remains).
	plotDf$age <- plotDf$demo_age
	# Print the data without column 5 (dropped by position) ...
	plotDf[, -5]
	# ... or without any column whose name matches "demo_age" (dropped by name).
	plotDf[, !grepl("demo_age", names(plotDf))]

	# Rename the fifth column in place.
	# NOTE(review): presumably column 5 is demo_age -- confirm against the CSV.
	# An `age` column was already added above, so this may duplicate the name.
	names(plotDf)[5] <- "age"

	# dplyr

	# Re-read the file so the dplyr example starts from the original names.
	plotDf <-read_csv(urlfile)

	# rename(new = old); the result is printed, not assigned.
	rename(plotDf, age = demo_age)


	# Select rows ####
	# Base R: index with [rows, ] using positions or a logical vector
	plotDf[1:5, ]

	plotDf[plotDf$id > 199, ]
	plotDf[plotDf$id > 199 & plotDf$demo_gender == "Male", ]
	plotDf[grepl(" A", plotDf$arm), ] # character matching

	# The logical selector can be stored first and then reused.
	r <- plotDf$id > 199
	plotDf[r, ]

	# %in% keeps rows whose id is any of the listed values.
	r <- plotDf$id %in% c(201, 202, 101)
	plotDf[r, ]

	# subset() evaluates the condition inside the data frame.
	subset(plotDf, id > 199)

	# dplyr: filter() keeps rows where the condition holds.
	# The OR operator is a plain `|`; a backslash before it is a syntax error.

	filter(plotDf, id > 199 & demo_gender == "Male")
	filter(plotDf, id > 199 | demo_gender == "Male")
	filter(plotDf, (id > 199 & demo_gender == "Male") |
	               (id < 199 & demo_gender == "Female"))

	# Putting rows back together

	# Split the data into two pieces by id ...
	a <- filter(plotDf, id > 199)
	b <- filter(plotDf, id <= 199)

	# ... and stack them again; both pieces still share the same column names.
	rbind(a, b)

	# Same split, but this time demo_age is renamed to age in one piece only.
	a <- filter(plotDf, id > 199) %>%
	rename(age = demo_age)
	b <- filter(plotDf, id <= 199)

	# The column names of a and b no longer match, so stacking fails.
	rbind(a, b) #error

	# Copy a's column names onto b so the two pieces line up again.
	names(b) <- names(a)

	# select columns ####

	# Base R

	# Pull out a single column by name with `$` or by position with `[, 1]`,
	# piping into class() to compare what each form returns.
	plotDf$glvef %>% class()
	plotDf[, 1] %>% class()
	# `[[1]]` extracts the first column itself; `[1]` keeps it inside a
	# one-column table.
	plotDf[[1]]
	plotDf[1]


	# Two ways of reaching the value in row 1 of column 1.
	plotDf[1, 1]
	plotDf[[1]][1]


	# Select by column name, or by a pattern matched against the column names.
	plotDf[, "glvef"]
	plotDf[, grepl("demo", names(plotDf))]

	# dplyr

	# Pick columns by name, or a range of adjacent columns with `:`.
	select(plotDf, id, glvef)
	select(plotDf, glvef:id)

	# Helper that matches every column whose name starts with "demo".
	select(plotDf, starts_with("demo"))

	# Select and rename in one step (new_name = old_name).
	select(plotDf, id, GLVEF = glvef)

	# Reorder: arm, id, time first, then every remaining column.
	plotDf <- select(plotDf, arm, id, time, everything() )

	# Putting columns back together

	# Split the columns into two tables.
	a <- select(plotDf, id, glvef)
	b <-select(plotDf, starts_with("demo"))

	# cbind() pastes the tables side by side purely by row position; nothing
	# checks that row i of a and row i of b describe the same observation.
	cbind(a, b) # danger

	# Keep the identifying columns (id, time) in both pieces ...
	a <- select(plotDf, id, time, glvef)
	b <- select(plotDf, id, time, starts_with("demo"))

	# ... so the pieces can be matched on those keys instead of on row order.
	g <- full_join(a, b, by = c("id", "time"))

	# cbind() still ignores the keys, and id and time now appear in both inputs.
	cbind(a, b) # danger


	# New/modify variables
	# Base R
	# BMI = weight in kg / (height in m)^2; demo_ht_cm is divided by 100 first.
	plotDf$bmi <- plotDf$demo_wt_kg / (plotDf$demo_ht_cm / 100)^2

	# Set demo_ht_cm to NA for the rows with id 107, then inspect the result.
	# The bmi column computed above is not recalculated by this assignment.
	plotDf[plotDf$id == 107, ]$demo_ht_cm <- NA
	View(plotDf)

	# Set demo_age to NA for every row where demo_gender is "Male".
	plotDf[plotDf$demo_gender == "Male", ]$demo_age <- NA

	# Re-read the file to discard the modifications made above.
	plotDf <-read_csv(urlfile)

	# dplyr

	# mutate() adds (or overwrites) columns; other columns can be used by name.
	plotDf <- mutate(plotDf, bmi = demo_wt_kg / (demo_ht_cm / 100)^2)

	# Add a percentile rank of bmi and view it without saving the result.
	mutate(plotDf, bmi_rank = percent_rank(bmi)) %>% View()

	# Both steps chained with the pipe, this time keeping the result.
	plotDf <- mutate(plotDf, bmi = demo_wt_kg / (demo_ht_cm / 100)^2) %>%
	mutate(bmi_rank = percent_rank(bmi))

	# Arrange by rows

	# Base R: order() returns the row permutation, evaluated inside the data

	plotDf[with(plotDf, order(demo_age)), ]
	plotDf[with(plotDf, order(arm, id, time)), ]

	# dplyr

	plotDf %>% arrange(arm, id, time)

	# Unique rows

	# Keep the first row seen for each id.
	plotDf[!duplicated(plotDf[["id"]]), ]

	# One-column table of the distinct ids.
	unique(plotDf["id"])

	# dplyr

	# First row per id with all columns kept, then the distinct ids flattened
	# to a plain numeric vector.
	plotDf %>% distinct(id, .keep_all = TRUE)
	plotDf %>% distinct(id) %>% unlist() %>% as.numeric()


	# First row for each id/time combination.
	plotDf %>% distinct(id, time, .keep_all = TRUE)

	# Summarizing

	# Whole-table summary: a single row with the row count and the mean and
	# minimum of glvef (missing values ignored).
	summarise(plotDf,
	          n = n(),
	          mean_glvef = mean(glvef, na.rm = TRUE),
	          min = min(glvef, na.rm = TRUE))

	# One summary row per arm x gender combination; ungroup() so the result
	# does not carry leftover grouping.
	group_by(plotDf, arm, demo_gender) %>%
	  summarise(n = n(),
	            mean_glvef = mean(glvef, na.rm = TRUE)) %>%
	  ungroup()

	# Group, summarise, join
	# The summary has one row per arm, so joining it back on "arm" attaches the
	# per-arm statistics to every original row.
	group_by(plotDf, arm) %>%
	  summarise(n = n(),
	            mean_glvef = mean(glvef, na.rm = TRUE)) %>%
	  full_join(plotDf, by = "arm") %>% View()

	# Group, mutate
	# A grouped mutate() already returns one row per original row, so no join
	# is needed. Joining back on demo_gender would pair every row with every
	# other row of the same gender.
	# NOTE(review): the new column is called bmi_rank but ranks demo_ht_cm --
	# confirm which variable was intended.
	group_by(plotDf, demo_gender) %>%
	  mutate(bmi_rank = percent_rank(demo_ht_cm)) %>%
	  ungroup() %>%
	  View()


	# Back to our plot ####

	plotDf <- read.csv(urlfile)

	# Turn arm and time into factors so facets and the x axis have a fixed order.
	# NOTE(review): factor(arm, labels = ...) assigns the labels to the sorted
	# unique values of arm -- confirm that order is placebo, low dose, high dose.
	plotDf <- plotDf %>%
	  mutate(arm = factor(arm, labels = c("Placebo", "Low Dose", "High Dose")),
	         time = factor(time, levels = c("Baseline", "8 weeks")))

	# Mean and SD of glvef for every arm x time cell, plus the x position where
	# the summary is drawn: left of Baseline (0.5) or right of 8 weeks (2.5).
	# Rows with a missing arm or time are excluded from the summaries.
	glvefSummary <- plotDf %>%
	  filter(!is.na(arm), !is.na(time)) %>%
	  group_by(arm, time) %>%
	  summarise(glvef_mean = mean(glvef, na.rm = TRUE),
	            glvef_sd = sd(glvef, na.rm = TRUE)) %>%
	  ungroup() %>%
	  mutate(x = ifelse(time == "Baseline", 0.5, 2.5))

	# time is mapped through as.numeric() (Baseline = 1, 8 weeks = 2) so the
	# summaries can be placed at non-integer positions on a continuous axis.
	ggplot(plotDf, aes(y = glvef, shape = arm)) +
	  geom_violin(aes(x = as.numeric(time), group = time),
	              color = "grey90", fill = "grey90", width = .5) +
	  # One dashed line per participant connecting their two visits.
	  geom_line(aes(group = id, x = as.numeric(time)),
	            alpha = 0.7, linetype = "dashed") +
	  facet_wrap(~arm) +
	  theme_base() +
	  scale_x_continuous(breaks = c(1, 2),
	                     labels = c("Baseline", "8 weeks")) +
	  geom_point(aes(group = id, x = as.numeric(time))) +
	  # Line thickness is set with `linewidth` (ggplot2 >= 3.4.0); `size` is
	  # deprecated for line geoms.
	  geom_smooth(aes(x = as.numeric(time)),
	              method = "lm", se = FALSE, linewidth = 2, color = "black") +
	  # Mean +/- 1 SD per arm and time point, drawn from the summary table;
	  # the arm column routes each summary to its facet.
	  geom_pointrange(data = glvefSummary,
	                  aes(x = x,
	                      y = glvef_mean,
	                      ymin = glvef_mean - glvef_sd,
	                      ymax = glvef_mean + glvef_sd)) +
	  # scale_color_brewer(guide = FALSE, palette = "Set1") +
	  # scale_fill_manual(guide = FALSE, palette = "Set1") +
	  # guide = "none" replaces the deprecated guide = FALSE.
	  scale_shape(guide = "none") +
	  xlab("") +
	  ylab("GLVEF (%)") +
	  ylim(0, 75) +
	  theme(panel.spacing = unit(2, "lines"),
	        panel.border = element_rect(color = "white"),
	        axis.text.y = element_text(size = 16),
	        strip.text = element_text(size = 16))