njahn82/03-manipulating-data.R

## 03-manipulating-data.R
#' Manipulating, analyzing and exporting data with tidyverse
#' https://datacarpentry.org/R-ecology-lesson/03-dplyr.html

library(tidyverse)

#' dplyr
#' - the most common data manipulation tasks
#' - https://github.com/rstudio/cheatsheets/raw/master/data-transformation.pdf
#' tidyr
#' - reshape your data
#' - https://github.com/rstudio/cheatsheets/raw/master/data-import.pdf

surveys <- read_csv("data_raw/portal_data_joined.csv")

## inspect the data
str(surveys)

## preview the data
view(surveys)

#' dplyr functions:
#' - select(): subset columns
#' - filter(): subset rows on conditions
#' - mutate(): create new columns by using information from other columns
#' - group_by() and summarize(): create summary statistics on grouped data
#' - arrange(): sort results
#' - count(): count discrete values

#' Selecting columns and filtering rows
#' select()
#' first argument: the data frame (surveys),
#' the subsequent arguments: the columns to keep
select(surveys, plot_id, species_id, weight)

# select all columns except certain ones: "-"
select(surveys, -record_id, -species_id)

# choose rows based on a specific criterion, use filter():
filter(surveys, year == 1995)
filter(surveys, year %in% 1995:2000)

# exclude
filter(surveys, year != 1995)
filter(surveys, !year %in% 1995:2000)
filter(surveys, !(year %in% 1995:2000))

#' Pipes
#' temporary data frames
surveys2 <- filter(surveys, weight < 5)
surveys_sml <- select(surveys2, species_id, sex, weight)

# nesting
surveys_sml <-
  select(filter(surveys, weight < 5),
         species_id, sex, weight)

# Pipe!
# %>% via the magrittr package
# Ctrl + Shift + M
# |> native pipe, i.e. you d need to laod the magrittr

surveys %>% # then
  filter(weight < 5) %>%
  select(species_id, sex, weight)

# save to new object
surveys_sml <- surveys %>%
  filter(weight < 5) %>%
  select(species_id, sex, weight)

surveys_sml

#' Challenge
#' 1. Using pipes, subset the surveys data to include animals collected
#'    before 1995 and retain only the columns year, sex, and weight.
surveys %>%
  filter(year < 1995) %>%
  select(year, sex, weight)


#' Mutate
#' create new columns
surveys %>%
  mutate(weight_kg = weight / 1000)

# reuse
surveys %>%
  mutate(weight_kg = weight / 1000,
         weight_lb = weight_kg * 2.2)

# add more pipes
surveys %>%
  mutate(weight_kg = weight / 1000) %>%
  head()

# even more
surveys %>%
  filter(!is.na(weight)) %>%
  mutate(weight_kg = weight / 1000) %>%
  head()

#' Challenge
#' 1. Create a new data frame from the surveys data that meets the following
#'    criteria: contains only the species_id column and a new column called
#'    hindfoot_cm containing the hindfoot_length values converted to centimeters.
#'    In this hindfoot_cm column, there are no NAs and all values are less than 3.
#'    Hint: think about how the commands should be ordered to produce this
#'    data frame!
surveys_hindfoot_cm <- surveys %>%
  filter(!is.na(hindfoot_length)) %>%
  mutate(hindfoot_cm = hindfoot_length / 10) %>%
  filter(hindfoot_cm < 3) |>
  select(species_id, hindfoot_cm)

surveys_hindfoot_cm
#' Split-apply-combine data analysis and the summarize() function
#' 1. split the data into groups,
#' 2. apply some analysis to each group,
#' 3. combine the results

surveys %>%
  group_by(sex) %>%
  summarize(mean_weight = mean(weight, na.rm = TRUE))

# group by multiple columns
surveys %>%
  group_by(sex, species_id) %>%
  summarize(mean_weight = mean(weight, na.rm = TRUE)) %>%
  tail()

# combine with filter
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarize(mean_weight = mean(weight))

# print(n = )
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarize(mean_weight = mean(weight)) %>%
  print(n = 40)

# summarize multiple variables
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarize(mean_weight = mean(weight),
            min_weight = min(weight))

# rearrange (sort) the result
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarize(mean_weight = mean(weight),
            min_weight = min(weight)) %>%
  arrange(min_weight)

# descending order
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarize(mean_weight = mean(weight),
            min_weight = min(weight)) %>%
  arrange(desc(mean_weight))

# Counting
surveys %>%
  count(sex)

# equivalent of
surveys %>%
  group_by(sex) %>%
  summarise(count = n())

# with sort
surveys %>%
  count(sex, sort = TRUE)

# count combination of factors
surveys %>%
  count(sex, species)

# sort
surveys %>%
  count(sex, species) %>%
  arrange(species, desc(n))

#' Challenge
#'
#' 1. How many animals were caught in each plot_type surveyed?
#'
surveys %>%
  count(plot_type)

#' 2. Use group_by() and summarize() to find the mean, min, and max hindfoot
#'    length for each species (using species_id). Also add the number of
#'    observations (hint: see ?n).
#'
surveys %>%
  filter(!is.na(hindfoot_length)) %>%
  group_by(species_id) %>%
  summarise(
    mean_hindfoot_length = mean(hindfoot_length),
    min_hindfoot_length = min(hindfoot_length),
    max_hindfoot_length = max(hindfoot_length),
    n = n()
  ) %>%
  View()
#'
#' 3. What was the heaviest animal measured in each year? Return the columns
#'    year, genus, species_id, and weight.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(year) %>%
  filter(weight == max(weight)) %>%
  select(year, genus, species_id, weight) %>%
  arrange(year)

#' Reshaping with pivot_wider and pivot_longer
#' tidy dataset:
#' - Each variable has its own column
#' - Each observation has its own row
#' - Each value must have its own cell
#' - Each type of observational unit forms a table

# pivot_wider() takes three principal arguments:

# the data
# the names_from column variable whose values will become new column names.
# the values_from column variable whose values will fill the new column variables.

surveys_gw <- surveys %>%
  filter(!is.na(weight)) %>%
  group_by(plot_id, genus) %>%
  summarize(mean_weight = mean(weight))

str(surveys_gw)
surveys_gw

# and now pivot_wider
surveys_wide <- surveys_gw %>%
  pivot_wider(names_from = genus, values_from = mean_weight)

str(surveys_wide)

surveys_wide

# fill in the missing values
surveys_gw %>%
  pivot_wider(names_from = genus, values_from = mean_weight, values_fill = 0) |>
  head()

#' pivot_longer
#' pivot_longer() takes four principal arguments:
#' - the data
#' - the key column variable we wish to create from column names.
#' - the values column variable we wish to create and fill with values
#'   associated with the key.
#' - the names of the columns we use to fill the key variable (or to drop).

surveys_long <- surveys_wide %>%
  pivot_longer(names_to = "genus", values_to = "mean_weight", cols = -plot_id)

str(surveys_long)

# specify columns
surveys_long <- surveys_wide %>%
  pivot_longer(names_to = "genus", values_to = "mean_weight", cols = -plot_id)

str(surveys_long)

#' Challenge
#'
#' 1. Reshape the surveys data frame with year as columns,
#'    plot_id as rows, and the number of genera per plot as the values.
#'    You will need to summarize before reshaping, and use the
#'    function n_distinct() to get the number of unique genera
#'    within a particular chunk of data.
#'    It’s a powerful function! See ?n_distinct for more.
surveys_wide <- surveys %>%
  group_by(plot_id, year) %>%
  summarise(n = n_distinct(genus)) %>%
  pivot_wider(names_from = year, values_from = n)

surveys_wide
#' 2. Now take that data frame and pivot_longer it again, so each row is a unique
#'    plot_id by year combination.
surveys_wide %>%
  pivot_longer(names_to = "year", values_to = "n", cols = -plot_id)

#' 3. The surveys data set has two measurement columns:
#'    hindfoot_length and weight.
#'    This makes it difficult to do things like look at the
#'    relationship between mean values of each measurement per year
#'    in different plot types.
#'    Let’s walk through a common solution for this type of problem.
#'    First, use pivot_longer() to create a dataset where we
#'    have a names column called measurement and a value column
#'    that takes on the value of either hindfoot_length or weight.
#'    Hint: You’ll need to specify which columns will be part of the
#'    reshape.
#'
surveys_long <- surveys %>%
  pivot_longer(names_to = "measurement", values_to = "value",
               cols = c(hindfoot_length, weight))

#' 4. With this new data set, calculate the average of each measurement
#'    in each year for each different plot_type.
#'    Then pivot_wider() them into a data set with a column for
#'    hindfoot_length and weight.
#'    Hint: You only need to specify the names and values columns
#'    for pivot_wider().
surveys_long %>%
  group_by(year, measurement, plot_type) %>%
  summarize(mean_value = mean(value, na.rm = TRUE)) %>%
  pivot_wider(names_from = measurement, values_from = mean_value, names_prefix = "mean_")


#' Exporting data
#' write_csv()

surveys_complete <- surveys %>%
  filter(!is.na(weight),           # remove missing weight
         !is.na(hindfoot_length),  # remove missing hindfoot_length
         !is.na(sex))              # remove missing sex

surveys_complete

## Extract the most common species_id
species_counts <- surveys_complete %>%
  count(species_id) %>%
  filter(n >= 50)

species_counts

## Only keep the most common species
surveys_complete <- surveys_complete %>%
  filter(species_id %in% species_counts$species_id)

surveys_complete

# save data
write_csv(surveys_complete, file = "data/surveys_complete.csv")
	#' Manipulating, analyzing and exporting data with tidyverse
	#' https://datacarpentry.org/R-ecology-lesson/03-dplyr.html

	library(tidyverse)

	#' dplyr
	#' - the most common data manipulation tasks
	#' - https://github.com/rstudio/cheatsheets/raw/master/data-transformation.pdf
	#' tidyr
	#' - reshape your data
	#' - https://github.com/rstudio/cheatsheets/raw/master/data-import.pdf

	surveys <- read_csv("data_raw/portal_data_joined.csv")

	## inspect the data
	str(surveys)

	## preview the data
	view(surveys)

	#' dplyr functions:
	#' - select(): subset columns
	#' - filter(): subset rows on conditions
	#' - mutate(): create new columns by using information from other columns
	#' - group_by() and summarize(): create summary statistics on grouped data
	#' - arrange(): sort results
	#' - count(): count discrete values

	#' Selecting columns and filtering rows
	#' select()
	#' first argument: the data frame (surveys),
	#' the subsequent arguments: the columns to keep
	select(surveys, plot_id, species_id, weight)

	# select all columns except certain ones: "-"
	select(surveys, -record_id, -species_id)

	# choose rows based on a specific criterion, use filter():
	filter(surveys, year == 1995)
	filter(surveys, year %in% 1995:2000)

	# exclude
	filter(surveys, year != 1995)
	filter(surveys, !year %in% 1995:2000)
	filter(surveys, !(year %in% 1995:2000))

	#' Pipes
	#' temporary data frames
	surveys2 <- filter(surveys, weight < 5)
	surveys_sml <- select(surveys2, species_id, sex, weight)

	# nesting
	surveys_sml <-
	select(filter(surveys, weight < 5),
	species_id, sex, weight)

	# Pipe!
	# %>% via the magrittr package
	# Ctrl + Shift + M
	# \|> native pipe, i.e. you d need to laod the magrittr

	surveys %>% # then
	filter(weight < 5) %>%
	select(species_id, sex, weight)

	# save to new object
	surveys_sml <- surveys %>%
	filter(weight < 5) %>%
	select(species_id, sex, weight)

	surveys_sml

	#' Challenge
	#' 1. Using pipes, subset the surveys data to include animals collected
	#' before 1995 and retain only the columns year, sex, and weight.
	surveys %>%
	filter(year < 1995) %>%
	select(year, sex, weight)



	#' Mutate
	#' create new columns
	surveys %>%
	mutate(weight_kg = weight / 1000)

	# reuse
	surveys %>%
	mutate(weight_kg = weight / 1000,
	weight_lb = weight_kg * 2.2)

	# add more pipes
	surveys %>%
	mutate(weight_kg = weight / 1000) %>%
	head()

	# even more
	surveys %>%
	filter(!is.na(weight)) %>%
	mutate(weight_kg = weight / 1000) %>%
	head()

	#' Challenge
	#' 1. Create a new data frame from the surveys data that meets the following
	#' criteria: contains only the species_id column and a new column called
	#' hindfoot_cm containing the hindfoot_length values converted to centimeters.
	#' In this hindfoot_cm column, there are no NAs and all values are less than 3.
	#' Hint: think about how the commands should be ordered to produce this
	#' data frame!
	surveys_hindfoot_cm <- surveys %>%
	filter(!is.na(hindfoot_length)) %>%
	mutate(hindfoot_cm = hindfoot_length / 10) %>%
	filter(hindfoot_cm < 3) \|>
	select(species_id, hindfoot_cm)

	surveys_hindfoot_cm
	#' Split-apply-combine data analysis and the summarize() function
	#' 1. split the data into groups,
	#' 2. apply some analysis to each group,
	#' 3. combine the results

	surveys %>%
	group_by(sex) %>%
	summarize(mean_weight = mean(weight, na.rm = TRUE))

	# group by multiple columns
	surveys %>%
	group_by(sex, species_id) %>%
	summarize(mean_weight = mean(weight, na.rm = TRUE)) %>%
	tail()

	# combine with filter
	surveys %>%
	filter(!is.na(weight)) %>%
	group_by(sex, species_id) %>%
	summarize(mean_weight = mean(weight))

	# print(n = )
	surveys %>%
	filter(!is.na(weight)) %>%
	group_by(sex, species_id) %>%
	summarize(mean_weight = mean(weight)) %>%
	print(n = 40)

	# summarize multiple variables
	surveys %>%
	filter(!is.na(weight)) %>%
	group_by(sex, species_id) %>%
	summarize(mean_weight = mean(weight),
	min_weight = min(weight))

	# rearrange (sort) the result
	surveys %>%
	filter(!is.na(weight)) %>%
	group_by(sex, species_id) %>%
	summarize(mean_weight = mean(weight),
	min_weight = min(weight)) %>%
	arrange(min_weight)

	# descending order
	surveys %>%
	filter(!is.na(weight)) %>%
	group_by(sex, species_id) %>%
	summarize(mean_weight = mean(weight),
	min_weight = min(weight)) %>%
	arrange(desc(mean_weight))

	# Counting
	surveys %>%
	count(sex)

	# equivalent of
	surveys %>%
	group_by(sex) %>%
	summarise(count = n())

	# with sort
	surveys %>%
	count(sex, sort = TRUE)

	# count combination of factors
	surveys %>%
	count(sex, species)

	# sort
	surveys %>%
	count(sex, species) %>%
	arrange(species, desc(n))

	#' Challenge
	#'
	#' 1. How many animals were caught in each plot_type surveyed?
	#'
	surveys %>%
	count(plot_type)

	#' 2. Use group_by() and summarize() to find the mean, min, and max hindfoot
	#' length for each species (using species_id). Also add the number of
	#' observations (hint: see ?n).
	#'
	surveys %>%
	filter(!is.na(hindfoot_length)) %>%
	group_by(species_id) %>%
	summarise(
	mean_hindfoot_length = mean(hindfoot_length),
	min_hindfoot_length = min(hindfoot_length),
	max_hindfoot_length = max(hindfoot_length),
	n = n()
	) %>%
	View()
	#'
	#' 3. What was the heaviest animal measured in each year? Return the columns
	#' year, genus, species_id, and weight.
	surveys %>%
	filter(!is.na(weight)) %>%
	group_by(year) %>%
	filter(weight == max(weight)) %>%
	select(year, genus, species_id, weight) %>%
	arrange(year)

	#' Reshaping with pivot_wider and pivot_longer
	#' tidy dataset:
	#' - Each variable has its own column
	#' - Each observation has its own row
	#' - Each value must have its own cell
	#' - Each type of observational unit forms a table

	# pivot_wider() takes three principal arguments:

	# the data
	# the names_from column variable whose values will become new column names.
	# the values_from column variable whose values will fill the new column variables.

	surveys_gw <- surveys %>%
	filter(!is.na(weight)) %>%
	group_by(plot_id, genus) %>%
	summarize(mean_weight = mean(weight))

	str(surveys_gw)
	surveys_gw

	# and now pivot_wider
	surveys_wide <- surveys_gw %>%
	pivot_wider(names_from = genus, values_from = mean_weight)

	str(surveys_wide)

	surveys_wide

	# fill in the missing values
	surveys_gw %>%
	pivot_wider(names_from = genus, values_from = mean_weight, values_fill = 0) \|>
	head()

	#' pivot_longer
	#' pivot_longer() takes four principal arguments:
	#' - the data
	#' - the key column variable we wish to create from column names.
	#' - the values column variable we wish to create and fill with values
	#' associated with the key.
	#' - the names of the columns we use to fill the key variable (or to drop).

	surveys_long <- surveys_wide %>%
	pivot_longer(names_to = "genus", values_to = "mean_weight", cols = -plot_id)

	str(surveys_long)

	# specify columns
	surveys_long <- surveys_wide %>%
	pivot_longer(names_to = "genus", values_to = "mean_weight", cols = -plot_id)

	str(surveys_long)

	#' Challenge
	#'
	#' 1. Reshape the surveys data frame with year as columns,
	#' plot_id as rows, and the number of genera per plot as the values.
	#' You will need to summarize before reshaping, and use the
	#' function n_distinct() to get the number of unique genera
	#' within a particular chunk of data.
	#' It’s a powerful function! See ?n_distinct for more.
	surveys_wide <- surveys %>%
	group_by(plot_id, year) %>%
	summarise(n = n_distinct(genus)) %>%
	pivot_wider(names_from = year, values_from = n)

	surveys_wide
	#' 2. Now take that data frame and pivot_longer it again, so each row is a unique
	#' plot_id by year combination.
	surveys_wide %>%
	pivot_longer(names_to = "year", values_to = "n", cols = -plot_id)

	#' 3. The surveys data set has two measurement columns:
	#' hindfoot_length and weight.
	#' This makes it difficult to do things like look at the
	#' relationship between mean values of each measurement per year
	#' in different plot types.
	#' Let’s walk through a common solution for this type of problem.
	#' First, use pivot_longer() to create a dataset where we
	#' have a names column called measurement and a value column
	#' that takes on the value of either hindfoot_length or weight.
	#' Hint: You’ll need to specify which columns will be part of the
	#' reshape.
	#'
	surveys_long <- surveys %>%
	pivot_longer(names_to = "measurement", values_to = "value",
	cols = c(hindfoot_length, weight))

	#' 4. With this new data set, calculate the average of each measurement
	#' in each year for each different plot_type.
	#' Then pivot_wider() them into a data set with a column for
	#' hindfoot_length and weight.
	#' Hint: You only need to specify the names and values columns
	#' for pivot_wider().
	surveys_long %>%
	group_by(year, measurement, plot_type) %>%
	summarize(mean_value = mean(value, na.rm = TRUE)) %>%
	pivot_wider(names_from = measurement, values_from = mean_value, names_prefix = "mean_")






	#' Exporting data
	#' write_csv()

	surveys_complete <- surveys %>%
	filter(!is.na(weight), # remove missing weight
	!is.na(hindfoot_length), # remove missing hindfoot_length
	!is.na(sex)) # remove missing sex

	surveys_complete

	## Extract the most common species_id
	species_counts <- surveys_complete %>%
	count(species_id) %>%
	filter(n >= 50)

	species_counts

	## Only keep the most common species
	surveys_complete <- surveys_complete %>%
	filter(species_id %in% species_counts$species_id)

	surveys_complete

	# save data
	write_csv(surveys_complete, file = "data/surveys_complete.csv")