Created
November 30, 2023 11:52
-
-
Save njahn82/c264faf8a179e6f03cb83da4244af313 to your computer and use it in GitHub Desktop.
03-manipulating-data.R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Manipulating, analyzing and exporting data with tidyverse | |
#' https://datacarpentry.org/R-ecology-lesson/03-dplyr.html | |
library(tidyverse) | |
#' dplyr | |
#' - the most common data manipulation tasks | |
#' - https://github.com/rstudio/cheatsheets/raw/master/data-transformation.pdf | |
#' tidyr | |
#' - reshape your data | |
#' - https://github.com/rstudio/cheatsheets/raw/master/data-import.pdf | |
surveys <- read_csv("data_raw/portal_data_joined.csv") | |
## inspect the data | |
str(surveys) | |
## preview the data | |
view(surveys) | |
#' dplyr functions: | |
#' - select(): subset columns | |
#' - filter(): subset rows on conditions | |
#' - mutate(): create new columns by using information from other columns | |
#' - group_by() and summarize(): create summary statistics on grouped data | |
#' - arrange(): sort results | |
#' - count(): count discrete values | |
#' Selecting columns and filtering rows | |
#' select() | |
#' first argument: the data frame (surveys), | |
#' the subsequent arguments: the columns to keep | |
select(surveys, plot_id, species_id, weight) | |
# select all columns except certain ones: "-" | |
select(surveys, -record_id, -species_id) | |
# choose rows based on a specific criterion, use filter(): | |
filter(surveys, year == 1995) | |
filter(surveys, year %in% 1995:2000) | |
# exclude | |
filter(surveys, year != 1995) | |
filter(surveys, !year %in% 1995:2000) | |
filter(surveys, !(year %in% 1995:2000)) | |
#' Pipes | |
#' temporary data frames | |
surveys2 <- filter(surveys, weight < 5) | |
surveys_sml <- select(surveys2, species_id, sex, weight) | |
# nesting | |
surveys_sml <- | |
select(filter(surveys, weight < 5), | |
species_id, sex, weight) | |
# Pipe! | |
# %>% via the magrittr package | |
# Ctrl + Shift + M | |
# |> native pipe, i.e. you d need to laod the magrittr | |
surveys %>% # then | |
filter(weight < 5) %>% | |
select(species_id, sex, weight) | |
# save to new object | |
surveys_sml <- surveys %>% | |
filter(weight < 5) %>% | |
select(species_id, sex, weight) | |
surveys_sml | |
#' Challenge | |
#' 1. Using pipes, subset the surveys data to include animals collected | |
#' before 1995 and retain only the columns year, sex, and weight. | |
surveys %>% | |
filter(year < 1995) %>% | |
select(year, sex, weight) | |
#' Mutate | |
#' create new columns | |
surveys %>% | |
mutate(weight_kg = weight / 1000) | |
# reuse | |
surveys %>% | |
mutate(weight_kg = weight / 1000, | |
weight_lb = weight_kg * 2.2) | |
# add more pipes | |
surveys %>% | |
mutate(weight_kg = weight / 1000) %>% | |
head() | |
# even more | |
surveys %>% | |
filter(!is.na(weight)) %>% | |
mutate(weight_kg = weight / 1000) %>% | |
head() | |
#' Challenge | |
#' 1. Create a new data frame from the surveys data that meets the following | |
#' criteria: contains only the species_id column and a new column called | |
#' hindfoot_cm containing the hindfoot_length values converted to centimeters. | |
#' In this hindfoot_cm column, there are no NAs and all values are less than 3. | |
#' Hint: think about how the commands should be ordered to produce this | |
#' data frame! | |
surveys_hindfoot_cm <- surveys %>% | |
filter(!is.na(hindfoot_length)) %>% | |
mutate(hindfoot_cm = hindfoot_length / 10) %>% | |
filter(hindfoot_cm < 3) |> | |
select(species_id, hindfoot_cm) | |
surveys_hindfoot_cm | |
#' Split-apply-combine data analysis and the summarize() function | |
#' 1. split the data into groups, | |
#' 2. apply some analysis to each group, | |
#' 3. combine the results | |
surveys %>% | |
group_by(sex) %>% | |
summarize(mean_weight = mean(weight, na.rm = TRUE)) | |
# group by multiple columns | |
surveys %>% | |
group_by(sex, species_id) %>% | |
summarize(mean_weight = mean(weight, na.rm = TRUE)) %>% | |
tail() | |
# combine with filter | |
surveys %>% | |
filter(!is.na(weight)) %>% | |
group_by(sex, species_id) %>% | |
summarize(mean_weight = mean(weight)) | |
# print(n = ) | |
surveys %>% | |
filter(!is.na(weight)) %>% | |
group_by(sex, species_id) %>% | |
summarize(mean_weight = mean(weight)) %>% | |
print(n = 40) | |
# summarize multiple variables | |
surveys %>% | |
filter(!is.na(weight)) %>% | |
group_by(sex, species_id) %>% | |
summarize(mean_weight = mean(weight), | |
min_weight = min(weight)) | |
# rearrange (sort) the result | |
surveys %>% | |
filter(!is.na(weight)) %>% | |
group_by(sex, species_id) %>% | |
summarize(mean_weight = mean(weight), | |
min_weight = min(weight)) %>% | |
arrange(min_weight) | |
# descending order | |
surveys %>% | |
filter(!is.na(weight)) %>% | |
group_by(sex, species_id) %>% | |
summarize(mean_weight = mean(weight), | |
min_weight = min(weight)) %>% | |
arrange(desc(mean_weight)) | |
# Counting | |
surveys %>% | |
count(sex) | |
# equivalent of | |
surveys %>% | |
group_by(sex) %>% | |
summarise(count = n()) | |
# with sort | |
surveys %>% | |
count(sex, sort = TRUE) | |
# count combination of factors | |
surveys %>% | |
count(sex, species) | |
# sort | |
surveys %>% | |
count(sex, species) %>% | |
arrange(species, desc(n)) | |
#' Challenge | |
#' | |
#' 1. How many animals were caught in each plot_type surveyed? | |
#' | |
surveys %>% | |
count(plot_type) | |
#' 2. Use group_by() and summarize() to find the mean, min, and max hindfoot | |
#' length for each species (using species_id). Also add the number of | |
#' observations (hint: see ?n). | |
#' | |
surveys %>% | |
filter(!is.na(hindfoot_length)) %>% | |
group_by(species_id) %>% | |
summarise( | |
mean_hindfoot_length = mean(hindfoot_length), | |
min_hindfoot_length = min(hindfoot_length), | |
max_hindfoot_length = max(hindfoot_length), | |
n = n() | |
) %>% | |
View() | |
#' | |
#' 3. What was the heaviest animal measured in each year? Return the columns | |
#' year, genus, species_id, and weight. | |
surveys %>% | |
filter(!is.na(weight)) %>% | |
group_by(year) %>% | |
filter(weight == max(weight)) %>% | |
select(year, genus, species_id, weight) %>% | |
arrange(year) | |
#' Reshaping with pivot_wider and pivot_longer | |
#' tidy dataset: | |
#' - Each variable has its own column | |
#' - Each observation has its own row | |
#' - Each value must have its own cell | |
#' - Each type of observational unit forms a table | |
# pivot_wider() takes three principal arguments: | |
# the data | |
# the names_from column variable whose values will become new column names. | |
# the values_from column variable whose values will fill the new column variables. | |
surveys_gw <- surveys %>% | |
filter(!is.na(weight)) %>% | |
group_by(plot_id, genus) %>% | |
summarize(mean_weight = mean(weight)) | |
str(surveys_gw) | |
surveys_gw | |
# and now pivot_wider | |
surveys_wide <- surveys_gw %>% | |
pivot_wider(names_from = genus, values_from = mean_weight) | |
str(surveys_wide) | |
surveys_wide | |
# fill in the missing values | |
surveys_gw %>% | |
pivot_wider(names_from = genus, values_from = mean_weight, values_fill = 0) |> | |
head() | |
#' pivot_longer | |
#' pivot_longer() takes four principal arguments: | |
#' - the data | |
#' - the key column variable we wish to create from column names. | |
#' - the values column variable we wish to create and fill with values | |
#' associated with the key. | |
#' - the names of the columns we use to fill the key variable (or to drop). | |
surveys_long <- surveys_wide %>% | |
pivot_longer(names_to = "genus", values_to = "mean_weight", cols = -plot_id) | |
str(surveys_long) | |
# specify columns | |
surveys_long <- surveys_wide %>% | |
pivot_longer(names_to = "genus", values_to = "mean_weight", cols = -plot_id) | |
str(surveys_long) | |
#' Challenge | |
#' | |
#' 1. Reshape the surveys data frame with year as columns, | |
#' plot_id as rows, and the number of genera per plot as the values. | |
#' You will need to summarize before reshaping, and use the | |
#' function n_distinct() to get the number of unique genera | |
#' within a particular chunk of data. | |
#' It’s a powerful function! See ?n_distinct for more. | |
surveys_wide <- surveys %>% | |
group_by(plot_id, year) %>% | |
summarise(n = n_distinct(genus)) %>% | |
pivot_wider(names_from = year, values_from = n) | |
surveys_wide | |
#' 2. Now take that data frame and pivot_longer it again, so each row is a unique | |
#' plot_id by year combination. | |
surveys_wide %>% | |
pivot_longer(names_to = "year", values_to = "n", cols = -plot_id) | |
#' 3. The surveys data set has two measurement columns: | |
#' hindfoot_length and weight. | |
#' This makes it difficult to do things like look at the | |
#' relationship between mean values of each measurement per year | |
#' in different plot types. | |
#' Let’s walk through a common solution for this type of problem. | |
#' First, use pivot_longer() to create a dataset where we | |
#' have a names column called measurement and a value column | |
#' that takes on the value of either hindfoot_length or weight. | |
#' Hint: You’ll need to specify which columns will be part of the | |
#' reshape. | |
#' | |
surveys_long <- surveys %>% | |
pivot_longer(names_to = "measurement", values_to = "value", | |
cols = c(hindfoot_length, weight)) | |
#' 4. With this new data set, calculate the average of each measurement | |
#' in each year for each different plot_type. | |
#' Then pivot_wider() them into a data set with a column for | |
#' hindfoot_length and weight. | |
#' Hint: You only need to specify the names and values columns | |
#' for pivot_wider(). | |
surveys_long %>% | |
group_by(year, measurement, plot_type) %>% | |
summarize(mean_value = mean(value, na.rm = TRUE)) %>% | |
pivot_wider(names_from = measurement, values_from = mean_value, names_prefix = "mean_") | |
#' Exporting data | |
#' write_csv() | |
surveys_complete <- surveys %>% | |
filter(!is.na(weight), # remove missing weight | |
!is.na(hindfoot_length), # remove missing hindfoot_length | |
!is.na(sex)) # remove missing sex | |
surveys_complete | |
## Extract the most common species_id | |
species_counts <- surveys_complete %>% | |
count(species_id) %>% | |
filter(n >= 50) | |
species_counts | |
## Only keep the most common species | |
surveys_complete <- surveys_complete %>% | |
filter(species_id %in% species_counts$species_id) | |
surveys_complete | |
# save data | |
write_csv(surveys_complete, file = "data/surveys_complete.csv") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment