MarinaGolivets/GP_Oekologie_2022.R

## GP_Oekologie_2022.R
# GP Oekologie 2022
# Ingolf Kühn & Marina Golivets


# install & load R packages ------------------------------------------------------------------------

rm(list = ls())

# packages that we need for analyses
pkgs <- c(
  "here", # for handling working directory
  "googlesheets4", # for loading data from Google spreadsheets
  "tidyverse", # for data manipulation
  "magrittr", # for piping
  "skimr", # for summarizing data
  "gtsummary", # for summarizing data
  "taxize", # for checking taxonomic names
  "TR8", # for retrieving trait data from databases
  "performance", # for inspecting regression models,
  "rstatix", # for statistical analyses, e.g. ANOVA
  "DescTools", # for miscellaneous basic stats
  "ggpubr", # for plotting
  "ggrepel", # for plotting
  "viridis", # for plotting
  "cowplot", # for plotting
  "vegan", # for multivariate analyses
  "FactoMineR", # for PCA
  "ks" # for kernel smoothers
)

# install (if not installed) and load all the packages

# # run only once!
# install.packages("devtools")
# devtools::install_github("ddsjoberg/gtsummary")
# devtools::install_github("ropensci/skimr")

for (p in pkgs) {
  if (!require(p, character.only = TRUE)) install.packages(p)
  library(p, character.only = TRUE)
}

# choose an option to not treat warnings as errors
options(warn = 1)

# check what packages are being used on your machine
sessionInfo()

# check your working (home) directory
here()

# load data ----------------------------------------------------------------------------------------

if (length(list.files(here(), "veg_data_2022.csv")) == 0) {
  # load the data directly from the Google sheet
  veg <- googlesheets4::read_sheet(
    "https://docs.google.com/spreadsheets/d/1X9DM-uaaQ9MP5KnSNUTonx67BygXYH_7xd1G_vXOIqE",
    sheet = "Daten", range = "A1:S142", na = c(NULL, "-", ""), col_types = "iicccidddddddcdccdc"
  )
  # save the data to your home directory
  readr::write_csv2(veg, here("veg_data_2022.csv"))
} else {
  # load the data from your home directory
  veg <- readr::read_csv2(here("veg_data_2022.csv"))
}

# take a quick look at the data
glimpse(veg)

# rename columns -----------------------------------------------------------------------------------

veg %<>%
  rename(
    Fortlaufendenr = `...1`,
    Aufnahmenr = Aufnahmenr.,
    WissName = `Wissenschaftlicher Name`,
    Hoehe.avg = `Höhe Ø (cm)`,
    Hoehe.var = `Höhe σ²`,
    Blattflaeche = `Blatt-Fläche (mm²)`,
    Frischmasse = `Frischmasse (g)`,
    Trockenmasse = `Trockenmasse (mg)`,
    Germinulenmasse = `Germinulenmasse (mg)`,
    Feuchte = `Ellenberg-Feuchte`,
    Bodenfeuchte = `Bodenfeuchte (%vol)`
  )

# take a more detailed look at the data
skimr::skim(veg)
select(veg, -Fortlaufendenr, -Aufnahmenr, -WissName, -Anmerkungen) %>%
  gtsummary::tbl_summary()

# check data for errors ----------------------------------------------------------------------------

# reassign expert water availability measures
veg %<>% rows_update(tibble(Aufnahmenr = 1:8, WasserHH = c(1, 2, 3, 5, 6, 7, 8, 4)))

# make sure plot-level data don't have duplicates
select(veg, Aufnahmenr, Ort, Vegetation, WasserHH, Bodenfeuchte) %>%
  n_distinct() # should equal the number of plots (n = 8)

# make sure species are not duplicated within plots
nrow(veg) # 141
select(veg, Aufnahmenr, WissName) %>%
  n_distinct() # should equal the number of rows (n = 141)

# make sure species-level data (from databases) are unique
select(veg, WissName) %>%
  n_distinct() # 79
select(veg, WissName, Lebensform, Germinulenmasse, Bestaeubung, Feuchte) %>%
  n_distinct() # should equal the number of species (n = 79)

# check life form
table(veg$Lebensform)
filter(veg, Lebensform == "G") %>%
  pull(WissName) %>%
  unique()
veg %<>% mutate(Lebensform = case_when(
  WissName == "Phalaris arundinacea" ~ "mL",
  TRUE ~ Lebensform
))

# check % cover per species
hist(veg$Deckung)

# check total % cover per plot
group_by(veg, Aufnahmenr) %>%
  summarise(Deckung.tot = sum(Deckung))

# check vegetation types at each site
table(veg$Ort, veg$Vegetation)

# check if Ellenberg humidity indicator values are integers
unique(veg$Feuchte)
unique(as.integer(veg$Feuchte))
veg %<>%
  mutate(
    Feuchte = stringr::str_remove_all(Feuchte, pattern = "~|x|=") %>% # remove non-numeric symbols
      as.integer() # convert to integer
  )
veg[veg$Fortlaufendenr == 41, "Feuchte"] <- NA

# check taxon names
tax <- taxize::gnr_resolve(veg$WissName, data_source_id = 1, fields = "all", best_match_only = TRUE)
View(tax)
veg %<>%
  mutate(
    WissName = recode(
      WissName,
      `Taraxacum officinalis` = "Taraxacum officinale",
      `Viola arvense` = "Viola arvensis"
    )
  )
veg[veg$Fortlaufendenr == 41, "WissName"] <- "Agrostis capillaris"

# check leaf area
summary(veg$Blattflaeche)
veg %<>% mutate(Blattflaeche = na_if(Blattflaeche, 0))

# check seed mass
summary(veg$Germinulenmasse)
veg %>%
  filter(Germinulenmasse == .001) %>%
  pull(WissName) %>%
  unique()
View(available_tr8)
gmasse_tr8 <- TR8::tr8(
  species_list = c("Agrostis stolonifera", "Cerastium semidecandrum"),
  download_list = c("seed_wght", "SeedMass", "seed_mass"),
  allow_persistent = TRUE
)
gmasse_tr8
veg %<>%
  mutate(Germinulenmasse = case_when(
    WissName == "Cerastium semidecandrum" ~ .04,
    WissName == "Agrostis stolonifera" ~ .07,
    Fortlaufendenr == 41 ~ .1,
    TRUE ~ Germinulenmasse
  ))

# take a look at the data again
skim(veg)

# impute missing trait data ------------------------------------------------------------------------

# life form (data from FloraWeb)
select(veg, WissName, Lebensform) %>%
  distinct() %>%
  filter(is.na(Lebensform))
veg %<>%
  mutate(Lebensform = case_when(
    WissName %in% c("Cerastium glomeratum", "Erophila verna") ~ "mL",
    TRUE ~ Lebensform
  ))

# pollination type (data from FloraWeb)
select(veg, WissName, Bestaeubung) %>%
  distinct() %>%
  filter(is.na(Bestaeubung))
veg %<>%
  mutate(Bestaeubung = case_when(
    WissName %in% c(
      "Cerastium glomeratum", "Erophila verna", "Valerianella spec."
    ) ~ "se",
    TRUE ~ Bestaeubung
  ))

# seed mass (data from Seed Information Database)
select(veg, WissName, Germinulenmasse) %>%
  distinct() %>%
  filter(is.na(Germinulenmasse))
veg %<>%
  mutate(Germinulenmasse = case_when(
    WissName == "Allium vineale" ~ 7,
    WissName == "Carex ovalis" ~ .52,
    WissName == "Cerastium glomeratum" ~ .05,
    WissName == "Cerastium glutinosum" ~ .0654,
    WissName == "Cruciata laevipes" ~ 3.5856,
    WissName == "Erodium cicutarium" ~ 2,
    WissName == "Erophila verna" ~ .024,
    WissName == "Senecio erucifolius" ~ .3935,
    WissName == "Saxifraga granulata" ~ .03,
    WissName == "Prunus mahaleb" ~ 79.9,
    WissName == "Polygonum persicaria" ~ 2.1,
    TRUE ~ Germinulenmasse
  ))
summary(veg$Germinulenmasse)

# convert life form and pollination to factor
veg %<>% mutate(across(c(Lebensform, Bestaeubung), ~ as.factor(.)))

# calculate SLA & LDMC -----------------------------------------------------------------------------

veg %<>%
  mutate(
    SLA = Blattflaeche / Trockenmasse,
    LDMC = Trockenmasse / Frischmasse
  )

# check SLA
hist(veg$SLA)

# check LDMC
hist(veg$LDMC)
filter(veg, LDMC > 500) %>%
  select(Fortlaufendenr, WissName, Deckung, Blattflaeche, Frischmasse, Trockenmasse, LDMC)

# make corrections in the data
veg %<>%
  mutate(
    Frischmasse = if_else(Fortlaufendenr == 127, .2063, Frischmasse),
    Trockenmasse = if_else(Fortlaufendenr == 79, 85, Trockenmasse) # it's a guess!
  )

# recalculate SLA & LDMC
veg %<>%
  mutate(
    SLA = Blattflaeche / Trockenmasse,
    LDMC = Trockenmasse / Frischmasse
  )

# analyse species numbers --------------------------------------------------------------------------

# check the number of species per plot
artz1 <- table(veg$Aufnahmenr)
artz1

# check the number of species at each water availability level
artz2 <- table(veg$WasserHH)
artz2
# test whether the observed species numbers differ from the expected
chisq.test(artz2)
chisq.test(artz2)$expected
barplot(artz2,
  xlab = "geschätzte Wasserverfügbarkeit (ordinal)",
  ylab = "Artenzahl"
)

# analyse water availability -----------------------------------------------------------------------

# check the correspondence between the Ellenberg values and estimated water availability
boxplot(veg$Feuchte ~ veg$WasserHH)

# calculate the mean plot-level Ellenberg humidity indicator value
WasserHH.Feuchte <- group_by(veg, Aufnahmenr, WasserHH, Bodenfeuchte) %>%
  summarise(
    # community mean
    Feuchte_CM = mean(Feuchte, na.rm = TRUE) %>%
      round(., 1),
    # community weighted mean
    Feuchte_CWM = weighted.mean(x = Feuchte, w = Deckung, na.rm = TRUE) %>%
      round(., 1)
  )

# plot the plot-level Ellenberg humidity
op <- par(mfcol = c(1, 2), mar = c(4, 4, 2, 2))
plot(Feuchte_CWM ~ WasserHH,
  data = WasserHH.Feuchte, col = "red",
  pch = 20, ylab = "mittlere Feuchte nach Ellenberg"
)
points(WasserHH.Feuchte$WasserHH, WasserHH.Feuchte$Feuchte_CM, col = "green", pch = 20)
legend(2, 7, c("CWM", "CM"), col = c("red", "green"), pch = 20)

plot(Feuchte_CWM ~ Bodenfeuchte, data = WasserHH.Feuchte, col = "red", pch = 20, ylab = "")
points(WasserHH.Feuchte$Bodenfeuchte, WasserHH.Feuchte$Feuchte_CM, col = "green", pch = 20)
legend(2, 7, c("CWM", "CM"), col = c("red", "green"), pch = 20)
par(op)

# calculate correlation between the water availability and plot-level Ellenberg humidity
cor.test(WasserHH.Feuchte$WasserHH, WasserHH.Feuchte$Feuchte_CM, method = "kendall", exact = FALSE)
cor.test(WasserHH.Feuchte$WasserHH, WasserHH.Feuchte$Feuchte_CWM, method = "kendall", exact = FALSE)
cor.test(WasserHH.Feuchte$Bodenfeuchte, WasserHH.Feuchte$Feuchte_CWM,
  method = "kendall", exact = FALSE
)

# add the calculated mean Ellenberg humidity to the main data table
veg %<>% left_join(WasserHH.Feuchte)
glimpse(veg)

# analyse LDMC -------------------------------------------------------------------------------------

# LDMC = leaf dry matter content

# inspect the distribution of LDMC visually
hist(veg$LDMC)

# plot LDMC across water availability levels
boxplot(
  LDMC ~ WasserHH,
  dat = veg, ylab = "LDMC", xlab = "geschaetzte Wasserverfuegbarkeit (ordinal)",
  col = rgb(r = 0, g = 0, b = 1, alpha = seq(.1, .9, length.out = 7)), log = "y"
)
boxplot(LDMC ~ Feuchte_CWM,
  dat = veg, ylab = "LDMC", xlab = "mittlere Feuchte nach Ellenberg",
  x.axis = 1:8, col = rgb(r = 0, g = 0, b = 1, alpha = seq(.1, .9, length.out = 7)), log = "y"
)
ggplot(veg, aes(x = Bodenfeuchte, y = LDMC)) +
  geom_point(na.rm = TRUE) +
  stat_smooth(method = "lm", col = "#E69F00", se = FALSE, na.rm = TRUE) +
  labs(x = "Bodenfeuchte [%]") +
  theme_minimal()

# perform regression analyses
# using WasserHH as explanatory variable
fm0.LDMC <- lm(LDMC ~ WasserHH, data = veg)
summary(fm0.LDMC)
performance::check_model(fm0.LDMC)

fm1.LDMC <- lm(LDMC ~ WasserHH, weight = Deckung, data = veg) # weighted by % cover
summary(fm1.LDMC)
check_model(fm1.LDMC)

# compare the two models
tbl_merge(
  tbls = list(tbl_regression(fm0.LDMC), tbl_regression(fm1.LDMC)),
  tab_spanner = c("**Non-weighted**", "**Weigted**")
)

# using soil moisture (Bodenfeuchte) as explanatory variable
fm0.LDMC.b <- lm(LDMC ~ Bodenfeuchte, data = veg)
summary(fm0.LDMC.b)
check_model(fm0.LDMC.b)

fm1.LDMC.b <- lm(LDMC ~ Bodenfeuchte, weight = Deckung, data = veg)
summary(fm1.LDMC.b)
check_model(fm1.LDMC.b)

# compare the two models
tbl_merge(
  tbls = list(tbl_regression(fm0.LDMC.b), tbl_regression(fm1.LDMC.b)),
  tab_spanner = c("**Non-weighted**", "**Weigted**")
)

plot1 <- ggplot(veg, aes(x = Bodenfeuchte, y = LDMC)) +
  geom_point(na.rm = TRUE) +
  geom_abline(
    intercept = fm0.LDMC.b$coefficients[1], slope = fm0.LDMC.b$coefficients[2],
    col = "#E69F00", size = 1, na.rm = TRUE
  ) +
  geom_abline(
    intercept = fm1.LDMC.b$coefficients[1], slope = fm1.LDMC.b$coefficients[2],
    col = "#56B4E9", size = 1, na.rm = TRUE
  ) +
  labs(x = "Bodenfeuchte [%]") +
  theme_minimal() +
  annotate("text", x = 20, y = 270, label = "Non-weighted", col = "#E69F00") +
  annotate("text", x = 25, y = 320, label = "Weighted", col = "#56B4E9")
plot1

# analyse SLA --------------------------------------------------------------------------------------

# SLA = specific leaf area

# inspect the distribution of SLA visually
hist(veg$SLA)
hist(log(veg$SLA))

# test the normality of distribution
shapiro_test(veg$SLA)
shapiro_test(log(veg$SLA))

# add log-transformed values to the main data table
veg %<>% mutate(SLA.log = log(SLA))

# plot the distribution of SLA across water availability levels
op <- par(mar = c(4, 5, 2, 3))
boxplot(SLA ~ WasserHH,
  dat = veg, xlab = "geschätzte Wasserverfügbarkeit (ordinal)",
  ylab = "SLA", col = "blue", log = "y"
)
filter(veg, WasserHH == 2 & SLA > 25) %>%
  select(WissName, Aufnahmenr, SLA)
filter(veg, WissName == "Euphorbia cyparissias") %>%
  select(WissName, Aufnahmenr, SLA)

boxplot(SLA.log ~ WasserHH,
  dat = veg, xlab = "geschätzte Wasserverfügbarkeit (ordinal)",
  ylab = expression(log("SLA")), col = "blue"
)
par(op)
filter(veg, WasserHH == 7 & SLA.log < 3) %>%
  select(WissName, Aufnahmenr, SLA)

# perform regression analyses
fm1.SLA <- lm(SLA ~ WasserHH, weight = Deckung, data = veg)
summary(fm1.SLA)
check_model(fm1.SLA)

fm1.SLA.log <- lm(SLA.log ~ WasserHH, weight = Deckung, data = veg)
summary(fm1.SLA.log)
check_model(fm1.SLA.log)

fm1.SLA.log.b <- lm(SLA.log ~ Bodenfeuchte, weight = Deckung, data = veg)
summary(fm1.SLA.log.b)
check_model(fm1.SLA.log.b)

# plot the regression line
plot2 <- ggplot(veg, aes(x = Bodenfeuchte, y = SLA.log)) +
  geom_point(na.rm = TRUE) +
  geom_abline(
    intercept = fm1.SLA.log.b$coefficients[1], slope = fm1.SLA.log.b$coefficients[2],
    col = "#56B4E9", size = 1, na.rm = TRUE
  ) +
  labs(x = "Bodenfeuchte [%]", y = expression(log("SLA"))) +
  theme_minimal()
plot2

# look at the relationship between SLA and LDMC
plot(SLA ~ LDMC, data = veg)
cor.test(veg$SLA, veg$LDMC, use = "complete.obs")

# analyse height -----------------------------------------------------------------------------------

# inspect distributions visually
hist(veg$Hoehe.avg)
hist(log(veg$Hoehe.avg))

# test the normality of distribution
shapiro_test(veg$Hoehe.avg)
shapiro_test(log(veg$Hoehe.avg))

# add log-transformed values to the main data table
veg %<>% mutate(Hoehe.avg.log = log(Hoehe.avg))

# plot the distribution of height across water availability levels
op <- par(mar = c(4, 5, 2, 3))
boxplot(Hoehe.avg ~ WasserHH,
  data = veg, xlab = "geschätzte Wasserverfügbarkeit (ordinal)",
  ylab = "Wuchshöhe [cm]", col = rainbow(7, start = 1, end = 5 / 7),
  log = "y"
)

filter(veg, WasserHH == 6 & Hoehe.avg > 50) %>%
  select(WissName, Aufnahmenr, Hoehe.avg)
filter(veg, WasserHH == 8 & Hoehe.avg < 20) %>%
  select(WissName, Aufnahmenr, Hoehe.avg)

boxplot(Hoehe.avg.log ~ WasserHH,
  data = veg, xlab = "geschätzte Wasserverfügbarkeit (ordinal)",
  ylab = expression(log("Wuchshöhe [cm]")), col = rainbow(7, start = 1, end = 5 / 7)
)
par(op)

# perform regression analysis
fm1.hoehe.avg.log <- lm(Hoehe.avg.log ~ WasserHH, weight = Deckung, data = veg)
summary(fm1.hoehe.avg.log)
check_model(fm1.hoehe.avg.log)

fm1.hoehe.avg.log.b <- lm(Hoehe.avg.log ~ Bodenfeuchte, weight = Deckung, data = veg)
summary(fm1.hoehe.avg.log.b)
check_model(fm1.hoehe.avg.log.b)

# plot the regression line
plot3 <- ggplot(veg, aes(x = Bodenfeuchte, y = Hoehe.avg.log)) +
  geom_point(na.rm = TRUE) +
  geom_abline(
    intercept = fm1.hoehe.avg.log.b$coefficients[1],
    slope = fm1.hoehe.avg.log.b$coefficients[2],
    col = "#56B4E9", size = 1, na.rm = TRUE
  ) +
  geom_smooth(col = "red", se = FALSE, na.rm = TRUE) +
  labs(x = "Bodenfeuchte [%]", y = expression(log("Wuchshöhe [cm]"))) +
  theme_minimal()
plot3

# analyse height variance --------------------------------------------------------------------------

# plot height variance
hist(veg$Hoehe.var)
hist(log(veg$Hoehe.var))

# add log-transformed values to the main data table
veg %<>% mutate(Hoehe.var.log = log(Hoehe.var))

# check the relationship between height and height variance
boxplot(Hoehe.var.log ~ WasserHH, data = veg)
plot(Hoehe.var.log ~ Hoehe.avg, data = veg)
cor.test(veg$Hoehe.var.log, veg$Hoehe.avg)

# calculate the coefficient of variation
veg %<>% mutate(Hoehe.cv = sqrt(Hoehe.var) / Hoehe.avg * 100)

hist(veg$Hoehe.cv)
hist(log(veg$Hoehe.cv))

veg %<>% mutate(Hoehe.cv.log = log(Hoehe.cv))

plot(Hoehe.cv.log ~ Hoehe.avg, data = veg)
cor.test(veg$Hoehe.cv.log, veg$Hoehe.avg)

boxplot(Hoehe.cv ~ WasserHH,
  data = veg, xlab = "geschätzte Wasserverfügbarkeit (ordinal)",
  ylab = "Variationskoeffizient der Wuchshöhe", log = "y", col = topo.colors(7)[7:1]
)
filter(veg, WasserHH == 4 & Hoehe.cv > 100) %>%
  select(WissName, Aufnahmenr, Hoehe.avg, Hoehe.var, Hoehe.cv)

# perform regression analysis
fm1.Hoehe.cv.log <- lm(Hoehe.cv.log ~ WasserHH, data = veg, weight = Deckung)
summary(fm1.Hoehe.cv.log)
check_model(fm1.Hoehe.cv.log)

fm1.Hoehe.cv.log.b <- lm(Hoehe.cv.log ~ Bodenfeuchte, data = veg, weight = Deckung)
summary(fm1.Hoehe.cv.log.b)
check_model(fm1.Hoehe.cv.log.b)

# analyse seed mass --------------------------------------------------------------------------------

hist(veg$Germinulenmasse)
hist(log(veg$Germinulenmasse))

veg %<>% mutate(Germinulenmasse.log = log(Germinulenmasse))

op <- par(mar = c(4, 5, 2, 3))
boxplot(Germinulenmasse ~ WasserHH, data = veg, log = "y")
boxplot(Germinulenmasse.log ~ WasserHH, data = veg, ylab = expression(log("Germinulenmasse")))
filter(veg, Germinulenmasse.log > 4) %>%
  select(WissName, Germinulenmasse)
par(op)

# perform regression analysis
fm1.gmasse.log <- lm(Germinulenmasse.log ~ WasserHH, data = veg, weight = Deckung)
summary(fm1.gmasse.log)
check_model(fm1.gmasse.log)

fm1.gmasse.log.b <- lm(Germinulenmasse.log ~ Bodenfeuchte, data = veg, weight = Deckung)
summary(fm1.gmasse.log.b)
check_model(fm1.gmasse.log.b)

# analyse life form --------------------------------------------------------------------------------

# compare the distributions of life forms across vegetation types
select(veg, Lebensform, Vegetation) %>%
  gtsummary::tbl_summary(by = Vegetation) %>%
  add_p()

# calculate total % cover per life form across humidity levels
lft.1 <- group_by(veg, Aufnahmenr, WasserHH, Lebensform) %>%
  summarise(Deckung = sum(Deckung))
head(lft.1)
tail(lft.1)

lft.2 <- xtabs(Deckung ~ Lebensform + WasserHH, data = lft.1)
lft.2 <- DescTools::as.matrix.xtabs(lft.3)
lft.2
chisq_test(lft.2, simulate.p.value = TRUE)

# plot the distribution of life forms across humidity levels
# using base R
op <- par(mar = c(4, 5, 2, 3))
barplot(lft.3,
  xlab = "geschätzte Wasserverfügbarkeit (ordinal)",
  ylab = expression(sum("Deckung")),
  main = "Lebensformenspektren", ylim = c(0, 160)
)
legend(8.5, 150, legend = row.names(lft.3), pch = 15, col = grey.colors(ncol(lft.3)))
par(op)

# using ggplot2
ggplot(data = lft.1, aes(fill = Lebensform, y = Deckung, x = ordered(WasserHH))) +
  geom_bar(position = "stack", stat = "identity") +
  viridis::scale_fill_viridis(discrete = TRUE) +
  labs(x = "geschätzte Wasserverfügbarkeit (ordinal)", y = expression(sum("Deckung"))) +
  ggtitle("Lebensformenspektren") +
  theme_minimal()

# analyse pollination type -------------------------------------------------------------------------

# compare the distributions of pollination syndromes across vegetation types
select(veg, Bestaeubung, Vegetation) %>%
  gtsummary::tbl_summary(by = Vegetation) %>%
  add_p()

# calculate total % cover per pollination type across humidity levels
bst.1 <- group_by(veg, Aufnahmenr, WasserHH, Bestaeubung) %>%
  summarise(Deckung = sum(Deckung))
head(bst.1)
tail(bst.1)

bst.2 <- xtabs(Deckung ~ Bestaeubung + WasserHH, data = bst.1)
bst.2 <- DescTools::as.matrix.xtabs(bst.2)
bst.2
chisq_test(bst.2, simulate.p.value = TRUE)

# plot the distribution of pollination types
# using base R
op <- par(mar = c(4, 5, 2, 3))
cols <- c("yellow", "brown", "grey", "lightblue")
barplot(bst.3,
  xlab = "geschätzte Wasserverfügbarkeit (ordinal)", ylab = expression(sum("Deckung")),
  main = "Bestäubungstypenspektren",
  sub = expression(Chi^2 * "= 243, p < 0.001"), cex.sub = 0.8, col = cols
)
legend(0, 140,
  legend = c("Wind", "Selbst", "multiple", "Insekten"),
  pch = 15, col = cols[4:1], cex = 0.7
)

# using ggplot2
plot4 <- ggplot(data = bst.1, aes(fill = Bestaeubung, y = Deckung, x = ordered(WasserHH))) +
  geom_bar(position = "stack", stat = "identity") +
  viridis::scale_fill_viridis(
    discrete = TRUE, labels = c("Insekten", "multiple", "Selbst", "Wind")
  ) +
  labs(x = "geschätzte Wasserverfügbarkeit (ordinal)", y = expression(sum("Deckung"))) +
  ggtitle("Bestäubungstypenspektren") +
  theme_minimal()
plot4

# analyse the relationships between traits ---------------------------------------------------------

# test if height varies across life forms
# plot height by life form
ggpubr::ggboxplot(veg, x = "Lebensform", y = "Hoehe.avg.log", select = c("H", "G", "T", "mL"))

# check the ANOVA assumptions
# outliers
group_by(veg, Lebensform) %>%
  identify_outliers(Hoehe.avg.log)
# normality
lm.hoe.lf <- lm(Hoehe.avg.log ~ Lebensform, data = veg)
ggpubr::ggqqplot(residuals(lm.hoe.lf))
rstatix::shapiro_test(residuals(lm.hoe.lf))
# homogeneity of variances
levene_test(veg, Hoehe.avg.log ~ Lebensform)

# perform ANOVA and the Kruskal-Wallis test (non-parametric alternative)
anova_test(veg, Hoehe.avg.log ~ Lebensform)
# because the normality of residuals assumption was violated
# the Kruskal Wallis test should be used instead of a standard ANOVA
kruskal_test(veg, Hoehe.avg.log ~ Lebensform)


# test if the height varies across pollination types
veg$Bestaeubung <- as.factor(veg$Bestaeubung)
ggboxplot(veg, x = "Bestaeubung", y = "Hoehe.avg.log")
group_by(veg, Bestaeubung) %>%
  identify_outliers(Hoehe.avg.log) %>%
  select(Fortlaufendenr, WissName, Hoehe.avg, Hoehe.avg.log, is.outlier, is.extreme)
lm.hoe.best <- lm(Hoehe.avg.log ~ Bestaeubung, data = veg)
ggqqplot(residuals(lm.hoe.best))
shapiro_test(residuals(lm.hoe.best))
levene_test(veg, Hoehe.avg.log ~ Bestaeubung)

# because the normality of residuals assumption was violated
# we use the Kruskal Wallis test instead of a standard ANOVA
# and the Dunn's post hoc test for pairwise comparisons
hoe.best.kw <- kruskal_test(veg, Hoehe.avg.log ~ Bestaeubung)
hoe.best.kw
hoe.best.pwc <- dunn_test(veg, Hoehe.avg.log ~ Bestaeubung)
hoe.best.pwc

# plot the results
hoe.best.pwc %<>% add_xy_position(x = "Bestaeubung")
plot6 <- ggboxplot(veg, x = "Bestaeubung", y = "Germinulenmasse.log") +
  stat_pvalue_manual(hoe.best.pwc, hide.ns = TRUE) +
  labs(
    subtitle = get_test_label(hoe.best.kw, detailed = TRUE),
    caption = get_pwc_label(hoe.best.pwc),
    y = expression(log("Wuchshöhe")),
    x = "Bestäubungstyp"
  )
plot6

# test if the SLA varies across life forms
ggboxplot(veg, x = "Lebensform", y = "SLA.log", select = c("H", "G", "T", "mL"))
group_by(veg, Lebensform) %>%
  identify_outliers(SLA.log)
lm.sla.lf <- lm(SLA.log ~ Lebensform, data = veg)
ggqqplot(residuals(lm.sla.lf))
shapiro_test(residuals(lm.sla.lf))
levene_test(veg, SLA.log ~ Lebensform)
anova_test(veg, SLA.log ~ Lebensform)


# test if the seed mass varies across life forms
gmasse <- select(veg, WissName, Germinulenmasse.log, Lebensform, Bestaeubung) %>%
  distinct()
ggboxplot(gmasse, x = "Lebensform", y = "Germinulenmasse.log")
group_by(gmasse, Lebensform) %>%
  identify_outliers(Germinulenmasse.log)
lm.gmasse.lf <- lm(Germinulenmasse.log ~ Lebensform, data = gmasse)
ggqqplot(residuals(lm.gmasse.lf))
shapiro_test(residuals(lm.gmasse.lf))
levene_test(gmasse, Germinulenmasse.log ~ Lebensform)
gmasse.lf.aov <- anova_test(gmasse, Germinulenmasse.log ~ Lebensform)
gmasse.lf.aov
gmasse.lf.pwc <- tukey_hsd(gmasse, Germinulenmasse.log ~ Lebensform)
gmasse.lf.pwc

gmasse.lf.pwc %<>% add_xy_position(x = "Lebensform")
plot7 <- ggboxplot(gmasse, x = "Lebensform", y = "Germinulenmasse.log") +
  stat_pvalue_manual(gmasse.lf.pwc, hide.ns = TRUE) +
  labs(
    subtitle = get_test_label(gmasse.lf.aov, detailed = TRUE),
    caption = get_pwc_label(gmasse.lf.pwc),
    y = expression(log("Germinulenmasse")),
    x = "Lebensform"
  )
plot7


# test if the seed mass varies across pollination types
ggboxplot(gmasse, x = "Bestaeubung", y = "Germinulenmasse.log")
group_by(gmasse, Bestaeubung) %>%
  identify_outliers(Germinulenmasse.log)
fm.gmasse.best <- lm(Germinulenmasse.log ~ Bestaeubung, data = gmasse)
ggpubr::ggqqplot(residuals(fm.gmasse.best))
rstatix::shapiro_test(residuals(fm.gmasse.best))
ggqqplot(gmasse, "Germinulenmasse.log", facet.by = "Bestaeubung")
levene_test(gmasse, Germinulenmasse.log ~ Bestaeubung)

# because the homogeneity of variance assumption was violated
# we use the Welch test instead of a standard ANOVA
# and the Games-Howell post hoc test for pairwise comparisons
gmasse.best.aov <- welch_anova_test(gmasse, Germinulenmasse.log ~ Bestaeubung)
gmasse.best.aov
gmasse.best.pwc <- games_howell_test(gmasse, Germinulenmasse.log ~ Bestaeubung)
gmasse.best.pwc

gmasse.best.pwc %<>% add_xy_position(x = "Bestaeubung")
ggboxplot(gmasse, x = "Bestaeubung", y = "Germinulenmasse.log") +
  stat_pvalue_manual(gmasse.best.pwc, hide.ns = TRUE) +
  labs(
    subtitle = get_test_label(gmasse.best.aov, detailed = TRUE),
    caption = get_pwc_label(gmasse.best.pwc),
    y = expression(log("Germinulenmasse")),
    x = "Bestäubungstyp"
  )

# save the edited data as a separate file
write_csv2(veg, here("veg_data_2020_edited.csv"))

# analyse species composition ----------------------------------------------------------------------

veg <- read_csv2(here("veg_data_2022_edited.csv"))

mat <- xtabs(Deckung ~ Aufnahmenr + WissName, data = veg)
str(mat)

# perform correspondence analysis
decorana(mat)

fm.ca <- cca(mat)
fm.ca
summary(fm.ca)
plot(fm.ca, display = "sites")
plot(fm.ca)

# shorten taxon names
wname <- str_remove(veg$WissName, pattern = " cf\\.| x") %>%
  str_split(pattern = " ", simplify = TRUE) %>%
  substr(., 1, 3)
wname <- paste(wname[, 1], wname[, 2], sep = ".")
veg %<>% mutate(wname = wname)

# repeat the same analysis with shortened taxon names provided
mat <- xtabs(Deckung ~ Aufnahmenr + wname, data = veg)
fm.ca <- cca(mat)
summary(fm.ca)

# plot the results
summary(fm.ca)$species %>%
  as_tibble() %>%
  mutate(wname = rownames(summary(fm.ca)$species)) %>%
  ggplot(aes(x = CA1, y = CA2, label = wname)) +
  geom_point() +
  ggrepel::geom_text_repel(size = 1.5, force = 15, max.overlaps = 30) +
  theme_bw()

# prepare data for trait ordination ----------------------------------------------------------------

# calculate the proportion of each life form per plot
lf.kt <- select(veg, Aufnahmenr, Deckung, Lebensform) %>%
  pivot_wider(
    names_from = Lebensform, values_from = Deckung,
    values_fn = sum, values_fill = 0, names_prefix = "lf."
  ) %>%
  mutate(total = rowSums(.[, -1])) %>%
  mutate(across(lf.H:lf.M, ~ . / total), .keep = "unused")

# calculate the proportion of each pollination type per plot
best.kt <- select(veg, Aufnahmenr, Deckung, Bestaeubung) %>%
  pivot_wider(
    names_from = Bestaeubung, values_from = Deckung,
    values_fn = sum, values_fill = 0, names_prefix = "b."
  ) %>%
  mutate(total = rowSums(.[, -1])) %>%
  mutate(across(b.wi:b.mb, ~ . / total), .keep = "unused")

# compute community weighted means for numerical traits
cwm <- group_by(veg, Aufnahmenr) %>%
  summarise(across(
    c(Hoehe.avg.log, Hoehe.cv.log, LDMC, SLA.log, Germinulenmasse.log),
    ~ weighted.mean(., w = Deckung, na.rm = TRUE)
  )) %>%
  rename(
    Ho.av = Hoehe.avg.log,
    Ho.cv = Hoehe.cv.log,
    SLA = SLA.log,
    GM = Germinulenmasse.log
  )

# combine the three tables
cwm %<>%
  left_join(lf.kt) %>%
  left_join(best.kt)
cwm

# perform principal components analysis (PCA) of CWMs ----------------------------------------------

# analyse all traits
pca1 <- FactoMineR::PCA(cwm[, -c(1:2)])
summary(pca1)

cowplot::plot_grid(
  plot.PCA(pca1),
  plot.PCA(pca1, c(1, 3))
)

# analyse numerical traits only
colnames(cwm)
pca2 <- PCA(select(cwm, Ho.av:GM))
cowplot::plot_grid(
  plot.PCA(pca2),
  plot.PCA(pca2, c(1, 3))
)

# perform PCA of species traits --------------------------------------------------------------------

# create a separate table containing numerical traits
dat <- select(
  veg, WissName, Vegetation, Blattflaeche, Frischmasse, Trockenmasse,
  SLA, LDMC, Hoehe.avg, Hoehe.var, Germinulenmasse
) %>%
  rename(
    BlF = Blattflaeche,
    FrM = Frischmasse,
    TrM = Trockenmasse,
    Ho.av = Hoehe.avg,
    Ho.var = Hoehe.var,
    GM = Germinulenmasse
  ) %>%
  na.omit()
glimpse(dat)

# perform PCA
pca3 <- FactoMineR::PCA(dat[, -c(1:2)])
summary(pca3)
plot.PCA(pca3)

# add PC to the trait table
dat$pc1 <- pca3$ind$coord[, 1]
dat$pc2 <- pca3$ind$coord[, 2]
pc12 <- pca3$ind$coord[, 1:2]

# plot the PCA results
ggplot(data = dat, aes(x = pc1, y = pc2, color = Vegetation, shape = Vegetation)) +
  geom_hline(yintercept = 0, lty = 2) +
  geom_vline(xintercept = 0, lty = 2) +
  geom_point(alpha = .8) +
  stat_ellipse(
    geom = "polygon", aes(fill = Vegetation), alpha = .2,
    show.legend = FALSE, level = .95
  ) +
  guides(
    color = guide_legend(title = "Vegetation"),
    shape = guide_legend(title = "Vegetation")
  ) +
  labs(x = "PC 1 (45.24%)", y = "PC 2 (20.27%)") +
  theme_minimal() +
  theme(
    panel.grid = element_blank(),
    panel.border = element_rect(fill = "transparent"),
    legend.position = "bottom"
  )

# plot PCA results using kernels
# Code from Björn Reu
# Used to produce Fig. 2 in Diaz et al. 2016, Nature

H <- Hpi(x = pc12) # optimal bandwidth estimation
est <- kde(x = pc12, H = H, compute.cont = TRUE) # kernel density estimation

# set contour probabilities for drawing contour levels
cl <- contourLevels(est, prob = c(.5, .05, .001), approx = TRUE)

# use envfit for drawing arrows, can be also done using trait loadings
fit <- envfit(pc12, select(dat, BlF:GM))

# create a plot
par(mar = c(4, 4, 2, 2))
cols <- ifelse(dat$Vegetation == "TR", "darkred", "darkblue")
plot(est,
  cont = seq(1, 100, by = 1), display = "filled.contour2", add = FALSE, ylab = "", xlab = "",
  cex.axis = .75, ylim = c(-5, 4), xlim = c(-3, 7), las = 1
)
plot(est, abs.cont = cl[1], labels = .5, labcex = .75, add = TRUE, lwd = .75, col = "grey30")
plot(est, abs.cont = cl[2], labels = .95, labcex = .75, add = TRUE, lwd = .5, col = "grey60")
plot(est, abs.cont = cl[3], labels = .99, labcex = .75, add = TRUE, lwd = .5, col = "grey60")
points(pc12, pch = 16, cex = .5, col = cols)
plot(fit, cex = .90, col = 1)
mtext("PC 1 (45.24%)", cex = .75, side = 1, line = 1.8)
mtext("PC 2 (20.27%)", cex = .75, side = 2, line = 1.8)

dat[dat$pc1 > 4, ]
dat[dat$pc2 < -3, ]