franvillamil/elias.R

## elias.R
# R Script

setwd("~/Downloads") # NOTE you might want to change this!

# NOTE: It is usually best to transform Excel files into .csv files (in Save as > Csv file)
# but anyway, I'm using here the readxl package.

# Packages

library(vegan) # Species diversity
library(readxl) # Read Excel files

# Load data from Excel file
# I'm uploading the two matrices in two separate data frames to join them later
# NOTE: the as.data.frame at the beginning makes the output of read_excel a normal
# data frame, otherwise it is a slightly different data frame
vegetation_mat <- as.data.frame(read_excel("Glabbeek.xlsx", sheet = 1))
abiotic_mat <- as.data.frame(read_excel("Glabbeek.xlsx", sheet = 2))

# NOTE: Now you have one variable that is in both dataframes (Plot),
# you can use it to merge the data together later on
# HOWEVER! You can't include it in the calculations of the species diversity,
# so when doing it, just select the rest of columns,
# doing "[, -1]" which means "exclude column 1"

# ALFA DIVERSITY
# (NOTE: I'm not doing the BETA DIVERSITY, AS THIS IS ONLY FOR THE REGRESSIONS)

# Species richness
S <- specnumber(vegetation_mat[, -1])

# Shannon index
H <- diversity(vegetation_mat[, -1], "shannon")

# Gini-Simpson index (1-D!!)
D <- diversity(vegetation_mat[, -1], "simpson")

# 1/D
invD <- diversity(vegetation_mat[, -1], "inv")

#Shannon evenness
J <- H/log(S)

#Simpson evenness
E <- invD/S

# This is not valid for regression, it creates a list of three
# variables, for 0, 1, 2... ??
# #Hill numbers
# Hill <- tsallis(vegetation_mat[, -1],
#   hill = TRUE, scales = c(0,1,2))


# Now you want to merge these indices with the abiotic matrix
# You first want to check if they have the some order:
abiotic_mat$Plot == vegetation_mat$Plot
# And because they are all TRUE, you just can add the variables as new columns
# (If you had different order, you would have to use "merge" or "match" functions,
# or order both of them before from 1 to 100 or whatever)
# NOTE: cbind joints data frames and/or variables as columns ("column bind") into a data frame
data <- cbind(abiotic_mat, S, H, D, invD, J, E, Hill)
head(data)

## REGRESSIONS

# NOTE: I'm putting all variables as explanatory variables as they stand, without logs or anything
# You could do that if you want, just put log(variablename)
# NOTE2: lm is the function to calculate linear regression, and it is important that you work
# with a data frame that has all the variables INSIDE.
# So you just put the formula (dependent variable ~ independent1 + independent2 + etc...)
# and the specify the dataframe where those variables are (data = data), but if the name of the
# dataframe was e.g. Glabbeek, you would put: data = Glabbeek


model1 <- lm(S ~ Alluvial + Area +
  Circumference + Age + Duration_forest + Duration_grass + Duration_arable,
  data = data)

# After you define and create the model, which is in the object model1 in this case, you can
# access the results by putting summary(modelobject)
# So in this case:
summary(model1)
# Which gives you this:
# ---------------------------------------------------------
# Call:
# lm(formula = S ~ Alluvial + Area + Circumference + Age + Duration_forest +
#     Duration_grass + Duration_arable, data = data)
#
# Residuals:
#      Min       1Q   Median       3Q      Max
# -20.3122  -5.6043  -0.0818   6.2390  19.2513
#
# Coefficients: (1 not defined because of singularities)
#                   Estimate Std. Error t value Pr(>|t|)
# (Intercept)      7.590e+00  4.815e+00   1.576  0.11835
# Alluvial         1.174e+01  2.120e+00   5.535 2.86e-07 ***
# Area            -4.019e-05  2.058e-04  -0.195  0.84560
# Circumference    2.675e-02  1.004e-02   2.663  0.00914 **
# Age              4.854e-02  2.449e-02   1.982  0.05045 .
# Duration_forest -1.741e-02  3.308e-02  -0.526  0.59982
# Duration_grass  -1.043e-02  2.198e-02  -0.475  0.63616
# Duration_arable         NA         NA      NA       NA
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 8.203 on 93 degrees of freedom
# Multiple R-squared:  0.5202,	Adjusted R-squared:  0.4893
# F-statistic: 16.81 on 6 and 93 DF,  p-value: 4.771e-13
# ---------------------------------------------------------

# What does it mean? In this case, Alluvial has a positive effect on
# Species richness (S), significant at the 99.9% etc (being Alluvial plot
# increases the number of species in 1.174). Circumference also has a positive
# and significant effect on species richness. Age shows a positive relationship
# but it is only sigificant at the 90% level. All the other variables do not
# show significant effects.

# NOTE: linear regression coefficients always indicate the increase in the dependent
# variable following a 1-UNIT INCREASE in the independent variable. In this case, Alluvial
# is just 0 ot 1, so the coefficient indicates the effect of being Alluvial. But with other
# variables, e.g. Area, the coefficient indicates the effect of a 1-unit increase in area
# (however it is measured, km2 or hectareas etc)


# And then you can just create the other models etc
# And see the results with summary()

model2 <- lm(H ~ Alluvial + Area +
  Circumference + Age + Duration_forest + Duration_grass + Duration_arable,
  data = data)

model3 <- lm(D ~ Alluvial + Area +
  Circumference + Age + Duration_forest + Duration_grass + Duration_arable,
  data = data)

model4 <- lm(invD ~ Alluvial + Area +
  Circumference + Age + Duration_forest + Duration_grass + Duration_arable,
  data = data)

model5 <- lm(J ~ Alluvial + Area +
  Circumference + Age + Duration_forest + Duration_grass + Duration_arable,
  data = data)

model6 <- lm(E ~ Alluvial + Area +
  Circumference + Age + Duration_forest + Duration_grass + Duration_arable,
  data = data)

# NOTE! A LITTLE TRICK
# There is one package (stargazer) that allows you to create tables of all the models automatically
# It is very handy if you write in LaTeX because it creates the table ready for the document,
# but you can also create a text version, either to check several models quickly, or to
# create the tables for the paper. I don't think you can create tables for Word,
# but it'll be quicker in any case.
library(stargazer)

stargazer(model1, model2, model3, model4, model5, model6,
  type = "text", # This tells stargazer to produce plain text
  star.char = c("+", "*", "**", "***"), # This is to indicate how to mark
  star.cutoffs = c(0.1, 0.05, 0.01, 0.001), # the significance levels, and
  notes = "+ p<0.1; * p<0.05; ** p<0.01; *** p<0.001", notes.append = F) # to add a note explaining

# =========================================================================================
#                                                   Dependent variable:
#                               -----------------------------------------------------------
#                                   S         H         D       invD        J         E
#                                  (1)       (2)       (3)       (4)       (5)       (6)
# -----------------------------------------------------------------------------------------
# Alluvial                      11.735*** 0.430***  0.018***  11.735***   0.000     0.000
#                                (2.120)   (0.088)   (0.005)   (2.120)   (0.000)   (0.000)
#
# Area                          -0.00004  -0.00000  -0.00000  -0.00004   -0.000    -0.000
#                               (0.0002)  (0.00001) (0.00000) (0.0002)   (0.000)   (0.000)
#
# Circumference                  0.027**   0.001*   0.00005*   0.027**    0.000     0.000
#                                (0.010)  (0.0004)  (0.00002)  (0.010)   (0.000)   (0.000)
#
# Age                            0.049+    0.002*    0.0001*   0.049+    -0.000    -0.000
#                                (0.024)   (0.001)  (0.0001)   (0.024)   (0.000)   (0.000)
#
# Duration_forest                -0.017    -0.001    -0.0001   -0.017     0.000     0.000
#                                (0.033)   (0.001)  (0.0001)   (0.033)   (0.000)   (0.000)
#
# Duration_grass                 -0.010    -0.0003  -0.00001   -0.010    -0.000    -0.000
#                                (0.022)   (0.001)  (0.00005)  (0.022)   (0.000)   (0.000)
#
# Duration_arable
#
#
# Constant                        7.590   2.499***  0.925***    7.590   1.000***  1.000***
#                                (4.815)   (0.199)   (0.010)   (4.815)   (0.000)   (0.000)
#
# -----------------------------------------------------------------------------------------
# Observations                     100       100       100       100       100       100
# R2                              0.520     0.444     0.335     0.520     0.500     0.495
# Adjusted R2                     0.489     0.408     0.292     0.489     0.468     0.463
# Residual Std. Error (df = 93)   8.203     0.339     0.018     8.203     0.000     0.000
# F Statistic (df = 6; 93)      16.807*** 12.390*** 7.815***  16.807*** 15.528*** 15.203***
# =========================================================================================
# Note:                                           + p<0.1; * p<0.05; ** p<0.01; *** p<0.001

# And here you can see that Alluvial and circumference have positive and significant effects
# on almost all measures of biodiversity (except J and E). Age has similar effects, although
# in two cases (S and invD), the effect is only signifcant at the 90% level.
# The rest of the variables do not have any significant effect on any measure.

# NOTE SO WHAT COULD BE THE CONCLUSION? I dont know about ecology, but it looks like
# Alluvial plots, older forests, and forests with a higher circumference all have higher
# biodiversity, at least using most of the measures (except the evenness ones)
	# R Script

	setwd("~/Downloads") # NOTE you might want to change this!

	# NOTE: It is usually best to transform Excel files into .csv files (in Save as > Csv file)
	# but anyway, I'm using here the readxl package.

	# Packages

	library(vegan) # Species diversity
	library(readxl) # Read Excel files

	# Load data from Excel file
	# I'm uploading the two matrices in two separate data frames to join them later
	# NOTE: the as.data.frame at the beginning makes the output of read_excel a normal
	# data frame, otherwise it is a slightly different data frame
	vegetation_mat <- as.data.frame(read_excel("Glabbeek.xlsx", sheet = 1))
	abiotic_mat <- as.data.frame(read_excel("Glabbeek.xlsx", sheet = 2))

	# NOTE: Now you have one variable that is in both dataframes (Plot),
	# you can use it to merge the data together later on
	# HOWEVER! You can't include it in the calculations of the species diversity,
	# so when doing it, just select the rest of columns,
	# doing "[, -1]" which means "exclude column 1"

	# ALFA DIVERSITY
	# (NOTE: I'm not doing the BETA DIVERSITY, AS THIS IS ONLY FOR THE REGRESSIONS)

	# Species richness
	S <- specnumber(vegetation_mat[, -1])

	# Shannon index
	H <- diversity(vegetation_mat[, -1], "shannon")

	# Gini-Simpson index (1-D!!)
	D <- diversity(vegetation_mat[, -1], "simpson")

	# 1/D
	invD <- diversity(vegetation_mat[, -1], "inv")

	#Shannon evenness
	J <- H/log(S)

	#Simpson evenness
	E <- invD/S

	# This is not valid for regression, it creates a list of three
	# variables, for 0, 1, 2... ??
	# #Hill numbers
	# Hill <- tsallis(vegetation_mat[, -1],
	# hill = TRUE, scales = c(0,1,2))


	# Now you want to merge these indices with the abiotic matrix
	# You first want to check if they have the some order:
	abiotic_mat$Plot == vegetation_mat$Plot
	# And because they are all TRUE, you just can add the variables as new columns
	# (If you had different order, you would have to use "merge" or "match" functions,
	# or order both of them before from 1 to 100 or whatever)
	# NOTE: cbind joints data frames and/or variables as columns ("column bind") into a data frame
	data <- cbind(abiotic_mat, S, H, D, invD, J, E, Hill)
	head(data)

	## REGRESSIONS

	# NOTE: I'm putting all variables as explanatory variables as they stand, without logs or anything
	# You could do that if you want, just put log(variablename)
	# NOTE2: lm is the function to calculate linear regression, and it is important that you work
	# with a data frame that has all the variables INSIDE.
	# So you just put the formula (dependent variable ~ independent1 + independent2 + etc...)
	# and the specify the dataframe where those variables are (data = data), but if the name of the
	# dataframe was e.g. Glabbeek, you would put: data = Glabbeek



	model1 <- lm(S ~ Alluvial + Area +
	Circumference + Age + Duration_forest + Duration_grass + Duration_arable,
	data = data)

	# After you define and create the model, which is in the object model1 in this case, you can
	# access the results by putting summary(modelobject)
	# So in this case:
	summary(model1)
	# Which gives you this:
	# ---------------------------------------------------------
	# Call:
	# lm(formula = S ~ Alluvial + Area + Circumference + Age + Duration_forest +
	# Duration_grass + Duration_arable, data = data)
	#
	# Residuals:
	# Min 1Q Median 3Q Max
	# -20.3122 -5.6043 -0.0818 6.2390 19.2513
	#
	# Coefficients: (1 not defined because of singularities)
	# Estimate Std. Error t value Pr(>\|t\|)
	# (Intercept) 7.590e+00 4.815e+00 1.576 0.11835
	# Alluvial 1.174e+01 2.120e+00 5.535 2.86e-07 ***
	# Area -4.019e-05 2.058e-04 -0.195 0.84560
	# Circumference 2.675e-02 1.004e-02 2.663 0.00914 **
	# Age 4.854e-02 2.449e-02 1.982 0.05045 .
	# Duration_forest -1.741e-02 3.308e-02 -0.526 0.59982
	# Duration_grass -1.043e-02 2.198e-02 -0.475 0.63616
	# Duration_arable NA NA NA NA
	# ---
	# Signif. codes: 0 ‘*’ 0.001 ‘’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
	#
	# Residual standard error: 8.203 on 93 degrees of freedom
	# Multiple R-squared: 0.5202, Adjusted R-squared: 0.4893
	# F-statistic: 16.81 on 6 and 93 DF, p-value: 4.771e-13
	# ---------------------------------------------------------

	# What does it mean? In this case, Alluvial has a positive effect on
	# Species richness (S), significant at the 99.9% etc (being Alluvial plot
	# increases the number of species in 1.174). Circumference also has a positive
	# and significant effect on species richness. Age shows a positive relationship
	# but it is only sigificant at the 90% level. All the other variables do not
	# show significant effects.

	# NOTE: linear regression coefficients always indicate the increase in the dependent
	# variable following a 1-UNIT INCREASE in the independent variable. In this case, Alluvial
	# is just 0 ot 1, so the coefficient indicates the effect of being Alluvial. But with other
	# variables, e.g. Area, the coefficient indicates the effect of a 1-unit increase in area
	# (however it is measured, km2 or hectareas etc)


	# And then you can just create the other models etc
	# And see the results with summary()

	model2 <- lm(H ~ Alluvial + Area +
	Circumference + Age + Duration_forest + Duration_grass + Duration_arable,
	data = data)

	model3 <- lm(D ~ Alluvial + Area +
	Circumference + Age + Duration_forest + Duration_grass + Duration_arable,
	data = data)

	model4 <- lm(invD ~ Alluvial + Area +
	Circumference + Age + Duration_forest + Duration_grass + Duration_arable,
	data = data)

	model5 <- lm(J ~ Alluvial + Area +
	Circumference + Age + Duration_forest + Duration_grass + Duration_arable,
	data = data)

	model6 <- lm(E ~ Alluvial + Area +
	Circumference + Age + Duration_forest + Duration_grass + Duration_arable,
	data = data)

	# NOTE! A LITTLE TRICK
	# There is one package (stargazer) that allows you to create tables of all the models automatically
	# It is very handy if you write in LaTeX because it creates the table ready for the document,
	# but you can also create a text version, either to check several models quickly, or to
	# create the tables for the paper. I don't think you can create tables for Word,
	# but it'll be quicker in any case.
	library(stargazer)

	stargazer(model1, model2, model3, model4, model5, model6,
	type = "text", # This tells stargazer to produce plain text
	star.char = c("+", "", "", "**"), # This is to indicate how to mark
	star.cutoffs = c(0.1, 0.05, 0.01, 0.001), # the significance levels, and
	notes = "+ p<0.1; * p<0.05; p<0.01; * p<0.001", notes.append = F) # to add a note explaining

	# =========================================================================================
	# Dependent variable:
	# -----------------------------------------------------------
	# S H D invD J E
	# (1) (2) (3) (4) (5) (6)
	# -----------------------------------------------------------------------------------------
	# Alluvial 11.735* 0.430* 0.018* 11.735* 0.000 0.000
	# (2.120) (0.088) (0.005) (2.120) (0.000) (0.000)
	#
	# Area -0.00004 -0.00000 -0.00000 -0.00004 -0.000 -0.000
	# (0.0002) (0.00001) (0.00000) (0.0002) (0.000) (0.000)
	#
	# Circumference 0.027** 0.001* 0.00005* 0.027** 0.000 0.000
	# (0.010) (0.0004) (0.00002) (0.010) (0.000) (0.000)
	#
	# Age 0.049+ 0.002* 0.0001* 0.049+ -0.000 -0.000
	# (0.024) (0.001) (0.0001) (0.024) (0.000) (0.000)
	#
	# Duration_forest -0.017 -0.001 -0.0001 -0.017 0.000 0.000
	# (0.033) (0.001) (0.0001) (0.033) (0.000) (0.000)
	#
	# Duration_grass -0.010 -0.0003 -0.00001 -0.010 -0.000 -0.000
	# (0.022) (0.001) (0.00005) (0.022) (0.000) (0.000)
	#
	# Duration_arable
	#
	#
	# Constant 7.590 2.499* 0.925* 7.590 1.000* 1.000*
	# (4.815) (0.199) (0.010) (4.815) (0.000) (0.000)
	#
	# -----------------------------------------------------------------------------------------
	# Observations 100 100 100 100 100 100
	# R2 0.520 0.444 0.335 0.520 0.500 0.495
	# Adjusted R2 0.489 0.408 0.292 0.489 0.468 0.463
	# Residual Std. Error (df = 93) 8.203 0.339 0.018 8.203 0.000 0.000
	# F Statistic (df = 6; 93) 16.807* 12.390* 7.815* 16.807* 15.528* 15.203*
	# =========================================================================================
	# Note: + p<0.1; * p<0.05; p<0.01; * p<0.001

	# And here you can see that Alluvial and circumference have positive and significant effects
	# on almost all measures of biodiversity (except J and E). Age has similar effects, although
	# in two cases (S and invD), the effect is only signifcant at the 90% level.
	# The rest of the variables do not have any significant effect on any measure.

	# NOTE SO WHAT COULD BE THE CONCLUSION? I dont know about ecology, but it looks like
	# Alluvial plots, older forests, and forests with a higher circumference all have higher
	# biodiversity, at least using most of the measures (except the evenness ones)