Last active
December 17, 2019 09:59
-
-
Save franvillamil/6828dcf47a9c1f69563b8a9aa88956bc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# R Script | |
setwd("~/Downloads") # NOTE you might want to change this! | |
# NOTE: It is usually best to transform Excel files into .csv files (in Save as > Csv file) | |
# but anyway, I'm using here the readxl package. | |
# Packages | |
library(vegan) # Species diversity | |
library(readxl) # Read Excel files | |
# Load data from Excel file | |
# I'm uploading the two matrices in two separate data frames to join them later | |
# NOTE: the as.data.frame at the beginning makes the output of read_excel a normal | |
# data frame, otherwise it is a slightly different data frame | |
vegetation_mat <- as.data.frame(read_excel("Glabbeek.xlsx", sheet = 1)) | |
abiotic_mat <- as.data.frame(read_excel("Glabbeek.xlsx", sheet = 2)) | |
# NOTE: Now you have one variable that is in both dataframes (Plot), | |
# you can use it to merge the data together later on | |
# HOWEVER! You can't include it in the calculations of the species diversity, | |
# so when doing it, just select the rest of columns, | |
# doing "[, -1]" which means "exclude column 1" | |
# ALFA DIVERSITY | |
# (NOTE: I'm not doing the BETA DIVERSITY, AS THIS IS ONLY FOR THE REGRESSIONS) | |
# Species richness | |
S <- specnumber(vegetation_mat[, -1]) | |
# Shannon index | |
H <- diversity(vegetation_mat[, -1], "shannon") | |
# Gini-Simpson index (1-D!!) | |
D <- diversity(vegetation_mat[, -1], "simpson") | |
# 1/D | |
invD <- diversity(vegetation_mat[, -1], "inv") | |
#Shannon evenness | |
J <- H/log(S) | |
#Simpson evenness | |
E <- invD/S | |
# This is not valid for regression, it creates a list of three | |
# variables, for 0, 1, 2... ?? | |
# #Hill numbers | |
# Hill <- tsallis(vegetation_mat[, -1], | |
# hill = TRUE, scales = c(0,1,2)) | |
# Now you want to merge these indices with the abiotic matrix | |
# You first want to check if they have the some order: | |
abiotic_mat$Plot == vegetation_mat$Plot | |
# And because they are all TRUE, you just can add the variables as new columns | |
# (If you had different order, you would have to use "merge" or "match" functions, | |
# or order both of them before from 1 to 100 or whatever) | |
# NOTE: cbind joints data frames and/or variables as columns ("column bind") into a data frame | |
data <- cbind(abiotic_mat, S, H, D, invD, J, E, Hill) | |
head(data) | |
## REGRESSIONS | |
# NOTE: I'm putting all variables as explanatory variables as they stand, without logs or anything | |
# You could do that if you want, just put log(variablename) | |
# NOTE2: lm is the function to calculate linear regression, and it is important that you work | |
# with a data frame that has all the variables INSIDE. | |
# So you just put the formula (dependent variable ~ independent1 + independent2 + etc...) | |
# and the specify the dataframe where those variables are (data = data), but if the name of the | |
# dataframe was e.g. Glabbeek, you would put: data = Glabbeek | |
model1 <- lm(S ~ Alluvial + Area + | |
Circumference + Age + Duration_forest + Duration_grass + Duration_arable, | |
data = data) | |
# After you define and create the model, which is in the object model1 in this case, you can | |
# access the results by putting summary(modelobject) | |
# So in this case: | |
summary(model1) | |
# Which gives you this: | |
# --------------------------------------------------------- | |
# Call: | |
# lm(formula = S ~ Alluvial + Area + Circumference + Age + Duration_forest + | |
# Duration_grass + Duration_arable, data = data) | |
# | |
# Residuals: | |
# Min 1Q Median 3Q Max | |
# -20.3122 -5.6043 -0.0818 6.2390 19.2513 | |
# | |
# Coefficients: (1 not defined because of singularities) | |
# Estimate Std. Error t value Pr(>|t|) | |
# (Intercept) 7.590e+00 4.815e+00 1.576 0.11835 | |
# Alluvial 1.174e+01 2.120e+00 5.535 2.86e-07 *** | |
# Area -4.019e-05 2.058e-04 -0.195 0.84560 | |
# Circumference 2.675e-02 1.004e-02 2.663 0.00914 ** | |
# Age 4.854e-02 2.449e-02 1.982 0.05045 . | |
# Duration_forest -1.741e-02 3.308e-02 -0.526 0.59982 | |
# Duration_grass -1.043e-02 2.198e-02 -0.475 0.63616 | |
# Duration_arable NA NA NA NA | |
# --- | |
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 | |
# | |
# Residual standard error: 8.203 on 93 degrees of freedom | |
# Multiple R-squared: 0.5202, Adjusted R-squared: 0.4893 | |
# F-statistic: 16.81 on 6 and 93 DF, p-value: 4.771e-13 | |
# --------------------------------------------------------- | |
# What does it mean? In this case, Alluvial has a positive effect on | |
# Species richness (S), significant at the 99.9% etc (being Alluvial plot | |
# increases the number of species in 1.174). Circumference also has a positive | |
# and significant effect on species richness. Age shows a positive relationship | |
# but it is only sigificant at the 90% level. All the other variables do not | |
# show significant effects. | |
# NOTE: linear regression coefficients always indicate the increase in the dependent | |
# variable following a 1-UNIT INCREASE in the independent variable. In this case, Alluvial | |
# is just 0 ot 1, so the coefficient indicates the effect of being Alluvial. But with other | |
# variables, e.g. Area, the coefficient indicates the effect of a 1-unit increase in area | |
# (however it is measured, km2 or hectareas etc) | |
# And then you can just create the other models etc | |
# And see the results with summary() | |
model2 <- lm(H ~ Alluvial + Area + | |
Circumference + Age + Duration_forest + Duration_grass + Duration_arable, | |
data = data) | |
model3 <- lm(D ~ Alluvial + Area + | |
Circumference + Age + Duration_forest + Duration_grass + Duration_arable, | |
data = data) | |
model4 <- lm(invD ~ Alluvial + Area + | |
Circumference + Age + Duration_forest + Duration_grass + Duration_arable, | |
data = data) | |
model5 <- lm(J ~ Alluvial + Area + | |
Circumference + Age + Duration_forest + Duration_grass + Duration_arable, | |
data = data) | |
model6 <- lm(E ~ Alluvial + Area + | |
Circumference + Age + Duration_forest + Duration_grass + Duration_arable, | |
data = data) | |
# NOTE! A LITTLE TRICK | |
# There is one package (stargazer) that allows you to create tables of all the models automatically | |
# It is very handy if you write in LaTeX because it creates the table ready for the document, | |
# but you can also create a text version, either to check several models quickly, or to | |
# create the tables for the paper. I don't think you can create tables for Word, | |
# but it'll be quicker in any case. | |
library(stargazer) | |
stargazer(model1, model2, model3, model4, model5, model6, | |
type = "text", # This tells stargazer to produce plain text | |
star.char = c("+", "*", "**", "***"), # This is to indicate how to mark | |
star.cutoffs = c(0.1, 0.05, 0.01, 0.001), # the significance levels, and | |
notes = "+ p<0.1; * p<0.05; ** p<0.01; *** p<0.001", notes.append = F) # to add a note explaining | |
# ========================================================================================= | |
# Dependent variable: | |
# ----------------------------------------------------------- | |
# S H D invD J E | |
# (1) (2) (3) (4) (5) (6) | |
# ----------------------------------------------------------------------------------------- | |
# Alluvial 11.735*** 0.430*** 0.018*** 11.735*** 0.000 0.000 | |
# (2.120) (0.088) (0.005) (2.120) (0.000) (0.000) | |
# | |
# Area -0.00004 -0.00000 -0.00000 -0.00004 -0.000 -0.000 | |
# (0.0002) (0.00001) (0.00000) (0.0002) (0.000) (0.000) | |
# | |
# Circumference 0.027** 0.001* 0.00005* 0.027** 0.000 0.000 | |
# (0.010) (0.0004) (0.00002) (0.010) (0.000) (0.000) | |
# | |
# Age 0.049+ 0.002* 0.0001* 0.049+ -0.000 -0.000 | |
# (0.024) (0.001) (0.0001) (0.024) (0.000) (0.000) | |
# | |
# Duration_forest -0.017 -0.001 -0.0001 -0.017 0.000 0.000 | |
# (0.033) (0.001) (0.0001) (0.033) (0.000) (0.000) | |
# | |
# Duration_grass -0.010 -0.0003 -0.00001 -0.010 -0.000 -0.000 | |
# (0.022) (0.001) (0.00005) (0.022) (0.000) (0.000) | |
# | |
# Duration_arable | |
# | |
# | |
# Constant 7.590 2.499*** 0.925*** 7.590 1.000*** 1.000*** | |
# (4.815) (0.199) (0.010) (4.815) (0.000) (0.000) | |
# | |
# ----------------------------------------------------------------------------------------- | |
# Observations 100 100 100 100 100 100 | |
# R2 0.520 0.444 0.335 0.520 0.500 0.495 | |
# Adjusted R2 0.489 0.408 0.292 0.489 0.468 0.463 | |
# Residual Std. Error (df = 93) 8.203 0.339 0.018 8.203 0.000 0.000 | |
# F Statistic (df = 6; 93) 16.807*** 12.390*** 7.815*** 16.807*** 15.528*** 15.203*** | |
# ========================================================================================= | |
# Note: + p<0.1; * p<0.05; ** p<0.01; *** p<0.001 | |
# And here you can see that Alluvial and circumference have positive and significant effects | |
# on almost all measures of biodiversity (except J and E). Age has similar effects, although | |
# in two cases (S and invD), the effect is only signifcant at the 90% level. | |
# The rest of the variables do not have any significant effect on any measure. | |
# NOTE SO WHAT COULD BE THE CONCLUSION? I dont know about ecology, but it looks like | |
# Alluvial plots, older forests, and forests with a higher circumference all have higher | |
# biodiversity, at least using most of the measures (except the evenness ones) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment