Skip to content

Instantly share code, notes, and snippets.

@noamross
Created November 9, 2012 20:35
Show Gist options
  • Save noamross/4048075 to your computer and use it in GitHub Desktop.
Save noamross/4048075 to your computer and use it in GitHub Desktop.
GAMs using mgcv and amphibian presence/absence dataset
We can make this file beautiful and searchable if this error is corrected: It looks like row 2 should actually have 1 column, instead of 35 in line 1.
#Data provided for purpose of R example only. Not for other use without permission. Contact Rosemary Hartman (rosehartman at gmail dot com.)
lake.ID,name,treatment,shorline,sizehectars,elevation,fish.y.n,fish.breeding,BUBO.breeding,PSRE.breeding,TAGR.present,RACA.present,RACA.breeding,adult.RACA,YOPY.RACA,meta.RACA,RACA.eggs,RACA.larvae,veg.extent,transect.with.veg,ave.0.1m,ave.0.5m,ave.1m,bank.slope,pH,LGWD,SMWD,silt.0.1,silt.total,area.10cm,vegetated.area,near.type,count.RACApops.1K,count.lakes1K,weighted.dist
533,East Boulder,Co,1356.4,11.4,2034.8448,y,y,y,y,n,y,y,6,2,0,0,350,2.28,0.292,0.722916667,5.016666667,10.68333333,0.092475736,8.1,0,0.0417,0.376,0.3472,980.56,3092.59,RACA,5,10,0.8268
534,Mosquito,Co,555.65,2.221,2017,y,y,y,y,n,y,y,2,0,0,0,70,0.425,0.292,0.520833333,4.291666667,14.2875,0.067204288,9,0.25,0.478,0.375,0.6806,289.4,236.15,RACA,0,2,0.52
537,Upper Boulder,Co,530.48,2.009,1927,y,y,y,y,n,y,y,3,5,0,0,240,1.17,0.875,0.46,5.125,14.11666667,0.068711911,8.3,0.04,0,0.875,0.9306,244.02,620.66,FISH,4,9,0.8312
539,Big Marshy,Co,667.29,2.813,1911,y,y,y,y,n,y,y,3,2,0,0,180,0.517,0.826,0.23,1.26,3,0.331579018,8.5,0.26,0.3333,0.83333,0.5278,153.48,344.99,RACA,2,4,0.563
543,Little Marshy,Co,339.29,0.661,1875,y,y,y,y,y,y,y,2,0,0,0,840,0.27,0.875,0.18,2.15,4.142857143,0.233320289,8.1,0.67,0.75,1,1,61.07,91.61,CO,3,4,0.5114
544,Middle Boulder,Co,604.369,2.493,1984.8576,y,y,y,n,n,y,y,7,2,0,0,8000,2.48,0.636,0.327272727,3.313636364,11.25,0.08493805,8.5,0.318,0.54545,0.6364,0.81818,197.79,1498.84,CO,3,7,0.9013
554,Telephone,Co,534.796,1.493,2098.8528,y,n,y,y,n,y,y,8,0,0,0,20,0.292,0.291,0.333333333,1.825,3.9125,0.254833499,8,0,0.4583,0.3333,0.29167,178.27,156.16,CO,4,5,0.4198
561,Virginia,Co,453.892,1.43,2008.0224,y,y,n,n,n,y,y,7,0,0,0,350,1.2,0.708,0.333333333,3.6125,7.095833333,0.136797022,7.4,2.71,0.875,0.9167,0.8472,151.3,544.67,FISH,0,1,0.4821
574,Fish,Co,525.934,1.558,1821.18,y,y,y,n,n,y,y,3,5,0,0,367,6.36,1,0.583333333,6.5625,NA,0.07240347,6.2,0.167,0.217,1,1,306.79,3344.94,RACA,1,2,1.0843
609,Rush Creek,Co,453.08,1.311,2013,y,y,y,y,n,y,y,5,0,0,1,100,1.45,0.9583,0.54,3.995833333,7.038095238,0.13807655,7.45,1.79,0.75,1,0.97222,244.67,656.97,RACA,0,0,0
612,McDonald,Co,714.23,2.77,1797,y,y,n,y,n,y,y,6,0,0,0,57,2.54,0.8333,0.73,3.11,6.09,0.165445662,7.1,1.87,0.958,0.875,0.8611,521.39,1814.15,CO,1,6,0.1235
681,"Boulder Lake, Big",Co,654.43,2.642,1850,y,y,y,y,y,y,n,4,0,1,0,0,0.895,0.571,0.252,4.305,10.21,0.094854622,7.3,2.048,0.7619,0.857,0.9206,164.92,585.72,FISH,3,7,0.5311
682,Conway,Co,217.701,0.283,2078.736,y,y,n,n,n,y,y,2,0,0,0,11,1.426,0.842,0.632,3.247368421,9.331578947,0.105423578,7.6,1.053,0.4737,0.8421,0.8596,137.59,310.44,FISH,1,16,0.0902
688,Union,Co,666.691,1.49,1837.3344,y,y,n,y,n,y,y,9,3,2,0,1,0.654,0.583,2,10,20,0.05,9.2,2.5,0.6667,0.7083,0.8056,500,436.02,RACA,2,4,1.091
722,Salmon,Co,490.09,0.658,2179,y,y,y,y,n,y,y,6,1,0,4,200,1.29,0.895,0.58,2.17,3.8,0.265192487,6.6,1.29,0.3158,0.9474,0.8567,284.25,632.21,RACA,1,11,0.1394
732,El,Co,1071.03,2.259,1990,y,y,y,y,n,y,y,5,0,2,0,635,0.22,0.2,0.49,5.18,5.26,0.141795358,7.1,4.35,0.8,0.9,0.867,524.8,235.63,FISH,0,4,0.2779
531,Mill Creek,fish,377.96,0.713,2008.9368,y,n,y,n,n,y,n,1,0,0,0,0,0.691,0.682,0.295,1.386363636,3.25,0.308095216,8.3,0.591,0.636,0.6818,0.71212,111.5,261.17,RACA,3,4,0.5538
553,Fox Creek,fish,661.25,2.791,2003,y,y,n,y,n,y,n,3,0,0,0,0,2.16,0.542,0.46,6.045833333,10.31363636,0.091902485,7.2,1.04,0.667,0.8333,0.90278,304.17,1428.3,FISH,3,4,0.7386
555,Mavis,fish,502.95,1.721,2048,y,n,n,y,n,y,n,3,1,0,0,0,1.8,0.565,0.37,3.186956522,6.37826087,0.153416601,7.05,2.87,1,0.4782,0.6956,186.09,905.31,RACA,2,2,0.6144
556,West Boulder,fish,552.5,2.053,2122.3224,y,n,y,y,n,n,n,0,0,0,0,0,0.2,0.25,0.25,1.108333333,2.525,0.398011014,7.7,0.583,0.833,0.1667,0.1691,138.12,110.5,RACA,2,2,0.5504
563,Tangle Blue,fish,870.27,4.534,1751,y,y,y,y,y,y,n,1,0,0,0,0,2.08,0.833,0.358,2.43,4.48,0.218926013,7.7,0.375,0.583,0.9583,0.8889,311.56,1810.16,RACA,0,0,0.5399
573,Twin 1,fish,261.673,0.415,1967.1792,y,y,y,y,n,y,n,1,0,0,0,0,4.43,0.857,1.2,4.007142857,21.275,0.043439116,8.6,4.43,0.1429,1,1,314.01,1159.21,RACA,1,5,0.44
576,Long Gulch,fish,1021.974,7.362,1952.5488,y,y,y,n,n,n,n,0,0,0,0,0,0.9125,0.667,0.366666667,1.7,3.425,0.293120249,7.1,3.29,1,0.583,0.513,374.72,932.55,RACA,0,0,0.1676
588,Trail Gulch,fish,983.875,6.777,1961.6928,y,y,y,n,n,n,n,0,0,0,0,0,0.45,0.4583,0.329166667,1.454166667,3.179166667,0.316543667,8.1,1.79,0.875,0.54166,0.4167,323.86,442.74,RACA,0,0,0.6774
593,Granite,fish,594.77,2.297,1964.7408,y,y,n,n,n,n,n,0,0,0,0,0,0,0,0.220833333,1.5875,2.804166667,0.347121369,8.1,1.71,0.9583,0.79167,0.875,131.35,0,RACA,0,0,0.5445
601,Doe,fish,543.717,1.672,2131.1616,y,y,y,n,n,n,n,0,0,0,0,0,0,0,0.170833333,1.683333333,3.4,0.286799343,7.7,2.21,0.91666,0.7917,0.69444,92.88,0,RACA,0,0,0.5531
611,Stoddard,fish,1349.484,12.69,1777.5936,y,n,n,y,n,y,n,1,2,0,0,0,0.492,0.7917,0.470833333,2.591666667,4.670833333,0.21156318,8.1,0.583,0.5417,0.333,0.5278,635.38,663.95,CO,2,6,0.2882
666,Sugar Pine,fish,638.939,2.474,2005.2792,y,y,n,n,y,n,n,0,0,0,0,0,0.192,0.0833,0.220833333,1.504166667,2.916666667,0.337661923,8,0.792,0.7083,0.4583,0.5139,141.1,122.68,FISH,1,2,0.1666
683,Foster,fish,680.13,2.662,2208.276,y,y,n,n,n,y,n,1,0,0,0,0,0,0,0.3625,3.041666667,5.245833333,0.183714243,8.2,2.333,0.833,0.125,0.4028,246.55,0,FISH,1,13,0.1753
684,Lion,fish,471.651,1.544,2132.3808,y,y,n,n,n,y,n,1,0,0,0,0,0.129,0.5417,0.145833333,0.75,1.883333333,0.527896107,8.5,0.5,0.583,0.625,0.444,68.78,60.84,CO,2,18,0.1694
686,Little Boulder,fish,531.091,1.897,1925.7264,y,n,n,n,n,n,n,0,0,0,0,0,1.181,0.857,0.219047619,1.338095238,2.733333333,0.362842014,8.7,1.19,0.762,0.619,0.571,116.33,627.22,FISH,1,3,0.4532
700,Lilypad,fish,349.108,0.846,1915.3632,y,y,y,n,n,n,n,0,0,0,0,0,1.65,0.789,0.705263158,3.636842105,12.67894737,0.076035504,8.1,0.737,0.526,1,1,246.21,576.03,RACA,1,6,0.25
727,Twin 2,fish,238.123,0.319,1549.908,y,y,n,y,y,y,n,1,0,0,0,0,0.281,0.25,0.075,2.1375,11.30625,0.080434362,8.3,2.125,0.75,1,1,17.86,66.91,RACA,1,4,0.34
760,Luella,fish,357,0.942,2110.1304,y,y,y,y,n,y,n,3,0,0,0,0,0.025,0.0833,0.066666667,0.766666667,2.020833333,0.480850091,9.6,0.583,0.7917,0.5417,0.36111,23.8,8.92,FISH,2,7,0.16
765,Forbidden 1,fish,453.246,0.773,1869.948,y,y,n,y,n,n,n,0,0,0,0,0,2.38,0.6,0.645,4.545,3.44,0.162730513,7.3,0.2,0.45,0.6,0.63333,292.34,1078.73,RACA,6,10,0.5672
767,Forbidden 2,fish,272.539,0.355,1875.1296,y,y,n,n,n,y,n,2,0,0,0,0,0.141,0.3182,0.186363636,0.977272727,1.881818182,0.529263988,NA,0.545,0.409,0.4545,0.363636,50.79,38.43,FISH,4,7,0.54
770,Deer,fish,440.21,1.314,2176,y,y,y,n,n,n,n,5,0,0,0,0,0.48,0.625,0.29,1.23,2.73,0.369188018,8.7,0.04,0.0833,0.875,0.75,127.66,211.3,FISH,1,4,0.3665
771,Diamond,fish,372.57,0.949,2200,y,y,n,n,n,n,n,0,0,0,0,0,0.7,0.217,0.02,0.49,1.73,0.546235472,8.85,0.04,0.087,0.8696,0.8116,7.45,260.8,RACA,1,6,0.33
773,Summit,fish,738.19,3.636,2300.0208,y,y,n,n,n,n,n,0,0,0,0,0,0,0,0.2,1.2,2.616666667,0.379639963,8.6,0.083,0.2083,0,0,147.64,0,FISH,2,4,0.428
26177,South Fork 1,fish,529.882,1.823,2041.5504,y,y,n,n,y,n,n,0,0,0,0,0,0.191,0.45833,0.6,3.341666667,5.8625,0.167922684,8.6,1.416,1,0.4583,0.5556,317.93,101.21,FISH,0,4,0
26184,U Boulder Pond 1,fish,243.28,0.441,2072.64,y,y,n,n,n,y,n,4,1,0,0,0,0.742,0.917,0.441666667,1.891666667,4.025,0.250498382,8.3,0.083,0.4167,0.9167,0.8889,107.45,180.51,FISH,5,10,0.5
26187,M Boulder Pond 1,fish,234.927,0.375,2133.6,y,y,n,n,n,y,n,1,0,0,0,0,0.575,1,0.15,1.25,3.233333333,0.302735107,9.1,0.25,0.5,1,0.8056,35.24,135.08,FISH,2,5,0.482
## GAMs using mgcv and amphibian presence/absence dataset
## code and data by Rosemary Hartman (rosehartman at gmail dot com)
## First, load the mgcv package (Simon Wood)
library(mgcv)
## Now, attach your data set (or my data set)
fishlakes <- read.csv("fishlakes.csv", comment.char="#")
## create a full exploritory model using varibles you think are biologically
## significant (this is pretty subjective, and where you need to use your brain)
## For my data set, I am trying to predict the presence of cascades frogs
fishlakes.logr <- gam(treatment ~ ## "treatment" = breeding cascades frogs in lake
BUBO.breeding + ## breeding toads in lake
count.RACApops.1K + ## number of cascades frog populations within 1km
s(silt.total, k=6) + ## amount of silt in lake bed
s(vegetated.area, k=7) + ## total lake area with emergent aquatic vegetation
s(area.10cm, k=6) + ## total lake area less than 10cm deep
s(weighted.dist, k=6) + ## perportion of nearby water bodies with frogs weighted by distance
s(bank.slope, k=6), ## average slope
family=binomial("logit"), data= fishlakes) ## this is a logistic function since I have binomial data
summary(fishlakes.logr) ## Note, this model explains 68.5% of devience
gam.check(fishlakes.logr) ## It seems to think my ks are too low, but raising them
## gives huge overspecification. I'm not sure what to do about this.
plot(fishlakes.logr) ## diagnostic plots for the smooths
## To choose my k's, I looped through a bunch of different values for k, and basically
## just played around until my model stopped overspecifying
for (i in 2:8) { fishlakes.logr <- gam(treatment ~ count.RACApops.1K + BUBO.breeding + s(silt.total, k=i) + s(vegetated.area, k=i) + s(bank.slope, k=i) + s(area.10cm, k=i) + s(weighted.dist, k=i), family=binomial("logit"), data= fishlakes);
x <- summary(fishlakes.logr);
y <- AIC(fishlakes.logr);
print(i)
print(x)
print(y)
}
for (i in 2:10) { fishlakes.logr <- gam(treatment ~ count.RACApops.1K + BUBO.breeding + s(silt.total, k=i) + s(vegetated.area, k=7) + s(bank.slope, k=6) + s(area.10cm, k=6) + s(weighted.dist, k=6), family=binomial("logit"), data= fishlakes);
x <- summary(fishlakes.logr);
y <- AIC(fishlakes.logr);
print(i)
print(x)
print(y)
}
## In order to see which of the candidate terms are significant, I will run the model
## several times, dropping one term out each time and then do a likelyhood ratio test
## to see if adding the term leads to a significant improvement in residual deviance.
## fishlakes2.logr is the same model as fishlakes.logr without the "bank slope" term
fishlakes2.logr <- gam(treatment ~ BUBO.breeding + count.RACApops.1K +
s(silt.total, k=6) + s(vegetated.area, k=7) + s(area.10cm, k=6) + s(weighted.dist, k=6),
family=binomial("logit"), data= fishlakes)
summary(fishlakes2.logr) ## This model only explains 34.1% of deviance
anova(fishlakes2.logr, fishlakes.logr, test = "LRT") ## There is a significant difference
## in explanitory power between the two models, that means the "bank slope" term
## should be included in the final model
## Now do the same with the rest of the varibles
fishlakes3.logr <- gam(treatment ~ BUBO.breeding + count.RACApops.1K +s(silt.total, k=6)
+ s(vegetated.area, k=7) + s(bank.slope, k=6) + s(area.10cm, k=6),
family=binomial("logit"), data= fishlakes) ## all variables except "weighted distance"
summary(fishlakes3.logr) ## This model is way overspecified (100% of devience explained)
anova(fishlakes3.logr, fishlakes.logr, test = "LRT") ## significant differnce!
## Include "weighted distance" in final model
fishlakes4.logr <- gam(treatment ~ BUBO.breeding +count.RACApops.1K+
s(silt.total, k=6) + s(vegetated.area, k=7) +
s(bank.slope, k=6) + s(weighted.dist, k=6),
family=binomial("logit"), data= fishlakes) ## all variables except area under 10cm
summary(fishlakes4.logr) ## This model only explains 54.5% of deviance
anova(fishlakes4.logr, fishlakes.logr, test = "LRT") ## the difference is not
## significant, leave "area under 10cm" out of final model
fishlakes5.logr <- gam(treatment ~ BUBO.breeding +
s(silt.total, k=6) + s(vegetated.area, k=7)+ s(area.10cm, k=6) +
s(weighted.dist, k=6), family=binomial("logit"), data= fishlakes) ## all variables except number
## of frog populations within 1km
summary(fishlakes5.logr) ## This model only explains 31.8% of deviance
anova(fishlakes5.logr, fishlakes.logr, test = "LRT") ## significant differnce!
## Include "count RACA pops" in final model
fishlakes6.logr <-gam(treatment ~ BUBO.breeding + count.RACApops.1K+
s(silt.total, k=6) + s(bank.slope, k=6) + s(area.10cm, k=6) + s(weighted.dist, k=6),
family=binomial("logit"), data= fishlakes) ## all varibles except vegetated area
summary(fishlakes6.logr) ## This model explains 61.4% of deviance
anova(fishlakes6.logr, fishlakes.logr, test = "LRT") ## Only marginally significant
## maybe drop vegetated area out of final model
fishlakes7.logr <- gam(treatment ~ BUBO.breeding + count.RACApops.1K+ s(vegetated.area, k=7) + s(bank.slope, k=6) + s(area.10cm, k=6) +
s(weighted.dist, k=6), family=binomial("logit"), data= fishlakes) ## all variables except silt
summary(fishlakes7.logr) ## This model explains 63.3% of deviance
anova(fishlakes7.logr, fishlakes.logr, test = "LRT") ## No difference. Definitely
## drop silt out of final model
fishlakes8.logr <- gam(treatment ~ count.RACApops.1K+ s(vegetated.area, k=7) + s(silt.total, k=6) +
s(bank.slope, k=5) + s(area.10cm, k=5) + s(weighted.dist, k=5),
family=binomial("logit"), data= fishlakes) ## all variables except toads
summary(fishlakes8.logr) ## This model only explains 49.7% of deviance
anova(fishlakes8.logr, fishlakes.logr, test = "LRT") ## significant differnce!
## Include breeding toads in final model
## Now that we know what variables we are using, we need to figure out how to put them all in
## a useful model.
fish1 <- gam(treatment ~ BUBO.breeding + count.RACApops.1K + s(bank.slope, k=6) + s(vegetated.area, k=7) +
s(weighted.dist, k=6), family=binomial("logit"), data= fishlakes)
summary(fish1)
## Look at AICc of the models we have so far
library("bbmle")
AICctab(fishlakes.logr, fishlakes2.logr, fishlakes4.logr, fishlakes5.logr, fishlakes6.logr, fishlakes7.logr, fishlakes8.logr, fish1, base = TRUE, weights = TRUE, nobs=nrow(fishlakes))
## Now we can come up with more models, look for interactions, cross-validate the
## best ones we have so far, or just give up.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment