Created
April 1, 2019 15:56
-
-
Save xie186/3e3aa2b9853e37a059f109e11ecaecea to your computer and use it in GitHub Desktop.
1 R code for Random Forest s permutation test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
R code for Random Forest | |
s | |
permutation test | |
library( | |
gdata | |
) | |
library( | |
randomForest | |
) | |
# mass shift | |
data file, 'original_data' | |
# column 1: whole number | |
associated with each | |
mass | |
shift | |
; | |
'bin_number' (1,2,3,...n) | |
# column 2: | |
mass shift | |
; | |
'm | |
ass | |
_shift | |
' | |
# column | |
3...n: m | |
ass shift | |
data for each sample; | |
'sample_1'...'sample_n', | |
# sample names data file; 'sample_names' | |
# column 1: sample names from original_data; 'sample' | |
# column 2: type of sample; 'class' | |
# read original data | |
orig.df | |
< | |
- | |
read.xls( | |
"original_data.xlsx" | |
, | |
stringsAsFactors | |
= | |
FALSE | |
) | |
# extract bin ids into vector | |
bin.id | |
< | |
- | |
orig.df | |
$ | |
bin_number | |
# extract mass values and create column names | |
mod.col | |
< | |
- | |
paste( | |
"mass | |
_shift | |
" | |
, | |
orig.df | |
$ | |
mass_shift | |
, | |
sep | |
= | |
"_" | |
) | |
# create transposed matrix of | |
abundances and rename columns | |
orig.mat | |
< | |
- | |
t( | |
orig.df | |
[, | |
- | |
( | |
1 | |
: | |
2 | |
)]) | |
colnames( | |
orig.mat | |
) < | |
- | |
mod.col | |
# function takes a matrix and vector of bins (as long as number of | |
columns in 'mat') | |
# returns matrix of summed values for each bin | |
sumBins | |
< | |
- | |
function | |
( | |
mat | |
, | |
bins | |
) | |
{ | |
sums | |
< | |
- | |
tapply( | |
1 | |
:ncol( | |
mat | |
), | |
bins | |
, | |
function | |
( | |
i | |
) { | |
bin.mat | |
< | |
- | |
orig.mat | |
[, | |
i | |
, | |
drop | |
= | |
FALSE | |
] | |
rowSums( | |
bin.mat | |
, | |
na.rm | |
= | |
TRUE | |
) | |
}) | |
do.call( | |
cbind | |
, | |
sums | |
) | |
} | |
# this is the empirical matrix of summed values | |
orig.bin.mat | |
< | |
- | |
sumBins( | |
orig.mat | |
, | |
bin.id | |
) | |
# | |
this is one matrix of randomly assigned bins (with bin frequency | |
kept constant) | |
ran.bin.mat1 | |
< | |
- | |
sumBins( | |
orig.mat | |
, sample( | |
bin.id | |
)) | |
# read sample | |
- | |
class assignments | |
sample.names | |
< | |
- | |
read.xls( | |
"sample_names.xlsx" | |
, | |
stringsAsFactors | |
= | |
FALSE | |
) | |
rownames( | |
sample.names | |
) < | |
- | |
sample.names | |
$ | |
sample | |
row.class | |
< | |
- | |
factor( | |
sample.names | |
[rownames( | |
orig.bin.mat | |
), | |
"class" | |
]) | |
# the class frequencies do not vary, so calculate them here: | |
2 | |
freq | |
= table( | |
row.class | |
) | |
n | |
= min(ceiling( | |
freq | |
/ | |
2 | |
)) | |
n | |
= max( | |
n | |
, | |
4 | |
) | |
n | |
= rep( | |
n | |
, length( | |
freq | |
)) | |
# setup a r | |
andomForest function for your observed and null models | |
rfFunc | |
< | |
- | |
function | |
( | |
mat | |
, | |
y | |
, | |
n | |
) { | |
randomForest( | |
mat | |
, | |
y | |
, | |
proximity | |
= | |
TRUE | |
, | |
importance | |
= | |
TRUE | |
, | |
ntree | |
= | |
10000 | |
, | |
sampsize | |
= | |
n | |
, | |
replace | |
= | |
FALSE | |
) | |
} | |
# run your observed randomForest | |
obs.rf | |
< | |
- | |
rfFunc( | |
orig.bin.mat | |
, | |
row.class | |
, | |
n | |
) | |
obs.oob | |
< | |
- | |
obs.rf | |
$ | |
err.rate | |
[nrow( | |
obs.rf | |
$ | |
err.rate | |
), | |
1 | |
] | |
# use lapply to collect random forest models for your null | |
distribution: | |
null.rf | |
< | |
- | |
lapply( | |
1 | |
: | |
10 | |
, | |
function | |
( | |
i | |
) { | |
cat( | |
i | |
, | |
" | |
\ | |
n" | |
) | |
rfFunc(sumBins( | |
orig.mat | |
, sample( | |
bin.id | |
)), | |
row.class | |
, | |
n | |
) | |
}) | |
# get whatever you want from models by looping through null.rf | |
null.oob.dist | |
< | |
- | |
sapply( | |
null.rf | |
, | |
function | |
( | |
x | |
) | |
x | |
$ | |
err.rate | |
[nrow( | |
x | |
$ | |
err.rate | |
), | |
1 | |
]) | |
hist( | |
null.oob.dist | |
) | |
abline( | |
v | |
= | |
obs.oob | |
, | |
lty | |
= | |
"dashed" | |
, | |
col | |
= | |
"red" | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment