Skip to content

Instantly share code, notes, and snippets.

@xie186
Created April 1, 2019 15:56
Show Gist options
  • Save xie186/3e3aa2b9853e37a059f109e11ecaecea to your computer and use it in GitHub Desktop.
Save xie186/3e3aa2b9853e37a059f109e11ecaecea to your computer and use it in GitHub Desktop.
1 R code for Random Forest s permutation test
R code for Random Forest
s
permutation test
library(
gdata
)
library(
randomForest
)
# mass shift
data file, 'original_data'
# column 1: whole number
associated with each
mass
shift
;
'bin_number' (1,2,3,...n)
# column 2:
mass shift
;
'm
ass
_shift
'
# column
3...n: m
ass shift
data for each sample;
'sample_1'...'sample_n',
# sample names data file; 'sample_names'
# column 1: sample names from original_data; 'sample'
# column 2: type of sample; 'class'
# read original data
orig.df
<
-
read.xls(
"original_data.xlsx"
,
stringsAsFactors
=
FALSE
)
# extract bin ids into vector
bin.id
<
-
orig.df
$
bin_number
# extract mass values and create column names
mod.col
<
-
paste(
"mass
_shift
"
,
orig.df
$
mass_shift
,
sep
=
"_"
)
# create transposed matrix of
abundances and rename columns
orig.mat
<
-
t(
orig.df
[,
-
(
1
:
2
)])
colnames(
orig.mat
) <
-
mod.col
# function takes a matrix and vector of bins (as long as number of
columns in 'mat')
# returns matrix of summed values for each bin
sumBins
<
-
function
(
mat
,
bins
)
{
sums
<
-
tapply(
1
:ncol(
mat
),
bins
,
function
(
i
) {
bin.mat
<
-
orig.mat
[,
i
,
drop
=
FALSE
]
rowSums(
bin.mat
,
na.rm
=
TRUE
)
})
do.call(
cbind
,
sums
)
}
# this is the empirical matrix of summed values
orig.bin.mat
<
-
sumBins(
orig.mat
,
bin.id
)
#
this is one matrix of randomly assigned bins (with bin frequency
kept constant)
ran.bin.mat1
<
-
sumBins(
orig.mat
, sample(
bin.id
))
# read sample
-
class assignments
sample.names
<
-
read.xls(
"sample_names.xlsx"
,
stringsAsFactors
=
FALSE
)
rownames(
sample.names
) <
-
sample.names
$
sample
row.class
<
-
factor(
sample.names
[rownames(
orig.bin.mat
),
"class"
])
# the class frequencies do not vary, so calculate them here:
2
freq
= table(
row.class
)
n
= min(ceiling(
freq
/
2
))
n
= max(
n
,
4
)
n
= rep(
n
, length(
freq
))
# setup a r
andomForest function for your observed and null models
rfFunc
<
-
function
(
mat
,
y
,
n
) {
randomForest(
mat
,
y
,
proximity
=
TRUE
,
importance
=
TRUE
,
ntree
=
10000
,
sampsize
=
n
,
replace
=
FALSE
)
}
# run your observed randomForest
obs.rf
<
-
rfFunc(
orig.bin.mat
,
row.class
,
n
)
obs.oob
<
-
obs.rf
$
err.rate
[nrow(
obs.rf
$
err.rate
),
1
]
# use lapply to collect random forest models for your null
distribution:
null.rf
<
-
lapply(
1
:
10
,
function
(
i
) {
cat(
i
,
"
\
n"
)
rfFunc(sumBins(
orig.mat
, sample(
bin.id
)),
row.class
,
n
)
})
# get whatever you want from models by looping through null.rf
null.oob.dist
<
-
sapply(
null.rf
,
function
(
x
)
x
$
err.rate
[nrow(
x
$
err.rate
),
1
])
hist(
null.oob.dist
)
abline(
v
=
obs.oob
,
lty
=
"dashed"
,
col
=
"red"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment