-
Notifications
You must be signed in to change notification settings - Fork 1
/
RFE_randomForest_caret.R
365 lines (257 loc) · 11.7 KB
/
RFE_randomForest_caret.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
# 2/4/2020
# extract covariate data at observation points
#
# covariate data reduction using recursive feature elimination
#
# random forest model of basic ecosites implemented using the caret package
#
#
# Load and install packages
required.packages <- c("caret", "rgdal", "raster", "doParallel", "psych", "maptools", "dplyr", "sp", "snow", "snowfall", "plyr", "randomForest", "fmsb")
new.packages <- required.packages[!(required.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
lapply(required.packages, require, character.only=T)
rm(required.packages, new.packages)
# set directory that contains the covariates and point data
# *** pay attention to the / below
setwd("C:/cov")
getwd()
# read in raster layers to create raster stack
r.list=list.files(getwd(), pattern="tif$", full.names = FALSE)
r.list
#create raster stack of covariates
r.stack <- stack(r.list)
# check the names of the raster stack
names(r.stack)
## load in shapefile with points, if points are in a different folder change the dsn="." to the directory that it is in
shp.pts <-readOGR(dsn=".", layer="field_data_0206")
# the points and the raster stack need to be in the same projection
r.stack@crs # projection information for raster stack
shp.pts@proj4string # proj info for shapefile
all.equal(r.stack@crs, shp.pts@proj4string) #True - same projection
# if you need to convert the projection of the shape file, it would be done as follows:
#
# shp.pts <- spTransform(shp.pts, CRSobj = r.stack@crs)
# r brings in a points shp file in as a SpatialPointsDataFrame (SPDF) object
# check the names of your SPDF
names(shp.pts)
# we only need the column that we are modeling remove all other columns
# remove columns 1 through 12 from shp.pts and rewrite it to shp.pts
shp.pts <- shp.pts[-c(1:12,14:18)]
# check the names of shp.pts to see if it worked you should only have the column with your modeling class
names(shp.pts)
## Plot to ensure alignment bw points and rasters
plot(r.stack$rockdry)
plot(shp.pts, add=TRUE)
# this step gets the names of the rasters for use later
# convert raster stack to list of single raster layers
r.stack.list <- unstack(r.stack)
names(r.stack.list) <- names(r.stack)
# this following lines are the extract function this allows you to
# extract the covariate values at each data point collected
## Parallelized extract: (larger datasets)
sfInit(parallel=TRUE, cpus=parallel:::detectCores()-1)
sfLibrary(raster)
sfLibrary(rgdal)
# run parallelized 'extract'
e.df <- sfSapply(r.stack.list, extract, y=shp.pts)
sfStop()
# clean memory
gc()
# now we need to assign the names of the covariates to the extracted values
DF <- as.data.frame(e.df)
names(DF) = tools::file_path_sans_ext(basename(names(r.stack.list)))
names(DF)
# head() looks at the top five rows of a data frame
head(DF)
# create ID field in both dataframe and shp
DF$ID <- seq.int(nrow(DF))
shp.pts$ID <- seq.int(nrow(shp.pts))
# create the training points by merging the extracted covariate values with the shape file
train.pts = merge(shp.pts, DF, by="ID")
#check the names to ensure merge
names(train.pts)
## Save points as a RDS file - this is a R file that contains an R object - save this for use later
#saveRDS(train.pts, file = "points.rds")
# clean junk out of memory
gc()
###################################################################
# this section gets the data ready for covariate reduction, where we select the covariates that are most usefule for modeling
# read in RDS file with points that include the extracted covariate values
#comp <- as.data.frame(readRDS("points.rds"))
comp <- as.data.frame(train.pts)
names(comp)
# remove the ID column and change the first column name to class
comp <- comp[,-c(1,107,108)] # removes the ID column
names(comp) # check the column names to make sure ID is removed
names(comp)[c(1)] <- c('Class') # change name of first col to class
names(comp) # check names of comp
# check the levels (classes) of comp class
levels(comp$Class)
# remove NA values from class
#comp <- comp[complete.cases(comp), ]
##################################################
# Recursive Feature Selection (RFE)
# this section is covariate reduction section
# This process is setup to run as a parallel
# in the make cluster function.
# Next, change the subsets
# to match the number of covariates that you have.
# check the number of covariates
length(comp) # number of covariates plus the class column
subsets <- c(1:(length(comp)-1))
# set seeds to get reporducable results when running the process in parallel
set.seed(238)
seeds <- vector(mode = "list", length=105)
for(i in 1:104) seeds[[i]] <- sample.int(1000, length(subsets) + 1)
seeds[[105]] <- sample.int(1000, 1)
# set up the rfe control
ctrl.RFE <- rfeControl(functions = rfFuncs,
method = "repeatedcv",
number = 15,
repeats = 5,
seeds = seeds,
verbose = FALSE)
## highlight and run everything from c1 to stopCluster(c1) to run RFE
c1 <- makeCluster(detectCores()-1)
registerDoParallel(c1)
set.seed(9)
rf.RFE <- rfe(x = comp[,-1],
y = comp$Class,
sizes = subsets,
rfeControl = ctrl.RFE,
allowParallel = TRUE
)
stopCluster(c1)
gc()
# 4.3 Look at the results
rf.RFE
rf.RFE$fit
rf.RFE$results
plot(rf.RFE) # default plot is for Accuracy, but it can also be changed to Kappa
plot(rf.RFE, metric="Kappa", main='RFE Kappa')
plot(rf.RFE, metric="Accuracy", main='RFE Accuracy')
# see list of predictors
predictors(rf.RFE)
# the rfe function retuns the covariates with the highest accuracy for the rf model
# view the highest accuracy noted by the *
rf.RFE
# take the accuracy and subtract the accuracySD. look in the results of rf.RFE and find the accuracy that is > or = to this value. this is the number of covariates to use below
predictors(rf.RFE) # top number of covariates
# look at the top number of covariates that are equal to greater than the accuracy minus the accuracySD
predictors(rf.RFE)[1:5]
# assign this to a variable
a <- predictors(rf.RFE)[1:5]
# subset the covariate stack and the data frame by the selected covariates the variable a is your selected covariates
# subsed the raser stack to the selected covariates
r.stack.model <- subset(r.stack, a)
# subset the data frame points with the number of covariates selected
comp.sub <- (comp[,c("Class", a)])
names(comp.sub)
## run the random forest model using the caret pkg
#subsets max number should be the same or equal to the number of covariates being used
# determine number of covariates
length(a)
subsets <- length(a)
# set seeds to get reproducable results when running the process in parallel
set.seed(12)
seeds <- vector(mode = "list", length=76)
for(i in 1:75) seeds[[i]] <- sample.int(1000, length(1:subsets) + 1)
seeds[[76]] <- sample.int(1000, 1)
# set up the train control
fitControl <- trainControl(method = "repeatedcv",
number = 15,
repeats = 5,
p = 0.7, #30% used for test set, 70% used for training set
selectionFunction = 'best',
classProbs = T,
savePredictions = T,
returnResamp = 'final',
search = "random",
seeds = seeds)
#make.names(RFE.train.all$Class)
# 5.1 Random Forest - Parallel process, highlight and run from c1 to stopCluster(c1)
c1 <- makeCluster(detectCores()-1)
registerDoParallel(c1)
set.seed(48)
rfFit = train(Class ~ ., data = comp.sub,
"rf",
trControl = fitControl,
ntree = 500, #number of trees default is 500, which seems to work best anyway.
tuneLength=10,
metric = 'Kappa',
na.action=na.pass,
keep.forest=TRUE, # added this line and the next for partial dependance plots
importance=TRUE)
stopCluster(c1)
gc()
# Inspect rfFit
rfFit$finalModel
#look at overall accuracy, kappa, and SD associated with each mtry value
print(rfFit)
rfFit$results
#look at accuracy associated with each CV fold
str(rfFit)
rfFit$resample
# Convert caret wrapper into randomforest model object for raster prediction
rf.model <- rfFit$finalModel
varImpPlot(rf.model)
## below are a couple of different methods for looking and saving the confusion matrix
# set working directory to save file outputs
setwd("D:/dsm/FL_field_week/data/modeling/results")
## confusion matrix method one
confmatrix <- rf.model$confusion[1:length(rf.model$classes),1:length(rf.model$classes)]#Must match number of classes
OOBkappa <- Kappa.test(confmatrix, conf.level = 0.95)
OOBkappa # Summary of Kappa result
write.table(confmatrix, file = paste("conf_matr_oob.txt",sep="_"), sep = "\t", row.names = TRUE) ## need to add one space to first row in excel
# confusion matrix method two
confusion.mat <- as.data.frame(rfFit$finalModel$confusion)
confusion.mat
# save confusion matrix as tab delimited txt file to import into excel
#set working directory on where to save raster layer
write.table(confusion.mat, "confusion_matrix.txt", sep = "\t")
####################################################
## Parallelized predict
#rasterOptions(maxmemory = 5e+08,chunksize = 3e+06)
# prediction of classes
library(randomForest)
beginCluster()
pred <- clusterR(r.stack.model, predict, args=list(model=rf.model),progress="text")
endCluster()
gc()
# predict probabilities
beginCluster()
## predprob makes a huge temp file on c drive, make sure there's room...
# Classwise probabilities for every pixel
predprob <- clusterR(r.stack.model, predict, args=list(model=rf.model, type="prob", index = 1:length(rf.model$classes)),progress="text")
## Summarize probability stack
maxfn <- function(probstk){ind <- max(probstk)
return(ind)
} # Function to find max probability
predprob_stk <- stack(predprob)
## Render max probability layer
probmax <- clusterR(predprob_stk, calc, args=list(fun=maxfn),progress='text')
names(probmax) <-"probmax"
names(probmax)
intfn <- function(pmax){ind <- pmax*100
return(ind)
}
## Render integer version of max probability raster for smaller file size
probmaxint <- clusterR(probmax, calc, args=list(fun=intfn),progress='text') # not produced yet for 2018 field week
## Render integer version of probability raster stack for smaller file size
predprobint <- clusterR(predprob, calc, args=list(fun=intfn),progress='text') # not produced yet for 2018 field week
endCluster()
# save rasters
# class prediction raster
writeRaster(pred, overwrite = TRUE, filename = "comp_class.tif", options=c("COMPRESS=DEFLATE", "TFW=YES"),datatype='INT1U', progress="text")
# probability rasters
writeRaster(predprobint, overwrite=TRUE,filename="probmatrix_sf.tif", options=c("COMPRESS=DEFLATE", "TFW=YES"),datatype='INT1U', progress="text")
writeRaster(probmaxint, overwrite=TRUE,filename="probmax.tif", options=c("COMPRESS=DEFLATE", "TFW=YES"),datatype='INT1U', progress="text")
plot(pred)
# create the raster attribute table
# Create lookup table to match with values in raster
lookup_tab <- as.data.frame(rf.model$classes)
lookup_tab$value <- as.numeric(lookup_tab$`rf.model$classes`)
lookup_tab$predicted <- as.character(lookup_tab$`rf.model$classes`)
lookup_tab <- lookup_tab[c("value","predicted")]
write.table(lookup_tab, file = "PredRAT.txt", sep = "\t", row.names = FALSE)