# ncaa_predict.R — NCAA tournament prediction via stacked caret ensembles.
# (GitHub page chrome and the line-number gutter from the original web
# scrape have been removed so the file parses as R.)
# Modeling / ensembling
library(caret)
library(caretEnsemble)
library(DAMisc)
library(splines)
library(psych)
# Data import / network utilities
library(foreign)
library(sna)
library(network)
# Parallel backend
library(doSNOW)
library(parallel)
# Plotting and data manipulation used later in this script (qplot, setnames)
library(ggplot2)
library(data.table)
# install with install_github('zachmayer/kaggleNCAA')
library(kaggleNCAA)  # library() errors on failure; require() only warns
# to get a phone notification when you run this via boxcar, uncomment and run following::
# require(devtools)
# install_github('trcook/tmisc',subdir='tmisc')
# .boxcar_token<-c("TOKEN_HERE")
# library(tmisc)
# N clusters, set to 0 to turn off parallel processing
ncluster<-4
### Set your directory here:
# NOTE(review): hard-coded absolute path; every read.csv/write.csv below is
# relative to this directory, so it must point at the project data folder.
setwd("C:/Users/Thomas/Desktop/NCAA/")
######################### START TRAINING ####
# Stage-1 training data (engineered features plus the outcome column `win`).
ivalidate <- read.csv("./ivalidate_pt1.csv")

start <- Sys.time()

# Spin up a SNOW cluster so caret can train the component models in parallel.
if (ncluster > 0) {
  cl <- makeCluster(ncluster)
  registerDoSNOW(cl)
}

# Repeated cross-validation, 5 repeats; keep fold-level predictions and class
# probabilities so the component models can be stacked/ensembled below.
my_control <- trainControl(
  method = 'repeatedcv',
  repeats = 5,
  savePredictions = TRUE,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

# Train the component models on the feature columns selected by the regex.
models <- caretList(
  win ~ ., data = ivalidate[, grepl("win|survscores|powscore|rank|winper|rpi|sos|ncf|orank", names(ivalidate))],
  trControl = my_control,
  methodList = c('bagFDA', 'nnet', 'ada', 'bayesglm', 'svmPoly', 'rf', 'knn', 'svmLinear', 'gbm')
)

# Stack the component models with a GLM meta-learner; also build a greedy
# weighted ensemble for comparison.
stack <- caretStack(models, method = 'glm')
greedy <- caretEnsemble(models, iter = 1000L)

if (ncluster > 0) {
  stopCluster(cl)
}

end <- Sys.time()
# BUG FIX: elapsed time is end - start (the original reported start - end,
# a negative duration). Also, only notify when boxcar support is actually
# configured — the tmisc/boxcar setup above is commented out by default, so
# the unconditional call used to error out here.
if (exists("boxcar_notify") && exists(".boxcar_token")) {
  boxcar_notify(token = .boxcar_token,
                body = paste("time taken:", format(end - start)),
                title = "Training Done")
}
####################################### END TRAINING ####
### Validation data
ivalidate <- read.csv("./ivalidate_pt2.csv")
################ START VALIDATION
# Score the stacked model on held-out data and compute accuracy and log loss
# for tournament games only (daynum > 135).
#
# NOTE(review): column 1 of the probability matrix is taken as P(win) here,
# while the stage-2 prediction section uses column 2 — one of the two is
# inverted; verify against the factor levels of `win`.
preds <- predict(stack, type = "prob", newdata = ivalidate[, grepl("win|survscores|powscore|rank|winper|rpi|sos|ncf|orank", names(ivalidate))])[, 1]
# Compute the tournament-game row index once instead of three times.
tourney <- which(ivalidate$daynum > 135)
df <- data.frame(preds = preds[tourney],
                 realscore = ivalidate$scorediff[tourney],
                 season = ivalidate$season[tourney])
# Visual check: predicted win probability vs. realized score margin.
qplot(preds, realscore, data = df, xlab = "Prediction", ylab = "Real Margin") + geom_smooth(method = "loess")
df$win <- 1 * (df$realscore > 0)   # actual outcome as 0/1
df$pwin <- 1 * (df$preds >= .5)    # hard classification at 0.5
# BUG FIX: log loss is the NEGATIVE mean log-likelihood; the original omitted
# the minus sign and reported a negative number (cf. CappedBinomialDeviance
# below, which correctly negates).
logloss <- -sum((df$win * log(df$preds) + (1 - df$win) * log(1 - df$preds)) * (1 / nrow(df))); logloss
accuracy <- sum(df$win == df$pwin) / nrow(df) # ~65% accuracy
# Capped binomial deviance (Kaggle-style log loss).
#
# Predicted probabilities are clamped to [cap, 1 - cap] before taking logs so
# a single confident-but-wrong prediction (p = 0 or p = 1) cannot yield an
# infinite loss.
#
# @param a Numeric vector of actual outcomes (0/1).
# @param p Numeric vector of predicted probabilities, same length as `a`.
# @param cap Clamping bound; the default 0.01 reproduces the original
#   hard-coded 0.01/0.99 limits, so existing callers are unaffected.
# @return A single non-negative number: the mean capped log loss.
CappedBinomialDeviance <- function(a, p, cap = 0.01) {
  if (length(a) != length(p)) stop("Actual and Predicted need to be equal lengths!")
  # Clamp into [cap, 1 - cap] so log() stays finite.
  p_capped <- pmin(1 - cap, pmax(cap, p))
  -sum(a * log(p_capped) + (1 - a) * log(1 - p_capped)) / length(a)
}
# Report the capped deviance on the validation predictions (lower is better).
CappedBinomialDeviance(df$win, df$preds)
############### END VALIDATION
### get final data
# Training data for the final-round fit below.
ivalidate<-read.csv("./ivalidate_pt3.csv")
## Final round training
#Make the Final Training
######################### START TRAINING 2015 ####
start <- Sys.time()

# Spin up the SNOW cluster again for the final-round fit.
if (ncluster > 0) {
  cl <- makeCluster(ncluster)
  registerDoSNOW(cl)
}

# Same resampling scheme as the stage-1 fit: repeated CV, class probabilities,
# and saved fold predictions so the models can be stacked.
my_control <- trainControl(
  method = 'repeatedcv',
  repeats = 5,
  savePredictions = TRUE,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

# BUG FIX: the feature regex here dropped "orank", unlike both the stage-1
# training and the stage-2 prediction step, which include it. The omission
# looks accidental (cf. the "PICKUPHERE: merge in ordinals" note later), so
# the ordinal-rank columns are restored for consistency.
models <- caretList(
  win ~ ., data = ivalidate[, grepl("win|survscores|powscore|rank|winper|rpi|sos|ncf|orank", names(ivalidate))],
  trControl = my_control,
  methodList = c('bagFDA', 'nnet', 'ada', 'bayesglm', 'svmPoly', 'rf', 'knn', 'svmLinear', 'gbm')
)

# Stack with a GLM meta-learner; also keep the greedy weighted ensemble.
stack <- caretStack(models, method = 'glm')
greedy <- caretEnsemble(models, iter = 1000L)

if (ncluster > 0) {
  stopCluster(cl)
}

end <- Sys.time()
# BUG FIX: report end - start (the original used start - end, a negative
# duration) and only notify when boxcar support was actually configured,
# since the tmisc/boxcar setup is commented out by default.
if (exists("boxcar_notify") && exists(".boxcar_token")) {
  boxcar_notify(token = .boxcar_token,
                body = paste("time taken:", format(end - start)),
                title = "Final Training Done")
}
####################################### END TRAINING 2015 ####
## Get final round data:
# NOTE(review): ivalidate_pt4 is loaded but never referenced below —
# confirm whether it was meant to feed the prediction step.
ivalidate<-read.csv("./ivalidate_pt4.csv")
# df2 holds feature rows for every possible stage-2 match-up; df holds the
# corresponding Kaggle submission ids. They are assumed to be row-aligned —
# TODO confirm: there is no explicit join key between them here.
df2<-read.csv("./df2.csv")
df<-read.csv("./df.csv")
#PICKUPHERE: merge in ordinals
############## START CREATE PREDICTIONS FOR EVERY MATCH-UP FOR STAGE 2
# NOTE(review): column 2 of the probability matrix is taken here, while the
# validation step above used column 1 — one of the two is inverted. The
# `1-preds` on the next line compensates; verify against the factor levels
# of `win` before trusting either direction.
preds <- predict(stack, type="prob", newdata = df2[,grepl("win|survscores|powscore|rank|winper|rpi|sos|ncf|orank", names(df2))])[,2]
finaldf <- data.frame(id=df$id, pred=1-preds)
# Write the stage-2 submission file.
write.csv(finaldf, "./stage2_n2.csv", row.names=F)
# Sanity check: compare these predictions against the public Kaggle benchmark.
d1 <- read.csv("./kaggle_submission_public.csv")
d2 <- read.csv("./stage2_n2.csv"); names(d2)[2] <- "mypred"
dat <- merge(d1, d2, by="id")
# Scatter of our predictions vs. the public benchmark predictions.
qplot(mypred, pred, data=dat)
### From the kaggleNCAA package (install with install_github('zachmayer/kaggleNCAA'))
# Parse the submission file back into a per-match-up prediction table.
dat<-parseBracket("./stage2_n2.csv")
# NOTE(review): this labels the columns with team_2 BEFORE team_1, i.e. it
# swaps the team order relative to parseBracket's output — confirm against
# kaggleNCAA's documented column order; if unintended, the bracket is walked
# with inverted win probabilities. (setnames is from data.table.)
setnames(dat,names(dat),c("season", "team_2", "team_1", "pred"))
# Simulate the 2015 tournament under the predictions and print the bracket.
bracket <- walkTourney(dat, year=2015)
printableBracket(bracket)