-
Notifications
You must be signed in to change notification settings - Fork 17
/
0-expression_data_overlap_and_split.R
206 lines (167 loc) · 8.19 KB
/
0-expression_data_overlap_and_split.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# J. Taroni Jun 2016
# The purpose of this script is to read in TGCA array and sequencing data,
# to preprocess leaving only overlapping genes and samples with complete
# category information, and to split the data into training and testing sets
# It should be run from the command line through the run_experiments.R script
option_list <- list(
optparse::make_option("--cancer_type",
default = NA_character_,
help = "Cancer type"),
optparse::make_option("--predictor",
default = NA_character_,
help = "Predictor used"),
optparse::make_option("--seed1",
default = NA_integer_,
help = "Random seed"),
optparse::make_option("--null_model",
action = "store_true",
default = FALSE,
help = "Permute dependent variable (within subtype if predictor is a gene)")
)
opt <- optparse::parse_args(optparse::OptionParser(option_list=option_list))
source(here::here("util/option_functions.R"))
check_options(opt)
# load libraries
suppressMessages(source(here::here("load_packages.R")))
# set options
cancer_type <- opt$cancer_type
predictor <- opt$predictor
null_model <- opt$null_model
file_identifier <- ifelse(null_model,
str_c(cancer_type, predictor, "null", sep = "_"),
str_c(cancer_type, predictor, sep = "_"))
# set seed
initial.seed <- as.integer(opt$seed1)
set.seed(initial.seed)
# set seed for spliting into train/test here, before null_model scramble
split.seed <- sample(1:10000, 1)
# define directories
data.dir <- here::here("data")
plot.dir <- here::here("plots")
plot.data.dir <- file.path(plot.dir, "data")
res.dir <- here::here("results")
# name input files
seq.exprs.filename <- paste0(cancer_type, "RNASeq.pcl")
array.exprs.filename <- paste0(cancer_type, "array.pcl")
clin.filename <- paste0("combined_clinical_data.", cancer_type, ".tsv")
# name output files
category.distribtion.plot.data <- paste0(file_identifier,
"_dist_split_stacked_bar_",
initial.seed, ".tsv")
train.test.labels <- paste0(file_identifier,
"_matchedSamples_training_testing_split_labels_",
initial.seed, ".tsv")
#### read in expression and clinical data --------------------------------------
# read in expression data as data.frame
seq.data <- fread(file.path(data.dir, seq.exprs.filename),
data.table = FALSE)
array.data <- fread(file.path(data.dir, array.exprs.filename),
data.table = FALSE)
clinical <- fread(file.path(data.dir, clin.filename),
data.table = FALSE)
# filter clinical data to keep tumor samples with complete data
# if the predictor is subtype, we only select subtype (twice, but once)
# if the predictor is a gene, we select subtype and the gene
# this ensures downstream mutation predictions will have subtype available as covariate
clinical <- clinical %>%
mutate(category = !!sym(predictor)) %>%
select(Sample, Type, "subtype", "category") %>%
filter(Type == "tumor") %>%
tidyr::drop_na()
# change first column name to "gene"
colnames(array.data)[1] <- colnames(seq.data)[1] <- "gene"
# remove tumor-adjacent samples from the array data set
array.tumor.smpls <- clinical$Sample
array.tumor.smpls <- substr(array.tumor.smpls, 1, 15)
array.category <- clinical$category
# filter array data only to include tumor samples
array.data <- array.data[, c(1, which(colnames(array.data) %in%
array.tumor.smpls))]
# what are the overlapping sample names -- "matched" samples?
# includes "gene" column
sample.overlap <- intersect(colnames(array.data), colnames(seq.data))
# what are the overlapping genes between the two platforms?
gene.overlap <- intersect(array.data$gene, seq.data$gene)
# filter the expression data for matched samples and overlapping genes
array.matched <- array.data[which(array.data$gene %in% gene.overlap),
sample.overlap]
seq.matched <- seq.data[which(seq.data$gene %in% gene.overlap),
sample.overlap]
# reorder genes on both platforms
array.matched <- array.matched[order(array.matched$gene), ]
seq.matched <- seq.matched[order(seq.matched$gene), ]
# reorder samples on both platforms
array.matched <- array.matched[, c(1, (order(colnames(array.matched)[-1]) + 1))]
seq.matched <- seq.matched[, c(1, (order(colnames(seq.matched)[-1]) + 1))]
# check reording sample names worked as expected
if (any(colnames(array.matched) != colnames(seq.matched))) {
stop("Column name reordering did not work as expected in 0-expression_data_overlap_and_split.R")
}
# keep category labels for samples with expression data
array.category <- as.factor(array.category[which(array.tumor.smpls %in%
colnames(array.matched))])
array.tumor.smpls <- array.tumor.smpls[which(array.tumor.smpls %in%
colnames(array.matched))]
# remove "unmatched" / "raw" expression data
rm(array.data, seq.data)
# write matched only samples to pcl files
array.output.nm <- sub(".pcl", "_matchedOnly_ordered.pcl", array.exprs.filename)
array.output.nm <- file.path(data.dir, array.output.nm)
write.table(array.matched, file = array.output.nm,
row.names = FALSE, quote = FALSE, sep = "\t")
seq.output.nm <- sub(".pcl", "_matchedOnly_ordered.pcl", seq.exprs.filename)
seq.output.nm <- file.path(data.dir, seq.output.nm)
write.table(seq.matched, file = seq.output.nm,
row.names = FALSE, quote = FALSE, sep = "\t")
#### split data into balanced training and testing sets ------------------------
# order array category to match the expression data order
array.category <- array.category[order(array.tumor.smpls)]
message(paste("\nRandom seed for splitting into testing and training:",
split.seed), appendLF = TRUE)
set.seed(split.seed)
train.index <- unlist(createDataPartition(array.category, times = 1, p = (2/3)))
#### write training/test labels to file ----------------------------------------
lbl <- rep("test", length(array.tumor.smpls))
lbl[train.index] <- "train"
lbl.df <- tibble(sample = colnames(array.matched)[2:ncol(array.matched)],
split = lbl,
category = as.character(array.category))
# add back subtype
lbl.df <- lbl.df %>%
left_join(clinical %>%
select(Sample, subtype),
by = c("sample" = "Sample"))
#### permute category labels for null model ------------------------------------
# this comes after createDataPartition() to ensure same samples go to train/test
# grouping by split ensure labels remain balanced within train and test
# if null_model is specified and predicting subtype, permute subtype labels
# if null_model is specified and predicting mutation status,
# permute mutation labels WITHIN subtype
if (null_model) {
if (predictor == "subtype") { # here, subtype = category
lbl.df <- lbl.df %>%
group_by(split) %>%
mutate(category = case_when(split == "train" ~ sample(category),
split == "test" ~ category)) %>%
ungroup()
} else { # if predictor not subtype, then must be mutation
lbl.df <- lbl.df %>% # subtype = subtype, category = TP53 or PIK3CA 0/1
group_by(split, subtype) %>% # sample within subtype
mutate(category = case_when(split == "train" ~ sample(category),
split == "test" ~ category)) %>%
ungroup()
}
}
write.table(lbl.df,
file = file.path(res.dir, train.test.labels),
quote = FALSE, sep = "\t", row.names = FALSE)
#### save plot data frame ------------------------------------------------------
plot.df <- lbl.df %>%
mutate(split = case_when(split == "train" ~ "Train (2/3)",
split == "test" ~ "Test (1/3)")) %>%
bind_rows(lbl.df %>% mutate(split = "Whole")) %>%
mutate(initial_seed = initial.seed)
write.table(plot.df,
file = file.path(plot.data.dir,
category.distribtion.plot.data),
quote = FALSE, sep = "\t", row.names = FALSE)