-
Notifications
You must be signed in to change notification settings - Fork 0
/
structData.R
140 lines (111 loc) · 4.58 KB
/
structData.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# ----------
# This script takes transition matrices generated with the scripts:
# next-genotype_transitionMatrix-sim.R
# next-genotype_transitionMatrix-cpm.R
# next-genotype_transitionMatrix-timeDiscretizedCPMs.R
# and combines them so that all matrices belonging to a given fitness landscape
# ID are stored together in a single .RData file (one output file per ID).
# ----------
source("oncoFunctions.R")
# Directories to load from / save to.
# askDir() is not defined in this file (expected to come from oncoFunctions.R,
# sourced above); it is called with a default path and a prompt message.
simDirectory <- askDir(defaultDir="./sim",
                       message="Enter directory where .RData files containing transition matrices extracted from OncoSimulR simulations are.")
cpmDirectory <- askDir(defaultDir="./cpm",
                       message="Enter directory where .RData files containing formatted transition matrices extracted from CPM outputs are.")
saveDirectory <- askDir(defaultDir="./data",
                        message="Enter directory to save output to.")
# Create the save directory if it doesn't exist.
# - recursive = TRUE also creates missing parent directories, so a nested path
#   (e.g. "./results/data") no longer fails silently here and then again later
#   when the output files are written.
# - showWarnings = FALSE keeps the call quiet when the directory already
#   exists; FALSE is spelled out because `F` is an ordinary, reassignable
#   variable.
dir.create(saveDirectory, showWarnings = FALSE, recursive = TRUE)
# List all CPM sub-directories (immediate children only) under cpmDirectory.
# Argument names are spelled out in full (`recursive`, not `rec`) so the calls
# do not rely on partial argument matching.
cpm <- list.dirs(cpmDirectory, full.names = FALSE, recursive = FALSE)
# Fail early with a clear message: without any sub-directory there is nothing
# to collect, and the steps below would otherwise break with an obscure error.
if (length(cpm) == 0) {
  stop("no CPM sub-directories found under: ", cpmDirectory)
}
# list all files
cat("Checking provided directories")
cat("\n")
# Simulation files: regular files directly inside simDirectory.
simFiles <- list.files(simDirectory, full.names = TRUE, recursive = FALSE)
simFiles <- simFiles[!file.info(simFiles)$isdir]
# CPM files: one list entry per CPM sub-directory, named after the directory
# and holding the full paths of the regular files found in it.
cpmFiles <- vector(mode = "list")
for (i in seq_along(cpm)) {
  files <- list.files(file.path(cpmDirectory, cpm[i]),
                      full.names = TRUE, recursive = FALSE)
  files <- files[!file.info(files)$isdir]
  cpmFiles[[cpm[i]]] <- files
}
# Flat, unnamed vector with every CPM file path across all sub-directories.
cpmFiles_listed <- unlist(cpmFiles)
names(cpmFiles_listed) <- NULL
# The *FromFile() helpers used below are not defined in this file (expected to
# come from oncoFunctions.R); each one is applied to every file path in turn.

# landscape ID of every simulation file (names dropped)
flIDs <- unname(sapply(simFiles, flIDFromFile))

# distinct size_split values found across all CPM files
size_splits <- unique(unname(sapply(cpmFiles_listed, sizeSplitFromFile)))

# distinct detection regimes found across all CPM files
detects <- unique(unname(sapply(cpmFiles_listed, detectFromFile)))

# lookup table pairing each landscape ID with its landscape type, one row per
# distinct (ID, typeLandscape) combination seen in the CPM files
flTypes <- unique(
  data.frame(ID = sapply(cpmFiles_listed, flIDFromFile),
             typeLandscape = sapply(cpmFiles_listed, flTypeFromFile))
)
rownames(flTypes) <- NULL
# Sanity checks before structuring the data:
#   1. every simulation landscape ID must also appear among the CPM files
#      (a mismatch is only reported, execution continues);
#   2. every landscape ID must have one CPM file per combination of
#      CPM x size_split x detection regime (otherwise execution stops).
simIDs <- sapply(simFiles, flIDFromFile)
# frequency table of landscape IDs over all CPM files (columns: cpmIDs, Freq)
cpmIDs <- as.data.frame(table(cpmIDs = sapply(cpmFiles_listed, flIDFromFile)))
# one flag per simulation ID: is it present among the CPM IDs?
isMatched <- simIDs %in% cpmIDs$cpmIDs
# an empty set of simulation files is reported as a mismatch as well
if (length(isMatched) > 0 && all(isMatched)) {
  cat(" > Landscape IDs matched", "\n", sep = "")
} else {
  cat(" > WARNING: encountered landscape ID mismatch", "\n", sep = "")
}
# each ID is expected to occur exactly
# (#size_splits * #detects * #CPMs) times among the CPM files
if (!all(cpmIDs$Freq == length(size_splits) * length(detects) * length(cpm))) {
  stop(" > WARNING: missing files")
}
cat(" > All CPM files located", "\n", "\n", sep = "")
# Group all matrices for a given landscape ID under a single object and save
# it to <saveDirectory>/<ID>.RData (stored under the object name `data`).
#
# Layout of the saved list:
#   $ID, $typeLandscape, $nGenes
#   $sim   : list(transitionMatrix, timesInPOM) loaded from the simulation file
#   $null  : list(transitionMatrix) = nullMatrix(<sim matrix>)$norm
#   $cpm   : nested list indexed as [[cpm]][[size_split]][[detect]], each leaf
#            being the `transitionMatrix` object stored in that CPM file
#
# Files are selected by comparing the values extracted with the same
# *FromFile() helpers used earlier in the script, rather than by grepl() on
# the whole path: a regex/substring match would pick up several files
# whenever one ID (or size_split / detect label) is contained in another or
# appears elsewhere in the path.
cat("Structuring data")
cat("\n")
pboptions(type = "txt")
x <- pblapply(simIDs,
              function(ID) {
                # Load an .RData file into its own environment so that
                # objects from a previously loaded file can never be picked
                # up by accident when the current file lacks them.
                loadRData <- function(file) {
                  env <- new.env()
                  load(file, envir = env)
                  env
                }
                # Return the one element of `candidates` flagged by `hit`;
                # stop with an explicit message if there are none or several.
                pickOne <- function(candidates, hit, what) {
                  idx <- which(hit)
                  if (length(idx) != 1) {
                    stop("expected exactly one ", what,
                         " for landscape ID ", ID,
                         ", found ", length(idx))
                  }
                  candidates[idx]
                }

                # simIDs was built element-wise from simFiles, so the two
                # vectors are aligned
                simFile <- pickOne(simFiles, simIDs == ID, "simulation file")

                # initialize output object
                out <- vector(mode = "list")
                out[["ID"]] <- as.character(ID)
                out[["typeLandscape"]] <-
                  flTypes$typeLandscape[flTypes$ID == ID]
                out[["nGenes"]] <- numGenesFromFile(simFile)

                # "sim" matrix (get() errors if the object is missing)
                simData <- loadRData(simFile)
                simMatrix <- get("transitionMatrix", envir = simData,
                                 inherits = FALSE)
                out[["sim"]] <- list(transitionMatrix = simMatrix,
                                     timesInPOM = get("timesInPOM",
                                                      envir = simData,
                                                      inherits = FALSE))

                # null model
                out[["null"]] <- list(transitionMatrix =
                                        nullMatrix(simMatrix)$norm)

                # loop through CPMs, size_splits and detection regimes (nest)
                cpm <- names(cpmFiles)
                for (i in seq_along(cpm)) { # cpm index
                  files <- cpmFiles[[cpm[i]]]
                  files <- files[which(sapply(files, flIDFromFile) == ID)]
                  # extracted once per CPM instead of once per (j, k) pair
                  fileSplits <- sapply(files, sizeSplitFromFile)
                  fileDetects <- sapply(files, detectFromFile)
                  for (j in seq_along(size_splits)) { # "size_split" index
                    for (k in seq_along(detects)) { # "detect" index
                      cpmFile <- pickOne(
                        files,
                        fileSplits == size_splits[j] &
                          fileDetects == detects[k],
                        paste0("'", cpm[i], "' file (size_split ",
                               size_splits[j], ", detect ", detects[k], ")")
                      )
                      out[["cpm"]][[cpm[i]]][[size_splits[j]]][[detects[k]]] <-
                        get("transitionMatrix", envir = loadRData(cpmFile),
                            inherits = FALSE)
                    }
                  }
                }

                # save output (object name `data` is what ends up in the file)
                data <- out
                outFile <- file.path(saveDirectory, paste0(ID, ".RData"))
                save(data, file = outFile)
                return(0)
              },
              cl = detectCores())