-
Notifications
You must be signed in to change notification settings - Fork 17
/
combine_clinical_data.R
67 lines (57 loc) · 2.96 KB
/
combine_clinical_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# Script combines clinical data from all cancer types into one data frame
# Clinical data includes: subtype and TP53/PIK3CA mutation status
# Steven Foltz August 2021
# Define the command line options accepted by this script.
# The four string options default to NA_character_ when they are not supplied;
# --overwrite is a flag that stays FALSE unless it is given.
option_list <- list(
  optparse::make_option(
    "--cancer_type",
    default = NA_character_,
    help = "Cancer type"
  ),
  optparse::make_option(
    "--clinical_input",
    default = NA_character_,
    help = "Clinical information input file path (.tsv)"
  ),
  optparse::make_option(
    "--mutation_input",
    default = NA_character_,
    help = "Mutation input file path (.tsv)"
  ),
  optparse::make_option(
    "--combined_output",
    default = NA_character_,
    help = "Combined subtype and mutation output file path (.tsv)"
  ),
  optparse::make_option(
    "--overwrite",
    action = "store_true",
    default = FALSE,
    help = "Overwrite existing output files [default: %default]"
  )
)

# Build the parser first, then parse the arguments passed to the script
option_parser <- optparse::OptionParser(option_list = option_list)
opt <- optparse::parse_args(option_parser)

# check_options() is defined in util/option_functions.R, which is located with
# here::here(); presumably it validates the parsed options -- see that file
source(here::here("util/option_functions.R"))
check_options(opt)
# attach the tidyverse, silencing the messages it prints while loading
suppressMessages(library(tidyverse))

# copy the parsed command line options into descriptively named variables
# ([[ ]] extracts by exact name, with no partial matching)
# NOTE(review): cancer_type does not appear to be used later in this script
combined_output_filepath <- opt[["combined_output"]]
mutation_input_filepath <- opt[["mutation_input"]]
clinical_input_filepath <- opt[["clinical_input"]]
cancer_type <- opt[["cancer_type"]]
################################################################################
# Read in clinical and mutation data
################################################################################
# Clinical table: parse every column as character, then keep only the first
# 15 characters of Sample (removes the extra parts of longer TCGA IDs)
clinical_df <- read_tsv(
  clinical_input_filepath,
  col_types = cols(.default = col_character())
)
clinical_df <- mutate(
  clinical_df,
  Sample = substr(Sample, start = 1, stop = 15)
)

# Mutation table: same treatment, with the ID stored in the tcga_id column
mutation_df <- read_tsv(
  mutation_input_filepath,
  col_types = cols(.default = col_character())
)
mutation_df <- mutate(
  mutation_df,
  tcga_id = substr(tcga_id, start = 1, stop = 15)
)
################################################################################
# Combine clinical and mutation data
################################################################################
# Keep every row of clinical_df and attach the matching mutation calls
# (left_join(): samples with no entry in mutation_df get NA in the mutation
# columns). The join starts from clinical_df because later scripts expect the
# ID column to be named Sample.
combined_df <- left_join(clinical_df,
                         mutation_df,
                         by = c("Sample" = "tcga_id"))

# A left join repeats a clinical row once per matching mutation row, so
# tcga_id values that collide after truncation to 15 characters would silently
# add rows. Warn (the output itself is unchanged) so this does not go unnoticed.
if (nrow(combined_df) != nrow(clinical_df)) {
  warning("Joining mutation data changed the number of rows from ",
          nrow(clinical_df), " to ", nrow(combined_df),
          "; check for duplicated tcga_id values",
          call. = FALSE)
}

# Recode the mutation calls into descriptive labels. Both input tables are read
# with every column as character, so compare against the strings "0" and "1"
# directly rather than relying on R coercing the numbers 0/1 to character.
# Any other value (including NA for samples without mutation data) becomes NA.
combined_df <- combined_df %>%
  mutate(PIK3CA = case_when(PIK3CA == "0" ~ "No_PIK3CA_mutation",
                            PIK3CA == "1" ~ "PIK3CA_mutation"),
         TP53 = case_when(TP53 == "0" ~ "No_TP53_mutation",
                          TP53 == "1" ~ "TP53_mutation"))
################################################################################
# Save output file
################################################################################
# write the combined table to the requested output path as tab-separated text
write_tsv(combined_df, combined_output_filepath)