Skip to content

Commit

Permalink
Merge pull request #113 from greenelab/envest/41-ensure_numeric_gex
Browse files Browse the repository at this point in the history
Envest/41 ensure numeric gex
  • Loading branch information
envest authored Apr 6, 2022
2 parents 6be34a2 + e65eba7 commit 4852bec
Show file tree
Hide file tree
Showing 15 changed files with 250 additions and 74 deletions.
9 changes: 8 additions & 1 deletion 2A-small_n_differential_expression.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ ncores <- min(parallel::detectCores() - 1,
# set seed
initial.seed <- opt$seed
set.seed(initial.seed)

# set additional random seeds for reproducibility within foreach dopar loops
random_seeds <- sample(1:10000, size = 9)

message(paste("\nInitial seed set to:", initial.seed))

# define directories
Expand Down Expand Up @@ -92,7 +96,7 @@ seq.dt <- data.table(seq.data[,
samples.to.keep))])
sample.df <- sample.df[which(sample.df$sample %in% samples.to.keep), ]

smaller_subtype_size <- min(table(droplevels(sample.df$category)))
smaller_subtype_size <- min(table(as.character(sample.df$category)))

# different sizes of n to test
no.samples <- c(3, 4, 5, 6, 8, 10, 15, 25, 50)
Expand All @@ -112,6 +116,9 @@ doParallel::registerDoParallel(cl)
# at each titration level (0-100% RNA-seq)
stats.df.list[1:9] <- foreach(seq_prop = seq(0.1, .9, 0.1), .packages = c("tidyverse")) %dopar% {

# random_seeds indexed by 1 through 9, corresponding to seq_prop 0.1 through 0.9
set.seed(random_seeds[seq_prop*10])

# we're going to repeat the small n experiment 10 times
stats.df.iter_list <- list() # this is returned to stats.df.list each iteration
for (trial.iter in 1:10) {
Expand Down
4 changes: 2 additions & 2 deletions 3-combine_category_kappa.R
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,8 @@ summary.df <- test.df %>%
dplyr::group_by(Classifier, Normalization, Platform, Perc.Seq) %>%
dplyr::summarise(Median = median(Kappa),
Mean = mean(Kappa),
SD = sd(Kappa)) %>%
dplyr::ungroup()
SD = sd(Kappa),
.groups = "drop")

readr::write_tsv(summary.df,
summary.df.filename) # delta or not delta in file name
11 changes: 5 additions & 6 deletions 6-save_recon_error_kappa_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@ kappa.summary.df <-
dplyr::group_by(Classifier, Normalization, Platform, Perc.seq) %>%
dplyr::summarise(Median = median(Kappa),
Mean = mean(Kappa),
SD = sd(Kappa)) %>%
dplyr::ungroup()
SD = sd(Kappa),
.groups = "drop")
readr::write_tsv(kappa.summary.df,
file.path(rcn.res.dir,
paste0(file_identifier,
Expand Down Expand Up @@ -138,10 +138,9 @@ error.master.df$comp.method <- as.factor(error.master.df$comp.method)

# take the average of each genes error across replicates
error.mean.df <- error.master.df %>%
dplyr::group_by(gene, perc.seq, norm.method, comp.method,
platform) %>%
dplyr::summarise(mean_mase = mean(MASE)) %>%
dplyr::ungroup()
dplyr::group_by(gene, perc.seq, norm.method, comp.method, platform) %>%
dplyr::summarise(mean_mase = mean(MASE),
.groups = "drop")
rm(error.master.df)
colnames(error.mean.df) <- c("Gene", "Perc.seq", "Normalization",
"Method", "Platform", "Mean_Value")
Expand Down
5 changes: 2 additions & 3 deletions 7-extract_plier_pathways.R
Original file line number Diff line number Diff line change
Expand Up @@ -375,9 +375,8 @@ if (length(jaccard_list) > 0) {
"seed_index" = "L1"
)

readr::write_tsv(
x = jaccard_df,
path = plot_data_filename
readr::write_tsv(jaccard_df,
plot_data_filename
)

}
34 changes: 23 additions & 11 deletions check_sums.tsv
Original file line number Diff line number Diff line change
@@ -1,11 +1,23 @@
1a89ea769381e300e5a88ec61713ad9e data/BRCAarray.pcl
8c76a476c5b6f4f8deec017c876db156 data/BRCAClin.tsv
7f00ea6ef1f309773b02e6118046550f data/BRCARNASeq.pcl
d4486dde14da14b4f8887a7415e2866f data/BRCARNASeqClin.tsv
7fafc537807d5b3ddf0bb89665279a9d data/broad.mit.edu_PANCAN_Genome_Wide_SNP_6_whitelisted.seg
639ad8f8386e98dacc22e439188aa8fa data/mc3.v0.2.8.PUBLIC.maf.gz
a4591b2dcee39591f59e5e25a6ce75fa data/TCGA-CDR-SupplementalTableS1.xlsx
02e72c33071307ff6570621480d3c90b data/EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv
76a0454f911aeb17276725abb760ce89 data/GSE83130/GSE83130/GSE83130.tsv
1d8834a51282396e07e3ce9a5417d024 data/gbm_clinical_table_S7.xlsx
e5df57691b44c47b8c916116b5ac7acf data/PanCan-General_Open_GDC-Manifest_2.txt
2f4f2fcd97eff5385c0b1205b719b8dc data/BRCAClin.tsv
7f00ea6ef1f309773b02e6118046550f data/BRCARNASeq.pcl
d4486dde14da14b4f8887a7415e2866f data/BRCARNASeqClin.tsv
1a89ea769381e300e5a88ec61713ad9e data/BRCAarray.pcl
02e72c33071307ff6570621480d3c90b data/EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv
90feb5edb48d7619568d2de21697509e data/GBMClin.tsv
949938abadd336eb9a2f698b3102e1bb data/GBMRNASeq.pcl
3bbdad11c322ebf3b03ada07263c6444 data/GBMarray.pcl
e5df57691b44c47b8c916116b5ac7acf data/PanCan-General_Open_GDC-Manifest_2.txt
a4591b2dcee39591f59e5e25a6ce75fa data/TCGA-CDR-SupplementalTableS1.xlsx
7fafc537807d5b3ddf0bb89665279a9d data/broad.mit.edu_PANCAN_Genome_Wide_SNP_6_whitelisted.seg
d79d2399598ac2e6c11a1b44c5c603df data/combined_clinical_data.BRCA.tsv
90f745d4eac485168cb2b50be29104b1 data/combined_clinical_data.GBM.tsv
1d8834a51282396e07e3ce9a5417d024 data/gbm_clinical_table_S7.xlsx
639ad8f8386e98dacc22e439188aa8fa data/mc3.v0.2.8.PUBLIC.maf.gz
7583a5fb4d23d50b79813b26469f6385 data/mutations.BRCA.tsv
15cae05325c1b0562be8029efba5534a data/mutations.GBM.tsv
5484229fa691a721dd7fd08ade2233e7 data/mutations.maf
e56585bd0c2e59658b1d54fc8b0c9df2 data/mutations.tsv
b62634d9eccbb548499ce384605fe47a data/GSE83130/LICENSE.TXT
9ed2fa92d31d51f17fc048b98158a5e1 data/GSE83130/README.md
76a0454f911aeb17276725abb760ce89 data/GSE83130/GSE83130/GSE83130.tsv
dca310d9643a18d35e694425c56b9d2b data/GSE83130/GSE83130/metadata_GSE83130.tsv
2 changes: 1 addition & 1 deletion combine_clinical_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -60,4 +60,4 @@ combined_df <- clinical_df %>%
################################################################################

write_tsv(combined_df,
path = combined_output_filepath)
combined_output_filepath)
10 changes: 5 additions & 5 deletions download_TCGA_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,6 @@ else
wget -O $data/gbm_clinical_table_S7.xlsx $gbm_clinical_link
fi

# check md5 sums of downloaded files
echo Checking md5 sums of downloaded files ...
md5sum --check check_sums.tsv
echo All files downloaded match expected md5 sums!

# modify BRCA clinical file column PAM50 to be subtype
sed -i 's/PAM50/subtype/' $data/BRCAClin.tsv

Expand Down Expand Up @@ -110,6 +105,11 @@ Rscript combine_clinical_data.R \
--mutation_input $data/mutations.GBM.tsv \
--combined_output $data/combined_clinical_data.GBM.tsv

# check md5 sums of downloaded files
echo Checking md5 sums of downloaded files ...
md5sum --check --quiet check_sums.tsv
echo All data files match expected md5 sums!

# get BRCA array expression data from TCGA Legacy Archive
# data/gdc_legacy_archive_brca_manifest.txt obtained from https://portal.gdc.cancer.gov/legacy-archive
# with search parameters
Expand Down
2 changes: 1 addition & 1 deletion plots/scripts/2A-plot_small_n_differential_expression.R
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ plot_small_n <- function(subtypes){
y = ifelse(using_single_measure,
unique(stats_df$measure),
"Measure of Similarity"),
title = paste(cancer_type, subtypes_nice, "FDR < 10%")) +
title = str_c("Small n Experiment: ", paste(cancer_type, subtypes_nice, "FDR < 10%"))) +
scale_colour_manual(values = cbPalette[c(2, 3)])

if (using_single_measure) {
Expand Down
4 changes: 2 additions & 2 deletions plots/scripts/3-plot_category_kappa.R
Original file line number Diff line number Diff line change
Expand Up @@ -98,5 +98,5 @@ plot_obj <- ggplot(plot_df,

ggsave(output_filename,
plot = plot_obj,
height = 5,
width = 7.5)
height = 4,
width = 7.25)
4 changes: 2 additions & 2 deletions plots/scripts/6-plot_recon_error.R
Original file line number Diff line number Diff line change
Expand Up @@ -87,5 +87,5 @@ plot_obj <- ggplot(plot_df,

ggsave(output_filename,
plot = plot_obj,
height = 5,
width = 7.5)
height = 4,
width = 7.25)
4 changes: 2 additions & 2 deletions plots/scripts/6-plot_recon_kappa.R
Original file line number Diff line number Diff line change
Expand Up @@ -85,5 +85,5 @@ plot_obj <- ggplot(plot_df,

ggsave(output_filename,
plot = plot_obj,
height = 5,
width = 7.5)
height = 4,
width = 7.25)
8 changes: 4 additions & 4 deletions plots/scripts/recon_kappa_difference.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,15 +56,15 @@ with_df <- read_tsv(with_recon_input_filename,

without_summary_df <- without_df %>%
group_by(Perc.Seq, Classifier, Normalization, Platform) %>%
summarize(median_without = median(Kappa)) %>%
ungroup()
summarize(median_without = median(Kappa),
.groups = "drop")

with_summary_df <- with_df %>%
filter(Reconstruction == "PCA",
Measure == "kappa") %>%
group_by(Perc.Seq, Classifier, Normalization, Platform) %>%
summarize(median_with = median(Kappa)) %>%
ungroup()
summarize(median_with = median(Kappa),
.groups = "drop")

# combined data frames and calculate difference in median kappas

Expand Down
15 changes: 9 additions & 6 deletions prepare_GBM_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,8 @@ all_array_tumor_samples <- tibble(accession = names(metadata_json$samples)) %>%
# keep one (first) accession per TCGA ID
array_accession_tcga_id_keep <- all_array_tumor_samples %>%
group_by(tcga_id) %>%
summarize(accession = sort(accession)[1])
summarize(accession = sort(accession)[1]) %>%
ungroup()
accession_colnames_keep <- colnames(gbm_array_expression)[-1][colnames(gbm_array_expression)[-1] %in% array_accession_tcga_id_keep$accession]

# select columns to keep and rename with TCGA IDs
Expand Down Expand Up @@ -122,8 +123,10 @@ gbm_seq_tumor_samples <- tibble(tcga_id_raw = tcga_seq_expression_column_names[-
sample = str_sub(tcga_id_raw, 14, 15)) %>% # and sample is ZZ
filter(sample == "01") %>% # require sample to be primary solid tumor
filter(tcga_id %in% array_accession_tcga_id_keep$tcga_id) %>% # keep array GBMs
group_by(tcga_patient, tcga_id) %>%
summarize(tcga_id_raw = sort(tcga_id_raw)[1]) # keep one raw ID per person
group_by(tcga_patient) %>%
summarize(tcga_id_raw = sort(tcga_id_raw)[1], # keep one raw ID per person
tcga_id = str_sub(tcga_id_raw, 1, 15)) %>%
ungroup()

# now read in GBM subset of entire TCGA seq expression file
# this is faster and uses less memory than reading in entire file and then subsetting
Expand Down Expand Up @@ -211,12 +214,12 @@ missing_clinical <- gbm_subtypes %>%

write_tsv(gbm_array_expression_renamed %>%
select(-all_of(missing_clinical)),
path = gbm_array_output_filepath)
gbm_array_output_filepath)

write_tsv(gbm_seq_expression_renamed %>%
select(-all_of(missing_clinical)),
path = gbm_seq_output_filepath)
gbm_seq_output_filepath)

write_tsv(gbm_subtypes %>%
filter(!is.na(subtype)),
path = clinical_tsv_output_filepath)
clinical_tsv_output_filepath)
10 changes: 7 additions & 3 deletions run_all_analyses_and_plots.sh
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ Rscript plots/scripts/1A-plot_DEGs.R \
--subtype_vs_others Basal \
--proportion_output_directory plots/supplementary \
--overlap_output_directory plots/supplementary \
--overlap_measure Jaccard
--overlap_measure Jaccard,Spearman

# bar plot showing proportion of genes differentially expressed (Her2 vs. LumA)
# line plot showing overlap with silver standard DEGs (Her2 vs. LumA)
Expand All @@ -77,7 +77,7 @@ Rscript plots/scripts/1A-plot_DEGs.R \
--subtype_vs_subtype Her2,LumA \
--proportion_output_directory plots/supplementary \
--overlap_output_directory plots/main \
--overlap_measure Jaccard
--overlap_measure Jaccard,Spearman

# line plot showing overlap with silver standard DEGs (Her2 vs. LumA) across small n values
Rscript plots/scripts/2A-plot_small_n_differential_expression.R \
Expand Down Expand Up @@ -110,6 +110,7 @@ Rscript plots/scripts/0-plot_predictor_category_distributions.R \
Rscript plots/scripts/3-plot_category_kappa.R \
--cancer_type BRCA \
--predictor TP53 \
--null_model \
--output_directory plots/supplementary

# ------------------------------------------------------------------------------
Expand All @@ -136,6 +137,7 @@ Rscript plots/scripts/0-plot_predictor_category_distributions.R \
Rscript plots/scripts/3-plot_category_kappa.R \
--cancer_type BRCA \
--predictor PIK3CA \
--null_model \
--output_directory plots/supplementary

# ------------------------------------------------------------------------------
Expand Down Expand Up @@ -201,7 +203,7 @@ Rscript plots/scripts/1A-plot_DEGs.R \
--subtype_vs_subtype Classical,Mesenchymal \
--proportion_output_directory plots/supplementary \
--overlap_output_directory plots/supplementary \
--overlap_measure Jaccard
--overlap_measure Jaccard,Spearman

# line plot showing overlap with silver standard DEGs (Classical vs. Mesenchymal) across small n values
Rscript plots/scripts/2A-plot_small_n_differential_expression.R \
Expand Down Expand Up @@ -234,6 +236,7 @@ Rscript plots/scripts/0-plot_predictor_category_distributions.R \
Rscript plots/scripts/3-plot_category_kappa.R \
--cancer_type GBM \
--predictor TP53 \
--null_model \
--output_directory plots/main

# ------------------------------------------------------------------------------
Expand All @@ -260,6 +263,7 @@ Rscript plots/scripts/0-plot_predictor_category_distributions.R \
Rscript plots/scripts/3-plot_category_kappa.R \
--cancer_type GBM \
--predictor PIK3CA \
--null_model \
--output_directory plots/supplementary

# ------------------------------------------------------------------------------
Loading

0 comments on commit 4852bec

Please sign in to comment.