Merge pull request #113 from greenelab/envest/41-ensure_numeric_gex

Envest/41 ensure numeric gex
greenelab · Apr 6, 2022 · 4852bec · 4852bec
2 parents 6be34a2 + e65eba7
commit 4852bec
Show file tree

Hide file tree

Showing 15 changed files with 250 additions and 74 deletions.
diff --git a/2A-small_n_differential_expression.R b/2A-small_n_differential_expression.R
@@ -42,6 +42,10 @@ ncores <- min(parallel::detectCores() - 1,
 # set seed
 initial.seed <- opt$seed
 set.seed(initial.seed)
+
+# set additional random seeds for reproducibility within foreach dopar loops
+random_seeds <- sample(1:10000, size = 9)
+
 message(paste("\nInitial seed set to:", initial.seed))
 
 # define directories
@@ -92,7 +96,7 @@ seq.dt <- data.table(seq.data[,
                                            samples.to.keep))])
 sample.df <- sample.df[which(sample.df$sample %in% samples.to.keep), ]
 
-smaller_subtype_size <- min(table(droplevels(sample.df$category)))
+smaller_subtype_size <- min(table(as.character(sample.df$category)))
 
 # different sizes of n to test
 no.samples <- c(3, 4, 5, 6, 8, 10, 15, 25, 50)
@@ -112,6 +116,9 @@ doParallel::registerDoParallel(cl)
 # at each titration level (0-100% RNA-seq)
 stats.df.list[1:9] <- foreach(seq_prop = seq(0.1, .9, 0.1), .packages = c("tidyverse")) %dopar% {
 
+  # random_seeds indexed by 1 through 9, corresponding to seq_prop 0.1 through 0.9
+  set.seed(random_seeds[seq_prop*10])
+
   # we're going to repeat the small n experiment 10 times
   stats.df.iter_list <- list() # this is returned to stats.df.list each iteration
   for (trial.iter in 1:10) {

diff --git a/3-combine_category_kappa.R b/3-combine_category_kappa.R
@@ -157,8 +157,8 @@ summary.df <- test.df %>%
   dplyr::group_by(Classifier, Normalization, Platform, Perc.Seq) %>%
   dplyr::summarise(Median = median(Kappa),
                    Mean = mean(Kappa),
-                   SD = sd(Kappa)) %>%
-  dplyr::ungroup()
+                   SD = sd(Kappa),
+                   .groups = "drop")
 
 readr::write_tsv(summary.df,
                  summary.df.filename) # delta or not delta in file name
diff --git a/6-save_recon_error_kappa_data.R b/6-save_recon_error_kappa_data.R
@@ -100,8 +100,8 @@ kappa.summary.df <-
   dplyr::group_by(Classifier, Normalization, Platform, Perc.seq) %>%
   dplyr::summarise(Median = median(Kappa),
                    Mean = mean(Kappa),
-                   SD = sd(Kappa)) %>%
-  dplyr::ungroup()
+                   SD = sd(Kappa),
+                   .groups = "drop")
 readr::write_tsv(kappa.summary.df,
                  file.path(rcn.res.dir,
                            paste0(file_identifier,
@@ -138,10 +138,9 @@ error.master.df$comp.method <- as.factor(error.master.df$comp.method)
 
 # take the average of each genes error across replicates
 error.mean.df <- error.master.df %>%
-  dplyr::group_by(gene, perc.seq, norm.method, comp.method,
-                  platform) %>%
-  dplyr::summarise(mean_mase = mean(MASE)) %>%
-  dplyr::ungroup()
+  dplyr::group_by(gene, perc.seq, norm.method, comp.method, platform) %>%
+  dplyr::summarise(mean_mase = mean(MASE),
+                   .groups = "drop")
 rm(error.master.df)
 colnames(error.mean.df) <- c("Gene", "Perc.seq", "Normalization",
                              "Method", "Platform", "Mean_Value")

diff --git a/7-extract_plier_pathways.R b/7-extract_plier_pathways.R
@@ -375,9 +375,8 @@ if (length(jaccard_list) > 0) {
       "seed_index" = "L1"
     )
 
-  readr::write_tsv(
-    x = jaccard_df,
-    path = plot_data_filename
+  readr::write_tsv(jaccard_df,
+                   plot_data_filename
   )
 
 }
diff --git a/check_sums.tsv b/check_sums.tsv
@@ -1,11 +1,23 @@
-1a89ea769381e300e5a88ec61713ad9e	data/BRCAarray.pcl
-8c76a476c5b6f4f8deec017c876db156	data/BRCAClin.tsv
-7f00ea6ef1f309773b02e6118046550f	data/BRCARNASeq.pcl
-d4486dde14da14b4f8887a7415e2866f	data/BRCARNASeqClin.tsv
-7fafc537807d5b3ddf0bb89665279a9d	data/broad.mit.edu_PANCAN_Genome_Wide_SNP_6_whitelisted.seg
-639ad8f8386e98dacc22e439188aa8fa	data/mc3.v0.2.8.PUBLIC.maf.gz
-a4591b2dcee39591f59e5e25a6ce75fa	data/TCGA-CDR-SupplementalTableS1.xlsx
-02e72c33071307ff6570621480d3c90b	data/EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv
-76a0454f911aeb17276725abb760ce89	data/GSE83130/GSE83130/GSE83130.tsv
-1d8834a51282396e07e3ce9a5417d024	data/gbm_clinical_table_S7.xlsx
-e5df57691b44c47b8c916116b5ac7acf	data/PanCan-General_Open_GDC-Manifest_2.txt
+2f4f2fcd97eff5385c0b1205b719b8dc  data/BRCAClin.tsv
+7f00ea6ef1f309773b02e6118046550f  data/BRCARNASeq.pcl
+d4486dde14da14b4f8887a7415e2866f  data/BRCARNASeqClin.tsv
+1a89ea769381e300e5a88ec61713ad9e  data/BRCAarray.pcl
+02e72c33071307ff6570621480d3c90b  data/EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv
+90feb5edb48d7619568d2de21697509e  data/GBMClin.tsv
+949938abadd336eb9a2f698b3102e1bb  data/GBMRNASeq.pcl
+3bbdad11c322ebf3b03ada07263c6444  data/GBMarray.pcl
+e5df57691b44c47b8c916116b5ac7acf  data/PanCan-General_Open_GDC-Manifest_2.txt
+a4591b2dcee39591f59e5e25a6ce75fa  data/TCGA-CDR-SupplementalTableS1.xlsx
+7fafc537807d5b3ddf0bb89665279a9d  data/broad.mit.edu_PANCAN_Genome_Wide_SNP_6_whitelisted.seg
+d79d2399598ac2e6c11a1b44c5c603df  data/combined_clinical_data.BRCA.tsv
+90f745d4eac485168cb2b50be29104b1  data/combined_clinical_data.GBM.tsv
+1d8834a51282396e07e3ce9a5417d024  data/gbm_clinical_table_S7.xlsx
+639ad8f8386e98dacc22e439188aa8fa  data/mc3.v0.2.8.PUBLIC.maf.gz
+7583a5fb4d23d50b79813b26469f6385  data/mutations.BRCA.tsv
+15cae05325c1b0562be8029efba5534a  data/mutations.GBM.tsv
+5484229fa691a721dd7fd08ade2233e7  data/mutations.maf
+e56585bd0c2e59658b1d54fc8b0c9df2  data/mutations.tsv
+b62634d9eccbb548499ce384605fe47a  data/GSE83130/LICENSE.TXT
+9ed2fa92d31d51f17fc048b98158a5e1  data/GSE83130/README.md
+76a0454f911aeb17276725abb760ce89  data/GSE83130/GSE83130/GSE83130.tsv
+dca310d9643a18d35e694425c56b9d2b  data/GSE83130/GSE83130/metadata_GSE83130.tsv
diff --git a/combine_clinical_data.R b/combine_clinical_data.R
@@ -60,4 +60,4 @@ combined_df <- clinical_df %>%
 ################################################################################
 
 write_tsv(combined_df,
-          path = combined_output_filepath)
+          combined_output_filepath)
diff --git a/download_TCGA_data.sh b/download_TCGA_data.sh
@@ -72,11 +72,6 @@ else
   wget -O $data/gbm_clinical_table_S7.xlsx $gbm_clinical_link
 fi
 
-# check md5 sums of downloaded files
-echo Checking md5 sums of downloaded files ...
-md5sum --check check_sums.tsv
-echo All files downloaded match expected md5 sums!
-
 # modify BRCA clinical file column PAM50 to be subtype
 sed -i 's/PAM50/subtype/' $data/BRCAClin.tsv
 
@@ -110,6 +105,11 @@ Rscript combine_clinical_data.R \
   --mutation_input $data/mutations.GBM.tsv \
   --combined_output $data/combined_clinical_data.GBM.tsv
 
+# check md5 sums of downloaded files
+echo Checking md5 sums of downloaded files ...
+md5sum --check --quiet check_sums.tsv
+echo All data files match expected md5 sums!
+
 # get BRCA array expression data from TCGA Legacy Archive
 # data/gdc_legacy_archive_brca_manifest.txt obtained from https://portal.gdc.cancer.gov/legacy-archive
 # with search parameters

diff --git a/plots/scripts/2A-plot_small_n_differential_expression.R b/plots/scripts/2A-plot_small_n_differential_expression.R
@@ -133,7 +133,7 @@ plot_small_n <- function(subtypes){
          y = ifelse(using_single_measure,
                     unique(stats_df$measure),
                     "Measure of Similarity"),
-         title = paste(cancer_type, subtypes_nice, "FDR < 10%")) +
+         title = str_c("Small n Experiment: ", paste(cancer_type, subtypes_nice, "FDR < 10%"))) +
     scale_colour_manual(values = cbPalette[c(2, 3)])
 
   if (using_single_measure) {

diff --git a/plots/scripts/3-plot_category_kappa.R b/plots/scripts/3-plot_category_kappa.R
@@ -98,5 +98,5 @@ plot_obj <- ggplot(plot_df,
 
 ggsave(output_filename,
        plot = plot_obj,
-       height = 5,
-       width = 7.5)
+       height = 4,
+       width = 7.25)
diff --git a/plots/scripts/6-plot_recon_error.R b/plots/scripts/6-plot_recon_error.R
@@ -87,5 +87,5 @@ plot_obj <- ggplot(plot_df,
 
 ggsave(output_filename,
        plot = plot_obj,
-       height = 5,
-       width = 7.5)
+       height = 4,
+       width = 7.25)
diff --git a/plots/scripts/6-plot_recon_kappa.R b/plots/scripts/6-plot_recon_kappa.R
@@ -85,5 +85,5 @@ plot_obj <- ggplot(plot_df,
 
 ggsave(output_filename,
        plot = plot_obj,
-       height = 5,
-       width = 7.5)
+       height = 4,
+       width = 7.25)
diff --git a/plots/scripts/recon_kappa_difference.R b/plots/scripts/recon_kappa_difference.R
@@ -56,15 +56,15 @@ with_df <- read_tsv(with_recon_input_filename,
 
 without_summary_df <- without_df %>%
   group_by(Perc.Seq, Classifier, Normalization, Platform) %>%
-  summarize(median_without = median(Kappa)) %>%
-  ungroup()
+  summarize(median_without = median(Kappa),
+            .groups = "drop")
 
 with_summary_df <- with_df %>%
   filter(Reconstruction == "PCA",
          Measure == "kappa") %>%
   group_by(Perc.Seq, Classifier, Normalization, Platform) %>%
-  summarize(median_with = median(Kappa)) %>%
-  ungroup()
+  summarize(median_with = median(Kappa),
+            .groups = "drop")
 
 # combined data frames and calculate difference in median kappas
 

diff --git a/prepare_GBM_data.R b/prepare_GBM_data.R
@@ -92,7 +92,8 @@ all_array_tumor_samples <- tibble(accession = names(metadata_json$samples)) %>%
 # keep one (first) accession per TCGA ID
 array_accession_tcga_id_keep <- all_array_tumor_samples %>%
   group_by(tcga_id) %>%
-  summarize(accession = sort(accession)[1])
+  summarize(accession = sort(accession)[1]) %>%
+  ungroup()
 accession_colnames_keep <- colnames(gbm_array_expression)[-1][colnames(gbm_array_expression)[-1] %in% array_accession_tcga_id_keep$accession]
 
 # select columns to keep and rename with TCGA IDs
@@ -122,8 +123,10 @@ gbm_seq_tumor_samples <- tibble(tcga_id_raw = tcga_seq_expression_column_names[-
          sample = str_sub(tcga_id_raw, 14, 15)) %>% # and sample is ZZ
   filter(sample == "01") %>% # require sample to be primary solid tumor
   filter(tcga_id %in% array_accession_tcga_id_keep$tcga_id) %>% # keep array GBMs
-  group_by(tcga_patient, tcga_id) %>%
-  summarize(tcga_id_raw = sort(tcga_id_raw)[1]) # keep one raw ID per person
+  group_by(tcga_patient) %>%
+  summarize(tcga_id_raw = sort(tcga_id_raw)[1], # keep one raw ID per person
+            tcga_id = str_sub(tcga_id_raw, 1, 15)) %>%
+  ungroup()
 
 # now read in GBM subset of entire TCGA seq expression file
 # this is faster and uses less memory than reading in entire file and then subsetting
@@ -211,12 +214,12 @@ missing_clinical <- gbm_subtypes %>%
 
 write_tsv(gbm_array_expression_renamed %>%
             select(-all_of(missing_clinical)),
-          path = gbm_array_output_filepath)
+          gbm_array_output_filepath)
 
 write_tsv(gbm_seq_expression_renamed %>%
             select(-all_of(missing_clinical)),
-          path = gbm_seq_output_filepath)
+          gbm_seq_output_filepath)
 
 write_tsv(gbm_subtypes %>%
             filter(!is.na(subtype)),
-          path = clinical_tsv_output_filepath)
+          clinical_tsv_output_filepath)
diff --git a/run_all_analyses_and_plots.sh b/run_all_analyses_and_plots.sh
@@ -68,7 +68,7 @@ Rscript plots/scripts/1A-plot_DEGs.R \
   --subtype_vs_others Basal \
   --proportion_output_directory plots/supplementary \
   --overlap_output_directory plots/supplementary \
-  --overlap_measure Jaccard
+  --overlap_measure Jaccard,Spearman
 
 # bar plot showing proportion of genes differentially expressed (Her2 vs. LumA)
 # line plot showing overlap with silver standard DEGs (Her2 vs. LumA)
@@ -77,7 +77,7 @@ Rscript plots/scripts/1A-plot_DEGs.R \
   --subtype_vs_subtype Her2,LumA \
   --proportion_output_directory plots/supplementary \
   --overlap_output_directory plots/main \
-  --overlap_measure Jaccard
+  --overlap_measure Jaccard,Spearman
 
 # line plot showing overlap with silver standard DEGs (Her2 vs. LumA) across small n values
 Rscript plots/scripts/2A-plot_small_n_differential_expression.R \
@@ -110,6 +110,7 @@ Rscript plots/scripts/0-plot_predictor_category_distributions.R \
 Rscript plots/scripts/3-plot_category_kappa.R \
   --cancer_type BRCA \
   --predictor TP53 \
+  --null_model \
   --output_directory plots/supplementary
 
 # ------------------------------------------------------------------------------
@@ -136,6 +137,7 @@ Rscript plots/scripts/0-plot_predictor_category_distributions.R \
 Rscript plots/scripts/3-plot_category_kappa.R \
   --cancer_type BRCA \
   --predictor PIK3CA \
+  --null_model \
   --output_directory plots/supplementary
 
 # ------------------------------------------------------------------------------
@@ -201,7 +203,7 @@ Rscript plots/scripts/1A-plot_DEGs.R \
   --subtype_vs_subtype Classical,Mesenchymal \
   --proportion_output_directory plots/supplementary \
   --overlap_output_directory plots/supplementary \
-  --overlap_measure Jaccard
+  --overlap_measure Jaccard,Spearman
 
 # line plot showing overlap with silver standard DEGs (Classical vs. Mesenchymal) across small n values
 Rscript plots/scripts/2A-plot_small_n_differential_expression.R \
@@ -234,6 +236,7 @@ Rscript plots/scripts/0-plot_predictor_category_distributions.R \
 Rscript plots/scripts/3-plot_category_kappa.R \
   --cancer_type GBM \
   --predictor TP53 \
+  --null_model \
   --output_directory plots/main
 
 # ------------------------------------------------------------------------------
@@ -260,6 +263,7 @@ Rscript plots/scripts/0-plot_predictor_category_distributions.R \
 Rscript plots/scripts/3-plot_category_kappa.R \
   --cancer_type GBM \
   --predictor PIK3CA \
+  --null_model \
   --output_directory plots/supplementary
 
 # ------------------------------------------------------------------------------