From e2e13d81cbc88a7163bb169a11d11a679d4a04c7 Mon Sep 17 00:00:00 2001 From: vertesy Date: Fri, 8 Nov 2024 15:39:17 +0100 Subject: [PATCH] fun filterExpressedGenes; rename filterCodingGenes --- NAMESPACE | 3 +- R/Seurat.Utils.R | 53 ++++++++++++++++++- R/Seurat.Utils.Visualization.R | 2 +- ...{filterNcGenes.Rd => filterCodingGenes.Rd} | 8 +-- man/filterExpressedGenes.Rd | 34 ++++++++++++ 5 files changed, 92 insertions(+), 8 deletions(-) rename man/{filterNcGenes.Rd => filterCodingGenes.Rd} (91%) create mode 100644 man/filterExpressedGenes.Rd diff --git a/NAMESPACE b/NAMESPACE index 4a2c83f..05dc0ec 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -80,8 +80,9 @@ export(downsampleSeuObj) export(downsampleSeuObj.and.Save) export(downsampleSeuObjByIdentAndMaxcells) export(dropLevelsSeurat) +export(filterCodingGenes) +export(filterExpressedGenes) export(filterGoEnrichment) -export(filterNcGenes) export(find10XoutputFolders) export(find_prefix_in_cell_IDs) export(fix.orig.ident) diff --git a/R/Seurat.Utils.R b/R/Seurat.Utils.R index a3ca949..832ae66 100644 --- a/R/Seurat.Utils.R +++ b/R/Seurat.Utils.R @@ -1164,13 +1164,13 @@ calc.q99.Expression.and.set.all.genes <- function( #' #' @examples #' genes <- c("AC123", "AL456", "c1orf7", "TP53", "BRCA1", "X1.AS1", "MYC") -#' genes_kept <- filterNcGenes(genes) +#' genes_kept <- filterCodingGenes(genes) #' print(genes_kept) #' #' @importFrom stringr str_detect #' @export #' -filterNcGenes <- function(genes, pattern_NC = c( +filterCodingGenes <- function(genes, pattern_NC = c( "^A[CFLP][0-9]{6}", "^Z[0-9]{5}", "^LINC0[0-9]{4}", "^C[1-9]+orf[1-9]+", "[-|\\.]AS[1-9]*$", "[-|\\.]DT[1-9]*$", @@ -1209,6 +1209,55 @@ v = TRUE, unique = TRUE, ...) 
# NOTE(review): patch context that shared this line with the definition below, kept verbatim:
#   { return(genes_kept) }

# _________________________________________________________________________________________________
#' @title Filter and Sort Gene Expression List Based on Specified Genes and Expression Threshold
#'
#' @description This function takes a named list of gene expression values and a character vector of gene
#' symbols. It identifies the intersection of gene symbols with names in the list, filters genes based on a
#' specified expression threshold, and returns a character vector of genes that meet the criteria, sorted
#' by expression in descending order. The number of matching genes and the number of genes passing the
#' threshold are reported via `message()`.
#'
#' @param genes Character vector of gene symbols to search for in the gene list. Must be supplied:
#' the `NULL` default does not pass the input check.
#' @param gene_list A named list of gene expression values where names are gene symbols, and values are
#' single numeric expression levels. Default: all.genes
#' @param threshold Single numeric value specifying the minimum expression level for filtering. Genes with
#' expression values below this threshold, or with a missing expression value, are excluded. Default: 0.1.
#'
#' @return A character vector of gene symbols that match the specified list, meet the expression threshold,
#' and are sorted in descending order by expression level. An empty character vector is returned when no
#' gene matches or none reaches the threshold.
#'
#' @examples
#' # Example usage:
#' gene_list <- list(ROBO2 = 0.9982406, CDH18 = 0.9981755, DCC = 0.9981755, AL589740.1 = 0.9981103)
#' genes <- c("ROBO2", "DCC", "AL589740.1", "UNKNOWN")
#' filterExpressedGenes(genes = genes, gene_list = gene_list, threshold = 0.9981)
#'
#' @export
filterExpressedGenes <- function(genes = NULL, gene_list = all.genes, threshold = 0.1) {
  # Assertions
  stopifnot(
    is.list(gene_list), !is.null(gene_list),
    is.character(genes), !is.null(genes),
    is.numeric(threshold), length(threshold) == 1
  )

  # Step 1: Intersect the gene symbols with the names in the list and report statistics.
  # as.character() keeps the result a character vector even if the list carries no names.
  matching_genes <- as.character(intersect(names(gene_list), genes))
  message("Number of matching genes: ", length(matching_genes))

  # Step 2: Filter out genes below the expression threshold.
  # vapply() always yields a numeric vector (also for zero matches), whereas sapply() returns
  # list() on empty input, which is not a valid subscript.
  expr_values <- vapply(
    matching_genes,
    function(g) as.numeric(gene_list[[g]]),
    numeric(1)
  )
  # which() drops comparisons that evaluate to NA, so missing values never reach the output.
  passing <- which(expr_values >= threshold)
  filtered_genes <- matching_genes[passing]
  message("Number of genes above the threshold: ", length(filtered_genes))

  # Step 3: Sort the genes according to their expression in descending order.
  # Indexing the character vector keeps the return type stable when nothing is left.
  sorted_genes <- filtered_genes[order(expr_values[passing], decreasing = TRUE)]

  # Step 4: Return the character vector
  return(sorted_genes)
}

# NOTE(review): remainder of the patch text that shared this line, kept verbatim (it continues on the next line):
#   diff --git a/R/Seurat.Utils.Visualization.R b/R/Seurat.Utils.Visualization.R index bfd0803..eb4c071 100644 --- a/R/Seurat.Utils.Visualization.R +++ b/R/Seurat.Utils.Visualization.R @@ -2146,7 +2146,7 @@ qUMAP <- function( "UMAP is not 2 dimensional!
\n Check obj@reductions[[reduction]]@cell.embeddings" = if (check_for_2D) ncol(obj@reductions[[reduction]]@cell.embeddings) == 2, reduction %in% names(obj@reductions), - assay %in% names(combined.obj@assays), + assay %in% names(obj@assays), "split.by column not found in meta.data / not categorical" = if (!is.null(splitby)) {splitby %in% colnames(META) && is.factor(META[[splitby]]) || is.character(META[[splitby]])} else TRUE ) diff --git a/man/filterNcGenes.Rd b/man/filterCodingGenes.Rd similarity index 91% rename from man/filterNcGenes.Rd rename to man/filterCodingGenes.Rd index 4d9867a..5219bfb 100644 --- a/man/filterNcGenes.Rd +++ b/man/filterCodingGenes.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/Seurat.Utils.R -\name{filterNcGenes} -\alias{filterNcGenes} +\name{filterCodingGenes} +\alias{filterCodingGenes} \title{Filter Coding Gene Symbols (or any matching input Patterns)} \usage{ -filterNcGenes( +filterCodingGenes( genes, pattern_NC = c("^A[CFLP][0-9]{6}", "^Z[0-9]{5}", "^LINC0[0-9]{4}", "^C[1-9]+orf[1-9]+", "[-|\\\\.]AS[1-9]*$", "[-|\\\\.]DT[1-9]*$", "^MIR[1-9]", "^SNHG[1-9]"), @@ -35,7 +35,7 @@ It filters out non-coding gene symbols by default. 
} \examples{ genes <- c("AC123", "AL456", "c1orf7", "TP53", "BRCA1", "X1.AS1", "MYC") -genes_kept <- filterNcGenes(genes) +genes_kept <- filterCodingGenes(genes) print(genes_kept) } diff --git a/man/filterExpressedGenes.Rd b/man/filterExpressedGenes.Rd new file mode 100644 index 0000000..24d8368 --- /dev/null +++ b/man/filterExpressedGenes.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Seurat.Utils.R +\name{filterExpressedGenes} +\alias{filterExpressedGenes} +\title{Filter and Sort Gene Expression List Based on Specified Genes and Expression Threshold} +\usage{ +filterExpressedGenes(genes = NULL, gene_list = all.genes, threshold = 0.1) +} +\arguments{ +\item{genes}{Character vector of gene symbols to search for in the gene list. Default: NULL.} + +\item{gene_list}{A named list of gene expression values where names are gene symbols, and values are +expression levels. Default: all.genes} + +\item{threshold}{Numeric value specifying the minimum expression level for filtering. Genes with +expression values below this threshold will be excluded. Default: 0.1.} +} +\value{ +A character vector of gene symbols that match the specified list, meet the expression threshold, +and are sorted in descending order by expression level. +} +\description{ +This function takes a named list of gene expression values and a character vector of gene +symbols. It identifies the intersection of gene symbols with names in the list, filters genes based on a +specified expression threshold, and returns a character vector of genes that meet the criteria, sorted +by expression in descending order. +} +\examples{ +# Example usage: +gene_list <- list(ROBO2 = 0.9982406, CDH18 = 0.9981755, DCC = 0.9981755, AL589740.1 = 0.9981103) +genes <- c("ROBO2", "DCC", "AL589740.1", "UNKNOWN") +filterExpressedGenes(genes = genes, gene_list = gene_list, threshold = 0.9981) + +}