openpipelines-bio · VladimirShitov · Sep 17, 2024 · Sep 17, 2024 · Sep 17, 2024 · Sep 17, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -93,6 +93,8 @@
 
 * `transform/regress_out`: Allow providing 'input' and 'output' layers for scanpy regress_out functionality (PR #863).
 
+* `interpret/atac_marker_peaks`: Added a component to identify marker peaks in ATAC-seq data (PR #868).
+
 ## MINOR CHANGES
 
 * `resources_test_scripts/cellranger_atac_tiny_bcl.sh` script: generate counts from fastq files using CellRanger atac count (PR #726).

diff --git a/src/interpret/atac_marker_peaks/config.vsh.yaml b/src/interpret/atac_marker_peaks/config.vsh.yaml
@@ -0,0 +1,84 @@
+name: atac_marker_peaks
+namespace: "interpret"
+description: |
+  Rank differentially accessible peaks for each group of cells.
+
+authors:
+  - __merge__: /src/authors/vladimir_shitov.yaml
+    roles: [ author ]
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: "--input"
+        type: file
+        description: Input h5mu file
+        direction: input
+        required: true
+        example: input.h5mu
+      - name: "--modality"
+        type: string
+        default: "atac"
+        required: false
+  - name: Outputs
+    arguments:
+      - name: "--output"
+        type: file
+        description: Output h5mu file.
+        direction: output
+        example: output.h5mu
+      - name: "--output_compression"
+        type: string
+        description: The compression format to be used on the output h5mu object.
+        choices: ["gzip", "lzf"]
+        required: false
+        example: "gzip"
+      - name: "--output_marker_peaks"
+        type: file
+        description: Output tsv file with marker peaks.
+        direction: output
+        example: marker_peaks.tsv
+  - name: Arguments
+    arguments:
+      - name: "--groupby"
+        type: string
+        description: The group to use for ranking peaks.
+        required: true
+        example: "leiden"
+      - name: "--method"
+        type: string
+        description: The method to use for ranking peaks.
+        choices: ["t-test", "wilcoxon", "logreg", "t-test_overestim_var"]
+        required: false
+        default: "t-test"
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/setup_logger.py
+test_resources:
+  - type: python_script
+    path: test.py
+  - path: /resources_test/cellranger_atac_tiny_bcl/counts/
+engines:
+  - type: docker
+    image: python:3.11-slim
+    setup:
+      - type: apt
+        packages: 
+          - procps
+          - pkg-config  # Otherwise h5py installation fails, which is required for scanpy
+          - libhdf5-dev
+          - gcc
+      - type: python
+        __merge__: [/src/base/requirements/anndata_mudata.yaml, .]
+        __merge__: [ /src/base/requirements/scanpy.yaml, .]
+        packages:
+          - muon~=0.1.5
+          - pysam~=0.22.0
+    test_setup:
+      - type: python
+        __merge__: [ /src/base/requirements/viashpy.yaml, .]
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [singlecpu, midmem]
diff --git a/src/interpret/atac_marker_peaks/script.py b/src/interpret/atac_marker_peaks/script.py
@@ -0,0 +1,42 @@
+import mudata
+import muon as mu
+import pandas as pd
+
+### VIASH START
+par = {
+    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu",
+    "output": "foo.h5mu",
+    "output_marker_peaks": "marker_peaks.tsv",
+    "output_compression": "gzip",
+    "modality": "atac",
+    "groupby": "leiden",
+    "method": "t-test",
+}
+### VIASH END
+
+def main():
+    mdata = mudata.read(par["input"].strip())
+    mod = mdata.mod[par["modality"]]
+
+    if not "peak_annotation" in mod.uns["atac"]:
+        raise ValueError("Peak annotation not found. Please run `muon.atac.tl.add_peak_annotation` first.")
+
+    mu.atac.tl.rank_peaks_groups(mod, par["groupby"], method=par["method"])
+
+    result = mod.uns["rank_genes_groups"]
+    groups = result["names"].dtype.names
+
+    marker_peaks_df = pd.DataFrame({
+        group + "_" + key: result[key][group]
+        for group in groups
+        for key in ["names", "genes", "pvals_adj"]
+    })
+
+    mdata.mod[par["modality"]].uns["rank_genes_groups"] = mod.uns["rank_genes_groups"]
+
+    mdata.write_h5mu(par["output"].strip(), compression=par["output_compression"])
+
+    marker_peaks_df.to_csv(par["output_marker_peaks"].strip(), sep="\t", index=False)
+
+if __name__ == "__main__":
+    main()
diff --git a/src/interpret/atac_marker_peaks/test.py b/src/interpret/atac_marker_peaks/test.py
@@ -0,0 +1,69 @@
+import sys
+from pathlib import Path
+import os
+import pytest
+
+import mudata as md
+import scanpy as sc
+import muon as mu
+import numpy as np
+import pandas as pd
+
+## VIASH START
+meta = {
+    'executable': './target/docker/qc/calculate_atac_qc_metrics/calculate_atac_qc_metrics',
+    'resources_dir': "./resources_test/cellranger_atac_tiny_bcl/counts/",
+    'config': './src/qc/calculate_atac_qc_metrics/config.vsh.yaml',
+    'cpus': 2
+}
+## VIASH END
+
+@pytest.fixture
+def tiny_atac_mudata(tmp_path):
+    resources_dir = Path(meta["resources_dir"])
+    mdata = mu.read_10x_h5(resources_dir / "counts" / "filtered_peak_bc_matrix.h5")
+    mu.atac.tl.locate_fragments(mdata, fragments=str(resources_dir / "counts" / "fragments.tsv.gz"))
+    assert "files" in mdata.mod["atac"].uns.keys()
+    assert "fragments" in mdata.mod["atac"].uns["files"].keys()
+
+    # Read features annotation and save it to uns
+    peak_annotation = pd.read_csv(resources_dir / "counts" / "peak_annotation.tsv", sep="\t")
+    peak_annotation["gene"] = peak_annotation["gene"].astype(str)  # Fixes saving error
+    mu.atac.tl.add_peak_annotation(mdata.mod["atac"], peak_annotation)
+
+    # Simulate clustering to not install leiden dependencies
+    mdata.mod["atac"].obs["leiden"] = pd.Categorical(np.random.choice(np.arange(5), size=mdata.n_obs))
+
+    mdata_path = tmp_path / "tiny_atac.h5mu"
+    mdata.write(mdata_path)
+
+    return mdata_path
+
+@pytest.mark.parametrize("mudata", ["tiny_atac_mudata"])
+def test_marker_peaks(run_component, request, mudata, tmp_path):
+    input_path = request.getfixturevalue(mudata)
+    output_path = tmp_path / "foo.h5mu"
+
+    args = [
+        "--input", str(input_path),
+        "--output", str(output_path),
+        "--output_marker_peaks", str(tmp_path / "marker_peaks.tsv"),
+        "--modality", "atac",
+        "--groupby", "leiden"
+    ]
+
+    run_component(args)
+    assert output_path.is_file()
+    assert "marker_peaks.tsv" in os.listdir(output_path.parent)
+
+    data_with_markers = md.read(output_path)
+
+    assert "rank_genes_groups" in data_with_markers.mod["atac"].uns
+
+    marker_peaks = pd.read_csv(tmp_path / "marker_peaks.tsv", sep="\t")
+    assert marker_peaks.shape[0] > 0
+    assert "pvals_adj" in marker_peaks.columns
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__]))