Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add atac_marker_peaks component #879

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@

* `transform/regress_out`: Allow providing 'input' and 'output' layers for scanpy regress_out functionality (PR #863).

* `interpret/atac_marker_peaks`: Added a component to identify marker peaks in ATAC-seq data (PR #868).

## MINOR CHANGES

* `resources_test_scripts/cellranger_atac_tiny_bcl.sh` script: generate counts from fastq files using CellRanger atac count (PR #726).
Expand Down
84 changes: 84 additions & 0 deletions src/interpret/atac_marker_peaks/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
name: atac_marker_peaks
namespace: "interpret"
description: |
Rank differentially accessible peaks for each group of cells.

authors:
- __merge__: /src/authors/vladimir_shitov.yaml
roles: [ author ]
argument_groups:
- name: Inputs
arguments:
- name: "--input"
type: file
description: Input h5mu file
direction: input
required: true
example: input.h5mu
- name: "--modality"
type: string
default: "atac"
required: false
- name: Outputs
arguments:
- name: "--output"
type: file
description: Output h5mu file.
direction: output
example: output.h5mu
- name: "--output_compression"
type: string
description: The compression format to be used on the output h5mu object.
choices: ["gzip", "lzf"]
required: false
example: "gzip"
- name: "--output_marker_peaks"
type: file
description: Output tsv file with marker peaks.
direction: output
example: marker_peaks.tsv
- name: Arguments
arguments:
- name: "--groupby"
type: string
description: The group to use for ranking peaks.
required: true
example: "leiden"
- name: "--method"
type: string
description: The method to use for ranking peaks.
choices: ["t-test", "wilcoxon", "logreg", "t-test_overestim_var"]
required: false
default: "t-test"
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/cellranger_atac_tiny_bcl/counts/
engines:
- type: docker
image: python:3.11-slim
setup:
- type: apt
packages:
- procps
- pkg-config # Otherwise h5py installation fails, which is required for scanpy
- libhdf5-dev
- gcc
- type: python
__merge__: [/src/base/requirements/anndata_mudata.yaml, .]
__merge__: [ /src/base/requirements/scanpy.yaml, .]
packages:
- muon~=0.1.5
- pysam~=0.22.0
test_setup:
- type: python
__merge__: [ /src/base/requirements/viashpy.yaml, .]
runners:
- type: executable
- type: nextflow
directives:
label: [singlecpu, midmem]
42 changes: 42 additions & 0 deletions src/interpret/atac_marker_peaks/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import mudata
import muon as mu
import pandas as pd

### VIASH START
par = {
"input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu",
"output": "foo.h5mu",
"output_marker_peaks": "marker_peaks.tsv",
"output_compression": "gzip",
"modality": "atac",
"groupby": "leiden",
"method": "t-test",
}
### VIASH END

def main():
mdata = mudata.read(par["input"].strip())
mod = mdata.mod[par["modality"]]

if not "peak_annotation" in mod.uns["atac"]:
raise ValueError("Peak annotation not found. Please run `muon.atac.tl.add_peak_annotation` first.")

mu.atac.tl.rank_peaks_groups(mod, par["groupby"], method=par["method"])

result = mod.uns["rank_genes_groups"]
groups = result["names"].dtype.names

marker_peaks_df = pd.DataFrame({
group + "_" + key: result[key][group]
for group in groups
for key in ["names", "genes", "pvals_adj"]
})

mdata.mod[par["modality"]].uns["rank_genes_groups"] = mod.uns["rank_genes_groups"]

mdata.write_h5mu(par["output"].strip(), compression=par["output_compression"])

marker_peaks_df.to_csv(par["output_marker_peaks"].strip(), sep="\t", index=False)

if __name__ == "__main__":
main()
69 changes: 69 additions & 0 deletions src/interpret/atac_marker_peaks/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import sys
from pathlib import Path
import os
import pytest

import mudata as md
import scanpy as sc
import muon as mu
import numpy as np
import pandas as pd

## VIASH START
meta = {
'executable': './target/docker/qc/calculate_atac_qc_metrics/calculate_atac_qc_metrics',
'resources_dir': "./resources_test/cellranger_atac_tiny_bcl/counts/",
'config': './src/qc/calculate_atac_qc_metrics/config.vsh.yaml',
'cpus': 2
}
## VIASH END

@pytest.fixture
def tiny_atac_mudata(tmp_path):
resources_dir = Path(meta["resources_dir"])
mdata = mu.read_10x_h5(resources_dir / "counts" / "filtered_peak_bc_matrix.h5")
mu.atac.tl.locate_fragments(mdata, fragments=str(resources_dir / "counts" / "fragments.tsv.gz"))
assert "files" in mdata.mod["atac"].uns.keys()
assert "fragments" in mdata.mod["atac"].uns["files"].keys()

# Read features annotation and save it to uns
peak_annotation = pd.read_csv(resources_dir / "counts" / "peak_annotation.tsv", sep="\t")
peak_annotation["gene"] = peak_annotation["gene"].astype(str) # Fixes saving error
mu.atac.tl.add_peak_annotation(mdata.mod["atac"], peak_annotation)

# Simulate clustering to not install leiden dependencies
mdata.mod["atac"].obs["leiden"] = pd.Categorical(np.random.choice(np.arange(5), size=mdata.n_obs))

mdata_path = tmp_path / "tiny_atac.h5mu"
mdata.write(mdata_path)

return mdata_path

@pytest.mark.parametrize("mudata", ["tiny_atac_mudata"])
def test_marker_peaks(run_component, request, mudata, tmp_path):
input_path = request.getfixturevalue(mudata)
output_path = tmp_path / "foo.h5mu"

args = [
"--input", str(input_path),
"--output", str(output_path),
"--output_marker_peaks", str(tmp_path / "marker_peaks.tsv"),
"--modality", "atac",
"--groupby", "leiden"
]

run_component(args)
assert output_path.is_file()
assert "marker_peaks.tsv" in os.listdir(output_path.parent)

data_with_markers = md.read(output_path)

assert "rank_genes_groups" in data_with_markers.mod["atac"].uns

marker_peaks = pd.read_csv(tmp_path / "marker_peaks.tsv", sep="\t")
assert marker_peaks.shape[0] > 0
assert "pvals_adj" in marker_peaks.columns


if __name__ == "__main__":
sys.exit(pytest.main([__file__]))