openpipelines-bio · VladimirShitov · Oct 30, 2023 · Oct 30, 2023 · Oct 31, 2023 · Oct 31, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -117,6 +117,8 @@
 
 * `transform/regress_out`: Allow providing 'input' and 'output' layers for scanpy regress_out functionality (PR #863).
 
+* `qc/calculate_atac_qc_metrics`: new component for calculating ATAC QC metrics (PR #868).
+
 * Added `transform/tfidf` component: normalize ATAC data with TF-IDF (PR #870).
 
 * Added `dimred/lsi` component (PR #552).

diff --git a/src/qc/calculate_atac_qc_metrics/config.vsh.yaml b/src/qc/calculate_atac_qc_metrics/config.vsh.yaml
@@ -0,0 +1,119 @@
+name: calculate_atac_qc_metrics
+namespace: "qc"
+description: |
+  Add basic ATAC quality control metrics to an .h5mu file.
+
+  The metrics are comparable to what scanpy.pp.calculate_qc_metrics output,
+  although they have slightly different names:
+
+  Obs metrics (name in this component -> name in scanpy):
+    - n_features_per_cell -> n_genes_by_counts
+    - total_fragment_counts -> total_counts
+
+authors:
+  - __merge__: /src/authors/vladimir_shitov.yaml
+    roles: [ author ]
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: "--input"
+        type: file
+        description: Input h5mu file
+        direction: input
+        required: true
+        example: input.h5mu
+      - name: "--fragments_path"
+        type: file
+        description: |
+          Path to the fragments file. If not provided and not present in the input h5mu file,
+          the nucleosome signal and TSS enrichment score will not be calculated.
+        direction: input
+        required: false
+        example: fragments.tsv.gz
+      - name: "--modality"
+        type: string
+        default: "atac"
+        required: false
+      - name: "--layer"
+        description: |
+          Layer at `.layers` to use for the calculation. If not provided, `.X` is used.
+        type: string
+        example: "raw_counts"
+        required: false
+      - name: "--n_fragments_for_nucleosome_signal"
+        type: integer
+        description: |
+          Number of fragments to use per cell for nucleosome signal calculation.
+          Takes very long to calculate, for a test run lower value (e.g. 10e3) is recommended.
+          See https://www.sc-best-practices.org/chromatin_accessibility/quality_control.html#nucleosome-signal
+          for more information
+        default: 10e4
+        required: false
+      - name: "--nuc_signal_threshold"
+        type: double
+        description: |
+          Threshold for nucleosome signal. Cells with nucleosome signal above this threshold
+          will be marked as low quality ("NS_FAIL"), otherwise they will be marked "NS_PASS".
+        default: 2
+        required: false
+      - name: "--n_tss"
+        type: integer
+        description: |
+          Number of the transcription start sites to calculate TSS enrichment score.
+          See https://www.sc-best-practices.org/chromatin_accessibility/quality_control.html#tss-enrichment
+          for more information
+        default: 3000
+        required: false
+      - name: "--tss_threshold"
+        type: double
+        description: |
+          Threshold for TSS enrichment score. Cells with TSS enrichment score below this threshold
+          will be marked as low quality ("TSS_FAIL") otherwise they will be marked as "TSS_PASS".
+        default: 1.5
+        required: false
+  - name: Outputs
+    arguments:
+      - name: "--output"
+        type: file
+        description: Output h5mu file.
+        direction: output
+        example: output.h5mu
+      - name: "--output_compression"
+        type: string
+        description: The compression format to be used on the output h5mu object.
+        choices: ["gzip", "lzf"]
+        required: false
+        example: "gzip"
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/setup_logger.py
+test_resources:
+  - type: python_script
+    path: test.py
+  - path: /resources_test/cellranger_atac_tiny_bcl/counts/
+engines:
+  - type: docker
+    image: python:3.9-slim
+    setup:
+      - type: apt
+        packages: 
+          - procps
+          - pkg-config  # Otherwise h5py installation fails, which is required for scanpy
+          - libhdf5-dev
+          - gcc
+      - type: python
+        __merge__: [/src/base/requirements/anndata_mudata.yaml, .]
+        packages:
+          - numpy~=1.23.5
+          - scanpy~=1.9.3
+          - muon~=0.1.5
+          - pysam~=0.22.0
+    test_setup:
+      - type: python
+        __merge__: [ /src/base/requirements/viashpy.yaml, /src/base/requirements/scanpy.yaml, .]
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [singlecpu, midmem]
diff --git a/src/qc/calculate_atac_qc_metrics/script.py b/src/qc/calculate_atac_qc_metrics/script.py
@@ -0,0 +1,103 @@
+import sys
+
+import scanpy as sc
+import muon as mu
+from muon import atac as ac  # the module containing function for scATAC data processing
+import numpy as np
+
+## VIASH START
+par = {
+    "input": "resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu",
+    "fragments_path": None,
+    "output": "foo.h5mu",
+    "modality": "atac",
+    "layer": None,
+    "n_fragments_for_nucleosome_signal": 10e4,
+    "n_tss": 3000,
+    "nuc_signal_threshold": 2,
+    "tss_threshold": 1.5,
+}
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+# START TEMPORARY WORKAROUND setup_logger
+# reason: resources aren't available when using Nextflow fusion
+# from setup_logger import setup_logger
+def setup_logger():
+    import logging
+    from sys import stdout
+
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    console_handler = logging.StreamHandler(stdout)
+    logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+    console_handler.setFormatter(logFormatter)
+    logger.addHandler(console_handler)
+
+    return logger
+# END TEMPORARY WORKAROUND setup_logger
+logger = setup_logger()
+
+def main():
+    logger.info("Reading input data")
+    mdata = mu.read(par["input"])
+
+    logger.debug("Making var names unique")
+    mdata.var_names_make_unique()
+    atac = mdata.mod[par["modality"]]
+
+    logger.info("Checking if QC columns are already calculated")
+    for col in ("n_features_per_cell", "total_fragment_counts"):
+        if col in atac.obs:
+            logger.warning(f"{col} is already in atac.obs, dropping")
+            atac.obs.drop(col, axis=1, inplace=True)
+
+    logger.info("Calculating QC metrics")
+    sc.pp.calculate_qc_metrics(atac, percent_top=None, log1p=False, inplace=True, layer=par["layer"])
+
+    logger.debug("Renaming QC columns")
+    atac.obs.rename(
+        columns={
+            "n_genes_by_counts": "n_features_per_cell",
+            "total_counts": "total_fragment_counts",
+        },
+        inplace=True,
+    )
+
+    logger.debug("Adding log-transformed total fragment counts")
+    # log-transform total counts and add as column
+    atac.obs["log_total_fragment_counts"] = np.log10(atac.obs["total_fragment_counts"])
+
+    if par["fragments_path"] is not None:
+        logger.info("Trying to locate frafments")
+        ac.tl.locate_fragments(atac, fragments=par["fragments_path"])
+    else:
+        logger.info("Skipping fragment location: `fragments_path` is not set")
+
+    # Calculate the nucleosome signal across cells
+    if "files" in atac.uns and "fragments" in atac.uns["files"]:
+        logger.info("Trying to calculate nucleosome signal")
+        ac.tl.nucleosome_signal(atac, n=par["n_fragments_for_nucleosome_signal"] * atac.n_obs)
+        atac.obs["nuc_signal_filter"] = [
+            "NS_FAIL" if ns > par["nuc_signal_threshold"] else "NS_PASS"
+            for ns in atac.obs["nucleosome_signal"]
+        ]
+    else:
+        logger.info("Skipping nucleosome signal calculation: fragments information is not found")
+
+    # If interval information is available, calculate TSS enrichment
+    if "peak_annotation" in mdata.mod["atac"].uns.keys():
+        tss = ac.tl.tss_enrichment(mdata, features=mdata.mod["atac"].uns["peak_annotation"],n_tss=par["n_tss"], random_state=666)
+
+        tss.obs["tss_filter"] = [
+            "TSS_FAIL" if score < par["tss_threshold"] else "TSS_PASS"
+            for score in atac.obs["tss_score"]
+        ]
+    else:
+        logger.info("Skipping TSS enrichment calculation: genes intervals are not found")
+
+    logger.info("Writing output")
+    mdata.write(par["output"], compression=par["output_compression"])
+
+if __name__ == "__main__":
+    main()
diff --git a/src/qc/calculate_atac_qc_metrics/test.py b/src/qc/calculate_atac_qc_metrics/test.py
@@ -0,0 +1,145 @@
+import sys
+from pathlib import Path
+import pytest
+
+import mudata as md
+import numpy as np
+import scanpy as sc
+import muon as mu
+import pandas as pd
+
+## VIASH START
+meta = {
+    'executable': './target/docker/qc/calculate_atac_qc_metrics/calculate_atac_qc_metrics',
+    'resources_dir': "./resources_test/cellranger_atac_tiny_bcl/counts/",
+    'config': './src/qc/calculate_atac_qc_metrics/config.vsh.yaml',
+    'cpus': 2
+}
+## VIASH END
+
+@pytest.fixture
+def synthetic_example():
+    atac = sc.AnnData(np.array([
+        [0, 0, 0],
+        [1, 0, 1],
+        [10, 0, 0],
+        [100, 0, 1],
+        [1000, 0, 0]
+    ]))
+    atac.obs_names = ["A", "B", "C", "D", "E"]
+    atac.var_names = ["x", "y", "z"]
+
+    return md.MuData({"atac": atac})
+
+@pytest.fixture
+def example_mudata(tmp_path, synthetic_example):
+    mdata_path = tmp_path / "example.h5mu"
+    synthetic_example.write(mdata_path)
+
+    return mdata_path
+
+@pytest.fixture
+def example_mudata_with_layer(tmp_path, synthetic_example):
+    synthetic_example.mod["atac"].layers["atac_counts"] = synthetic_example.mod["atac"].X.copy()
+    synthetic_example.mod["atac"].X = np.random.normal(size=synthetic_example.mod["atac"].X.shape)
+    mdata_path = tmp_path / "example.h5mu"
+    synthetic_example.write(mdata_path)
+
+    return mdata_path
+
+@pytest.fixture
+def neurips_mudata(tmp_path):
+    """From the `NeurIPS Multimodal Single-Cell Integration Challenge
+    <https://www.kaggle.com/competitions/open-problems-multimodal/data>`
+
+    Link is taken from the Moscot repository: 
+    https://github.com/theislab/moscot/blob/cb53435c80fafe58046ead3c42a767fd0b818aaa/src/moscot/datasets.py#L67
+
+    """
+    adata = sc.read("../data/neurips_data.h5ad", backup_url="https://figshare.com/ndownloader/files/37993503")
+
+    mdata = md.MuData({"atac": adata})
+    mdata_path = tmp_path / "neurips.h5mu"
+    mdata.write(mdata_path)
+
+    return mdata_path
+
+@pytest.fixture
+def tiny_atac_mudata(tmp_path):
+    resources_dir = Path(meta["resources_dir"])
+    mdata = mu.read_10x_h5(resources_dir / "counts" / "filtered_peak_bc_matrix.h5")
+    mu.atac.tl.locate_fragments(mdata, fragments=str(resources_dir / "counts" / "fragments.tsv.gz"))
+    assert "files" in mdata.mod["atac"].uns.keys()
+    assert "fragments" in mdata.mod["atac"].uns["files"].keys()
+
+    # Read features annotation and save it to uns
+    peak_annotation = pd.read_csv(resources_dir / "counts" / "peak_annotation.tsv", sep="\t")
+    peak_annotation.columns = ["Chromosome", "Start", "End", "gene", "distance", "peak_type"]
+    peak_annotation["gene"] = peak_annotation["gene"].astype(str)  # Fixes saving error
+    mdata.mod["atac"].uns["peak_annotation"] = peak_annotation
+
+    mdata_path = tmp_path / "tiny_atac.h5mu"
+    mdata.write(mdata_path)
+
+    return mdata_path
+
+@pytest.mark.parametrize("mudata", ["example_mudata", "neurips_mudata", "tiny_atac_mudata"])
+def test_qc_columns_in_tables(run_component, request, mudata, tmp_path):
+    input_path = request.getfixturevalue(mudata)
+    output_path = tmp_path / "foo.h5mu"
+
+    args = [
+        "--input", str(input_path),
+        "--output", str(output_path),
+        "--modality", "atac",
+        "--n_fragments_for_nucleosome_signal", "100"
+    ]
+
+    run_component(args)
+    assert output_path.is_file()
+    data_with_qc = md.read(output_path)
+
+    for qc_metric in ("n_features_per_cell", "total_fragment_counts", "log_total_fragment_counts"):
+        assert qc_metric in data_with_qc.mod["atac"].obs
+    for qc_metric in ("n_cells_by_counts", "mean_counts", "pct_dropout_by_counts", "total_counts"):
+        assert qc_metric in data_with_qc.mod["atac"].var
+
+    # Check that ATAC-specific metrics are calculated if fragments information is present (for tiny ATAC data)
+    if "files" in data_with_qc.mod["atac"].uns and "fragments" in data_with_qc.mod["atac"].uns["files"]:
+        assert "nucleosome_signal" in data_with_qc.mod["atac"].obs
+
+    if "peak_annotation" in data_with_qc.mod["atac"].uns.keys():
+        assert "tss_score" in data_with_qc.mod["atac"].obs
+
+
+@pytest.mark.parametrize("mudata", ["example_mudata", "example_mudata_with_layer"])
+def test_calculations_correctness(request, run_component, mudata, tmp_path):
+    input_path = request.getfixturevalue(mudata)
+    output_path = tmp_path / "foo.h5mu"
+
+    args = [
+        "--input", str(input_path),
+        "--output", str(output_path),
+        "--modality", "atac",
+        "--n_fragments_for_nucleosome_signal", "100"
+    ]
+
+    if mudata == "example_mudata_with_layer":
+        args.extend(["--layer", "atac_counts"])
+
+    run_component(args)
+    assert output_path.is_file()
+    data_with_qc = md.read(output_path)
+
+    assert np.allclose(data_with_qc.mod["atac"].obs["n_features_per_cell"], [0, 2, 1, 2, 1])
+    assert np.allclose(data_with_qc.mod["atac"].obs["total_fragment_counts"], [0, 2, 10, 101, 1000])
+    assert np.allclose(data_with_qc.mod["atac"].obs["log_total_fragment_counts"], [-np.inf, np.log10(2), np.log10(10), np.log10(101), np.log10(1000)])
+
+    assert np.allclose(data_with_qc.mod["atac"].var["n_cells_by_counts"], [4, 0, 2])
+    assert np.allclose(data_with_qc.mod["atac"].var["mean_counts"], [222.2, 0, 0.4])
+    assert np.allclose(data_with_qc.mod["atac"].var["pct_dropout_by_counts"], [20, 100, 60])
+    assert np.allclose(data_with_qc.mod["atac"].var["total_counts"], [1111, 0, 2])
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__]))