scanvi annotation workflow #898

Draft · wants to merge 4 commits into main
185 changes: 185 additions & 0 deletions src/workflows/annotation/scanvi/config.vsh.yaml
@@ -0,0 +1,185 @@
name: "scanvi"
namespace: "workflows/annotation"
description: "Cell type annotation workflow using ScanVI."
authors:
  - __merge__: /src/authors/dorien_roosen.yaml
    roles: [ author, maintainer ]
  - __merge__: /src/authors/weiwei_schultz.yaml
    roles: [ contributor ]

argument_groups:
  - name: Query Input
    arguments:
      - name: "--id"
        required: true
        type: string
        description: ID of the sample.
        example: foo
      - name: "--input"
        required: true
        type: file
        description: Input dataset consisting of the (unlabeled) query observations. The dataset is expected to be pre-processed in the same way as --reference.
        example: input.h5mu
      - name: "--modality"
        description: Which modality to process. Should match the modality of the --reference dataset.
        type: string
        default: "rna"
        required: false
      - name: "--layer"
        type: string
        example: log_normalized
        description: Which layer to use for integration if .X is not to be used. Should match the layer of the --reference dataset.
      - name: "--var_hvg"
        type: string
        required: false
        description: ".var column containing highly variable genes. If not provided, genes will not be subset. Should match the .var column name of the --reference dataset."
      - name: "--input_obs_batch_label"
        type: string
        description: "The .obs field in the input (query) dataset containing the batch labels."
        example: "sample"
        required: true

  - name: Reference input
    arguments:
      - name: "--reference"
        required: true
        type: file
        description: Reference dataset consisting of the labeled observations to train the KNN classifier on. The dataset is expected to be pre-processed in the same way as the --input query dataset.
        example: reference.h5mu
      - name: "--reference_obs_targets"
        type: string
        example: [ ann_level_1, ann_level_2, ann_level_3, ann_level_4, ann_level_5, ann_finest_level ]
        required: true
        multiple: true
        description: The `.obs` key(s) of the target labels to transfer.
      - name: "--reference_obs_batch_label"
        type: string
        description: "The .obs field in the reference dataset containing the batch labels."
        example: "sample"
        required: true

  - name: scVI integration options
    arguments:
      - name: "--early_stopping"
        required: false
        type: boolean
        description: "Whether to perform early stopping with respect to the validation set."
      - name: "--early_stopping_monitor"
        choices: ["elbo_validation", "reconstruction_loss_validation", "kl_local_validation"]
        default: "elbo_validation"
        type: string
        description: "Metric logged during validation set epoch."
      - name: "--early_stopping_patience"
        type: integer
        min: 1
        default: 45
        description: "Number of validation epochs with no improvement after which training will be stopped."
      - name: "--early_stopping_min_delta"
        min: 0
        type: double
        default: 0.0
        description: "Minimum change in the monitored quantity to qualify as an improvement; an absolute change of less than min_delta will count as no improvement."
      - name: "--max_epochs"
        type: integer
        description: "Number of passes through the dataset. Defaults to min((20000 / number of cells) * 400, 400); for example, a dataset of 100,000 cells defaults to 80 epochs."
        required: false
      - name: "--reduce_lr_on_plateau"
        description: "Whether to monitor validation loss and reduce learning rate when validation set `lr_scheduler_metric` plateaus."
        type: boolean
        default: True
      - name: "--lr_factor"
        description: "Factor to reduce learning rate."
        type: double
        default: 0.6
        min: 0
      - name: "--lr_patience"
        description: "Number of epochs with no improvement after which learning rate will be reduced."
        type: double
        default: 30
        min: 0

  - name: Leiden clustering options
    arguments:
      - name: "--leiden_resolution"
        type: double
        description: Control the coarseness of the clustering. Higher values lead to more clusters.
        min: 0
        default: [1]
        multiple: true

  - name: Neighbor classifier arguments
    arguments:
      - name: "--weights"
        type: string
        default: "uniform"
        choices: ["uniform", "distance"]
        description: |
          Weight function used in prediction. Possible values are:
          `uniform` (all points in each neighborhood are weighted equally) or
          `distance` (weight points by the inverse of their distance).
      - name: "--n_neighbors"
        type: integer
        default: 15
        required: false
        description: |
          The number of neighbors to use in the k-neighbor graph structure used for fast approximate nearest neighbor search with PyNNDescent.
          Larger values will result in more accurate search results at the cost of computation time.
- name: "Outputs"
arguments:
- name: "--output"
type: file
required: true
direction: output
description: The query data in .h5mu format with predicted labels predicted from the classifier trained on the reference.
example: output.h5mu
- name: "--output_obs_predictions"
type: string
required: false
multiple: true
description: |
In which `.obs` slots to store the predicted cell labels.
If provided, must have the same length as `--reference_obs_targets`.
If empty, will default to the `reference_obs_targets` combined with the `"_pred"` suffix.
- name: "--output_obs_probability"
type: string
required: false
multiple: true
description: |
In which `.obs` slots to store the probability of the predictions.
If provided, must have the same length as `--reference_obs_targets`.
If empty, will default to the `reference_obs_targets` combined with the `"_probability"` suffix.
- name: "--output_obsm_integrated"
type: string
default: "X_integrated_scvi"
required: false
description: "In which .obsm slot to store the integrated embedding."
- name: "--output_compression"
type: string
description: |
The compression format to be used on the output h5mu object.
choices: ["gzip", "lzf"]
required: false
example: "gzip"

dependencies:
  - name: workflows/integration/scvi_leiden
    alias: scvi_leiden_workflow
  - name: labels_transfer/knn
  - name: dataflow/split_h5mu
  - name: dataflow/concatenate_h5mu
  - name: metadata/add_id
  - name: metadata/copy_obs

resources:
  - type: nextflow_script
    path: main.nf
    entrypoint: run_wf

test_resources:
  - type: nextflow_script
    path: test.nf
    entrypoint: test_wf
  - path: /resources_test/scgpt

runners:
  - type: nextflow
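
For reference, a minimal sketch of how this workflow could be invoked locally once built, assuming the standard viash/Nextflow setup (components built with `viash ns build` into target/nextflow/, Docker available, and the usual --publish_dir parameter of the Nextflow runner); all file paths and .obs column names below are placeholders:

# Hypothetical local run of the scanvi annotation workflow (paths and column names are placeholders).
nextflow run . \
  -main-script target/nextflow/workflows/annotation/scanvi/main.nf \
  -profile docker \
  -resume \
  --id query_sample \
  --input query.h5mu \
  --input_obs_batch_label sample \
  --reference reference.h5mu \
  --reference_obs_batch_label sample \
  --reference_obs_targets cell_type \
  --output output.h5mu \
  --publish_dir output/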
18 changes: 18 additions & 0 deletions src/workflows/annotation/scanvi/integration_test.sh
@@ -0,0 +1,18 @@
#!/bin/bash

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

export NXF_VER=21.10.6

nextflow \
  run . \
  -main-script src/workflows/annotation/scanvi/test.nf \
  -entry test_wf \
  -resume \
  -profile no_publish,docker \
  -c src/workflows/utils/labels_ci.config \
  -c src/workflows/utils/integration_tests.config
169 changes: 169 additions & 0 deletions src/workflows/annotation/scanvi/main.nf
@@ -0,0 +1,169 @@
workflow run_wf {
  take:
  input_ch

  main:

  output_ch = input_ch
    // Set aside the output for this workflow to avoid conflicts
    | map {id, state ->
      def new_state = state + ["workflow_output": state.output]
      [id, new_state]
    }
    // Add the id as a _meta join_id to be able to merge with the source channel at the end of the workflow
    | map {id, state ->
      def new_state = state + ["_meta": ["join_id": id]]
      [id, new_state]
    }
| view {"After adding join_id: $it"}
// Add 'query' id to .obs columns of query dataset
| add_id.run(
fromState: [
"input": "input",
],
args:[
"input_id": "query",
"obs_output": "dataset",
],
toState: ["input": "output"]
)
// Add 'reference'id to .obs columns of reference dataset
| add_id.run(
fromState:[
"input": "reference",
],
args:[
"input_id": "reference",
"obs_output": "dataset"
],
toState: ["reference": "output"]
)
// Make sure that query and reference dataset have batch information in the same .obs column
// By copying the respective .obs columns to the obs column "batch_label"
| copy_obs.run(
fromState: [
"input": "input",
"modality": "modality",
"input_obs_key": "input_obs_batch_label",
],
args: [
"output_obs_key": "batch_label"
],
toState: [
"input": "output"
]
)
| copy_obs.run(
fromState: [
"input": "reference",
"modality": "modality",
"input_obs_key": "reference_obs_batch_label",
],
args: [
"output_obs_key": "batch_label"
],
toState: [
"reference": "output"
]
)
// Concatenate query and reference datasets prior to integration
| concatenate_h5mu.run(
fromState: { id, state -> [
"input": [state.input, state.reference]
]
},
args: [
"input_id": ["query", "reference"],
"other_axis_mode": "move"
],
toState: ["input": "output"]
)
| view {"After concatenation: $it"}
    // Run scVI integration with Leiden clustering
    | scvi_leiden_workflow.run(
      fromState: { id, state ->
        [
          "id": id,
          "input": state.input,
          "layer": state.layer,
          "modality": state.modality,
          "obsm_output": state.output_obsm_integrated,
          "leiden_resolution": state.leiden_resolution,
          "var_input": state.var_hvg,
          "early_stopping": state.early_stopping,
          "early_stopping_monitor": state.early_stopping_monitor,
          "early_stopping_patience": state.early_stopping_patience,
          "early_stopping_min_delta": state.early_stopping_min_delta,
          "max_epochs": state.max_epochs,
          "reduce_lr_on_plateau": state.reduce_lr_on_plateau,
          "lr_factor": state.lr_factor,
          "lr_patience": state.lr_patience
        ]},
      args: [
        "uns_neighbors": "scvi_integration_neighbors",
        "obsp_neighbor_distances": "scvi_integration_distances",
        "obsp_neighbor_connectivities": "scvi_integration_connectivities",
        "obs_cluster": "scvi_integration_leiden",
        "obsm_umap": "X_leiden_scvi_umap",
        "obs_batch": "batch_label"
      ],
      toState: ["input": "output"]
    )
    | view {"After integration: $it"}
    // Split the integrated dataset back into separate reference and query datasets
    | split_h5mu.run(
      fromState: [
        "input": "input",
        "modality": "modality"
      ],
      args: [
        "obs_feature": "dataset",
        "output_files": "sample_files.csv",
        "drop_obs_nan": "true",
        "output": "ref_query"
      ],
      toState: [
        "output": "output",
        "output_files": "output_files"
      ],
      auto: [ publish: true ]
    )
    | view {"After sample splitting: $it"}
    // Map the integrated query and reference datasets back into the state
    | map {id, state ->
      def outputDir = state.output
      def files = readCsv(state.output_files.toUriString())
      def query_file = files.findAll{ dat -> dat.name == 'query' }
      assert query_file.size() == 1, 'there should only be one query file'
      def reference_file = files.findAll{ dat -> dat.name == 'reference' }
      assert reference_file.size() == 1, 'there should only be one reference file'
      def integrated_query = outputDir.resolve(query_file[0].filename)
      def integrated_reference = outputDir.resolve(reference_file[0].filename)
      def newKeys = ["integrated_query": integrated_query, "integrated_reference": integrated_reference]
      [id, state + newKeys]
    }
    | view {"After splitting query: $it"}
    // Perform KNN label transfer from integrated reference to integrated query
    | knn.run(
      fromState: [
        "input": "integrated_query",
        "modality": "modality",
        "input_obsm_features": "output_obsm_integrated",
        "reference": "integrated_reference",
        "reference_obsm_features": "output_obsm_integrated",
        "reference_obs_targets": "reference_obs_targets",
        "output_obs_predictions": "output_obs_predictions",
        "output_obs_probability": "output_obs_probability",
        "output_compression": "output_compression",
        "weights": "weights",
        "n_neighbors": "n_neighbors",
        "output": "workflow_output"
      ],
      toState: {id, output, state -> ["output": output.output]},
      auto: [ publish: true ]
    )

  emit:
  output_ch
}
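
For orientation on the map step above: split_h5mu writes the split .h5mu files plus a CSV (sample_files.csv) listing them, which readCsv then consumes to look up the 'query' and 'reference' entries. A hypothetical sketch of that CSV, assuming only the name and filename columns that the map step accesses (the actual filenames are generated by split_h5mu and will differ):

# Hypothetical contents of the CSV referenced by state.output_files.
$ cat sample_files.csv
name,filename
query,query.h5mu
reference,reference.h5mu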
10 changes: 10 additions & 0 deletions src/workflows/annotation/scanvi/nextflow.config
@@ -0,0 +1,10 @@
manifest {
  nextflowVersion = '!>=20.12.1-edge'
}

params {
  rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString()
}

// include common settings
includeConfig("${params.rootDir}/src/workflows/utils/labels.config")