-
Notifications
You must be signed in to change notification settings - Fork 32
/
nextflow_schema.json
388 lines (388 loc) · 20.4 KB
/
nextflow_schema.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
{
"$schema": "http://json-schema.org/draft-07/schema",
"$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json",
"title": "epi2me-labs/wf-transcriptomes",
"workflow_title": "Workflow Transcriptomes",
"description": "Transcriptome analysis including assembly and annotation of cDNA and direct RNA sequencing data and differential expression analysis.",
"demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-transcriptomes/wf-transcriptomes-demo.tar.gz",
"aws_demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-transcriptomes/wf-transcriptomes-demo/aws.nextflow.config",
"url": "https://github.com/epi2me-labs/wf-transcriptomes",
"type": "object",
"definitions": {
"input_options": {
"title": "Input Options",
"type": "object",
"fa_icon": "fas fa-terminal",
"description": "Parameters for finding and handling input data for analysis.",
"properties": {
"fastq": {
"type": "string",
"format": "path",
"title": "FASTQ",
"demo_data": "${projectDir}/test_data/fastq",
"description": "FASTQ files to use in the analysis.",
"help_text": "This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`."
},
"bam": {
"type": "string",
"format": "path",
"description": "BAM or unaligned BAM (uBAM) files to use in the analysis.",
"help_text": "This accepts one of three cases: (i) the path to a single BAM file; (ii) the path to a top-level directory containing BAM files; (iii) the path to a directory containing one level of sub-directories which in turn contain BAM files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`."
},
"transcriptome_source": {
"type": "string",
"enum": [
"precomputed",
"reference-guided"
],
"default": "reference-guided",
"description": "Select how the transcriptome used for analysis should be prepared.",
"help_text": "For differential expression analysis, use of an existing transcriptome may be preferred and so 'precomputed' should be selected. In this case the 'ref_transcriptome' parameter should be specified. To create a reference transcriptome using an existing reference genome, select 'reference-guided' and specify the 'ref_genome' parameter."
},
"ref_genome": {
"type": "string",
"title": "Reference genome",
"format": "file-path",
"demo_data": "${projectDir}/test_data/SIRV_150601a.fasta",
"description": "Path to reference genome sequence [.fa/.fq/.fa.gz/.fq.gz]. Required for reference-based workflow.",
"help_text": "A reference genome is required for reference-based assembly of a transcriptome."
},
"ref_transcriptome": {
"type": "string",
"title": "Reference transcriptome",
"format": "file-path",
"description": "Transcriptome reference file. Required for precomputed transcriptome calculation and for differential expression analysis.",
"help_text": "A reference transcriptome related to the sample under study. Must be supplied when the 'Transcriptome source' parameter has been set to 'precomputed' or to perform differential expression."
},
"ref_annotation": {
"type": "string",
"title": "Reference annotation",
"format": "file-path",
"demo_data": "${projectDir}/test_data/SIRV_isoforms.gtf",
"description": "A reference annotation in GFF2 or GFF3 format (extensions .gtf(.gz), .gff(.gz), .gff3(.gz)). Only annotation files from [Encode](https://www.encodeproject.org), [Ensembl](https://www.ensembl.org/index.html) and [NCBI](https://www.ncbi.nlm.nih.gov/) are supported.",
"help_text": "This will be used for guiding the transcriptome assembly and to label transcripts with their corresponding gene identifiers. Note: If in de_analysis mode transcript strands must be only + or -."
},
"direct_rna": {
"type": "boolean",
"default": false,
"title": "direct RNA",
"description": "Set to true for direct RNA sequencing.",
"help_text": "Omits the pychopper step."
},
"analyse_unclassified": {
"type": "boolean",
"default": false,
"title": "Analyse unclassified",
"description": "Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory.",
"help_text": "If selected and if the input is a multiplex directory the workflow will also process the unclassified directory."
}
},
"allOf": [
{
"required": [
"transcriptome_source"
]
},
{
"oneOf": [
{
"required": [
"fastq"
]
},
{
"required": [
"bam"
]
}
]
}
]
},
"output_options": {
"title": "Output Options",
"type": "object",
"description": "Parameters for saving and naming workflow outputs.",
"default": "",
"properties": {
"out_dir": {
"type": "string",
"format": "directory-path",
"default": "output",
"description": "Directory for output of all user-facing files."
},
"igv": {
"title": "Show in IGV",
"type": "boolean",
"default": false,
"description": "Visualize outputs in the EPI2ME IGV visualizer.",
"help_text": "Enabling this option will visualize the output alignment files in the EPI2ME Desktop App IGV visualizer."
}
}
},
"sample_options": {
"title": "Sample Options",
"type": "object",
"description": "Parameters that relate to samples such as sample sheets and sample names.",
"default": "",
"properties": {
"sample_sheet": {
"type": "string",
"title": "Sample and condition sheet",
"format": "file-path",
"description": "A CSV file used to map barcodes to sample aliases. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files. If you are running the differential expression workflow, there must be an additional column `condition` with two labels, one of which must be `control` (e.g. `control` and `treated`). Control will indicate which samples will be used as the reference. There should be at least 3 repeats for each condition.",
"help_text": "The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Extra columns are allowed."
},
"sample": {
"type": "string",
"description": "A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files."
}
}
},
"options_for_reference_based_workflow": {
"title": "Options for reference-based workflow",
"type": "object",
"description": "Parameters that are used solely for the reference-guided workflow",
"properties": {
"plot_gffcmp_stats": {
"type": "boolean",
"default": true,
"title": "Plot gffcompare statistics",
"description": "Create a PDF of plots showing gffcompare results.",
"help_text": "If set to true, a PDF file containing detailed gffcompare results will be output."
},
"gffcompare_opts": {
"type": "string",
"title": "Gffcompare options",
"description": "Extra command-line options to give to gffcompare -r",
"default": "-R",
"help_text": "For a list of possible options see [gffcompare](https://ccb.jhu.edu/software/stringtie/gffcompare.shtml)."
},
"minimap2_index_opts": {
"type": "string",
"title": "Minimap2 index options",
"description": "Extra command-line options for minimap2 indexing.",
"default": "-k14",
"help_text": "See [minimap2 index options](https://lh3.github.io/minimap2/minimap2.html#4) for more information. These will only be relevant in the reference based transcriptome assembly."
},
"minimap2_opts": {
"type": "string",
"title": "Minimap2 options",
"description": "Additional command-line options for minimap2 alignment.",
"default": "-uf",
"help_text": "See [minimap2 options](https://lh3.github.io/minimap2/minimap2.html#5) for further information. These will only be relevant in the reference based transcriptome assembly."
},
"minimum_mapping_quality": {
"type": "integer",
"description": "Filter aligned reads by mapping quality (MAPQ).",
"default": 40,
"help_text": "Reads that do not meet this mapping quality after minimap2 alignment will be filtered out."
},
"poly_context": {
"type": "integer",
"description": "Region size at end of reads to apply poly(A) filter.",
"help_text": "Mispriming of polyT primers can occur when, instead of priming transcription from a polyA tail, it is primed from internal polyA rich regions in the genome. In these cases the 3' end of the transcript will not be captured and should be discarded. This parameter defines the size of a genomic region centered around the 3' alignment position from which to search for polyA rich regions.",
"hidden": true,
"default": 24
},
"max_poly_run": {
"type": "integer",
"title": "Maximum poly run",
"description": "Max poly(A) region allowed within poly_context-sized end regions.",
"help_text": "See `poly_context` parameter. This parameter defines the maximum allowed polyA tract within a `poly_context` defined genomic region.",
"hidden": true,
"default": 8
},
"stringtie_opts": {
"type": "string",
"title": "Stringtie options",
"description": "Extra command-line options for stringtie transcript assembly.",
"default": "--conservative",
"help_text": "For additional StringTie options see [here](https://github.com/gpertea/stringtie#stringtie-options)."
}
}
},
"differential_expression_options": {
"title": "Differential Expression Options",
"type": "object",
"description": "Options relevant to the differential expression section of the workflow, only need to set if running DE.",
"default": "",
"properties": {
"de_analysis": {
"type": "boolean",
"default": false,
"title": "Differential expression analysis",
"description": "Run DE analysis.",
"help_text": "Running this requires you to provide at least two replicates for a control and treated sample as well as a sample sheet param."
},
"min_gene_expr": {
"type": "integer",
"title": "Minimum gene expression",
"default": 10,
"description": "The minimum number of total mapped sequence reads required for a gene to be considered in differential transcript usage analysis.",
"help_text": "Filtering at the gene level ensures that the observed transcript ratios are calculated with a minimum number of counts per gene."
},
"min_feature_expr": {
"type": "integer",
"title": "Minimum feature expression",
"default": 3,
"description": "The minimum number of reads assigned to a transcript for it to be considered in differential transcript usage analysis.",
"help_text": "Filter out transcripts that do not meet this minimum level of expression, reducing noise."
},
"min_samps_gene_expr": {
"type": "integer",
"title": "Minimum samples with gene expression",
"description": "Set the minimum number of samples in which a gene is expressed to be included in the differential transcript usage analysis.",
"default": 3,
"help_text": "A gene must be expressed in at least this number of samples for the gene to be included in the differential transcript usage analysis. Filtering at the gene level improves the reliability of the observed transcript ratios."
},
"min_samps_feature_expr": {
"type": "integer",
"title": "Minimum samples with feature expression",
"default": 1,
"description": "Set the minimum number of samples in which a transcript is expressed to be included in the differential transcript usage analysis.",
"help_text": "A transcript must be expressed in at least this minimum number of samples to be included in the analysis. Should be equal to the number of replicates per sample you have."
}
}
},
"advanced_options": {
"title": "Advanced Options",
"type": "object",
"description": "Advanced options for configuring processes inside the workflow.",
"properties": {
"threads": {
"type": "integer",
"default": 4,
"description": "Number of CPU threads.",
"help_text": "Only provided to processes including alignment and assembly that benefit from multiple threads."
},
"cdna_kit": {
"type": "string",
"title": "Kit used for cDNA synthesis.",
"enum": [
"SQK-PCS109",
"SQK-PCS110",
"SQK-PCS111",
"SQK-PCS114",
"SQK-PCB111",
"SQK-PCB114",
"SQK-LSK114"
],
"description": "If cDNA reads are used, select the kit used.",
"default": "SQK-PCS109",
"help_text": "This will be used by pychopper to preprocess the reads for downstream analysis."
},
"pychopper_backend": {
"type": "string",
"enum": [
"edlib",
"phmm"
],
"title": "Pychopper backend",
"description": "Pychopper can use one of two available backends for identifying primers in the raw reads",
"default": "edlib",
"help_text": "'edlib' is set by default due to its high performance. However, it may be less sensitive than 'phmm'."
},
"pychopper_opts": {
"type": "string",
"title": "Pychopper options",
"description": "Extra pychopper opts",
"help_text": "See available options [here](https://github.com/epi2me-labs/pychopper#usage)."
},
"bundle_min_reads": {
"type": "integer",
"default": 50000,
"title": "Bundle minimum reads",
"description": "Minimum size of bam bundle for parallel processing."
},
"isoform_table_nrows": {
"type": "integer",
"title": "Isoform table number of rows",
"description": "Maximum rows to display in the isoform report table.",
"default": 5000
}
}
},
"miscellaneous_options": {
"title": "Miscellaneous Options",
"type": "object",
"description": "Everything else.",
"default": "",
"properties": {
"help": {
"type": "boolean",
"default": false,
"description": "Display help text.",
"fa_icon": "fas fa-question-circle",
"hidden": true
},
"disable_ping": {
"type": "boolean",
"default": false,
"description": "Enable to prevent sending a workflow ping."
},
"version": {
"type": "boolean",
"default": false,
"description": "Display version and exit.",
"hidden": true
}
}
}
},
"allOf": [
{
"$ref": "#/definitions/input_options"
},
{
"$ref": "#/definitions/output_options"
},
{
"$ref": "#/definitions/sample_options"
},
{
"$ref": "#/definitions/options_for_reference_based_workflow"
},
{
"$ref": "#/definitions/differential_expression_options"
},
{
"$ref": "#/definitions/advanced_options"
},
{
"$ref": "#/definitions/miscellaneous_options"
}
],
"properties": {
"aws_image_prefix": {
"type": "string",
"hidden": true
},
"aws_queue": {
"type": "string",
"hidden": true
},
"monochrome_logs": {
"type": "boolean"
},
"validate_params": {
"type": "boolean",
"default": true
},
"show_hidden_params": {
"type": "boolean"
}
},
"resources": {
"recommended": {
"cpus": 16,
"memory": "32GB"
},
"minimum": {
"cpus": 8,
"memory": "32GB"
},
"run_time": "15 minutes per sample, with 1 million reads and recommended resources.",
"arm_support": false
}
}