Skip to content

Commit

Permalink
Merge pull request #295 from microbiomedata/293-skip-metagenome-sequencing
Browse files Browse the repository at this point in the history

293 skip metagenome sequencing
  • Loading branch information
mbthornton-lbl authored Nov 20, 2024
2 parents 22d9e48 + 4b32131 commit 7213e8b
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 0 deletions.
4 changes: 4 additions & 0 deletions nmdc_automation/workflow_automation/workflow_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ def get_current_workflow_process_nodes(

data_generation_ids = set()
data_generation_workflows = [wf for wf in workflows if wf.collection == "data_generation_set"]

workflow_execution_workflows = [wf for wf in workflows if wf.collection == "workflow_execution_set"]

# default query for data_generation_set records filtered by analyte category
Expand Down Expand Up @@ -129,6 +130,9 @@ def get_current_workflow_process_nodes(

records = db[wf.collection].find(q)
for rec in records:
# legacy JGI sequencing records
if rec.get("type") == "nmdc:MetagenomeSequencing" or rec["name"].startswith("Metagenome Sequencing"):
continue
if wf.version and not _within_range(rec["version"], wf.version):
continue
if _is_missing_required_input_output(wf, rec, data_objects_by_id):
Expand Down
27 changes: 27 additions & 0 deletions tests/fixtures/nmdc_db/legacy_data_generation.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
[
{
"id" : "nmdc:omprc-11-cegmwy02",
"name" : "Terrestrial soil microbial communities - BONA_004-O-20210707-COMP",
"has_input" : [
"nmdc:procsm-11-d8hkca85"
],
"gold_sequencing_project_identifiers" : [
"gold:Gp0704890"
],
"processing_institution" : "JGI",
"type" : "nmdc:NucleotideSequencing",
"insdc_bioproject_identifiers" : [
"bioproject:PRJNA1029072"
],
"analyte_category" : "metagenome",
"associated_studies" : [
"nmdc:sty-11-34xj1150"
],
"instrument_used" : [
"nmdc:inst-14-mr4r2w09"
],
"has_output" : [
"nmdc:dobj-11-hnw52332"
]
}
]
15 changes: 15 additions & 0 deletions tests/fixtures/nmdc_db/legacy_data_obj.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[
{
"id" : "nmdc:dobj-11-hnw52332",
"name" : "52832.3.464616.CTATCGCA-CTATCGCA.fastq.gz",
"description" : "Metagenome Raw Reads for nmdc:omprc-11-cegmwy02",
"alternative_identifiers" : [

],
"file_size_bytes" : 31068664547,
"md5_checksum" : "12f380b91ff3364cd3d228505d3402b5",
"data_object_type" : "Metagenome Raw Reads",
"url" : "https://data.microbiomedata.org/data/nmdc:omprc-11-cegmwy02/nmdc:wfmsa-11-jc5cmf37.1/52832.3.464616.CTATCGCA-CTATCGCA.fastq.gz",
"type" : "nmdc:DataObject"
}
]
19 changes: 19 additions & 0 deletions tests/fixtures/nmdc_db/metagenome_sequencing.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
[
{
"id" : "nmdc:wfmsa-11-jc5cmf37.1",
"name" : "Metagenome Sequencing Activity for nmdc:wfmsa-11-jc5cmf37.1",
"started_at_time" : "2023-09-13T19:57:49.595727+00:00",
"ended_at_time" : "2023-09-13T19:57:49.595743+00:00",
"was_informed_by" : "nmdc:omprc-11-cegmwy02",
"execution_resource" : "JGI",
"git_url" : "https://github.com/microbiomedata/RawSequencingData",
"has_input" : [
"nmdc:procsm-11-d8hkca85"
],
"has_output" : [
"nmdc:dobj-11-hnw52332"
],
"type" : "nmdc:MetagenomeSequencing",
"version" : "v1.0.0"
}
]
22 changes: 22 additions & 0 deletions tests/test_workflow_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,28 @@ def test_load_workflow_process_nodes(test_db, workflow_file, workflows_config_di
assert data_gen.children[0].type == "nmdc:ReadQcAnalysis"


def test_load_workflow_process_nodes_does_not_load_metagenome_sequencing(test_db, workflows_config_dir):
    """
    Legacy nmdc:MetagenomeSequencing records must not come back as process nodes.

    The database is seeded with one data object, one data generation record and
    one legacy metagenome sequencing workflow execution; only the data
    generation is expected in the result.
    """
    expected_id = "nmdc:omprc-11-cegmwy02"
    # (fixture file, target collection) pairs, loaded in this order
    fixtures = (
        ("legacy_data_obj.json", "data_object_set"),
        ("legacy_data_generation.json", "data_generation_set"),
        ("metagenome_sequencing.json", "workflow_execution_set"),
    )

    reset_db(test_db)
    for fixture_file, collection in fixtures:
        load_fixture(test_db, fixture_file, collection)

    workflow_configs = load_workflow_configs(workflows_config_dir / "workflows.yaml")
    required_data_objects = get_required_data_objects_map(test_db, workflow_configs)
    nodes = get_current_workflow_process_nodes(
        test_db, workflow_configs, required_data_objects, allowlist=[expected_id]
    )

    # The data generation is the only node that should have been loaded
    assert nodes
    assert len(nodes) == 1
    node = nodes[0]
    assert node.type == "nmdc:NucleotideSequencing"
    assert node.id == expected_id
    assert node.was_informed_by == expected_id


@mark.parametrize(
"workflow_file", ["workflows.yaml", "workflows-mt.yaml"]
)
Expand Down

0 comments on commit 7213e8b

Please sign in to comment.