From 2ac33ef57d308e783a344c95b050bb62d9f40878 Mon Sep 17 00:00:00 2001 From: Kevin De Pelseneer Date: Fri, 8 Nov 2024 10:30:41 +0100 Subject: [PATCH 01/13] Add Array Express as repository --- mars-cli/mars_lib/target_repo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mars-cli/mars_lib/target_repo.py b/mars-cli/mars_lib/target_repo.py index dd648ec..4902616 100644 --- a/mars-cli/mars_lib/target_repo.py +++ b/mars-cli/mars_lib/target_repo.py @@ -13,6 +13,7 @@ class TargetRepository(str, Enum): METABOLIGHTS = "metabolights" BIOSAMPLES = "biosamples" EVA = "eva" + ARRAYEXPRESS = "arrayexpress" @classmethod def available_repositories(cls): From 1ec1344f52e333437f9b913cac8f47e13a3a0ce0 Mon Sep 17 00:00:00 2001 From: Kevin De Pelseneer Date: Fri, 8 Nov 2024 11:07:20 +0100 Subject: [PATCH 02/13] Add functionality to map files to repositories according to the ISA-JSON --- mars-cli/mars_lib/isa_json.py | 19 ++++++++++- mars-cli/tests/test_isa_json.py | 56 +++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 1 deletion(-) diff --git a/mars-cli/mars_lib/isa_json.py b/mars-cli/mars_lib/isa_json.py index e68fdad..238d596 100644 --- a/mars-cli/mars_lib/isa_json.py +++ b/mars-cli/mars_lib/isa_json.py @@ -1,5 +1,5 @@ import json -from typing import Union, List, Any, Tuple, Optional +from typing import Union, List, Any, Tuple, Optional, Dict from mars_lib.models.isa_json import ( Investigation, Assay, @@ -420,3 +420,20 @@ def update_isa_json(isa_json: IsaJson, repo_response: RepositoryResponse) -> Isa isa_json.investigation = investigation return isa_json + + +def map_data_files_to_repositories( + files: List[str], isa_json: IsaJson +) -> Dict[str, List[str]]: + # Note: This works well in + df_map: Dict[str, List[str]] = {} + assays: List[Assay] = [ + assay for study in isa_json.investigation.studies for assay in study.assays + ] + + for assay in assays: + target_repo_comment: Comment = detect_target_repo_comment(assay.comments) + assay_data_files = [df.name for df in assay.dataFiles] + df_map[target_repo_comment.value] = assay_data_files + + return df_map diff --git a/mars-cli/tests/test_isa_json.py b/mars-cli/tests/test_isa_json.py index c4fd0d8..02355dc 100644 --- a/mars-cli/tests/test_isa_json.py +++ b/mars-cli/tests/test_isa_json.py @@ -1,9 +1,13 @@ import re +from wsgiref.validate import assert_ + +from black import assert_equivalent from mars_lib.isa_json import ( reduce_isa_json_for_target_repo, load_isa_json, update_isa_json, + map_data_files_to_repositories, ) from mars_lib.target_repo import TargetRepository, TARGET_REPO_KEY import pytest @@ -254,3 +258,55 @@ def test_filename_validation(): assert Investigation.model_validate({"@id": "4", "filename": "i_Good_File_Name"}) assert Study.model_validate({"@id": "5", "filename": "s_Good_File_Name"}) assert Assay.model_validate({"@id": "6", "filename": "a_Good_File_Name"}) + + +def test_map_data_files_to_repositories(): + isa_json = load_isa_json( + file_path="../test-data/ISA-BH2024-ALL/isa-bh2024-all.json", + investigation_is_root=True, + ) + data_files = [ + "cnv-seq-data-0.fastq", + "cnv-seq-data-1.fastq", + "cnv-seq-data-2.fastq", + "cnv-seq-data-3.fastq", + "cnv-seq-derived-data-0.vcf", + "cnv-seq-derived-data-1.vcf", + "cnv-seq-derived-data-2.vcf", + "cnv-seq-derived-data-3.vcf", + "metpro-analysis.txt", + "ms-data-metpro--1.mzml", + "ms-data-metpro--2.mzml", + "ms-data-metpro--3.mzml", + "ms-data-metpro--4.mzml", + "rna-seq-data-0.fastq", + "rna-seq-data-1.fastq", + "rna-seq-data-2.fastq", + "rna-seq-data-3.fastq", + "rna-seq-DEA.txt", + ] + + check_map = dict( + { + "metabolights": [ + "metpro-analysis.txt", + "ms-data-metpro--1.mzml", + "ms-data-metpro--2.mzml", + "ms-data-metpro--3.mzml", + "ms-data-metpro--4.mzml", + ], + "arrayexpress": [ + "rna-seq-data-0.fastq", + "rna-seq-data-1.fastq", + "rna-seq-data-2.fastq", + "rna-seq-data-3.fastq", + ], + "eva": [ + "cnv-seq-data-0.fastq", + "cnv-seq-data-1.fastq", + "cnv-seq-data-2.fastq", + "cnv-seq-data-3.fastq", + ], + } + ) + assert check_map == map_data_files_to_repositories(data_files, isa_json) From 6f9278f5997eee8ba021d65bb8aae84887a0d7ff Mon Sep 17 00:00:00 2001 From: Kevin De Pelseneer Date: Fri, 8 Nov 2024 12:09:58 +0100 Subject: [PATCH 03/13] Add error when there is a mismatch --- mars-cli/mars_lib/isa_json.py | 13 ++++++++++++- mars-cli/tests/test_isa_json.py | 26 ++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/mars-cli/mars_lib/isa_json.py b/mars-cli/mars_lib/isa_json.py index 238d596..d5347ce 100644 --- a/mars-cli/mars_lib/isa_json.py +++ b/mars-cli/mars_lib/isa_json.py @@ -434,6 +434,17 @@ def map_data_files_to_repositories( for assay in assays: target_repo_comment: Comment = detect_target_repo_comment(assay.comments) assay_data_files = [df.name for df in assay.dataFiles] - df_map[target_repo_comment.value] = assay_data_files + for adf in assay_data_files: + if adf not in files: + raise ValueError( + f"""Assay for repository '{target_repo_comment.value}' has encountered a mismatch while mapping the data files to the ISA-JSON. + Data File '{adf}' is missing in the data files passed in the command: + {files} + Please correct the mismatch!""" + ) + + df_map[target_repo_comment.value] = [ + file for file in files if file in assay_data_files + ] return df_map diff --git a/mars-cli/tests/test_isa_json.py b/mars-cli/tests/test_isa_json.py index 02355dc..eca899d 100644 --- a/mars-cli/tests/test_isa_json.py +++ b/mars-cli/tests/test_isa_json.py @@ -309,4 +309,30 @@ def test_map_data_files_to_repositories(): ], } ) + print( + f"\n\ndata files map: {map_data_files_to_repositories(data_files, isa_json)}\n\n" + ) assert check_map == map_data_files_to_repositories(data_files, isa_json) + + bad_match_files = data_files = [ + "cnv-seq-data-0.fastq", + "cnv-seq-data-1.fastq", + "cnv-seq-data-2.fastq", + "cnv-seq-data-3.fastq", + "cnv-seq-derived-data-0.vcf", + "cnv-seq-derived-data-1.vcf", + "cnv-seq-derived-data-2.vcf", + "cnv-seq-derived-data-3.vcf", + "ms-data-metpro--1.mzml", + "ms-data-metpro--2.mzml", + "ms-data-metpro--3.mzml", + "ms-data-metpro--4.mzml", + "rna-seq-data-0.fastq", + "rna-seq-data-1.fastq", + "rna-seq-data-2.fastq", + "rna-seq-data-3.fastq", + "rna-seq-DEA.txt", + ] + + with pytest.raises(ValueError, match=r"Assay for repository 'metabolights' has encountered"): + map_data_files_to_repositories(bad_match_files, isa_json) From aafc35aaaab762f0568038039c7a2a32001ae5ad Mon Sep 17 00:00:00 2001 From: Kevin De Pelseneer Date: Fri, 8 Nov 2024 12:32:56 +0100 Subject: [PATCH 04/13] Pass full file names instead of short names --- mars-cli/mars_lib/isa_json.py | 7 ++- mars-cli/tests/test_isa_json.py | 100 ++++++++++++++++---------------- 2 files changed, 56 insertions(+), 51 deletions(-) diff --git a/mars-cli/mars_lib/isa_json.py b/mars-cli/mars_lib/isa_json.py index d5347ce..c317acb 100644 --- a/mars-cli/mars_lib/isa_json.py +++ b/mars-cli/mars_lib/isa_json.py @@ -431,11 +431,12 @@ def map_data_files_to_repositories( assay for study in isa_json.investigation.studies for assay in study.assays ] + files_dicts = [{"full_name": f, "short_name": f.split("/")[-1]} for f in files] for assay in assays: target_repo_comment: Comment = detect_target_repo_comment(assay.comments) assay_data_files = [df.name for df in assay.dataFiles] for adf in assay_data_files: - if adf not in files: + if adf not in [fd["short_name"] for fd in files_dicts]: raise ValueError( f"""Assay for repository '{target_repo_comment.value}' has encountered a mismatch while mapping the data files to the ISA-JSON. Data File '{adf}' is missing in the data files passed in the command: @@ -444,7 +445,9 @@ def map_data_files_to_repositories( ) df_map[target_repo_comment.value] = [ - file for file in files if file in assay_data_files + fd["full_name"] + for fd in files_dicts + if fd["short_name"] in assay_data_files ] return df_map diff --git a/mars-cli/tests/test_isa_json.py b/mars-cli/tests/test_isa_json.py index eca899d..4d71214 100644 --- a/mars-cli/tests/test_isa_json.py +++ b/mars-cli/tests/test_isa_json.py @@ -266,46 +266,46 @@ def test_map_data_files_to_repositories(): investigation_is_root=True, ) data_files = [ - "cnv-seq-data-0.fastq", - "cnv-seq-data-1.fastq", - "cnv-seq-data-2.fastq", - "cnv-seq-data-3.fastq", - "cnv-seq-derived-data-0.vcf", - "cnv-seq-derived-data-1.vcf", - "cnv-seq-derived-data-2.vcf", - "cnv-seq-derived-data-3.vcf", - "metpro-analysis.txt", - "ms-data-metpro--1.mzml", - "ms-data-metpro--2.mzml", - "ms-data-metpro--3.mzml", - "ms-data-metpro--4.mzml", - "rna-seq-data-0.fastq", - "rna-seq-data-1.fastq", - "rna-seq-data-2.fastq", - "rna-seq-data-3.fastq", - "rna-seq-DEA.txt", + "../test-data/ISA-BH2024-ALL/cnv-seq-data-0.fastq", + "../test-data/ISA-BH2024-ALL/cnv-seq-data-1.fastq", + "../test-data/ISA-BH2024-ALL/cnv-seq-data-2.fastq", + "../test-data/ISA-BH2024-ALL/cnv-seq-data-3.fastq", + "../test-data/ISA-BH2024-ALL/cnv-seq-derived-data-0.vcf", + "../test-data/ISA-BH2024-ALL/cnv-seq-derived-data-1.vcf", + "../test-data/ISA-BH2024-ALL/cnv-seq-derived-data-2.vcf", + "../test-data/ISA-BH2024-ALL/cnv-seq-derived-data-3.vcf", + "../test-data/ISA-BH2024-ALL/metpro-analysis.txt", + "../test-data/ISA-BH2024-ALL/ms-data-metpro--1.mzml", + "../test-data/ISA-BH2024-ALL/ms-data-metpro--2.mzml", + "../test-data/ISA-BH2024-ALL/ms-data-metpro--3.mzml", + "../test-data/ISA-BH2024-ALL/ms-data-metpro--4.mzml", + "../test-data/ISA-BH2024-ALL/rna-seq-data-0.fastq", + "../test-data/ISA-BH2024-ALL/rna-seq-data-1.fastq", + "../test-data/ISA-BH2024-ALL/rna-seq-data-2.fastq", + "../test-data/ISA-BH2024-ALL/rna-seq-data-3.fastq", + "../test-data/ISA-BH2024-ALL/rna-seq-DEA.txt", ] check_map = dict( { "metabolights": [ - "metpro-analysis.txt", - "ms-data-metpro--1.mzml", - "ms-data-metpro--2.mzml", - "ms-data-metpro--3.mzml", - "ms-data-metpro--4.mzml", + "../test-data/ISA-BH2024-ALL/metpro-analysis.txt", + "../test-data/ISA-BH2024-ALL/ms-data-metpro--1.mzml", + "../test-data/ISA-BH2024-ALL/ms-data-metpro--2.mzml", + "../test-data/ISA-BH2024-ALL/ms-data-metpro--3.mzml", + "../test-data/ISA-BH2024-ALL/ms-data-metpro--4.mzml", ], "arrayexpress": [ - "rna-seq-data-0.fastq", - "rna-seq-data-1.fastq", - "rna-seq-data-2.fastq", - "rna-seq-data-3.fastq", + "../test-data/ISA-BH2024-ALL/rna-seq-data-0.fastq", + "../test-data/ISA-BH2024-ALL/rna-seq-data-1.fastq", + "../test-data/ISA-BH2024-ALL/rna-seq-data-2.fastq", + "../test-data/ISA-BH2024-ALL/rna-seq-data-3.fastq", ], "eva": [ - "cnv-seq-data-0.fastq", - "cnv-seq-data-1.fastq", - "cnv-seq-data-2.fastq", - "cnv-seq-data-3.fastq", + "../test-data/ISA-BH2024-ALL/cnv-seq-data-0.fastq", + "../test-data/ISA-BH2024-ALL/cnv-seq-data-1.fastq", + "../test-data/ISA-BH2024-ALL/cnv-seq-data-2.fastq", + "../test-data/ISA-BH2024-ALL/cnv-seq-data-3.fastq", ], } ) @@ -315,24 +315,26 @@ def test_map_data_files_to_repositories(): assert check_map == map_data_files_to_repositories(data_files, isa_json) bad_match_files = data_files = [ - "cnv-seq-data-0.fastq", - "cnv-seq-data-1.fastq", - "cnv-seq-data-2.fastq", - "cnv-seq-data-3.fastq", - "cnv-seq-derived-data-0.vcf", - "cnv-seq-derived-data-1.vcf", - "cnv-seq-derived-data-2.vcf", - "cnv-seq-derived-data-3.vcf", - "ms-data-metpro--1.mzml", - "ms-data-metpro--2.mzml", - "ms-data-metpro--3.mzml", - "ms-data-metpro--4.mzml", - "rna-seq-data-0.fastq", - "rna-seq-data-1.fastq", - "rna-seq-data-2.fastq", - "rna-seq-data-3.fastq", - "rna-seq-DEA.txt", + "../test-data/ISA-BH2024-ALL/cnv-seq-data-0.fastq", + "../test-data/ISA-BH2024-ALL/cnv-seq-data-1.fastq", + "../test-data/ISA-BH2024-ALL/cnv-seq-data-2.fastq", + "../test-data/ISA-BH2024-ALL/cnv-seq-data-3.fastq", + "../test-data/ISA-BH2024-ALL/cnv-seq-derived-data-0.vcf", + "../test-data/ISA-BH2024-ALL/cnv-seq-derived-data-1.vcf", + "../test-data/ISA-BH2024-ALL/cnv-seq-derived-data-2.vcf", + "../test-data/ISA-BH2024-ALL/cnv-seq-derived-data-3.vcf", + "../test-data/ISA-BH2024-ALL/ms-data-metpro--1.mzml", + "../test-data/ISA-BH2024-ALL/ms-data-metpro--2.mzml", + "../test-data/ISA-BH2024-ALL/ms-data-metpro--3.mzml", + "../test-data/ISA-BH2024-ALL/ms-data-metpro--4.mzml", + "../test-data/ISA-BH2024-ALL/rna-seq-data-0.fastq", + "../test-data/ISA-BH2024-ALL/rna-seq-data-1.fastq", + "../test-data/ISA-BH2024-ALL/rna-seq-data-2.fastq", + "../test-data/ISA-BH2024-ALL/rna-seq-data-3.fastq", + "../test-data/ISA-BH2024-ALL/rna-seq-DEA.txt", ] - with pytest.raises(ValueError, match=r"Assay for repository 'metabolights' has encountered"): + with pytest.raises( + ValueError, match=r"Assay for repository 'metabolights' has encountered" + ): map_data_files_to_repositories(bad_match_files, isa_json) From dd5dcab87355ecb8b3a54df43603a986522b9f0b Mon Sep 17 00:00:00 2001 From: Kevin De Pelseneer Date: Fri, 8 Nov 2024 12:43:40 +0100 Subject: [PATCH 05/13] Add data file map to submit function --- mars-cli/mars_lib/submit.py | 14 ++++++++++---- mars-cli/tests/test_isa_json.py | 8 +++----- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/mars-cli/mars_lib/submit.py b/mars-cli/mars_lib/submit.py index d70658e..3dd079f 100644 --- a/mars-cli/mars_lib/submit.py +++ b/mars-cli/mars_lib/submit.py @@ -5,7 +5,7 @@ import time import requests import json -from typing import Any +from typing import Optional, Any from mars_lib.authentication import get_metabolights_auth_token, get_webin_auth_token from mars_lib.biosamples_external_references import ( get_header, @@ -19,6 +19,7 @@ load_isa_json, reduce_isa_json_for_target_repo, update_isa_json, + map_data_files_to_repositories, ) from mars_lib.models.isa_json import Comment, IsaJson from mars_lib.models.repository_response import RepositoryResponse @@ -52,7 +53,7 @@ def submission( urls: dict[str, Any], file_transfer: str, output: str, - data_file_paths=None, + data_file_paths: Optional[List[TextIOWrapper]] = None, ) -> None: # If credential manager info found: # Get password from the credential manager @@ -80,6 +81,11 @@ def submission( f"ISA JSON with investigation '{isa_json.investigation.title}' is valid." ) + # create data file map + data_file_map = map_data_files_to_repositories( + files=[str(dfp) for dfp in data_file_paths], isa_json=isa_json + ) + time_stamp = datetime.timestamp(datetime.now()) if DEBUG: @@ -115,7 +121,7 @@ def submission( # Step 1 : upload data if file paths are provided if data_file_paths and file_transfer: upload_to_ena( - file_paths=data_file_paths, + file_paths=[Path(df) for df in data_file_map[TargetRepository.ENA]], user_credentials=user_credentials, submission_url=urls["ENA"]["DATA-SUBMISSION"], file_transfer=file_transfer, @@ -146,7 +152,7 @@ def submission( if TargetRepository.METABOLIGHTS in target_repositories: # Submit to MetaboLights metabolights_result = upload_to_metabolights( - file_paths=data_file_paths, + file_paths=data_file_map[TargetRepository.ENA], file_transfer=file_transfer, isa_json=isa_json, metabolights_credentials=user_credentials, diff --git a/mars-cli/tests/test_isa_json.py b/mars-cli/tests/test_isa_json.py index 4d71214..5b8e6fc 100644 --- a/mars-cli/tests/test_isa_json.py +++ b/mars-cli/tests/test_isa_json.py @@ -1,7 +1,5 @@ import re -from wsgiref.validate import assert_ -from black import assert_equivalent from mars_lib.isa_json import ( reduce_isa_json_for_target_repo, @@ -244,13 +242,13 @@ def test_update_study_materials_with_accession_categories(): def test_filename_validation(): # ISA should have a filename that starts with 'x_' - with pytest.raises(ValidationError, match=f"'filename' should start with 'i_'"): + with pytest.raises(ValidationError, match="'filename' should start with 'i_'"): Investigation.model_validate({"@id": "1", "filename": "bad filename"}) - with pytest.raises(ValidationError, match=f"'filename' should start with 's_'"): + with pytest.raises(ValidationError, match="'filename' should start with 's_'"): Study.model_validate({"@id": "2", "filename": "bad filename"}) - with pytest.raises(ValidationError, match=f"'filename' should start with 'a_'"): + with pytest.raises(ValidationError, match="'filename' should start with 'a_'"): Assay.model_validate({"@id": "3", "filename": "bad filename"}) assert re.match(r"^i_", "i_Good_file_name") From 4c786e88e27e8dc388f779e26440ec1f7c21c9ba Mon Sep 17 00:00:00 2001 From: Kevin De Pelseneer Date: Fri, 8 Nov 2024 13:41:47 +0100 Subject: [PATCH 06/13] Add guard clause to keep MyPy happy --- mars-cli/mars_lib/isa_json.py | 6 ++++++ mars-cli/mars_lib/submit.py | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/mars-cli/mars_lib/isa_json.py b/mars-cli/mars_lib/isa_json.py index c317acb..4a8b024 100644 --- a/mars-cli/mars_lib/isa_json.py +++ b/mars-cli/mars_lib/isa_json.py @@ -434,6 +434,12 @@ def map_data_files_to_repositories( files_dicts = [{"full_name": f, "short_name": f.split("/")[-1]} for f in files] for assay in assays: target_repo_comment: Comment = detect_target_repo_comment(assay.comments) + # This is an effect of everything being optional in the Comment model. + # Should we decide to make the value mandatory, this guard clause would not be necessary anymore. + if target_repo_comment.value is None: + raise ValueError( + f"At least one assay in the ISA-JSON has no '{TARGET_REPO_KEY}' comment. Mapping not possible. Make sure all assays in the ISA-JSON have this comment!" + ) assay_data_files = [df.name for df in assay.dataFiles] for adf in assay_data_files: if adf not in [fd["short_name"] for fd in files_dicts]: diff --git a/mars-cli/mars_lib/submit.py b/mars-cli/mars_lib/submit.py index 3dd079f..e26560a 100644 --- a/mars-cli/mars_lib/submit.py +++ b/mars-cli/mars_lib/submit.py @@ -5,7 +5,7 @@ import time import requests import json -from typing import Optional, Any +from typing import Any from mars_lib.authentication import get_metabolights_auth_token, get_webin_auth_token from mars_lib.biosamples_external_references import ( get_header, @@ -53,7 +53,7 @@ def submission( urls: dict[str, Any], file_transfer: str, output: str, - data_file_paths: Optional[List[TextIOWrapper]] = None, + data_file_paths: List[TextIOWrapper] = [], ) -> None: # If credential manager info found: # Get password from the credential manager From a44af34d8fa809820c59129294ac60d334dd5f54 Mon Sep 17 00:00:00 2001 From: Kevin De Pelseneer Date: Fri, 8 Nov 2024 13:45:51 +0100 Subject: [PATCH 07/13] Linting --- mars-cli/mars_cli.py | 1 - mars-cli/tests/test_biosample_external_references.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/mars-cli/mars_cli.py b/mars-cli/mars_cli.py index 71ae6e1..09c635f 100644 --- a/mars-cli/mars_cli.py +++ b/mars-cli/mars_cli.py @@ -10,7 +10,6 @@ from mars_lib.logging import print_and_log from mars_lib.validation import validate, CustomValidationException from logging.handlers import RotatingFileHandler -from pydantic import ValidationError import requests import sys import os diff --git a/mars-cli/tests/test_biosample_external_references.py b/mars-cli/tests/test_biosample_external_references.py index 494f986..7d120bb 100644 --- a/mars-cli/tests/test_biosample_external_references.py +++ b/mars-cli/tests/test_biosample_external_references.py @@ -67,7 +67,7 @@ def test_validate_bs_accession(): validate_bs_accession(invalid_accession) valid_accession = "SAMEA112654119" - assert validate_bs_accession(valid_accession) != ValueError + assert validate_bs_accession(valid_accession) is not ValueError def test_validate_json_against_schema(): From 28168f8cae3a9308763da0b03b3fcc4abfbcfb6b Mon Sep 17 00:00:00 2001 From: Kevin De Pelseneer Date: Fri, 8 Nov 2024 15:05:39 +0100 Subject: [PATCH 08/13] Add warning for unmapped data file --- mars-cli/mars_lib/isa_json.py | 18 +++++++++++++ mars-cli/tests/test_isa_json.py | 47 +++++++++++---------------------- 2 files changed, 34 insertions(+), 31 deletions(-) diff --git a/mars-cli/mars_lib/isa_json.py b/mars-cli/mars_lib/isa_json.py index 4a8b024..e28ff4a 100644 --- a/mars-cli/mars_lib/isa_json.py +++ b/mars-cli/mars_lib/isa_json.py @@ -1,5 +1,7 @@ import json from typing import Union, List, Any, Tuple, Optional, Dict + +from mars_lib.logging import print_and_log from mars_lib.models.isa_json import ( Investigation, Assay, @@ -432,6 +434,7 @@ def map_data_files_to_repositories( ] files_dicts = [{"full_name": f, "short_name": f.split("/")[-1]} for f in files] + remaining_files = files_dicts.copy() for assay in assays: target_repo_comment: Comment = detect_target_repo_comment(assay.comments) # This is an effect of everything being optional in the Comment model. @@ -441,6 +444,9 @@ def map_data_files_to_repositories( f"At least one assay in the ISA-JSON has no '{TARGET_REPO_KEY}' comment. Mapping not possible. Make sure all assays in the ISA-JSON have this comment!" ) assay_data_files = [df.name for df in assay.dataFiles] + + # Check if the files in the ISA-JSON are present in the command + # If not, raise an error for adf in assay_data_files: if adf not in [fd["short_name"] for fd in files_dicts]: raise ValueError( @@ -449,6 +455,10 @@ def map_data_files_to_repositories( {files} Please correct the mismatch!""" ) + else: + remaining_files.remove( + next(fd for fd in files_dicts if fd["short_name"] == adf) + ) df_map[target_repo_comment.value] = [ fd["full_name"] @@ -456,4 +466,12 @@ def map_data_files_to_repositories( if fd["short_name"] in assay_data_files ] + [ + print_and_log( + msg=f"File '{rf['short_name']}' could not be mapped to any data file in the ISA-JSON. For this reason, it will be skipped during submission!", + level="warning", + ) + for rf in remaining_files + ] + return df_map diff --git a/mars-cli/tests/test_isa_json.py b/mars-cli/tests/test_isa_json.py index 5b8e6fc..a8f152b 100644 --- a/mars-cli/tests/test_isa_json.py +++ b/mars-cli/tests/test_isa_json.py @@ -263,15 +263,11 @@ def test_map_data_files_to_repositories(): file_path="../test-data/ISA-BH2024-ALL/isa-bh2024-all.json", investigation_is_root=True, ) - data_files = [ + exact_match_files = [ "../test-data/ISA-BH2024-ALL/cnv-seq-data-0.fastq", "../test-data/ISA-BH2024-ALL/cnv-seq-data-1.fastq", "../test-data/ISA-BH2024-ALL/cnv-seq-data-2.fastq", "../test-data/ISA-BH2024-ALL/cnv-seq-data-3.fastq", - "../test-data/ISA-BH2024-ALL/cnv-seq-derived-data-0.vcf", - "../test-data/ISA-BH2024-ALL/cnv-seq-derived-data-1.vcf", - "../test-data/ISA-BH2024-ALL/cnv-seq-derived-data-2.vcf", - "../test-data/ISA-BH2024-ALL/cnv-seq-derived-data-3.vcf", "../test-data/ISA-BH2024-ALL/metpro-analysis.txt", "../test-data/ISA-BH2024-ALL/ms-data-metpro--1.mzml", "../test-data/ISA-BH2024-ALL/ms-data-metpro--2.mzml", @@ -281,7 +277,6 @@ def test_map_data_files_to_repositories(): "../test-data/ISA-BH2024-ALL/rna-seq-data-1.fastq", "../test-data/ISA-BH2024-ALL/rna-seq-data-2.fastq", "../test-data/ISA-BH2024-ALL/rna-seq-data-3.fastq", - "../test-data/ISA-BH2024-ALL/rna-seq-DEA.txt", ] check_map = dict( @@ -307,32 +302,22 @@ def test_map_data_files_to_repositories(): ], } ) - print( - f"\n\ndata files map: {map_data_files_to_repositories(data_files, isa_json)}\n\n" - ) - assert check_map == map_data_files_to_repositories(data_files, isa_json) + assert check_map == map_data_files_to_repositories(exact_match_files, isa_json) - bad_match_files = data_files = [ - "../test-data/ISA-BH2024-ALL/cnv-seq-data-0.fastq", - "../test-data/ISA-BH2024-ALL/cnv-seq-data-1.fastq", - "../test-data/ISA-BH2024-ALL/cnv-seq-data-2.fastq", - "../test-data/ISA-BH2024-ALL/cnv-seq-data-3.fastq", - "../test-data/ISA-BH2024-ALL/cnv-seq-derived-data-0.vcf", - "../test-data/ISA-BH2024-ALL/cnv-seq-derived-data-1.vcf", - "../test-data/ISA-BH2024-ALL/cnv-seq-derived-data-2.vcf", - "../test-data/ISA-BH2024-ALL/cnv-seq-derived-data-3.vcf", - "../test-data/ISA-BH2024-ALL/ms-data-metpro--1.mzml", - "../test-data/ISA-BH2024-ALL/ms-data-metpro--2.mzml", - "../test-data/ISA-BH2024-ALL/ms-data-metpro--3.mzml", - "../test-data/ISA-BH2024-ALL/ms-data-metpro--4.mzml", - "../test-data/ISA-BH2024-ALL/rna-seq-data-0.fastq", - "../test-data/ISA-BH2024-ALL/rna-seq-data-1.fastq", - "../test-data/ISA-BH2024-ALL/rna-seq-data-2.fastq", - "../test-data/ISA-BH2024-ALL/rna-seq-data-3.fastq", - "../test-data/ISA-BH2024-ALL/rna-seq-DEA.txt", - ] + not_enough_files = exact_match_files[:-1] with pytest.raises( - ValueError, match=r"Assay for repository 'metabolights' has encountered" + ValueError, + match=rf"Assay for repository '{TargetRepository.ARRAYEXPRESS}' has encountered", ): - map_data_files_to_repositories(bad_match_files, isa_json) + map_data_files_to_repositories(not_enough_files, isa_json) + + too_many_files = exact_match_files + one_too_many = "../test-data/ISA-BH2024-ALL/one-too-many.fastq" + too_many_files.append(one_too_many) + + result_maps = map_data_files_to_repositories(too_many_files, isa_json) + + assert one_too_many not in [ + value for key, value_list in result_maps.items() for value in value_list + ] From 5cbfdc855129f43a14e774dfb6271b71c2b62116 Mon Sep 17 00:00:00 2001 From: Kevin De Pelseneer Date: Fri, 8 Nov 2024 16:12:25 +0100 Subject: [PATCH 09/13] Fix enum problem Since python 3.11 'only-string' enums have to be called StrEnums or you need to call `.value` on each member. See https://docs.python.org/3/library/enum.html#enum.StrEnum for documentation --- mars-cli/mars_lib/target_repo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mars-cli/mars_lib/target_repo.py b/mars-cli/mars_lib/target_repo.py index 4902616..6d90cd4 100644 --- a/mars-cli/mars_lib/target_repo.py +++ b/mars-cli/mars_lib/target_repo.py @@ -1,10 +1,10 @@ -from enum import Enum +from enum import StrEnum TARGET_REPO_KEY = "target_repository" -class TargetRepository(str, Enum): +class TargetRepository(StrEnum): """ Holds constants, tied to the target repositories. """ @@ -17,4 +17,4 @@ class TargetRepository(str, Enum): @classmethod def available_repositories(cls): - return {item.value for item in cls} + return {item for item in cls} From 5ea2bd849d9af5219504d884cd3978883d04e41c Mon Sep 17 00:00:00 2001 From: Kevin De Pelseneer Date: Fri, 8 Nov 2024 16:23:27 +0100 Subject: [PATCH 10/13] Revert "Fix enum problem" This reverts commit 5cbfdc855129f43a14e774dfb6271b71c2b62116. --- mars-cli/mars_lib/target_repo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mars-cli/mars_lib/target_repo.py b/mars-cli/mars_lib/target_repo.py index 6d90cd4..4902616 100644 --- a/mars-cli/mars_lib/target_repo.py +++ b/mars-cli/mars_lib/target_repo.py @@ -1,10 +1,10 @@ -from enum import StrEnum +from enum import Enum TARGET_REPO_KEY = "target_repository" -class TargetRepository(StrEnum): +class TargetRepository(str, Enum): """ Holds constants, tied to the target repositories. """ @@ -17,4 +17,4 @@ class TargetRepository(StrEnum): @classmethod def available_repositories(cls): - return {item for item in cls} + return {item.value for item in cls} From 9ee367d850b354e48390e653337fc36d32b51e97 Mon Sep 17 00:00:00 2001 From: Kevin De Pelseneer Date: Fri, 8 Nov 2024 16:39:36 +0100 Subject: [PATCH 11/13] Access enum's value field --- mars-cli/mars_cli.py | 6 +++--- mars-cli/mars_lib/isa_json.py | 2 +- mars-cli/mars_lib/submit.py | 36 ++++++++++++++++++--------------- mars-cli/tests/test_isa_json.py | 12 +++++------ 4 files changed, 30 insertions(+), 26 deletions(-) diff --git a/mars-cli/mars_cli.py b/mars-cli/mars_cli.py index 09c635f..e6596af 100644 --- a/mars-cli/mars_cli.py +++ b/mars-cli/mars_cli.py @@ -263,13 +263,13 @@ def submit( target_repositories = [] if submit_to_biosamples: - target_repositories.append(TargetRepository.BIOSAMPLES) + target_repositories.append(TargetRepository.BIOSAMPLES.value) if submit_to_ena: - target_repositories.append(TargetRepository.ENA) + target_repositories.append(TargetRepository.ENA.value) if submit_to_metabolights: - target_repositories.append(TargetRepository.METABOLIGHTS) + target_repositories.append(TargetRepository.METABOLIGHTS.value) print_and_log( f"Starting submission of the ISA JSON to the target repositories: {', '.join(target_repositories)}." diff --git a/mars-cli/mars_lib/isa_json.py b/mars-cli/mars_lib/isa_json.py index e28ff4a..47239db 100644 --- a/mars-cli/mars_lib/isa_json.py +++ b/mars-cli/mars_lib/isa_json.py @@ -40,7 +40,7 @@ def reduce_isa_json_for_target_repo( new_studies = [] studies = filtered_isa_json.investigation.studies for study in studies: - if target_repo == TargetRepository.BIOSAMPLES: + if target_repo == TargetRepository.BIOSAMPLES.value: filtered_assays = [] else: assays = study.assays diff --git a/mars-cli/mars_lib/submit.py b/mars-cli/mars_lib/submit.py index e26560a..cf86b66 100644 --- a/mars-cli/mars_lib/submit.py +++ b/mars-cli/mars_lib/submit.py @@ -97,7 +97,7 @@ def submission( ): raise ValueError("No target repository selected.") - if TargetRepository.BIOSAMPLES in target_repositories: + if TargetRepository.BIOSAMPLES.value in target_repositories: # Submit to Biosamples biosamples_result = submit_to_biosamples( isa_json=isa_json, @@ -106,7 +106,7 @@ def submission( webin_token_url=urls["WEBIN"]["TOKEN"], ) print_and_log( - f"Submission to {TargetRepository.BIOSAMPLES} was successful. Result:\n{biosamples_result.json()}", + f"Submission to {TargetRepository.BIOSAMPLES.value} was successful. Result:\n{biosamples_result.json()}", level="info", ) # Update `isa_json`, based on the receipt returned @@ -117,16 +117,20 @@ def submission( if DEBUG: save_step_to_file(time_stamp, "1_after_biosamples", isa_json) - if TargetRepository.ENA in target_repositories: + if TargetRepository.ENA.value in target_repositories: # Step 1 : upload data if file paths are provided if data_file_paths and file_transfer: upload_to_ena( - file_paths=[Path(df) for df in data_file_map[TargetRepository.ENA]], + file_paths=[ + Path(df) for df in data_file_map[TargetRepository.ENA.value] + ], user_credentials=user_credentials, submission_url=urls["ENA"]["DATA-SUBMISSION"], file_transfer=file_transfer, ) - print_and_log(f"Start submitting to {TargetRepository.ENA}.", level="debug") + print_and_log( + f"Start submitting to {TargetRepository.ENA.value}.", level="debug" + ) # Step 2 : submit isa-json to ena ena_result = submit_to_ena( @@ -135,11 +139,11 @@ def submission( submission_url=urls["ENA"]["SUBMISSION"], ) print_and_log( - f"Submission to {TargetRepository.ENA} was successful. Result:\n{ena_result.json()}" + f"Submission to {TargetRepository.ENA.value} was successful. Result:\n{ena_result.json()}" ) print_and_log( - f"Update ISA-JSON based on receipt from {TargetRepository.ENA}.", + f"Update ISA-JSON based on receipt from {TargetRepository.ENA.value}.", level="debug", ) ena_mars_receipt = RepositoryResponse.model_validate( @@ -149,10 +153,10 @@ def submission( if DEBUG: save_step_to_file(time_stamp, "2_after_ena", isa_json) - if TargetRepository.METABOLIGHTS in target_repositories: + if TargetRepository.METABOLIGHTS.value in target_repositories: # Submit to MetaboLights metabolights_result = upload_to_metabolights( - file_paths=data_file_map[TargetRepository.ENA], + file_paths=data_file_map[TargetRepository.METABOLIGHTS.value], file_transfer=file_transfer, isa_json=isa_json, metabolights_credentials=user_credentials, @@ -161,7 +165,7 @@ def submission( ) metabolights_receipt_obj = metabolights_result.json() print_and_log( - f"Submission to {TargetRepository.METABOLIGHTS} was successful. Result:\n{metabolights_receipt_obj}", + f"Submission to {TargetRepository.METABOLIGHTS.value} was successful. Result:\n{metabolights_receipt_obj}", level="info", ) metabolights_receipt = RepositoryResponse.model_validate( @@ -177,11 +181,11 @@ def submission( if DEBUG: save_step_to_file(time_stamp, "3_after_metabolights", isa_json) - if TargetRepository.EVA in target_repositories: + if TargetRepository.EVA.value in target_repositories: # Submit to EVA # TODO: Filter out other assays print_and_log( - f"Submission to {TargetRepository.EVA} was successful.", level="info" + f"Submission to {TargetRepository.EVA.value} was successful.", level="info" ) # TODO: Update `isa_json`, based on the receipt returned @@ -207,7 +211,7 @@ def submit_to_biosamples( headers=headers, params=params, json=reduce_isa_json_for_target_repo( - isa_json, TargetRepository.BIOSAMPLES + isa_json, TargetRepository.BIOSAMPLES.value ).model_dump(by_alias=True, exclude_none=True), ) @@ -344,9 +348,9 @@ def submit_to_ena( submission_url, headers=headers, params=params, - json=reduce_isa_json_for_target_repo(isa_json, TargetRepository.ENA).model_dump( - by_alias=True, exclude_none=True - ), + json=reduce_isa_json_for_target_repo( + isa_json, TargetRepository.ENA.value + ).model_dump(by_alias=True, exclude_none=True), ) if result.status_code != 200: diff --git a/mars-cli/tests/test_isa_json.py b/mars-cli/tests/test_isa_json.py index a8f152b..aeab113 100644 --- a/mars-cli/tests/test_isa_json.py +++ b/mars-cli/tests/test_isa_json.py @@ -46,7 +46,7 @@ def test_reduce_isa_json_for_target_repo(): ) filtered_isa_json = reduce_isa_json_for_target_repo( - good_isa_json, TargetRepository.ENA + good_isa_json, TargetRepository.ENA.value ) good_isa_json_study = good_isa_json.investigation.studies[0] @@ -63,7 +63,7 @@ def test_reduce_isa_json_for_biosamples(): ) filtered_isa_json = reduce_isa_json_for_target_repo( - good_isa_json, TargetRepository.BIOSAMPLES + good_isa_json, TargetRepository.BIOSAMPLES.value ) assert len(filtered_isa_json.investigation.studies[0].assays) == 0 @@ -110,7 +110,7 @@ def test_target_repo_comment_validator(): { "@id": "comment_001", "name": f"{TARGET_REPO_KEY}", - "value": TargetRepository.ENA, + "value": TargetRepository.ENA.value, } ], } @@ -134,12 +134,12 @@ def test_target_repo_comment_validator(): { "@id": "comment_003", "name": f"{TARGET_REPO_KEY}", - "value": TargetRepository.ENA, + "value": TargetRepository.ENA.value, }, { "@id": "comment_004", "name": f"{TARGET_REPO_KEY}", - "value": TargetRepository.METABOLIGHTS, + "value": TargetRepository.METABOLIGHTS.value, }, ], } @@ -308,7 +308,7 @@ def test_map_data_files_to_repositories(): with pytest.raises( ValueError, - match=rf"Assay for repository '{TargetRepository.ARRAYEXPRESS}' has encountered", + match=rf"Assay for repository '{TargetRepository.ARRAYEXPRESS.value}' has encountered", ): map_data_files_to_repositories(not_enough_files, isa_json) From d46e2532bad5fc06387b9758ddfd514ea25b2ef0 Mon Sep 17 00:00:00 2001 From: Kevin De Pelseneer <82407142+kdp-cloud@users.noreply.github.com> Date: Tue, 12 Nov 2024 11:14:24 +0100 Subject: [PATCH 12/13] Update mars-cli/tests/test_biosample_external_references.py Co-authored-by: April Shen --- mars-cli/tests/test_biosample_external_references.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mars-cli/tests/test_biosample_external_references.py b/mars-cli/tests/test_biosample_external_references.py index 7d120bb..ab325d8 100644 --- a/mars-cli/tests/test_biosample_external_references.py +++ b/mars-cli/tests/test_biosample_external_references.py @@ -67,7 +67,7 @@ def test_validate_bs_accession(): validate_bs_accession(invalid_accession) valid_accession = "SAMEA112654119" - assert validate_bs_accession(valid_accession) is not ValueError + validate_bs_accession(valid_accession) def test_validate_json_against_schema(): From 5bd6cb4e34f63d8872a9535c21b3b07ee1d864b2 Mon Sep 17 00:00:00 2001 From: Kevin De Pelseneer Date: Tue, 12 Nov 2024 13:06:42 +0100 Subject: [PATCH 13/13] Check duplicated files --- mars-cli/mars_lib/isa_json.py | 6 +++--- mars-cli/tests/test_isa_json.py | 7 ++++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/mars-cli/mars_lib/isa_json.py b/mars-cli/mars_lib/isa_json.py index 47239db..3ab0ad8 100644 --- a/mars-cli/mars_lib/isa_json.py +++ b/mars-cli/mars_lib/isa_json.py @@ -456,9 +456,9 @@ def map_data_files_to_repositories( Please correct the mismatch!""" ) else: - remaining_files.remove( - next(fd for fd in files_dicts if fd["short_name"] == adf) - ) + remaining_files = [ + fd for fd in remaining_files if fd["short_name"] != adf + ] df_map[target_repo_comment.value] = [ fd["full_name"] diff --git a/mars-cli/tests/test_isa_json.py b/mars-cli/tests/test_isa_json.py index aeab113..53ec88e 100644 --- a/mars-cli/tests/test_isa_json.py +++ b/mars-cli/tests/test_isa_json.py @@ -312,7 +312,7 @@ def test_map_data_files_to_repositories(): ): map_data_files_to_repositories(not_enough_files, isa_json) - too_many_files = exact_match_files + too_many_files = exact_match_files.copy() one_too_many = "../test-data/ISA-BH2024-ALL/one-too-many.fastq" too_many_files.append(one_too_many) @@ -321,3 +321,8 @@ def test_map_data_files_to_repositories(): assert one_too_many not in [ value for key, value_list in result_maps.items() for value in value_list ] + + duplicated_files = exact_match_files.copy() + duplicated_files.append(exact_match_files[-1]) + + map_data_files_to_repositories(duplicated_files, isa_json)