elixir-europe · kdp-cloud · Nov 12, 2024 · Nov 8, 2024 · Nov 8, 2024 · Nov 8, 2024
diff --git a/mars-cli/mars_cli.py b/mars-cli/mars_cli.py
@@ -10,7 +10,6 @@
 from mars_lib.logging import print_and_log
 from mars_lib.validation import validate, CustomValidationException
 from logging.handlers import RotatingFileHandler
-from pydantic import ValidationError
 import requests
 import sys
 import os
@@ -264,13 +263,13 @@ def submit(
     target_repositories = []
 
     if submit_to_biosamples:
-        target_repositories.append(TargetRepository.BIOSAMPLES)
+        target_repositories.append(TargetRepository.BIOSAMPLES.value)
 
     if submit_to_ena:
-        target_repositories.append(TargetRepository.ENA)
+        target_repositories.append(TargetRepository.ENA.value)
 
     if submit_to_metabolights:
-        target_repositories.append(TargetRepository.METABOLIGHTS)
+        target_repositories.append(TargetRepository.METABOLIGHTS.value)
 
     print_and_log(
         f"Starting submission of the ISA JSON to the target repositories: {', '.join(target_repositories)}."

diff --git a/mars-cli/mars_lib/isa_json.py b/mars-cli/mars_lib/isa_json.py
@@ -1,5 +1,7 @@
 import json
-from typing import Union, List, Any, Tuple, Optional
+from typing import Union, List, Any, Tuple, Optional, Dict
+
+from mars_lib.logging import print_and_log
 from mars_lib.models.isa_json import (
     Investigation,
     Assay,
@@ -38,7 +40,7 @@ def reduce_isa_json_for_target_repo(
     new_studies = []
     studies = filtered_isa_json.investigation.studies
     for study in studies:
-        if target_repo == TargetRepository.BIOSAMPLES:
+        if target_repo == TargetRepository.BIOSAMPLES.value:
             filtered_assays = []
         else:
             assays = study.assays
@@ -420,3 +422,56 @@ def update_isa_json(isa_json: IsaJson, repo_response: RepositoryResponse) -> Isa
 
     isa_json.investigation = investigation
     return isa_json
+
+
+def map_data_files_to_repositories(
+    files: List[str], isa_json: IsaJson
+) -> Dict[str, List[str]]:
+    # Note: This works well in
+    df_map: Dict[str, List[str]] = {}
+    assays: List[Assay] = [
+        assay for study in isa_json.investigation.studies for assay in study.assays
+    ]
+
+    files_dicts = [{"full_name": f, "short_name": f.split("/")[-1]} for f in files]
+    remaining_files = files_dicts.copy()
+    for assay in assays:
+        target_repo_comment: Comment = detect_target_repo_comment(assay.comments)
+        # This is an effect of everything being optional in the Comment model.
+        # Should we decide to make the value mandatory, this guard clause would not be necessary anymore.
+        if target_repo_comment.value is None:
+            raise ValueError(
+                f"At least one assay in the ISA-JSON has no '{TARGET_REPO_KEY}' comment. Mapping not possible. Make sure all assays in the ISA-JSON have this comment!"
+            )
+        assay_data_files = [df.name for df in assay.dataFiles]
+
+        # Check if the files in the ISA-JSON are present in the command
+        # If not, raise an error
+        for adf in assay_data_files:
+            if adf not in [fd["short_name"] for fd in files_dicts]:
+                raise ValueError(
+                    f"""Assay for repository '{target_repo_comment.value}' has encountered a mismatch while mapping the data files to the ISA-JSON.
+                Data File '{adf}' is missing in the data files passed in the command:
+                {files}
+                Please correct the mismatch!"""
+                )
+            else:
+                remaining_files.remove(
+                    next(fd for fd in files_dicts if fd["short_name"] == adf)
+                )
+
+        df_map[target_repo_comment.value] = [
+            fd["full_name"]
+            for fd in files_dicts
+            if fd["short_name"] in assay_data_files
+        ]
+
+    [
+        print_and_log(
+            msg=f"File '{rf['short_name']}' could not be mapped to any data file in the ISA-JSON. For this reason, it will be skipped during submission!",
+            level="warning",
+        )
+        for rf in remaining_files
+    ]
+
+    return df_map
diff --git a/mars-cli/mars_lib/submit.py b/mars-cli/mars_lib/submit.py
@@ -19,6 +19,7 @@
     load_isa_json,
     reduce_isa_json_for_target_repo,
     update_isa_json,
+    map_data_files_to_repositories,
 )
 from mars_lib.models.isa_json import Comment, IsaJson
 from mars_lib.models.repository_response import RepositoryResponse
@@ -52,7 +53,7 @@ def submission(
     urls: dict[str, Any],
     file_transfer: str,
     output: str,
-    data_file_paths=None,
+    data_file_paths: List[TextIOWrapper] = [],
 ) -> None:
     # If credential manager info found:
     # Get password from the credential manager
@@ -80,6 +81,11 @@ def submission(
         f"ISA JSON with investigation '{isa_json.investigation.title}' is valid."
     )
 
+    # create data file map
+    data_file_map = map_data_files_to_repositories(
+        files=[str(dfp) for dfp in data_file_paths], isa_json=isa_json
+    )
+
     time_stamp = datetime.timestamp(datetime.now())
 
     if DEBUG:
@@ -91,7 +97,7 @@ def submission(
     ):
         raise ValueError("No target repository selected.")
 
-    if TargetRepository.BIOSAMPLES in target_repositories:
+    if TargetRepository.BIOSAMPLES.value in target_repositories:
         # Submit to Biosamples
         biosamples_result = submit_to_biosamples(
             isa_json=isa_json,
@@ -100,7 +106,7 @@ def submission(
             webin_token_url=urls["WEBIN"]["TOKEN"],
         )
         print_and_log(
-            f"Submission to {TargetRepository.BIOSAMPLES} was successful. Result:\n{biosamples_result.json()}",
+            f"Submission to {TargetRepository.BIOSAMPLES.value} was successful. Result:\n{biosamples_result.json()}",
             level="info",
         )
         # Update `isa_json`, based on the receipt returned
@@ -111,16 +117,20 @@ def submission(
         if DEBUG:
             save_step_to_file(time_stamp, "1_after_biosamples", isa_json)
 
-    if TargetRepository.ENA in target_repositories:
+    if TargetRepository.ENA.value in target_repositories:
         # Step 1 : upload data if file paths are provided
         if data_file_paths and file_transfer:
             upload_to_ena(
-                file_paths=data_file_paths,
+                file_paths=[
+                    Path(df) for df in data_file_map[TargetRepository.ENA.value]
+                ],
                 user_credentials=user_credentials,
                 submission_url=urls["ENA"]["DATA-SUBMISSION"],
                 file_transfer=file_transfer,
             )
-        print_and_log(f"Start submitting to {TargetRepository.ENA}.", level="debug")
+        print_and_log(
+            f"Start submitting to {TargetRepository.ENA.value}.", level="debug"
+        )
 
         # Step 2 : submit isa-json to ena
         ena_result = submit_to_ena(
@@ -129,11 +139,11 @@ def submission(
             submission_url=urls["ENA"]["SUBMISSION"],
         )
         print_and_log(
-            f"Submission to {TargetRepository.ENA} was successful. Result:\n{ena_result.json()}"
+            f"Submission to {TargetRepository.ENA.value} was successful. Result:\n{ena_result.json()}"
         )
 
         print_and_log(
-            f"Update ISA-JSON based on receipt from {TargetRepository.ENA}.",
+            f"Update ISA-JSON based on receipt from {TargetRepository.ENA.value}.",
             level="debug",
         )
         ena_mars_receipt = RepositoryResponse.model_validate(
@@ -143,10 +153,10 @@ def submission(
         if DEBUG:
             save_step_to_file(time_stamp, "2_after_ena", isa_json)
 
-    if TargetRepository.METABOLIGHTS in target_repositories:
+    if TargetRepository.METABOLIGHTS.value in target_repositories:
         # Submit to MetaboLights
         metabolights_result = upload_to_metabolights(
-            file_paths=data_file_paths,
+            file_paths=data_file_map[TargetRepository.METABOLIGHTS.value],
             file_transfer=file_transfer,
             isa_json=isa_json,
             metabolights_credentials=user_credentials,
@@ -155,7 +165,7 @@ def submission(
         )
         metabolights_receipt_obj = metabolights_result.json()
         print_and_log(
-            f"Submission to {TargetRepository.METABOLIGHTS} was successful. Result:\n{metabolights_receipt_obj}",
+            f"Submission to {TargetRepository.METABOLIGHTS.value} was successful. Result:\n{metabolights_receipt_obj}",
             level="info",
         )
         metabolights_receipt = RepositoryResponse.model_validate(
@@ -171,11 +181,11 @@ def submission(
         if DEBUG:
             save_step_to_file(time_stamp, "3_after_metabolights", isa_json)
 
-    if TargetRepository.EVA in target_repositories:
+    if TargetRepository.EVA.value in target_repositories:
         # Submit to EVA
         # TODO: Filter out other assays
         print_and_log(
-            f"Submission to {TargetRepository.EVA} was successful.", level="info"
+            f"Submission to {TargetRepository.EVA.value} was successful.", level="info"
         )
         # TODO: Update `isa_json`, based on the receipt returned
 
@@ -201,7 +211,7 @@ def submit_to_biosamples(
         headers=headers,
         params=params,
         json=reduce_isa_json_for_target_repo(
-            isa_json, TargetRepository.BIOSAMPLES
+            isa_json, TargetRepository.BIOSAMPLES.value
         ).model_dump(by_alias=True, exclude_none=True),
     )
 
@@ -338,9 +348,9 @@ def submit_to_ena(
         submission_url,
         headers=headers,
         params=params,
-        json=reduce_isa_json_for_target_repo(isa_json, TargetRepository.ENA).model_dump(
-            by_alias=True, exclude_none=True
-        ),
+        json=reduce_isa_json_for_target_repo(
+            isa_json, TargetRepository.ENA.value
+        ).model_dump(by_alias=True, exclude_none=True),
     )
 
     if result.status_code != 200:

diff --git a/mars-cli/mars_lib/target_repo.py b/mars-cli/mars_lib/target_repo.py
@@ -13,6 +13,7 @@ class TargetRepository(str, Enum):
     METABOLIGHTS = "metabolights"
     BIOSAMPLES = "biosamples"
     EVA = "eva"
+    ARRAYEXPRESS = "arrayexpress"
 
     @classmethod
     def available_repositories(cls):

diff --git a/mars-cli/tests/test_biosample_external_references.py b/mars-cli/tests/test_biosample_external_references.py
@@ -67,7 +67,7 @@ def test_validate_bs_accession():
         validate_bs_accession(invalid_accession)
 
     valid_accession = "SAMEA112654119"
-    assert validate_bs_accession(valid_accession) != ValueError
+    assert validate_bs_accession(valid_accession) is not ValueError
 
 
 def test_validate_json_against_schema():

diff --git a/mars-cli/tests/test_isa_json.py b/mars-cli/tests/test_isa_json.py
@@ -1,9 +1,11 @@
 import re
 
+
 from mars_lib.isa_json import (
     reduce_isa_json_for_target_repo,
     load_isa_json,
     update_isa_json,
+    map_data_files_to_repositories,
 )
 from mars_lib.target_repo import TargetRepository, TARGET_REPO_KEY
 import pytest
@@ -44,7 +46,7 @@ def test_reduce_isa_json_for_target_repo():
     )
 
     filtered_isa_json = reduce_isa_json_for_target_repo(
-        good_isa_json, TargetRepository.ENA
+        good_isa_json, TargetRepository.ENA.value
     )
 
     good_isa_json_study = good_isa_json.investigation.studies[0]
@@ -61,7 +63,7 @@ def test_reduce_isa_json_for_biosamples():
     )
 
     filtered_isa_json = reduce_isa_json_for_target_repo(
-        good_isa_json, TargetRepository.BIOSAMPLES
+        good_isa_json, TargetRepository.BIOSAMPLES.value
     )
 
     assert len(filtered_isa_json.investigation.studies[0].assays) == 0
@@ -108,7 +110,7 @@ def test_target_repo_comment_validator():
             {
                 "@id": "comment_001",
                 "name": f"{TARGET_REPO_KEY}",
-                "value": TargetRepository.ENA,
+                "value": TargetRepository.ENA.value,
             }
         ],
     }
@@ -132,12 +134,12 @@ def test_target_repo_comment_validator():
             {
                 "@id": "comment_003",
                 "name": f"{TARGET_REPO_KEY}",
-                "value": TargetRepository.ENA,
+                "value": TargetRepository.ENA.value,
             },
             {
                 "@id": "comment_004",
                 "name": f"{TARGET_REPO_KEY}",
-                "value": TargetRepository.METABOLIGHTS,
+                "value": TargetRepository.METABOLIGHTS.value,
             },
         ],
     }
@@ -240,17 +242,82 @@ def test_update_study_materials_with_accession_categories():
 
 def test_filename_validation():
     # ISA should have a filename that starts with 'x_'
-    with pytest.raises(ValidationError, match=f"'filename' should start with 'i_'"):
+    with pytest.raises(ValidationError, match="'filename' should start with 'i_'"):
         Investigation.model_validate({"@id": "1", "filename": "bad filename"})
 
-    with pytest.raises(ValidationError, match=f"'filename' should start with 's_'"):
+    with pytest.raises(ValidationError, match="'filename' should start with 's_'"):
         Study.model_validate({"@id": "2", "filename": "bad filename"})
 
-    with pytest.raises(ValidationError, match=f"'filename' should start with 'a_'"):
+    with pytest.raises(ValidationError, match="'filename' should start with 'a_'"):
         Assay.model_validate({"@id": "3", "filename": "bad filename"})
 
     assert re.match(r"^i_", "i_Good_file_name")
 
     assert Investigation.model_validate({"@id": "4", "filename": "i_Good_File_Name"})
     assert Study.model_validate({"@id": "5", "filename": "s_Good_File_Name"})
     assert Assay.model_validate({"@id": "6", "filename": "a_Good_File_Name"})
+
+
+def test_map_data_files_to_repositories():
+    isa_json = load_isa_json(
+        file_path="../test-data/ISA-BH2024-ALL/isa-bh2024-all.json",
+        investigation_is_root=True,
+    )
+    exact_match_files = [
+        "../test-data/ISA-BH2024-ALL/cnv-seq-data-0.fastq",
+        "../test-data/ISA-BH2024-ALL/cnv-seq-data-1.fastq",
+        "../test-data/ISA-BH2024-ALL/cnv-seq-data-2.fastq",
+        "../test-data/ISA-BH2024-ALL/cnv-seq-data-3.fastq",
+        "../test-data/ISA-BH2024-ALL/metpro-analysis.txt",
+        "../test-data/ISA-BH2024-ALL/ms-data-metpro--1.mzml",
+        "../test-data/ISA-BH2024-ALL/ms-data-metpro--2.mzml",
+        "../test-data/ISA-BH2024-ALL/ms-data-metpro--3.mzml",
+        "../test-data/ISA-BH2024-ALL/ms-data-metpro--4.mzml",
+        "../test-data/ISA-BH2024-ALL/rna-seq-data-0.fastq",
+        "../test-data/ISA-BH2024-ALL/rna-seq-data-1.fastq",
+        "../test-data/ISA-BH2024-ALL/rna-seq-data-2.fastq",
+        "../test-data/ISA-BH2024-ALL/rna-seq-data-3.fastq",
+    ]
+
+    check_map = dict(
+        {
+            "metabolights": [
+                "../test-data/ISA-BH2024-ALL/metpro-analysis.txt",
+                "../test-data/ISA-BH2024-ALL/ms-data-metpro--1.mzml",
+                "../test-data/ISA-BH2024-ALL/ms-data-metpro--2.mzml",
+                "../test-data/ISA-BH2024-ALL/ms-data-metpro--3.mzml",
+                "../test-data/ISA-BH2024-ALL/ms-data-metpro--4.mzml",
+            ],
+            "arrayexpress": [
+                "../test-data/ISA-BH2024-ALL/rna-seq-data-0.fastq",
+                "../test-data/ISA-BH2024-ALL/rna-seq-data-1.fastq",
+                "../test-data/ISA-BH2024-ALL/rna-seq-data-2.fastq",
+                "../test-data/ISA-BH2024-ALL/rna-seq-data-3.fastq",
+            ],
+            "eva": [
+                "../test-data/ISA-BH2024-ALL/cnv-seq-data-0.fastq",
+                "../test-data/ISA-BH2024-ALL/cnv-seq-data-1.fastq",
+                "../test-data/ISA-BH2024-ALL/cnv-seq-data-2.fastq",
+                "../test-data/ISA-BH2024-ALL/cnv-seq-data-3.fastq",
+            ],
+        }
+    )
+    assert check_map == map_data_files_to_repositories(exact_match_files, isa_json)
+
+    not_enough_files = exact_match_files[:-1]
+
+    with pytest.raises(
+        ValueError,
+        match=rf"Assay for repository '{TargetRepository.ARRAYEXPRESS.value}' has encountered",
+    ):
+        map_data_files_to_repositories(not_enough_files, isa_json)
+
+    too_many_files = exact_match_files
+    one_too_many = "../test-data/ISA-BH2024-ALL/one-too-many.fastq"
+    too_many_files.append(one_too_many)
+
+    result_maps = map_data_files_to_repositories(too_many_files, isa_json)
+
+    assert one_too_many not in [
+        value for key, value_list in result_maps.items() for value in value_list
+    ]