Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Map data files to repository #87

Merged
merged 13 commits into from
Nov 12, 2024
7 changes: 3 additions & 4 deletions mars-cli/mars_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from mars_lib.logging import print_and_log
from mars_lib.validation import validate, CustomValidationException
from logging.handlers import RotatingFileHandler
from pydantic import ValidationError
import requests
import sys
import os
Expand Down Expand Up @@ -264,13 +263,13 @@ def submit(
target_repositories = []

if submit_to_biosamples:
target_repositories.append(TargetRepository.BIOSAMPLES)
target_repositories.append(TargetRepository.BIOSAMPLES.value)

if submit_to_ena:
target_repositories.append(TargetRepository.ENA)
target_repositories.append(TargetRepository.ENA.value)

if submit_to_metabolights:
target_repositories.append(TargetRepository.METABOLIGHTS)
target_repositories.append(TargetRepository.METABOLIGHTS.value)

print_and_log(
f"Starting submission of the ISA JSON to the target repositories: {', '.join(target_repositories)}."
Expand Down
59 changes: 57 additions & 2 deletions mars-cli/mars_lib/isa_json.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import json
from typing import Union, List, Any, Tuple, Optional
from typing import Union, List, Any, Tuple, Optional, Dict

from mars_lib.logging import print_and_log
from mars_lib.models.isa_json import (
Investigation,
Assay,
Expand Down Expand Up @@ -38,7 +40,7 @@ def reduce_isa_json_for_target_repo(
new_studies = []
studies = filtered_isa_json.investigation.studies
for study in studies:
if target_repo == TargetRepository.BIOSAMPLES:
if target_repo == TargetRepository.BIOSAMPLES.value:
filtered_assays = []
else:
assays = study.assays
Expand Down Expand Up @@ -420,3 +422,56 @@ def update_isa_json(isa_json: IsaJson, repo_response: RepositoryResponse) -> Isa

isa_json.investigation = investigation
return isa_json


def map_data_files_to_repositories(
files: List[str], isa_json: IsaJson
) -> Dict[str, List[str]]:
# Note: This works well in
df_map: Dict[str, List[str]] = {}
assays: List[Assay] = [
assay for study in isa_json.investigation.studies for assay in study.assays
]

files_dicts = [{"full_name": f, "short_name": f.split("/")[-1]} for f in files]
remaining_files = files_dicts.copy()
for assay in assays:
target_repo_comment: Comment = detect_target_repo_comment(assay.comments)
# This is an effect of everything being optional in the Comment model.
# Should we decide to make the value mandatory, this guard clause would not be necessary anymore.
if target_repo_comment.value is None:
raise ValueError(
f"At least one assay in the ISA-JSON has no '{TARGET_REPO_KEY}' comment. Mapping not possible. Make sure all assays in the ISA-JSON have this comment!"
)
assay_data_files = [df.name for df in assay.dataFiles]

# Check if the files in the ISA-JSON are present in the command
# If not, raise an error
for adf in assay_data_files:
if adf not in [fd["short_name"] for fd in files_dicts]:
raise ValueError(
f"""Assay for repository '{target_repo_comment.value}' has encountered a mismatch while mapping the data files to the ISA-JSON.
Data File '{adf}' is missing in the data files passed in the command:
{files}
Please correct the mismatch!"""
)
else:
remaining_files.remove(
next(fd for fd in files_dicts if fd["short_name"] == adf)
)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This removes the first occurrence, correct? It could be confusing if the user accidentally repeats a filename in the command and then still sees the warning that it will be skipped. Could do something like

remaining_files = [fd for fd in remaining_files if fd["short_name"] == adf]

to get them all.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch! But should we allow the user to pass a file multiple times? Or are there situations where a data file is submitted to multiple repositories? (Honest question...)

Also, shouldn't it be this then:

remaining_files = [fd for fd in remaining_files if fd["short_name"] != adf]

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes of course, your remaining_files filter is the right one 👍

For submitting the same data files to multiple repositories, this is indeed a good question... I feel like this shouldn't happen but maybe it's something we'll have to revisit in the future.


df_map[target_repo_comment.value] = [
fd["full_name"]
for fd in files_dicts
if fd["short_name"] in assay_data_files
]

[
print_and_log(
msg=f"File '{rf['short_name']}' could not be mapped to any data file in the ISA-JSON. For this reason, it will be skipped during submission!",
level="warning",
)
for rf in remaining_files
]

return df_map
44 changes: 27 additions & 17 deletions mars-cli/mars_lib/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
load_isa_json,
reduce_isa_json_for_target_repo,
update_isa_json,
map_data_files_to_repositories,
)
from mars_lib.models.isa_json import Comment, IsaJson
from mars_lib.models.repository_response import RepositoryResponse
Expand Down Expand Up @@ -52,7 +53,7 @@ def submission(
urls: dict[str, Any],
file_transfer: str,
output: str,
data_file_paths=None,
data_file_paths: List[TextIOWrapper] = [],
) -> None:
# If credential manager info found:
# Get password from the credential manager
Expand Down Expand Up @@ -80,6 +81,11 @@ def submission(
f"ISA JSON with investigation '{isa_json.investigation.title}' is valid."
)

# create data file map
data_file_map = map_data_files_to_repositories(
files=[str(dfp) for dfp in data_file_paths], isa_json=isa_json
)

time_stamp = datetime.timestamp(datetime.now())

if DEBUG:
Expand All @@ -91,7 +97,7 @@ def submission(
):
raise ValueError("No target repository selected.")

if TargetRepository.BIOSAMPLES in target_repositories:
if TargetRepository.BIOSAMPLES.value in target_repositories:
# Submit to Biosamples
biosamples_result = submit_to_biosamples(
isa_json=isa_json,
Expand All @@ -100,7 +106,7 @@ def submission(
webin_token_url=urls["WEBIN"]["TOKEN"],
)
print_and_log(
f"Submission to {TargetRepository.BIOSAMPLES} was successful. Result:\n{biosamples_result.json()}",
f"Submission to {TargetRepository.BIOSAMPLES.value} was successful. Result:\n{biosamples_result.json()}",
level="info",
)
# Update `isa_json`, based on the receipt returned
Expand All @@ -111,16 +117,20 @@ def submission(
if DEBUG:
save_step_to_file(time_stamp, "1_after_biosamples", isa_json)

if TargetRepository.ENA in target_repositories:
if TargetRepository.ENA.value in target_repositories:
# Step 1 : upload data if file paths are provided
if data_file_paths and file_transfer:
upload_to_ena(
file_paths=data_file_paths,
file_paths=[
Path(df) for df in data_file_map[TargetRepository.ENA.value]
],
user_credentials=user_credentials,
submission_url=urls["ENA"]["DATA-SUBMISSION"],
file_transfer=file_transfer,
)
print_and_log(f"Start submitting to {TargetRepository.ENA}.", level="debug")
print_and_log(
f"Start submitting to {TargetRepository.ENA.value}.", level="debug"
)

# Step 2 : submit isa-json to ena
ena_result = submit_to_ena(
Expand All @@ -129,11 +139,11 @@ def submission(
submission_url=urls["ENA"]["SUBMISSION"],
)
print_and_log(
f"Submission to {TargetRepository.ENA} was successful. Result:\n{ena_result.json()}"
f"Submission to {TargetRepository.ENA.value} was successful. Result:\n{ena_result.json()}"
)

print_and_log(
f"Update ISA-JSON based on receipt from {TargetRepository.ENA}.",
f"Update ISA-JSON based on receipt from {TargetRepository.ENA.value}.",
level="debug",
)
ena_mars_receipt = RepositoryResponse.model_validate(
Expand All @@ -143,10 +153,10 @@ def submission(
if DEBUG:
save_step_to_file(time_stamp, "2_after_ena", isa_json)

if TargetRepository.METABOLIGHTS in target_repositories:
if TargetRepository.METABOLIGHTS.value in target_repositories:
# Submit to MetaboLights
metabolights_result = upload_to_metabolights(
file_paths=data_file_paths,
file_paths=data_file_map[TargetRepository.METABOLIGHTS.value],
file_transfer=file_transfer,
isa_json=isa_json,
metabolights_credentials=user_credentials,
Expand All @@ -155,7 +165,7 @@ def submission(
)
metabolights_receipt_obj = metabolights_result.json()
print_and_log(
f"Submission to {TargetRepository.METABOLIGHTS} was successful. Result:\n{metabolights_receipt_obj}",
f"Submission to {TargetRepository.METABOLIGHTS.value} was successful. Result:\n{metabolights_receipt_obj}",
level="info",
)
metabolights_receipt = RepositoryResponse.model_validate(
Expand All @@ -171,11 +181,11 @@ def submission(
if DEBUG:
save_step_to_file(time_stamp, "3_after_metabolights", isa_json)

if TargetRepository.EVA in target_repositories:
if TargetRepository.EVA.value in target_repositories:
# Submit to EVA
# TODO: Filter out other assays
print_and_log(
f"Submission to {TargetRepository.EVA} was successful.", level="info"
f"Submission to {TargetRepository.EVA.value} was successful.", level="info"
)
# TODO: Update `isa_json`, based on the receipt returned

Expand All @@ -201,7 +211,7 @@ def submit_to_biosamples(
headers=headers,
params=params,
json=reduce_isa_json_for_target_repo(
isa_json, TargetRepository.BIOSAMPLES
isa_json, TargetRepository.BIOSAMPLES.value
).model_dump(by_alias=True, exclude_none=True),
)

Expand Down Expand Up @@ -338,9 +348,9 @@ def submit_to_ena(
submission_url,
headers=headers,
params=params,
json=reduce_isa_json_for_target_repo(isa_json, TargetRepository.ENA).model_dump(
by_alias=True, exclude_none=True
),
json=reduce_isa_json_for_target_repo(
isa_json, TargetRepository.ENA.value
).model_dump(by_alias=True, exclude_none=True),
)

if result.status_code != 200:
Expand Down
1 change: 1 addition & 0 deletions mars-cli/mars_lib/target_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class TargetRepository(str, Enum):
METABOLIGHTS = "metabolights"
BIOSAMPLES = "biosamples"
EVA = "eva"
ARRAYEXPRESS = "arrayexpress"

@classmethod
def available_repositories(cls):
Expand Down
2 changes: 1 addition & 1 deletion mars-cli/tests/test_biosample_external_references.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def test_validate_bs_accession():
validate_bs_accession(invalid_accession)

valid_accession = "SAMEA112654119"
assert validate_bs_accession(valid_accession) != ValueError
assert validate_bs_accession(valid_accession) is not ValueError
kdp-cloud marked this conversation as resolved.
Show resolved Hide resolved


def test_validate_json_against_schema():
Expand Down
83 changes: 75 additions & 8 deletions mars-cli/tests/test_isa_json.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import re


from mars_lib.isa_json import (
reduce_isa_json_for_target_repo,
load_isa_json,
update_isa_json,
map_data_files_to_repositories,
)
from mars_lib.target_repo import TargetRepository, TARGET_REPO_KEY
import pytest
Expand Down Expand Up @@ -44,7 +46,7 @@ def test_reduce_isa_json_for_target_repo():
)

filtered_isa_json = reduce_isa_json_for_target_repo(
good_isa_json, TargetRepository.ENA
good_isa_json, TargetRepository.ENA.value
)

good_isa_json_study = good_isa_json.investigation.studies[0]
Expand All @@ -61,7 +63,7 @@ def test_reduce_isa_json_for_biosamples():
)

filtered_isa_json = reduce_isa_json_for_target_repo(
good_isa_json, TargetRepository.BIOSAMPLES
good_isa_json, TargetRepository.BIOSAMPLES.value
)

assert len(filtered_isa_json.investigation.studies[0].assays) == 0
Expand Down Expand Up @@ -108,7 +110,7 @@ def test_target_repo_comment_validator():
{
"@id": "comment_001",
"name": f"{TARGET_REPO_KEY}",
"value": TargetRepository.ENA,
"value": TargetRepository.ENA.value,
}
],
}
Expand All @@ -132,12 +134,12 @@ def test_target_repo_comment_validator():
{
"@id": "comment_003",
"name": f"{TARGET_REPO_KEY}",
"value": TargetRepository.ENA,
"value": TargetRepository.ENA.value,
},
{
"@id": "comment_004",
"name": f"{TARGET_REPO_KEY}",
"value": TargetRepository.METABOLIGHTS,
"value": TargetRepository.METABOLIGHTS.value,
},
],
}
Expand Down Expand Up @@ -240,17 +242,82 @@ def test_update_study_materials_with_accession_categories():

def test_filename_validation():
# ISA should have a filename that starts with 'x_'
with pytest.raises(ValidationError, match=f"'filename' should start with 'i_'"):
with pytest.raises(ValidationError, match="'filename' should start with 'i_'"):
Investigation.model_validate({"@id": "1", "filename": "bad filename"})

with pytest.raises(ValidationError, match=f"'filename' should start with 's_'"):
with pytest.raises(ValidationError, match="'filename' should start with 's_'"):
Study.model_validate({"@id": "2", "filename": "bad filename"})

with pytest.raises(ValidationError, match=f"'filename' should start with 'a_'"):
with pytest.raises(ValidationError, match="'filename' should start with 'a_'"):
Assay.model_validate({"@id": "3", "filename": "bad filename"})

assert re.match(r"^i_", "i_Good_file_name")

assert Investigation.model_validate({"@id": "4", "filename": "i_Good_File_Name"})
assert Study.model_validate({"@id": "5", "filename": "s_Good_File_Name"})
assert Assay.model_validate({"@id": "6", "filename": "a_Good_File_Name"})


def test_map_data_files_to_repositories():
isa_json = load_isa_json(
file_path="../test-data/ISA-BH2024-ALL/isa-bh2024-all.json",
investigation_is_root=True,
)
exact_match_files = [
"../test-data/ISA-BH2024-ALL/cnv-seq-data-0.fastq",
"../test-data/ISA-BH2024-ALL/cnv-seq-data-1.fastq",
"../test-data/ISA-BH2024-ALL/cnv-seq-data-2.fastq",
"../test-data/ISA-BH2024-ALL/cnv-seq-data-3.fastq",
"../test-data/ISA-BH2024-ALL/metpro-analysis.txt",
"../test-data/ISA-BH2024-ALL/ms-data-metpro--1.mzml",
"../test-data/ISA-BH2024-ALL/ms-data-metpro--2.mzml",
"../test-data/ISA-BH2024-ALL/ms-data-metpro--3.mzml",
"../test-data/ISA-BH2024-ALL/ms-data-metpro--4.mzml",
"../test-data/ISA-BH2024-ALL/rna-seq-data-0.fastq",
"../test-data/ISA-BH2024-ALL/rna-seq-data-1.fastq",
"../test-data/ISA-BH2024-ALL/rna-seq-data-2.fastq",
"../test-data/ISA-BH2024-ALL/rna-seq-data-3.fastq",
]

check_map = dict(
{
"metabolights": [
"../test-data/ISA-BH2024-ALL/metpro-analysis.txt",
"../test-data/ISA-BH2024-ALL/ms-data-metpro--1.mzml",
"../test-data/ISA-BH2024-ALL/ms-data-metpro--2.mzml",
"../test-data/ISA-BH2024-ALL/ms-data-metpro--3.mzml",
"../test-data/ISA-BH2024-ALL/ms-data-metpro--4.mzml",
],
"arrayexpress": [
"../test-data/ISA-BH2024-ALL/rna-seq-data-0.fastq",
"../test-data/ISA-BH2024-ALL/rna-seq-data-1.fastq",
"../test-data/ISA-BH2024-ALL/rna-seq-data-2.fastq",
"../test-data/ISA-BH2024-ALL/rna-seq-data-3.fastq",
],
"eva": [
"../test-data/ISA-BH2024-ALL/cnv-seq-data-0.fastq",
"../test-data/ISA-BH2024-ALL/cnv-seq-data-1.fastq",
"../test-data/ISA-BH2024-ALL/cnv-seq-data-2.fastq",
"../test-data/ISA-BH2024-ALL/cnv-seq-data-3.fastq",
],
}
)
assert check_map == map_data_files_to_repositories(exact_match_files, isa_json)

not_enough_files = exact_match_files[:-1]

with pytest.raises(
ValueError,
match=rf"Assay for repository '{TargetRepository.ARRAYEXPRESS.value}' has encountered",
):
map_data_files_to_repositories(not_enough_files, isa_json)

too_many_files = exact_match_files
one_too_many = "../test-data/ISA-BH2024-ALL/one-too-many.fastq"
too_many_files.append(one_too_many)

result_maps = map_data_files_to_repositories(too_many_files, isa_json)

assert one_too_many not in [
value for key, value_list in result_maps.items() for value in value_list
]