Skip to content

Commit

Permalink
Remove get_tanimoto_score_between_spectra since not needed anymore
Browse files Browse the repository at this point in the history
  • Loading branch information
niekdejonge committed Oct 23, 2024
1 parent f041d38 commit ab1e5f9
Show file tree
Hide file tree
Showing 3 changed files with 3 additions and 81 deletions.
22 changes: 0 additions & 22 deletions ms2deepscore/benchmarking/calculate_scores_for_validation.py
Original file line number Diff line number Diff line change
@@ -1,22 +0,0 @@
from typing import List
from matchms.Spectrum import Spectrum
from ms2deepscore.benchmarking.CalculateScoresBetweenAllIonmodes import calculate_tanimoto_scores_unique_inchikey


def get_tanimoto_score_between_spectra(spectra_1: List[Spectrum],
                                       spectra_2: List[Spectrum],
                                       fingerprint_type="daylight",
                                       nbits=2048):
    """Return the Tanimoto scores for all pairs of spectra from two lists.

    The scores are looked up in the table returned by
    calculate_tanimoto_scores_unique_inchikey, so spectra that share an
    inchikey reuse the same table entry.

    Parameters
    ----------
    spectra_1:
        Spectra that define the rows of the result. Each needs an "inchikey".
    spectra_2:
        Spectra that define the columns of the result. Each needs an "inchikey".
    fingerprint_type:
        Passed on unchanged to calculate_tanimoto_scores_unique_inchikey.
    nbits:
        Passed on unchanged to calculate_tanimoto_scores_unique_inchikey.

    Returns
    -------
    The values selected from the score table: one row per spectrum in
    spectra_1 and one column per spectrum in spectra_2, in input order.
    """
    def _short_inchikey(spectrum):
        # The score table is addressed by the first 14 characters of the inchikey.
        return spectrum.get("inchikey")[:14]

    scores_per_inchikey = calculate_tanimoto_scores_unique_inchikey(
        spectra_1, spectra_2, fingerprint_type, nbits)

    row_labels = list(map(_short_inchikey, spectra_1))
    column_labels = list(map(_short_inchikey, spectra_2))

    # Label-based selection repeats rows/columns for duplicated inchikeys.
    return scores_per_inchikey.loc[row_labels, column_labels].values
33 changes: 1 addition & 32 deletions tests/test_calculate_tanimoto_scores_for_plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
from pathlib import Path
import numpy as np
from matchms import Spectrum
from ms2deepscore.benchmarking.calculate_scores_for_validation import (
get_tanimoto_score_between_spectra)
from ms2deepscore.benchmarking.CalculateScoresBetweenAllIonmodes import calculate_tanimoto_scores_unique_inchikey

TEST_RESOURCES_PATH = Path(__file__).parent / 'resources'
Expand All @@ -24,37 +22,8 @@ def create_dummy_data(nr_of_spectra):
return spectrums


def test_get_tanimoto_score_between_spectra_duplicated_inchikeys():
    """Scores must be repeated correctly when the same inchikeys occur more than once."""
    nr_of_test_spectra = 3
    spectrums = create_dummy_data(nr_of_test_spectra)

    # Every spectrum is passed in twice, so each inchikey is duplicated on both axes.
    duplicated_rows = spectrums * 2
    duplicated_columns = spectrums * 2
    tanimoto_scores = get_tanimoto_score_between_spectra(duplicated_rows,
                                                         duplicated_columns)

    expected_size = 2 * nr_of_test_spectra
    assert tanimoto_scores.shape == (expected_size, expected_size)

    # Scores between the three unique spectra; duplicating the input on both
    # axes repeats this matrix in a 2 x 2 layout.
    scores_unique_spectra = np.array([[0.0, 0.0, 0.0],
                                      [0.0, 1.0, 0.5],
                                      [0.0, 0.5, 1.0],
                                      ])
    expected_values = np.tile(scores_unique_spectra, (2, 2))
    assert np.array_equal(tanimoto_scores, expected_values)


def test_get_tanimoto_score_between_spectra_not_symmetric():
    """Scores must line up with the inputs when the two spectrum lists differ."""
    dummy_spectra = create_dummy_data(5)

    # Rows: the first three spectra plus a repeat of the third one.
    row_spectra = dummy_spectra[:3] + dummy_spectra[2:3]
    # Columns: the third spectrum up to the last one.
    column_spectra = dummy_spectra[2:]

    tanimoto_scores = get_tanimoto_score_between_spectra(row_spectra, column_spectra)
    assert tanimoto_scores.shape == (4, 3)

    # The last two rows are equal because they belong to the same spectrum.
    expected_values = np.array([
        [0.0, 0.0, 0.0],
        [0.5, 0.333333, 0.25],
        [1.0, 0.666667, 0.5],
        [1.0, 0.666667, 0.5],
    ])
    assert np.allclose(tanimoto_scores, expected_values, atol=1e-04)


def test_calculate_tanimoto_scores_unique_inchikey():
"""Tests that only scores are calculated between unique inchikeys"""
nr_of_test_spectra = 4
spectrums = create_dummy_data(nr_of_test_spectra)
tanimoto_scores = calculate_tanimoto_scores_unique_inchikey(
Expand Down
29 changes: 2 additions & 27 deletions tests/test_validation_loss_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
from ms2deepscore.models.loss_functions import LOSS_FUNCTIONS
from ms2deepscore.SettingsMS2Deepscore import SettingsMS2Deepscore
from ms2deepscore.train_new_model.ValidationLossCalculator import (
ValidationLossCalculator, select_spectra_per_inchikey)
ValidationLossCalculator)
from tests.create_test_spectra import (pesticides_test_spectra,
siamese_spectral_model, create_test_spectra)
siamese_spectral_model)


@pytest.fixture()
Expand Down Expand Up @@ -37,31 +37,6 @@ def simple_test_spectra():
return spectra


@pytest.mark.parametrize(
    "nr_of_inchikeys,nr_of_spectra_per_inchikey,nr_of_sampled_spectra_per_inchikey",
    [
        [2, 2, 1],
        [2, 2, 5],
        [1, 2, 1],
        [2, 30, 100],
    ])
def test_select_one_spectrum_per_inchikey(nr_of_inchikeys, nr_of_spectra_per_inchikey,
                                          nr_of_sampled_spectra_per_inchikey):
    """Check the number and the spread of spectra picked by select_spectra_per_inchikey.

    For every parameter set the selection must contain all inchikeys, exactly
    the requested number of spectra per inchikey, and each individual spectrum
    about equally often (counts may differ by at most one).
    """
    test_spectra = create_test_spectra(nr_of_inchikeys, nr_of_spectra_per_inchikey)
    # NOTE(review): 42 is presumably the random seed - confirm against select_spectra_per_inchikey.
    selected_spectra = select_spectra_per_inchikey(test_spectra, 42, nr_of_sampled_spectra_per_inchikey)

    expected_total = nr_of_inchikeys * nr_of_sampled_spectra_per_inchikey
    assert len(selected_spectra) == expected_total

    # Every inchikey of the input must be present in the selection.
    inchikeys_list = [s.get("inchikey") for s in selected_spectra]
    available_inchikeys = {s.get("inchikey") for s in test_spectra}
    assert set(inchikeys_list) == available_inchikeys, "not all inchikeys are selected"

    # Each inchikey must be sampled exactly the requested number of times.
    samples_per_inchikey = Counter(inchikeys_list)
    assert all(inchikey_count == nr_of_sampled_spectra_per_inchikey
               for inchikey_count in samples_per_inchikey.values())

    # Count how often each spectrum was picked; the fingerprint is set to None
    # on every selected spectrum before it is hashed.
    samples_per_spectrum = Counter(
        spectrum.set("fingerprint", None).__hash__() for spectrum in selected_spectra)
    minimum_spectrum_count = nr_of_sampled_spectra_per_inchikey // nr_of_spectra_per_inchikey
    maximum_spectrum_count = minimum_spectrum_count + 1
    for spectrum_count in samples_per_spectrum.values():
        assert minimum_spectrum_count <= spectrum_count <= maximum_spectrum_count, \
            "The spectra are not sampled equally"


def test_validation_loss_calculator():
model = siamese_spectral_model()
test_spectra = pesticides_test_spectra()
Expand Down

0 comments on commit ab1e5f9

Please sign in to comment.