From 25e8cb468a24542cf08aae2a03a83ac0bd2788b9 Mon Sep 17 00:00:00 2001 From: Unknown Date: Fri, 3 Sep 2021 19:05:51 +0200 Subject: [PATCH 1/7] Implement min-max normalization (coarse/fine) --- kissim/api/subset.py | 27 -- kissim/data/min_max_distances_coarse.csv | 9 + kissim/data/min_max_distances_fine.csv | 9 + kissim/data/min_max_moments_coarse.csv | 7 + kissim/data/min_max_moments_fine.csv | 7 + kissim/definitions.py | 15 +- kissim/encoding/__init__.py | 4 +- .../encoding/{base.py => fingerprint_base.py} | 4 +- kissim/encoding/fingerprint_generator.py | 248 +----------------- kissim/encoding/fingerprint_generator_base.py | 201 ++++++++++++++ .../fingerprint_generator_normalized.py | 121 +++++++++ kissim/encoding/fingerprint_normalized.py | 198 +++++--------- .../encoding/test_fingerprint_generator.py | 66 ++--- .../encoding/test_fingerprint_normalized.py | 114 ++------ kissim/tests/test_utils.py | 20 +- kissim/utils.py | 137 ++++++++++ kissim/viewer/base.py | 4 +- 17 files changed, 637 insertions(+), 554 deletions(-) create mode 100644 kissim/data/min_max_distances_coarse.csv create mode 100644 kissim/data/min_max_distances_fine.csv create mode 100644 kissim/data/min_max_moments_coarse.csv create mode 100644 kissim/data/min_max_moments_fine.csv rename kissim/encoding/{base.py => fingerprint_base.py} (99%) create mode 100644 kissim/encoding/fingerprint_generator_base.py create mode 100644 kissim/encoding/fingerprint_generator_normalized.py diff --git a/kissim/api/subset.py b/kissim/api/subset.py index 1e8710c7..a1e4b93d 100644 --- a/kissim/api/subset.py +++ b/kissim/api/subset.py @@ -114,9 +114,6 @@ def _subset_fingerprint_generator(fingerprint_generator, klifs_residue_ids): fingerprint_generator_subset.data = _subset_fingerprint_generator_data( fingerprint_generator, klifs_residue_ids ) - fingerprint_generator_subset.data_normalized = _subset_fingerprint_generator_data_normalized( - fingerprint_generator, fingerprint_generator_subset - ) return 
fingerprint_generator_subset @@ -210,27 +207,3 @@ def _subset_fingerprint_generator_data(fingerprint_generator, klifs_residue_ids) fingerprint_generator_data_subset[id_] = fp_subset return fingerprint_generator_data_subset - - -def _subset_fingerprint_generator_data_normalized( - fingerprint_generator, fingerprint_generator_subset -): - """ - Normalize the input fingerprint subsets. - - Attributes - ---------- - fingerprint_generator : kissim.encoding.FingerprintGenerator - Fingerprints. - fingerprint_generator_subset : kissim.encoding.FingerprintGenerator - Fingerprints subset. - - Returns - ------- - dict - Fingerprints with subset of residues only. - """ - - # If fingerprint generator contains normalized fingerprints - if fingerprint_generator.data_normalized is not None: - return fingerprint_generator_subset._normalize_fingerprints() diff --git a/kissim/data/min_max_distances_coarse.csv b/kissim/data/min_max_distances_coarse.csv new file mode 100644 index 00000000..26152e74 --- /dev/null +++ b/kissim/data/min_max_distances_coarse.csv @@ -0,0 +1,9 @@ +subpocket,min_max,any +hinge_region,min,2.37 +hinge_region,max,30.79 +dfg_region,min,0.87 +dfg_region,max,33.62 +front_pocket,min,1.36 +front_pocket,max,33.36 +center,min,1.11 +center,max,27.03 diff --git a/kissim/data/min_max_distances_fine.csv b/kissim/data/min_max_distances_fine.csv new file mode 100644 index 00000000..d7aa024e --- /dev/null +++ b/kissim/data/min_max_distances_fine.csv @@ -0,0 +1,9 @@ +subpocket,min_max,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85 
+hinge_region,min,11.52,9.3,7.83,6.55,5.66,6.86,7.71,6.37,8.84,7.14,6.25,7.1,6.89,5.15,2.37,5.07,6.08,9.83,11.16,12.18,9.73,12.85,12.86,9.08,9.69,11.36,11.29,9.52,11.18,12.21,11.11,10.48,9.68,10.0,6.5,5.4,4.58,6.25,7.5,7.07,9.78,8.37,7.16,4.95,3.25,3.53,2.69,4.82,5.28,9.24,7.7,9.3,10.71,9.63,11.32,11.75,16.63,18.63,14.66,16.17,15.69,19.15,19.78,18.17,17.34,16.98,15.05,13.61,14.33,12.49,13.12,13.15,11.43,9.35,8.0,8.08,7.64,9.19,6.56,5.17,5.98,5.24,7.21,4.34,5.96 +hinge_region,max,23.62,20.68,18.57,18.8,20.51,19.77,19.17,18.26,17.01,16.03,12.37,12.79,14.27,8.53,6.23,8.29,11.01,13.56,16.08,30.79,27.93,24.99,23.72,23.31,21.24,21.71,23.29,25.5,24.68,23.65,20.26,19.07,22.95,19.57,16.31,13.7,11.93,10.56,12.02,12.9,14.56,13.52,10.59,8.03,6.0,6.16,6.94,9.82,12.34,12.85,14.02,14.75,17.47,19.59,21.55,24.16,23.94,27.83,27.15,30.31,28.44,27.9,27.14,26.34,24.67,22.55,22.6,20.71,21.55,21.26,18.24,17.88,17.13,15.15,12.46,13.23,12.19,14.76,11.45,9.07,12.25,14.45,24.17,30.0,24.42 +dfg_region,min,16.23,14.04,12.53,6.09,8.57,5.47,4.55,3.92,5.08,7.66,8.13,10.06,10.56,12.08,9.26,7.88,5.54,6.8,4.58,2.01,2.86,3.3,4.85,2.64,3.62,4.66,7.12,6.03,6.21,9.48,10.75,11.61,11.97,10.64,7.56,6.51,7.74,6.7,9.24,8.1,6.1,5.33,2.95,6.35,7.74,10.96,11.01,11.57,14.32,16.37,15.1,13.65,16.1,16.96,17.74,16.88,21.7,22.6,17.13,12.89,10.94,14.4,15.4,12.19,10.16,10.29,9.35,8.98,9.92,9.42,11.43,10.78,13.67,10.73,10.05,11.71,12.53,11.48,8.19,5.57,4.18,2.3,1.12,0.87,1.68 +dfg_region,max,30.33,27.63,24.45,22.83,21.77,22.91,25.38,23.47,19.85,18.67,17.56,20.22,21.3,18.73,15.42,12.98,10.98,12.88,13.56,21.63,18.26,15.91,15.21,17.43,16.35,16.71,19.99,21.29,20.95,21.2,19.46,22.08,25.42,23.36,19.71,17.63,16.97,14.47,16.63,15.21,15.61,11.54,10.84,13.71,14.09,17.22,18.3,19.88,22.9,24.04,24.27,22.34,23.75,26.23,27.74,29.72,30.02,33.58,33.62,28.7,26.96,25.28,25.46,22.78,21.72,18.7,19.09,17.5,18.97,18.47,20.73,22.21,23.28,20.99,18.01,19.22,19.56,22.45,17.2,13.75,12.26,10.42,18.96,26.47,22.51 
+front_pocket,min,10.92,7.52,5.74,4.05,2.38,2.24,2.28,3.82,4.04,7.36,5.68,7.7,7.9,9.6,7.31,8.93,7.61,10.78,11.79,13.0,11.99,14.37,13.48,10.44,11.79,13.75,12.94,12.42,13.6,15.7,14.26,13.98,13.99,11.99,9.92,7.91,10.41,9.92,12.06,11.5,14.31,12.76,10.69,10.66,8.66,8.9,6.06,6.63,6.81,4.89,4.18,3.3,6.26,8.24,8.45,7.61,12.08,13.36,10.53,15.61,14.63,18.02,19.4,18.72,16.58,16.04,14.75,11.14,11.83,8.62,8.67,7.75,5.7,4.07,2.7,5.41,4.64,7.4,6.79,4.27,3.05,1.76,2.37,1.64,1.36 +front_pocket,max,23.34,19.68,18.01,19.84,20.9,19.48,17.04,17.51,15.22,13.11,13.32,15.09,17.34,14.29,11.16,13.4,12.49,15.14,17.26,32.93,30.28,27.96,27.35,25.56,24.07,24.92,26.58,28.1,26.59,26.02,23.58,22.91,22.81,20.01,16.48,14.74,14.51,16.72,18.22,18.42,20.2,17.36,15.06,14.19,11.79,12.62,12.84,12.59,15.5,13.27,15.98,12.98,13.68,16.23,17.69,19.57,20.37,25.21,24.23,33.36,30.61,29.49,27.84,26.72,25.15,21.76,21.44,18.21,17.92,16.9,14.04,13.56,12.38,10.03,8.25,9.14,9.23,12.23,12.41,10.56,13.21,14.06,20.05,23.84,21.09 +center,min,15.17,12.05,10.22,8.05,6.51,6.02,5.15,5.19,8.39,9.58,7.72,9.47,10.25,9.36,7.16,7.87,6.76,10.38,10.67,10.43,9.44,10.84,9.59,6.44,7.53,8.72,8.48,7.42,8.28,10.36,9.39,9.03,9.46,7.84,5.71,4.08,5.49,6.29,9.32,8.6,10.79,9.77,7.55,7.4,5.52,6.94,6.25,7.4,9.94,10.17,9.44,8.74,10.15,12.42,13.19,12.16,16.43,18.15,14.43,12.06,11.65,14.74,15.76,14.3,12.37,12.19,10.42,8.63,9.64,8.36,8.84,8.07,9.14,5.68,5.46,6.5,6.84,5.61,1.88,1.11,1.24,1.75,2.5,2.97,3.29 +center,max,26.03,22.76,20.23,20.8,21.68,21.14,19.48,20.33,18.81,17.58,16.44,18.23,19.64,13.57,10.86,12.69,12.27,15.2,16.28,25.95,23.32,20.81,20.31,21.64,17.82,18.63,20.24,21.73,20.56,20.22,17.67,17.3,20.18,17.08,13.65,12.19,10.88,12.01,13.64,15.27,17.86,15.02,12.87,11.51,9.31,10.95,12.01,13.39,16.92,15.66,18.47,16.43,18.35,20.46,22.63,24.25,23.81,26.89,26.8,27.03,24.93,23.83,22.55,21.54,19.84,17.4,17.05,15.36,15.64,15.21,13.15,13.95,14.11,12.01,8.89,10.33,11.69,13.93,11.41,8.22,7.39,9.32,20.52,26.75,20.19 diff --git 
a/kissim/data/min_max_moments_coarse.csv b/kissim/data/min_max_moments_coarse.csv new file mode 100644 index 00000000..622e72e7 --- /dev/null +++ b/kissim/data/min_max_moments_coarse.csv @@ -0,0 +1,7 @@ +moment,min_max,any +1,min,11.47 +1,max,16.85 +2,min,2.88 +2,max,6.19 +3,min,-2.29 +3,max,6.31 diff --git a/kissim/data/min_max_moments_fine.csv b/kissim/data/min_max_moments_fine.csv new file mode 100644 index 00000000..8e3f29a8 --- /dev/null +++ b/kissim/data/min_max_moments_fine.csv @@ -0,0 +1,7 @@ +moment,min_max,hinge_region,dfg_region,front_pocket,center +1,min,12.2,12.72,12.33,11.47 +1,max,13.85,16.85,14.34,12.97 +2,min,4.06,3.91,3.34,2.88 +2,max,5.76,6.17,6.19,4.48 +3,min,-2.17,-1.12,-2.1,-2.29 +3,max,5.16,6.31,5.97,4.15 diff --git a/kissim/definitions.py b/kissim/definitions.py index 7c0ce48d..eb7ab2c4 100644 --- a/kissim/definitions.py +++ b/kissim/definitions.py @@ -154,13 +154,14 @@ # Distance and moment cutoffs used for fingerprint normalization # Cutoffs defined in this notebook: # https://github.com/volkamerlab/kissim_app/blob/master/notebooks/004_fingerprints/002_spatial_feature_cutoffs.ipynb -DISTANCE_CUTOFFS = { - "hinge_region": (2.0, 31.0), - "dfg_region": (0.0, 34.0), - "front_pocket": (1.0, 33.0), - "center": (1.0, 29.0), -} -MOMENT_CUTOFFS = {1: (11.0, 17.0), 2: (2.0, 7.0), 3: (-3.0, 7.0)} +DISTANCE_CUTOFFS = {} +MOMENT_CUTOFFS = {} +for how in ["fine", "coarse"]: + + DISTANCE_CUTOFFS[how] = pd.read_csv( + PATH_DATA / f"min_max_distances_{how}.csv", index_col=[0, 1] + ) + MOMENT_CUTOFFS[how] = pd.read_csv(PATH_DATA / f"min_max_moments_{how}.csv", index_col=[0, 1]) # KLIFS pocket residue subsets by DFG conformation with open(PATH_DATA / "klifs_pocket_residue_subset.json") as f: diff --git a/kissim/encoding/__init__.py b/kissim/encoding/__init__.py index ffbcbdd7..36fd9674 100644 --- a/kissim/encoding/__init__.py +++ b/kissim/encoding/__init__.py @@ -2,7 +2,9 @@ Encode kinase pockets as subpocket-based structural fingerprint. 
""" -from .base import FingerprintBase +from .fingerprint_base import FingerprintBase from .fingerprint import Fingerprint from .fingerprint_normalized import FingerprintNormalized +from .fingerprint_generator_base import FingerprintGeneratorBase from .fingerprint_generator import FingerprintGenerator +from .fingerprint_generator_normalized import FingerprintGeneratorNormalized diff --git a/kissim/encoding/base.py b/kissim/encoding/fingerprint_base.py similarity index 99% rename from kissim/encoding/base.py rename to kissim/encoding/fingerprint_base.py index 097ed236..4f4c9ac8 100644 --- a/kissim/encoding/base.py +++ b/kissim/encoding/fingerprint_base.py @@ -1,7 +1,7 @@ """ -kissim.encoding.fingerprint +kissim.encoding.fingerprint_base -Defines the kissim fingerprint. +Defines the base kissim fingerprint. """ import json diff --git a/kissim/encoding/fingerprint_generator.py b/kissim/encoding/fingerprint_generator.py index 779f6628..08059402 100644 --- a/kissim/encoding/fingerprint_generator.py +++ b/kissim/encoding/fingerprint_generator.py @@ -1,5 +1,5 @@ """ -kissim.encoding.fingerprint +kissim.encoding.fingerprint_generator Defines sequencial and parallel processing of fingerprints from local or remote structures. """ @@ -14,35 +14,13 @@ import pandas as pd from opencadd.databases.klifs import setup_remote -from kissim.encoding import Fingerprint, FingerprintNormalized +from kissim.encoding import Fingerprint, FingerprintNormalized, FingerprintGeneratorBase from kissim.utils import set_n_cores logger = logging.getLogger(__name__) -class FingerprintGenerator: - """ - Generate fingerprints for multiple structures. - - Attributes - ---------- - structure_klifs_id : int - Structure KLIFS ID. - klifs_session : opencadd.databases.klifs.session.Session - Local or remote KLIFS session. - data : dict of int: kissim.encoding.Fingerprint - Fingerprints for input structures (by KLIFS ID). 
- data_normalized : dict of int: kissim.encoding.Fingerprint - Normalized fingerprints for input structures (by KLIFS ID). - """ - - def __init__(self): - - self.structure_klifs_ids = None - self.klifs_session = None - self.data = None - self.data_normalized = None - +class FingerprintGenerator(FingerprintGeneratorBase): @classmethod def from_structure_klifs_ids(cls, structure_klifs_ids, klifs_session=None, n_cores=1): """ @@ -87,7 +65,6 @@ def from_structure_klifs_ids(cls, structure_klifs_ids, klifs_session=None, n_cor for i in fingerprints_list if i is not None # Removes emtpy fingerprints } - fingerprint_generator.data_normalized = fingerprint_generator._normalize_fingerprints() logger.info(f"Number of output fingerprints: {len(fingerprint_generator.data)}") @@ -97,7 +74,7 @@ def from_structure_klifs_ids(cls, structure_klifs_ids, klifs_session=None, n_cor return fingerprint_generator @classmethod - def from_json(cls, filepath, normalize=False): + def from_json(cls, filepath): """ Initialize a FingerprintGenerator object from a json file. @@ -105,9 +82,6 @@ def from_json(cls, filepath, normalize=False): ---------- filepath : str or pathlib.Path Path to json file. - normalized : bool - Add normalization (default: False). This will store the unnormalized features alongside - the normalized features. """ filepath = Path(filepath) @@ -122,8 +96,6 @@ def from_json(cls, filepath, normalize=False): fingerprint_generator = cls() fingerprint_generator.data = data - if normalize: - fingerprint_generator.data_normalized = fingerprint_generator._normalize_fingerprints() fingerprint_generator.structure_klifs_ids = list(fingerprint_generator.data.keys()) return fingerprint_generator @@ -146,205 +118,6 @@ def to_json(self, filepath): with open(filepath, "w") as f: f.write(json_string) - @property - def subpocket_centers(self): - """ - Subpocket center coordinates for all structures. 
- - Returns - ------- - pandas.DataFrame - All subpockets (columns, level 0) coordinates x, y, z (columns, level 1) for all - structures (rows). - """ - - coordinates = [] - for structure_klifs_id, fingerprint in self.data.items(): - coordinates_series = fingerprint.subpocket_centers.transpose().stack() - coordinates_series.name = structure_klifs_id - coordinates.append(coordinates_series) - coordinates = pd.DataFrame(coordinates) - coordinates.columns.names = ["subpocket", "dimension"] - - return coordinates - - def physicochemical(self, normalized=False): - """ - Get physicochemical feature vectors per feature type and pocket. - - Parameters - ---------- - normalized : bool - Unnormalized (default) or normalized features. - - Returns - ------- - pandas.DataFrame - Physicochemical feature vectors per feature type (columns) and pocket (rows). - """ - - return self._feature_group("physicochemical", normalized) - - def distances(self, normalized=False): - """ - Get distances feature vectors per feature type and pocket. - - Parameters - ---------- - normalized : bool - Unnormalized (default) or normalized features. - - Returns - ------- - pandas.DataFrame - Distances feature vectors per feature type (columns) and pocket (rows). - """ - - return self._feature_group("distances", normalized) - - def moments(self, normalized=False): - """ - Get moments feature vectors per feature type and pocket. - - Parameters - ---------- - normalized : bool - Unnormalized (default) or normalized features. - - Returns - ------- - pandas.DataFrame - Moments feature vectors per feature type (columns) and pocket (rows). - """ - - return self._feature_group("moments", normalized) - - def physicochemical_exploded(self, normalized=False): - """ - Get physicochemical feature values per feature type and bit position. - - Parameters - ---------- - normalized : bool - Unnormalized (default) or normalized features. 
- - Returns - ------- - pandas.DataFrame - Physicochemical feature values per feature type (columns) and pocket / bit position - (rows). - """ - - return self._feature_group_exploded("physicochemical", normalized) - - def distances_exploded(self, normalized=False): - """ - Get distances feature values per feature type and bit position. - - Parameters - ---------- - normalized : bool - Unnormalized (default) or normalized features. - - Returns - ------- - pandas.DataFrame - Distances feature values per feature type (columns) and pocket / bit position (rows). - """ - - return self._feature_group_exploded("distances", normalized) - - def moments_exploded(self, normalized=False): - """ - Get moments feature values per feature type and bit position. - - Parameters - ---------- - normalized : bool - Unnormalized (default) or normalized features. - - Returns - ------- - pandas.DataFrame - Moments feature values per feature type (columns) and pocket / bit position (rows). - """ - - return self._feature_group_exploded("moments", normalized) - - def _feature_group(self, feature_group, normalized=False): - """ - For a given feature group, get feature vectors per feature type and pocket. - - Parameter - --------- - feature_group : str - Feature group, i.e. "physicochemical", "distances", or "moments". - normalized : bool - Unnormalized (default) or normalized features. - - Returns - ------- - pandas.DataFrame - Feature vectors per feature type (columns) and pocket (rows). 
- """ - - if normalized: - fingerprints = self.data_normalized - else: - fingerprints = self.data - - if fingerprints is not None: - features = { - structure_klifs_id: ( - fingerprint.values_dict[feature_group] - if feature_group == "physicochemical" - else fingerprint.values_dict["spatial"][feature_group] - ) - for structure_klifs_id, fingerprint in fingerprints.items() - } - features = pd.DataFrame(features).transpose() - else: - features = None - - return features - - def _feature_group_exploded(self, feature_group, normalized=False): - """ - For a given feature group, get moments feature values per feature type and bit position. - - Parameters - ---------- - feature_group : str - Feature group, i.e. "physicochemical", "distances", or "moments". - normalized : bool - Unnormalized (default) or normalized features. - - Returns - ------- - pandas.DataFrame - Feature values per feature type (columns) and pocket / bit position (rows). - """ - - index_level1 = "structure_klifs_id" - if feature_group == "moments": - index_level2 = "moment" - else: - index_level2 = "residue_ix" - features = self._feature_group(feature_group, normalized) - features_exploded = features.apply(lambda x: x.explode()).astype(float) - features_exploded.index.name = index_level1 - multi_index = ( - features_exploded.groupby(index_level1, sort=False, dropna=False) - .size() - .apply(lambda x: range(1, x + 1)) - .explode() - ) - multi_index = pd.MultiIndex.from_tuples( - list(multi_index.items()), names=[index_level1, index_level2] - ) - features_exploded.index = multi_index - return features_exploded - def _get_fingerprint_list(self, n_cores): """ Generate fingerprints. 
@@ -389,16 +162,3 @@ def _get_fingerprint(self, structure_klifs_id, klifs_session): logger.info(f"{structure_klifs_id}: Generate fingerprint...") fingerprint = Fingerprint.from_structure_klifs_id(structure_klifs_id, klifs_session) return fingerprint - - def _normalize_fingerprints(self): - """ - Normalize fingerprints in fingerprint generator. - - Returns - ------- - dict - Normalized fingerprints (values) by fingerprint ID (keys). - """ - return { - key: FingerprintNormalized.from_fingerprint(value) for key, value in self.data.items() - } diff --git a/kissim/encoding/fingerprint_generator_base.py b/kissim/encoding/fingerprint_generator_base.py new file mode 100644 index 00000000..84bc0a88 --- /dev/null +++ b/kissim/encoding/fingerprint_generator_base.py @@ -0,0 +1,201 @@ +""" +kissim.encoding.fingerprint_generator_base + +Defines the base kissim fingerprint generator. +""" + +import logging + +import pandas as pd + + +logger = logging.getLogger(__name__) + + +class FingerprintGeneratorBase: + """ + Generate fingerprints for multiple structures. + + Attributes + ---------- + structure_klifs_id : int + Structure KLIFS ID. + klifs_session : opencadd.databases.klifs.session.Session + Local or remote KLIFS session. + data : dict of int: kissim.encoding.Fingerprint + Fingerprints for input structures (by KLIFS ID). + """ + + def __init__(self): + + self.structure_klifs_ids = None + self.klifs_session = None + self.data = None + + @property + def subpocket_centers(self): + """ + Subpocket center coordinates for all structures. + + Returns + ------- + pandas.DataFrame + All subpockets (columns, level 0) coordinates x, y, z (columns, level 1) for all + structures (rows). 
+ """ + + coordinates = [] + for structure_klifs_id, fingerprint in self.data.items(): + coordinates_series = fingerprint.subpocket_centers.transpose().stack() + coordinates_series.name = structure_klifs_id + coordinates.append(coordinates_series) + coordinates = pd.DataFrame(coordinates) + coordinates.columns.names = ["subpocket", "dimension"] + + return coordinates + + @property + def physicochemical(self): + """ + Get physicochemical feature vectors per feature type and pocket. + + Returns + ------- + pandas.DataFrame + Physicochemical feature vectors per feature type (columns) and pocket (rows). + """ + + return self._feature_group("physicochemical") + + @property + def distances(self): + """ + Get distances feature vectors per feature type and pocket. + + Returns + ------- + pandas.DataFrame + Distances feature vectors per feature type (columns) and pocket (rows). + """ + + return self._feature_group("distances") + + @property + def moments(self): + """ + Get moments feature vectors per feature type and pocket. + + Returns + ------- + pandas.DataFrame + Moments feature vectors per feature type (columns) and pocket (rows). + """ + + return self._feature_group("moments") + + @property + def physicochemical_exploded(self): + """ + Get physicochemical feature values per feature type and bit position. + + Returns + ------- + pandas.DataFrame + Physicochemical feature values per feature type (columns) and pocket / bit position + (rows). + """ + + return self._feature_group_exploded("physicochemical") + + @property + def distances_exploded(self): + """ + Get distances feature values per feature type and bit position. + + Returns + ------- + pandas.DataFrame + Distances feature values per feature type (columns) and pocket / bit position (rows). + """ + + return self._feature_group_exploded("distances") + + @property + def moments_exploded(self): + """ + Get moments feature values per feature type and bit position. 
+ + Returns + ------- + pandas.DataFrame + Moments feature values per feature type (columns) and pocket / bit position (rows). + """ + + return self._feature_group_exploded("moments") + + def _feature_group(self, feature_group): + """ + For a given feature group, get feature vectors per feature type and pocket. + + Parameter + --------- + feature_group : str + Feature group, i.e. "physicochemical", "distances", or "moments". + + Returns + ------- + pandas.DataFrame + Feature vectors per feature type (columns) and pocket (rows). + """ + + fingerprints = self.data + + if fingerprints is not None: + features = { + structure_klifs_id: ( + fingerprint.values_dict[feature_group] + if feature_group == "physicochemical" + else fingerprint.values_dict["spatial"][feature_group] + ) + for structure_klifs_id, fingerprint in fingerprints.items() + } + features = pd.DataFrame(features).transpose() + else: + features = None + + return features + + def _feature_group_exploded(self, feature_group): + """ + For a given feature group, get moments feature values per feature type and bit position. + + Parameters + ---------- + feature_group : str + Feature group, i.e. "physicochemical", "distances", or "moments". + + Returns + ------- + pandas.DataFrame + Feature values per feature type (columns) and pocket / bit position (rows). 
+ """ + + index_level1 = "structure_klifs_id" + if feature_group == "moments": + index_level2 = "moment" + else: + index_level2 = "residue_ix" + features = self._feature_group(feature_group) + features_exploded = features.apply(lambda x: x.explode()).astype(float) + features_exploded.index.name = index_level1 + multi_index = ( + features_exploded.groupby(index_level1, sort=False, dropna=False) + .size() + .apply(lambda x: range(1, x + 1)) + .explode() + ) + multi_index = pd.MultiIndex.from_tuples( + list(multi_index.items()), names=[index_level1, index_level2] + ) + features_exploded.index = multi_index + return features_exploded diff --git a/kissim/encoding/fingerprint_generator_normalized.py b/kissim/encoding/fingerprint_generator_normalized.py new file mode 100644 index 00000000..76a7e7bf --- /dev/null +++ b/kissim/encoding/fingerprint_generator_normalized.py @@ -0,0 +1,121 @@ +""" +kissim.encoding.fingerprint_generator_normalized + +Defines the normalization of a fingerprint generator. +""" + +import logging + +from kissim.encoding import FingerprintNormalized, FingerprintGeneratorBase + +logger = logging.getLogger(__name__) + +NORMALIZATION_METHODS = ["min_max"] + + +class FingerprintGeneratorNormalized(FingerprintGeneratorBase): + """ + Generate fingerprints for multiple structures. + + Attributes + ---------- + structure_klifs_id : int + Structure KLIFS ID. + klifs_session : opencadd.databases.klifs.session.Session + Local or remote KLIFS session. + data : dict of int: kissim.encoding.Fingerprint + Fingerprints for input structures (by KLIFS ID). + """ + + def __init__(self): + + self.structure_klifs_ids = None + self.klifs_session = None + self.data = None + + @classmethod + def from_fingerprint_generator( + cls, fingerprint_generator, method="min_max", fine_grained=True + ): + """ + Normalize fingerprints. + + Parameters + ---------- + method : str + Normalization method. 
+ fine_grained : bool + True (default): + Distances: Calculate min/max per subpocket for each residue position individually. + Moments: Calculate min/max per moment for each subpocket individually. + False: + Distances: Calculate min/max per subpocket over all residue positions. + Moments: Calculate min/max per moment over all subpockets. + + Returns + ------- + kissim.encoding.FingerprintGeneratorNormalized + Normalized fingerprints. + """ + + fingerprint_generator_normalized = cls() + fingerprint_generator_normalized.structure_klifs_ids = ( + fingerprint_generator.structure_klifs_ids + ) + fingerprint_generator_normalized.klifs_session = fingerprint_generator.klifs_session + fingerprint_generator_normalized.data = fingerprint_generator_normalized._normalize( + fingerprint_generator.data, method=method, fine_grained=fine_grained + ) + + return fingerprint_generator_normalized + + def _normalize(self, fingerprint_generator_data, method, fine_grained): + """ + Normalize fingerprints. + + Parameters + ---------- + fingerprint_generator_data : dict + Fingerprints (values) by fingerprint IDs (keys). + method : str + Normalization method. + fine_grained : bool + True (default): + Distances: Calculate min/max per subpocket for each residue position individually. + Moments: Calculate min/max per moment for each subpocket individually. + False: + Distances: Calculate min/max per subpocket over all residue positions. + Moments: Calculate min/max per moment over all subpockets. + + Returns + ------- + dict + Normalized fingerprints (values) by fingerprint IDs (keys). + """ + + if method == NORMALIZATION_METHODS[0]: + return self._normalize_min_max(fingerprint_generator_data, fine_grained) + else: + raise KeyError( + f"Normalization type unknown. Please choose from {', '.join(NORMALIZATION_METHODS)}" + ) + + def _normalize_min_max(self, fingerprint_generator_data, fine_grained=True): + """ + Normalize fingerprints in fingerprint generator with min-max normalization. 
+ Minimum and maximum values are the minimum and maximum values seen per feature in all available fingerprints. + + Parameters + ---------- + dict + Fingerprints (values) by fingerprint IDs (keys). + + Returns + ------- + dict + Normalized fingerprints (values) by fingerprint ID (keys). + """ + return { + key: FingerprintNormalized.from_fingerprint(value, fine_grained) + for key, value in fingerprint_generator_data.items() + } diff --git a/kissim/encoding/fingerprint_normalized.py b/kissim/encoding/fingerprint_normalized.py index 6eb3b969..0a4675c1 100644 --- a/kissim/encoding/fingerprint_normalized.py +++ b/kissim/encoding/fingerprint_normalized.py @@ -6,9 +6,8 @@ import logging -import numpy as np - from kissim.definitions import DISTANCE_CUTOFFS, MOMENT_CUTOFFS, DISCRETE_FEATURE_VALUES +from kissim.utils import min_max_normalization_vector from kissim.encoding import FingerprintBase logger = logging.getLogger(__name__) @@ -16,7 +15,7 @@ class FingerprintNormalized(FingerprintBase): @classmethod - def from_fingerprint(cls, fingerprint): + def from_fingerprint(cls, fingerprint, fine_grained=True): """ Normalize fingerprint. @@ -36,12 +35,13 @@ def from_fingerprint(cls, fingerprint): fingerprint_normalized.kinase_name = fingerprint.kinase_name fingerprint_normalized.residue_ids = fingerprint.residue_ids fingerprint_normalized.residue_ixs = fingerprint.residue_ixs - - fingerprint_normalized._normalize(fingerprint) + fingerprint_normalized.values_dict = fingerprint_normalized._normalize( + fingerprint, fine_grained + ) return fingerprint_normalized - def _normalize(self, fingerprint): + def _normalize(self, fingerprint, fine_grained): """ Normalize the fingerprint (set as values_dict attribute in FingerprintNormalized class). 
@@ -58,21 +58,21 @@ def _normalize(self, fingerprint): ) values_dict_normalized["spatial"] = {} values_dict_normalized["spatial"]["distances"] = self._normalize_distances_bits( - fingerprint.values_dict["spatial"]["distances"] + fingerprint.values_dict["spatial"]["distances"], fine_grained ) values_dict_normalized["spatial"]["moments"] = self._normalize_moments_bits( - fingerprint.values_dict["spatial"]["moments"] + fingerprint.values_dict["spatial"]["moments"], fine_grained ) - self.values_dict = values_dict_normalized + return values_dict_normalized - def _normalize_physicochemical_bits(self, values): + def _normalize_physicochemical_bits(self, values_dict): """ Normalize physicochemical bits. Parameters ---------- - values : dict of list of float + values_dict : dict of list of float Physicochemical bits. Returns @@ -81,85 +81,28 @@ def _normalize_physicochemical_bits(self, values): Normalized physicochemical bits. """ - values_normalized = {} + values_normalized_dict = {} - if values is not None: - values_normalized["size"] = [ - self._min_max_normalization( - value, - min(DISCRETE_FEATURE_VALUES["size"]), - max(DISCRETE_FEATURE_VALUES["size"]), - ) - for value in values["size"] - ] - values_normalized["hbd"] = [ - self._min_max_normalization( - value, - min(DISCRETE_FEATURE_VALUES["hbd"]), - max(DISCRETE_FEATURE_VALUES["hbd"]), - ) - for value in values["hbd"] - ] - values_normalized["hba"] = [ - self._min_max_normalization( - value, - min(DISCRETE_FEATURE_VALUES["hba"]), - max(DISCRETE_FEATURE_VALUES["hba"]), - ) - for value in values["hba"] - ] - values_normalized["charge"] = [ - self._min_max_normalization( - value, - min(DISCRETE_FEATURE_VALUES["charge"]), - max(DISCRETE_FEATURE_VALUES["charge"]), - ) - for value in values["charge"] - ] - values_normalized["aromatic"] = [ - self._min_max_normalization( - value, - min(DISCRETE_FEATURE_VALUES["aromatic"]), - max(DISCRETE_FEATURE_VALUES["aromatic"]), - ) - for value in values["aromatic"] - ] - 
values_normalized["aliphatic"] = [ - self._min_max_normalization( - value, - min(DISCRETE_FEATURE_VALUES["aliphatic"]), - max(DISCRETE_FEATURE_VALUES["aliphatic"]), - ) - for value in values["aliphatic"] - ] - values_normalized["sco"] = [ - self._min_max_normalization( - value, - min(DISCRETE_FEATURE_VALUES["sco"]), - max(DISCRETE_FEATURE_VALUES["sco"]), - ) - for value in values["sco"] - ] - values_normalized["exposure"] = [ - self._min_max_normalization( - value, - min(DISCRETE_FEATURE_VALUES["exposure"]), - max(DISCRETE_FEATURE_VALUES["exposure"]), - ) - for value in values["exposure"] - ] - return values_normalized + if values_dict is not None: + for feature_name, values in values_dict.items(): + if feature_name in DISCRETE_FEATURE_VALUES.keys(): + values_normalized_dict[feature_name] = min_max_normalization_vector( + values, + min(DISCRETE_FEATURE_VALUES[feature_name]), + max(DISCRETE_FEATURE_VALUES[feature_name]), + ) + return values_normalized_dict else: return None - def _normalize_distances_bits(self, values): + def _normalize_distances_bits(self, values_dict, fine_grained): """ Normalize distances bits (using cutoffs defined for each subpocket). Parameters ---------- - values : dict of list of float + values_dict : dict of list of float Distances bits. Returns @@ -168,24 +111,26 @@ def _normalize_distances_bits(self, values): Normalized distances bits. 
""" - values_normalized = {} + if fine_grained: + cutoffs = DISTANCE_CUTOFFS["fine"] + else: + cutoffs = DISTANCE_CUTOFFS["coarse"] - if values is not None: - for subpocket_name, distances in values.items(): - values_normalized[subpocket_name] = [ - self._min_max_normalization( - distance, - DISTANCE_CUTOFFS[subpocket_name][0], - DISTANCE_CUTOFFS[subpocket_name][1], - ) - for distance in distances - ] - return values_normalized + values_normalized_dict = {} + + if values_dict is not None: + for subpocket_name, values in values_dict.items(): + values_normalized_dict[subpocket_name] = min_max_normalization_vector( + values, + cutoffs.loc[(subpocket_name, "min"), :].to_list(), + cutoffs.loc[(subpocket_name, "max"), :].to_list(), + ) + return values_normalized_dict else: return None - def _normalize_moments_bits(self, values): + def _normalize_moments_bits(self, values_dict, fine_grained): """ Normalize moments bits (using cutoffs defined for each moment). @@ -200,47 +145,32 @@ def _normalize_moments_bits(self, values): Normalized moments bits. """ - values_normalized = {} - - if values is not None: - for subpocket_name, moments in values.items(): - values_normalized[subpocket_name] = [ - self._min_max_normalization( - moment, MOMENT_CUTOFFS[i + 1][0], MOMENT_CUTOFFS[i + 1][1] - ) - for i, moment in enumerate(values[subpocket_name]) - ] - return values_normalized - + if fine_grained: + cutoffs = MOMENT_CUTOFFS["fine"] else: - return None - - @staticmethod - def _min_max_normalization(value, minimum, maximum): - """ - Normalize a value using minimum-maximum normalization. - Values equal or lower / greater than the minimum / maximum value are set to 0.0 / 1.0. - - Parameters - ---------- - value : float or int - Value to be normalized. - minimum : float or int - Minimum value for normalization, values equal/greater than this minimum are set to 0.0. - maximum : float or int - Maximum value for normalization, values equal/greater than this maximum are set to 1.0. 
- - Returns - ------- - float - Normalized value. - """ + cutoffs = MOMENT_CUTOFFS["coarse"] + + values_normalized_dict = {} + + if values_dict is not None: + for subpocket_name, values in values_dict.items(): + + # This is truly ugly! + if fine_grained: + minimum = cutoffs[cutoffs.index.get_level_values("min_max") == "min"][ + subpocket_name + ] + maximum = cutoffs[cutoffs.index.get_level_values("min_max") == "max"][ + subpocket_name + ] + else: + minimum = cutoffs[cutoffs.index.get_level_values("min_max") == "min"] + maximum = cutoffs[cutoffs.index.get_level_values("min_max") == "max"] + + values_normalized_dict[subpocket_name] = min_max_normalization_vector( + values, minimum.squeeze().to_list(), maximum.squeeze().to_list() + ) + return values_normalized_dict - if np.isnan(value): - return np.nan - elif minimum < value < maximum: - return (value - minimum) / float(maximum - minimum) - elif value <= minimum: - return 0.0 else: - return 1.0 + return None diff --git a/kissim/tests/encoding/test_fingerprint_generator.py b/kissim/tests/encoding/test_fingerprint_generator.py index 36c07828..04a2fbce 100644 --- a/kissim/tests/encoding/test_fingerprint_generator.py +++ b/kissim/tests/encoding/test_fingerprint_generator.py @@ -10,12 +10,8 @@ from opencadd.databases.klifs import setup_local, setup_remote from kissim.utils import enter_temp_directory -from kissim.encoding import Fingerprint, FingerprintNormalized, FingerprintGenerator -from kissim.schema import ( - FEATURE_NAMES_PHYSICOCHEMICAL, - FEATURE_NAMES_PHYSICOCHEMICAL_DICT, - FEATURE_NAMES_DISTANCES_AND_MOMENTS, -) +from kissim.encoding import Fingerprint, FingerprintGenerator +from kissim.schema import FEATURE_NAMES_PHYSICOCHEMICAL_DICT, FEATURE_NAMES_DISTANCES_AND_MOMENTS PATH_TEST_DATA = Path(__name__).parent / "kissim" / "tests" / "data" REMOTE = setup_remote() @@ -64,18 +60,13 @@ def test_from_structure_klifs_id( fingerprints_values_array_sum_calculated = sum( [ np.nansum(fingerprint.values_array(True, True, 
True)) - for structure_klifs_id, fingerprint in fingerprints.data.items() + for _, fingerprint in fingerprints.data.items() ] ) assert ( pytest.approx(fingerprints_values_array_sum_calculated, abs=1e-4) == fingerprints_values_array_sum ) - # Attribute: data_normalized - assert isinstance(fingerprints.data_normalized, dict) - for key, value in fingerprints.data_normalized.items(): - assert isinstance(key, int) - assert isinstance(value, FingerprintNormalized) # Property: subpocket_centers assert isinstance(fingerprints.subpocket_centers, pd.DataFrame) @@ -95,10 +86,10 @@ def test_from_structure_klifs_id( ) @pytest.mark.parametrize( - "structure_klifs_ids, normalize, values_array_sum", - [([110, 118], False, 10148.4256), ([110, 118], True, 10148.4256)], + "structure_klifs_ids, values_array_sum", + [([110, 118], 10148.4256), ([110, 118], 10148.4256)], ) - def test_to_from_json(self, structure_klifs_ids, normalize, values_array_sum): + def test_to_from_json(self, structure_klifs_ids, values_array_sum): """ Test if saving/loading a fingerprint to/from a json file. 
""" @@ -113,30 +104,25 @@ def test_to_from_json(self, structure_klifs_ids, normalize, values_array_sum): assert json_filepath.exists() # Load json file - fingerprints_reloaded = FingerprintGenerator.from_json(json_filepath, normalize) + fingerprints_reloaded = FingerprintGenerator.from_json(json_filepath) assert isinstance(fingerprints_reloaded, FingerprintGenerator) # Attribute data assert list(fingerprints.data.keys()) == list(fingerprints_reloaded.data.keys()) - if normalize: - assert list(fingerprints.data_normalized.keys()) == list( - fingerprints_reloaded.data_normalized.keys() - ) - else: - assert fingerprints_reloaded.data_normalized is None + values_array_sum_calculated = sum( [ np.nansum(fingerprint.values_array(True, True, True)) - for structure_klifs_id, fingerprint in fingerprints_reloaded.data.items() + for _, fingerprint in fingerprints_reloaded.data.items() ] ) assert pytest.approx(values_array_sum_calculated, abs=1e-4) == values_array_sum @pytest.mark.parametrize( - "structure_klifs_ids, normalized", - [([110, 118], True), ([110, 118], False)], + "structure_klifs_ids", + [([110, 118]), ([110, 118])], ) - def test_physicochemical_distances_moments(self, structure_klifs_ids, normalized): + def test_physicochemical_distances_moments(self, structure_klifs_ids): """ Test feature group extraction methods. 
@@ -148,29 +134,26 @@ def test_physicochemical_distances_moments(self, structure_klifs_ids, normalized fingerprints = FingerprintGenerator.from_structure_klifs_ids(structure_klifs_ids) - physicochemical = fingerprints.physicochemical(normalized) + physicochemical = fingerprints.physicochemical assert physicochemical.index.to_list() == structure_klifs_ids - if normalized: - assert physicochemical.columns.to_list() == FEATURE_NAMES_PHYSICOCHEMICAL - else: - assert physicochemical.columns.to_list() == FEATURE_NAMES_PHYSICOCHEMICAL_DICT + assert physicochemical.columns.to_list() == FEATURE_NAMES_PHYSICOCHEMICAL_DICT assert isinstance(physicochemical.iloc[0, 0], list) - distances = fingerprints.distances(normalized) + distances = fingerprints.distances assert distances.index.to_list() == structure_klifs_ids assert distances.columns.to_list() == FEATURE_NAMES_DISTANCES_AND_MOMENTS assert isinstance(distances.iloc[0, 0], list) - moments = fingerprints.moments(normalized) + moments = fingerprints.moments assert moments.index.to_list() == structure_klifs_ids assert moments.columns.to_list() == FEATURE_NAMES_DISTANCES_AND_MOMENTS assert isinstance(moments.iloc[0, 0], list) @pytest.mark.parametrize( - "structure_klifs_ids, normalized", - [([110, 118], True), ([110, 118], False)], + "structure_klifs_ids", + [([110, 118]), ([110, 118])], ) - def test_physicochemical_distances_moments_exploded(self, structure_klifs_ids, normalized): + def test_physicochemical_distances_moments_exploded(self, structure_klifs_ids): """ Test feature group extraction methods. 
@@ -191,18 +174,15 @@ def _index_structure_klifs_id(multiplier): index_residue_ix = list(range(1, 86)) * len(structure_klifs_ids) index_moment = list(range(1, 4)) * len(structure_klifs_ids) - physicochemical = fingerprints.physicochemical_exploded(normalized) + physicochemical = fingerprints.physicochemical_exploded assert physicochemical.index.get_level_values( "structure_klifs_id" ).to_list() == _index_structure_klifs_id(85) assert physicochemical.index.get_level_values("residue_ix").to_list() == index_residue_ix - if normalized: - assert physicochemical.columns.to_list() == FEATURE_NAMES_PHYSICOCHEMICAL - else: - assert physicochemical.columns.to_list() == FEATURE_NAMES_PHYSICOCHEMICAL_DICT + assert physicochemical.columns.to_list() == FEATURE_NAMES_PHYSICOCHEMICAL_DICT assert physicochemical.dtypes.unique() == "float64" - distances = fingerprints.distances_exploded(normalized) + distances = fingerprints.distances_exploded assert distances.index.get_level_values( "structure_klifs_id" ).to_list() == _index_structure_klifs_id(85) @@ -210,7 +190,7 @@ def _index_structure_klifs_id(multiplier): assert distances.columns.to_list() == FEATURE_NAMES_DISTANCES_AND_MOMENTS assert distances.dtypes.unique() == "float64" - moments = fingerprints.moments_exploded(normalized) + moments = fingerprints.moments_exploded assert moments.index.get_level_values( "structure_klifs_id" ).to_list() == _index_structure_klifs_id(3) diff --git a/kissim/tests/encoding/test_fingerprint_normalized.py b/kissim/tests/encoding/test_fingerprint_normalized.py index 92c161d1..d9d74655 100644 --- a/kissim/tests/encoding/test_fingerprint_normalized.py +++ b/kissim/tests/encoding/test_fingerprint_normalized.py @@ -8,7 +8,6 @@ import numpy as np from opencadd.databases.klifs import setup_local -from kissim.definitions import DISTANCE_CUTOFFS, MOMENT_CUTOFFS from kissim.encoding import Fingerprint, FingerprintNormalized PATH_TEST_DATA = Path(__name__).parent / "kissim" / "tests" / "data" @@ -79,36 +78,15 
@@ def test_normalize_physicochemical_bits(self, values, values_normalized): assert values_normalized_calculated == values_normalized @pytest.mark.parametrize( - "values, values_normalized", + "values, fine_grained, values_normalized", [ - (None, None), + (None, False, None), ( { - "hinge_region": [ - DISTANCE_CUTOFFS["hinge_region"][0] - 1, - DISTANCE_CUTOFFS["hinge_region"][0], - DISTANCE_CUTOFFS["hinge_region"][1], - DISTANCE_CUTOFFS["hinge_region"][1] + 1, - ], - "dfg_region": [ - DISTANCE_CUTOFFS["dfg_region"][0] - 1, - DISTANCE_CUTOFFS["dfg_region"][0], - DISTANCE_CUTOFFS["dfg_region"][1], - DISTANCE_CUTOFFS["dfg_region"][1] + 1, - ], - "front_pocket": [ - DISTANCE_CUTOFFS["front_pocket"][0] - 1, - DISTANCE_CUTOFFS["front_pocket"][0], - DISTANCE_CUTOFFS["front_pocket"][1], - DISTANCE_CUTOFFS["front_pocket"][1] + 1, - ], - "center": [ - DISTANCE_CUTOFFS["center"][0] - 1, - DISTANCE_CUTOFFS["center"][0], - DISTANCE_CUTOFFS["center"][1], - DISTANCE_CUTOFFS["center"][1] + 1, - ], + subpocket_name: [-100, -100, 100, 100] + for subpocket_name in ["hinge_region", "dfg_region", "front_pocket", "center"] }, + False, { "hinge_region": [0.0, 0.0, 1.0, 1.0], "dfg_region": [0.0, 0.0, 1.0, 1.0], @@ -118,90 +96,40 @@ def test_normalize_physicochemical_bits(self, values, values_normalized): ), ], ) - def test_normalize_distances_bits(self, values, values_normalized): + def test_normalize_distances_bits(self, values, fine_grained, values_normalized): """ Test normalization of distance bits. 
""" fingerprint_normalized = FingerprintNormalized() - values_normalized_calculated = fingerprint_normalized._normalize_distances_bits(values) + values_normalized_calculated = fingerprint_normalized._normalize_distances_bits( + values, fine_grained + ) assert values_normalized_calculated == values_normalized @pytest.mark.parametrize( - "values, values_normalized", + "values, fine_grained, values_normalized", [ - (None, None), - ( - { - "test": [ - MOMENT_CUTOFFS[1][0] - 1, - MOMENT_CUTOFFS[2][0] - 1, - MOMENT_CUTOFFS[3][0] - 1, - ], - }, - { - "test": [0.0, 0.0, 0.0], - }, - ), - ( - { - "test": [ - MOMENT_CUTOFFS[1][0], - MOMENT_CUTOFFS[2][0], - MOMENT_CUTOFFS[3][0], - ], - }, - { - "test": [0.0, 0.0, 0.0], - }, - ), + (None, False, None), ( - { - "test": [ - MOMENT_CUTOFFS[1][1], - MOMENT_CUTOFFS[2][1], - MOMENT_CUTOFFS[3][1], - ], - }, - { - "test": [1.0, 1.0, 1.0], - }, + {"test": [-100, -100, -100]}, + False, + {"test": [0.0, 0.0, 0.0]}, ), ( - { - "test": [ - MOMENT_CUTOFFS[1][1] + 1, - MOMENT_CUTOFFS[2][1] + 1, - MOMENT_CUTOFFS[3][1] + 1, - ], - }, - { - "test": [1.0, 1.0, 1.0], - }, + {"test": [100, 100, 100]}, + False, + {"test": [1.0, 1.0, 1.0]}, ), ], ) - def test_normalize_moments_bits(self, values, values_normalized): + def test_normalize_moments_bits(self, values, fine_grained, values_normalized): """ Test normalization of moments bits. 
""" fingerprint_normalized = FingerprintNormalized() - values_normalized_calculated = fingerprint_normalized._normalize_moments_bits(values) - assert values_normalized_calculated == values_normalized - - @pytest.mark.parametrize( - "value, minimum, maximum, value_normalized", - [(15, 10, 20, 0.5), (10, 10, 20, 0.0), (0, 10, 20, 0.0), (np.nan, 10, 20, np.nan)], - ) - def test_min_max_normalization(self, value, minimum, maximum, value_normalized): - """ - Test min-max normalization - """ - - fingerprint_normalized = FingerprintNormalized() - value_normalized_calculated = fingerprint_normalized._min_max_normalization( - value, minimum, maximum + values_normalized_calculated = fingerprint_normalized._normalize_moments_bits( + values, fine_grained ) - if not np.isnan(value): - assert pytest.approx(value_normalized_calculated, abs=1e-4) == value_normalized + assert values_normalized_calculated == values_normalized diff --git a/kissim/tests/test_utils.py b/kissim/tests/test_utils.py index ceb0b297..a3d5d6fa 100644 --- a/kissim/tests/test_utils.py +++ b/kissim/tests/test_utils.py @@ -5,7 +5,11 @@ import pytest import numpy as np -from kissim.utils import set_n_cores, calculate_first_second_third_moments +from kissim.utils import ( + set_n_cores, + calculate_first_second_third_moments, + min_max_normalization_scalar, +) @pytest.mark.parametrize( @@ -42,3 +46,17 @@ def test_calculate_first_second_third_moment(values, moments): assert pytest.approx(moments_calculated, abs=1e-6) == moments else: assert all(moments_calculated) == all(moments) + + +@pytest.mark.parametrize( + "value, minimum, maximum, value_normalized", + [(15, 10, 20, 0.5), (10, 10, 20, 0.0), (0, 10, 20, 0.0), (np.nan, 10, 20, np.nan)], +) +def test_min_max_normalization_scalar(value, minimum, maximum, value_normalized): + """ + Test min-max normalization + """ + + value_normalized_calculated = min_max_normalization_scalar(value, minimum, maximum) + if not np.isnan(value): + assert 
pytest.approx(value_normalized_calculated, abs=1e-4) == value_normalized diff --git a/kissim/utils.py b/kissim/utils.py index 9a8c18f4..8f60231d 100644 --- a/kissim/utils.py +++ b/kissim/utils.py @@ -12,6 +12,9 @@ from multiprocessing import cpu_count import numpy as np +from numpy.lib import isin +import pandas as pd +from sklearn.preprocessing import MinMaxScaler from scipy.special import cbrt from scipy.stats.stats import moment @@ -92,3 +95,137 @@ def calculate_first_second_third_moments( return moment1, moment2, moment3 else: return np.nan, np.nan, np.nan + + +def spatial_min_max_from_fingerprint_generator( + fingerprint_generator, feature="distances", fine_grained=True +): + """ + Calculate the minimum and maximum values from fingerprints data. + + Parameters + ---------- + feature : str + Choose `"distances"` or `"moments"` features. + fine_grained : bool + True (default): + Distances: Calculate min/max per subpocket for each residue position individually. + Moments: Calculate min/max per moment for each subpocket individually. + False: + Distances: Calculate min/max per subpocket over all residue positions. + Moments: Calculate min/max per moment over all subpockets. + + Returns + ------- + pandas.DataFrame + Distances: + For each subpocket save min/max (index) calculated over all residue positions or + per residue position (column(s)). + Moments: + For each moment save min/max (index) calculated over all subpockets or + per subpocket (column(s)). 
+ """ + + if feature == "distances": + fps_data = fingerprint_generator.distances_exploded() + index1_name = "subpocket" + if fine_grained: + column_names = fps_data.index.get_level_values(1).unique().to_list() + else: + column_names = ["any"] + elif feature == "moments": + fps_data = fingerprint_generator.moments_exploded().stack().unstack(1) + index1_name = "moment" + if fine_grained: + column_names = fps_data.index.get_level_values(1).unique().to_list() + else: + column_names = ["any"] + else: + raise KeyError(f"Unknown feature. Choose between distances and moments.") + + min_max = [] + for subpocket, data in fps_data.items(): + + data = data.to_frame() + if fine_grained: + data = data.unstack() + + data_min, data_max = data.min(), data.max() + + data_min = round(data_min, 2) + data_max = round(data_max, 2) + + min_max.append([subpocket, "min"] + data_min.tolist()) + min_max.append([subpocket, "max"] + data_max.tolist()) + + min_max = pd.DataFrame(min_max, columns=[index1_name, "min_max"] + column_names).set_index( + [index1_name, "min_max"] + ) + + return min_max + + +def min_max_normalization_vector(vector, minimum, maximum): + """ + Normalize vector, either based on a single minimum/maximum or based on a element-wise minimum + /maximum. + + Parameters + ---------- + vector : list of float + Vector to be normalized. 
+ minimum : int/float or list of int/float + Minimum value or vector (same length as `vector` and `maximum`) + maximum : int/float or list of int/float + Maximum value or vector (same length as `vector` and `minimum`) + + Returns + ------- + list of int/float + Normalized vector + """ + + if isinstance(minimum, (int, float)): + minimum = [minimum] + if isinstance(maximum, (int, float)): + maximum = [maximum] + + if len(minimum) == len(maximum) == 1: + return [min_max_normalization_scalar(v_i, minimum[0], maximum[0]) for v_i in vector] + elif len(minimum) == len(maximum) == len(vector) > 1: + return [ + min_max_normalization_scalar(v_i, min_i, max_i) + for v_i, min_i, max_i in zip(vector, minimum, maximum) + ] + else: + raise ValueError("Inputs do not match; please refer to docstring.") + + +def min_max_normalization_scalar(scalar, minimum, maximum): + """ + Normalize a value using minimum-maximum normalization. + Values equal or lower / greater than the minimum / maximum value are set to 0.0 / 1.0. + + Parameters + ---------- + scalar : float or int + Value to be normalized. + minimum : float or int + Minimum value for normalization, values equal/lower than this minimum are set to 0.0. + maximum : float or int + Maximum value for normalization, values equal/greater than this maximum are set to 1.0. + + Returns + ------- + float + Normalized value.
+ """ + + if np.isnan(scalar): + return np.nan + elif minimum < scalar < maximum: + return (scalar - minimum) / float(maximum - minimum) + elif scalar <= minimum: + return 0.0 + else: + return 1.0 diff --git a/kissim/viewer/base.py b/kissim/viewer/base.py index cd9a64c4..14a72764 100644 --- a/kissim/viewer/base.py +++ b/kissim/viewer/base.py @@ -126,8 +126,8 @@ def _fingerprints_features(self): return pd.concat( [ - self._fingerprints.physicochemical_exploded(), - self._fingerprints.distances_exploded(), + self._fingerprints.physicochemical_exploded, + self._fingerprints.distances_exploded, ] )[self._feature_names] From 0b11150e950315fc1eadb7c8449ad8738b918c80 Mon Sep 17 00:00:00 2001 From: Unknown Date: Fri, 3 Sep 2021 19:15:54 +0200 Subject: [PATCH 2/7] Satisfy pylint --- kissim/encoding/fingerprint_generator.py | 3 +-- kissim/utils.py | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/kissim/encoding/fingerprint_generator.py b/kissim/encoding/fingerprint_generator.py index 08059402..1d784f2f 100644 --- a/kissim/encoding/fingerprint_generator.py +++ b/kissim/encoding/fingerprint_generator.py @@ -11,10 +11,9 @@ from pathlib import Path from multiprocessing import Pool -import pandas as pd from opencadd.databases.klifs import setup_remote -from kissim.encoding import Fingerprint, FingerprintNormalized, FingerprintGeneratorBase +from kissim.encoding import Fingerprint, FingerprintGeneratorBase from kissim.utils import set_n_cores logger = logging.getLogger(__name__) diff --git a/kissim/utils.py b/kissim/utils.py index 8f60231d..71a9aa30 100644 --- a/kissim/utils.py +++ b/kissim/utils.py @@ -12,9 +12,7 @@ from multiprocessing import cpu_count import numpy as np -from numpy.lib import isin import pandas as pd -from sklearn.preprocessing import MinMaxScaler from scipy.special import cbrt from scipy.stats.stats import moment From a88bf7a03c2f296d4e7da1aae0f831bfbf485df1 Mon Sep 17 00:00:00 2001 From: Unknown Date: Fri, 3 Sep 2021 20:14:30 +0200 Subject: 
[PATCH 3/7] Add normalize API/CLI + tests --- kissim/api/__init__.py | 1 + kissim/api/normalize.py | 61 +++++++++++++++++++ kissim/cli/__init__.py | 1 + kissim/cli/main.py | 36 +++++++++++ kissim/cli/normalize.py | 22 +++++++ kissim/encoding/fingerprint_generator.py | 47 -------------- kissim/encoding/fingerprint_generator_base.py | 49 +++++++++++++++ kissim/tests/cli/test_main_normalize.py | 39 ++++++++++++ 8 files changed, 209 insertions(+), 47 deletions(-) create mode 100644 kissim/api/normalize.py create mode 100644 kissim/cli/normalize.py create mode 100644 kissim/tests/cli/test_main_normalize.py diff --git a/kissim/api/__init__.py b/kissim/api/__init__.py index ea42a9e6..dac3a390 100644 --- a/kissim/api/__init__.py +++ b/kissim/api/__init__.py @@ -3,6 +3,7 @@ """ from .encode import encode +from .normalize import normalize from .compare import compare from .weights import weights from .outliers import outliers diff --git a/kissim/api/normalize.py b/kissim/api/normalize.py new file mode 100644 index 00000000..fa671d91 --- /dev/null +++ b/kissim/api/normalize.py @@ -0,0 +1,61 @@ +""" +kissim.api.normalize + +Main API for normalizing fingerprints. +""" + +import logging +from pathlib import Path + +from kissim.encoding import FingerprintGenerator, FingerprintGeneratorNormalized + +logger = logging.getLogger(__name__) + + +def normalize( + fingerprints_path, method="min_max", fine_grained=True, fingerprints_normalized_path=None +): + """ + Normalize fingerprints. + + Parameters + ---------- + fingerprints_path : str or pathlib.Path + Path to fingerprints JSON file. + method : str + Normalization method. + fine_grained : bool + True (default): + Distances: Calculate min/max per subpocket for each residue position individually. + Moments: Calculate min/max per moment for each subpocket individually. + False: + Distances: Calculate min/max per subpocket over all residue positions.
+ Moments: Calculate min/max per moment over all subpockets. + fingerprints_normalized_path : str or pathlib.Path + Path to normalized fingerprints JSON file. + + Returns + ------- + kissim.encoding.FingerprintGeneratorNormalized + Normalized Fingerprints. + """ + + # Load fingerprints + logger.info("Read fingerprints...") + fingerprints_path = Path(fingerprints_path) + fingerprint_generator = FingerprintGenerator.from_json(fingerprints_path) + logger.info(f"Number of fingerprints: {len(fingerprint_generator.data)}") + + # Normalize fingerprints + logger.info("Normalize fingerprints...") + logger.info(f"Normalization method: {method}") + logger.info(f"Use fine-grained normalization: {fine_grained}") + fingerprint_generator_normalized = FingerprintGeneratorNormalized.from_fingerprint_generator( + fingerprint_generator, method, fine_grained + ) + if fingerprints_normalized_path is not None: + fingerprints_normalized_path = Path(fingerprints_normalized_path) + fingerprint_generator_normalized.to_json(fingerprints_normalized_path) + logger.info(f"Number of fingerprints: {len(fingerprint_generator_normalized.data)}") + + return fingerprint_generator_normalized diff --git a/kissim/cli/__init__.py b/kissim/cli/__init__.py index 98bcd662..e0d9133e 100644 --- a/kissim/cli/__init__.py +++ b/kissim/cli/__init__.py @@ -3,6 +3,7 @@ """ from .encode import encode_from_cli +from .normalize import normalize_from_cli from .compare import compare_from_cli from .weights import weights_from_cli from .outliers import outliers_from_cli diff --git a/kissim/cli/main.py b/kissim/cli/main.py index 6e2afe88..eb5007f5 100644 --- a/kissim/cli/main.py +++ b/kissim/cli/main.py @@ -15,6 +15,7 @@ from kissim.cli import ( encode_from_cli, + normalize_from_cli, compare_from_cli, weights_from_cli, outliers_from_cli, @@ -30,6 +31,7 @@ def main(): Sub-commands are: - encode + - normalize - compare - weights - outliers @@ -41,6 +43,7 @@ def main(): subparsers = parser.add_subparsers() encode_subparser =
subparsers.add_parser("encode") + normalize_subparser = subparsers.add_parser("normalize") compare_subparser = subparsers.add_parser("compare") weights_subparser = subparsers.add_parser("weights") outliers_subparser = subparsers.add_parser("outliers") @@ -80,6 +83,39 @@ def main(): ) encode_subparser.set_defaults(func=encode_from_cli) + # Arguments and function to be called for sub-command normalize + normalize_subparser.add_argument( + "-i", + "--input", + type=str, + help="Path to JSON file containing fingerprint data.", + required=True, + ) + normalize_subparser.add_argument( + "-o", + "--output", + type=str, + help="Path to JSON file containing normalized fingerprint data.", + required=True, + ) + normalize_subparser.add_argument( + "-m", + "--method", + type=str, + help="Normalization method.", + required=False, + default="min_max", + ) + normalize_subparser.add_argument( + "-f", + "--fine_grained", + action="store_true", + help="Use fine-grained normalization (min-max per residue/subpocket for distances/moments).", + required=False, + default=False, + ) + normalize_subparser.set_defaults(func=normalize_from_cli) + # Arguments and function to be called for sub-command compare compare_subparser.add_argument( "-i", diff --git a/kissim/cli/normalize.py b/kissim/cli/normalize.py new file mode 100644 index 00000000..48ef8555 --- /dev/null +++ b/kissim/cli/normalize.py @@ -0,0 +1,22 @@ +""" +kissim.cli.normalize + +Normalize fingerprints from CLI arguments. +""" + +from kissim.api import normalize +from kissim.cli.utils import configure_logger + + +def normalize_from_cli(args): + """ + Normalize fingerprints. + + Parameters + ---------- + args : argparse.Namespace + CLI arguments.
+ """ + + configure_logger(args.output) + normalize(args.input, args.method, bool(args.fine_grained), args.output) diff --git a/kissim/encoding/fingerprint_generator.py b/kissim/encoding/fingerprint_generator.py index 1d784f2f..bed220bc 100644 --- a/kissim/encoding/fingerprint_generator.py +++ b/kissim/encoding/fingerprint_generator.py @@ -6,9 +6,7 @@ import datetime from itertools import repeat -import json import logging -from pathlib import Path from multiprocessing import Pool from opencadd.databases.klifs import setup_remote @@ -72,51 +70,6 @@ def from_structure_klifs_ids(cls, structure_klifs_ids, klifs_session=None, n_cor return fingerprint_generator - @classmethod - def from_json(cls, filepath): - """ - Initialize a FingerprintGenerator object from a json file. - - Parameters - ---------- - filepath : str or pathlib.Path - Path to json file. - """ - - filepath = Path(filepath) - with open(filepath, "r") as f: - json_string = f.read() - fingerprints_list = json.loads(json_string) - - data = {} - for fingerprint_dict in fingerprints_list: - fingerprint = Fingerprint._from_dict(fingerprint_dict) - data[fingerprint.structure_klifs_id] = fingerprint - - fingerprint_generator = cls() - fingerprint_generator.data = data - fingerprint_generator.structure_klifs_ids = list(fingerprint_generator.data.keys()) - - return fingerprint_generator - - def to_json(self, filepath): - """ - Write FingerprintGenerator class attributes to a json file. - - Parameters - ---------- - filepath : str or pathlib.Path - Path to json file. - """ - - fingerprint_list = [ - fingerprint.__dict__ for structure_klifs_id, fingerprint in self.data.items() - ] - json_string = json.dumps(fingerprint_list) - filepath = Path(filepath) - with open(filepath, "w") as f: - f.write(json_string) - def _get_fingerprint_list(self, n_cores): """ Generate fingerprints. 
diff --git a/kissim/encoding/fingerprint_generator_base.py b/kissim/encoding/fingerprint_generator_base.py index 84bc0a88..bb2ca3d3 100644 --- a/kissim/encoding/fingerprint_generator_base.py +++ b/kissim/encoding/fingerprint_generator_base.py @@ -4,10 +4,14 @@ Defines the base kissim fingerprint generator. """ +import json import logging +from pathlib import Path import pandas as pd +from kissim.encoding import FingerprintBase + logger = logging.getLogger(__name__) @@ -199,3 +203,48 @@ def _feature_group_exploded(self, feature_group): ) features_exploded.index = multi_index return features_exploded + + @classmethod + def from_json(cls, filepath): + """ + Initialize a FingerprintGenerator object from a json file. + + Parameters + ---------- + filepath : str or pathlib.Path + Path to json file. + """ + + filepath = Path(filepath) + with open(filepath, "r") as f: + json_string = f.read() + fingerprints_list = json.loads(json_string) + + data = {} + for fingerprint_dict in fingerprints_list: + fingerprint = FingerprintBase._from_dict(fingerprint_dict) + data[fingerprint.structure_klifs_id] = fingerprint + + fingerprint_generator = cls() + fingerprint_generator.data = data + fingerprint_generator.structure_klifs_ids = list(fingerprint_generator.data.keys()) + + return fingerprint_generator + + def to_json(self, filepath): + """ + Write FingerprintGenerator class attributes to a json file. + + Parameters + ---------- + filepath : str or pathlib.Path + Path to json file. 
+ """ + + fingerprint_list = [ + fingerprint.__dict__ for structure_klifs_id, fingerprint in self.data.items() + ] + json_string = json.dumps(fingerprint_list) + filepath = Path(filepath) + with open(filepath, "w") as f: + f.write(json_string) diff --git a/kissim/tests/cli/test_main_normalize.py b/kissim/tests/cli/test_main_normalize.py new file mode 100644 index 00000000..82bd7caa --- /dev/null +++ b/kissim/tests/cli/test_main_normalize.py @@ -0,0 +1,39 @@ +""" +Unit and regression test for kissim's normalized CLI. +""" + +from pathlib import Path +import platform +import pytest +import subprocess + +from kissim.utils import enter_temp_directory + +PATH_TEST_DATA = Path(__name__).parent / "kissim" / "tests" / "data" +PATH_TEST_DATA = Path(__file__).parent / "../data" + + +@pytest.mark.parametrize( + "args", + [ + f"kissim normalize -i {(PATH_TEST_DATA / 'fingerprints_test.json').absolute()} -o fingerprints_normalized_test.json", + f"kissim normalize -i {(PATH_TEST_DATA / 'fingerprints_test.json').absolute()} -o fingerprints_normalized_test.json -f", + f"kissim normalize -i {(PATH_TEST_DATA / 'fingerprints_test.json').absolute()} -o fingerprints_normalized_test.json -m min_max", + ], +) +def test_main_normalize(args): + """ + Test CLI for normalize using subprocesses. + """ + + output = Path("fingerprints_normalized_test.json") + args = args.split() + + with enter_temp_directory(): + subprocess.run(args, check=True) + + # Json file there? + assert output.exists() + # Log file there? 
+ if platform.system() != "Windows": + assert Path(f"{output.stem}.log").exists() From 02069bd3b72ff147f53cf192b53aa7f1eb7718d1 Mon Sep 17 00:00:00 2001 From: Unknown Date: Fri, 3 Sep 2021 21:19:11 +0200 Subject: [PATCH 4/7] Update tests --- kissim/tests/cli/test_main_encode.py | 4 ++-- kissim/tests/encoding/test_fingerprint_generator.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kissim/tests/cli/test_main_encode.py b/kissim/tests/cli/test_main_encode.py index a2a5d020..f090ee57 100644 --- a/kissim/tests/cli/test_main_encode.py +++ b/kissim/tests/cli/test_main_encode.py @@ -8,7 +8,7 @@ import subprocess from kissim.utils import enter_temp_directory -from kissim.encoding import Fingerprint, FingerprintGenerator +from kissim.encoding import FingerprintBase, FingerprintGenerator PATH_TEST_DATA = Path(__name__).parent / "kissim" / "tests" / "data" @@ -43,7 +43,7 @@ def test_main_encode(args): # Json file can be loaded as FingerprintGenerator object? fingerprint_generator = FingerprintGenerator.from_json(output) assert isinstance(fingerprint_generator, FingerprintGenerator) - assert isinstance(list(fingerprint_generator.data.values())[0], Fingerprint) + assert isinstance(list(fingerprint_generator.data.values())[0], FingerprintBase) @pytest.mark.parametrize( diff --git a/kissim/tests/encoding/test_fingerprint_generator.py b/kissim/tests/encoding/test_fingerprint_generator.py index 04a2fbce..1f083cdf 100644 --- a/kissim/tests/encoding/test_fingerprint_generator.py +++ b/kissim/tests/encoding/test_fingerprint_generator.py @@ -20,7 +20,7 @@ class TestFingerprintGenerator: """ - Test common functionalities in the PocketBioPython and PocketDataFrame classes. + Test fingerprints class. 
""" @pytest.mark.parametrize( From f00560ad0b0fb012c063da3a1caeeb443f37b5c1 Mon Sep 17 00:00:00 2001 From: Unknown Date: Fri, 3 Sep 2021 21:44:05 +0200 Subject: [PATCH 5/7] Add FingerprintGeneratorNormalized tests --- .../test_fingerprint_generator_normalized.py | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 kissim/tests/encoding/test_fingerprint_generator_normalized.py diff --git a/kissim/tests/encoding/test_fingerprint_generator_normalized.py b/kissim/tests/encoding/test_fingerprint_generator_normalized.py new file mode 100644 index 00000000..613bc655 --- /dev/null +++ b/kissim/tests/encoding/test_fingerprint_generator_normalized.py @@ -0,0 +1,58 @@ +""" +Unit and regression test for kissim.encoding.FingerprintGeneratorNormalized. +""" + +import pytest +import numpy as np + +from kissim.encoding import FingerprintGeneratorNormalized + + +class TestFingerprintGeneratorNormalized: + """ + Test normalized fingerprints class. + """ + + @pytest.mark.parametrize( + "method, fine_grained, structure_klifs_id, fingerprint_values_array_sum, fingerprint_normalized_values_array_sum", + [ + ("min_max", True, 109, 5108.226235, 409.057707), + ("min_max", False, 109, 5108.226235, 398.950979), + ], + ) + def test_from_fingerprint_generator( + self, + fingerprint_generator, + method, + fine_grained, + structure_klifs_id, + fingerprint_values_array_sum, + fingerprint_normalized_values_array_sum, + ): + """ + Test for the first fingerprint in the template fingerprints if the sum of unnormalized and + normalized fingerprint values is correct. 
+ """ + + fingerprint_generator_normalized = ( + FingerprintGeneratorNormalized.from_fingerprint_generator( + fingerprint_generator, method, fine_grained + ) + ) + + fingerprint = fingerprint_generator.data[structure_klifs_id] + fingerprint_normalized = fingerprint_generator_normalized.data[structure_klifs_id] + + fingerprint_values_array_sum_calculated = np.nansum(fingerprint.values_array()) + assert ( + pytest.approx(fingerprint_values_array_sum_calculated, abs=1e-6) + == fingerprint_values_array_sum + ) + + fingerprint_normalized_values_array_sum_calculated = np.nansum( + fingerprint_normalized.values_array() + ) + assert ( + pytest.approx(fingerprint_normalized_values_array_sum_calculated, abs=1e-6) + == fingerprint_normalized_values_array_sum + ) From 550fa387a63270a0c1a865e380d87f09954e6912 Mon Sep 17 00:00:00 2001 From: Unknown Date: Fri, 3 Sep 2021 22:11:19 +0200 Subject: [PATCH 6/7] Attempt to fix CI --- kissim/tests/cli/test_main_normalize.py | 2 +- kissim/tests/cli/test_main_outliers.py | 2 +- kissim/utils.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kissim/tests/cli/test_main_normalize.py b/kissim/tests/cli/test_main_normalize.py index 82bd7caa..98e7d435 100644 --- a/kissim/tests/cli/test_main_normalize.py +++ b/kissim/tests/cli/test_main_normalize.py @@ -10,7 +10,7 @@ from kissim.utils import enter_temp_directory PATH_TEST_DATA = Path(__name__).parent / "kissim" / "tests" / "data" -PATH_TEST_DATA = Path(__file__).parent / "../data" +# PATH_TEST_DATA = Path(__file__).parent / "../data" @pytest.mark.parametrize( diff --git a/kissim/tests/cli/test_main_outliers.py b/kissim/tests/cli/test_main_outliers.py index 3fea6ec6..22eff059 100644 --- a/kissim/tests/cli/test_main_outliers.py +++ b/kissim/tests/cli/test_main_outliers.py @@ -10,7 +10,7 @@ from kissim.utils import enter_temp_directory PATH_TEST_DATA = Path(__name__).parent / "kissim" / "tests" / "data" -PATH_TEST_DATA = Path(__file__).parent / "../data" +# PATH_TEST_DATA = 
Path(__file__).parent / "../data" @pytest.mark.parametrize( diff --git a/kissim/utils.py b/kissim/utils.py index 71a9aa30..de126e23 100644 --- a/kissim/utils.py +++ b/kissim/utils.py @@ -125,14 +125,14 @@ def spatial_min_max_from_fingerprint_generator( """ if feature == "distances": - fps_data = fingerprint_generator.distances_exploded() + fps_data = fingerprint_generator.distances_exploded index1_name = "subpocket" if fine_grained: column_names = fps_data.index.get_level_values(1).unique().to_list() else: column_names = ["any"] elif feature == "moments": - fps_data = fingerprint_generator.moments_exploded().stack().unstack(1) + fps_data = fingerprint_generator.moments_exploded.stack().unstack(1) index1_name = "moment" if fine_grained: column_names = fps_data.index.get_level_values(1).unique().to_list() From 0472841f39a5015f61c50328667699e0766fff45 Mon Sep 17 00:00:00 2001 From: Unknown Date: Sat, 4 Sep 2021 08:23:30 +0200 Subject: [PATCH 7/7] Update test --- .../encoding/test_fingerprint_generator_normalized.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kissim/tests/encoding/test_fingerprint_generator_normalized.py b/kissim/tests/encoding/test_fingerprint_generator_normalized.py index 613bc655..ae170284 100644 --- a/kissim/tests/encoding/test_fingerprint_generator_normalized.py +++ b/kissim/tests/encoding/test_fingerprint_generator_normalized.py @@ -16,8 +16,8 @@ class TestFingerprintGeneratorNormalized: @pytest.mark.parametrize( "method, fine_grained, structure_klifs_id, fingerprint_values_array_sum, fingerprint_normalized_values_array_sum", [ - ("min_max", True, 109, 5108.226235, 409.057707), - ("min_max", False, 109, 5108.226235, 398.950979), + ("min_max", True, 109, 5108.2262, 409.0577), + ("min_max", False, 109, 5108.2262, 398.9509), ], ) def test_from_fingerprint_generator( @@ -45,7 +45,7 @@ def test_from_fingerprint_generator( fingerprint_values_array_sum_calculated = np.nansum(fingerprint.values_array()) assert ( - 
pytest.approx(fingerprint_values_array_sum_calculated, abs=1e-6) + pytest.approx(fingerprint_values_array_sum_calculated, abs=1e-4) == fingerprint_values_array_sum ) @@ -53,6 +53,6 @@ def test_from_fingerprint_generator( fingerprint_normalized.values_array() ) assert ( - pytest.approx(fingerprint_normalized_values_array_sum_calculated, abs=1e-6) + pytest.approx(fingerprint_normalized_values_array_sum_calculated, abs=1e-4) == fingerprint_normalized_values_array_sum )