Skip to content

Commit

Permalink
Merge pull request #89 from volkamerlab/normalize-fps
Browse files Browse the repository at this point in the history
Implement min-max normalization (coarse/fine)
  • Loading branch information
dominiquesydow committed Sep 4, 2021
2 parents d2c9991 + 0472841 commit 29481ce
Show file tree
Hide file tree
Showing 26 changed files with 905 additions and 605 deletions.
1 change: 1 addition & 0 deletions kissim/api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

from .encode import encode
from .normalize import normalize
from .compare import compare
from .weights import weights
from .outliers import outliers
Expand Down
61 changes: 61 additions & 0 deletions kissim/api/normalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""
kissim.api.normalize
Main API for normalizing fingerprints.
"""

import logging
from pathlib import Path

from kissim.encoding import FingerprintGenerator, FingerprintGeneratorNormalized

logger = logging.getLogger(__name__)


def normalize(
    fingerprints_path, method="min_max", fine_grained=True, fingerprints_normalized_path=None
):
    """
    Normalize fingerprints and optionally save the normalized fingerprints to a JSON file.

    Parameters
    ----------
    fingerprints_path : str or pathlib.Path
        Path to fingerprints JSON file.
    method : str
        Normalization method (default: "min_max").
    fine_grained : bool
        True (default):
            Distances: Calculate min/max per subpocket for each residue position individually.
            Moments: Calculate min/max per moment for each subpocket individually.
        False:
            Distances: Calculate min/max per subpocket over all residue positions.
            Moments: Calculate min/max per moment over all subpockets.
    fingerprints_normalized_path : str or pathlib.Path or None
        Path to normalized fingerprints JSON file. If None (default), no file is written.

    Returns
    -------
    kissim.encoding.FingerprintGeneratorNormalized
        Normalized fingerprints (as returned by
        `FingerprintGeneratorNormalized.from_fingerprint_generator`).
    """

    # Load fingerprints
    logger.info("Read fingerprints...")
    fingerprints_path = Path(fingerprints_path)
    fingerprint_generator = FingerprintGenerator.from_json(fingerprints_path)
    logger.info(f"Number of fingerprints: {len(fingerprint_generator.data)}")

    # Normalize fingerprints
    logger.info("Normalize fingerprints...")
    logger.info(f"Normalization method: {method}")
    logger.info(f"Use fine-grained normalization: {fine_grained}")
    fingerprint_generator_normalized = FingerprintGeneratorNormalized.from_fingerprint_generator(
        fingerprint_generator, method, fine_grained
    )

    # Optionally: Save normalized fingerprints to file
    if fingerprints_normalized_path is not None:
        fingerprints_normalized_path = Path(fingerprints_normalized_path)
        fingerprint_generator_normalized.to_json(fingerprints_normalized_path)

    # Report the fingerprint count after normalization, whether or not a file was written
    logger.info(f"Number of fingerprints: {len(fingerprint_generator_normalized.data)}")

    return fingerprint_generator_normalized
27 changes: 0 additions & 27 deletions kissim/api/subset.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,6 @@ def _subset_fingerprint_generator(fingerprint_generator, klifs_residue_ids):
fingerprint_generator_subset.data = _subset_fingerprint_generator_data(
fingerprint_generator, klifs_residue_ids
)
fingerprint_generator_subset.data_normalized = _subset_fingerprint_generator_data_normalized(
fingerprint_generator, fingerprint_generator_subset
)

return fingerprint_generator_subset

Expand Down Expand Up @@ -210,27 +207,3 @@ def _subset_fingerprint_generator_data(fingerprint_generator, klifs_residue_ids)
fingerprint_generator_data_subset[id_] = fp_subset

return fingerprint_generator_data_subset


def _subset_fingerprint_generator_data_normalized(
fingerprint_generator, fingerprint_generator_subset
):
"""
Normalize the input fingerprint subsets.
Attributes
----------
fingerprint_generator : kissim.encoding.FingerprintGenerator
Fingerprints.
fingerprint_generator_subset : kissim.encoding.FingerprintGenerator
Fingerprints subset.
Returns
-------
dict
Fingerprints with subset of residues only.
"""

# If fingerprint generator contains normalized fingerprints
if fingerprint_generator.data_normalized is not None:
return fingerprint_generator_subset._normalize_fingerprints()
1 change: 1 addition & 0 deletions kissim/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

from .encode import encode_from_cli
from .normalize import normalize_from_cli
from .compare import compare_from_cli
from .weights import weights_from_cli
from .outliers import outliers_from_cli
Expand Down
36 changes: 36 additions & 0 deletions kissim/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

from kissim.cli import (
encode_from_cli,
normalize_from_cli,
compare_from_cli,
weights_from_cli,
outliers_from_cli,
Expand All @@ -30,6 +31,7 @@ def main():
Sub-commands are:
- encode
- normalize
- compare
- weights
- outliers
Expand All @@ -41,6 +43,7 @@ def main():
subparsers = parser.add_subparsers()

encode_subparser = subparsers.add_parser("encode")
normalize_subparser = subparsers.add_parser("normalize")
compare_subparser = subparsers.add_parser("compare")
weights_subparser = subparsers.add_parser("weights")
outliers_subparser = subparsers.add_parser("outliers")
Expand Down Expand Up @@ -80,6 +83,39 @@ def main():
)
encode_subparser.set_defaults(func=encode_from_cli)

# Arguments and function to be called for sub-command normalize
normalize_subparser.add_argument(
"-i",
"--input",
type=str,
help="Path to JSON file containing fingerprint data.",
required=True,
)
normalize_subparser.add_argument(
"-o",
"--output",
type=str,
help="Path to JSON file containing normalized fingerprint data.",
required=True,
)
normalize_subparser.add_argument(
"-m",
"--method",
type=str,
help="Normalization method.",
required=False,
default="min_max",
)
normalize_subparser.add_argument(
"-f",
"--fine_grained",
action="store_true",
help="Use fine-grained normalization (min-max per residue/subpocket for distances/moments).",
required=False,
default=False,
)
normalize_subparser.set_defaults(func=normalize_from_cli)

# Arguments and function to be called for sub-command compare
compare_subparser.add_argument(
"-i",
Expand Down
22 changes: 22 additions & 0 deletions kissim/cli/normalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""
kissim.cli.normalize
Normalize fingerprints from CLI arguments.
"""

from kissim.api import normalize
from kissim.cli.utils import configure_logger


def normalize_from_cli(args):
    """
    Normalize fingerprints.

    Parameters
    ----------
    args : argparse.Namespace
        CLI arguments; the attributes `input`, `output`, `method` and `fine_grained` are read.
    """

    # NOTE(review): the output path is handed to the logger configuration, presumably to
    # derive the log file location - confirm in kissim.cli.utils.configure_logger
    configure_logger(args.output)
    # Pass by keyword so that an argument-order change cannot silently swap method and paths
    normalize(
        fingerprints_path=args.input,
        method=args.method,
        fine_grained=bool(args.fine_grained),
        fingerprints_normalized_path=args.output,
    )
9 changes: 9 additions & 0 deletions kissim/data/min_max_distances_coarse.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
subpocket,min_max,any
hinge_region,min,2.37
hinge_region,max,30.79
dfg_region,min,0.87
dfg_region,max,33.62
front_pocket,min,1.36
front_pocket,max,33.36
center,min,1.11
center,max,27.03
9 changes: 9 additions & 0 deletions kissim/data/min_max_distances_fine.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
subpocket,min_max,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85
hinge_region,min,11.52,9.3,7.83,6.55,5.66,6.86,7.71,6.37,8.84,7.14,6.25,7.1,6.89,5.15,2.37,5.07,6.08,9.83,11.16,12.18,9.73,12.85,12.86,9.08,9.69,11.36,11.29,9.52,11.18,12.21,11.11,10.48,9.68,10.0,6.5,5.4,4.58,6.25,7.5,7.07,9.78,8.37,7.16,4.95,3.25,3.53,2.69,4.82,5.28,9.24,7.7,9.3,10.71,9.63,11.32,11.75,16.63,18.63,14.66,16.17,15.69,19.15,19.78,18.17,17.34,16.98,15.05,13.61,14.33,12.49,13.12,13.15,11.43,9.35,8.0,8.08,7.64,9.19,6.56,5.17,5.98,5.24,7.21,4.34,5.96
hinge_region,max,23.62,20.68,18.57,18.8,20.51,19.77,19.17,18.26,17.01,16.03,12.37,12.79,14.27,8.53,6.23,8.29,11.01,13.56,16.08,30.79,27.93,24.99,23.72,23.31,21.24,21.71,23.29,25.5,24.68,23.65,20.26,19.07,22.95,19.57,16.31,13.7,11.93,10.56,12.02,12.9,14.56,13.52,10.59,8.03,6.0,6.16,6.94,9.82,12.34,12.85,14.02,14.75,17.47,19.59,21.55,24.16,23.94,27.83,27.15,30.31,28.44,27.9,27.14,26.34,24.67,22.55,22.6,20.71,21.55,21.26,18.24,17.88,17.13,15.15,12.46,13.23,12.19,14.76,11.45,9.07,12.25,14.45,24.17,30.0,24.42
dfg_region,min,16.23,14.04,12.53,6.09,8.57,5.47,4.55,3.92,5.08,7.66,8.13,10.06,10.56,12.08,9.26,7.88,5.54,6.8,4.58,2.01,2.86,3.3,4.85,2.64,3.62,4.66,7.12,6.03,6.21,9.48,10.75,11.61,11.97,10.64,7.56,6.51,7.74,6.7,9.24,8.1,6.1,5.33,2.95,6.35,7.74,10.96,11.01,11.57,14.32,16.37,15.1,13.65,16.1,16.96,17.74,16.88,21.7,22.6,17.13,12.89,10.94,14.4,15.4,12.19,10.16,10.29,9.35,8.98,9.92,9.42,11.43,10.78,13.67,10.73,10.05,11.71,12.53,11.48,8.19,5.57,4.18,2.3,1.12,0.87,1.68
dfg_region,max,30.33,27.63,24.45,22.83,21.77,22.91,25.38,23.47,19.85,18.67,17.56,20.22,21.3,18.73,15.42,12.98,10.98,12.88,13.56,21.63,18.26,15.91,15.21,17.43,16.35,16.71,19.99,21.29,20.95,21.2,19.46,22.08,25.42,23.36,19.71,17.63,16.97,14.47,16.63,15.21,15.61,11.54,10.84,13.71,14.09,17.22,18.3,19.88,22.9,24.04,24.27,22.34,23.75,26.23,27.74,29.72,30.02,33.58,33.62,28.7,26.96,25.28,25.46,22.78,21.72,18.7,19.09,17.5,18.97,18.47,20.73,22.21,23.28,20.99,18.01,19.22,19.56,22.45,17.2,13.75,12.26,10.42,18.96,26.47,22.51
front_pocket,min,10.92,7.52,5.74,4.05,2.38,2.24,2.28,3.82,4.04,7.36,5.68,7.7,7.9,9.6,7.31,8.93,7.61,10.78,11.79,13.0,11.99,14.37,13.48,10.44,11.79,13.75,12.94,12.42,13.6,15.7,14.26,13.98,13.99,11.99,9.92,7.91,10.41,9.92,12.06,11.5,14.31,12.76,10.69,10.66,8.66,8.9,6.06,6.63,6.81,4.89,4.18,3.3,6.26,8.24,8.45,7.61,12.08,13.36,10.53,15.61,14.63,18.02,19.4,18.72,16.58,16.04,14.75,11.14,11.83,8.62,8.67,7.75,5.7,4.07,2.7,5.41,4.64,7.4,6.79,4.27,3.05,1.76,2.37,1.64,1.36
front_pocket,max,23.34,19.68,18.01,19.84,20.9,19.48,17.04,17.51,15.22,13.11,13.32,15.09,17.34,14.29,11.16,13.4,12.49,15.14,17.26,32.93,30.28,27.96,27.35,25.56,24.07,24.92,26.58,28.1,26.59,26.02,23.58,22.91,22.81,20.01,16.48,14.74,14.51,16.72,18.22,18.42,20.2,17.36,15.06,14.19,11.79,12.62,12.84,12.59,15.5,13.27,15.98,12.98,13.68,16.23,17.69,19.57,20.37,25.21,24.23,33.36,30.61,29.49,27.84,26.72,25.15,21.76,21.44,18.21,17.92,16.9,14.04,13.56,12.38,10.03,8.25,9.14,9.23,12.23,12.41,10.56,13.21,14.06,20.05,23.84,21.09
center,min,15.17,12.05,10.22,8.05,6.51,6.02,5.15,5.19,8.39,9.58,7.72,9.47,10.25,9.36,7.16,7.87,6.76,10.38,10.67,10.43,9.44,10.84,9.59,6.44,7.53,8.72,8.48,7.42,8.28,10.36,9.39,9.03,9.46,7.84,5.71,4.08,5.49,6.29,9.32,8.6,10.79,9.77,7.55,7.4,5.52,6.94,6.25,7.4,9.94,10.17,9.44,8.74,10.15,12.42,13.19,12.16,16.43,18.15,14.43,12.06,11.65,14.74,15.76,14.3,12.37,12.19,10.42,8.63,9.64,8.36,8.84,8.07,9.14,5.68,5.46,6.5,6.84,5.61,1.88,1.11,1.24,1.75,2.5,2.97,3.29
center,max,26.03,22.76,20.23,20.8,21.68,21.14,19.48,20.33,18.81,17.58,16.44,18.23,19.64,13.57,10.86,12.69,12.27,15.2,16.28,25.95,23.32,20.81,20.31,21.64,17.82,18.63,20.24,21.73,20.56,20.22,17.67,17.3,20.18,17.08,13.65,12.19,10.88,12.01,13.64,15.27,17.86,15.02,12.87,11.51,9.31,10.95,12.01,13.39,16.92,15.66,18.47,16.43,18.35,20.46,22.63,24.25,23.81,26.89,26.8,27.03,24.93,23.83,22.55,21.54,19.84,17.4,17.05,15.36,15.64,15.21,13.15,13.95,14.11,12.01,8.89,10.33,11.69,13.93,11.41,8.22,7.39,9.32,20.52,26.75,20.19
7 changes: 7 additions & 0 deletions kissim/data/min_max_moments_coarse.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
moment,min_max,any
1,min,11.47
1,max,16.85
2,min,2.88
2,max,6.19
3,min,-2.29
3,max,6.31
7 changes: 7 additions & 0 deletions kissim/data/min_max_moments_fine.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
moment,min_max,hinge_region,dfg_region,front_pocket,center
1,min,12.2,12.72,12.33,11.47
1,max,13.85,16.85,14.34,12.97
2,min,4.06,3.91,3.34,2.88
2,max,5.76,6.17,6.19,4.48
3,min,-2.17,-1.12,-2.1,-2.29
3,max,5.16,6.31,5.97,4.15
15 changes: 8 additions & 7 deletions kissim/definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,13 +154,14 @@
# Distance and moment cutoffs used for fingerprint normalization
# Cutoffs defined in this notebook:
# https://github.com/volkamerlab/kissim_app/blob/master/notebooks/004_fingerprints/002_spatial_feature_cutoffs.ipynb
DISTANCE_CUTOFFS = {
"hinge_region": (2.0, 31.0),
"dfg_region": (0.0, 34.0),
"front_pocket": (1.0, 33.0),
"center": (1.0, 29.0),
}
MOMENT_CUTOFFS = {1: (11.0, 17.0), 2: (2.0, 7.0), 3: (-3.0, 7.0)}
DISTANCE_CUTOFFS = {}
MOMENT_CUTOFFS = {}
# Load the min/max cutoff tables for both normalization granularities; the first two CSV
# columns (subpocket or moment, followed by min_max) are used as row index.
for how in ("fine", "coarse"):
    DISTANCE_CUTOFFS[how] = pd.read_csv(
        PATH_DATA / f"min_max_distances_{how}.csv", index_col=[0, 1]
    )
    MOMENT_CUTOFFS[how] = pd.read_csv(
        PATH_DATA / f"min_max_moments_{how}.csv", index_col=[0, 1]
    )

# KLIFS pocket residue subsets by DFG conformation
with open(PATH_DATA / "klifs_pocket_residue_subset.json") as f:
Expand Down
4 changes: 3 additions & 1 deletion kissim/encoding/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
Encode kinase pockets as subpocket-based structural fingerprint.
"""

from .base import FingerprintBase
from .fingerprint_base import FingerprintBase
from .fingerprint import Fingerprint
from .fingerprint_normalized import FingerprintNormalized
from .fingerprint_generator_base import FingerprintGeneratorBase
from .fingerprint_generator import FingerprintGenerator
from .fingerprint_generator_normalized import FingerprintGeneratorNormalized
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
kissim.encoding.fingerprint
kissim.encoding.fingerprint_base
Defines the kissim fingerprint.
Defines the base kissim fingerprint.
"""

import json
Expand Down
Loading

0 comments on commit 29481ce

Please sign in to comment.