Skip to content

Commit

Permalink
expand docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
florian-huber committed Aug 11, 2023
1 parent da5c383 commit b2871cf
Showing 1 changed file with 33 additions and 5 deletions.
38 changes: 33 additions & 5 deletions ms2deepscore/spectrum_pair_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

def jaccard_similarity_matrix_cherrypicking(
fingerprints: np.ndarray,
selections_bins: np.ndarray = np.array([(x/10, x/10 + 0.1) for x in range(0, 10)]),
selection_bins: np.ndarray = np.array([(x/10, x/10 + 0.1) for x in range(0, 10)]),
max_pairs_per_bin: int = 20,
include_diagonal: bool = True,
fix_global_bias: bool = True,
Expand All @@ -19,6 +19,21 @@ def jaccard_similarity_matrix_cherrypicking(
----------
fingerprints
Fingerprint vectors as 2D numpy array.
selection_bins
List of tuples with lower and upper bound for each score bin.
The goal is to pick equal numbers of pairs for each score bin.
Sidenote: bins do not have to be of equal size, nor do they have to cover the entire
range of the used scores.
max_pairs_per_bin
Specifies the desired maximum number of pairs to be added for each score bin.
include_diagonal
Set to False if pairs with two equal compounds/fingerprints should be excluded.
fix_global_bias
Default is True, in which case the function aims to get the same number of pairs for
each bin globally. This means it adds more than max_pairs_per_bin for some bins and/or
some compounds to compensate for a lack of such scores in other compounds.
random_seed
Set to integer if the randomness of the pair selection should be reproducible.
Returns
-------
Expand All @@ -31,7 +46,7 @@ def jaccard_similarity_matrix_cherrypicking(
np.random.seed(random_seed)
data, i, j = compute_jaccard_similarity_matrix_cherrypicking(
fingerprints,
selections_bins,
selection_bins,
max_pairs_per_bin,
include_diagonal,
fix_global_bias,
Expand All @@ -43,7 +58,7 @@ def jaccard_similarity_matrix_cherrypicking(
@numba.njit
def compute_jaccard_similarity_matrix_cherrypicking(
fingerprints: np.ndarray,
selections_bins: np.ndarray = np.array([(x/10, x/10 + 0.1) for x in range(0, 10)]),
selection_bins: np.ndarray = np.array([(x/10, x/10 + 0.1) for x in range(0, 10)]),
max_pairs_per_bin: int = 20,
include_diagonal: bool = True,
fix_global_bias: bool = True,
Expand All @@ -55,6 +70,19 @@ def compute_jaccard_similarity_matrix_cherrypicking(
----------
fingerprints
Fingerprint vectors as 2D numpy array.
selection_bins
List of tuples with lower and upper bound for each score bin.
The goal is to pick equal numbers of pairs for each score bin.
Sidenote: bins do not have to be of equal size, nor do they have to cover the entire
range of the used scores.
max_pairs_per_bin
Specifies the desired maximum number of pairs to be added for each score bin.
include_diagonal
Set to False if pairs with two equal compounds/fingerprints should be excluded.
fix_global_bias
Default is True, in which case the function aims to get the same number of pairs for
each bin globally. This means it adds more than max_pairs_per_bin for some bins and/or
some compounds to compensate for a lack of such scores in other compounds.
Returns
-------
Expand All @@ -67,7 +95,7 @@ def compute_jaccard_similarity_matrix_cherrypicking(
scores_i = []
scores_j = []
# keep track of total bias across bins
max_pairs_global = len(selections_bins) * [max_pairs_per_bin]
max_pairs_global = len(selection_bins) * [max_pairs_per_bin]
for i in range(size):
scores_row = np.zeros(size)
for j in range(size):
Expand All @@ -76,7 +104,7 @@ def compute_jaccard_similarity_matrix_cherrypicking(
scores_row[j] = jaccard_index(fingerprints[i, :], fingerprints[j, :])

# Cherrypicking
for bin_number, selection_bin in enumerate(selections_bins):
for bin_number, selection_bin in enumerate(selection_bins):
# Indices of scores within the current bin
idx = np.where((scores_row > selection_bin[0]) & (scores_row <= selection_bin[1]))[0]

Expand Down

0 comments on commit b2871cf

Please sign in to comment.