diff --git a/CHANGELOG.md b/CHANGELOG.md index eeae0fb..5fd0096 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,82 +1,88 @@ # Change log +All notable changes to this project will be documented in this file. +This project adheres to [Semantic Versioning](http://semver.org/). +Formatted as described on http://keepachangelog.com/. ## Unreleased +## [2.0.0] - 2016-07-14 + ### Changed -* Flag to ignore upper triangle when calculating distances, instead of always ignore (#20) +- Renamed distance to similarity (#21) +- Flag to ignore upper triangle when calculating distances, instead of always ignoring it (#20) -## 1.4.2 - 3 June 2016 +## [1.4.2] - 2016-06-03 ### Changed -* Lower webservice cutoff to 0.45 (#18) +- Lower webservice cutoff to 0.45 (#18) -## 1.4.1 - 31 May 2016 +## [1.4.1] - 2016-05-31 ### Added -* Webservice online at http://3d-e-chem.vu-compmedchem.nl/kripodb/ui/ -* Ignore_upper triangle option in distance import sub command +- Webservice online at http://3d-e-chem.vu-compmedchem.nl/kripodb/ui/ +- Ignore upper triangle option in distance import sub command -## 1.4.0 - 3 May 2016 +## [1.4.0] - 2016-05-03 ### Changed -* Using nested sub-commands instead of long sub-command. For example `kripodb distmatrix_import` now is `kripodb distances import` +- Using nested sub-commands instead of long sub-commands. For example `kripodb distmatrix_import` is now `kripodb distances import` ### Added -* Faster distance matrix storage format -* Python3 support (#12) -* Automated build to docker hub. +- Faster distance matrix storage format +- Python3 support (#12) +- Automated build to Docker Hub. ### Removed -* CLI argument `--precision` +- CLI argument `--precision` -## 1.3.0 - 23 Apr 2016 +## [1.3.0] - 2016-04-23 ### Added -* webservice server/client for distance matrix (#16). The CLI and canned commands can now take a local file or a url. +- Webservice server/client for distance matrix (#16). The CLI and canned commands can now take a local file or a url. ### Fixed -* het_seq_nr contains non-numbers (#15) +- het_seq_nr contains non-numbers (#15) -## 1.2.5 - 24 Mar 2016 +## [1.2.5] - 2016-03-24 ### Fixed -* fpneigh2tsv not available as sub command +- fpneigh2tsv not available as sub command -## 1.2.4 - 24 Mar 2016 +## [1.2.4] - 2016-03-24 ### Added -* Sub command to convert fpneight distance file to tsv. 
-## 1.2.3 - 1 Mar 2016 +## [1.2.3] - 2016-03-01 ### Changed -* Converting distances matrix will load id2label lookup into memory to speed up conversion +- Converting distance matrix will load id2label lookup into memory to speed up conversion -## 1.2.2 - 22 Feb 2016 +## [1.2.2] - 2016-02-22 ### Added - Added sub command to read fpneigh formatted distance matrix file (#14) -## 1.2.1 - 12 Feb 2016 +## [1.2.1] - 2016-02-12 ### Added - Added sub commands to read/write distance matrix in tab delimited format (#13) - Created repo for Knime example and plugin at https://github.com/3D-e-Chem/knime-kripodb (#8) -## 1.2.0 - 11 Feb 2016 +## [1.2.0] - 2016-02-11 ### Added @@ -89,7 +95,7 @@ - Merging of distance matrix files more robust (#10) - Tanimoto coefficient is rounded up (#7) -## 1.0.0 - 5 Feb 2016 +## [1.0.0] - 2016-02-05 ### Added diff --git a/README.md b/README.md index 121bcbb..b621cf1 100644 --- a/README.md +++ b/README.md @@ -17,14 +17,14 @@ KRIPO stands for Key Representation of Interaction in POckets, see [reference](h * Subpocket, part of the protein pocket which binds with the fragment * Fingerprint, fingerprint of structure-based pharmacophore of subpocket * Similarity matrix, similarities between all fingerprint pairs calculated using the modified tanimoto similarity index -* Kripo identifier, used as identifier for fragment, subpocket and fingerprint +* Kripo fragment identifier, used as identifier for fragment, subpocket and fingerprint # Install Requirements: * rdkit, http://rdkit.org, to read SDF files and generate smile strings from molecules -* libhdf5 headers, to read/write distance matrix in hdf5 format +* libhdf5 headers, to read/write similarity matrix in hdf5 format ``` pip install -U setuptools @@ -48,42 +48,42 @@ kripodb fragments sdf fragment??.sdf fragments.sqlite kripodb fragments pdb fragments.sqlite kripodb fingerprints import 01.fp 01.fp.db kripodb fingerprints import 02.fp 02.fp.db -kripodb fingerprints distances --fragmentsdbfn fragments.sqlite --ignore_upper_triangle 01.fp.db 01.fp.db dist_01_01.h5 -kripodb fingerprints distances --fragmentsdbfn fragments.sqlite --ignore_upper_triangle 02.fp.db 02.fp.db dist_02_02.h5 -kripodb fingerprints distances --fragmentsdbfn fragments.sqlite 01.fp.db 02.fp.db dist_01_02.h5 -kripodb distances merge dist_*_*.h5 dist_all.h5 -kripodb distances freeze dist_all.h5 dist_all.frozen.h5 -# Make froze distance matrix smaller, by using slower compression -ptrepack --complevel 6 --complib blosc:zlib dist_all.frozen.h5 dist_all.packedfrozen.h5 -rm dist_all.frozen.h5 -kripodb distances serve dist_all.packedfrozen.h5 +kripodb fingerprints similarities --fragmentsdbfn fragments.sqlite --ignore_upper_triangle 01.fp.db 01.fp.db sim_01_01.h5 +kripodb fingerprints similarities --fragmentsdbfn fragments.sqlite --ignore_upper_triangle 02.fp.db 02.fp.db sim_02_02.h5 +kripodb fingerprints similarities --fragmentsdbfn fragments.sqlite 01.fp.db 02.fp.db sim_01_02.h5 +kripodb similarities merge sim_*_*.h5 sim_all.h5 +kripodb similarities freeze sim_all.h5 sim_all.frozen.h5 +# Make frozen similarity matrix smaller by using slower compression +ptrepack --complevel 6 --complib blosc:zlib sim_all.frozen.h5 sim_all.packedfrozen.h5 +rm sim_all.frozen.h5 +kripodb similarities serve sim_all.packedfrozen.h5 ``` ## Search for most similar fragments Command to find fragments most similar to `3kxm_K74_frag1` fragment. 
``` -kripodb similar dist_all.h5 3kxm_K74_frag1 --cutoff 0.45 +kripodb similar sim_all.h5 3kxm_K74_frag1 --cutoff 0.45 ``` -## Create distance matrix from text files +## Create similarity matrix from text files -Input files `dist_??_??.txt.gz` looks like: +Input files `sim_??_??.txt.gz` look like: ``` Compounds similar to 2xry_FAD_frag4: 2xry_FAD_frag4 1.0000 3cvv_FAD_frag3 0.5600 ``` -To create a single distance matrix from multiple text files: +To create a single similarity matrix from multiple text files: ``` -gunzip -c dist_01_01.txt.gz | kripodb distances import --ignore_upper_triangle - fragments.sqlite dist_01_01.h5 -gunzip -c dist_01_02.txt.gz | kripodb distances import - fragments.sqlite dist_01_02.h5 -gunzip -c dist_02_02.txt.gz | kripodb distances import --ignore_upper_triangle - fragments.sqlite dist_02_02.h5 -kripodb distances merge dist_??_??.h5 dist_all.h5 +gunzip -c sim_01_01.txt.gz | kripodb similarities import --ignore_upper_triangle - fragments.sqlite sim_01_01.h5 +gunzip -c sim_01_02.txt.gz | kripodb similarities import - fragments.sqlite sim_01_02.h5 +gunzip -c sim_02_02.txt.gz | kripodb similarities import --ignore_upper_triangle - fragments.sqlite sim_02_02.h5 +kripodb similarities merge sim_??_??.h5 sim_all.h5 ``` -The `--ignore_upper_triangle` flag is used to prevent scores corruption when freezing distance matrix. +The `--ignore_upper_triangle` flag is used to prevent score corruption when freezing the similarity matrix. # Data sets @@ -96,7 +96,7 @@ An example data set included in the [data/](data/) directory of this repo. See [ All fragments based on GPCR proteins compared with all proteins in PDB. * kripo.gpcrandhits.sqlite - Fragments sqlite database -* kripo.gpcr.h5 - HDF5 file with distance matrix +* kripo.gpcr.h5 - HDF5 file with similarity matrix The data set has been published at [![DOI](https://zenodo.org/badge/doi/10.5281/zenodo.50835.svg)](http://dx.doi.org/10.5281/zenodo.50835) @@ -106,8 +106,8 @@ All fragments form all proteins-ligand complexes in PDB compared with all. Data set contains PDB entries that where available at 23 December 2015. * kripo.sqlite - Fragments sqlite database -* Distance matrix is too big to ship with VM so use http://3d-e-chem.vu-compmedchem.nl/kripodb webservice url to query. -* kripo_fingerprint_2015_*.fp.gz - Fragment fingerprints, see [here](#create-distance-matrix-from-text-files) for instructions how to convert to a distance matrix. +* Similarity matrix is too big to ship with the VM, so use the http://3d-e-chem.vu-compmedchem.nl/kripodb webservice url to query. +* kripo_fingerprint_2015_*.fp.gz - Fragment fingerprints, see [here](#create-similarity-matrix-from-text-files) for instructions on how to convert them to a similarity matrix. The data set has been published at [![DOI](https://zenodo.org/badge/doi/10.5281/zenodo.55254.svg)](http://dx.doi.org/10.5281/zenodo.55254) @@ -152,7 +152,7 @@ The Kripo data files can be queried using a web service. Start webservice with: ``` -kripodb serve --port 8084 data/distances.h5 +kripodb serve --port 8084 data/similarities.h5 ``` It will print the urls for the swagger spec and UI. diff --git a/data/README.md b/data/README.md index 35f57d5..527a514 100644 --- a/data/README.md +++ b/data/README.md @@ -2,7 +2,7 @@ * fragments.sqlite - Fragments sqlite database containing a small number of fragments with their smiles string and molblock. 
* fingerprints.sqlite - Fingerprints sqlite database with fingerprint stored as [fastdumped intbitset](http://intbitset.readthedocs.org/en/latest/index.html#intbitset.intbitset.fastdump) -* distances.h5 - HDF5 file with distance matrix of fingerprints using modified tanimoto coefficient +* similarities.h5 - HDF5 file with similarity matrix of fingerprints using modified tanimoto similarity index ## Creating tiny data set @@ -23,8 +23,9 @@ EOF ``` -3. Create distance matrix +3. Create similarity matrix ``` -kripodb fingerprints distances --fragmentsdbfn fragments.sqlite fingerprints.sqlite fingerprints.sqlite distances.h5 -``` \ No newline at end of file +kripodb fingerprints similarities --fragmentsdbfn fragments.sqlite fingerprints.sqlite fingerprints.sqlite similarities.h5 +``` + diff --git a/data/distances.frozen.h5 b/data/similarities.frozen.h5 similarity index 100% rename from data/distances.frozen.h5 rename to data/similarities.frozen.h5 diff --git a/data/distances.h5 b/data/similarities.h5 similarity index 100% rename from data/distances.h5 rename to data/similarities.h5 diff --git a/kripodb/canned.py b/kripodb/canned.py index d2a36f9..1db6243 100644 --- a/kripodb/canned.py +++ b/kripodb/canned.py @@ -13,7 +13,7 @@ # limitations under the License. """Module with functions which use pandas DataFrame as input and output. -For using Kripo data files inside Knime (http://www.knime.org) +For using Kripo data files inside KNIME (http://www.knime.org) """ from __future__ import absolute_import @@ -21,20 +21,20 @@ import tables import pandas as pd -from kripodb.frozen import FrozenDistanceMatrix +from kripodb.frozen import FrozenSimilarityMatrix from .db import FragmentsDb -from .hdf5 import DistanceMatrix -from .pairs import similar +from .hdf5 import SimilarityMatrix +from .pairs import similar, open_similarity_matrix from .webservice.client import WebserviceClient -def similarities(queries, distance_matrix_filename_or_url, cutoff, limit=1000): - """Find similar fragments to queries based on distance matrix. +def similarities(queries, similarity_matrix_filename_or_url, cutoff, limit=1000): + """Find similar fragments to queries based on similarity matrix. Args: queries (List[str]): Query fragment identifiers - distance_matrix_filename_or_url (str): Filename of distance matrix file or base url of kripodb webservice - cutoff (float): Cutoff, distance scores below cutoff are discarded. + similarity_matrix_filename_or_url (str): Filename of similarity matrix file or base url of kripodb webservice + cutoff (float): Cutoff, similarity scores below cutoff are discarded. limit (int): Maximum number of hits for each query. Default is 1000. Use is None for no limit. @@ -44,12 +44,12 @@ def similarities(queries, distance_matrix_filename_or_url, cutoff, limit=1000): >>> import pandas as pd >>> from kripodb.canned import similarities >>> queries = pd.Series(['3j7u_NDP_frag24']) - >>> hits = similarities(queries, 'data/distances.h5', 0.55) + >>> hits = similarities(queries, 'data/similarities.h5', 0.55) >>> len(hits) 11 - Retrieved from web service instead of local distance matrix file. - Make sure the web service is running, for example by `kripodb serve data/distances.h5`. + Retrieved from web service instead of local similarity matrix file. + Make sure the web service is running, for example by `kripodb serve data/similarities.h5`. 
>>> hits = similarities(queries, 'http://localhost:8084/kripo', 0.55) >>> len(hits) @@ -59,28 +59,22 @@ def similarities(queries, distance_matrix_filename_or_url, cutoff, limit=1000): pandas.DataFrame: Data frame with query_fragment_id, hit_frag_id and score columns """ hits = [] - if distance_matrix_filename_or_url.startswith('http'): - client = WebserviceClient(distance_matrix_filename_or_url) + if similarity_matrix_filename_or_url.startswith('http'): + client = WebserviceClient(similarity_matrix_filename_or_url) for query in queries: qhits = client.similar_fragments(query, cutoff, limit) hits.extend(qhits) else: - f = tables.open_file(distance_matrix_filename_or_url, 'r') - is_frozen = 'scores' in f.root - f.close() - if is_frozen: - distance_matrix = FrozenDistanceMatrix(distance_matrix_filename_or_url) - else: - distance_matrix = DistanceMatrix(distance_matrix_filename_or_url) + similarity_matrix = open_similarity_matrix(similarity_matrix_filename_or_url) for query in queries: - for query_id, hit_id, score in similar(query, distance_matrix, cutoff, limit): + for query_id, hit_id, score in similar(query, similarity_matrix, cutoff, limit): hit = {'query_frag_id': query_id, 'hit_frag_id': hit_id, 'score': score, } hits.append(hit) - distance_matrix.close() + similarity_matrix.close() return pd.DataFrame(hits) diff --git a/kripodb/frozen.py b/kripodb/frozen.py index d0506eb..67ab6cc 100644 --- a/kripodb/frozen.py +++ b/kripodb/frozen.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Distance matrix using pytables carray""" +"""Similarity matrix using pytables carray""" from __future__ import absolute_import, print_function from math import log10, ceil, floor try: @@ -27,8 +27,8 @@ import tables -class FrozenDistanceMatrix(object): - """Frozen distances matrix +class FrozenSimilarityMatrix(object): + """Frozen similarities matrix Can retrieve whole column of a specific row fairly quickly. Store as compressed dense matrix. @@ -36,14 +36,14 @@ class FrozenDistanceMatrix(object): Warning! Can not be enlarged. - Compared find performance FrozenDistanceMatrix with DistanceMatrix: + Compared find performance FrozenSimilarityMatrix with SimilarityMatrix: >>> from kripodb.db import FragmentsDb >>> db = FragmentsDb('data/feb2016/Kripo20151223.sqlite') >>> ids = [v[0] for v in db.cursor.execute('SELECT frag_id FROM fragments ORDER BY RANDOM() LIMIT 20')] - >>> from kripodb.frozen import FrozenDistanceMatrix - >>> fdm = FrozenDistanceMatrix('01-01_to_13-13.out.frozen.blosczlib.h5') - >>> from kripodb.hdf5 import DistanceMatrix - >>> dm = DistanceMatrix('data/feb2016/01-01_to_13-13.out.h5', cache_labels=True) + >>> from kripodb.frozen import FrozenSimilarityMatrix + >>> fdm = FrozenSimilarityMatrix('01-01_to_13-13.out.frozen.blosczlib.h5') + >>> from kripodb.hdf5 import SimilarityMatrix + >>> dm = SimilarityMatrix('data/feb2016/01-01_to_13-13.out.h5', cache_labels=True) >>> %timeit list(dm.find(ids[0], 0.45, None)) ... 1 loop, best of 3: 1.96 s per loop >>> %timeit list(fdm.find(ids[0], 0.45, None)) @@ -56,7 +56,7 @@ class FrozenDistanceMatrix(object): ... 
1 loop, best of 3: 29.7 s per loop Args: - filename (str): File name of hdf5 file to write or read distance matrix from + filename (str): File name of hdf5 file to write or read similarity matrix from mode (str): Can be 'r' for reading or 'w' for writing **kwargs: Passed though to tables.open_file() @@ -93,11 +93,11 @@ def find(self, query, cutoff, limit=None): Args: query (str): Query fragment identifier - cutoff (float): Cutoff, distance scores below cutoff are discarded. + cutoff (float): Cutoff, similarity scores below cutoff are discarded. limit (int): Maximum number of hits. Default is None for no limit. Returns: - Tuple[(str, float)]: Hit fragment idenfier and distance score + Tuple[(str, float)]: Hit fragment idenfier and similarity score """ precision = float(self.score_precision) precision10 = float(10**(floor(log10(precision)))) @@ -116,21 +116,21 @@ def build_label_cache(self): self.cache_i2l = {k: v.decode() for k, v in enumerate(self.labels)} self.cache_l2i = {v: k for k, v in self.cache_i2l.items()} - def from_pairs(self, distance_matrix, frame_size, limit=None, single_sided=False): + def from_pairs(self, similarity_matrix, frame_size, limit=None, single_sided=False): """Fills self with matrix which is stored in pairs. Also known as COOrdinate format, the 'ijv' or 'triplet' format. Args: - distance_matrix (kripodb.hdf5.DistanceMatrix): + similarity_matrix (kripodb.hdf5.SimilarityMatrix): frame_size (int): Number of pairs to append in a single go limit (int|None): Number of pairs to add, None for no limit, default is None. single_sided (bool): If false add stored direction and reverse direction. Default is False. - time kripodb distances freeze --limit 200000 -f 100000 data/feb2016/01-01_to_13-13.out.h5 percell.h5 + time kripodb similarities freeze --limit 200000 -f 100000 data/feb2016/01-01_to_13-13.out.h5 percell.h5 47.2s - time kripodb distances freeze --limit 200000 -f 100000 data/feb2016/01-01_to_13-13.out.h5 coo.h5 + time kripodb similarities freeze --limit 200000 -f 100000 data/feb2016/01-01_to_13-13.out.h5 coo.h5 0.2m - 2m6s .4m - 2m19s .8m - 2m33s @@ -140,11 +140,11 @@ def from_pairs(self, distance_matrix, frame_size, limit=None, single_sided=False 12.8m - 4m59s 25.6m - 7m27s """ - nr_frags = len(distance_matrix.labels) + nr_frags = len(similarity_matrix.labels) six.print_('Filling labels ... ', end='') - id2labels = {v: k for k, v in distance_matrix.labels.label2ids().items()} + id2labels = {v: k for k, v in similarity_matrix.labels.label2ids().items()} id2nid = {v: k for k, v in enumerate(id2labels)} labels2nid = [None] * nr_frags for myid in id2nid: @@ -159,9 +159,9 @@ def from_pairs(self, distance_matrix, frame_size, limit=None, single_sided=False shape=(nr_frags, nr_frags), chunkshape=(1, nr_frags), filters=self.filters) if limit is None: - limit = len(distance_matrix.pairs) + limit = len(similarity_matrix.pairs) - self._ingest_pairs(distance_matrix.pairs.table, id2nid, frame_size, limit, single_sided) + self._ingest_pairs(similarity_matrix.pairs.table, id2nid, frame_size, limit, single_sided) self.h5file.flush() def _ingest_pairs(self, pairs, oid2nid, frame_size, limit, single_sided): diff --git a/kripodb/hdf5.py b/kripodb/hdf5.py index 547c68d..7961e77 100644 --- a/kripodb/hdf5.py +++ b/kripodb/hdf5.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Distance matrix using hdf5 as storage backend.""" +"""Similarity matrix using hdf5 as storage backend.""" from __future__ import absolute_import from math import log10, ceil, floor @@ -19,16 +19,16 @@ import six -class DistanceMatrix(object): - """Distance matrix +class SimilarityMatrix(object): + """Similarity matrix Args: - filename (str): File name of hdf5 file to write or read distance matrix from + filename (str): File name of hdf5 file to write or read similarity matrix from mode (str): Can be 'r' for reading or 'w' for writing expectedpairrows (int): Expected number of pairs to be added. - Required when distance matrix is opened in write mode, helps optimize storage + Required when similarity matrix is opened in write mode, helps optimize storage expectedlabelrows (int): Expected number of labels to be added. - Required when distance matrix is opened in write mode, helps optimize storage + Required when similarity matrix is opened in write mode, helps optimize storage cache_labels (bool): Cache labels, speed up label lookups Attributes: @@ -57,10 +57,10 @@ def close(self): self.h5file.close() def append(self, other): - """Append data from other distance matrix to me + """Append data from other similarity matrix to me Args: - other (DistanceMatrix): Other distance matrix + other (SimilarityMatrix): Other similarity matrix """ if len(self.labels) == 0: # copy labels when self has no labels @@ -81,15 +81,15 @@ def __iter__(self): for pair in self.pairs: yield self.cache_i2l[pair['a']], self.cache_i2l[pair['b']], pair['score'] - def update(self, distances_iter, label2id): - """Store pairs of fragment identifier with their distance score and label 2 id lookup + def update(self, similarities_iter, label2id): + """Store pairs of fragment identifier with their similarity score and label 2 id lookup Args: - distances_iter (Iterator): Iterator which yields (label1, label2, distance_score) + similarities_iter (Iterator): Iterator which yields (label1, label2, similarity_score) label2id (Dict): Dictionary with fragment label as key and fragment identifier as value. """ - self.pairs.update(distances_iter, label2id) + self.pairs.update(similarities_iter, label2id) self.pairs.add_indexes() self.labels.update(label2id) @@ -98,11 +98,11 @@ def find(self, query, cutoff, limit=None): Args: query (str): Query fragment identifier - cutoff (float): Cutoff, distance scores below cutoff are discarded. + cutoff (float): Cutoff, similarity scores below cutoff are discarded. limit (int): Maximum number of hits. Default is None for no limit. Yields: - Tuple[(str, float)]: Hit fragment idenfier and distance score + Tuple[(str, float)]: Hit fragment identifier and similarity score """ if self.cache_l2i: frag_id = self.cache_l2i[query] @@ -149,25 +149,25 @@ def __iter__(self): return self.table.__iter__() -class DistancePair(tables.IsDescription): - """Table description for distance pair""" +class SimilarityPair(tables.IsDescription): + """Table description for similarity pair""" a = tables.UInt32Col() b = tables.UInt32Col() score = tables.UInt16Col() class PairsTable(AbstractSimpleTable): - """Tabel to store distance score of a pair of fragment fingerprints + """Table to store similarity score of a pair of fragment fingerprints When table does not exist in h5file it is created. Args: h5file (tables.File): Object representing an open hdf5 file expectedrows (int): Expected number of pairs to be added. 
- Required when distance matrix is opened in write mode, helps optimize storage + Required when similarity matrix is opened in write mode, helps optimize storage Attributes: - score_precision (int): Distance score is a fraction, + score_precision (int): Similarity score is a fraction, the score is converted to an int by multiplying it with the precision full_matrix (bool): Matrix is filled above and below diagonal. """ @@ -180,8 +180,8 @@ def __init__(self, h5file, expectedrows=0): else: table = h5file.create_table('/', self.table_name, - DistancePair, - 'Distance pairs', + SimilarityPair, + 'Similarity pairs', expectedrows=expectedrows) self.table = table @@ -207,32 +207,32 @@ def add_indexes(self): if not self.full_matrix: self.table.cols.b.create_index(filters=self.filters) - def update(self, distances_iter, label2id): - """Store pairs of fragment identifier with their distance score + def update(self, similarities_iter, label2id): + """Store pairs of fragment identifier with their similarity score Args: - distances_iter (Iterator): Iterator which yields (label1, label2, distance_score) + similarities_iter (Iterator): Iterator which yields (label1, label2, similarity_score) label2id (Dict): Lookup with fragment label as key and fragment identifier as value """ hit = self.table.row - for label1, label2, distance in distances_iter: + for label1, label2, similarity in similarities_iter: hit['a'] = label2id[label1] hit['b'] = label2id[label2] - hit['score'] = int(distance * self.score_precision) + hit['score'] = int(similarity * self.score_precision) hit.append() self.table.flush() def find(self, frag_id, cutoff, limit): - """Find fragment hits which has a distance score with frag_id above cutoff. + """Find fragment hits which has a similarity score with frag_id above cutoff. Args: frag_id (int): query fragment identifier - cutoff (float): Cutoff, distance scores below cutoff are discarded. + cutoff (float): Cutoff, similarity scores below cutoff are discarded. limit (int): Maximum number of hits. Default is None for no limit. Returns: - List[Tuple]: Where first tuple value is hit fragment identifier and second value is distance score + List[Tuple]: Where first tuple value is hit fragment identifier and second value is similarity score """ precision = float(self.score_precision) @@ -295,7 +295,7 @@ class LabelsLookup(AbstractSimpleTable): Args: h5file (tables.File): Object representing an open hdf5 file expectedrows (int): Expected number of pairs to be added. - Required when distance matrix is opened in write mode, helps optimize storage + Required when similarity matrix is opened in write mode, helps optimize storage """ table_name = 'labels' filters = tables.Filters(complevel=6, complib='blosc') diff --git a/kripodb/makebits.py b/kripodb/makebits.py index e171bc2..948ee72 100644 --- a/kripodb/makebits.py +++ b/kripodb/makebits.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Module to read/write Makebits file format""" +"""Module to read/write fingerprints in Makebits file format""" from __future__ import absolute_import from intbitset import intbitset diff --git a/kripodb/modifiedtanimoto.py b/kripodb/modifiedtanimoto.py index 74dd32f..9c340f2 100644 --- a/kripodb/modifiedtanimoto.py +++ b/kripodb/modifiedtanimoto.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -"""Module to calculate modified tanimoto distance""" +"""Module to calculate modified tanimoto similarity""" from __future__ import absolute_import from math import fsum @@ -39,7 +39,7 @@ def calc_mean_onbit_density(bitsets, number_of_bits): def corrections(mean_onbit_density): """Calculate corrections - See :func:`distance` for explanation of corrections. + See :func:`similarity` for explanation of corrections. Args: mean_onbit_density (float): Mean on bit density @@ -53,8 +53,8 @@ def corrections(mean_onbit_density): return corr_st, corr_sto -def distance(bitset1, bitset2, number_of_bits, corr_st, corr_sto): - """Calculate modified Tanimoto distance between two fingerprints +def similarity(bitset1, bitset2, number_of_bits, corr_st, corr_sto): + """Calculate modified Tanimoto similarity between two fingerprints Given two fingerprints of length n with a and b bits set in each fingerprint, respectively, and c bits set in both fingerprint, @@ -85,7 +85,7 @@ def distance(bitset1, bitset2, number_of_bits, corr_st, corr_sto): corr_sto (float): Sto correction Returns: - float: modified Tanimoto distance + float: modified Tanimoto similarity """ a = len(bitset1) b = len(bitset2) @@ -97,10 +97,10 @@ def distance(bitset1, bitset2, number_of_bits, corr_st, corr_sto): return smt -def distances(bitsets1, bitsets2, number_of_bits, corr_st, corr_sto, cutoff, ignore_upper_triangle=False): - """Calculate modified tanimoto distance between two collections of fingerprints +def similarities(bitsets1, bitsets2, number_of_bits, corr_st, corr_sto, cutoff, ignore_upper_triangle=False): + """Calculate modified tanimoto similarity between two collections of fingerprints - Excludes distance of the same fingerprint. + Excludes similarity of the same fingerprint. Args: bitsets1 (Dict{str, intbitset.intbitset}): First dict of fingerprints @@ -110,12 +110,12 @@ def distances(bitsets1, bitsets2, number_of_bits, corr_st, corr_sto, cutoff, ign number_of_bits (int): Number of bits for all fingerprints corr_st (float): St correction corr_sto (float): Sto correction - cutoff (float): Cutoff, distance scores below cutoff are discarded. - ignore_upper_triangle (Optional[bool]): When true returns distance where label1 > label2, - when false returns all distances + cutoff (float): Cutoff, similarity scores below cutoff are discarded. + ignore_upper_triangle (Optional[bool]): When true returns similarity where label1 > label2, + when false returns all similarities Yields: - (fingerprint label 1, fingerprint label2, distance) + (fingerprint label 1, fingerprint label2, similarity score) """ for (label1, bs1) in six.iteritems(bitsets1): @@ -126,7 +126,7 @@ def distances(bitsets1, bitsets2, number_of_bits, corr_st, corr_sto, cutoff, ign if ignore_upper_triangle and label1 > label2: continue - d = distance(bs1, bs2, number_of_bits, corr_st, corr_sto) + score = similarity(bs1, bs2, number_of_bits, corr_st, corr_sto) - if d >= cutoff: - yield label1, label2, d + if score >= cutoff: + yield label1, label2, score diff --git a/kripodb/pairs.py b/kripodb/pairs.py index 985f5c6..1fdb24a 100644 --- a/kripodb/pairs.py +++ b/kripodb/pairs.py @@ -11,17 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Module handling generation and retrieval of distance matrix""" +"""Module handling generation and retrieval of similarity of fingerprint pairs""" from __future__ import absolute_import import tables import logging -from kripodb.frozen import FrozenDistanceMatrix +from kripodb.frozen import FrozenSimilarityMatrix -from .hdf5 import DistanceMatrix -from .modifiedtanimoto import distances, corrections +from .hdf5 import SimilarityMatrix +from .modifiedtanimoto import similarities, corrections from .webservice.client import WebserviceClient @@ -38,7 +38,7 @@ def dump_pairs(bitsets1, ignore_upper_triangle=False): """Dump pairs of bitset collection. - A pairs are rows of the bitset identifier of both bitsets with a distance score. + A pairs are rows of the bitset identifier of both bitsets with a similarity score. Args: bitsets1 (Dict{str, intbitset.intbitset}): First dict of fingerprints @@ -50,11 +50,11 @@ def dump_pairs(bitsets1, out (File): File object where 'tsv' format is written to. number_of_bits (int): Number of bits for all bitsets mean_onbit_density (float): Mean on bit density - cutoff (float): Cutoff, distance scores below cutoff are discarded. + cutoff (float): Cutoff, similarity scores below cutoff are discarded. label2id: dict to translate label to id (string to int) nomemory: If true bitset2 is not loaded into memory - ignore_upper_triangle: When true returns distance where label1 > label2, - when false returns all distances + ignore_upper_triangle: When true returns similarity where label1 > label2, + when false returns all similarities """ if out_file == '-' and out_format.startswith('hdf5'): @@ -71,15 +71,15 @@ def dump_pairs(bitsets1, logging.warning('Generating pairs') - distances_iter = distances(bitsets1, bitsets2, + similarities_iter = similarities(bitsets1, bitsets2, number_of_bits, corr_st, corr_sto, cutoff, ignore_upper_triangle) if out_format == 'tsv': - dump_pairs_tsv(distances_iter, out) + dump_pairs_tsv(similarities_iter, out) elif out_format == 'hdf5': - dump_pairs_hdf5(distances_iter, + dump_pairs_hdf5(similarities_iter, label2id, expectedrows, out_file) @@ -87,7 +87,7 @@ def dump_pairs(bitsets1, raise LookupError('Invalid output format') -def dump_pairs_tsv(distances_iter, out): +def dump_pairs_tsv(similarities_iter, out): """Dump pairs in tab delimited file Pro: @@ -95,16 +95,15 @@ def dump_pairs_tsv(distances_iter, out): Con: * big, unless output is compressed - :param distances_iter: - :param out: - :return: - + Args: + similarities_iter (Iterator): Iterator with tuple with fingerprint 1 label, fingerprint 2 label, similarity as members + out (File): Writeable file """ - for label1, label2, distance in distances_iter: - out.write('{0}\t{1}\t{2:.5}\n'.format(label1, label2, distance)) + for label1, label2, similarity in similarities_iter: + out.write('{0}\t{1}\t{2:.5}\n'.format(label1, label2, similarity)) -def dump_pairs_hdf5(distances_iter, +def dump_pairs_hdf5(similarities_iter, label2id, expectedrows, out_file): @@ -115,30 +114,31 @@ def dump_pairs_hdf5(distances_iter, Con: * requires hdf5 library to access - :param distances_iter: - :param label2id: dict to translate label to id (string to int) - :param expectedrows: - :param out_file: - :return: + Args: + similarities_iter (Iterator): Iterator with tuple with fingerprint 1 label, fingerprint 2 label, similarity as members + label2id (dict): dict to translate label to id (string to int) + expectedrows: + out_file: + """ - matrix = DistanceMatrix(out_file, 'w', + matrix = SimilarityMatrix(out_file, 
'w', expectedpairrows=expectedrows, expectedlabelrows=len(label2id)) - matrix.update(distances_iter, label2id) + matrix.update(similarities_iter, label2id) matrix.close() -def distance2query(bitsets2, query, out, mean_onbit_density, cutoff, memory): - """Calculate distance of query against all fingerprints in bitsets2 and write to tab delimited file. +def similarity2query(bitsets2, query, out, mean_onbit_density, cutoff, memory): + """Calculate similarity of query against all fingerprints in bitsets2 and write to tab delimited file. Args: bitsets2 (kripodb.db.IntbitsetDict): query (str): Query identifier or beginning of it out (File): File object to write output to mean_onbit_density (flaot): Mean on bit density - cutoff (float): Cutoff, distance scores below cutoff are discarded. + cutoff (float): Cutoff, similarity scores below cutoff are discarded. memory (Optional[bool]): When true will load bitset2 into memory, when false it doesn't """ @@ -160,20 +160,20 @@ def distance2query(bitsets2, query, out, mean_onbit_density, cutoff, memory): (corr_st, corr_sto) = corrections(mean_onbit_density) - distances_iter = distances(bitsets1, bitsets2, + similarities_iter = similarities(bitsets1, bitsets2, number_of_bits, corr_st, corr_sto, cutoff, True) - sorted_distances = sorted(distances_iter, key=lambda row: row[2], reverse=True) - dump_pairs_tsv(sorted_distances, out) + sorted_similarities = sorted(similarities_iter, key=lambda row: row[2], reverse=True) + dump_pairs_tsv(sorted_similarities, out) def similar_run(query, pairsdbfn, cutoff, out): - """Find similar fragments to query based on distance matrix and write to tab delimited file. + """Find similar fragments to query based on similarity matrix and write to tab delimited file. Args: query (str): Query fragment identifier - pairsdbfn (str): Filename of distance matrix file or url of kripodb webservice - cutoff (float): Cutoff, distance scores below cutoff are discarded. + pairsdbfn (str): Filename of similarity matrix file or url of kripodb webservice + cutoff (float): Cutoff, similarity scores below cutoff are discarded. out (File): File object to write output to """ @@ -183,20 +183,20 @@ def similar_run(query, pairsdbfn, cutoff, out): hits = [(h['query_frag_id'], h['hit_frag_id'], h['score']) for h in hits] dump_pairs_tsv(hits, out) else: - matrix = open_distance_matrix(pairsdbfn) + matrix = open_similarity_matrix(pairsdbfn) hits = similar(query, matrix, cutoff) dump_pairs_tsv(hits, out) matrix.close() -def open_distance_matrix(fn): - """Open read-only distance matrix file. +def open_similarity_matrix(fn): + """Open read-only similarity matrix file. Args: - fn (str): Filename of distance matrix + fn (str): Filename of similarity matrix Returns: - DistanceMatrix|FrozenDistanceMatrix: A read-only distance matrix object + SimilarityMatrix|FrozenSimilarityMatrix: A read-only similarity matrix object """ # peek in file to detect format @@ -204,43 +204,43 @@ def open_distance_matrix(fn): is_frozen = 'scores' in f.root f.close() if is_frozen: - matrix = FrozenDistanceMatrix(fn) + matrix = FrozenSimilarityMatrix(fn) else: - matrix = DistanceMatrix(fn, cache_labels=True) + matrix = SimilarityMatrix(fn, cache_labels=True) return matrix -def similar(query, distance_matrix, cutoff, limit=None): - """Find similar fragments to query based on distance matrix. +def similar(query, similarity_matrix, cutoff, limit=None): + """Find similar fragments to query based on similarity matrix. 
Args: query (str): Query fragment identifier - distance_matrix (kripodb.db.DistanceMatrix): Distance matrix - cutoff (float): Cutoff, distance scores below cutoff are discarded. + similarity_matrix (kripodb.db.SimilarityMatrix): Similarity matrix + cutoff (float): Cutoff, similarity scores below cutoff are discarded. limit (int): Maximum number of hits. Default is None for no limit. Yields: - Tuple[(str, str, float)]: List of (query fragment identifier, hit fragment identifier, distance score) sorted on distance score + Tuple[(str, str, float)]: List of (query fragment identifier, hit fragment identifier, similarity score) sorted on similarity score """ - raw_hits = distance_matrix.find(query, cutoff, limit) + raw_hits = similarity_matrix.find(query, cutoff, limit) # add query column for hit_id, score in raw_hits: yield query, hit_id, score def total_number_of_pairs(fingerprint_filenames): - """Count number of pairs in distance matrix files + """Count number of pairs in similarity matrix files Args: - fingerprint_filenames (list[str]): List of file names of distance matrices + fingerprint_filenames (list[str]): List of file names of similarity matrices Returns: int: Total number of pairs """ sizes = [] for filename in fingerprint_filenames: - matrix = DistanceMatrix(filename) + matrix = SimilarityMatrix(filename) pairs = matrix.pairs sizes.append(len(pairs)) matrix.close() @@ -248,22 +248,22 @@ def total_number_of_pairs(fingerprint_filenames): def merge(ins, out): - """Concatenate distance matrix files into a single one. + """Concatenate similarity matrix files into a single one. Args: - ins (list[str]): List of input distance matrix filenames - out (str): Output distance matrix filenames + ins (list[str]): List of input similarity matrix filenames + out (str): Output similarity matrix filenames Raises: AssertionError: When nr of labels of input files is not the same """ expectedrows = total_number_of_pairs(ins) - out_matrix = DistanceMatrix(out, 'w', expectedpairrows=expectedrows) + out_matrix = SimilarityMatrix(out, 'w', expectedpairrows=expectedrows) # copy pairs for in_filename in ins: - in_matrix = DistanceMatrix(in_filename) + in_matrix = SimilarityMatrix(in_filename) out_matrix.append(in_matrix) in_matrix.close() diff --git a/kripodb/script.py b/kripodb/script.py index 2874197..2ddc2da 100644 --- a/kripodb/script.py +++ b/kripodb/script.py @@ -27,8 +27,8 @@ from . import makebits from . 
import pairs from .db import FragmentsDb, FingerprintsDb -from .frozen import FrozenDistanceMatrix -from .hdf5 import DistanceMatrix +from .frozen import FrozenSimilarityMatrix +from .hdf5 import SimilarityMatrix from .pdb import PdbReport from .modifiedtanimoto import calc_mean_onbit_density from .webservice.server import serve_app @@ -49,24 +49,24 @@ def make_parser(): make_fragments_parser(subparsers) - make_distances_parser(subparsers) + make_similarities_parser(subparsers) return parser -def make_distances_parser(subparsers): - """Creates a parser for distances sub commands +def make_similarities_parser(subparsers): + """Creates a parser for similarities sub commands Args: subparsers (argparse.ArgumentParser): Parser to which to add sub commands to """ - dm_sc = subparsers.add_parser('distances', help='Distance matrix').add_subparsers() + dm_sc = subparsers.add_parser('similarities', help='Similarity matrix').add_subparsers() similar_sc(dm_sc) merge_pairs_sc(dm_sc) - distmatrix_export_sc(dm_sc) - distmatrix_import_sc(dm_sc) - distmatrix_filter_sc(dm_sc) - dismatrix_freeze_sc(dm_sc) + simmatrix_export_sc(dm_sc) + simmatrix_import_sc(dm_sc) + simmatrix_filter_sc(dm_sc) + similarity_freeze_sc(dm_sc) fpneigh2tsv_sc(dm_sc) serve_sc(dm_sc) @@ -94,25 +94,25 @@ def make_fingerprints_parser(subparsers): makebits2fingerprintsdb_sc(fp_sc) fingerprintsdb2makebits_sc(fp_sc) meanbitdensity_sc(fp_sc) - distance2query_sc(fp_sc) + similarity2query_sc(fp_sc) pairs_sc(fp_sc) def pairs_sc(subparsers): - sc_help = '''Calculate modified tanimoto distance between fingerprints''' + sc_help = '''Calculate modified tanimoto similarity between fingerprints''' sc_description = ''' Output formats: - * tsv, tab separated id1,id2, distance + * tsv, tab separated id1,id2, similarity * hdf5, hdf5 file constructed with pytables with a, b and score, but but a and b have been replaced - by numbers and distance has been converted to scaled int + by numbers and similarity has been converted to scaled int When input has been split into chunks, - use `--ignore_upper_triangle` flag for computing distances between same chunk. + use `--ignore_upper_triangle` flag for computing similarities between same chunk. This prevents storing pair a->b also as b->a. 
''' out_formats = ['tsv', 'hdf5'] - sc = subparsers.add_parser('distances', + sc = subparsers.add_parser('similarities', help=sc_help, description=sc_description) sc.add_argument('fingerprintsfn1', @@ -233,7 +233,7 @@ def fingerprintsdb2makebits(infile, outfile): makebits.write_file(bitsets.number_of_bits, bitsets, outfile) -def distance2query_sc(subparsers): +def similarity2query_sc(subparsers): sc_help = 'Find the fragments closests to query based on fingerprints' sc = subparsers.add_parser('similar', help=sc_help) sc.add_argument('fingerprintsdb', @@ -252,25 +252,25 @@ def distance2query_sc(subparsers): sc.add_argument('--memory', action='store_true', help='Store bitsets in memory (default: %(default)s)') - sc.set_defaults(func=pairs.distance2query) + sc.set_defaults(func=pairs.similarity2query) -def distance2query_run(fingerprintsdb, query, out, mean_onbit_density, cutoff, memory): +def similarity2query_run(fingerprintsdb, query, out, mean_onbit_density, cutoff, memory): bitsets = FingerprintsDb(fingerprintsdb).as_dict() - pairs.distance2query(bitsets, query, out, mean_onbit_density, cutoff, memory) + pairs.similarity2query(bitsets, query, out, mean_onbit_density, cutoff, memory) def similar_sc(subparsers): - sc_help = 'Find the fragments closets to query based on distance matrix' + sc_help = 'Find the fragments closets to query based on similarity matrix' sc = subparsers.add_parser('similar', help=sc_help) - sc.add_argument('pairsdbfn', type=str, help='hdf5 distance matrix file or base url of kripodb webservice') + sc.add_argument('pairsdbfn', type=str, help='hdf5 similarity matrix file or base url of kripodb webservice') sc.add_argument('query', type=str, help='Query fragment identifier') sc.add_argument('--out', type=argparse.FileType('w'), default='-', - help='Output file tab delimited (query, hit, distance score)') + help='Output file tab delimited (query, hit, similarity score)') sc.add_argument('--cutoff', type=float, default=0.55, - help='Distance cutoff (default: %(default)s)') + help='Similarity cutoff (default: %(default)s)') sc.set_defaults(func=pairs.similar_run) @@ -355,7 +355,7 @@ def fragmentsdb_filter_sc(subparsers): help='Name of fragments db output file, will overwrite file if it exists') sc.add_argument('--pdbs', type=argparse.FileType('r'), help='Keep fragments from any of the supplied pdb codes, one pdb code per line, use - for stdin') - sc.add_argument('--matrix', type=str, help='Keep fragments which are in distance matrix file') + sc.add_argument('--matrix', type=str, help='Keep fragments which are in similarity matrix file') sc.set_defaults(func=fragmentsdb_filter) @@ -377,10 +377,10 @@ def fragmentsdb_filter_matrix(input, output, matrix): output_db.cursor.execute('CREATE TEMPORARY TABLE filter (frag_id TEXT PRIMARY KEY)') sql = 'INSERT OR REPLACE INTO filter (frag_id) VALUES (?)' print('Matrix labels') - distmatrix = DistanceMatrix(matrix) - for frag_id in distmatrix.labels.label2ids().keys(): + simmatrix = SimilarityMatrix(matrix) + for frag_id in simmatrix.labels.label2ids().keys(): output_db.cursor.execute(sql, (frag_id,)) - distmatrix.close() + simmatrix.close() # insert select output_db.cursor.execute('INSERT INTO fragments SELECT * FROM orig.fragments JOIN filter USING (frag_id)') @@ -425,41 +425,41 @@ def fragmentsdb_filter_pdbs(input, output, pdbs): print('Wrote: ' + output) -def distmatrix_export_sc(subparsers): - sc = subparsers.add_parser('export', help='Export distance matrix to tab delimited file') - sc.add_argument('distmatrixfn', type=str, 
help='Compact hdf5 distance matrix filename') +def simmatrix_export_sc(subparsers): + sc = subparsers.add_parser('export', help='Export similarity matrix to tab delimited file') + sc.add_argument('simmatrixfn', type=str, help='Compact hdf5 similarity matrix filename') sc.add_argument('outputfile', type=argparse.FileType('w'), help='Tab delimited output file, use - for stdout') - sc.set_defaults(func=distmatrix_export_run) + sc.set_defaults(func=simmatrix_export_run) -def distmatrix_export_run(distmatrixfn, outputfile): - """Export distance matrix to tab delimited file +def simmatrix_export_run(simmatrixfn, outputfile): + """Export similarity matrix to tab delimited file Args: - distmatrixfn (str): Compact hdf5 distance matrix filename + simmatrixfn (str): Compact hdf5 similarity matrix filename outputfile (file): Tab delimited output file """ - distmatrix = DistanceMatrix(distmatrixfn) + simmatrix = SimilarityMatrix(simmatrixfn) writer = csv.writer(outputfile, delimiter="\t", lineterminator='\n') writer.writerow(['frag_id1', 'frag_id2', 'score']) - writer.writerows(distmatrix) - distmatrix.close() + writer.writerows(simmatrix) + simmatrix.close() -def distmatrix_import_sc(subparsers): +def simmatrix_import_sc(subparsers): sc = subparsers.add_parser('import', - help='Import distance matrix from tab delimited file', + help='Import similarity matrix from tab delimited file', description='''When input has been split into chunks, - use `--ignore_upper_triangle` flag for distances between same chunk. + use `--ignore_upper_triangle` flag for similarities between same chunk. This prevents storing pair a->b also as b->a.''') sc.add_argument('inputfile', type=argparse.FileType('r'), help='Input file, use - for stdin') sc.add_argument('fragmentsdb', default='fragments.db', help='Name of fragments db file (default: %(default)s)') - sc.add_argument('distmatrixfn', type=str, help='Compact hdf5 distance matrix file, will overwrite file if it exists') + sc.add_argument('simmatrixfn', type=str, help='Compact hdf5 similarity matrix file, will overwrite file if it exists') sc.add_argument('--format', choices=['tsv', 'fpneigh'], default='fpneigh', @@ -472,20 +472,20 @@ def distmatrix_import_sc(subparsers): sc.add_argument('--ignore_upper_triangle', action='store_true', help='Ignore upper triangle (default: %(default)s)') - sc.set_defaults(func=distmatrix_import_run) + sc.set_defaults(func=simmatrix_import_run) -def distmatrix_import_run(inputfile, fragmentsdb, distmatrixfn, format, nrrows, ignore_upper_triangle=False): +def simmatrix_import_run(inputfile, fragmentsdb, simmatrixfn, format, nrrows, ignore_upper_triangle=False): if format == 'tsv': - distmatrix_import_tsv(inputfile, fragmentsdb, distmatrixfn, nrrows, ignore_upper_triangle) + simmatrix_import_tsv(inputfile, fragmentsdb, simmatrixfn, nrrows, ignore_upper_triangle) elif format == 'fpneigh': - distmatrix_importfpneigh_run(inputfile, fragmentsdb, distmatrixfn, nrrows, ignore_upper_triangle) + simmatrix_importfpneigh_run(inputfile, fragmentsdb, simmatrixfn, nrrows, ignore_upper_triangle) -def distmatrix_import_tsv(inputfile, fragmentsdb, distmatrixfn, nrrows, ignore_upper_triangle=False): +def simmatrix_import_tsv(inputfile, fragmentsdb, simmatrixfn, nrrows, ignore_upper_triangle=False): frags = FragmentsDb(fragmentsdb) label2id = frags.label2id().materialize() - distmatrix = DistanceMatrix(distmatrixfn, 'w', + simmatrix = SimilarityMatrix(simmatrixfn, 'w', expectedlabelrows=len(label2id), expectedpairrows=nrrows) @@ -493,7 +493,7 @@ def 
distmatrix_import_tsv(inputfile, fragmentsdb, distmatrixfn, nrrows, ignore_u # ignore header next(reader) - # distmatrix wants score as float instead of str + # simmatrix wants score as float instead of str def csv_iter(rows): for row in rows: if row[0] == row[1]: @@ -503,42 +503,42 @@ def csv_iter(rows): row[2] = float(row[2]) yield row - distmatrix.update(csv_iter(reader), label2id) - distmatrix.close() + simmatrix.update(csv_iter(reader), label2id) + simmatrix.close() -def distmatrix_importfpneigh_run(inputfile, fragmentsdb, distmatrixfn, nrrows, ignore_upper_triangle=False): +def simmatrix_importfpneigh_run(inputfile, fragmentsdb, simmatrixfn, nrrows, ignore_upper_triangle=False): frags = FragmentsDb(fragmentsdb) label2id = frags.label2id().materialize() - distmatrix = DistanceMatrix(distmatrixfn, 'w', + simmatrix = SimilarityMatrix(simmatrixfn, 'w', expectedlabelrows=len(label2id), expectedpairrows=nrrows) - distmatrix.update(read_fpneighpairs_file(inputfile, ignore_upper_triangle), label2id) - distmatrix.close() + simmatrix.update(read_fpneighpairs_file(inputfile, ignore_upper_triangle), label2id) + simmatrix.close() -def distmatrix_filter_sc(subparsers): - sc = subparsers.add_parser('filter', help='Filter distance matrix') +def simmatrix_filter_sc(subparsers): + sc = subparsers.add_parser('filter', help='Filter similarity matrix') sc.add_argument('input', type=str, - help='Input hdf5 distance matrix file') + help='Input hdf5 similarity matrix file') sc.add_argument('output', type=str, - help='Output hdf5 distance matrix file, will overwrite file if it exists') + help='Output hdf5 similarity matrix file, will overwrite file if it exists') sc.add_argument('--fragmentsdb', default='fragments.db', help='Name of fragments db file (default: %(default)s)') - sc.set_defaults(func=distmatrix_filter) + sc.set_defaults(func=simmatrix_filter) -def distmatrix_filter(input, output, fragmentsdb): - distmatrix_in = DistanceMatrix(input) +def simmatrix_filter(input, output, fragmentsdb): + simmatrix_in = SimilarityMatrix(input) frags = FragmentsDb(fragmentsdb) print('Counting') expectedlabelrows = len(frags) - labelsin = len(distmatrix_in.labels) - expectedpairrows = int(len(distmatrix_in.pairs) * (float(expectedlabelrows) / labelsin)) + labelsin = len(simmatrix_in.labels) + expectedpairrows = int(len(simmatrix_in.pairs) * (float(expectedlabelrows) / labelsin)) - distmatrix_out = DistanceMatrix(output, + simmatrix_out = SimilarityMatrix(output, 'w', expectedlabelrows=expectedlabelrows, expectedpairrows=expectedpairrows, @@ -547,14 +547,14 @@ def distmatrix_filter(input, output, fragmentsdb): print('Building frag_id keep list') frag_labels2keep = set(frags.id2label().values()) frag_ids2keep = set() - for frag_label, frag_id in six.iteritems(distmatrix_in.labels.label2ids()): + for frag_label, frag_id in six.iteritems(simmatrix_in.labels.label2ids()): if frag_label in frag_labels2keep: frag_ids2keep.add(frag_id) print('Copying subset of pairs table') all_frags2keep = set(frag_ids2keep) - hit = distmatrix_out.pairs.table.row - for row in distmatrix_in.pairs.table: + hit = simmatrix_out.pairs.table.row + for row in simmatrix_in.pairs.table: if row[0] in frag_ids2keep and row[1] in frag_ids2keep: hit['a'] = row[0] hit['b'] = row[1] @@ -578,50 +578,50 @@ def distmatrix_filter(input, output, fragmentsdb): all_frags2keep.add(row[0]) print('Adding indices') - distmatrix_out.pairs.add_indexes() + simmatrix_out.pairs.add_indexes() print('Copying subset of labels table') - hit = distmatrix_out.labels.table.row 
- for row in distmatrix_in.labels.table: + hit = simmatrix_out.labels.table.row + for row in simmatrix_in.labels.table: if row[0] in all_frags2keep: hit['frag_id'] = row[0] hit['label'] = row[1] hit.append() - distmatrix_in.close() - distmatrix_out.close() + simmatrix_in.close() + simmatrix_out.close() -def dismatrix_freeze_sc(subparsers): - sc = subparsers.add_parser('freeze', help='Optimize distance matrix for reading') +def similarity_freeze_sc(subparsers): + sc = subparsers.add_parser('freeze', help='Optimize similarity matrix for reading') sc.add_argument('in_fn', type=str, help='Input pairs file') sc.add_argument('out_fn', type=str, help='Output array file, file is overwritten') sc.add_argument('-f', '--frame_size', type=int, default=10**8, help='Size of frame (default: %(default)s)') sc.add_argument('-m', '--memory', type=int, default=1, help='Memory cache in Gigabytes (default: %(default)s)') sc.add_argument('-l', '--limit', type=int, help='Number of pairs to copy, None for no limit (default: %(default)s)') sc.add_argument('-s', '--single_sided', action='store_true', help='Store half matrix (default: %(default)s)') - sc.set_defaults(func=dismatrix_freeze) + sc.set_defaults(func=similarity_freeze_run) -def dismatrix_freeze(in_fn, out_fn, frame_size, memory, limit, single_sided): - dm = DistanceMatrix(in_fn, 'r') +def similarity_freeze_run(in_fn, out_fn, frame_size, memory, limit, single_sided): + dm = SimilarityMatrix(in_fn, 'r') parameters.CHUNK_CACHE_SIZE = memory * 1024 ** 3 parameters.CHUNK_CACHE_NELMTS = 2 ** 14 - dfm = FrozenDistanceMatrix(out_fn, 'w') + dfm = FrozenSimilarityMatrix(out_fn, 'w') dfm.from_pairs(dm, frame_size, limit, single_sided) dm.close() dfm.close() def read_fpneighpairs_file(inputfile, ignore_upper_triangle=False): - """Read fpneigh formatted distance matrix file. + """Read fpneigh formatted similarity matrix file. Args: inputfile (file): File object to read ignore_upper_triangle (bool): Ignore upper triangle of input Yields: - Tuple((Str,Str,Float)): List of (query fragment identifier, hit fragment identifier, distance score) + Tuple((Str,Str,Float)): List of (query fragment identifier, hit fragment identifier, similarity score) """ current_query = None @@ -652,8 +652,8 @@ def fpneigh2tsv_run(inputfile, outputfile): def serve_sc(subparsers): - sc = subparsers.add_parser('serve', help='Serve distance matrix as webservice') - sc.add_argument('matrix', type=str, help='Filename of distance matrix hdf5 file') + sc = subparsers.add_parser('serve', help='Serve similarity matrix as webservice') + sc.add_argument('matrix', type=str, help='Filename of similarity matrix hdf5 file') sc.add_argument('--internal_port', type=int, default=8084, diff --git a/kripodb/version.py b/kripodb/version.py index 2fb3f89..688a1b4 100644 --- a/kripodb/version.py +++ b/kripodb/version.py @@ -12,5 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version_info__ = ('1', '4', '2') +__version_info__ = ('2', '0', '0') __version__ = '.'.join(__version_info__) diff --git a/kripodb/webservice/client.py b/kripodb/webservice/client.py index 3a8da86..0693334 100644 --- a/kripodb/webservice/client.py +++ b/kripodb/webservice/client.py @@ -35,11 +35,11 @@ def similar_fragments(self, fragment_id, cutoff, limit=1000): Args: fragment_id (str): Query fragment identifier - cutoff (float): Cutoff, distance scores below cutoff are discarded. + cutoff (float): Cutoff, similarity scores below cutoff are discarded. 
limit (int): Maximum number of hits. Default is None for no limit. Returns: - List(Dict()): Query fragment identifier, hit fragment identifier and distance score + List(Dict()): Query fragment identifier, hit fragment identifier and similarity score """ url = self.base_url + '/fragments/{fragment_id}/similar'.format(fragment_id=fragment_id) params = {'cutoff': cutoff, 'limit': limit} diff --git a/kripodb/webservice/server.py b/kripodb/webservice/server.py index f0b5bde..d1640d6 100644 --- a/kripodb/webservice/server.py +++ b/kripodb/webservice/server.py @@ -7,7 +7,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. @@ -22,7 +22,7 @@ from flask import current_app, abort from ..version import __version__ -from ..pairs import open_distance_matrix +from ..pairs import open_similarity_matrix LOGGER = logging.getLogger(__name__) @@ -32,17 +32,17 @@ def get_similar_fragments(fragment_id, cutoff, limit): Args: fragment_id (str): Query fragment identifier - cutoff (float): Cutoff, distance scores below cutoff are discarded. + cutoff (float): Cutoff, similarity scores below cutoff are discarded. limit (int): Maximum number of hits. Default is None for no limit. Returns: - List(Dict()): Query fragment identifier, hit fragment identifier and distance score + List(Dict()): Query fragment identifier, hit fragment identifier and similarity score """ - distance_matrix = current_app.config['matrix'] + similarity_matrix = current_app.config['matrix'] query_id = fragment_id hits = [] try: - raw_hits = distance_matrix.find(query_id, cutoff, limit) + raw_hits = similarity_matrix.find(query_id, cutoff, limit) # add query column for hit_id, score in raw_hits: hits.append({'query_frag_id': query_id, 'hit_frag_id': hit_id, 'score': score}) @@ -60,11 +60,11 @@ def get_version(): return {'version': __version__} -def wsgi_app(dist_matrix, external_url='http://localhost:8084/kripo'): +def wsgi_app(sim_matrix, external_url='http://localhost:8084/kripo'): """Create wsgi app Args: - dist_matrix (DistanceMatrix): Distance matrix to use in webservice + sim_matrix (SimilarityMatrix): Similarity matrix to use in webservice external_url (str): URL which should be used in Swagger spec Returns: @@ -74,7 +74,7 @@ def wsgi_app(dist_matrix, external_url='http://localhost:8084/kripo'): url = urlparse(external_url) swagger_file = resource_filename(__name__, 'swagger.json') app.add_api(swagger_file, base_path=url.path, arguments={'hostport': url.netloc, 'scheme': url.scheme}) - app.app.config['matrix'] = dist_matrix + app.app.config['matrix'] = sim_matrix return app @@ -82,12 +82,12 @@ def serve_app(matrix, internal_port=8084, external_url='http://localhost:8084/kr """Serve webservice forever Args: - matrix: Filename of distance matrix hdf5 file + matrix: Filename of similarity matrix hdf5 file internal_port: TCP port on which to listen external_url (str): URL which should be used in Swagger spec """ - dist_matrix = open_distance_matrix(matrix) - app = wsgi_app(dist_matrix, external_url) + sim_matrix = open_similarity_matrix(matrix) + app = wsgi_app(sim_matrix, external_url) LOGGER.setLevel(logging.INFO) LOGGER.addHandler(logging.StreamHandler()) 
     LOGGER.info(' * Swagger spec at {}/swagger.json'.format(external_url))
@@ -95,4 +95,4 @@ def serve_app(matrix, internal_port=8084, external_url='http://localhost:8084/kr
     try:
         app.run(port=internal_port)
     finally:
-        dist_matrix.close()
+        sim_matrix.close()
diff --git a/tests/test_canned.py b/tests/test_canned.py
index 5e5cae7..7888eb5 100644
--- a/tests/test_canned.py
+++ b/tests/test_canned.py
@@ -22,7 +22,7 @@ def test_similarities():
     queries = pd.Series(['3j7u_NDP_frag24'])

-    result = similarities(queries, 'data/distances.h5', 0.85)
+    result = similarities(queries, 'data/similarities.h5', 0.85)

     expected = [
         {'query_frag_id': '3j7u_NDP_frag24', 'hit_frag_id': '3j7u_NDP_frag23', 'score': 0.8991},
@@ -33,7 +33,7 @@ def test_similarities_limitof1():
     queries = pd.Series(['3j7u_NDP_frag24'])

-    result = similarities(queries, 'data/distances.h5', 0.55, 1)
+    result = similarities(queries, 'data/similarities.h5', 0.55, 1)

     expected = [
         {'query_frag_id': '3j7u_NDP_frag24', 'hit_frag_id': '3j7u_NDP_frag23', 'score': 0.8991},
diff --git a/tests/test_frozen.py b/tests/test_frozen.py
index bf1d931..2bc1155 100644
--- a/tests/test_frozen.py
+++ b/tests/test_frozen.py
@@ -17,11 +17,11 @@
 import pandas as pd
 import pandas.util.testing as pdt
 from tests.test_pairs import tmpname
-from kripodb.frozen import FrozenDistanceMatrix
-from kripodb.hdf5 import DistanceMatrix
+from kripodb.frozen import FrozenSimilarityMatrix
+from kripodb.hdf5 import SimilarityMatrix


-class TestFrozenDistanceMatrix(object):
+class TestFrozenSimilarityMatrix(object):
     pair_matrix_fn = None
     pair_matrix = None
     matrix_fn = None
@@ -29,17 +29,17 @@ class TestFrozenDistanceMatrix(object):

     def setUp(self):
         self.pair_matrix_fn = tmpname()
-        self.pair_matrix = DistanceMatrix(self.pair_matrix_fn, 'a', driver='H5FD_CORE', driver_core_backing_store=0)
+        self.pair_matrix = SimilarityMatrix(self.pair_matrix_fn, 'a', driver='H5FD_CORE', driver_core_backing_store=0)
         labels = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
-        distances = [
+        similarities = [
             ('a', 'b', 0.9),
             ('a', 'c', 0.5),
             ('b', 'c', 0.6),
             ('d', 'c', 0.7)
         ]
-        self.pair_matrix.update(distances, labels)
+        self.pair_matrix.update(similarities, labels)
         self.matrix_fn = tmpname()
-        self.matrix = FrozenDistanceMatrix(self.matrix_fn, 'a', driver='H5FD_CORE', driver_core_backing_store=0)
+        self.matrix = FrozenSimilarityMatrix(self.matrix_fn, 'a', driver='H5FD_CORE', driver_core_backing_store=0)

     def tearDown(self):
         self.pair_matrix.close()
diff --git a/tests/test_hdf5.py b/tests/test_hdf5.py
index 2698c5f..aa11b0d 100644
--- a/tests/test_hdf5.py
+++ b/tests/test_hdf5.py
@@ -16,14 +16,14 @@
 from nose.tools import eq_
 from numpy.testing import assert_array_almost_equal, assert_almost_equal

-from kripodb.hdf5 import DistanceMatrix
+from kripodb.hdf5 import SimilarityMatrix


-class TestDistanceMatrix(object):
+class TestSimilarityMatrix(object):
     matrix = None

     def setUp(self):
-        self.matrix = DistanceMatrix('data/distances.h5')
+        self.matrix = SimilarityMatrix('data/similarities.h5')

     def tearDown(self):
         self.matrix.close()
diff --git a/tests/test_modifiedtanimoto.py b/tests/test_modifiedtanimoto.py
index ae4aa8f..efdd979 100644
--- a/tests/test_modifiedtanimoto.py
+++ b/tests/test_modifiedtanimoto.py
@@ -18,7 +18,7 @@
 from kripodb import modifiedtanimoto


-def assert_distances(result, expected):
+def assert_similarities(result, expected):
     result = sorted(result)
     expected = sorted(expected)
     eq_(len(result), len(expected))
@@ -56,25 +56,25 @@ def test_corrections(self):
         assert_almost_equal(corr_st, 0.663333333333)
         assert_almost_equal(corr_sto, 0.336666666667)

-    def test_distance(self):
+    def test_similarity(self):
         bitset1 = intbitset([1, 2, 3])
         bitset2 = intbitset([1, 2, 4, 8])

-        result = modifiedtanimoto.distance(bitset1, bitset2,
+        result = modifiedtanimoto.similarity(bitset1, bitset2,
                                            self.number_of_bits,
                                            self.corr_st, self.corr_sto)

         expected = 0.5779523809525572
         assert_almost_equal(result, expected)

-    def test_distances_ignore_upper_triangle(self):
+    def test_similarities_ignore_upper_triangle(self):
         bitsets = {
             'a': intbitset([1, 2, 3]),
             'b': intbitset([1, 2, 4, 5, 8]),
             'c': intbitset([1, 2, 4, 8])
         }

-        iterator = modifiedtanimoto.distances(bitsets, bitsets,
+        iterator = modifiedtanimoto.similarities(bitsets, bitsets,
                                               self.number_of_bits,
                                               self.corr_st, self.corr_sto,
                                               0.55, True)
@@ -83,17 +83,17 @@ def test_distances_ignore_upper_triangle(self):
         expected = [
             ('a', 'c', 0.5779523809525572),
             ('b', 'c', 0.8357708333333689)]
-        # pair a-c is below cutoff with distance of 0.53
-        assert_distances(result, expected)
+        # pair a-c is below cutoff with similarity of 0.53
+        assert_similarities(result, expected)

-    def test_distances(self):
+    def test_similarities(self):
         bitsets = {
             'a': intbitset([1, 2, 3]),
             'b': intbitset([1, 2, 4, 5, 8]),
             'c': intbitset([1, 2, 4, 8])
         }

-        iterator = modifiedtanimoto.distances(bitsets, bitsets,
+        iterator = modifiedtanimoto.similarities(bitsets, bitsets,
                                               self.number_of_bits,
                                               self.corr_st, self.corr_sto,
                                               0.55, False)
@@ -104,5 +104,5 @@ def test_distances(self):
             ('c', 'a', 0.5779523809525572),
             ('c', 'b', 0.8357708333333689),
             ('b', 'c', 0.8357708333333689)]
-        # pair a-c is below cutoff with distance of 0.53
-        assert_distances(result, expected)
+        # pair a-c is below cutoff with similarity of 0.53
+        assert_similarities(result, expected)
diff --git a/tests/test_pairs.py b/tests/test_pairs.py
index f8f3083..c84ee70 100644
--- a/tests/test_pairs.py
+++ b/tests/test_pairs.py
@@ -24,7 +24,7 @@
 import kripodb.hdf5
 import kripodb.pairs as pairs

-from kripodb.hdf5 import DistanceMatrix
+from kripodb.hdf5 import SimilarityMatrix


 def tmpname():
@@ -207,10 +207,10 @@ def test_dump_pairs_astsv_nomem(self):
         expected = "a\tc\t0.13556\n"
         eq_(result, expected)

-    def test_distance2query(self):
+    def test_similarity2query(self):
         out = StringIO()

-        pairs.distance2query(self.bitsets,
+        pairs.similarity2query(self.bitsets,
                              'a',
                              out,
                              0.4,
@@ -236,24 +236,24 @@ def test_merge():
     outfile = tmpname()
     try:
         # fill infiles
-        inmatrix1 = DistanceMatrix(infiles[0], 'w', 1, 2**16-1, 2)
+        inmatrix1 = SimilarityMatrix(infiles[0], 'w', 1, 2**16-1, 2)
         inmatrix1.update([('a', 'b', 0.2)], {'a': 1, 'b': 2, 'c': 3})
         inmatrix1.close()
         # matrix with same labels -> copy pairs table by dump/append, ignores labels tables
-        inmatrix2 = DistanceMatrix(infiles[1], 'w', 2, 2**16-1, 3)
+        inmatrix2 = SimilarityMatrix(infiles[1], 'w', 2, 2**16-1, 3)
         inmatrix2.update([('a', 'c', 0.6)], {'a': 1, 'b': 2, 'c': 3})
         inmatrix2.close()
         # matrix generated with different labels -> copy pairs table by iterate/update, adds missing labels
-        inmatrix3 = DistanceMatrix(infiles[2], 'w', 2, 2**16-1, 3)
+        inmatrix3 = SimilarityMatrix(infiles[2], 'w', 2, 2**16-1, 3)
         inmatrix3.update([('b', 'e', 0.4), ('e', 'f', 0.8)], {'b': 1, 'e': 2, 'f': 3})
         inmatrix3.close()

         pairs.merge(infiles, outfile)

         # compare it
-        outmatrix = DistanceMatrix(outfile)
+        outmatrix = SimilarityMatrix(outfile)
         result = list(outmatrix)
         outmatrix.close()
         expected = [('a', 'b', 0.2), ('a', 'c', 0.6), ('b', 'e', 0.4), ('e', 'f', 0.8)]
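As an aside to the renamed tests above, the sketch below shows how the renamed `modifiedtanimoto.similarity` function would be called outside the test class. Only the argument order is taken from the diff; `number_of_bits` is a placeholder and the correction factors are copied from `test_corrections`.

```
# Sketch only, not part of the patch above.
# number_of_bits is a placeholder; corr_st and corr_sto are the values
# asserted in test_corrections in the diff.
from intbitset import intbitset
from kripodb import modifiedtanimoto

number_of_bits = 574        # placeholder, depends on the fingerprint scheme used
corr_st = 0.663333333333    # taken from test_corrections
corr_sto = 0.336666666667   # taken from test_corrections

score = modifiedtanimoto.similarity(intbitset([1, 2, 3]),
                                    intbitset([1, 2, 4, 8]),
                                    number_of_bits, corr_st, corr_sto)
print(score)  # the test class expects ~0.578 with its own number_of_bits
```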
diff --git a/tests/test_script.py b/tests/test_script.py
index ee4ade3..e6f8426 100644
--- a/tests/test_script.py
+++ b/tests/test_script.py
@@ -18,7 +18,7 @@
 from nose.tools import eq_
 from numpy.testing import assert_array_almost_equal

-from kripodb.hdf5 import DistanceMatrix
+from kripodb.hdf5 import SimilarityMatrix
 import kripodb.script as script
 from tests.test_pairs import tmpname

@@ -26,7 +26,7 @@ def test_pairs_subcommand_defaults():
     parser = script.make_parser()
-    args = parser.parse_args(['fingerprints', 'distances', '--fragmentsdbfn', 'fragdb', 'fp1', 'fp2', 'outfn'])
+    args = parser.parse_args(['fingerprints', 'similarities', '--fragmentsdbfn', 'fragdb', 'fp1', 'fp2', 'outfn'])

     eq_(args.func, script.pairs_run)
@@ -55,7 +55,7 @@ def test_meanbitdensity():
     eq_(out.getvalue(), '0.0077683\n')


-def test_distmatrix_import_run():
+def test_simmatrix_import_run():
     output_fn = tmpname()

     tsv = '''frag_id1 frag_id2 score
@@ -65,15 +65,15 @@ def test_distmatrix_import_run():
     inputfile = StringIO(tsv)

     try:
-        script.distmatrix_import_run(inputfile=inputfile,
-                                     format='tsv',
-                                     distmatrixfn=output_fn,
-                                     fragmentsdb='data/fragments.sqlite',
-                                     nrrows=2)
-
-        distmatrix = DistanceMatrix(output_fn)
-        result = [r for r in distmatrix]
-        distmatrix.close()
+        script.simmatrix_import_run(inputfile=inputfile,
+                                    format='tsv',
+                                    simmatrixfn=output_fn,
+                                    fragmentsdb='data/fragments.sqlite',
+                                    nrrows=2)
+
+        simmatrix = SimilarityMatrix(output_fn)
+        result = [r for r in simmatrix]
+        simmatrix.close()
         expected = [('2mlm_2W7_frag1', '2mlm_2W7_frag2xx', 0.5877), ('2mlm_2W7_frag2', '3wvm_STE_frag1', 0.4633)]
         assert_array_almost_equal([r[2] for r in result], [r[2] for r in expected], 3)
         eq_([(r[0], r[1],) for r in result], [(r[0], r[1],) for r in result])
@@ -82,7 +82,7 @@ def test_distmatrix_import_run():
         os.remove(output_fn)


-def test_distmatrix_import_run_ignore_upper_triangle():
+def test_simmatrix_import_run_ignore_upper_triangle():
     output_fn = tmpname()

     tsv = '''frag_id1 frag_id2 score
@@ -96,16 +96,16 @@ def test_distmatrix_import_run_ignore_upper_triangle():
     inputfile = StringIO(tsv)

     try:
-        script.distmatrix_import_run(inputfile=inputfile,
-                                     format='tsv',
-                                     distmatrixfn=output_fn,
-                                     fragmentsdb='data/fragments.sqlite',
-                                     nrrows=2,
-                                     ignore_upper_triangle=True)
-
-        distmatrix = DistanceMatrix(output_fn)
-        result = [r for r in distmatrix]
-        distmatrix.close()
+        script.simmatrix_import_run(inputfile=inputfile,
+                                    format='tsv',
+                                    simmatrixfn=output_fn,
+                                    fragmentsdb='data/fragments.sqlite',
+                                    nrrows=2,
+                                    ignore_upper_triangle=True)
+
+        simmatrix = SimilarityMatrix(output_fn)
+        result = [r for r in simmatrix]
+        simmatrix.close()
         print(result)
         expected = [('2mlm_2W7_frag1', '2mlm_2W7_frag2xx', 0.5877), ('2mlm_2W7_frag2', '3wvm_STE_frag1', 0.4633)]
         assert_array_almost_equal([r[2] for r in result], [r[2] for r in expected], 3)
@@ -115,9 +115,9 @@ def test_distmatrix_import_run_ignore_upper_triangle():
         os.remove(output_fn)


-def test_distmatrix_export_run():
+def test_simmatrix_export_run():
     outputfile = StringIO()
-    script.distmatrix_export_run('data/distances.h5', outputfile)
+    script.simmatrix_export_run('data/similarities.h5', outputfile)

     # go back to start of file
     outputfile.seek(0)
@@ -143,7 +143,7 @@ def test_read_fpneighpairs_file():
     eq_(result, expected)


-def test_distmatrix_importfpneigh_run():
+def test_simmatrix_importfpneigh_run():
     output_fn = tmpname()

     tsv = '''Compounds similar to 2mlm_2W7_frag1:
@@ -156,21 +156,21 @@ def test_distmatrix_importfpneigh_run():
     inputfile = StringIO(tsv)

     try:
-        script.distmatrix_importfpneigh_run(inputfile=inputfile,
-                                            distmatrixfn=output_fn,
-                                            fragmentsdb='data/fragments.sqlite',
-                                            nrrows=3)
-
-        distmatrix = DistanceMatrix(output_fn)
-        rows = [r for r in distmatrix]
-        distmatrix.close()
+        script.simmatrix_importfpneigh_run(inputfile=inputfile,
+                                           simmatrixfn=output_fn,
+                                           fragmentsdb='data/fragments.sqlite',
+                                           nrrows=3)
+
+        simmatrix = SimilarityMatrix(output_fn)
+        rows = [r for r in simmatrix]
+        simmatrix.close()
         expected = [(u'2mlm_2W7_frag1', u'2mlm_2W7_frag2', 0.5877), (u'2mlm_2W7_frag2', u'3wvm_STE_frag1', 0.4633)]
         eq_(rows, expected)
     finally:
         os.remove(output_fn)


-def test_distmatrix_importfpneigh_run_ignore_upper_triangle():
+def test_simmatrix_importfpneigh_run_ignore_upper_triangle():
     output_fn = tmpname()

     tsv = '''Compounds similar to 2mlm_2W7_frag1:
@@ -184,15 +184,15 @@ def test_distmatrix_importfpneigh_run_ignore_upper_triangle():
     inputfile = StringIO(tsv)

     try:
-        script.distmatrix_importfpneigh_run(inputfile=inputfile,
-                                            distmatrixfn=output_fn,
-                                            fragmentsdb='data/fragments.sqlite',
-                                            nrrows=3,
-                                            ignore_upper_triangle=True)
-
-        distmatrix = DistanceMatrix(output_fn)
-        rows = [r for r in distmatrix]
-        distmatrix.close()
+        script.simmatrix_importfpneigh_run(inputfile=inputfile,
+                                           simmatrixfn=output_fn,
+                                           fragmentsdb='data/fragments.sqlite',
+                                           nrrows=3,
+                                           ignore_upper_triangle=True)
+
+        simmatrix = SimilarityMatrix(output_fn)
+        rows = [r for r in simmatrix]
+        simmatrix.close()
         expected = [(u'2mlm_2W7_frag1', u'2mlm_2W7_frag2', 0.5877), (u'2mlm_2W7_frag2', u'3wvm_STE_frag1', 0.4633)]
         eq_(rows, expected)
     finally:
diff --git a/tests/test_webservice.py b/tests/test_webservice.py
index 88fbb55..6a4559e 100644
--- a/tests/test_webservice.py
+++ b/tests/test_webservice.py
@@ -15,14 +15,14 @@
 from nose.tools import eq_
 import requests_mock

 from kripodb.webservice import server
-from kripodb.hdf5 import DistanceMatrix
+from kripodb.hdf5 import SimilarityMatrix
 from kripodb.version import __version__
 from kripodb.webservice.client import WebserviceClient


 class TestWebservice(object):
     def setUp(self):
-        self.matrix = DistanceMatrix('data/distances.h5')
+        self.matrix = SimilarityMatrix('data/similarities.h5')
         self.app = server.wsgi_app(self.matrix)

     def tearDown(self):
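With the 2.0.0 rename in place, the minimal sketch below illustrates how a similarity matrix would be queried, both directly through `SimilarityMatrix.find` (as the webservice server above does) and via `WebserviceClient`. The file name, fragment identifier and cutoff are taken from the tests in this changeset; the client constructor argument is assumed to be the base URL of a running `kripodb similarities serve` instance.

```
# Sketch only, not part of the patch above.
from kripodb.hdf5 import SimilarityMatrix
from kripodb.webservice.client import WebserviceClient

# Query a local similarity matrix file directly, as the webservice server does.
matrix = SimilarityMatrix('data/similarities.h5')
try:
    # find(query_id, cutoff, limit); limit=None means no limit
    for hit_id, score in matrix.find('3j7u_NDP_frag24', 0.55, None):
        print(hit_id, score)
finally:
    matrix.close()

# Query a running webservice instead; the base URL here is an assumption.
client = WebserviceClient('http://localhost:8084/kripo')
hits = client.similar_fragments('3j7u_NDP_frag24', cutoff=0.55, limit=10)
print(hits)
```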