
Add new Clustergram feature (Hierarchical clustered distance matrix with dendrograms)

- Data analysis and export are now possible for a list of specific physical positions
- Reduced RAM usage of the server-side Python component
- Bugfixes
patrick-koenig committed May 15, 2023
1 parent c53b7fd commit 4e16af0
Showing 35 changed files with 1,480 additions and 393 deletions.
4 changes: 3 additions & 1 deletion divbrowse/__init__.py
@@ -11,4 +11,6 @@

logging.getLogger('numexpr').setLevel(logging.WARNING)
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('h5py').setLevel(logging.WARNING)
logging.getLogger('h5py').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)
logging.getLogger('matplotlib.font_manager').setLevel(logging.WARNING)
3 changes: 3 additions & 0 deletions divbrowse/divbrowse.config.yml.example
@@ -37,6 +37,9 @@ gff3:
url: https://some.external.resource.org/{FEATURE_ID}
linktext: Open this gene in an external resource

features:
pca: true
umap: true

chromosome_labels:
1: 1H
4 changes: 4 additions & 0 deletions divbrowse/divbrowse.config.yml.skeleton
@@ -22,6 +22,10 @@ gff3:
external_link_ontology_term: https://www.ebi.ac.uk/QuickGO/term/{ID}
external_links:

features:
pca: true
umap: true

chromosome_labels:

gff3_chromosome_labels:
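The new features block in both config files toggles analysis modules per DivBrowse instance. Below is a minimal sketch of how a backend could read these flags with safe defaults; the helper name and default values are assumptions for illustration, not code from this commit.

# Hypothetical helper (name and defaults assumed, not from this commit): read the
# feature toggles from the parsed YAML config, treating missing keys as enabled.
DEFAULT_FEATURES = {'pca': True, 'umap': True}

def get_enabled_features(config: dict) -> dict:
    configured = config.get('features') or {}
    return {name: bool(configured.get(name, default)) for name, default in DEFAULT_FEATURES.items()}

print(get_enabled_features({'features': {'umap': False}}))   # -> {'pca': True, 'umap': False}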
15 changes: 14 additions & 1 deletion divbrowse/lib/analysis.py
@@ -69,7 +69,6 @@ def calc_distance_to_reference(self, samples):

ref_vec = np.zeros(self.variant_calls_slice.numbers_of_alternate_alleles.shape[1]).reshape(1, self.variant_calls_slice.numbers_of_alternate_alleles.shape[1])


start = timer()
distances = pairwise_distances(calls_imputed, ref_vec, n_jobs=1, metric='hamming')
distances = distances * self.variant_calls_slice.numbers_of_alternate_alleles.shape[1]
@@ -79,6 +78,20 @@ def calc_distance_to_reference(self, samples):
log.debug("==== pairwise_distances() calculation time: %f", timer() - start)
return distances_combined


def calc_distance_matrix(self, samples):
calls_imputed = self.get_imputed_calls()

start = timer()
distances = pairwise_distances(calls_imputed, n_jobs=4, metric='hamming')
# scale the per-pair Hamming fraction by the number of variants to get absolute counts of differing calls
distances = distances * self.variant_calls_slice.numbers_of_alternate_alleles.shape[1]
distances = distances.astype(np.int16)
#sample_ids = np.array(self.variant_calls_slice.samples_selected_mapped).reshape(samples[self.variant_calls_slice.samples_mask].shape[0], 1)
#distances_combined = np.concatenate((sample_ids, distances), axis=1)
#log.debug("==== pairwise_distances() calculation time: %f", timer() - start)
#return distances_combined
return distances
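calc_distance_matrix() above returns a square matrix of per-sample-pair Hamming distances scaled to absolute counts of differing calls. A clustergram then orders this matrix by hierarchical clustering and attaches dendrograms. The following is a self-contained sketch of that downstream step on toy data, using scipy/scikit-learn/matplotlib; it illustrates the idea and is not necessarily how this commit renders the clustergram.

import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
from sklearn.metrics import pairwise_distances

# toy genotype matrix: 5 samples x 8 variants, values = number of alternate alleles
calls = np.array([
    [0, 0, 1, 2, 0, 1, 0, 2],
    [0, 0, 1, 2, 0, 1, 0, 2],
    [1, 0, 0, 2, 1, 1, 0, 0],
    [2, 2, 0, 0, 1, 0, 2, 0],
    [2, 2, 0, 0, 1, 0, 2, 1],
])

# pairwise Hamming distances scaled to absolute counts, mirroring calc_distance_matrix()
dist = (pairwise_distances(calls, metric='hamming') * calls.shape[1]).astype(np.int16)

# hierarchical clustering on the condensed distance matrix, then a dendrogram
Z = linkage(squareform(dist, checks=False), method='average')
dendrogram(Z, labels=['sample1', 'sample2', 'sample3', 'sample4', 'sample5'])
plt.tight_layout()
plt.savefig('clustergram_dendrogram.png')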


def pca(self):
"""Calculate a PCA for a variant matrix array
117 changes: 103 additions & 14 deletions divbrowse/lib/genotype_data.py
@@ -50,6 +50,7 @@ def __init__(self, config):
self._setup_sample_id_mapping()
self._create_chrom_indices()
self._create_list_of_chromosomes()
self._free_mem()


def _load_data(self):
@@ -68,6 +69,11 @@ def _load_data(self):
self.pos = self.callset['variants/POS'][:]
self.samples = self.callset['samples'][:]

#print('----------------------------------------------------------')
#print(self.chrom.nbytes)
#print(self.pos.nbytes)
#print(self.samples.nbytes)

self.count_variants = len(self.pos)
self.count_samples = len(self.samples)

@@ -124,20 +130,23 @@ def _create_chrom_indices(self):
try:
chrom_indices[_chr] = pd.read_hdf(cached_index_path, key='s')
log.debug("++++ Loaded Pandas index for chromosome %s", str(_chr))
#print(chrom_indices[_chr].memory_usage(index=True))
#print(chrom_indices[_chr].info())

except FileNotFoundError:
log.debug("++++ Creating Pandas index for chromosome %s", str(_chr))
chrom_range = self.idx_pos.locate_key(_chr)
chrom_indices[_chr] = pd.Series(data=np.arange(chrom_range.start, chrom_range.stop), index=self.pos[chrom_range])
chrom_indices[_chr].to_hdf(cached_index_path, key='s', mode='w', complevel=5, complib='blosc:zstd')
#print(chrom_indices[_chr].memory_usage(index=True))

self.chrom_indices = chrom_indices



def _create_list_of_chromosomes(self):

chromosome_labels = {str(key): str(value) for key, value in self.config['chromosome_labels'].items()}
self.chromosome_labels = {str(key): str(value) for key, value in self.config['chromosome_labels'].items()}
centromeres_positions = {str(key): str(value) for key, value in self.config['centromeres_positions'].items()}

path_chromosome_tmp_data = self.datadir+'____list_of_chromosomes____.json'
@@ -154,7 +163,7 @@ def _create_list_of_chromosomes(self):
_region = self.idx_pos.locate_range(_chr)
self.list_of_chromosomes.append({
'id': _chr,
'label': chromosome_labels[str(_chr)],
'label': self.chromosome_labels[str(_chr)],
'centromere_position': int(centromeres_positions[str(_chr)]),
'start': int(self.pos[ _region.start ]),
'end': int(self.pos[ _region.stop - 1 ]),
@@ -164,6 +173,14 @@ def _create_list_of_chromosomes(self):
json.dump(self.list_of_chromosomes, outfile)



def _free_mem(self):
# drop references to large helper arrays once the per-chromosome indices are built, to reduce server-side RAM usage
del self.idx_pos
del self.chrom
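This is where the commit's RAM reduction comes from: once the position index and the raw CHROM array are no longer referenced, CPython frees the underlying NumPy buffers. A rough illustration with toy sizes (not data from DivBrowse):

import numpy as np

chrom = np.full(50_000_000, b'1H', dtype='S2')   # stand-in for a per-variant CHROM column
print(f'{chrom.nbytes / 2**20:.0f} MiB held')    # ~95 MiB
del chrom                                        # the buffer is freed once the last reference is gone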




def sample_ids_to_mask(self, sample_ids: list) -> np.ndarray:
"""Creates a boolean mask based on the input sample IDs that could be found in the samples array of the Zarr storage
@@ -280,6 +297,27 @@ def get_posidx_by_genome_coordinate(self, chrom, pos, method='nearest') -> Tuple
return lookup, 'nearest_lookup'


def get_posidx_by_genome_coordinates(self, chrom, positions, method='nearest') -> tuple:

pd_series = self.chrom_indices[chrom]

positions = [int(x) for x in positions]

start = timer()
diff = pd.Index(positions).difference(pd_series.index)
log.debug("============ diff = pd.Index(positions).difference(pd_series.index) => calculation time: %f", timer() - start)

if diff.size > 0:
return False, None, diff.values

else:
start = timer()
lookup = pd_series.loc[positions]
log.debug("============ lookup = pd_series.loc[positions] => calculation time: %f", timer() - start)
return True, lookup.values, None
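get_posidx_by_genome_coordinates() does one vectorized lookup for a whole list of physical positions: pd.Index.difference() checks that every requested position exists in the chromosome's pandas index, and only then Series.loc maps positions to variant array row indices; otherwise the missing positions are returned. The same pattern in isolation, on toy data:

import numpy as np
import pandas as pd

# toy stand-in for one chromosome index: physical position -> variant array row
pd_series = pd.Series(data=np.arange(5), index=[100, 250, 400, 900, 1300])

requested = [250, 900, 999]
missing = pd.Index(requested).difference(pd_series.index)
if missing.size > 0:
    print('positions not found:', missing.values)            # -> [999]
else:
    print('variant row indices:', pd_series.loc[requested].values)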




def count_variants_in_window(self, chrom, startpos, endpos) -> int:
"""Counts number of variants in a genomic region
@@ -316,35 +354,60 @@ def get_slice_of_variant_calls(
chrom,
startpos = None,
endpos = None,
positions = None,
count = None,
samples = None,
variant_filter_settings = None,
with_call_metadata = False,
calc_summary_stats = False,
flanking_region_include = False,
flanking_region_length = 1500,
flanking_region_direction = 'both'
) -> VariantCallsSlice:

start = timer()

lookup_type_start = False
lookup_type_end = False
type_of_slice = 'range'
positions_not_found = None

if samples is None:
samples = self.samples

samples_mask, samples_selected_mapped = self.get_samples_mask(samples)

log.debug("============ self.get_samples_mask() => calculation time: %f", timer() - start)
start = timer()

if count is None:

if flanking_region_include:
startpos = startpos - flanking_region_length
endpos = endpos + flanking_region_length
location_start, lookup_type_start = self.get_posidx_by_genome_coordinate(chrom, startpos, method='backfill')
location_end, lookup_type_end = self.get_posidx_by_genome_coordinate(chrom, endpos, method='pad')
#print("++++++++++++++++++++++++++++++++++++++++++++")
#print(positions)
#print(type(positions))

if positions and isinstance(positions, list) and len(positions) > 0:
type_of_slice = 'positions'
positional_lookup_success, positions_matched_indices, positions_not_found = self.get_posidx_by_genome_coordinates(chrom, positions)

location_start = 0
location_end = 0

#print('******************************************************************************')
#print(positional_lookup_success)
#print(positions_matched_indices)

else:
location_start, lookup_type_start = self.get_posidx_by_genome_coordinate(chrom, startpos)
location_end, lookup_type_end = self.get_posidx_by_genome_coordinate(chrom, endpos)
location_end = location_end + 1
if flanking_region_include:
startpos = startpos - flanking_region_length
endpos = endpos + flanking_region_length
location_start, lookup_type_start = self.get_posidx_by_genome_coordinate(chrom, startpos, method='backfill')
location_end, lookup_type_end = self.get_posidx_by_genome_coordinate(chrom, endpos, method='pad')

else:
location_start, lookup_type_start = self.get_posidx_by_genome_coordinate(chrom, startpos)
location_end, lookup_type_end = self.get_posidx_by_genome_coordinate(chrom, endpos)
location_end = location_end + 1
else:
if startpos is not None:
# Get start coordinate (allows automatic position fuzzy search if coordinate does not exist in the variant matrix!)
@@ -365,17 +428,35 @@ def get_slice_of_variant_calls(
if location_start < 0:
location_start = 0
location_end = location_start + count

log.debug("============ self.get_posidx_by_genome_coordinate() section => calculation time: %f", timer() - start)


if type_of_slice == 'positions':
if positional_lookup_success:
slice_variant_calls = positions_matched_indices
positions = self.pos[positions_matched_indices]
positions_indices = positions_matched_indices
else:
slice_variant_calls = False
positions = np.array(positions)
positions_indices = positions_not_found
else:
# create slice() object for later going into get_orthogonal_selection()
slice_variant_calls = slice(location_start, location_end, None)
positions = self.pos[slice_variant_calls]
positions_indices = np.array(list(range(location_start, location_end)))


# create slice() object for later going into get_orthogonal_selection()
slice_variant_calls = slice(location_start, location_end, None)

positions = self.pos[slice_variant_calls]

# get the variant slice from Zarr dataset
start = timer()
sliced_variant_calls = self.calldata.get_orthogonal_selection((slice_variant_calls, samples_mask))
log.debug("============ self.calldata.get_orthogonal_selection() section => calculation time: %f", timer() - start)


start = timer()
metadata = {}
if with_call_metadata:
# get DP values
Expand All @@ -397,17 +478,25 @@ def get_slice_of_variant_calls(
metadata['DV'][sample] = sliced_DV[i].tolist()
i += 1

log.debug("============ if with_call_metadata: section => calculation time: %f", timer() - start)

start = timer()
variant_calls_slice = VariantCallsSlice(
gd = self,
type_of_slice = type_of_slice,
#positional_lookup_success = positional_lookup_success,
sliced_variant_calls = sliced_variant_calls,
positions = positions,
positions_indices = positions_indices,
positions_not_found = positions_not_found,
location_start = location_start,
location_end = location_end,
samples_mask = samples_mask,
samples_selected_mapped = samples_selected_mapped,
variant_filter_settings = variant_filter_settings,
calls_metadata = metadata
calls_metadata = metadata,
calc_summary_stats = calc_summary_stats
)
log.debug("============ variant_calls_slice = VariantCallsSlice() section => calculation time: %f", timer() - start)

return variant_calls_slice
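With the new positions parameter, get_slice_of_variant_calls() passes either a contiguous slice() or an explicit array of variant row indices as the first dimension of Zarr's orthogonal selection, with the boolean sample mask as the second. A toy sketch of both access modes (array shapes made up for the example):

import numpy as np
import zarr

calldata = zarr.array(np.arange(40).reshape(10, 4))       # 10 variants x 4 samples
samples_mask = np.array([True, False, True, True])         # boolean sample selection

# range-based request: contiguous slice over the variant axis
print(calldata.get_orthogonal_selection((slice(2, 5), samples_mask)).shape)              # (3, 3)

# positions-based request: explicit variant row indices from the pandas lookup
print(calldata.get_orthogonal_selection((np.array([1, 4, 7]), samples_mask)).shape)      # (3, 3)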
15 changes: 14 additions & 1 deletion divbrowse/lib/utils.py
@@ -1,5 +1,8 @@
from simplejson import JSONEncoder

#from flask.json.provider import JSONProvider
import orjson

RED = '\033[31m'
RESET = '\033[0m'

@@ -27,4 +30,14 @@ class StrictEncoder(JSONEncoder):
def __init__(self, *args, **kwargs):
kwargs["allow_nan"] = False
kwargs["ignore_nan"] = True
super().__init__(*args, **kwargs)
super().__init__(*args, **kwargs)

class ORJSONEncoder:

def __init__(self, **kwargs):
# kwargs are stored for possible later use as orjson serialization options; currently unused
self.options = kwargs

def encode(self, obj):
# decode back to str, as orjson returns bytes
return orjson.dumps(obj, option=orjson.OPT_NON_STR_KEYS).decode('utf-8')
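ORJSONEncoder exposes the same encode() entry point as a stdlib-style encoder; orjson returns bytes, hence the decode back to str. A small usage sketch with made-up values, assuming the class defined above is importable:

encoder = ORJSONEncoder()   # the ORJSONEncoder defined above

# OPT_NON_STR_KEYS lets orjson serialize dicts with integer keys,
# e.g. a chromosome_labels-style mapping
print(encoder.encode({1: '1H', 2: '2H', 'values': [1.5, None]}))
# -> {"1":"1H","2":"2H","values":[1.5,null]}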