
Add new Clustergram feature (Hierarchical clustered distance matrix with dendrograms)

- Data analysis and export are now possible for a list of specific physical positions
- Reduced RAM usage of the server-side Python component
- Bugfixes
patrick-koenig committed May 15, 2023
1 parent c53b7fd commit 4e16af0
Showing 35 changed files with 1,480 additions and 393 deletions.
4 changes: 3 additions & 1 deletion divbrowse/__init__.py
@@ -11,4 +11,6 @@

logging.getLogger('numexpr').setLevel(logging.WARNING)
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('h5py').setLevel(logging.WARNING)
logging.getLogger('h5py').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)
logging.getLogger('matplotlib.font_manager').setLevel(logging.WARNING)
3 changes: 3 additions & 0 deletions divbrowse/divbrowse.config.yml.example
@@ -37,6 +37,9 @@ gff3:
url: https://some.external.resource.org/{FEATURE_ID}
linktext: Open this gene in an external resource

features:
pca: true
umap: true

chromosome_labels:
1: 1H
4 changes: 4 additions & 0 deletions divbrowse/divbrowse.config.yml.skeleton
@@ -22,6 +22,10 @@ gff3:
external_link_ontology_term: https://www.ebi.ac.uk/QuickGO/term/{ID}
external_links:

features:
pca: true
umap: true

chromosome_labels:

gff3_chromosome_labels:
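The new features block in both config files toggles analysis modules per DivBrowse instance. Below is a minimal sketch of how a backend could read these flags with safe defaults; the helper name and default values are assumptions for illustration, not code from this commit.

# Hypothetical helper (name and defaults assumed, not from this commit): read the
# feature toggles from the parsed YAML config, treating missing keys as enabled.
DEFAULT_FEATURES = {'pca': True, 'umap': True}

def get_enabled_features(config: dict) -> dict:
    configured = config.get('features') or {}
    return {name: bool(configured.get(name, default)) for name, default in DEFAULT_FEATURES.items()}

print(get_enabled_features({'features': {'umap': False}}))   # -> {'pca': True, 'umap': False}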
15 changes: 14 additions & 1 deletion divbrowse/lib/analysis.py
@@ -69,7 +69,6 @@ def calc_distance_to_reference(self, samples):

ref_vec = np.zeros(self.variant_calls_slice.numbers_of_alternate_alleles.shape[1]).reshape(1, self.variant_calls_slice.numbers_of_alternate_alleles.shape[1])


start = timer()
distances = pairwise_distances(calls_imputed, ref_vec, n_jobs=1, metric='hamming')
distances = distances * self.variant_calls_slice.numbers_of_alternate_alleles.shape[1]
@@ -79,6 +78,20 @@ def calc_distance_to_reference(self, samples):
log.debug("==== pairwise_distances() calculation time: %f", timer() - start)
return distances_combined


def calc_distance_matrix(self, samples):
calls_imputed = self.get_imputed_calls()

start = timer()
distances = pairwise_distances(calls_imputed, n_jobs=4, metric='hamming')
# scale the per-pair Hamming fraction by the number of variants to get absolute counts of differing calls
distances = distances * self.variant_calls_slice.numbers_of_alternate_alleles.shape[1]
distances = distances.astype(np.int16)
#sample_ids = np.array(self.variant_calls_slice.samples_selected_mapped).reshape(samples[self.variant_calls_slice.samples_mask].shape[0], 1)
#distances_combined = np.concatenate((sample_ids, distances), axis=1)
#log.debug("==== pairwise_distances() calculation time: %f", timer() - start)
#return distances_combined
return distances
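calc_distance_matrix() above returns a square matrix of per-sample-pair Hamming distances scaled to absolute counts of differing calls. A clustergram then orders this matrix by hierarchical clustering and attaches dendrograms. The following is a self-contained sketch of that downstream step on toy data, using scipy/scikit-learn/matplotlib; it illustrates the idea and is not necessarily how this commit renders the clustergram.

import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
from sklearn.metrics import pairwise_distances

# toy genotype matrix: 5 samples x 8 variants, values = number of alternate alleles
calls = np.array([
    [0, 0, 1, 2, 0, 1, 0, 2],
    [0, 0, 1, 2, 0, 1, 0, 2],
    [1, 0, 0, 2, 1, 1, 0, 0],
    [2, 2, 0, 0, 1, 0, 2, 0],
    [2, 2, 0, 0, 1, 0, 2, 1],
])

# pairwise Hamming distances scaled to absolute counts, mirroring calc_distance_matrix()
dist = (pairwise_distances(calls, metric='hamming') * calls.shape[1]).astype(np.int16)

# hierarchical clustering on the condensed distance matrix, then a dendrogram
Z = linkage(squareform(dist, checks=False), method='average')
dendrogram(Z, labels=['sample1', 'sample2', 'sample3', 'sample4', 'sample5'])
plt.tight_layout()
plt.savefig('clustergram_dendrogram.png')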


def pca(self):
"""Calculate a PCA for a variant matrix array
117 changes: 103 additions & 14 deletions divbrowse/lib/genotype_data.py
@@ -50,6 +50,7 @@ def __init__(self, config):
self._setup_sample_id_mapping()
self._create_chrom_indices()
self._create_list_of_chromosomes()
self._free_mem()


def _load_data(self):
@@ -68,6 +69,11 @@ def _load_data(self):
self.pos = self.callset['variants/POS'][:]
self.samples = self.callset['samples'][:]

#print('----------------------------------------------------------')
#print(self.chrom.nbytes)
#print(self.pos.nbytes)
#print(self.samples.nbytes)

self.count_variants = len(self.pos)
self.count_samples = len(self.samples)

@@ -124,20 +130,23 @@ def _create_chrom_indices(self):
try:
chrom_indices[_chr] = pd.read_hdf(cached_index_path, key='s')
log.debug("++++ Loaded Pandas index for chromosome %s", str(_chr))
#print(chrom_indices[_chr].memory_usage(index=True))
#print(chrom_indices[_chr].info())

except FileNotFoundError:
log.debug("++++ Creating Pandas index for chromosome %s", str(_chr))
chrom_range = self.idx_pos.locate_key(_chr)
chrom_indices[_chr] = pd.Series(data=np.arange(chrom_range.start, chrom_range.stop), index=self.pos[chrom_range])
chrom_indices[_chr].to_hdf(cached_index_path, key='s', mode='w', complevel=5, complib='blosc:zstd')
#print(chrom_indices[_chr].memory_usage(index=True))

self.chrom_indices = chrom_indices



def _create_list_of_chromosomes(self):

chromosome_labels = {str(key): str(value) for key, value in self.config['chromosome_labels'].items()}
self.chromosome_labels = {str(key): str(value) for key, value in self.config['chromosome_labels'].items()}
centromeres_positions = {str(key): str(value) for key, value in self.config['centromeres_positions'].items()}

path_chromosome_tmp_data = self.datadir+'____list_of_chromosomes____.json'
@@ -154,7 +163,7 @@ def _create_list_of_chromosomes(self):
_region = self.idx_pos.locate_range(_chr)
self.list_of_chromosomes.append({
'id': _chr,
'label': chromosome_labels[str(_chr)],
'label': self.chromosome_labels[str(_chr)],
'centromere_position': int(centromeres_positions[str(_chr)]),
'start': int(self.pos[ _region.start ]),
'end': int(self.pos[ _region.stop - 1 ]),
@@ -164,6 +173,14 @@ def _create_list_of_chromosomes(self):
json.dump(self.list_of_chromosomes, outfile)



def _free_mem(self):
# drop references to large helper arrays once the per-chromosome indices are built, to reduce server-side RAM usage
del self.idx_pos
del self.chrom
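This is where the commit's RAM reduction comes from: once the position index and the raw CHROM array are no longer referenced, CPython frees the underlying NumPy buffers. A rough illustration with toy sizes (not data from DivBrowse):

import numpy as np

chrom = np.full(50_000_000, b'1H', dtype='S2')   # stand-in for a per-variant CHROM column
print(f'{chrom.nbytes / 2**20:.0f} MiB held')    # ~95 MiB
del chrom                                        # the buffer is freed once the last reference is gone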




def sample_ids_to_mask(self, sample_ids: list) -> np.ndarray:
"""Creates a boolean mask based on the input sample IDs that could be found in the samples array of the Zarr storage
@@ -280,6 +297,27 @@ def get_posidx_by_genome_coordinate(self, chrom, pos, method='nearest') -> Tuple
return lookup, 'nearest_lookup'


def get_posidx_by_genome_coordinates(self, chrom, positions, method='nearest') -> tuple:

pd_series = self.chrom_indices[chrom]

positions = [int(x) for x in positions]

start = timer()
diff = pd.Index(positions).difference(pd_series.index)
log.debug("============ diff = pd.Index(positions).difference(pd_series.index) => calculation time: %f", timer() - start)

if diff.size > 0:
return False, None, diff.values

else:
start = timer()
lookup = pd_series.loc[positions]
log.debug("============ lookup = pd_series.loc[positions] => calculation time: %f", timer() - start)
return True, lookup.values, None
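get_posidx_by_genome_coordinates() does one vectorized lookup for a whole list of physical positions: pd.Index.difference() checks that every requested position exists in the chromosome's pandas index, and only then Series.loc maps positions to variant array row indices; otherwise the missing positions are returned. The same pattern in isolation, on toy data:

import numpy as np
import pandas as pd

# toy stand-in for one chromosome index: physical position -> variant array row
pd_series = pd.Series(data=np.arange(5), index=[100, 250, 400, 900, 1300])

requested = [250, 900, 999]
missing = pd.Index(requested).difference(pd_series.index)
if missing.size > 0:
    print('positions not found:', missing.values)            # -> [999]
else:
    print('variant row indices:', pd_series.loc[requested].values)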




def count_variants_in_window(self, chrom, startpos, endpos) -> int:
"""Counts number of variants in a genomic region
@@ -316,35 +354,60 @@ def get_slice_of_variant_calls(
chrom,
startpos = None,
endpos = None,
positions = None,
count = None,
samples = None,
variant_filter_settings = None,
with_call_metadata = False,
calc_summary_stats = False,
flanking_region_include = False,
flanking_region_length = 1500,
flanking_region_direction = 'both'
) -> VariantCallsSlice:

start = timer()

lookup_type_start = False
lookup_type_end = False
type_of_slice = 'range'
positions_not_found = None

if samples is None:
samples = self.samples

samples_mask, samples_selected_mapped = self.get_samples_mask(samples)

log.debug("============ self.get_samples_mask() => calculation time: %f", timer() - start)
start = timer()

if count is None:

if flanking_region_include:
startpos = startpos - flanking_region_length
endpos = endpos + flanking_region_length
location_start, lookup_type_start = self.get_posidx_by_genome_coordinate(chrom, startpos, method='backfill')
location_end, lookup_type_end = self.get_posidx_by_genome_coordinate(chrom, endpos, method='pad')
#print("++++++++++++++++++++++++++++++++++++++++++++")
#print(positions)
#print(type(positions))

if positions and isinstance(positions, list) and len(positions) > 0:
type_of_slice = 'positions'
positional_lookup_success, positions_matched_indices, positions_not_found = self.get_posidx_by_genome_coordinates(chrom, positions)

location_start = 0
location_end = 0

#print('******************************************************************************')
#print(positional_lookup_success)
#print(positions_matched_indices)

else:
location_start, lookup_type_start = self.get_posidx_by_genome_coordinate(chrom, startpos)
location_end, lookup_type_end = self.get_posidx_by_genome_coordinate(chrom, endpos)
location_end = location_end + 1
if flanking_region_include:
startpos = startpos - flanking_region_length
endpos = endpos + flanking_region_length
location_start, lookup_type_start = self.get_posidx_by_genome_coordinate(chrom, startpos, method='backfill')
location_end, lookup_type_end = self.get_posidx_by_genome_coordinate(chrom, endpos, method='pad')

else:
location_start, lookup_type_start = self.get_posidx_by_genome_coordinate(chrom, startpos)
location_end, lookup_type_end = self.get_posidx_by_genome_coordinate(chrom, endpos)
location_end = location_end + 1
else:
if startpos is not None:
# Get start coordinate (allows automatic position fuzzy search if coordinate does not exist in the variant matrix!)
@@ -365,17 +428,35 @@ def get_slice_of_variant_calls(
if location_start < 0:
location_start = 0
location_end = location_start + count

log.debug("============ self.get_posidx_by_genome_coordinate() section => calculation time: %f", timer() - start)


if type_of_slice == 'positions':
if positional_lookup_success:
slice_variant_calls = positions_matched_indices
positions = self.pos[positions_matched_indices]
positions_indices = positions_matched_indices
else:
slice_variant_calls = False
positions = np.array(positions)
positions_indices = positions_not_found
else:
# create slice() object for later going into get_orthogonal_selection()
slice_variant_calls = slice(location_start, location_end, None)
positions = self.pos[slice_variant_calls]
positions_indices = np.array(list(range(location_start, location_end)))


# create slice() object for later going into get_orthogonal_selection()
slice_variant_calls = slice(location_start, location_end, None)

positions = self.pos[slice_variant_calls]

# get the variant slice from Zarr dataset
start = timer()
sliced_variant_calls = self.calldata.get_orthogonal_selection((slice_variant_calls, samples_mask))
log.debug("============ self.calldata.get_orthogonal_selection() section => calculation time: %f", timer() - start)


start = timer()
metadata = {}
if with_call_metadata:
# get DP values
Expand All @@ -397,17 +478,25 @@ def get_slice_of_variant_calls(
metadata['DV'][sample] = sliced_DV[i].tolist()
i += 1

log.debug("============ if with_call_metadata: section => calculation time: %f", timer() - start)

start = timer()
variant_calls_slice = VariantCallsSlice(
gd = self,
type_of_slice = type_of_slice,
#positional_lookup_success = positional_lookup_success,
sliced_variant_calls = sliced_variant_calls,
positions = positions,
positions_indices = positions_indices,
positions_not_found = positions_not_found,
location_start = location_start,
location_end = location_end,
samples_mask = samples_mask,
samples_selected_mapped = samples_selected_mapped,
variant_filter_settings = variant_filter_settings,
calls_metadata = metadata
calls_metadata = metadata,
calc_summary_stats = calc_summary_stats
)
log.debug("============ variant_calls_slice = VariantCallsSlice() section => calculation time: %f", timer() - start)

return variant_calls_slice
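With the new positions parameter, get_slice_of_variant_calls() passes either a contiguous slice() or an explicit array of variant row indices as the first dimension of Zarr's orthogonal selection, with the boolean sample mask as the second. A toy sketch of both access modes (array shapes made up for the example):

import numpy as np
import zarr

calldata = zarr.array(np.arange(40).reshape(10, 4))       # 10 variants x 4 samples
samples_mask = np.array([True, False, True, True])         # boolean sample selection

# range-based request: contiguous slice over the variant axis
print(calldata.get_orthogonal_selection((slice(2, 5), samples_mask)).shape)              # (3, 3)

# positions-based request: explicit variant row indices from the pandas lookup
print(calldata.get_orthogonal_selection((np.array([1, 4, 7]), samples_mask)).shape)      # (3, 3)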
15 changes: 14 additions & 1 deletion divbrowse/lib/utils.py
@@ -1,5 +1,8 @@
from simplejson import JSONEncoder

#from flask.json.provider import JSONProvider
import orjson

RED = '\033[31m'
RESET = '\033[0m'

@@ -27,4 +30,14 @@ class StrictEncoder(JSONEncoder):
def __init__(self, *args, **kwargs):
kwargs["allow_nan"] = False
kwargs["ignore_nan"] = True
super().__init__(*args, **kwargs)
super().__init__(*args, **kwargs)

class ORJSONEncoder:

def __init__(self, **kwargs):
# kwargs are stored for possible later use as orjson serialization options; currently unused
self.options = kwargs

def encode(self, obj):
# decode back to str, as orjson returns bytes
return orjson.dumps(obj, option=orjson.OPT_NON_STR_KEYS).decode('utf-8')
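ORJSONEncoder exposes the same encode() entry point as a stdlib-style encoder; orjson returns bytes, hence the decode back to str. A small usage sketch with made-up values, assuming the class defined above is importable:

encoder = ORJSONEncoder()   # the ORJSONEncoder defined above

# OPT_NON_STR_KEYS lets orjson serialize dicts with integer keys,
# e.g. a chromosome_labels-style mapping
print(encoder.encode({1: '1H', 2: '2H', 'values': [1.5, None]}))
# -> {"1":"1H","2":"2H","values":[1.5,null]}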