diff --git a/dynamo/configuration.py b/dynamo/configuration.py index 887b0f4eb..b1ef7cdab 100755 --- a/dynamo/configuration.py +++ b/dynamo/configuration.py @@ -109,6 +109,8 @@ def select_layer_data(adata: AnnData, layer: str, copy=False) -> pd.DataFrame: res_data = None if layer == DynamoAdataKeyManager.X_LAYER: res_data = adata.X + elif layer == DynamoAdataKeyManager.RAW: + res_data = adata.raw.X elif layer == DynamoAdataKeyManager.PROTEIN_LAYER: res_data = adata.obsm["protein"] if "protein" in adata.obsm_keys() else None else: diff --git a/dynamo/preprocessing/gene_selection.py b/dynamo/preprocessing/gene_selection.py index 305b3208d..d5fbbcc7d 100644 --- a/dynamo/preprocessing/gene_selection.py +++ b/dynamo/preprocessing/gene_selection.py @@ -580,30 +580,38 @@ def select_genes_by_seurat_recipe( main_info("n_top_genes is None, reserve all genes and add filter gene information") n_top_genes = adata.n_vars - chunk_size = chunk_size if chunk_size is not None else adata.n_vars - if algorithm == "seurat_dispersion": - chunked_layer_mats = DKM.select_layer_chunked_data( - adata[:, pass_filter_genes], - layer, - chunk_size=chunk_size, - chunk_mode="gene", - ) - mean = np.zeros(len(pass_filter_genes), dtype=initial_dtype) - variance = np.zeros(len(pass_filter_genes), dtype=initial_dtype) - - for mat_data in chunked_layer_mats: - layer_mat = mat_data[0] + if chunk_size is None: + layer_mat = DKM.select_layer_data(adata[:, pass_filter_genes], layer) if nan_replace_val: main_info("replacing nan values with: %s" % (nan_replace_val)) _mask = get_nan_or_inf_data_bool_mask(layer_mat) layer_mat[_mask] = nan_replace_val - chunked_mean, chunked_var = seurat_get_mean_var(layer_mat) + mean, variance = seurat_get_mean_var(layer_mat) + else: + chunked_layer_mats = DKM.select_layer_chunked_data( + adata[:, pass_filter_genes], + layer, + chunk_size=chunk_size, + chunk_mode="gene", + ) + mean = np.zeros(len(pass_filter_genes), dtype=initial_dtype) + variance = np.zeros(len(pass_filter_genes), dtype=initial_dtype) + + for mat_data in chunked_layer_mats: + layer_mat = mat_data[0] + + if nan_replace_val: + main_info("replacing nan values with: %s" % (nan_replace_val)) + _mask = get_nan_or_inf_data_bool_mask(layer_mat) + layer_mat[_mask] = nan_replace_val + + chunked_mean, chunked_var = seurat_get_mean_var(layer_mat) - mean[mat_data[1] : mat_data[2]] = chunked_mean - variance[mat_data[1] : mat_data[2]] = chunked_var + mean[mat_data[1]:mat_data[2]] = chunked_mean + variance[mat_data[1]:mat_data[2]] = chunked_var mean, variance, highly_variable_mask = select_genes_by_seurat_dispersion( mean=mean, diff --git a/dynamo/preprocessing/normalization.py b/dynamo/preprocessing/normalization.py index 245cda265..38feea6e1 100644 --- a/dynamo/preprocessing/normalization.py +++ b/dynamo/preprocessing/normalization.py @@ -300,8 +300,6 @@ def normalize( adata.obs = adata.obs.loc[:, ~adata.obs.columns.str.contains("Size_Factor")] - chunk_size = chunk_size if chunk_size is not None else adata.n_obs - if np.count_nonzero(adata.obs.columns.str.contains("Size_Factor")) < len(layers): calc_sz_factor( adata, @@ -329,8 +327,7 @@ def normalize( """This normalization implements the centered log-ratio (CLR) normalization from Seurat which is computed for each gene (M Stoeckius, 2017). """ - CMs_data = DKM.select_layer_chunked_data(adata, layer, chunk_size=adata.n_obs) - CM = next(CMs_data)[0] + CM = DKM.select_layer_data(adata, layer) CM = CM.T n_feature = CM.shape[1] @@ -352,28 +349,39 @@ def normalize( main_info_insert_adata_layer("X_" + layer) adata.layers["X_" + layer] = CM else: - CMs_data = DKM.select_layer_chunked_data(adata, layer, chunk_size=chunk_size) + if chunk_size is None: + CM = DKM.select_layer_data(adata, layer) + CM = size_factor_normalize(CM, szfactors) - if layer in ["raw", "X"]: - main_debug("set adata to normalized data.") + if layer in ["raw", "X"]: + main_debug("set adata to normalized data.") + adata.X = CM + else: + main_info_insert_adata_layer("X_" + layer) + adata.layers["X_" + layer] = CM + else: + CMs_data = DKM.select_layer_chunked_data(adata, layer, chunk_size=chunk_size) - for CM_data in CMs_data: - CM = CM_data[0] - CM = size_factor_normalize(CM, szfactors[CM_data[1] : CM_data[2]]) - adata.X[CM_data[1] : CM_data[2]] = CM + if layer in ["raw", "X"]: + main_debug("set adata to normalized data.") - else: - main_info_insert_adata_layer("X_" + layer) + for CM_data in CMs_data: + CM = CM_data[0] + CM = size_factor_normalize(CM, szfactors[CM_data[1]:CM_data[2]]) + adata.X[CM_data[1]:CM_data[2]] = CM - if issparse(adata.layers[layer]): - adata.layers["X_" + layer] = csr_matrix(np.zeros(adata.layers[layer].shape)) else: - adata.layers["X_" + layer] = np.zeros(adata.layers[layer].shape) + main_info_insert_adata_layer("X_" + layer) + + if issparse(adata.layers[layer]): + adata.layers["X_" + layer] = csr_matrix(np.zeros(adata.layers[layer].shape)) + else: + adata.layers["X_" + layer] = np.zeros(adata.layers[layer].shape) - for CM_data in CMs_data: - CM = CM_data[0] - CM = size_factor_normalize(CM, szfactors[CM_data[1] : CM_data[2]]) - adata.layers["X_" + layer][CM_data[1] : CM_data[2]] = CM + for CM_data in CMs_data: + CM = CM_data[0] + CM = size_factor_normalize(CM, szfactors[CM_data[1]:CM_data[2]]) + adata.layers["X_" + layer][CM_data[1]:CM_data[2]] = CM return adata @@ -477,13 +485,8 @@ def sz_util( extend_layers=False, ) - chunk_size = chunk_size if chunk_size is not None else adata.n_obs - chunked_CMs = DKM.select_layer_chunked_data(adata, layer, chunk_size=chunk_size) if CM is None else CM - - cell_total = np.zeros(adata.n_obs, dtype=initial_dtype) - - for CM_data in chunked_CMs: - CM = CM_data[0] + if chunk_size is None: + CM = DKM.select_layer_data(adata, layer) if CM is None else CM if CM is None: return None, None @@ -495,12 +498,32 @@ def sz_util( else: CM = CM.round().astype("int") - chunk_cell_total = CM.sum(axis=1).A1 if issparse(CM) else CM.sum(axis=1) - chunk_cell_total += chunk_cell_total == 0 # avoid infinity value after log (0) + cell_total = CM.sum(axis=1).A1 if issparse(CM) else CM.sum(axis=1) + cell_total += cell_total == 0 # avoid infinity value after log (0) + else: + chunked_CMs = DKM.select_layer_chunked_data(adata, layer, chunk_size=chunk_size) if CM is None else CM + + cell_total = np.zeros(adata.n_obs, dtype=initial_dtype) + + for CM_data in chunked_CMs: + CM = CM_data[0] + + if CM is None: + return None, None + + if round_exprs: + main_debug("rounding expression data of layer: %s during size factor calculation" % (layer)) + if issparse(CM): + CM.data = np.round(CM.data, 0) + else: + CM = CM.round().astype("int") + + chunk_cell_total = CM.sum(axis=1).A1 if issparse(CM) else CM.sum(axis=1) + chunk_cell_total += chunk_cell_total == 0 # avoid infinity value after log (0) - cell_total[CM_data[1] : CM_data[2]] = chunk_cell_total + cell_total[CM_data[1]:CM_data[2]] = chunk_cell_total - cell_total = cell_total.astype(int) if np.all(cell_total % 1 == 0) else cell_total + cell_total = cell_total.astype(int) if np.all(cell_total % 1 == 0) else cell_total if method in ["mean-geometric-mean-total", "geometric"]: sfs = cell_total / (np.exp(locfunc(np.log(cell_total))) if scale_to is None else scale_to)