From 1d4d6baead55e3137513ece4733183dc5b51dd8a Mon Sep 17 00:00:00 2001 From: sichao Date: Mon, 8 Apr 2024 12:24:42 -0400 Subject: [PATCH 1/3] make chunk as a separate option --- dynamo/configuration.py | 2 + dynamo/preprocessing/gene_selection.py | 40 +++++++----- dynamo/preprocessing/normalization.py | 87 ++++++++++++++++---------- 3 files changed, 81 insertions(+), 48 deletions(-) diff --git a/dynamo/configuration.py b/dynamo/configuration.py index 907459e14..0c72c1b61 100755 --- a/dynamo/configuration.py +++ b/dynamo/configuration.py @@ -107,6 +107,8 @@ def select_layer_data(adata: AnnData, layer: str, copy=False) -> pd.DataFrame: res_data = None if layer == DynamoAdataKeyManager.X_LAYER: res_data = adata.X + elif layer == DynamoAdataKeyManager.RAW: + res_data = adata.raw.X elif layer == DynamoAdataKeyManager.PROTEIN_LAYER: res_data = adata.obsm["protein"] if "protein" in adata.obsm_keys() else None else: diff --git a/dynamo/preprocessing/gene_selection.py b/dynamo/preprocessing/gene_selection.py index cacf33945..80b0cf56b 100644 --- a/dynamo/preprocessing/gene_selection.py +++ b/dynamo/preprocessing/gene_selection.py @@ -580,30 +580,38 @@ def select_genes_by_seurat_recipe( main_info("n_top_genes is None, reserve all genes and add filter gene information") n_top_genes = adata.n_vars - chunk_size = chunk_size if chunk_size is not None else adata.n_vars - if algorithm == "seurat_dispersion": - chunked_layer_mats = DKM.select_layer_chunked_data( - adata[:, pass_filter_genes], - layer, - chunk_size=chunk_size, - chunk_mode="gene", - ) - mean = np.zeros(len(pass_filter_genes), dtype=initial_dtype) - variance = np.zeros(len(pass_filter_genes), dtype=initial_dtype) - - for mat_data in chunked_layer_mats: - layer_mat = mat_data[0] + if chunk_size is None: + layer_mat = DKM.select_layer_data(adata[:, pass_filter_genes], layer) if nan_replace_val: main_info("replacing nan values with: %s" % (nan_replace_val)) _mask = get_nan_or_inf_data_bool_mask(layer_mat) 
layer_mat[_mask] = nan_replace_val - chunked_mean, chunked_var = seurat_get_mean_var(layer_mat) + mean, variance = seurat_get_mean_var(layer_mat) + else: + chunked_layer_mats = DKM.select_layer_chunked_data( + adata[:, pass_filter_genes], + layer, + chunk_size=chunk_size, + chunk_mode="gene", + ) + mean = np.zeros(len(pass_filter_genes), dtype=initial_dtype) + variance = np.zeros(len(pass_filter_genes), dtype=initial_dtype) + + for mat_data in chunked_layer_mats: + layer_mat = mat_data[0] + + if nan_replace_val: + main_info("replacing nan values with: %s" % (nan_replace_val)) + _mask = get_nan_or_inf_data_bool_mask(layer_mat) + layer_mat[_mask] = nan_replace_val + + chunked_mean, chunked_var = seurat_get_mean_var(layer_mat) - mean[mat_data[1]:mat_data[2]] = chunked_mean - variance[mat_data[1]:mat_data[2]] = chunked_var + mean[mat_data[1]:mat_data[2]] = chunked_mean + variance[mat_data[1]:mat_data[2]] = chunked_var mean, variance, highly_variable_mask = select_genes_by_seurat_dispersion( mean=mean, diff --git a/dynamo/preprocessing/normalization.py b/dynamo/preprocessing/normalization.py index 3e097569e..f7ce59444 100644 --- a/dynamo/preprocessing/normalization.py +++ b/dynamo/preprocessing/normalization.py @@ -298,8 +298,6 @@ def normalize( adata.obs = adata.obs.loc[:, ~adata.obs.columns.str.contains("Size_Factor")] - chunk_size = chunk_size if chunk_size is not None else adata.n_obs - if np.count_nonzero(adata.obs.columns.str.contains("Size_Factor")) < len(layers): calc_sz_factor( adata, @@ -327,8 +325,7 @@ def normalize( """This normalization implements the centered log-ratio (CLR) normalization from Seurat which is computed for each gene (M Stoeckius, 2017). 
""" - CMs_data = DKM.select_layer_chunked_data(adata, layer, chunk_size=adata.n_obs) - CM = next(CMs_data)[0] + CM = DKM.select_layer_data(adata, layer) CM = CM.T n_feature = CM.shape[1] @@ -350,28 +347,39 @@ def normalize( main_info_insert_adata_layer("X_" + layer) adata.layers["X_" + layer] = CM else: - CMs_data = DKM.select_layer_chunked_data(adata, layer, chunk_size=chunk_size) + if chunk_size is None: + CM = DKM.select_layer_data(adata, layer) + CM = size_factor_normalize(CM, szfactors) - if layer in ["raw", "X"]: - main_debug("set adata to normalized data.") + if layer in ["raw", "X"]: + main_debug("set adata to normalized data.") + adata.X = CM + else: + main_info_insert_adata_layer("X_" + layer) + adata.layers["X_" + layer] = CM + else: + CMs_data = DKM.select_layer_chunked_data(adata, layer, chunk_size=chunk_size) - for CM_data in CMs_data: - CM = CM_data[0] - CM = size_factor_normalize(CM, szfactors[CM_data[1]:CM_data[2]]) - adata.X[CM_data[1]:CM_data[2]] = CM + if layer in ["raw", "X"]: + main_debug("set adata to normalized data.") - else: - main_info_insert_adata_layer("X_" + layer) + for CM_data in CMs_data: + CM = CM_data[0] + CM = size_factor_normalize(CM, szfactors[CM_data[1]:CM_data[2]]) + adata.X[CM_data[1]:CM_data[2]] = CM - if issparse(adata.layers[layer]): - adata.layers["X_" + layer] = csr_matrix(np.zeros(adata.layers[layer].shape)) else: - adata.layers["X_" + layer] = np.zeros(adata.layers[layer].shape) + main_info_insert_adata_layer("X_" + layer) + + if issparse(adata.layers[layer]): + adata.layers["X_" + layer] = csr_matrix(np.zeros(adata.layers[layer].shape)) + else: + adata.layers["X_" + layer] = np.zeros(adata.layers[layer].shape) - for CM_data in CMs_data: - CM = CM_data[0] - CM = size_factor_normalize(CM, szfactors[CM_data[1]:CM_data[2]]) - adata.layers["X_" + layer][CM_data[1]:CM_data[2]] = CM + for CM_data in CMs_data: + CM = CM_data[0] + CM = size_factor_normalize(CM, szfactors[CM_data[1]:CM_data[2]]) + adata.layers["X_" + 
layer][CM_data[1]:CM_data[2]] = CM return adata @@ -439,7 +447,7 @@ def sz_util( total_layers: List[str] = None, CM: pd.DataFrame = None, scale_to: Union[float, None] = None, - initial_dtype: type=np.float32, + initial_dtype: type = np.float32, ) -> Tuple[pd.Series, pd.Series]: """Calculate the size factor for a given layer. @@ -475,13 +483,8 @@ def sz_util( extend_layers=False, ) - chunk_size = chunk_size if chunk_size is not None else adata.n_obs - chunked_CMs = DKM.select_layer_chunked_data(adata, layer, chunk_size=chunk_size) if CM is None else CM - - cell_total = np.zeros(adata.n_obs, dtype=initial_dtype) - - for CM_data in chunked_CMs: - CM = CM_data[0] + if chunk_size is None: + CM = DKM.select_layer_data(adata, layer) if CM is None else CM if CM is None: return None, None @@ -493,12 +496,32 @@ def sz_util( else: CM = CM.round().astype("int") - chunk_cell_total = CM.sum(axis=1).A1 if issparse(CM) else CM.sum(axis=1) - chunk_cell_total += chunk_cell_total == 0 # avoid infinity value after log (0) + cell_total = CM.sum(axis=1).A1 if issparse(CM) else CM.sum(axis=1) + cell_total += cell_total == 0 # avoid infinity value after log (0) + else: + chunked_CMs = DKM.select_layer_chunked_data(adata, layer, chunk_size=chunk_size) if CM is None else CM + + cell_total = np.zeros(adata.n_obs, dtype=initial_dtype) + + for CM_data in chunked_CMs: + CM = CM_data[0] + + if CM is None: + return None, None + + if round_exprs: + main_debug("rounding expression data of layer: %s during size factor calculation" % (layer)) + if issparse(CM): + CM.data = np.round(CM.data, 0) + else: + CM = CM.round().astype("int") + + chunk_cell_total = CM.sum(axis=1).A1 if issparse(CM) else CM.sum(axis=1) + chunk_cell_total += chunk_cell_total == 0 # avoid infinity value after log (0) - cell_total[CM_data[1]:CM_data[2]] = chunk_cell_total + cell_total[CM_data[1]:CM_data[2]] = chunk_cell_total - cell_total = cell_total.astype(int) if np.all(cell_total % 1 == 0) else cell_total + cell_total = 
cell_total.astype(int) if np.all(cell_total % 1 == 0) else cell_total if method in ["mean-geometric-mean-total", "geometric"]: sfs = cell_total / (np.exp(locfunc(np.log(cell_total))) if scale_to is None else scale_to) From bf9f0aca0cc175826d6ff59bc30ad04721829416 Mon Sep 17 00:00:00 2001 From: sichao Date: Mon, 8 Apr 2024 14:36:40 -0400 Subject: [PATCH 2/3] print gradop test to debug --- tests/test_tools.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_tools.py b/tests/test_tools.py index 814639166..a427b8d3e 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -53,6 +53,7 @@ def test_gradop(): expected_data = np.array([-1, 1, 1, -1, -1, 1, 1, -1]) expected_indices = np.array([0, 1, 0, 1, 1, 2, 1, 2]) expected_indptr = np.array([0, 2, 4, 6, 8]) + print(grad.A) assert np.all(grad.data == expected_data) assert np.all(grad.indices == expected_indices) assert np.all(grad.indptr == expected_indptr) From 29c208880e0624cfc26b068d602af96161bb13b8 Mon Sep 17 00:00:00 2001 From: sichao Date: Mon, 8 Apr 2024 14:59:24 -0400 Subject: [PATCH 3/3] debug test_gradop --- tests/test_tools.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/test_tools.py b/tests/test_tools.py index a427b8d3e..e8ba00e48 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -50,13 +50,11 @@ def test_gradop(): # Check that the matrix has the expected shape values assert np.all(grad.A == grad_dense.A) assert grad.shape == (4, 3) - expected_data = np.array([-1, 1, 1, -1, -1, 1, 1, -1]) - expected_indices = np.array([0, 1, 0, 1, 1, 2, 1, 2]) - expected_indptr = np.array([0, 2, 4, 6, 8]) - print(grad.A) - assert np.all(grad.data == expected_data) - assert np.all(grad.indices == expected_indices) - assert np.all(grad.indptr == expected_indptr) + print(grad.A, grad.indices, grad.indptr) + assert np.all(grad.A == np.array([[-1, 1, 0], + [1, -1, 0], + [0, -1, 1], + [0, 1, -1]])) def test_norm_loglikelihood():