From b2415c94aa3cb5df9290df8f85cd61988cc5f79f Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Mon, 29 Jan 2024 15:31:00 +0800 Subject: [PATCH] fix #221 --- docs/changelog.md | 6 ++++++ snapatac2-python/Cargo.lock | 1 - snapatac2-python/Cargo.toml | 4 ++-- snapatac2-python/snapatac2/preprocessing/_basic.py | 11 ++++++++--- snapatac2-python/snapatac2/preprocessing/_knn.py | 1 + 5 files changed, 17 insertions(+), 6 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 502ec5a0a..e4b137cae 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,11 @@ # Release Notes +## Development version (unreleased) + +### Bugs fixed: + +- Fix #221: 'pp.knn' with 'method=pynndescent' produced an invalid CSR matrix. + ## Release 2.5.3 (released Jan 16, 2024) ### Features: diff --git a/snapatac2-python/Cargo.lock b/snapatac2-python/Cargo.lock index 9161cf5f7..82ed38053 100644 --- a/snapatac2-python/Cargo.lock +++ b/snapatac2-python/Cargo.lock @@ -3126,7 +3126,6 @@ dependencies = [ [[package]] name = "snapatac2-core" version = "2.0.0" -source = "git+https://github.com/kaizhang/SnapATAC2.git?rev=ab502b5ed67492d3428fb01b3f4ab7cee6b32566#ab502b5ed67492d3428fb01b3f4ab7cee6b32566" dependencies = [ "anndata", "anyhow", diff --git a/snapatac2-python/Cargo.toml b/snapatac2-python/Cargo.toml index 627a0f598..09ac0ed19 100644 --- a/snapatac2-python/Cargo.toml +++ b/snapatac2-python/Cargo.toml @@ -11,8 +11,8 @@ homepage = "https://github.com/" keywords = ["single-cell", "biology"] [dependencies] -#snapatac2-core = { path = "../snapatac2-core" } -snapatac2-core = { git = "https://github.com/kaizhang/SnapATAC2.git", rev = "ab502b5ed67492d3428fb01b3f4ab7cee6b32566" } +snapatac2-core = { path = "../snapatac2-core" } +#snapatac2-core = { git = "https://github.com/kaizhang/SnapATAC2.git", rev = "ab502b5ed67492d3428fb01b3f4ab7cee6b32566" } anndata = "0.2" anndata-hdf5 = "0.1" pyanndata = "0.2" diff --git a/snapatac2-python/snapatac2/preprocessing/_basic.py 
b/snapatac2-python/snapatac2/preprocessing/_basic.py index 5a7d41ce1..577861c94 100644 --- a/snapatac2-python/snapatac2/preprocessing/_basic.py +++ b/snapatac2-python/snapatac2/preprocessing/_basic.py @@ -46,6 +46,9 @@ def make_fragment_file( Note ---- + When using `barcode_regex` or `umi_regex`, the regex must contain exactly one capturing group + (Parentheses group the regex between them) that matches the barcodes or UMIs. + Writing the correct regex is tricky. You can test your regex online at https://regex101.com/. BAM files produced by the 10X Genomics Cell Ranger pipeline are not supported, as they contain invalid BAM headers. Specifically, Cell Ranger ATAC <= 2.0 produces BAM files with no @VN tag in the header, and Cell Ranger ATAC >= 2.1 produces BAM files @@ -66,7 +69,7 @@ def make_fragment_file( Extract barcodes from read names of BAM records using regular expressions. Reguler expressions should contain exactly one capturing group (Parentheses group the regex between them) that matches - the barcodes. For example, `barcode_regex="(..:..:..:..):\w+$"` + the barcodes. For example, `barcode_regex="(..:..:..:..):\\w+$"` extracts `bd:69:Y6:10` from `A01535:24:HW2MMDSX2:2:1359:8513:3458:bd:69:Y6:10:TGATAGGTTG`. umi_tag @@ -732,7 +735,6 @@ def _find_most_accessible_features( idx = idx[n_lower:n-n_upper] return idx[::-1][:total_features] - def select_features( adata: internal.AnnData | internal.AnnDataSet | list[internal.AnnData], n_features: int = 500000, @@ -746,12 +748,15 @@ def select_features( verbose: bool = True, ) -> np.ndarray | list[np.ndarray] | None: """ - Perform feature selection. + Perform feature selection by selecting the most accessible features across + all cells unless `max_iter` > 1. Note ---- This function does not perform the actual subsetting. The feature mask is used by various functions to generate submatrices on the fly. + Features that are zero in all cells will always be removed regardless of the + filtering criteria. 
For more discussion about feature selection, see: https://github.com/kaizhang/SnapATAC2/discussions/116. Parameters diff --git a/snapatac2-python/snapatac2/preprocessing/_knn.py b/snapatac2-python/snapatac2/preprocessing/_knn.py index 99f21d143..c63ba579f 100644 --- a/snapatac2-python/snapatac2/preprocessing/_knn.py +++ b/snapatac2-python/snapatac2/preprocessing/_knn.py @@ -76,6 +76,7 @@ def knn( distances = np.ravel(distances[:, :n_neighbors]) indptr = np.arange(0, distances.size + 1, n_neighbors) adj = csr_matrix((distances, indices, indptr), shape=(n, n)) + adj.sort_indices() elif method == 'kdtree': adj = internal.nearest_neighbour_graph(data, n_neighbors) else: