From b2415c94aa3cb5df9290df8f85cd61988cc5f79f Mon Sep 17 00:00:00 2001
From: Kai Zhang <zhangkai33@westlake.edu.cn>
Date: Mon, 29 Jan 2024 15:31:00 +0800
Subject: [PATCH] fix #221

---
 docs/changelog.md                                  |  6 ++++++
 snapatac2-python/Cargo.lock                        |  1 -
 snapatac2-python/Cargo.toml                        |  4 ++--
 snapatac2-python/snapatac2/preprocessing/_basic.py | 11 ++++++++---
 snapatac2-python/snapatac2/preprocessing/_knn.py   |  1 +
 5 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/docs/changelog.md b/docs/changelog.md
index 502ec5a0a..e4b137cae 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -1,5 +1,11 @@
 # Release Notes
 
+## Development version (unreleased)
+
+### Bugs fixed:
+
+- Fix #221: 'pp.knn' with 'method=pynndescent' produced an invalid CSR matrix.
+
 ## Release 2.5.3 (released Jan 16, 2024)
 
 ### Features:
diff --git a/snapatac2-python/Cargo.lock b/snapatac2-python/Cargo.lock
index 9161cf5f7..82ed38053 100644
--- a/snapatac2-python/Cargo.lock
+++ b/snapatac2-python/Cargo.lock
@@ -3126,7 +3126,6 @@ dependencies = [
 [[package]]
 name = "snapatac2-core"
 version = "2.0.0"
-source = "git+https://github.com/kaizhang/SnapATAC2.git?rev=ab502b5ed67492d3428fb01b3f4ab7cee6b32566#ab502b5ed67492d3428fb01b3f4ab7cee6b32566"
 dependencies = [
  "anndata",
  "anyhow",
diff --git a/snapatac2-python/Cargo.toml b/snapatac2-python/Cargo.toml
index 627a0f598..09ac0ed19 100644
--- a/snapatac2-python/Cargo.toml
+++ b/snapatac2-python/Cargo.toml
@@ -11,8 +11,8 @@ homepage = "https://github.com/"
 keywords = ["single-cell", "biology"]
 
 [dependencies]
-#snapatac2-core = { path = "../snapatac2-core" }
-snapatac2-core = { git = "https://github.com/kaizhang/SnapATAC2.git", rev = "ab502b5ed67492d3428fb01b3f4ab7cee6b32566" }
+snapatac2-core = { path = "../snapatac2-core" }
+#snapatac2-core = { git = "https://github.com/kaizhang/SnapATAC2.git", rev = "ab502b5ed67492d3428fb01b3f4ab7cee6b32566" }
 anndata = "0.2"
 anndata-hdf5 = "0.1"
 pyanndata = "0.2"
diff --git a/snapatac2-python/snapatac2/preprocessing/_basic.py b/snapatac2-python/snapatac2/preprocessing/_basic.py
index 5a7d41ce1..577861c94 100644
--- a/snapatac2-python/snapatac2/preprocessing/_basic.py
+++ b/snapatac2-python/snapatac2/preprocessing/_basic.py
@@ -46,6 +46,9 @@ def make_fragment_file(
 
     Note
     ----
+    When using `barcode_regex` or `umi_regex`, the regex must contain exactly one capturing group
+    (Parentheses group the regex between them) that matches the barcodes or UMIs.
+    Writing the correct regex is tricky. You can test your regex online at https://regex101.com/.
     BAM files produced by the 10X Genomics Cell Ranger pipeline are not supported,
     as they contain invalid BAM headers. Specifically, Cell Ranger ATAC <= 2.0 produces BAM
     files with no @VN tag in the header, and Cell Ranger ATAC >= 2.1 produces BAM files
@@ -66,7 +69,7 @@ def make_fragment_file(
         Extract barcodes from read names of BAM records using regular expressions.
         Reguler expressions should contain exactly one capturing group 
         (Parentheses group the regex between them) that matches
-        the barcodes. For example, `barcode_regex="(..:..:..:..):\w+$"`
+        the barcodes. For example, `barcode_regex="(..:..:..:..):\\w+$"`
         extracts `bd:69:Y6:10` from
         `A01535:24:HW2MMDSX2:2:1359:8513:3458:bd:69:Y6:10:TGATAGGTTG`.
     umi_tag
@@ -732,7 +735,6 @@ def _find_most_accessible_features(
     idx = idx[n_lower:n-n_upper]
     return idx[::-1][:total_features]
  
- 
 def select_features(
     adata: internal.AnnData | internal.AnnDataSet | list[internal.AnnData],
     n_features: int = 500000,
@@ -746,12 +748,15 @@ def select_features(
     verbose: bool = True,
 ) -> np.ndarray | list[np.ndarray] | None:
     """
-    Perform feature selection.
+    Perform feature selection by selecting the most accessible features across
+    all cells unless `max_iter` > 1.
 
     Note
     ----
     This function does not perform the actual subsetting. The feature mask is used by
     various functions to generate submatrices on the fly.
+    Features that are zero in all cells will always be removed regardless of the
+    filtering criteria.
     For more discussion about feature selection, see: https://github.com/kaizhang/SnapATAC2/discussions/116.
 
     Parameters
diff --git a/snapatac2-python/snapatac2/preprocessing/_knn.py b/snapatac2-python/snapatac2/preprocessing/_knn.py
index 99f21d143..c63ba579f 100644
--- a/snapatac2-python/snapatac2/preprocessing/_knn.py
+++ b/snapatac2-python/snapatac2/preprocessing/_knn.py
@@ -76,6 +76,7 @@ def knn(
         distances = np.ravel(distances[:, :n_neighbors]) 
         indptr = np.arange(0, distances.size + 1, n_neighbors)
         adj = csr_matrix((distances, indices, indptr), shape=(n, n))
+        adj.sort_indices()
     elif method == 'kdtree':
         adj = internal.nearest_neighbour_graph(data, n_neighbors)
     else: