
Commit

[r] add feature selection methods
immanuelazn committed Dec 15, 2024
1 parent 6381f74 commit d8d9ed2
Showing 8 changed files with 256 additions and 1 deletion.
2 changes: 2 additions & 0 deletions r/NAMESPACE
@@ -110,6 +110,8 @@ export(rowVars.default)
export(sctransform_pearson)
export(select_cells)
export(select_chromosomes)
export(select_features_by_mean)
export(select_features_by_variance)
export(select_regions)
export(set_trackplot_height)
export(set_trackplot_label)
1 change: 1 addition & 0 deletions r/NEWS.md
@@ -23,6 +23,7 @@ Contributions welcome :)
If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. (pull request #128)
- Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128)
- Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p (pull request #168)
- Add feature selection functions `select_features_by_{variance,dispersion,mean}()`, with parameters to control normalization and the number of variable features (pull request #169)

## Improvements
- `trackplot_loop()` now accepts discrete color scales
113 changes: 113 additions & 0 deletions r/R/singlecell_utils.R
@@ -6,6 +6,118 @@
# option. This file may not be copied, modified, or distributed
# except according to those terms.


#################
# Feature selection
#################

#' Get the most variable features within a matrix.
#' @param mat (IterableMatrix) dimensions features x cells
#' @param num_feats (integer) Number of features to mark as highly variable. If this is larger than the number of
#' features in the matrix, all features are marked as highly variable.
#' @param normalize (function) Normalize matrix using a given function. If `NULL`, no normalization is performed.
#' @param threads (integer) Number of threads to use.
#' @returns
#' Return a dataframe with the following columns, sorted descending by variance:
#' - `names`: Feature name.
#' - `score`: Variance of the feature.
#' - `highly_variable`: Logical vector of whether the feature is highly variable.
#' @details
#' Calculated using the following process:
#' 1. Optionally apply a term frequency + log normalization to each feature.
#' 2. Mark the `num_feats` features with the highest variance across cells as highly variable.
#' @export
select_features_by_variance <- function(
mat, num_feats = 25000,
normalize = normalize_log,
threads = 1L
) {
assert_is(mat, "IterableMatrix")
assert_greater_than_zero(num_feats)
assert_is_wholenumber(num_feats)
assert_len(num_feats, 1)
assert_is(num_feats, "numeric")
num_feats <- min(max(num_feats, 0), nrow(mat))

if (!is.null(normalize)) mat <- normalize(mat, threads = threads)
features_df <- tibble::tibble(
names = rownames(mat),
score = matrix_stats(mat, row_stats = "variance", threads = threads)$row_stats["variance",]
) %>%
dplyr::arrange(desc(score)) %>%
dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats)
return(features_df)
}
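# Illustrative usage sketch (the object name `mat` and the cutoff are assumptions
# for illustration, not part of the package): select the top 2,000 most variable
# features from a features x cells IterableMatrix, then subset the matrix to them.
#
#   var_stats <- select_features_by_variance(mat, num_feats = 2000)
#   top_feats <- var_stats$names[var_stats$highly_variable]
#   mat_variable <- mat[top_feats, ]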


#' Get the features with the highest dispersion within a matrix.
#' @returns
#' Return a dataframe with the following columns, sorted descending by dispersion:
#' - `names`: Feature name.
#' - `score`: Dispersion (variance/mean) of the feature.
#' - `highly_variable`: Logical vector of whether the feature is highly variable.
#' @inheritParams select_features_by_variance
#' @details
#' Calculated using the following process:
#' 1. Optionally apply a term frequency + log normalization to each feature.
#' 2. Compute the dispersion (variance/mean) of each feature.
#' 3. Mark the `num_feats` features with the highest dispersion as highly variable.
select_features_by_dispersion <- function(
mat, num_feats = 25000,
normalize = normalize_log,
threads = 1L
) {
assert_is(mat, "IterableMatrix")
assert_greater_than_zero(num_feats)
assert_is_wholenumber(num_feats)
assert_len(num_feats, 1)
assert_is(num_feats, "numeric")
num_feats <- min(max(num_feats, 0), nrow(mat))

if (!is.null(normalize)) mat <- normalize(mat, threads = threads)
mat_stats <- matrix_stats(mat, row_stats = "variance", threads = threads)
features_df <- tibble::tibble(
names = rownames(mat),
score = mat_stats$row_stats["variance", ] / mat_stats$row_stats["mean", ]
) %>%
dplyr::arrange(desc(score)) %>%
dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats)
return(features_df)
}
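# Illustrative usage sketch (the object name `mat` and the cutoff are assumptions
# for illustration): pass `normalize = NULL` to rank features by dispersion of the
# raw counts rather than the default log-normalized values.
#
#   disp_stats <- select_features_by_dispersion(mat, num_feats = 5000, normalize = NULL)
#   head(disp_stats)  # columns: names, score (variance/mean), highly_variable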


#' Get the top features from a matrix, based on the mean accessibility of each feature.
#' @param num_feats Number of features to mark as highly accessible. If this is larger than the number of
#' features in the matrix, all features are marked as highly accessible.
#' @inheritParams select_features_by_variance
#' @returns
#' Return a dataframe with the following columns, sorted descending by mean accessibility:
#' - `names`: Feature name.
#' - `score`: Binarized sum of each feature (number of cells with a nonzero value).
#' - `highly_variable`: Logical vector of whether the feature is among the `num_feats` most accessible features.
#' @details
#' Calculated using the following process:
#' 1. Compute the binarized sum (nonzero count) of each feature.
#' 2. Mark the `num_feats` features with the highest accessibility as highly accessible.
#' @export
select_features_by_mean <- function(mat, num_feats = 25000, threads = 1L) {
assert_is(mat, "IterableMatrix")
assert_is_wholenumber(num_feats)
assert_greater_than_zero(num_feats)
assert_is(num_feats, "numeric")
num_feats <- min(max(num_feats, 0), nrow(mat))
# Score each feature by its binarized sum (number of cells with nonzero counts),
# then mark the top `num_feats` features as highly accessible
features_df <- tibble::tibble(
names = rownames(mat),
score = matrix_stats(mat, row_stats = "nonzero", threads = threads)$row_stats["nonzero", ]
) %>%
dplyr::arrange(desc(score)) %>%
dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats)
return(features_df)
}
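# Illustrative usage sketch (the object name `peak_mat` and the cutoff are
# placeholders): for a peaks x cells IterableMatrix, keep the peaks that are
# open in the largest number of cells.
#
#   acc_stats <- select_features_by_mean(peak_mat, num_feats = 50000, threads = 2)
#   top_peaks <- acc_stats$names[acc_stats$highly_variable]
#   peak_mat_top <- peak_mat[top_peaks, ]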


#' Test for marker features
#'
#' Given a features x cells matrix, perform one-vs-all differential
@@ -70,6 +182,7 @@ marker_features <- function(mat, groups, method="wilcoxon") {
)
}


#' Aggregate counts matrices by cell group or feature.
#'
#' Given a `(features x cells)` matrix, group cells by `cell_groups` and aggregate counts by `method` for each
42 changes: 42 additions & 0 deletions r/man/select_features_by_dispersion.Rd


34 changes: 34 additions & 0 deletions r/man/select_features_by_mean.Rd


41 changes: 41 additions & 0 deletions r/man/select_features_by_variance.Rd


3 changes: 3 additions & 0 deletions r/pkgdown/_pkgdown.yml
@@ -133,6 +133,9 @@ reference:
- regress_out
- normalize_log
- normalize_tfidf
- select_features_by_variance
- select_features_by_dispersion
- select_features_by_mean
- IterableMatrix-methods
- pseudobulk_matrix

21 changes: 20 additions & 1 deletion r/tests/testthat/test-singlecell_utils.R
@@ -13,6 +13,24 @@ generate_sparse_matrix <- function(nrow, ncol, fraction_nonzero = 0.5, max_val =
as(m, "dgCMatrix")
}

test_that("select_features works general case", {
m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix")
for (fn in c("select_features_by_variance", "select_features_by_dispersion", "select_features_by_mean")) {
res <- do.call(fn, list(m1, num_feats = 10))
expect_equal(nrow(res), nrow(m1)) # All features should be present in the result
expect_equal(sum(res$highly_variable), 10) # Only 10 features marked as highly variable
expect_setequal(res$names, rownames(m1))
res_more_feats_than_rows <- do.call(fn, list(m1, num_feats = 10000)) # more features than rows
res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100))
expect_identical(res_more_feats_than_rows, res_feats_equal_rows)
if (fn != "select_features_by_mean") {
# Check that normalization actually does something
res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL))
expect_true(!all((res %>% dplyr::arrange(names))$score == (res_no_norm %>% dplyr::arrange(names))$score))
}
}
})

test_that("Wilcoxon rank sum works (small)", {
x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46)
y <- c(1.15, 0.88, 0.90, 0.74, 1.21)
@@ -160,4 +178,5 @@ test_that("Pseudobulk aggregation works with multiple return types", {
}
}
}
})
})
