-
Notifications
You must be signed in to change notification settings - Fork 18
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[r] add tf-idf and log normalization functions #168
base: main
Are you sure you want to change the base?
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -923,3 +923,57 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { | |
vars_to_regress = vars_to_regress | ||
) | ||
} | ||
|
||
################# | ||
# Normalizations | ||
################# | ||
|
||
#' Log-normalize a matrix
#'
#' Multiplies `mat` by `scale_factor` and then takes a logarithm:
#' `log1p(mat * scale_factor)` when `add_one` is `TRUE`, otherwise
#' `log1p(mat * scale_factor - 1)`, which equals `log(mat * scale_factor)`.
#' @param mat (IterableMatrix) Matrix to normalize
#' @param scale_factor (numeric) Positive factor the matrix is multiplied by before taking the log
#' @param add_one (logical) If `TRUE`, one is added before taking the log (`log1p`);
#'   if `FALSE`, the plain log of the scaled matrix is taken
#' @returns log normalized matrix.
#' @export
normalize_log <- function(mat, scale_factor = 1e4, add_one = TRUE) {
  assert_is(mat, "IterableMatrix")
  assert_is_numeric(scale_factor)
  assert_true(is.logical(add_one))
  assert_greater_than_zero(scale_factor)
  scaled <- mat * scale_factor
  if (add_one) {
    log1p(scaled)
  } else {
    # log1p(x - 1) == log(x): cancels the implicit "+ 1" of log1p
    log1p(scaled - 1)
  }
}
|
||
|
||
#' Normalize a `(features x cells)` matrix using term frequency-inverse document frequency
#'
#' Every entry is divided by the read depth of its cell (the column mean times
#' `nrow(mat)`, i.e. the column sum) and by the mean of its feature (row):
#' `tfidf[i, j] = mat[i, j] / (read_depth[j] * feature_means[i])`.
#' @param mat (IterableMatrix) to normalize
#' @param feature_means (numeric) Means of the features to normalize by. If `NULL`, the row means
#'   of `mat` are computed and used. If both `feature_means` and `mat` carry (row) names, each row is
#'   matched to its mean by name; extra names in `feature_means` are ignored, but every row name of
#'   `mat` must be present. Otherwise `feature_means` must have length `nrow(mat)` and is matched to
#'   the rows by position.
#' @param threads (integer) Number of threads passed on to `matrix_stats()`.
#' @returns tf-idf normalized matrix.
#' @export
normalize_tfidf <- function(mat, feature_means = NULL, threads = 1L) {
  assert_is(mat, "IterableMatrix")
  assert_is_wholenumber(threads)
  if (is.null(feature_means)) {
    # No means supplied: get row and column means from one matrix_stats() call.
    # `threads` is forwarded here as well (it was previously dropped on this path).
    mat_stats <- matrix_stats(
      mat,
      row_stats = c("mean"),
      col_stats = c("mean"),
      threads = threads
    )
    feature_means <- mat_stats$row_stats["mean", ]
    read_depth <- mat_stats$col_stats["mean", ] * nrow(mat)
  } else {
    # Means supplied: only the per-cell read depth still has to be computed
    assert_is_numeric(feature_means)
    if (!is.null(names(feature_means)) && !is.null(rownames(mat))) {
      # Every row name of mat must exist in feature_means; a length mismatch is
      # tolerated as long as all row names are covered, since we subset by name.
      assert_true(all(rownames(mat) %in% names(feature_means)))
      feature_means <- feature_means[rownames(mat)]
    } else {
      assert_len(feature_means, nrow(mat))
    }
    read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat)
  }
  # Term frequency: scale each column (cell) by 1 / read depth
  tf <- mat %>% multiply_cols(1 / read_depth)
  # Inverse document frequency: scale each row (feature) by 1 / feature mean
  idf <- 1 / feature_means
  return(tf %>% multiply_rows(idf))
}
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -346,3 +346,49 @@ test_that("linear regression works", { | |
expect_equal(as(m1, "matrix"), ans) | ||
expect_equal(as(m1t, "matrix"), ans) | ||
}) | ||
|
||
test_that("tf-idf normalization works", {
  m <- generate_sparse_matrix(5, 5)
  rownames(m) <- paste0("row", seq_len(nrow(m)))
  # Reference result: tf-idf computed directly on the dgCMatrix
  res_dgc <- diag(1 / rowMeans(m)) %*% (m %*% diag(1 / colSums(m))) %>% as("dgCMatrix")
  rownames(res_dgc) <- rownames(m)

  m2 <- as(m, "IterableMatrix")
  # Check that we can pass in row means as a (named) vector
  row_means <- matrix_stats(m2, row_stats = c("mean"))$row_stats["mean", ]
  # Test that row means ordering does not matter as long as names exist
  row_means_shuffled <- row_means[sample(seq_along(row_means))]
  # Test that row means can have an extra element as long as all rownames are in the vector
  row_means_plus_one <- c(row_means, row6 = 1)

  res <- normalize_tfidf(m2)
  expect_equal(res %>% as("dgCMatrix"), res_dgc)
  res_with_row_means <- normalize_tfidf(m2, feature_means = row_means)
  expect_identical(res, res_with_row_means)

  res_with_shuffled_row_means <- normalize_tfidf(m2, feature_means = row_means_shuffled)
  # expect_identical() compares exactly two objects; a third positional argument
  # would be taken as `info`, not compared. `res` is already checked against
  # `res_with_row_means` above.
  expect_identical(res_with_row_means, res_with_shuffled_row_means)

  res_with_row_means_with_extra_element <- normalize_tfidf(m2, feature_means = row_means_plus_one)
  expect_identical(res, res_with_row_means_with_extra_element)
})
|
||
# Checks normalize_log() against log1p()/log() applied to the scaled dgCMatrix.
test_that("normalize_log works", { | ||
m <- generate_sparse_matrix(5, 5) | ||
m2 <- as(m, "IterableMatrix") | ||
# Test that default params yield the same as log1p on dgCMatrix | ||
res_1 <- as(normalize_log(m2), "dgCMatrix") | ||
expect_equal(res_1, log1p(m*1e4), tolerance = 1e-6) | ||
|
||
# Test that changing scale factor works | ||
res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix") | ||
expect_equal(res_2, log1p(m*1e5), tolerance = 1e-6) | ||
# Test that removing the add_one works | ||
# log(0) is -Inf in the R reference below. Per the original note, the C side does not
# produce -Inf and returns very large negative numbers instead, so values below -60
# are mapped to -Inf before comparing.
res_3 <- as(normalize_log(m2, add_one = FALSE), "dgCMatrix") | ||
res_3@x[res_3@x < -60] <- -Inf | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Any better way of doing this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As suggested above, I think we just get rid of the |
||
expect_equal(as(res_3, "dgeMatrix"), log(m*1e4), tolerance = 1e-6) | ||
}) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`add_one` parameter and always just do `log1p`. Every time I've seen this normalization it's done with a `log1p`, as otherwise the zero values would become -Inf (dgCMatrix actually messes this up).
`matrix_stats()` so we can do multi-threading
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agree on removing the add_one, and providing formulas in docs