From 4f14ab364ecba92ea598c4374b0e42e63a025a3f Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Thu, 18 Jul 2024 06:45:52 +0200 Subject: [PATCH] refactor: enable chunk wise import/processing of LipidBlast json - Add parameter `n` to `compound_tbl_lipidblast()` to enable and support import and processing of the data in sets of lines to reduce memory demand (issue #114). --- DESCRIPTION | 4 +- NAMESPACE | 1 + NEWS.md | 6 ++ R/createCompDbPackage.R | 125 +++++++++++++++------- man/compound_tbl_lipidblast.Rd | 21 +++- tests/testthat/test_createCompDbPackage.R | 32 +++++- 6 files changed, 144 insertions(+), 45 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2d5c3b0..014d955 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: CompoundDb Type: Package Title: Creating and Using (Chemical) Compound Annotation Databases -Version: 1.9.1 +Version: 1.9.2 Authors@R: c(person(given = "Jan", family = "Stanstrup", email = "stanstrup@gmail.com", role = c("aut"), @@ -63,7 +63,7 @@ BugReports: https://github.com/RforMassSpectrometry/CompoundDb/issues biocViews: MassSpectrometry, Metabolomics, Annotation VignetteBuilder: knitr License: Artistic-2.0 -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 Roxygen: list(markdown=TRUE) Collate: 'AllGenerics.R' diff --git a/NAMESPACE b/NAMESPACE index 651b8a2..47658b1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -120,6 +120,7 @@ importFrom(Spectra,intensity) importFrom(dbplyr,src_dbi) importFrom(dplyr,bind_cols) importFrom(dplyr,bind_rows) +importFrom(jsonlite,fromJSON) importFrom(jsonlite,read_json) importFrom(methods,"slot<-") importFrom(methods,.hasSlot) diff --git a/NEWS.md b/NEWS.md index 216c790..87eacb5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,11 @@ # CompoundDb version 1.9 +## Changes in version 1.9.2 + +- `compound_tbl_lipidblast`: add parameter `n` to support reading and + processing MoNA json files in sets (chunks) of lines at a time and hence + reduce memory demand for very large files. + ## Changes in version 1.9.1 - Allow `CompDb` to store that database name as alternative to an active diff --git a/R/createCompDbPackage.R b/R/createCompDbPackage.R index 56908ac..8ad5742 100644 --- a/R/createCompDbPackage.R +++ b/R/createCompDbPackage.R @@ -111,22 +111,35 @@ compound_tbl_sdf <- function(file, collapse, onlyValid = TRUE, #' #' @description #' -#' `compound_tbl_lipidblast()` extracts basic comopund annotations from a +#' `compound_tbl_lipidblast()` extracts basic compound annotations from a #' LipidBlast file in (json format) downloaded from -#' http://mona.fiehnlab.ucdavis.edu/downloads +#' http://mona.fiehnlab.ucdavis.edu/downloads . Note that no mass spectra data +#' is extracted from the json file. #' #' @param file `character(1)` with the name of the file name. #' #' @param collapse optional `character(1)` to be used to collapse multiple #' values in the columns `"synonyms"`. See examples for details. #' +#' @param n `integer(1)` defining the number of rows from the json file that +#' should be read and processed at a time. By default (`n = -1L`) the +#' complete file is imported and processed. For large json files it is +#' suggested to set e.g. `n = 100000` to enable chunk-wise processing and +#' hence reduce the memory demand. +#' +#' @param verbose `logical(1)` whether some progress information should be +#' provided. Defaults to `verbose = FALSE`, but for parsing very large +#' files (specifically with chunk-wise processing enabled with `n` > 0) +#' it might be helpful to set to `verbose = TRUE`. +#' #' @return A [tibble::tibble] with general compound information (one row per #' compound): #' #' - `compound_id`: the ID of the compound. #' - `name`: the compound's name. #' - `inchi`: the InChI of the compound. -#' - `inchikey`: the InChI key. +#' - `inchikey`: the InChI key. `NA` for all compounds as it is ot provided +#' in MoNA json files. #' - `formula`: the chemical formula. #' - `exactmass`: the compound's mass. #' - `synonyms`: the compound's synonyms (aliases). This type of this column is @@ -146,12 +159,13 @@ compound_tbl_sdf <- function(file, collapse, onlyValid = TRUE, #' fl <- system.file("json/MoNa-LipidBlast_sub.json", package = "CompoundDb") #' cmps <- compound_tbl_lipidblast(fl) #' cmps -compound_tbl_lipidblast <- function(file, collapse) { +compound_tbl_lipidblast <- function(file, collapse = character(), n = -1L, + verbose = FALSE) { .check_parameter_file(file) - res <- .import_lipidblast(file) - if (!missing(collapse)) { + res <- .import_lipidblast(file, n = n, verbose = verbose) + if (length(collapse)) { ## collapse elements from lists. - res$synonyms <- vapply(res$synonyms, paste0, collapse = collapse, + res$synonyms <- vapply(res$synonyms, paste0, collapse = collapse[1L], FUN.VALUE = "character") } res @@ -331,44 +345,79 @@ compound_tbl_lipidblast <- function(file, collapse) { #' @author Jan Stanstrup and Johannes Rainer #' #' @importFrom jsonlite read_json +#' #' @importFrom dplyr bind_rows #' #' @md #' #' @noRd -.import_lipidblast <- function(file) { - lipidb <- read_json(file) +.import_lipidblast <- function(file, n = -1L, verbose = FALSE) { + if (n < 0) { + lipidb <- read_json(file) + if (verbose) + message("Processing ", length(lipidb), " elements ...", + appendLF = FALSE) + res <- lapply(lipidb, .parse_lipidblast_json_element) + if (verbose) message(" done.") + } else res <- .import_lipidblast_json_chunk(file, n = n, verbose = verbose) + bind_rows(res) +} - parse_element <- function(x) { - id <- x$id - cmp <- x$compound[[1]] - ## get the name(s) -> name + aliases - nms <- vapply(cmp$names, `[[`, "name", FUN.VALUE = "character") - mass <- unlist(lapply(cmp$metaData, function(z) { - if (z$name == "total exact mass") - z$value - })) - if (is.null(mass)) - mass <- NA_character_ - frml <- unlist(lapply(cmp$metaData, function(z) { - if (z$name == "molecular formula") - z$value - })) - if (is.null(frml)) - mass <- NA_character_ - list( - compound_id = x$id, - name = nms[1], - inchi = cmp$inchi, - inchikey = NA_character_, - formula = frml, - exactmass = mass, - synonyms = nms[-1] - ) - } +.parse_lipidblast_json_element <- function(x) { + id <- x$id[[1L]] + cmp <- x$compound[[1L]] + ## get the name(s) -> name + aliases + nms <- vapply(cmp$names, `[[`, "name", FUN.VALUE = "character") + mass <- unlist(lapply(cmp$metaData, function(z) { + if (z$name == "total exact mass") + z$value + })) + if (is.null(mass)) + mass <- NA_character_ + frml <- unlist(lapply(cmp$metaData, function(z) { + if (z$name == "molecular formula") + z$value + })) + if (is.null(frml)) + frml <- NA_character_ + snms <- NA_character_ + if (length(nms) > 1L) + snms <- nms[-1L] + list( + compound_id = id, + name = nms[1L], + inchi = cmp$inchi, + inchikey = NA_character_, + formula = unique(frml), + exactmass = mass, + synonyms = list(snms) + ) +} - res <- lapply(lipidb, parse_element) - bind_rows(res) + +#' @importFrom jsonlite fromJSON +#' +#' @importFrom dplyr bind_rows +.import_lipidblast_json_chunk <- function(x, n = 10000, verbose = FALSE) { + con <- file(x, open = "r") + on.exit(close(con)) + res <- list() + while (length(ls <- readLines(con, n = n, warn = FALSE))) { + if (length(grep("^\\[", ls[1L]))) + ls <- ls[-1L] + if (length(grep("^\\]", ls[length(ls)]))) + ls <- ls[-length(ls)] + ls <- sub(",$", "", ls) + if (length(ls)) { + res <- c(res, lapply(ls, function(z) { + .parse_lipidblast_json_element( + fromJSON(z, simplifyVector = FALSE)) + })) + } + if (verbose) + message("Processed ", length(ls), " elements") + } + res } #' @title Create a CompDb database diff --git a/man/compound_tbl_lipidblast.Rd b/man/compound_tbl_lipidblast.Rd index 80f67c4..16df7ef 100644 --- a/man/compound_tbl_lipidblast.Rd +++ b/man/compound_tbl_lipidblast.Rd @@ -4,13 +4,24 @@ \alias{compound_tbl_lipidblast} \title{Extract compound data from LipidBlast} \usage{ -compound_tbl_lipidblast(file, collapse) +compound_tbl_lipidblast(file, collapse = character(), n = -1L, verbose = FALSE) } \arguments{ \item{file}{\code{character(1)} with the name of the file name.} \item{collapse}{optional \code{character(1)} to be used to collapse multiple values in the columns \code{"synonyms"}. See examples for details.} + +\item{n}{\code{integer(1)} defining the number of rows from the json file that +should be read and processed at a time. By default (\code{n = -1L}) the +complete file is imported and processed. For large json files it is +suggested to set e.g. \code{n = 100000} to enable chunk-wise processing and +hence reduce the memory demand.} + +\item{verbose}{\code{logical(1)} whether some progress information should be +provided. Defaults to \code{verbose = FALSE}, but for parsing very large +files (specifically with chunk-wise processing enabled with \code{n} > 0) +it might be helpful to set to \code{verbose = TRUE}.} } \value{ A \link[tibble:tibble]{tibble::tibble} with general compound information (one row per @@ -19,7 +30,8 @@ compound): \item \code{compound_id}: the ID of the compound. \item \code{name}: the compound's name. \item \code{inchi}: the InChI of the compound. -\item \code{inchikey}: the InChI key. +\item \code{inchikey}: the InChI key. \code{NA} for all compounds as it is ot provided +in MoNA json files. \item \code{formula}: the chemical formula. \item \code{exactmass}: the compound's mass. \item \code{synonyms}: the compound's synonyms (aliases). This type of this column is @@ -29,9 +41,10 @@ into a single element separated by the value of \code{collapse}. } } \description{ -\code{compound_tbl_lipidblast()} extracts basic comopund annotations from a +\code{compound_tbl_lipidblast()} extracts basic compound annotations from a LipidBlast file in (json format) downloaded from -http://mona.fiehnlab.ucdavis.edu/downloads +http://mona.fiehnlab.ucdavis.edu/downloads . Note that no mass spectra data +is extracted from the json file. } \examples{ diff --git a/tests/testthat/test_createCompDbPackage.R b/tests/testthat/test_createCompDbPackage.R index 0bcc494..770c380 100644 --- a/tests/testthat/test_createCompDbPackage.R +++ b/tests/testthat/test_createCompDbPackage.R @@ -151,7 +151,7 @@ test_that("compound_tbl_lipidblast works", { "inchikey", "formula", "exactmass", "synonyms")) expect_true(nrow(cmps) == 8) - expect_true(is(cmps$synonyms, "character")) + expect_true(is.list(cmps$synonyms)) cmps <- compound_tbl_lipidblast(lb, collapse = ";") expect_true(is.character(cmps$synonyms)) }) @@ -447,3 +447,33 @@ test_that("emptyCompDb works", { expect_error(emptyCompDb(fl), "exist") }) + +test_that(".parse_lipidblast_json_element works", { + library(jsonlite) + f <- system.file("json", "MoNa-LipidBlast_sub.json", package = "CompoundDb") + js <- read_json(f) + res <- .parse_lipidblast_json_element(js[[1L]]) + expect_true(is.list(res)) + expect_equal(names(res), c("compound_id", "name", "inchi", "inchikey", + "formula", "exactmass", "synonyms")) + expect_equal(res$name, "CerP 24:0") +}) + +test_that(".import_lipidblast_json_chunk works", { + f <- system.file("json", "MoNa-LipidBlast_sub.json", package = "CompoundDb") + res <- .import_lipidblast_json_chunk(f, n = 3) + expect_true(is.list(res)) + expect_true(length(res) == 8L) + + ref <- .import_lipidblast(f, verbose = TRUE) + expect_equal(nrow(ref), length(res)) + res <- bind_rows(res) + expect_equal(ref, res) +}) + +test_that("compound_tbl_lipidblast works with n > 0", { + f <- system.file("json", "MoNa-LipidBlast_sub.json", package = "CompoundDb") + ref <- compound_tbl_lipidblast(f) + res <- compound_tbl_lipidblast(f, n = 4) + expect_equal(ref, res) +})