Merge pull request #115 from rformassspectrometry/jomain

refactor: enable chunk wise import/processing of LipidBlast json
rformassspectrometry · Jul 18, 2024 · 4dadffa · 4dadffa
2 parents 9fc9385 + 4f14ab3
commit 4dadffa
Show file tree

Hide file tree

Showing 6 changed files with 144 additions and 45 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: CompoundDb
 Type: Package
 Title: Creating and Using (Chemical) Compound Annotation Databases
-Version: 1.9.1
+Version: 1.9.2
 Authors@R: c(person(given = "Jan", family = "Stanstrup",
                     email = "[email protected]",
 		    role = c("aut"),
@@ -63,7 +63,7 @@ BugReports: https://github.com/RforMassSpectrometry/CompoundDb/issues
 biocViews: MassSpectrometry, Metabolomics, Annotation
 VignetteBuilder: knitr
 License: Artistic-2.0
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 Roxygen: list(markdown=TRUE)
 Collate:
     'AllGenerics.R'

diff --git a/NAMESPACE b/NAMESPACE
@@ -120,6 +120,7 @@ importFrom(Spectra,intensity)
 importFrom(dbplyr,src_dbi)
 importFrom(dplyr,bind_cols)
 importFrom(dplyr,bind_rows)
+importFrom(jsonlite,fromJSON)
 importFrom(jsonlite,read_json)
 importFrom(methods,"slot<-")
 importFrom(methods,.hasSlot)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,11 @@
 # CompoundDb version 1.9
 
+## Changes in version 1.9.2
+
+- `compound_tbl_lipidblast`: add parameter `n` to support reading and
+  processing MoNA json files in chunks of `n` lines at a time, which
+  reduces the memory demand for very large files.
+
 ## Changes in version 1.9.1
 
 - Allow `CompDb` to store that database name as alternative to an active

diff --git a/R/createCompDbPackage.R b/R/createCompDbPackage.R
@@ -111,22 +111,35 @@ compound_tbl_sdf <- function(file, collapse, onlyValid = TRUE,
 #'
 #' @description
 #'
-#' `compound_tbl_lipidblast()` extracts basic comopund annotations from a
+#' `compound_tbl_lipidblast()` extracts basic compound annotations from a
 #' LipidBlast file in (json format) downloaded from
-#' http://mona.fiehnlab.ucdavis.edu/downloads
+#' http://mona.fiehnlab.ucdavis.edu/downloads . Note that no mass spectra data
+#' is extracted from the json file.
 #'
 #' @param file `character(1)` with the name of the file name.
 #'
 #' @param collapse optional `character(1)` to be used to collapse multiple
 #'     values in the columns `"synonyms"`. See examples for details.
 #'
+#' @param n `integer(1)` defining the number of lines from the json file that
+#'     should be read and processed at a time. By default (`n = -1L`) the
+#'     complete file is imported and processed at once. For large json files
+#'     (with one element per line) it is suggested to set e.g. `n = 100000`
+#'     to enable chunk-wise processing and hence reduce the memory demand.
+#'
+#' @param verbose `logical(1)` whether some progress information should be
+#'     provided. Defaults to `verbose = FALSE`, but for parsing very large
+#'     files (specifically with chunk-wise processing enabled with `n` > 0)
+#'     it might be helpful to set to `verbose = TRUE`.
+#'
 #' @return A [tibble::tibble] with general compound information (one row per
 #' compound):
 #'
 #' - `compound_id`: the ID of the compound.
 #' - `name`: the compound's name.
 #' - `inchi`: the InChI of the compound.
-#' - `inchikey`: the InChI key.
+#' - `inchikey`: the InChI key. `NA` for all compounds as it is not provided
+#'   in MoNA json files.
 #' - `formula`: the chemical formula.
 #' - `exactmass`: the compound's mass.
 #' - `synonyms`: the compound's synonyms (aliases). This type of this column is
@@ -146,12 +159,13 @@ compound_tbl_sdf <- function(file, collapse, onlyValid = TRUE,
 #' fl <- system.file("json/MoNa-LipidBlast_sub.json", package = "CompoundDb")
 #' cmps <- compound_tbl_lipidblast(fl)
 #' cmps
-compound_tbl_lipidblast <- function(file, collapse) {
+compound_tbl_lipidblast <- function(file, collapse = character(), n = -1L,
+                                    verbose = FALSE) {
     .check_parameter_file(file)
-    res <- .import_lipidblast(file)
-    if (!missing(collapse)) {
+    res <- .import_lipidblast(file, n = n, verbose = verbose)
+    if (length(collapse)) {
         ## collapse elements from lists.
-        res$synonyms <- vapply(res$synonyms, paste0, collapse = collapse,
+        res$synonyms <- vapply(res$synonyms, paste0, collapse = collapse[1L],
                                FUN.VALUE = "character")
     }
     res
@@ -331,44 +345,79 @@ compound_tbl_lipidblast <- function(file, collapse) {
 #' @author Jan Stanstrup and Johannes Rainer
 #'
 #' @importFrom jsonlite read_json
+#'
 #' @importFrom dplyr bind_rows
 #'
 #' @md
 #'
 #' @noRd
-.import_lipidblast <- function(file) {
-    lipidb <- read_json(file)
+.import_lipidblast <- function(file, n = -1L, verbose = FALSE) {
+    if (n < 0) {
+        lipidb <- read_json(file)
+        if (verbose)
+            message("Processing ", length(lipidb), " elements ...",
+                    appendLF = FALSE)
+        res <- lapply(lipidb, .parse_lipidblast_json_element)
+        if (verbose) message(" done.")
+    } else res <- .import_lipidblast_json_chunk(file, n = n, verbose = verbose)
+    bind_rows(res)
+}
 
-    parse_element <- function(x) {
-        id <- x$id
-        cmp <- x$compound[[1]]
-        ## get the name(s) -> name + aliases
-        nms <- vapply(cmp$names, `[[`, "name", FUN.VALUE = "character")
-        mass <- unlist(lapply(cmp$metaData, function(z) {
-            if (z$name == "total exact mass")
-                z$value
-        }))
-        if (is.null(mass))
-            mass <- NA_character_
-        frml <- unlist(lapply(cmp$metaData, function(z) {
-            if (z$name == "molecular formula")
-                z$value
-        }))
-        if (is.null(frml))
-            mass <- NA_character_
-        list(
-            compound_id = x$id,
-            name = nms[1],
-            inchi = cmp$inchi,
-            inchikey = NA_character_,
-            formula = frml,
-            exactmass = mass,
-            synonyms = nms[-1]
-        )
-    }
+.parse_lipidblast_json_element <- function(x) {
+    id <- x$id[[1L]]
+    cmp <- x$compound[[1L]]
+    ## get the name(s) -> name + aliases
+    nms <- vapply(cmp$names, `[[`, "name", FUN.VALUE = "character")
+    mass <- unlist(lapply(cmp$metaData, function(z) {
+        if (z$name == "total exact mass")
+            z$value
+    }))
+    if (is.null(mass))
+        mass <- NA_character_
+    frml <- unlist(lapply(cmp$metaData, function(z) {
+        if (z$name == "molecular formula")
+            z$value
+    }))
+    if (is.null(frml))
+        frml <- NA_character_
+    snms <- NA_character_
+    if (length(nms) > 1L)
+        snms <- nms[-1L]
+    list(
+        compound_id = id,
+        name = nms[1L],
+        inchi = cmp$inchi,
+        inchikey = NA_character_,
+        formula = unique(frml),
+        exactmass = mass,
+        synonyms = list(snms)
+    )
+}
 
-    res <- lapply(lipidb, parse_element)
-    bind_rows(res)
+
+#' @importFrom jsonlite fromJSON
+#'
+#' @importFrom dplyr bind_rows
+.import_lipidblast_json_chunk <- function(x, n = 10000, verbose = FALSE) {
+    con <- file(x, open = "r")
+    on.exit(close(con))
+    res <- list()
+    while (length(ls <- readLines(con, n = n, warn = FALSE))) {
+        if (length(grep("^\\[", ls[1L])))
+            ls <- ls[-1L]
+        if (length(grep("^\\]", ls[length(ls)])))
+            ls <- ls[-length(ls)]
+        ls <- sub(",$", "", ls)
+        if (length(ls)) {
+            res <- c(res, lapply(ls, function(z) {
+                .parse_lipidblast_json_element(
+                    fromJSON(z, simplifyVector = FALSE))
+            }))
+        }
+        if (verbose)
+            message("Processed ", length(ls), " elements")
+    }
+    res
 }
 
 #' @title Create a CompDb database

diff --git a/man/compound_tbl_lipidblast.Rd b/man/compound_tbl_lipidblast.Rd
diff --git a/tests/testthat/test_createCompDbPackage.R b/tests/testthat/test_createCompDbPackage.R
@@ -151,7 +151,7 @@ test_that("compound_tbl_lipidblast works", {
                                    "inchikey", "formula", "exactmass",
                                    "synonyms"))
     expect_true(nrow(cmps) == 8)
-    expect_true(is(cmps$synonyms, "character"))
+    expect_true(is.list(cmps$synonyms))
     cmps <- compound_tbl_lipidblast(lb, collapse = ";")
     expect_true(is.character(cmps$synonyms))
 })
@@ -447,3 +447,33 @@ test_that("emptyCompDb works", {
 
     expect_error(emptyCompDb(fl), "exist")
 })
+
+test_that(".parse_lipidblast_json_element works", {
+    library(jsonlite)
+    f <- system.file("json", "MoNa-LipidBlast_sub.json", package = "CompoundDb")
+    js <- read_json(f)
+    res <- .parse_lipidblast_json_element(js[[1L]])
+    expect_true(is.list(res))
+    expect_equal(names(res), c("compound_id", "name", "inchi", "inchikey",
+                               "formula", "exactmass", "synonyms"))
+    expect_equal(res$name, "CerP 24:0")
+})
+
+test_that(".import_lipidblast_json_chunk works", {
+    f <- system.file("json", "MoNa-LipidBlast_sub.json", package = "CompoundDb")
+    res <- .import_lipidblast_json_chunk(f, n = 3)
+    expect_true(is.list(res))
+    expect_true(length(res) == 8L)
+
+    ref <- .import_lipidblast(f, verbose = TRUE)
+    expect_equal(nrow(ref), length(res))
+    res <- bind_rows(res)
+    expect_equal(ref, res)
+})
+
+test_that("compound_tbl_lipidblast works with n > 0", {
+    f <- system.file("json", "MoNa-LipidBlast_sub.json", package = "CompoundDb")
+    ref <- compound_tbl_lipidblast(f)
+    res <- compound_tbl_lipidblast(f, n = 4)
+    expect_equal(ref, res)
+})