From 4f14ab364ecba92ea598c4374b0e42e63a025a3f Mon Sep 17 00:00:00 2001
From: Johannes Rainer <johannes.rainer@gmail.com>
Date: Thu, 18 Jul 2024 06:45:52 +0200
Subject: [PATCH] refactor: enable chunk wise import/processing of LipidBlast
 json

- Add parameter `n` to `compound_tbl_lipidblast()` to enable and support import
  and processing of the data in sets of lines to reduce memory demand (issue
  #114).
---
 DESCRIPTION                               |   4 +-
 NAMESPACE                                 |   1 +
 NEWS.md                                   |   6 ++
 R/createCompDbPackage.R                   | 125 +++++++++++++++-------
 man/compound_tbl_lipidblast.Rd            |  21 +++-
 tests/testthat/test_createCompDbPackage.R |  32 +++++-
 6 files changed, 144 insertions(+), 45 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 2d5c3b0..014d955 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: CompoundDb
 Type: Package
 Title: Creating and Using (Chemical) Compound Annotation Databases
-Version: 1.9.1
+Version: 1.9.2
 Authors@R: c(person(given = "Jan", family = "Stanstrup",
                     email = "stanstrup@gmail.com",
 		    role = c("aut"),
@@ -63,7 +63,7 @@ BugReports: https://github.com/RforMassSpectrometry/CompoundDb/issues
 biocViews: MassSpectrometry, Metabolomics, Annotation
 VignetteBuilder: knitr
 License: Artistic-2.0
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 Roxygen: list(markdown=TRUE)
 Collate:
     'AllGenerics.R'
diff --git a/NAMESPACE b/NAMESPACE
index 651b8a2..47658b1 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -120,6 +120,7 @@ importFrom(Spectra,intensity)
 importFrom(dbplyr,src_dbi)
 importFrom(dplyr,bind_cols)
 importFrom(dplyr,bind_rows)
+importFrom(jsonlite,fromJSON)
 importFrom(jsonlite,read_json)
 importFrom(methods,"slot<-")
 importFrom(methods,.hasSlot)
diff --git a/NEWS.md b/NEWS.md
index 216c790..87eacb5 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,11 @@
 # CompoundDb version 1.9
 
+## Changes in version 1.9.2
+
+- `compound_tbl_lipidblast`: add parameter `n` to support reading and
+  processing MoNA json files in sets (chunks) of lines at a time and hence
+  reduce memory demand for very large files.
+
 ## Changes in version 1.9.1
 
 - Allow `CompDb` to store that database name as alternative to an active
diff --git a/R/createCompDbPackage.R b/R/createCompDbPackage.R
index 56908ac..8ad5742 100644
--- a/R/createCompDbPackage.R
+++ b/R/createCompDbPackage.R
@@ -111,22 +111,35 @@ compound_tbl_sdf <- function(file, collapse, onlyValid = TRUE,
 #'
 #' @description
 #'
-#' `compound_tbl_lipidblast()` extracts basic comopund annotations from a
+#' `compound_tbl_lipidblast()` extracts basic compound annotations from a
 #' LipidBlast file in (json format) downloaded from
-#' http://mona.fiehnlab.ucdavis.edu/downloads
+#' http://mona.fiehnlab.ucdavis.edu/downloads . Note that no mass spectra data
+#' is extracted from the json file.
 #'
 #' @param file `character(1)` with the name of the file name.
 #'
 #' @param collapse optional `character(1)` to be used to collapse multiple
 #'     values in the columns `"synonyms"`. See examples for details.
 #'
+#' @param n `integer(1)` defining the number of rows from the json file that
+#'     should be read and processed at a time. By default (`n = -1L`) the
+#'     complete file is imported and processed. For large json files it is
+#'     suggested to set e.g. `n = 100000` to enable chunk-wise processing and
+#'     hence reduce the memory demand.
+#'
+#' @param verbose `logical(1)` whether some progress information should be
+#'     provided. Defaults to `verbose = FALSE`, but for parsing very large
+#'     files (specifically with chunk-wise processing enabled with `n` > 0)
+#'     it might be helpful to set to `verbose = TRUE`.
+#'
 #' @return A [tibble::tibble] with general compound information (one row per
 #' compound):
 #'
 #' - `compound_id`: the ID of the compound.
 #' - `name`: the compound's name.
 #' - `inchi`: the InChI of the compound.
-#' - `inchikey`: the InChI key.
+#' - `inchikey`: the InChI key. `NA` for all compounds as it is ot provided
+#'   in MoNA json files.
 #' - `formula`: the chemical formula.
 #' - `exactmass`: the compound's mass.
 #' - `synonyms`: the compound's synonyms (aliases). This type of this column is
@@ -146,12 +159,13 @@ compound_tbl_sdf <- function(file, collapse, onlyValid = TRUE,
 #' fl <- system.file("json/MoNa-LipidBlast_sub.json", package = "CompoundDb")
 #' cmps <- compound_tbl_lipidblast(fl)
 #' cmps
-compound_tbl_lipidblast <- function(file, collapse) {
+compound_tbl_lipidblast <- function(file, collapse = character(), n = -1L,
+                                    verbose = FALSE) {
     .check_parameter_file(file)
-    res <- .import_lipidblast(file)
-    if (!missing(collapse)) {
+    res <- .import_lipidblast(file, n = n, verbose = verbose)
+    if (length(collapse)) {
         ## collapse elements from lists.
-        res$synonyms <- vapply(res$synonyms, paste0, collapse = collapse,
+        res$synonyms <- vapply(res$synonyms, paste0, collapse = collapse[1L],
                                FUN.VALUE = "character")
     }
     res
@@ -331,44 +345,79 @@ compound_tbl_lipidblast <- function(file, collapse) {
 #' @author Jan Stanstrup and Johannes Rainer
 #'
 #' @importFrom jsonlite read_json
+#'
 #' @importFrom dplyr bind_rows
 #'
 #' @md
 #'
 #' @noRd
-.import_lipidblast <- function(file) {
-    lipidb <- read_json(file)
+.import_lipidblast <- function(file, n = -1L, verbose = FALSE) {
+    if (n < 0) {
+        lipidb <- read_json(file)
+        if (verbose)
+            message("Processing ", length(lipidb), " elements ...",
+                    appendLF = FALSE)
+        res <- lapply(lipidb, .parse_lipidblast_json_element)
+        if (verbose) message(" done.")
+    } else res <- .import_lipidblast_json_chunk(file, n = n, verbose = verbose)
+    bind_rows(res)
+}
 
-    parse_element <- function(x) {
-        id <- x$id
-        cmp <- x$compound[[1]]
-        ## get the name(s) -> name + aliases
-        nms <- vapply(cmp$names, `[[`, "name", FUN.VALUE = "character")
-        mass <- unlist(lapply(cmp$metaData, function(z) {
-            if (z$name == "total exact mass")
-                z$value
-        }))
-        if (is.null(mass))
-            mass <- NA_character_
-        frml <- unlist(lapply(cmp$metaData, function(z) {
-            if (z$name == "molecular formula")
-                z$value
-        }))
-        if (is.null(frml))
-            mass <- NA_character_
-        list(
-            compound_id = x$id,
-            name = nms[1],
-            inchi = cmp$inchi,
-            inchikey = NA_character_,
-            formula = frml,
-            exactmass = mass,
-            synonyms = nms[-1]
-        )
-    }
+.parse_lipidblast_json_element <- function(x) {
+    id <- x$id[[1L]]
+    cmp <- x$compound[[1L]]
+    ## get the name(s) -> name + aliases
+    nms <- vapply(cmp$names, `[[`, "name", FUN.VALUE = "character")
+    mass <- unlist(lapply(cmp$metaData, function(z) {
+        if (z$name == "total exact mass")
+            z$value
+    }))
+    if (is.null(mass))
+        mass <- NA_character_
+    frml <- unlist(lapply(cmp$metaData, function(z) {
+        if (z$name == "molecular formula")
+            z$value
+    }))
+    if (is.null(frml))
+        frml <- NA_character_
+    snms <- NA_character_
+    if (length(nms) > 1L)
+        snms <- nms[-1L]
+    list(
+        compound_id = id,
+        name = nms[1L],
+        inchi = cmp$inchi,
+        inchikey = NA_character_,
+        formula = unique(frml),
+        exactmass = mass,
+        synonyms = list(snms)
+    )
+}
 
-    res <- lapply(lipidb, parse_element)
-    bind_rows(res)
+
+#' @importFrom jsonlite fromJSON
+#'
+#' @importFrom dplyr bind_rows
+.import_lipidblast_json_chunk <- function(x, n = 10000, verbose = FALSE) {
+    con <- file(x, open = "r")
+    on.exit(close(con))
+    res <- list()
+    while (length(ls <- readLines(con, n = n, warn = FALSE))) {
+        if (length(grep("^\\[", ls[1L])))
+            ls <- ls[-1L]
+        if (length(grep("^\\]", ls[length(ls)])))
+            ls <- ls[-length(ls)]
+        ls <- sub(",$", "", ls)
+        if (length(ls)) {
+            res <- c(res, lapply(ls, function(z) {
+                .parse_lipidblast_json_element(
+                    fromJSON(z, simplifyVector = FALSE))
+            }))
+        }
+        if (verbose)
+            message("Processed ", length(ls), " elements")
+    }
+    res
 }
 
 #' @title Create a CompDb database
diff --git a/man/compound_tbl_lipidblast.Rd b/man/compound_tbl_lipidblast.Rd
index 80f67c4..16df7ef 100644
--- a/man/compound_tbl_lipidblast.Rd
+++ b/man/compound_tbl_lipidblast.Rd
@@ -4,13 +4,24 @@
 \alias{compound_tbl_lipidblast}
 \title{Extract compound data from LipidBlast}
 \usage{
-compound_tbl_lipidblast(file, collapse)
+compound_tbl_lipidblast(file, collapse = character(), n = -1L, verbose = FALSE)
 }
 \arguments{
 \item{file}{\code{character(1)} with the name of the file name.}
 
 \item{collapse}{optional \code{character(1)} to be used to collapse multiple
 values in the columns \code{"synonyms"}. See examples for details.}
+
+\item{n}{\code{integer(1)} defining the number of rows from the json file that
+should be read and processed at a time. By default (\code{n = -1L}) the
+complete file is imported and processed. For large json files it is
+suggested to set e.g. \code{n = 100000} to enable chunk-wise processing and
+hence reduce the memory demand.}
+
+\item{verbose}{\code{logical(1)} whether some progress information should be
+provided. Defaults to \code{verbose = FALSE}, but for parsing very large
+files (specifically with chunk-wise processing enabled with \code{n} > 0)
+it might be helpful to set to \code{verbose = TRUE}.}
 }
 \value{
 A \link[tibble:tibble]{tibble::tibble} with general compound information (one row per
@@ -19,7 +30,8 @@ compound):
 \item \code{compound_id}: the ID of the compound.
 \item \code{name}: the compound's name.
 \item \code{inchi}: the InChI of the compound.
-\item \code{inchikey}: the InChI key.
+\item \code{inchikey}: the InChI key. \code{NA} for all compounds as it is ot provided
+in MoNA json files.
 \item \code{formula}: the chemical formula.
 \item \code{exactmass}: the compound's mass.
 \item \code{synonyms}: the compound's synonyms (aliases). This type of this column is
@@ -29,9 +41,10 @@ into a single element separated by the value of \code{collapse}.
 }
 }
 \description{
-\code{compound_tbl_lipidblast()} extracts basic comopund annotations from a
+\code{compound_tbl_lipidblast()} extracts basic compound annotations from a
 LipidBlast file in (json format) downloaded from
-http://mona.fiehnlab.ucdavis.edu/downloads
+http://mona.fiehnlab.ucdavis.edu/downloads . Note that no mass spectra data
+is extracted from the json file.
 }
 \examples{
 
diff --git a/tests/testthat/test_createCompDbPackage.R b/tests/testthat/test_createCompDbPackage.R
index 0bcc494..770c380 100644
--- a/tests/testthat/test_createCompDbPackage.R
+++ b/tests/testthat/test_createCompDbPackage.R
@@ -151,7 +151,7 @@ test_that("compound_tbl_lipidblast works", {
                                    "inchikey", "formula", "exactmass",
                                    "synonyms"))
     expect_true(nrow(cmps) == 8)
-    expect_true(is(cmps$synonyms, "character"))
+    expect_true(is.list(cmps$synonyms))
     cmps <- compound_tbl_lipidblast(lb, collapse = ";")
     expect_true(is.character(cmps$synonyms))
 })
@@ -447,3 +447,33 @@ test_that("emptyCompDb works", {
 
     expect_error(emptyCompDb(fl), "exist")
 })
+
+test_that(".parse_lipidblast_json_element works", {
+    library(jsonlite)
+    f <- system.file("json", "MoNa-LipidBlast_sub.json", package = "CompoundDb")
+    js <- read_json(f)
+    res <- .parse_lipidblast_json_element(js[[1L]])
+    expect_true(is.list(res))
+    expect_equal(names(res), c("compound_id", "name", "inchi", "inchikey",
+                               "formula", "exactmass", "synonyms"))
+    expect_equal(res$name, "CerP 24:0")
+})
+
+test_that(".import_lipidblast_json_chunk works", {
+    f <- system.file("json", "MoNa-LipidBlast_sub.json", package = "CompoundDb")
+    res <- .import_lipidblast_json_chunk(f, n = 3)
+    expect_true(is.list(res))
+    expect_true(length(res) == 8L)
+
+    ref <- .import_lipidblast(f, verbose = TRUE)
+    expect_equal(nrow(ref), length(res))
+    res <- bind_rows(res)
+    expect_equal(ref, res)
+})
+
+test_that("compound_tbl_lipidblast works with n > 0", {
+    f <- system.file("json", "MoNa-LipidBlast_sub.json", package = "CompoundDb")
+    ref <- compound_tbl_lipidblast(f)
+    res <- compound_tbl_lipidblast(f, n = 4)
+    expect_equal(ref, res)
+})