Skip to content

Commit

Permalink
Merge pull request #115 from rformassspectrometry/jomain
Browse files Browse the repository at this point in the history
refactor: enable chunk wise import/processing of LipidBlast json
  • Loading branch information
jorainer authored Jul 18, 2024
2 parents 9fc9385 + 4f14ab3 commit 4dadffa
Show file tree
Hide file tree
Showing 6 changed files with 144 additions and 45 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: CompoundDb
Type: Package
Title: Creating and Using (Chemical) Compound Annotation Databases
Version: 1.9.1
Version: 1.9.2
Authors@R: c(person(given = "Jan", family = "Stanstrup",
email = "[email protected]",
role = c("aut"),
Expand Down Expand Up @@ -63,7 +63,7 @@ BugReports: https://github.com/RforMassSpectrometry/CompoundDb/issues
biocViews: MassSpectrometry, Metabolomics, Annotation
VignetteBuilder: knitr
License: Artistic-2.0
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
Roxygen: list(markdown=TRUE)
Collate:
'AllGenerics.R'
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ importFrom(Spectra,intensity)
importFrom(dbplyr,src_dbi)
importFrom(dplyr,bind_cols)
importFrom(dplyr,bind_rows)
importFrom(jsonlite,fromJSON)
importFrom(jsonlite,read_json)
importFrom(methods,"slot<-")
importFrom(methods,.hasSlot)
Expand Down
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# CompoundDb version 1.9

## Changes in version 1.9.2

- `compound_tbl_lipidblast`: add parameter `n` to support reading and
processing MoNA json files in sets (chunks) of lines at a time and hence
reduce memory demand for very large files.

## Changes in version 1.9.1

- Allow `CompDb` to store the database name as an alternative to an active
Expand Down
125 changes: 87 additions & 38 deletions R/createCompDbPackage.R
Original file line number Diff line number Diff line change
Expand Up @@ -111,22 +111,35 @@ compound_tbl_sdf <- function(file, collapse, onlyValid = TRUE,
#'
#' @description
#'
#' `compound_tbl_lipidblast()` extracts basic comopund annotations from a
#' `compound_tbl_lipidblast()` extracts basic compound annotations from a
#' LipidBlast file in (json format) downloaded from
#' http://mona.fiehnlab.ucdavis.edu/downloads
#' http://mona.fiehnlab.ucdavis.edu/downloads . Note that no mass spectra data
#' is extracted from the json file.
#'
#' @param file `character(1)` with the name of the file.
#'
#' @param collapse optional `character(1)` to be used to collapse multiple
#' values in the columns `"synonyms"`. See examples for details.
#'
#' @param n `integer(1)` defining the number of rows from the json file that
#' should be read and processed at a time. By default (`n = -1L`) the
#' complete file is imported and processed. For large json files it is
#' suggested to set e.g. `n = 100000` to enable chunk-wise processing and
#' hence reduce the memory demand.
#'
#' @param verbose `logical(1)` whether some progress information should be
#' provided. Defaults to `verbose = FALSE`, but for parsing very large
#' files (specifically with chunk-wise processing enabled with `n` > 0)
#' it might be helpful to set to `verbose = TRUE`.
#'
#' @return A [tibble::tibble] with general compound information (one row per
#' compound):
#'
#' - `compound_id`: the ID of the compound.
#' - `name`: the compound's name.
#' - `inchi`: the InChI of the compound.
#' - `inchikey`: the InChI key.
#' - `inchikey`: the InChI key. `NA` for all compounds as it is not provided
#' in MoNA json files.
#' - `formula`: the chemical formula.
#' - `exactmass`: the compound's mass.
#' - `synonyms`: the compound's synonyms (aliases). The type of this column is
Expand All @@ -146,12 +159,13 @@ compound_tbl_sdf <- function(file, collapse, onlyValid = TRUE,
#' fl <- system.file("json/MoNa-LipidBlast_sub.json", package = "CompoundDb")
#' cmps <- compound_tbl_lipidblast(fl)
#' cmps
compound_tbl_lipidblast <- function(file, collapse) {
compound_tbl_lipidblast <- function(file, collapse = character(), n = -1L,
verbose = FALSE) {
.check_parameter_file(file)
res <- .import_lipidblast(file)
if (!missing(collapse)) {
res <- .import_lipidblast(file, n = n, verbose = verbose)
if (length(collapse)) {
## collapse elements from lists.
res$synonyms <- vapply(res$synonyms, paste0, collapse = collapse,
res$synonyms <- vapply(res$synonyms, paste0, collapse = collapse[1L],
FUN.VALUE = "character")
}
res
Expand Down Expand Up @@ -331,44 +345,79 @@ compound_tbl_lipidblast <- function(file, collapse) {
#' @author Jan Stanstrup and Johannes Rainer
#'
#' @importFrom jsonlite read_json
#'
#' @importFrom dplyr bind_rows
#'
#' @md
#'
#' @noRd
.import_lipidblast <- function(file) {
lipidb <- read_json(file)
## Import compound annotations from a LipidBlast (MoNA) json file.
##
## `file`: name of the json file to import.
## `n`: number of lines to read and process at a time. Any value below 1
##     (including the default `-1L`) imports and processes the complete file
##     in one go.
## `verbose`: whether progress messages should be shown.
##
## Returns the result of `bind_rows()` on the per-element lists, i.e. one
## table combining all parsed elements.
.import_lipidblast <- function(file, n = -1L, verbose = FALSE) {
    if (!is.numeric(n) || length(n) != 1L || is.na(n))
        stop("'n' has to be a single, non-missing number", call. = FALSE)
    ## `n < 1` (rather than `n < 0`): `readLines(n = 0)` reads no lines at
    ## all, so passing 0 on to the chunk-wise reader would silently return
    ## an empty result.
    if (n < 1) {
        lipidb <- read_json(file)
        if (verbose)
            message("Processing ", length(lipidb), " elements ...",
                    appendLF = FALSE)
        res <- lapply(lipidb, .parse_lipidblast_json_element)
        if (verbose)
            message(" done.")
    } else {
        res <- .import_lipidblast_json_chunk(file, n = n, verbose = verbose)
    }
    bind_rows(res)
}

## Extract the basic compound annotations from one parsed LipidBlast json
## element.
##
## `x`: `list` representing a single json record; `x$id` and the first entry
##     of `x$compound` (fields `names`, `metaData` and `inchi`) are used.
##
## Returns a `list` with elements `compound_id`, `name` (first name),
## `inchi`, `inchikey` (always `NA`), `formula`, `exactmass` and `synonyms`
## (all names except the first).
parse_element <- function(x) {
    cmp <- x$compound[[1]]
    ## get the name(s) -> name + aliases
    nms <- vapply(cmp$names, `[[`, "name", FUN.VALUE = "character")
    mass <- unlist(lapply(cmp$metaData, function(z) {
        if (z$name == "total exact mass")
            z$value
    }))
    if (is.null(mass))
        mass <- NA_character_
    frml <- unlist(lapply(cmp$metaData, function(z) {
        if (z$name == "molecular formula")
            z$value
    }))
    ## A missing formula has to set `frml`; assigning to `mass` here would
    ## discard a valid mass and leave `formula` as `NULL`.
    if (is.null(frml))
        frml <- NA_character_
    list(
        compound_id = x$id,
        name = nms[1],
        inchi = cmp$inchi,
        inchikey = NA_character_,
        formula = frml,
        exactmass = mass,
        synonyms = nms[-1]
    )
}
#' Extract compound annotations from one parsed LipidBlast json element.
#'
#' @param x `list` representing a single json record. The first element of
#'     `x$id` and the first entry of `x$compound` (its `names`, `metaData`
#'     and `inchi` fields) are used.
#'
#' @return `list` with elements `compound_id`, `name` (the first name),
#'     `inchi`, `inchikey` (always `NA`), `formula`, `exactmass` and
#'     `synonyms` (a `list` of length 1 with all names except the first, or
#'     `NA` if there is only one name).
#'
#' @noRd
.parse_lipidblast_json_element <- function(x) {
    id <- x$id[[1L]]
    cmp <- x$compound[[1L]]
    ## Unique value(s) of all metaData entries named `what`, `NA` if there
    ## is none. `isTRUE()` guards against entries without a `name` field,
    ## for which a plain `==` would yield a zero-length `if` condition.
    meta_value <- function(what) {
        vals <- unlist(lapply(cmp$metaData, function(z) {
            if (isTRUE(z$name == what))
                z$value
        }))
        if (is.null(vals))
            return(NA_character_)
        unique(vals)
    }
    ## get the name(s) -> name + aliases
    nms <- vapply(cmp$names, `[[`, "name", FUN.VALUE = "character")
    snms <- NA_character_
    if (length(nms) > 1L)
        snms <- nms[-1L]
    ## NOTE(review): a missing mass is reported as `NA_character_`; if the
    ## json stores masses as numbers this might clash with the type of the
    ## other elements when the results are combined - confirm.
    list(
        compound_id = id,
        name = nms[1L],
        inchi = cmp$inchi,
        inchikey = NA_character_,
        formula = meta_value("molecular formula"),
        exactmass = meta_value("total exact mass"),
        ## wrapped in a list to keep all synonyms in a single element
        synonyms = list(snms)
    )
}

res <- lapply(lipidb, parse_element)
bind_rows(res)

#' Read and parse a LipidBlast json file in chunks of lines.
#'
#' The file is read `n` lines at a time. Each line that remains after
#' dropping the enclosing `[` / `]` lines and a trailing `,` is parsed as one
#' json element, i.e. the file is assumed to contain one element per line.
#'
#' @param x `character(1)` with the name of the json file.
#'
#' @param n number of lines to read per chunk (passed to `readLines()`).
#'
#' @param verbose `logical(1)` whether a message should be shown after each
#'     processed chunk.
#'
#' @return `list` with one element (the result from
#'     `.parse_lipidblast_json_element()`) per parsed line.
#'
#' @importFrom jsonlite fromJSON
#'
#' @importFrom dplyr bind_rows
#'
#' @noRd
.import_lipidblast_json_chunk <- function(x, n = 10000, verbose = FALSE) {
    con <- file(x, open = "r")
    on.exit(close(con), add = TRUE)
    res <- list()
    while (length(ls <- readLines(con, n = n, warn = FALSE))) {
        ## Drop surrounding whitespace and blank lines first: they would
        ## otherwise hide the closing bracket/trailing comma from the
        ## checks below or be passed as empty strings to `fromJSON()`.
        ls <- trimws(ls)
        ls <- ls[nzchar(ls)]
        if (length(ls) && length(grep("^\\[", ls[1L])))
            ls <- ls[-1L]
        if (length(ls) && length(grep("^\\]", ls[length(ls)])))
            ls <- ls[-length(ls)]
        ## elements of the json array are separated by a trailing ","
        ls <- sub(",$", "", ls)
        if (length(ls)) {
            res <- c(res, lapply(ls, function(z) {
                .parse_lipidblast_json_element(
                    fromJSON(z, simplifyVector = FALSE))
            }))
        }
        if (verbose)
            message("Processed ", length(ls), " elements")
    }
    res
}

#' @title Create a CompDb database
Expand Down
21 changes: 17 additions & 4 deletions man/compound_tbl_lipidblast.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

32 changes: 31 additions & 1 deletion tests/testthat/test_createCompDbPackage.R
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ test_that("compound_tbl_lipidblast works", {
"inchikey", "formula", "exactmass",
"synonyms"))
expect_true(nrow(cmps) == 8)
expect_true(is(cmps$synonyms, "character"))
expect_true(is.list(cmps$synonyms))
cmps <- compound_tbl_lipidblast(lb, collapse = ";")
expect_true(is.character(cmps$synonyms))
})
Expand Down Expand Up @@ -447,3 +447,33 @@ test_that("emptyCompDb works", {

expect_error(emptyCompDb(fl), "exist")
})

test_that(".parse_lipidblast_json_element works", {
    ## Namespace-qualified call instead of `library(jsonlite)`: attaching a
    ## package inside a test changes the search path for all later tests.
    f <- system.file("json", "MoNa-LipidBlast_sub.json", package = "CompoundDb")
    js <- jsonlite::read_json(f)
    res <- .parse_lipidblast_json_element(js[[1L]])
    expect_true(is.list(res))
    expect_equal(names(res), c("compound_id", "name", "inchi", "inchikey",
                               "formula", "exactmass", "synonyms"))
    expect_equal(res$name, "CerP 24:0")
    ## inchikey is never extracted and synonyms are wrapped into a list of
    ## length 1.
    expect_true(is.na(res$inchikey))
    expect_true(is.list(res$synonyms))
    expect_length(res$synonyms, 1L)
})

test_that(".import_lipidblast_json_chunk works", {
    json_file <- system.file("json", "MoNa-LipidBlast_sub.json",
                             package = "CompoundDb")
    ## Chunk-wise import returns one list element per json record.
    chunked <- .import_lipidblast_json_chunk(json_file, n = 3)
    expect_true(is.list(chunked))
    expect_true(length(chunked) == 8L)

    ## The combined chunks have to match the import of the complete file.
    full <- .import_lipidblast(json_file, verbose = TRUE)
    expect_equal(nrow(full), length(chunked))
    expect_equal(full, bind_rows(chunked))
})

test_that("compound_tbl_lipidblast works with n > 0", {
    json_file <- system.file("json", "MoNa-LipidBlast_sub.json",
                             package = "CompoundDb")
    ## Import of the complete file (reference) vs. import in chunks of 4
    ## lines: both have to yield the same table.
    expect_equal(compound_tbl_lipidblast(json_file),
                 compound_tbl_lipidblast(json_file, n = 4))
})

0 comments on commit 4dadffa

Please sign in to comment.