Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: enable chunk wise import/processing of LipidBlast json #115

Merged
merged 1 commit into from
Jul 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: CompoundDb
Type: Package
Title: Creating and Using (Chemical) Compound Annotation Databases
Version: 1.9.1
Version: 1.9.2
Authors@R: c(person(given = "Jan", family = "Stanstrup",
email = "[email protected]",
role = c("aut"),
Expand Down Expand Up @@ -63,7 +63,7 @@ BugReports: https://github.com/RforMassSpectrometry/CompoundDb/issues
biocViews: MassSpectrometry, Metabolomics, Annotation
VignetteBuilder: knitr
License: Artistic-2.0
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
Roxygen: list(markdown=TRUE)
Collate:
'AllGenerics.R'
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ importFrom(Spectra,intensity)
importFrom(dbplyr,src_dbi)
importFrom(dplyr,bind_cols)
importFrom(dplyr,bind_rows)
importFrom(jsonlite,fromJSON)
importFrom(jsonlite,read_json)
importFrom(methods,"slot<-")
importFrom(methods,.hasSlot)
Expand Down
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# CompoundDb version 1.9

## Changes in version 1.9.2

- `compound_tbl_lipidblast`: add parameter `n` to support reading and
processing MoNA json files in sets (chunks) of lines at a time and hence
reduce memory demand for very large files.

## Changes in version 1.9.1

- Allow `CompDb` to store the database name as alternative to an active
Expand Down
125 changes: 87 additions & 38 deletions R/createCompDbPackage.R
Original file line number Diff line number Diff line change
Expand Up @@ -111,22 +111,35 @@ compound_tbl_sdf <- function(file, collapse, onlyValid = TRUE,
#'
#' @description
#'
#' `compound_tbl_lipidblast()` extracts basic comopund annotations from a
#' `compound_tbl_lipidblast()` extracts basic compound annotations from a
#' LipidBlast file (in json format) downloaded from
#' http://mona.fiehnlab.ucdavis.edu/downloads
#' http://mona.fiehnlab.ucdavis.edu/downloads . Note that no mass spectra data
#' is extracted from the json file.
#'
#' @param file `character(1)` with the name of the file.
#'
#' @param collapse optional `character(1)` to be used to collapse multiple
#' values in the columns `"synonyms"`. See examples for details.
#'
#' @param n `integer(1)` defining the number of rows from the json file that
#' should be read and processed at a time. By default (`n = -1L`) the
#' complete file is imported and processed. For large json files it is
#' suggested to set e.g. `n = 100000` to enable chunk-wise processing and
#' hence reduce the memory demand.
#'
#' @param verbose `logical(1)` whether some progress information should be
#' provided. Defaults to `verbose = FALSE`, but for parsing very large
#' files (specifically with chunk-wise processing enabled with `n` > 0)
#' it might be helpful to set to `verbose = TRUE`.
#'
#' @return A [tibble::tibble] with general compound information (one row per
#' compound):
#'
#' - `compound_id`: the ID of the compound.
#' - `name`: the compound's name.
#' - `inchi`: the InChI of the compound.
#' - `inchikey`: the InChI key.
#' - `inchikey`: the InChI key. `NA` for all compounds as it is not provided
#' in MoNA json files.
#' - `formula`: the chemical formula.
#' - `exactmass`: the compound's mass.
#' - `synonyms`: the compound's synonyms (aliases). The type of this column is
Expand All @@ -146,12 +159,13 @@ compound_tbl_sdf <- function(file, collapse, onlyValid = TRUE,
#' fl <- system.file("json/MoNa-LipidBlast_sub.json", package = "CompoundDb")
#' cmps <- compound_tbl_lipidblast(fl)
#' cmps
compound_tbl_lipidblast <- function(file, collapse) {
compound_tbl_lipidblast <- function(file, collapse = character(), n = -1L,
verbose = FALSE) {
.check_parameter_file(file)
res <- .import_lipidblast(file)
if (!missing(collapse)) {
res <- .import_lipidblast(file, n = n, verbose = verbose)
if (length(collapse)) {
## collapse elements from lists.
res$synonyms <- vapply(res$synonyms, paste0, collapse = collapse,
res$synonyms <- vapply(res$synonyms, paste0, collapse = collapse[1L],
FUN.VALUE = "character")
}
res
Expand Down Expand Up @@ -331,44 +345,79 @@ compound_tbl_lipidblast <- function(file, collapse) {
#' @author Jan Stanstrup and Johannes Rainer
#'
#' @importFrom jsonlite read_json
#'
#' @importFrom dplyr bind_rows
#'
#' @md
#'
#' @noRd
.import_lipidblast <- function(file) {
lipidb <- read_json(file)
.import_lipidblast <- function(file, n = -1L, verbose = FALSE) {
    ## Import a LipidBlast/MoNA json file and return the compound annotations
    ## as a single table (one row per parsed element).
    ##
    ## file: name of the json file.
    ## n: a negative value reads and parses the whole file at once; any other
    ##     value is passed on to the line-based chunk reader.
    ## verbose: whether progress messages should be shown.

    ## Non-negative `n`: delegate to chunk-wise import.
    if (!(n < 0))
        return(bind_rows(
            .import_lipidblast_json_chunk(file, n = n, verbose = verbose)))
    ## Negative `n`: read the complete json document in one go.
    entries <- read_json(file)
    if (verbose)
        message("Processing ", length(entries), " elements ...",
                appendLF = FALSE)
    parsed <- lapply(entries, .parse_lipidblast_json_element)
    if (verbose)
        message(" done.")
    bind_rows(parsed)
}

parse_element <- function(x) {
id <- x$id
cmp <- x$compound[[1]]
## get the name(s) -> name + aliases
nms <- vapply(cmp$names, `[[`, "name", FUN.VALUE = "character")
mass <- unlist(lapply(cmp$metaData, function(z) {
if (z$name == "total exact mass")
z$value
}))
if (is.null(mass))
mass <- NA_character_
frml <- unlist(lapply(cmp$metaData, function(z) {
if (z$name == "molecular formula")
z$value
}))
if (is.null(frml))
mass <- NA_character_
list(
compound_id = x$id,
name = nms[1],
inchi = cmp$inchi,
inchikey = NA_character_,
formula = frml,
exactmass = mass,
synonyms = nms[-1]
)
}
## Extract the compound annotation fields from a single element (record) of a
## parsed MoNA/LipidBlast json file.
##
## x: `list` representing one json record (e.g. one element of the result of
##     `read_json()`, or the result of `fromJSON(simplifyVector = FALSE)` on
##     one record). The fields `id` and `compound` are used; from the first
##     compound its `names`, `inchi` and `metaData` entries are evaluated.
##
## Returns a `list` with the elements `compound_id`, `name`, `inchi`,
## `inchikey` (always `NA`), `formula`, `exactmass` (numeric) and `synonyms`
## (a `list` of length 1). Every element has length 1 so that one record
## results in exactly one row when the lists are combined with `bind_rows()`.
.parse_lipidblast_json_element <- function(x) {
    ## Values of all `metaData` entries with the name `key` (`NULL` if there
    ## is none). `[[` is used for exact name matching and `identical()` keeps
    ## the lookup from failing on entries without a `name` field.
    metadata_value <- function(metadata, key) {
        unlist(lapply(metadata, function(z) {
            if (identical(z[["name"]], key))
                z[["value"]]
        }))
    }
    id <- x[["id"]][[1L]]
    cmp <- x[["compound"]][[1L]]
    ## get the name(s) -> name + aliases
    nms <- vapply(cmp[["names"]], `[[`, "name", FUN.VALUE = character(1))
    ## Only the first reported mass is used. The value is coerced to numeric
    ## and a missing mass is a numeric NA: a character NA can not be combined
    ## with the numeric masses of other records.
    mass <- metadata_value(cmp[["metaData"]], "total exact mass")
    mass <- if (length(mass)) as.numeric(mass[1L]) else NA_real_
    ## Only the first (unique) formula is used to guarantee a single value.
    frml <- unique(metadata_value(cmp[["metaData"]], "molecular formula"))
    frml <- if (length(frml)) frml[1L] else NA_character_
    snms <- NA_character_
    if (length(nms) > 1L)
        snms <- nms[-1L]
    ## Exact matching: `cmp$inchi` would partially match e.g. an `inchiKey`
    ## field if no `inchi` field is present.
    inchi <- cmp[["inchi"]]
    if (is.null(inchi))
        inchi <- NA_character_
    list(
        compound_id = id,
        name = nms[1L],
        inchi = inchi,
        inchikey = NA_character_,
        formula = frml,
        exactmass = mass,
        synonyms = list(snms)
    )
}

res <- lapply(lipidb, parse_element)
bind_rows(res)

#' @description
#'
#' Import a LipidBlast/MoNA json file in chunks of `n` lines. The file is
#' expected to contain one json record per line: each line that is kept is
#' parsed separately with `fromJSON()`. For each chunk, a first line starting
#' with `[` and a last line starting with `]` are dropped, a trailing `,` is
#' removed from each line and blank lines are skipped.
#'
#' @param x `character(1)` with the name of the json file.
#'
#' @param n number of lines to read (and process) at a time.
#'
#' @param verbose `logical(1)` whether the number of processed elements
#'     should be reported for each chunk.
#'
#' @return `list` with one element (the result of
#'     `.parse_lipidblast_json_element()`) per parsed line.
#'
#' @importFrom jsonlite fromJSON
#'
#' @importFrom dplyr bind_rows
#'
#' @noRd
.import_lipidblast_json_chunk <- function(x, n = 10000, verbose = FALSE) {
    con <- file(x, open = "r")
    on.exit(close(con), add = TRUE)
    ## Keep the parsed elements per chunk and flatten them once at the end
    ## instead of copying the full result list for every chunk.
    chunks <- list()
    while (length(ls <- readLines(con, n = n, warn = FALSE))) {
        if (length(grep("^\\[", ls[1L])))
            ls <- ls[-1L]
        if (length(grep("^\\]", ls[length(ls)])))
            ls <- ls[-length(ls)]
        ls <- sub(",$", "", ls)
        ## Blank lines are no valid json and would cause a parsing error.
        ls <- ls[nzchar(trimws(ls))]
        if (length(ls)) {
            chunks[[length(chunks) + 1L]] <- lapply(ls, function(z) {
                .parse_lipidblast_json_element(
                    fromJSON(z, simplifyVector = FALSE))
            })
        }
        if (verbose)
            message("Processed ", length(ls), " elements")
    }
    ## `unlist()` of an empty list is `NULL`; always return a `list`.
    if (length(chunks))
        unlist(chunks, recursive = FALSE)
    else list()
}

#' @title Create a CompDb database
Expand Down
21 changes: 17 additions & 4 deletions man/compound_tbl_lipidblast.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

32 changes: 31 additions & 1 deletion tests/testthat/test_createCompDbPackage.R
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ test_that("compound_tbl_lipidblast works", {
"inchikey", "formula", "exactmass",
"synonyms"))
expect_true(nrow(cmps) == 8)
expect_true(is(cmps$synonyms, "character"))
expect_true(is.list(cmps$synonyms))
cmps <- compound_tbl_lipidblast(lb, collapse = ";")
expect_true(is.character(cmps$synonyms))
})
Expand Down Expand Up @@ -447,3 +447,33 @@ test_that("emptyCompDb works", {

expect_error(emptyCompDb(fl), "exist")
})

test_that(".parse_lipidblast_json_element works", {
    ## Call read_json() with its namespace instead of attaching jsonlite with
    ## library(): attaching a package within a test changes the search path
    ## for all tests that are run afterwards.
    f <- system.file("json", "MoNa-LipidBlast_sub.json", package = "CompoundDb")
    js <- jsonlite::read_json(f)
    ## Parse only the first record of the test file.
    res <- .parse_lipidblast_json_element(js[[1L]])
    expect_true(is.list(res))
    expect_equal(names(res), c("compound_id", "name", "inchi", "inchikey",
                               "formula", "exactmass", "synonyms"))
    expect_equal(res$name, "CerP 24:0")
})

test_that(".import_lipidblast_json_chunk works", {
    f <- system.file("json", "MoNa-LipidBlast_sub.json", package = "CompoundDb")
    ## Read the file in chunks of 3 lines.
    res <- .import_lipidblast_json_chunk(f, n = 3)
    expect_true(is.list(res))
    ## expect_length() reports the actual length if the expectation fails.
    expect_length(res, 8L)

    ## Chunk-wise import has to yield the same result as the full import.
    ref <- .import_lipidblast(f, verbose = TRUE)
    expect_equal(length(res), nrow(ref))
    expect_equal(bind_rows(res), ref)
})

test_that("compound_tbl_lipidblast works with n > 0", {
    ## Importing the file in chunks of 4 lines has to give the same table as
    ## importing the full file at once (the default).
    json_file <- system.file("json", "MoNa-LipidBlast_sub.json",
                             package = "CompoundDb")
    full_import <- compound_tbl_lipidblast(json_file)
    chunked_import <- compound_tbl_lipidblast(json_file, n = 4)
    expect_equal(full_import, chunked_import)
})
Loading