From e58984fbb15bb5368a94e005192ffcc60a17d1bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 18 Jul 2023 10:10:31 +0200 Subject: [PATCH 1/8] lib: export pharmacogenomics data (and the related pubmed articles) from a list of genes, #TASK-4768, #TASK-4761 --- .../executors/ExportCommandExecutor.java | 118 ++++++++++++++++-- .../core/api/PharmaChemicalQuery.java | 2 +- .../core/PharmacogenomicsMongoDBAdaptor.java | 7 ++ .../clinical/PharmacogenomicsWSServer.java | 3 +- 4 files changed, 119 insertions(+), 11 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java index 19aff216c..54c741bf0 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java @@ -19,7 +19,9 @@ import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; import org.opencb.biodata.formats.protein.uniprot.v202003jaxb.Entry; +import org.opencb.biodata.formats.pubmed.v233jaxb.PubmedArticle; import org.opencb.biodata.models.core.*; +import org.opencb.biodata.models.pharma.*; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.avro.Repeat; import org.opencb.cellbase.app.cli.CommandExecutor; @@ -42,8 +44,7 @@ import java.util.*; import java.util.stream.Collectors; -import static org.opencb.cellbase.lib.EtlCommons.CLINICAL_VARIANTS_DATA; -import static org.opencb.cellbase.lib.EtlCommons.OBO_DATA; +import static org.opencb.cellbase.lib.EtlCommons.*; /** * Created by jtarraga on 29/05/23. @@ -85,7 +86,7 @@ public ExportCommandExecutor(AdminCliOptionsParser.ExportCommandOptions exportCo EtlCommons.CONSERVATION_DATA, EtlCommons.REGULATION_DATA, EtlCommons.PROTEIN_DATA, EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA, EtlCommons.VARIATION_DATA, EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANTS_DATA, EtlCommons.REPEATS_DATA, - OBO_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, EtlCommons.PUBMED_DATA}; + OBO_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, EtlCommons.PHARMACOGENOMICS_DATA}; } else { this.dataToExport = exportCommandOptions.data.split(","); } @@ -148,6 +149,7 @@ public void execute() throws CellBaseException { for (String loadOption : dataToExport) { try { int counter = 0; + String counterMsg = ""; logger.info("Exporting '{}' data...", loadOption); long dbTimeStart = System.currentTimeMillis(); switch (loadOption) { @@ -163,11 +165,14 @@ public void execute() throws CellBaseException { results = genomeManager.getGenomeInfo(QueryOptions.empty(), dataRelease); writeExportedData(results.getResults(), "genome_info", serializer); serializer.close(); + + counterMsg = counter + " sequences and " + results.getNumResults() + " genome info items"; break; } case EtlCommons.GENE_DATA: { // Export data counter = writeExportedData(genes, "gene", output); + counterMsg = counter + " Ensembl genes"; break; } case EtlCommons.REFSEQ_DATA: { @@ -177,11 +182,13 @@ public void execute() throws CellBaseException { CellBaseDataResult results = geneManager.search(geneQuery); counter = writeExportedData(results.getResults(), "refseq", output); + counterMsg = counter + " RefSeq genes"; break; } case EtlCommons.VARIATION_DATA: { // Export data counter = writeExportedData(variants, "variation_chr_all", output); + counterMsg = counter + " variants"; break; } case EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA: { @@ -190,6 +197,7 @@ public void execute() throws CellBaseException { CellBaseDataResult results = variantManager.getFunctionalScoreRegion(regions, null, dataRelease); counter = writeExportedData(results.getResults(), "cadd", output); + counterMsg = counter + " CADD items"; break; } case EtlCommons.MISSENSE_VARIATION_SCORE_DATA: { @@ -220,6 +228,8 @@ public void execute() throws CellBaseException { counter += writeExportedData(results.getResults(), "missense_variation_functional_score", serializer); } serializer.close(); + + counterMsg = counter + " missense variation functional scores"; break; } case EtlCommons.CONSERVATION_DATA: { @@ -237,6 +247,8 @@ public void execute() throws CellBaseException { counter++; } serializer.close(); + + counterMsg = counter + " conservation scores"; break; } case EtlCommons.REGULATION_DATA: { @@ -246,6 +258,7 @@ public void execute() throws CellBaseException { query.setDataRelease(dataRelease); CellBaseDataResult results = regulatoryManager.search(query); counter = writeExportedData(results.getResults(), "regulatory_region", output); + counterMsg = counter + " regulatory regions"; break; } case EtlCommons.PROTEIN_DATA: { @@ -255,6 +268,7 @@ public void execute() throws CellBaseException { query.setDataRelease(dataRelease); CellBaseDataResult results = proteinManager.search(query); counter = writeExportedData(results.getResults(), "protein", output); + counterMsg = counter + " proteins"; break; } case EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA: { @@ -275,10 +289,13 @@ public void execute() throws CellBaseException { counter += writeExportedData(results.getResults(), "prot_func_pred_chr_" + entry.getKey(), output); } serializer.close(); + + counterMsg = counter + " protein functional predictions"; break; } case EtlCommons.CLINICAL_VARIANTS_DATA: { counter = exportClinicalVariantData(regions); + counterMsg = counter + " clinical variants"; break; } case EtlCommons.REPEATS_DATA: { @@ -289,27 +306,29 @@ public void execute() throws CellBaseException { repeatsQuery.setDataRelease(dataRelease); CellBaseDataResult results = repeatsManager.search(repeatsQuery); counter = writeExportedData(results.getResults(), "repeats", output); + counterMsg = counter + " repeats"; break; } case OBO_DATA: { counter = exportOntologyData(); + counterMsg = counter + " ontology items"; break; } case EtlCommons.SPLICE_SCORE_DATA: { counter = exportSpliceScoreData(variants); + counterMsg = counter + " splice scores"; + break; + } + case EtlCommons.PHARMACOGENOMICS_DATA: { + counterMsg = exportPharmacogenomicsData(genes); break; } -// case EtlCommons.PUBMED_DATA: { -// // Load data, create index and update release -// loadPubMed(); -// break; -// } default: logger.warn("Not valid 'data'. We should not reach this point"); break; } long dbTimeEnd = System.currentTimeMillis(); - logger.info("Exported {} '{}' items in {} ms!", counter, loadOption, dbTimeEnd - dbTimeStart); + logger.info("Exported {} in {} ms!", counterMsg, dbTimeEnd - dbTimeStart); } catch (IllegalAccessException | IOException | QueryException e) { e.printStackTrace(); } @@ -317,6 +336,87 @@ public void execute() throws CellBaseException { } } + private String exportPharmacogenomicsData(List genes) + throws QueryException, CellBaseException, IllegalAccessException, IOException { + String counterMsg; + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(output.resolve(PHARMACOGENOMICS_DATA), PHARMACOGENOMICS_DATA); + + PharmaChemicalQuery query = new PharmaChemicalQuery(); + List geneNames = new ArrayList<>(new HashSet<>(genes.stream().map(g -> g.getName()).collect(Collectors.toList()))); + query.setGeneNames(geneNames); + query.setDataRelease(dataRelease); + PharmacogenomicsManager pharmacogenomicsManager = managerFactory.getPharmacogenomicsManager(species, assembly); + CellBaseIterator iterator = pharmacogenomicsManager.iterator(query); + int counter = 0; + Set pubmedIds = new HashSet<>(); + Set chemicalIds = new HashSet<>(); + while (iterator.hasNext()) { + PharmaChemical pharmaChemical = iterator.next(); + if (!chemicalIds.contains(pharmaChemical.getId())) { + // Add chemical ID to avoid duplicate + chemicalIds.add(pharmaChemical.getId()); + + // Retrieve PubMed IDs from pharma chemical (discarding empty pubmed IDs) + for (PharmaGeneAnnotation gene : pharmaChemical.getGenes()) { + List ids = gene.getPubmed().stream().filter(item -> StringUtils.isNotEmpty(item)).collect(Collectors.toList()); + if (CollectionUtils.isNotEmpty(ids)) { + pubmedIds.addAll(ids); + } + } + for (PharmaVariantAnnotation variant : pharmaChemical.getVariants()) { + for (PharmaClinicalEvidence evidence : variant.getEvidences()) { + if (StringUtils.isNotEmpty(evidence.getPubmed())) { + pubmedIds.add(evidence.getPubmed()); + } + for (PharmaVariantAssociation variantAssociation : evidence.getVariantAssociations()) { + if (StringUtils.isNotEmpty(variantAssociation.getPubmed())) { + pubmedIds.add(variantAssociation.getPubmed()); + } + } + } + } + + // Finally, write and count chemicals + serializer.serialize(pharmaChemical); + counter++; + if (counter % 200 == 0) { + logger.info("{} pharma chemicals written....", counter); + } + } + } + serializer.close(); + counterMsg = counter + " pharma chemicals"; + + // Create new JSON serializer for pubmed articles, then retrieve and write pubmed articles + serializer = new CellBaseJsonFileSerializer(output.resolve(PUBMED_DATA), PUBMED_DATA); + + PublicationManager publicationManager = managerFactory.getPublicationManager(); + List pubmedList = new ArrayList<>(pubmedIds); + PublicationQuery publicationQuery = new PublicationQuery(); + publicationQuery.setDataRelease(dataRelease); + counter = 0; + + int subListSize = 10; + for (int i = 0; i < pubmedList.size(); i += subListSize) { + int end = Math.min(i + subListSize, pubmedList.size()); + List idList = pubmedList.subList(i, end); + if (CollectionUtils.isNotEmpty(idList) && idList.size() > 0) { + System.out.println(StringUtils.join(idList, ",")); + publicationQuery.setIds(idList); + CellBaseDataResult results = publicationManager.search(publicationQuery); + for (PubmedArticle pubmedArticle : results.getResults()) { + // Finally, write and count chemicals + serializer.serialize(pubmedArticle); + counter++; + } + } + } + serializer.close(); + counterMsg += " and " + counter + " PubMed articles"; + + return counterMsg; + } + private int exportClinicalVariantData(List regions) throws CellBaseException, QueryException, IllegalAccessException, IOException { String baseFilename = CLINICAL_VARIANTS_DATA + ".full"; diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/api/PharmaChemicalQuery.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/api/PharmaChemicalQuery.java index c2fec9ceb..a2b7e1e11 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/api/PharmaChemicalQuery.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/api/PharmaChemicalQuery.java @@ -50,7 +50,7 @@ public class PharmaChemicalQuery extends AbstractQuery { @QueryParameter(id = "variants.haplotypes", alias = {"haplotype"}) private List hapolotypes; - @QueryParameter(id = "variants.geneNames", alias = {"geneName"}) + @QueryParameter(id = "geneName") private List geneNames; @QueryParameter(id = "variants.phenotypes", alias = {"phenotype"}) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PharmacogenomicsMongoDBAdaptor.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PharmacogenomicsMongoDBAdaptor.java index aabf539ea..2881a6df3 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PharmacogenomicsMongoDBAdaptor.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PharmacogenomicsMongoDBAdaptor.java @@ -21,6 +21,7 @@ import org.bson.conversions.Bson; import org.opencb.biodata.models.pharma.PharmaChemical; import org.opencb.cellbase.core.api.PharmaChemicalQuery; +import org.opencb.cellbase.core.api.query.LogicalList; import org.opencb.cellbase.core.api.query.ProjectionQueryOptions; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.result.CellBaseDataResult; @@ -118,6 +119,12 @@ public Bson parseQuery(PharmaChemicalQuery pharmaQuery) { case "token": // do nothing break; + case "geneName": + List orBsonList = new ArrayList<>(); + orBsonList.add(getLogicalListFilter(new LogicalList((List) value), "variants.geneNames")); + orBsonList.add(getLogicalListFilter(new LogicalList((List) value), "genes.xrefs.id")); + andBsonList.add(Filters.or(orBsonList)); + break; default: createAndOrQuery(value, dotNotationName, QueryParam.Type.STRING, andBsonList); break; diff --git a/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/clinical/PharmacogenomicsWSServer.java b/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/clinical/PharmacogenomicsWSServer.java index 983a45f73..de47e69d3 100644 --- a/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/clinical/PharmacogenomicsWSServer.java +++ b/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/clinical/PharmacogenomicsWSServer.java @@ -92,7 +92,8 @@ public PharmacogenomicsWSServer(@PathParam("apiVersion") @ApiParam(name = "apiVe + "please, call the endpoint endpoint pharmacogenomics/distinct?field=variants.haplotypes", dataType = "java.util.List", paramType = "query"), @ApiImplicitParam(name = "geneName", value = "List of gene names, e.g.: NT5C2,VKORC1. In order to get the list of gene names," - + "please, call the endpoint endpoint pharmacogenomics/distinct?field=variants.geneNames", dataType = "java.util.List", + + "please, call the endpoint endpoints pharmacogenomics/distinct?field=variants.geneNames and " + + " pharmacogenomics/distinct?field=genes.xrefs.id", dataType = "java.util.List", paramType = "query"), @ApiImplicitParam(name = "location", value = "List of chromosomic coordinates in the format: chromosome:position, e.g.:" + " 10:103109774", dataType = "java.util.List", paramType = "query"), From a28c81f7b5b45c2ad043cfcae2a3446eb0809f1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 18 Jul 2023 10:27:02 +0200 Subject: [PATCH 2/8] lib: fix sonar issues, #TASK-4768, #TASK-4761 --- .../lib/impl/core/PharmacogenomicsMongoDBAdaptor.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PharmacogenomicsMongoDBAdaptor.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PharmacogenomicsMongoDBAdaptor.java index 2881a6df3..b6a308750 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PharmacogenomicsMongoDBAdaptor.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PharmacogenomicsMongoDBAdaptor.java @@ -17,6 +17,7 @@ package org.opencb.cellbase.lib.impl.core; import com.mongodb.client.model.Filters; +import org.apache.commons.collections4.CollectionUtils; import org.bson.Document; import org.bson.conversions.Bson; import org.opencb.biodata.models.pharma.PharmaChemical; @@ -109,7 +110,6 @@ public CellBaseDataResult groupBy(PharmaChemicalQuery query) thr public Bson parseQuery(PharmaChemicalQuery pharmaQuery) { List andBsonList = new ArrayList<>(); - boolean visited = false; try { for (Map.Entry entry : pharmaQuery.toObjectMap().entrySet()) { String dotNotationName = entry.getKey(); @@ -121,8 +121,8 @@ public Bson parseQuery(PharmaChemicalQuery pharmaQuery) { break; case "geneName": List orBsonList = new ArrayList<>(); - orBsonList.add(getLogicalListFilter(new LogicalList((List) value), "variants.geneNames")); - orBsonList.add(getLogicalListFilter(new LogicalList((List) value), "genes.xrefs.id")); + orBsonList.add(getLogicalListFilter(new LogicalList((List) value), "variants.geneNames")); + orBsonList.add(getLogicalListFilter(new LogicalList((List) value), "genes.xrefs.id")); andBsonList.add(Filters.or(orBsonList)); break; default: @@ -133,8 +133,8 @@ public Bson parseQuery(PharmaChemicalQuery pharmaQuery) { } catch (IllegalAccessException e) { e.printStackTrace(); } - logger.debug("pharmacogenomics parsed query: " + andBsonList); - if (andBsonList.size() > 0) { + logger.debug("Pharmacogenomics parsed query: {}", andBsonList); + if (CollectionUtils.isNotEmpty(andBsonList)) { return Filters.and(andBsonList); } else { return new Document(); From fa814a9670f6874a7b040cb01d9dbea3704e4f93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 18 Jul 2023 10:56:41 +0200 Subject: [PATCH 3/8] app: fix sonar issues, #TASK-4768, #TASK-4761 --- .../executors/ExportCommandExecutor.java | 43 ++++++------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java index 54c741bf0..730e470cf 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java @@ -98,11 +98,11 @@ public ExportCommandExecutor(AdminCliOptionsParser.ExportCommandOptions exportCo * @throws CellBaseException CellBase exception */ public void execute() throws CellBaseException { - checkDataRelease(); - logger.info("Exporting from data release {}", dataRelease); this.managerFactory = new CellBaseManagerFactory(configuration); + checkDataRelease(); + if (exportCommandOptions.data != null) { // Get genes List geneNames = Arrays.asList(exportCommandOptions.gene.split(",")); @@ -138,8 +138,8 @@ public void execute() throws CellBaseException { regions.addAll(Region.parseRegions(exportCommandOptions.region)); } - logger.info("{} regions: {}", regions.size(), StringUtils.join(regions.stream().map(r -> r.toString()) - .collect(Collectors.toList()), ",")); + String strRegions = StringUtils.join(regions.stream().map(Object::toString).collect(Collectors.toList()), ","); + logger.info("{} regions: {}", regions.size(), strRegions); List variants = new ArrayList<>(); if (areVariantsNeeded()) { @@ -157,7 +157,7 @@ public void execute() throws CellBaseException { GenomeManager genomeManager = managerFactory.getGenomeManager(species, assembly); // Genome sequence - CellBaseDataResult results = genomeManager.getGenomeSequenceRawData(regions, dataRelease); + CellBaseDataResult results = genomeManager.getGenomeSequenceRawData(regions, dataRelease); counter = writeExportedData(results.getResults(), "genome_sequence", output); // Genome info @@ -342,7 +342,7 @@ private String exportPharmacogenomicsData(List genes) CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(output.resolve(PHARMACOGENOMICS_DATA), PHARMACOGENOMICS_DATA); PharmaChemicalQuery query = new PharmaChemicalQuery(); - List geneNames = new ArrayList<>(new HashSet<>(genes.stream().map(g -> g.getName()).collect(Collectors.toList()))); + List geneNames = new ArrayList<>(new HashSet<>(genes.stream().map(Gene::getName).collect(Collectors.toList()))); query.setGeneNames(geneNames); query.setDataRelease(dataRelease); PharmacogenomicsManager pharmacogenomicsManager = managerFactory.getPharmacogenomicsManager(species, assembly); @@ -358,7 +358,7 @@ private String exportPharmacogenomicsData(List genes) // Retrieve PubMed IDs from pharma chemical (discarding empty pubmed IDs) for (PharmaGeneAnnotation gene : pharmaChemical.getGenes()) { - List ids = gene.getPubmed().stream().filter(item -> StringUtils.isNotEmpty(item)).collect(Collectors.toList()); + List ids = gene.getPubmed().stream().filter(StringUtils::isNotEmpty).collect(Collectors.toList()); if (CollectionUtils.isNotEmpty(ids)) { pubmedIds.addAll(ids); } @@ -400,8 +400,7 @@ private String exportPharmacogenomicsData(List genes) for (int i = 0; i < pubmedList.size(); i += subListSize) { int end = Math.min(i + subListSize, pubmedList.size()); List idList = pubmedList.subList(i, end); - if (CollectionUtils.isNotEmpty(idList) && idList.size() > 0) { - System.out.println(StringUtils.join(idList, ",")); + if (CollectionUtils.isNotEmpty(idList)) { publicationQuery.setIds(idList); CellBaseDataResult results = publicationManager.search(publicationQuery); for (PubmedArticle pubmedArticle : results.getResults()) { @@ -502,7 +501,6 @@ private List getVariants(List regions) throws CellBaseException VariantManager variantManager = managerFactory.getVariantManager(species, assembly); VariantQuery query = new VariantQuery(); query.setDataRelease(dataRelease); - int batchSize = 10; for (Region region : regions) { query.setRegions(Collections.singletonList(region)); try { @@ -522,14 +520,13 @@ private boolean areVariantsNeeded() { if (data.equals(EtlCommons.VARIATION_DATA) || data.equals(EtlCommons.MISSENSE_VARIATION_SCORE_DATA) || data.equals(EtlCommons.SPLICE_SCORE_DATA)) { - // || data.equals(EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA)) { return true; } } return false; } - private int writeExportedData(List objects, String baseFilename, CellBaseFileSerializer serializer) throws IOException { + private int writeExportedData(List objects, String baseFilename, CellBaseFileSerializer serializer) { int counter = 0; for (Object object : objects) { serializer.serialize(object, baseFilename); @@ -550,31 +547,15 @@ private int writeExportedData(List objects, String baseFilename, Path outDir) return counter; } - private int writeExportedDataList(List> results, String baseFilename, Path outDir) throws IOException { - checkPath(outDir); - int counter = 0; - CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(outDir); - for (CellBaseDataResult result : results) { - for (Object object : result.getResults()) { - serializer.serialize(object, baseFilename); - counter++; - } - } - serializer.close(); - return counter; - } - private void checkPath(Path outDir) throws IOException { - if (!outDir.toFile().exists()) { - if (!outDir.toFile().mkdirs()) { - throw new IOException("Impossible to create output directory: " + outDir); - } + if (!outDir.toFile().exists() && !outDir.toFile().mkdirs()) { + throw new IOException("Impossible to create output directory: " + outDir); } } private void checkDataRelease() throws CellBaseException { // Check data release - DataReleaseManager dataReleaseManager = new DataReleaseManager(database, configuration); + DataReleaseManager dataReleaseManager = managerFactory.getDataReleaseManager(species, assembly); CellBaseDataResult dataReleaseResults = dataReleaseManager.getReleases(); if (CollectionUtils.isEmpty(dataReleaseResults.getResults())) { throw new CellBaseException("No data releases are available"); From 4bca68d64d63a35337e87bb0b6cfdd0a71af3390 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 21 Aug 2023 14:34:43 +0200 Subject: [PATCH 4/8] app: create subfolder for pharmacogenomics and pubmed when exporting data, #TASK-4768, #TASK-4761 --- .../app/cli/admin/executors/ExportCommandExecutor.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java index 730e470cf..a99abee61 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/ExportCommandExecutor.java @@ -340,6 +340,9 @@ private String exportPharmacogenomicsData(List genes) throws QueryException, CellBaseException, IllegalAccessException, IOException { String counterMsg; CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(output.resolve(PHARMACOGENOMICS_DATA), PHARMACOGENOMICS_DATA); + if (!serializer.getOutdir().toFile().exists()) { + serializer.getOutdir().toFile().mkdirs(); + } PharmaChemicalQuery query = new PharmaChemicalQuery(); List geneNames = new ArrayList<>(new HashSet<>(genes.stream().map(Gene::getName).collect(Collectors.toList()))); @@ -389,6 +392,9 @@ private String exportPharmacogenomicsData(List genes) // Create new JSON serializer for pubmed articles, then retrieve and write pubmed articles serializer = new CellBaseJsonFileSerializer(output.resolve(PUBMED_DATA), PUBMED_DATA); + if (!serializer.getOutdir().toFile().exists()) { + serializer.getOutdir().toFile().mkdirs(); + } PublicationManager publicationManager = managerFactory.getPublicationManager(); List pubmedList = new ArrayList<>(pubmedIds); From a8bada34cdd3cd6135c3ff3d64a7764c9eb6a49e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 21 Aug 2023 15:55:32 +0200 Subject: [PATCH 5/8] test: add Pharmacogenomics Junit tests, #TASK-4769, #TASK-4761 --- .../lib/GenericMongoDBAdaptorTest.java | 10 ++- .../PharmacogenomicsMongoDBAdaptorTest.java | 83 +++++++++++++++++++ 2 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 cellbase-lib/src/test/java/org/opencb/cellbase/lib/impl/core/PharmacogenomicsMongoDBAdaptorTest.java diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/GenericMongoDBAdaptorTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/GenericMongoDBAdaptorTest.java index 8666aadb9..e5e804268 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/GenericMongoDBAdaptorTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/GenericMongoDBAdaptorTest.java @@ -60,7 +60,7 @@ public class GenericMongoDBAdaptorTest { protected String cellBaseName; - private static final String DATASET_BASENAME = "cellbase-v5.6-dr4"; + private static final String DATASET_BASENAME = "cellbase-v5.7-dr6"; private static final String DATASET_EXTENSION = ".tar.gz"; private static final String DATASET_URL = "http://reports.test.zettagenomics.com/cellbase/test-data/"; private static final String DATASET_TMP_DIR = "/tmp/cb"; @@ -135,7 +135,9 @@ private void downloadAndPopulate() throws IOException, ExecutionException, Class Path tmpPath = Paths.get(DATASET_TMP_DIR); tmpPath.toFile().mkdirs(); + logger.info("Downloading " + url + " into " + tmpPath); URLUtils.download(url, tmpPath); + Path tmpFile = tmpPath.resolve(DATASET_BASENAME + DATASET_EXTENSION); String commandline = "tar -xvzf " + tmpFile.toAbsolutePath() + " -C " + tmpPath; logger.info("Running: " + commandline); @@ -206,6 +208,12 @@ private void downloadAndPopulate() throws IOException, ExecutionException, Class // clinical_variants.full.json.gz loadData("clinical_variants", "clinical_variants", baseDir.resolve("clinical_variants.full.json.gz")); + // pharmacogenomics.json.gz + loadData("pharmacogenomics", "pharmacogenomics", baseDir.resolve("pharmacogenomics/pharmacogenomics.json.gz")); + + // pubmed.json.gz + loadData("pubmed", "pubmed", baseDir.resolve("pubmed/pubmed.json.gz")); + // Clean temporary dir } diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/impl/core/PharmacogenomicsMongoDBAdaptorTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/impl/core/PharmacogenomicsMongoDBAdaptorTest.java new file mode 100644 index 000000000..96eaa7a9b --- /dev/null +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/impl/core/PharmacogenomicsMongoDBAdaptorTest.java @@ -0,0 +1,83 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.cellbase.lib.impl.core; + +import org.hamcrest.CoreMatchers; +import org.junit.jupiter.api.Test; +import org.opencb.biodata.models.core.Gene; +import org.opencb.biodata.models.pharma.PharmaChemical; +import org.opencb.biodata.models.variant.avro.Constraint; +import org.opencb.biodata.models.variant.avro.Expression; +import org.opencb.biodata.models.variant.avro.ExpressionCall; +import org.opencb.cellbase.core.api.GeneQuery; +import org.opencb.cellbase.core.api.PharmaChemicalQuery; +import org.opencb.cellbase.core.api.query.AbstractQuery; +import org.opencb.cellbase.core.api.query.LogicalList; +import org.opencb.cellbase.core.result.CellBaseDataResult; +import org.opencb.cellbase.lib.GenericMongoDBAdaptorTest; +import org.opencb.cellbase.lib.managers.GeneManager; +import org.opencb.cellbase.lib.managers.PharmacogenomicsManager; + +import java.io.IOException; +import java.util.*; +import java.util.stream.Collectors; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Created by jtarraga on 08/21/23. + */ +public class PharmacogenomicsMongoDBAdaptorTest extends GenericMongoDBAdaptorTest { + + public PharmacogenomicsMongoDBAdaptorTest() throws IOException { + super(); + } + + @Test + public void testQueryName() throws Exception { + PharmacogenomicsManager pharmacogenomicsManager = cellBaseManagerFactory.getPharmacogenomicsManager(SPECIES, ASSEMBLY); + + Map paramMap = new HashMap<>(); + paramMap.put("name", "galantamine"); + paramMap.put("include", "id,name"); + paramMap.put(AbstractQuery.DATA_RELEASE, String.valueOf(dataRelease)); + + PharmaChemicalQuery chemicalQuery = new PharmaChemicalQuery(paramMap); + chemicalQuery.setCount(Boolean.TRUE); + + CellBaseDataResult cellBaseDataResult = pharmacogenomicsManager.search(chemicalQuery); + + assertEquals(1, cellBaseDataResult.getNumMatches()); + assertEquals("PA449726", cellBaseDataResult.first().getId()); + } + + @Test + public void testQuery() throws Exception { + PharmacogenomicsManager pharmacogenomicsManager = cellBaseManagerFactory.getPharmacogenomicsManager(SPECIES, ASSEMBLY); + + PharmaChemicalQuery chemicalQuery = new PharmaChemicalQuery(); + chemicalQuery.setGeneNames(Collections.singletonList("PRKCE")); + chemicalQuery.setDataRelease(dataRelease); + chemicalQuery.setCount(Boolean.TRUE); + + CellBaseDataResult cellBaseDataResult = pharmacogenomicsManager.search(chemicalQuery); + + assertEquals(6, cellBaseDataResult.getNumMatches()); + } +} From 1a7b19d8f89e24de1ba03bc52173e293e7e34b59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 31 Aug 2023 13:11:03 +0200 Subject: [PATCH 6/8] lib: use small chuncks to load pharma and pubmed, #TASK-4768, #TASK-4761 --- .../java/org/opencb/cellbase/lib/loader/LoadRunner.java | 5 ++++- .../org/opencb/cellbase/lib/GenericMongoDBAdaptorTest.java | 6 ++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/LoadRunner.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/LoadRunner.java index 3a6605c4f..f921403ff 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/LoadRunner.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/loader/LoadRunner.java @@ -21,6 +21,7 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.models.DataRelease; +import org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.lib.managers.DataReleaseManager; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -100,7 +101,9 @@ public void load(Path filePath, String data, int dataRelease, String field, Stri // protein_functional_prediction documents are extremely big. Increasing the batch size will probably // lead to an OutOfMemory error for this collection. Batch size can be much higher for the rest of // collections though - if (data.equals(PROTEIN_FUNCTIONAL_PREDICTION)) { + if (data.equals(PROTEIN_FUNCTIONAL_PREDICTION) + || data.equals(EtlCommons.PHARMACOGENOMICS_DATA) + || data.equals(EtlCommons.PUBMED_DATA)) { batchSize = 50; } else { batchSize = 200; diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/GenericMongoDBAdaptorTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/GenericMongoDBAdaptorTest.java index e5e804268..f316fdf07 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/GenericMongoDBAdaptorTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/GenericMongoDBAdaptorTest.java @@ -47,6 +47,8 @@ import java.util.List; import java.util.concurrent.ExecutionException; +import static org.opencb.cellbase.lib.EtlCommons.PHARMACOGENOMICS_DATA; +import static org.opencb.cellbase.lib.EtlCommons.PUBMED_DATA; import static org.opencb.cellbase.lib.db.MongoDBManager.DBNAME_SEPARATOR; /** @@ -209,10 +211,10 @@ private void downloadAndPopulate() throws IOException, ExecutionException, Class loadData("clinical_variants", "clinical_variants", baseDir.resolve("clinical_variants.full.json.gz")); // pharmacogenomics.json.gz - loadData("pharmacogenomics", "pharmacogenomics", baseDir.resolve("pharmacogenomics/pharmacogenomics.json.gz")); + loadData(PHARMACOGENOMICS_DATA, PHARMACOGENOMICS_DATA, baseDir.resolve("pharmacogenomics/pharmacogenomics.json.gz")); // pubmed.json.gz - loadData("pubmed", "pubmed", baseDir.resolve("pubmed/pubmed.json.gz")); + loadData(PUBMED_DATA, PUBMED_DATA, baseDir.resolve("pubmed/pubmed.json.gz")); // Clean temporary dir } From 577f541867f30c7d9307f927dd9e889d5e34a86f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Mon, 4 Sep 2023 16:32:24 +0200 Subject: [PATCH 7/8] server: set default data release when it is not specified in the url, #TASK-4768, #TASK-4761 --- .../cellbase/server/rest/clinical/PharmacogenomicsWSServer.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/clinical/PharmacogenomicsWSServer.java b/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/clinical/PharmacogenomicsWSServer.java index de47e69d3..8cccd1856 100644 --- a/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/clinical/PharmacogenomicsWSServer.java +++ b/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/clinical/PharmacogenomicsWSServer.java @@ -126,6 +126,7 @@ public PharmacogenomicsWSServer(@PathParam("apiVersion") @ApiParam(name = "apiVe public Response getAll() { try { PharmaChemicalQuery query = new PharmaChemicalQuery(uriParams); + query.setDataRelease(getDataRelease()); CellBaseDataResult queryResults = pharmacogenomicsManager.search(query); return createOkResponse(queryResults); @@ -171,6 +172,7 @@ public Response getUniqueValues(@QueryParam("field") @ApiParam(name = "field", r try { copyToFacet("field", field); PharmaChemicalQuery query = new PharmaChemicalQuery(uriParams); + query.setDataRelease(getDataRelease()); CellBaseDataResult queryResults = pharmacogenomicsManager.distinct(query); return createOkResponse(queryResults); } catch (Exception e) { From dd037a721fc40237e3f705952c607af2d59a690f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 6 Sep 2023 10:01:01 +0200 Subject: [PATCH 8/8] core: fix unknown query parameter, #TASK-4768, #TASK-4761 --- .../org/opencb/cellbase/core/api/query/AbstractQuery.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/api/query/AbstractQuery.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/api/query/AbstractQuery.java index c3d2b4e6a..76896b9e1 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/api/query/AbstractQuery.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/api/query/AbstractQuery.java @@ -124,7 +124,9 @@ public void updateParams(Map uriParams) { annotations = getAnnotations(); try { - validateParams(uriParams, classAttributesToType, annotations); + // Skip this validation because some CellBase endpoint URL parameters are not included + // in the query (such as GeneQuery, VariantQuery,...) + //validateParams(uriParams, classAttributesToType, annotations); Map objectHashMap = new HashMap<>(); for (Map.Entry> entry : classAttributesToType.entrySet()) { @@ -175,7 +177,7 @@ public void updateParams(Map uriParams) { } } objectMapper.updateValue(this, objectHashMap); - } catch (JsonProcessingException | QueryException e) { + } catch (JsonProcessingException e) { // | QueryException e) { throw new IllegalArgumentException(e); } }