Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TASK-1111 Add Pharmacogenomics data to CellBase #653

Merged
merged 45 commits into from
Jul 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
0c5957c
lib: implement PharmGKB downloader, #TASK-4325. #TASK-1111
jtarraga May 4, 2023
726b4a0
core: download PharmGKB variant and clinical annotation files, #TASK-…
jtarraga May 4, 2023
2ed9ed7
lib: add the PharmGKBBuilder class, #TASK-4325. #TASK-1111
jtarraga May 4, 2023
42c9911
lib: parse PharmGKB clinical annotation files, #TASK-4325. #TASK-1111
jtarraga May 5, 2023
4080ad7
lib: improvements in pharmacogenomics
imedina May 8, 2023
426d6de
lib: add pharma variant annotation to evidence
imedina May 8, 2023
b1f617c
lib: parse PharmGKB study parameters file, #TASK-4325. #TASK-1111
jtarraga May 8, 2023
5e0b5c1
lib: parse PharmGKB label annotation file, #TASK-4325. #TASK-1111
jtarraga May 8, 2023
475df23
lib: pharmacogenomics minor improvements
imedina May 8, 2023
7de235b
lib: parse PharmGKB guideline, phenotype and functional annotations, …
jtarraga May 8, 2023
a12b2db
lib: fixes and improvements in PharmGKB builder
imedina May 9, 2023
8f519bb
lib: parse PharmGKB variants file to get the variant chromosome and p…
jtarraga May 9, 2023
d33c183
lib: create the pharmacogenomics JSON file after parsing the PharmGKB…
jtarraga May 9, 2023
595b28a
lib: update according to the biodata changes, #TASK-4325. #TASK-1111
jtarraga May 9, 2023
bf32a6f
Add new Pharmacogenomics REST service
imedina May 9, 2023
4de3c35
app: implement the command 'load' for pharmacogenomics, #TASK-4325. #…
jtarraga May 9, 2023
09e6fa6
Merge branch 'TASK-1111' of https://github.com/opencb/cellbase into T…
jtarraga May 9, 2023
87856d6
server: implement pharmacogenomics web-services, #TASK-4325. #TASK-1111
jtarraga May 9, 2023
b7f7643
lib: minor improvement in pharma clinical annotation data model
imedina May 9, 2023
94fe7a9
server: minor improvements of REST API
imedina May 10, 2023
c01d65f
lib: fix parse generic names and add junit test, #TASK-4325. #TASK-1111
jtarraga May 10, 2023
22583c1
lib: add pharmacogenomics data to the variant annotation, #TASK-4325.…
jtarraga May 10, 2023
029d7cc
lib: add field to pharmacogenomics variant annotation
imedina May 10, 2023
080d06e
server: fix location filter in the pharmacogenomics webservice 'searc…
jtarraga May 10, 2023
be4dc90
lib: filter mismatch alleles from pharmacogenomics annotation
imedina May 11, 2023
80b9e40
lib: select only the pharma clinical annotation matching the annotate…
jtarraga May 11, 2023
3e019fb
lib: check pharmacogenomics annotations have been found
imedina May 11, 2023
13a21fc
lib: add 'summary' to pharmacogenomics variant annotation
imedina May 11, 2023
87cc78e
lib: parse and add pharmacogenomics gene annotation, #TASK-4325. #TAS…
jtarraga May 12, 2023
36cbca4
Merge branch 'TASK-1111' of https://github.com/opencb/cellbase into T…
jtarraga May 12, 2023
a27b910
lib: fix data models changes
imedina May 12, 2023
7e0860c
lib: update according to biodata changes, #TASK-4325. #TASK-1111
jtarraga May 12, 2023
51c91b7
Resolve conflicts, #TASK-4325. #TASK-1111
jtarraga May 12, 2023
2d6f9cb
lib: minor pharma fixes and improvements
imedina May 12, 2023
d3d6f87
lib: update according to the biodata changes, #TASK-4325. #TASK-1111
jtarraga May 12, 2023
f9b8a66
Merge branch 'develop' into TASK-1111
imedina May 17, 2023
9da827b
server: add more filters in pharma/query and improve includes when an…
jtarraga May 22, 2023
85b63b8
Merge branch 'TASK-1111' of https://github.com/opencb/cellbase into T…
jtarraga May 22, 2023
5c77bb3
Merge branch 'develop' into TASK-1111
jtarraga May 22, 2023
fea42b9
Fix some issues after merging develop, #TASK-4325. #TASK-1111
jtarraga May 22, 2023
9edeac4
server: improve pharma endpoints and update according to biodata chan…
jtarraga May 25, 2023
e165a22
lib: take into account haplotypes, and update according to biodata ch…
jtarraga May 29, 2023
fcd56e2
Merge branch 'develop' into TASK-1111
imedina Jun 9, 2023
c58e77e
Merge branch 'develop' into TASK-1111
jtarraga Jul 12, 2023
b62f088
Merge branch 'TASK-1111' of https://github.com/opencb/cellbase into T…
jtarraga Jul 12, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -171,9 +171,9 @@ public class LoadCommandOptions {
@ParametersDelegate
public CommonCommandOptions commonOptions = commonCommandOptions;

@Parameter(names = {"-d", "--data"}, description = "Data model type to be loaded: genome, gene, variation, "
+ "conservation, regulation, protein, clinical_variants, repeats, regulatory_pfm, splice_score, pubmed. 'all' loads everything",
required = true, arity = 1)
@Parameter(names = {"-d", "--data"}, description = "Data model type to be loaded: genome, gene, variation,"
+ " conservation, regulation, protein, clinical_variants, repeats, regulatory_pfm, splice_score, pubmed, pharmacogenomics."
+ " 'all' loads everything", required = true, arity = 1)
public String data;

@Parameter(names = {"-i", "--input"}, required = true, arity = 1,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
import java.util.Collections;
import java.util.List;

import static org.opencb.cellbase.lib.EtlCommons.PHARMGKB_DATA;

/**
* Created by imedina on 03/02/15.
*/
Expand Down Expand Up @@ -163,6 +165,9 @@ public void execute() {
case EtlCommons.PUBMED_DATA:
parser = buildPubMed();
break;
case EtlCommons.PHARMACOGENOMICS_DATA:
parser = buildPharmacogenomics();
break;
default:
logger.error("Build option '" + buildCommandOptions.data + "' is not valid");
break;
Expand Down Expand Up @@ -414,4 +419,22 @@ private CellBaseBuilder buildPubMed() throws IOException {
CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(pubmedOutputFolder);
return new PubMedBuilder(pubmedInputFolder, serializer);
}

/**
 * Prepares the pharmacogenomics build step.
 *
 * <p>Ensures the pharmacogenomics build folder exists, copies the PharmGKB version file from the
 * download folder into it when that file is present, and returns the builder that reads the
 * downloaded pharmacogenomics folder and writes through a {@link CellBaseJsonFileSerializer}
 * rooted at the build folder.
 *
 * @return a {@link PharmGKBBuilder} configured with the download folder and the JSON serializer
 * @throws IOException if the build folder cannot be created or the version file cannot be copied
 */
private CellBaseBuilder buildPharmacogenomics() throws IOException {
    Path inFolder = downloadFolder.resolve(EtlCommons.PHARMACOGENOMICS_DATA);
    Path outFolder = buildFolder.resolve(EtlCommons.PHARMACOGENOMICS_DATA);

    // File.mkdirs() reports failure only through its (previously ignored) return value;
    // Files.createDirectories() is a no-op for an existing directory and throws otherwise.
    Files.createDirectories(outFolder);

    Path versionFile = inFolder.resolve(PHARMGKB_DATA).resolve(EtlCommons.PHARMGKB_VERSION_FILENAME);
    if (Files.exists(versionFile)) {
        logger.info("Copying PharmGKB version file...");
        Files.copy(versionFile, outFolder.resolve(EtlCommons.PHARMGKB_VERSION_FILENAME),
                StandardCopyOption.REPLACE_EXISTING);
    } else {
        // Make the skipped copy visible instead of logging "Copying..." and doing nothing
        logger.warn("PharmGKB version file {} not found, it will not be copied", versionFile);
    }

    CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(outFolder);
    return new PharmGKBBuilder(inFolder, serializer);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ public void execute() {
case EtlCommons.PUBMED_DATA:
downloadFiles.addAll(downloader.downloadPubMed());
break;
case EtlCommons.PHARMACOGENOMICS_DATA:
downloadFiles.addAll(downloader.downloadPharmKGB());
break;
default:
System.out.println("Value \"" + data + "\" is not allowed for the data parameter. Allowed values"
+ " are: {genome, gene, gene_disease_association, variation, variation_functional_score,"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ public LoadCommandExecutor(AdminCliOptionsParser.LoadCommandOptions loadCommandO
EtlCommons.CONSERVATION_DATA, EtlCommons.REGULATION_DATA, EtlCommons.PROTEIN_DATA,
EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA, EtlCommons.VARIATION_DATA,
EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANTS_DATA, EtlCommons.REPEATS_DATA,
EtlCommons.OBO_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, EtlCommons.PUBMED_DATA};
EtlCommons.OBO_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, EtlCommons.PUBMED_DATA,
EtlCommons.PHARMACOGENOMICS_DATA};
} else {
loadOptions = loadCommandOptions.data.split(",");
}
Expand Down Expand Up @@ -289,6 +290,11 @@ public void execute() throws CellBaseException {
loadPubMed();
break;
}
case EtlCommons.PHARMACOGENOMICS_DATA: {
// Load data, create index and update release
loadPharmacogenomica();
break;
}
default:
logger.warn("Not valid 'data'. We should not reach this point");
break;
Expand Down Expand Up @@ -546,12 +552,39 @@ private void loadPubMed() throws CellBaseException {

// Update release (collection and sources)
List<Path> sources = Collections.singletonList(pubmedPath.resolve(EtlCommons.PUBMED_VERSION_FILENAME));
dataReleaseManager.update(dataRelease, "pubmed", EtlCommons.REPEATS_DATA, sources);
dataReleaseManager.update(dataRelease, EtlCommons.PUBMED_DATA, EtlCommons.PUBMED_DATA, sources);
} else {
logger.warn("PubMed folder {} not found", pubmedPath);
}
}

/**
 * Loads the pharmacogenomics data for the current data release.
 *
 * <p>Resolves the pharmacogenomics folder under {@code input}, passes its
 * {@code pharmacogenomics}-named {@code .json.gz} file to {@code loadRunner}, then calls
 * {@code createIndex} and registers the collection together with the PharmGKB version file in
 * the data release. If the folder does not exist a warning is logged and nothing is done.
 *
 * <p>If the load is interrupted, the thread's interrupt flag is restored and the method returns
 * without creating the index or updating the release.
 *
 * @throws IOException declared for callers; not thrown by the load step, which logs its failures
 * @throws CellBaseException if updating the data release fails
 */
private void loadPharmacogenomica() throws IOException, CellBaseException {
    Path pharmaPath = input.resolve(EtlCommons.PHARMACOGENOMICS_DATA);

    if (!Files.exists(pharmaPath)) {
        logger.warn("Pharmacogenomics folder {} not found to load", pharmaPath);
        return;
    }

    // Load data
    Path pharmaJsonPath = pharmaPath.resolve(EtlCommons.PHARMACOGENOMICS_DATA + ".json.gz");
    String pharmaJsonName = pharmaJsonPath.toFile().getName();
    logger.info("Loading file '{}'", pharmaJsonName);
    try {
        loadRunner.load(pharmaJsonPath, EtlCommons.PHARMACOGENOMICS_DATA, dataRelease);
    } catch (InterruptedException e) {
        // Do not swallow the interruption: restore the flag and stop, the load did not complete
        Thread.currentThread().interrupt();
        logger.error("Error loading file '{}': {}", pharmaJsonName, e.toString(), e);
        return;
    } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException
            | IllegalAccessException | ExecutionException | IOException | CellBaseException
            | LoaderException e) {
        // The trailing throwable keeps the stack trace in the log; the message itself is unchanged.
        // NOTE(review): after a failed load the index and release steps below still run
        // (best-effort behaviour kept as it was) - confirm this is intended.
        logger.error("Error loading file '{}': {}", pharmaJsonName, e.toString(), e);
    }

    // Create index
    createIndex(EtlCommons.PHARMACOGENOMICS_DATA);

    // Update release (collection and sources)
    List<Path> sources = Collections.singletonList(pharmaPath.resolve(EtlCommons.PHARMGKB_VERSION_FILENAME));
    dataReleaseManager.update(dataRelease, EtlCommons.PHARMACOGENOMICS_DATA, EtlCommons.PHARMACOGENOMICS_DATA, sources);
}

private void createIndex(String collection) {
if (!createIndexes) {
return;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
/*
* Copyright 2015-2020 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.opencb.cellbase.core.api;

import org.apache.commons.collections4.CollectionUtils;
import org.opencb.cellbase.core.api.query.AbstractQuery;
import org.opencb.cellbase.core.api.query.QueryException;
import org.opencb.cellbase.core.api.query.QueryParameter;

import java.util.List;
import java.util.Map;

/**
 * Query object holding the filters accepted when searching pharmacogenomics chemicals.
 *
 * <p>Every filter is a list of strings annotated with {@link QueryParameter}, which declares the
 * parameter id and, for most fields, a shorter alias (for example {@code variants.variantId} /
 * {@code variant}).
 * NOTE(review): the ids look like dotted paths into the stored chemical document - confirm
 * against {@link AbstractQuery} and the data model.
 *
 * <p>Setters return {@code this} so calls can be chained.
 */
public class PharmaChemicalQuery extends AbstractQuery {

    @QueryParameter(id = "id")
    private List<String> ids;

    @QueryParameter(id = "name")
    private List<String> names;

    @QueryParameter(id = "source", allowedValues = {"PharmGKB"})
    private List<String> sources;

    @QueryParameter(id = "types", alias = {"type"})
    private List<String> types;

    // Variant IDs; validateQuery() only accepts values starting with "rs"
    @QueryParameter(id = "variants.variantId", alias = {"variant"})
    private List<String> variants;

    @QueryParameter(id = "variants.location", alias = {"location"})
    private List<String> locations;

    @QueryParameter(id = "variants.chromosome", alias = {"chromosome"})
    private List<String> chromosomes;

    // The field name is misspelled; it is kept because the public accessors use the same spelling
    @QueryParameter(id = "variants.haplotypes", alias = {"haplotype"})
    private List<String> hapolotypes;

    @QueryParameter(id = "variants.geneNames", alias = {"geneName"})
    private List<String> geneNames;

    @QueryParameter(id = "variants.phenotypes", alias = {"phenotype"})
    private List<String> phenotypes;

    @QueryParameter(id = "variants.phenotypeTypes", alias = {"phenotypeType"})
    private List<String> phenotypeTypes;

    @QueryParameter(id = "variants.confidence", alias = {"confidence"})
    private List<String> confidences;

    @QueryParameter(id = "variants.evidences.pubmed", alias = {"pubmedId"})
    private List<String> pubmedIds;

    /**
     * Creates an empty query with no filters set.
     */
    public PharmaChemicalQuery() {
    }

    /**
     * Creates a query from a map of parameters, delegating to the {@link AbstractQuery} constructor.
     *
     * @param params parameter names and their values
     * @throws QueryException if the superclass constructor rejects the parameters
     */
    public PharmaChemicalQuery(Map<String, String> params) throws QueryException {
        super(params);

        // NOTE(review): the values returned by the next two calls are discarded. Assuming
        // objectMapper is a Jackson ObjectMapper, they configure nothing - confirm and consider removing.
        objectMapper.readerForUpdating(this);
        objectMapper.readerFor(PharmaChemicalQuery.class);
        objectWriter = objectMapper.writerFor(PharmaChemicalQuery.class);
    }

    /**
     * Checks that every variant ID, when any is given, starts with {@code rs}.
     *
     * @throws QueryException if a variant ID is {@code null} or does not start with {@code rs}
     */
    @Override
    protected void validateQuery() throws QueryException {
        if (CollectionUtils.isNotEmpty(variants)) {
            for (String variant : variants) {
                // A null element is reported as an invalid ID instead of failing with a NullPointerException
                if (variant == null || !variant.startsWith("rs")) {
                    throw new QueryException("Invalid variant ID: '" + variant + "'; it has to start with rs");
                }
            }
        }
    }

    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder("PharmaChemicalQuery{");
        sb.append("ids=").append(ids);
        sb.append(", names=").append(names);
        sb.append(", sources=").append(sources);
        sb.append(", types=").append(types);
        sb.append(", variants=").append(variants);
        sb.append(", locations=").append(locations);
        sb.append(", chromosomes=").append(chromosomes);
        sb.append(", hapolotypes=").append(hapolotypes);
        sb.append(", geneNames=").append(geneNames);
        sb.append(", phenotypes=").append(phenotypes);
        sb.append(", phenotypeTypes=").append(phenotypeTypes);
        sb.append(", confidences=").append(confidences);
        sb.append(", pubmedIds=").append(pubmedIds);
        sb.append('}');
        return sb.toString();
    }

    public List<String> getIds() {
        return ids;
    }

    public PharmaChemicalQuery setIds(List<String> ids) {
        this.ids = ids;
        return this;
    }

    public List<String> getNames() {
        return names;
    }

    public PharmaChemicalQuery setNames(List<String> names) {
        this.names = names;
        return this;
    }

    public List<String> getSources() {
        return sources;
    }

    public PharmaChemicalQuery setSources(List<String> sources) {
        this.sources = sources;
        return this;
    }

    public List<String> getTypes() {
        return types;
    }

    public PharmaChemicalQuery setTypes(List<String> types) {
        this.types = types;
        return this;
    }

    public List<String> getVariants() {
        return variants;
    }

    public PharmaChemicalQuery setVariants(List<String> variants) {
        this.variants = variants;
        return this;
    }

    public List<String> getLocations() {
        return locations;
    }

    public PharmaChemicalQuery setLocations(List<String> locations) {
        this.locations = locations;
        return this;
    }

    public List<String> getChromosomes() {
        return chromosomes;
    }

    public PharmaChemicalQuery setChromosomes(List<String> chromosomes) {
        this.chromosomes = chromosomes;
        return this;
    }

    public List<String> getHapolotypes() {
        return hapolotypes;
    }

    public PharmaChemicalQuery setHapolotypes(List<String> hapolotypes) {
        this.hapolotypes = hapolotypes;
        return this;
    }

    public List<String> getGeneNames() {
        return geneNames;
    }

    public PharmaChemicalQuery setGeneNames(List<String> geneNames) {
        this.geneNames = geneNames;
        return this;
    }

    public List<String> getPhenotypes() {
        return phenotypes;
    }

    public PharmaChemicalQuery setPhenotypes(List<String> phenotypes) {
        this.phenotypes = phenotypes;
        return this;
    }

    public List<String> getPhenotypeTypes() {
        return phenotypeTypes;
    }

    public PharmaChemicalQuery setPhenotypeTypes(List<String> phenotypeTypes) {
        this.phenotypeTypes = phenotypeTypes;
        return this;
    }

    public List<String> getConfidences() {
        return confidences;
    }

    public PharmaChemicalQuery setConfidences(List<String> confidences) {
        this.confidences = confidences;
        return this;
    }

    public List<String> getPubmedIds() {
        return pubmedIds;
    }

    public PharmaChemicalQuery setPubmedIds(List<String> pubmedIds) {
        this.pubmedIds = pubmedIds;
        return this;
    }
}
Loading
Loading