diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java index 28a65d5f93..4ca83aa747 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java @@ -171,9 +171,9 @@ public class LoadCommandOptions { @ParametersDelegate public CommonCommandOptions commonOptions = commonCommandOptions; - @Parameter(names = {"-d", "--data"}, description = "Data model type to be loaded: genome, gene, variation, " - + "conservation, regulation, protein, clinical_variants, repeats, regulatory_pfm, splice_score, pubmed. 'all' loads everything", - required = true, arity = 1) + @Parameter(names = {"-d", "--data"}, description = "Data model type to be loaded: genome, gene, variation," + + " conservation, regulation, protein, clinical_variants, repeats, regulatory_pfm, splice_score, pubmed, pharmacogenomics." + + " 'all' loads everything", required = true, arity = 1) public String data; @Parameter(names = {"-i", "--input"}, required = true, arity = 1, diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index dcd40ba508..8c0d477023 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -38,6 +38,8 @@ import java.util.Collections; import java.util.List; +import static org.opencb.cellbase.lib.EtlCommons.PHARMGKB_DATA; + /** * Created by imedina on 03/02/15. 
*/ @@ -163,6 +165,9 @@ public void execute() { case EtlCommons.PUBMED_DATA: parser = buildPubMed(); break; + case EtlCommons.PHARMACOGENOMICS_DATA: + parser = buildPharmacogenomics(); + break; default: logger.error("Build option '" + buildCommandOptions.data + "' is not valid"); break; @@ -414,4 +419,22 @@ private CellBaseBuilder buildPubMed() throws IOException { CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(pubmedOutputFolder); return new PubMedBuilder(pubmedInputFolder, serializer); } + + private CellBaseBuilder buildPharmacogenomics() throws IOException { + Path inFolder = downloadFolder.resolve(EtlCommons.PHARMACOGENOMICS_DATA); + Path outFolder = buildFolder.resolve(EtlCommons.PHARMACOGENOMICS_DATA); + if (!outFolder.toFile().exists()) { + outFolder.toFile().mkdirs(); + } + + logger.info("Copying PharmGKB version file..."); + if (inFolder.resolve(PHARMGKB_DATA).resolve(EtlCommons.PHARMGKB_VERSION_FILENAME).toFile().exists()) { + Files.copy(inFolder.resolve(PHARMGKB_DATA).resolve(EtlCommons.PHARMGKB_VERSION_FILENAME), + outFolder.resolve(EtlCommons.PHARMGKB_VERSION_FILENAME), + StandardCopyOption.REPLACE_EXISTING); + } + + CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(outFolder); + return new PharmGKBBuilder(inFolder, serializer); + } } diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java index 698f2df033..f8197e6558 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java @@ -102,6 +102,9 @@ public void execute() { case EtlCommons.PUBMED_DATA: downloadFiles.addAll(downloader.downloadPubMed()); break; + case EtlCommons.PHARMACOGENOMICS_DATA: + downloadFiles.addAll(downloader.downloadPharmKGB()); + 
break; default: System.out.println("Value \"" + data + "\" is not allowed for the data parameter. Allowed values" + " are: {genome, gene, gene_disease_association, variation, variation_functional_score," diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java index 2eed03257d..058689b002 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java @@ -80,7 +80,8 @@ public LoadCommandExecutor(AdminCliOptionsParser.LoadCommandOptions loadCommandO EtlCommons.CONSERVATION_DATA, EtlCommons.REGULATION_DATA, EtlCommons.PROTEIN_DATA, EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA, EtlCommons.VARIATION_DATA, EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANTS_DATA, EtlCommons.REPEATS_DATA, - EtlCommons.OBO_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, EtlCommons.PUBMED_DATA}; + EtlCommons.OBO_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA, EtlCommons.PUBMED_DATA, + EtlCommons.PHARMACOGENOMICS_DATA}; } else { loadOptions = loadCommandOptions.data.split(","); } @@ -289,6 +290,11 @@ public void execute() throws CellBaseException { loadPubMed(); break; } + case EtlCommons.PHARMACOGENOMICS_DATA: { + // Load data, create index and update release + loadPharmacogenomica(); + break; + } default: logger.warn("Not valid 'data'. 
We should not reach this point"); break; @@ -546,12 +552,39 @@ private void loadPubMed() throws CellBaseException { // Update release (collection and sources) List sources = Collections.singletonList(pubmedPath.resolve(EtlCommons.PUBMED_VERSION_FILENAME)); - dataReleaseManager.update(dataRelease, "pubmed", EtlCommons.REPEATS_DATA, sources); + dataReleaseManager.update(dataRelease, EtlCommons.PUBMED_DATA, EtlCommons.PUBMED_DATA, sources); } else { logger.warn("PubMed folder {} not found", pubmedPath); } } + private void loadPharmacogenomica() throws IOException, CellBaseException { + Path pharmaPath = input.resolve(EtlCommons.PHARMACOGENOMICS_DATA); + + if (!Files.exists(pharmaPath)) { + logger.warn("Pharmacogenomics folder {} not found to load", pharmaPath); + return; + } + + // Load data + Path pharmaJsonPath = pharmaPath.resolve(EtlCommons.PHARMACOGENOMICS_DATA + ".json.gz"); + logger.info("Loading file '{}'", pharmaJsonPath.toFile().getName()); + try { + loadRunner.load(pharmaJsonPath, EtlCommons.PHARMACOGENOMICS_DATA, dataRelease); + } catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException + | IllegalAccessException | ExecutionException | IOException | InterruptedException | CellBaseException + | LoaderException e) { + logger.error("Error loading file '{}': {}", pharmaJsonPath.toFile().getName(), e.toString()); + } + + // Create index + createIndex(EtlCommons.PHARMACOGENOMICS_DATA); + + // Update release (collection and sources) + List sources = Collections.singletonList(pharmaPath.resolve(EtlCommons.PHARMGKB_VERSION_FILENAME)); + dataReleaseManager.update(dataRelease, EtlCommons.PHARMACOGENOMICS_DATA, EtlCommons.PHARMACOGENOMICS_DATA, sources); + } + private void createIndex(String collection) { if (!createIndexes) { return; diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/api/PharmaChemicalQuery.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/api/PharmaChemicalQuery.java new file 
mode 100644 index 0000000000..c2fec9ceb3 --- /dev/null +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/api/PharmaChemicalQuery.java @@ -0,0 +1,226 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.cellbase.core.api; + +import org.apache.commons.collections4.CollectionUtils; +import org.opencb.cellbase.core.api.query.AbstractQuery; +import org.opencb.cellbase.core.api.query.QueryException; +import org.opencb.cellbase.core.api.query.QueryParameter; + +import java.util.List; +import java.util.Map; + +public class PharmaChemicalQuery extends AbstractQuery { + + @QueryParameter(id = "id") + private List ids; + + @QueryParameter(id = "name") + private List names; + + @QueryParameter(id = "source", allowedValues = {"PharmGKB"}) + private List sources; + + @QueryParameter(id = "types", alias = {"type"}) + private List types; + + @QueryParameter(id = "variants.variantId", alias = {"variant"}) + private List variants; + + @QueryParameter(id = "variants.location", alias = {"location"}) + private List locations; + + @QueryParameter(id = "variants.chromosome", alias = {"chromosome"}) + private List chromosomes; + + @QueryParameter(id = "variants.haplotypes", alias = {"haplotype"}) + private List hapolotypes; + + @QueryParameter(id = "variants.geneNames", alias = {"geneName"}) + private List geneNames; + + @QueryParameter(id = "variants.phenotypes", alias = {"phenotype"}) + private List phenotypes; 
+ + @QueryParameter(id = "variants.phenotypeTypes", alias = {"phenotypeType"}) + private List phenotypeTypes; + + @QueryParameter(id = "variants.confidence", alias = {"confidence"}) + private List confidences; + + @QueryParameter(id = "variants.evidences.pubmed", alias = {"pubmedId"}) + private List pubmedIds; + + public PharmaChemicalQuery() { + } + + public PharmaChemicalQuery(Map params) throws QueryException { + super(params); + + objectMapper.readerForUpdating(this); + objectMapper.readerFor(PharmaChemicalQuery.class); + objectWriter = objectMapper.writerFor(PharmaChemicalQuery.class); + } + + @Override + protected void validateQuery() throws QueryException { + if (CollectionUtils.isNotEmpty(variants)) { + for (String variant : variants) { + if (!variant.startsWith("rs")) { + throw new QueryException("Invalid variant ID: '" + variant + "'; it has to start with rs"); + } + } + } + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("PharmaChemicalQuery{"); + sb.append("ids=").append(ids); + sb.append(", names=").append(names); + sb.append(", sources=").append(sources); + sb.append(", types=").append(types); + sb.append(", variants=").append(variants); + sb.append(", locations=").append(locations); + sb.append(", chromosomes=").append(chromosomes); + sb.append(", hapolotypes=").append(hapolotypes); + sb.append(", geneNames=").append(geneNames); + sb.append(", phenotypes=").append(phenotypes); + sb.append(", phenotypeTypes=").append(phenotypeTypes); + sb.append(", confidences=").append(confidences); + sb.append(", pubmedIds=").append(pubmedIds); + sb.append('}'); + return sb.toString(); + } + + public List getIds() { + return ids; + } + + public PharmaChemicalQuery setIds(List ids) { + this.ids = ids; + return this; + } + + public List getNames() { + return names; + } + + public PharmaChemicalQuery setNames(List names) { + this.names = names; + return this; + } + + public List getSources() { + return sources; + } + + public 
PharmaChemicalQuery setSources(List sources) { + this.sources = sources; + return this; + } + + public List getTypes() { + return types; + } + + public PharmaChemicalQuery setTypes(List types) { + this.types = types; + return this; + } + + public List getVariants() { + return variants; + } + + public PharmaChemicalQuery setVariants(List variants) { + this.variants = variants; + return this; + } + + public List getLocations() { + return locations; + } + + public PharmaChemicalQuery setLocations(List locations) { + this.locations = locations; + return this; + } + + public List getChromosomes() { + return chromosomes; + } + + public PharmaChemicalQuery setChromosomes(List chromosomes) { + this.chromosomes = chromosomes; + return this; + } + + public List getHapolotypes() { + return hapolotypes; + } + + public PharmaChemicalQuery setHapolotypes(List hapolotypes) { + this.hapolotypes = hapolotypes; + return this; + } + + public List getGeneNames() { + return geneNames; + } + + public PharmaChemicalQuery setGeneNames(List geneNames) { + this.geneNames = geneNames; + return this; + } + + public List getPhenotypes() { + return phenotypes; + } + + public PharmaChemicalQuery setPhenotypes(List phenotypes) { + this.phenotypes = phenotypes; + return this; + } + + public List getPhenotypeTypes() { + return phenotypeTypes; + } + + public PharmaChemicalQuery setPhenotypeTypes(List phenotypeTypes) { + this.phenotypeTypes = phenotypeTypes; + return this; + } + + public List getConfidences() { + return confidences; + } + + public PharmaChemicalQuery setConfidences(List confidences) { + this.confidences = confidences; + return this; + } + + public List getPubmedIds() { + return pubmedIds; + } + + public PharmaChemicalQuery setPubmedIds(List pubmedIds) { + this.pubmedIds = pubmedIds; + return this; + } +} diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/api/query/AbstractQuery.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/api/query/AbstractQuery.java 
index 50b0451e5f..c3d2b4e6ae 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/api/query/AbstractQuery.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/api/query/AbstractQuery.java @@ -122,7 +122,10 @@ public void updateParams(ObjectMap objectMap) { public void updateParams(Map uriParams) { classAttributesToType = getClassAttributesToType(); annotations = getAnnotations(); + try { + validateParams(uriParams, classAttributesToType, annotations); + Map objectHashMap = new HashMap<>(); for (Map.Entry> entry : classAttributesToType.entrySet()) { String fieldNameDotNotation = null; @@ -172,11 +175,48 @@ public void updateParams(Map uriParams) { } } objectMapper.updateValue(this, objectHashMap); - } catch (JsonProcessingException e) { + } catch (JsonProcessingException | QueryException e) { throw new IllegalArgumentException(e); } } + private void validateParams(Map uriParams, Map> classAttributesToType, + Map annotations) throws QueryException { + for (String uriParamName : uriParams.keySet()) { + boolean validUriParamName = false; + for (Map.Entry> entry : classAttributesToType.entrySet()) { + String fieldNameDotNotation = null; + String[] fieldAliases = new String[0]; + String fieldNameCamelCase = entry.getKey(); + QueryParameter queryParameter = annotations.get(fieldNameCamelCase); + if (queryParameter != null) { + fieldNameDotNotation = queryParameter.id(); + fieldAliases = queryParameter.alias(); + } + if (fieldNameDotNotation == null) { + // field has no annotation + continue; + } + String s = fieldNameDotNotation.replace("\\.", "\\\\."); + if (uriParamName.equals(s)) { + validUriParamName = true; + break; + } else { + for (String alias : fieldAliases) { + s = alias.replace("\\.", "\\\\."); + if (uriParamName.equals(s)) { + validUriParamName = true; + break; + } + } + } + } + if (!validUriParamName) { + throw new QueryException("Unknown query parameter '" + uriParamName + "'"); + } + } + } + /** * For this Query class, returns a map 
/**
 * Returns the download settings stored in the {@code pharmGKB} property.
 *
 * @return the current {@code pharmGKB} value, as last assigned by {@link #setPharmGKB}
 */
public URLProperties getPharmGKB() {
    return pharmGKB;
}

/**
 * Sets the {@code pharmGKB} download settings.
 *
 * @param pharmGKB value to store
 * @return this instance, so that setters can be chained
 */
public DownloadProperties setPharmGKB(URLProperties pharmGKB) {
    this.pharmGKB = pharmGKB;
    return this;
}
- https://api.pharmgkb.org/v1/download/file/data/clinicalVariants.zip + - https://api.pharmgkb.org/v1/download/file/data/drugLabels.zip + - https://api.pharmgkb.org/v1/download/file/data/relationships.zip species: vertebrates: - id: hsapiens diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index dd2b6caff9..4396f0c2f1 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -48,6 +48,11 @@ public class EtlCommons { public static final String CLINICAL_VARIANTS_DATA = "clinical_variants"; public static final String SPLICE_SCORE_DATA = "splice_score"; + public static final String PHARMACOGENOMICS_DATA = "pharmacogenomics"; + public static final String PHARMGKB_NAME = "PharmGKB"; + public static final String PHARMGKB_DATA = "pharmgkb"; + public static final String PHARMGKB_VERSION_FILENAME = "pharmgkbVersion.json"; + public static final String CLINICAL_VARIANTS_FOLDER = "clinicalVariant"; public static final String CLINVAR_VERSION = "2022.11"; public static final String CLINVAR_DATE = "2022-11"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java new file mode 100644 index 0000000000..1f7a4836ca --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/PharmGKBBuilder.java @@ -0,0 +1,1016 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.cellbase.lib.builders; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectReader; +import org.apache.commons.collections4.CollectionUtils; +import org.apache.commons.lang3.StringUtils; +import org.opencb.biodata.models.core.Xref; +import org.opencb.biodata.models.pharma.*; +import org.opencb.biodata.models.pharma.guideline.BasicObject; +import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; +import org.opencb.commons.utils.FileUtils; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.util.*; +import java.util.stream.Collectors; + +import static org.opencb.cellbase.lib.EtlCommons.*; + +public class PharmGKBBuilder extends CellBaseBuilder { + + private final Path inputDir; + private final Path pharmGKBDir; + + private static final String CHEMICALS_BASENAME = "chemicals"; + private static final String CHEMICALS_TSV_FILENAME = "chemicals.tsv"; + + private static final String VARIANTS_BASENAME = "variants"; + private static final String VARIANTS_TSV_FILENAME = "variants.tsv"; + + private static final String GENES_BASENAME = "genes"; + private static final String GENES_TSV_FILENAME = "genes.tsv"; + + private static final String CLINICAL_ANNOTATIONS_BASENAME = "clinicalAnnotations"; + private static final String CLINICAL_ANNOTATIONS_TSV_FILENAME = "clinical_annotations.tsv"; + private static final String CLINICAL_ANN_ALLELES_TSV_FILENAME = "clinical_ann_alleles.tsv"; + private static final 
String CLINICAL_ANN_EVIDENCE_TSV_FILENAME = "clinical_ann_evidence.tsv"; + + private static final String VARIANT_ANNOTATIONS_BASENAME = "variantAnnotations"; + private static final String VARIANT_ANNOTATIONS_TSV_FILENAME = "var_drug_ann.tsv"; + private static final String PHENOTYPE_ANNOTATIONS_TSV_FILENAME = "var_pheno_ann.tsv"; + private static final String FUNCTIONAL_ANNOTATIONS_TSV_FILENAME = "var_fa_ann.tsv"; + private static final String STUDY_PARAMETERS_TSV_FILENAME = "study_parameters.tsv"; + + private static final String GUIDELINE_ANNOTATIONS_BASENAME = "guidelineAnnotations"; + + private static final String DRUG_LABELS_BASENAME = "drugLabels"; + private static final String DRUG_LABELS_TSV_FILENAME = "drugLabels.tsv"; + + private static final String RELATIONSHIPS_BASENAME = "relationships"; + private static final String RELATIONSHIPS_TSV_FILENAME = "relationships.tsv"; + + private static final String GUIDELINE_ANNOTATION_EVIDENCE_TYPE = "Guideline Annotation"; + private static final String DRUG_LABEL_ANNOTATION_EVIDENCE_TYPE = "Label Annotation"; + private static final String VARIANT_ANNOTATION_EVIDENCE_TYPE = "Variant Drug Annotation"; + private static final String FUNCTIONAL_ANNOTATION_EVIDENCE_TYPE = "Variant Functional Assay Annotation"; + private static final String PHENOTYPE_ANNOTATION_EVIDENCE_TYPE = "Variant Phenotype Annotation"; + + private static final String LOCATION_KEY = "location"; + private static final String CHROMOSOME_KEY = "chrom"; + private static final String POSITION_KEY = "pos"; + + private static final String GENE_ENTITY = "Gene"; + private static final String CHEMICAL_ENTITY = "Chemical"; + + private static final String PHARMGKB_ID_KEY = "PHARMGKB_ID"; + private static final String PHARMGKB_ASSOCIATION_TYPE_KEY = "PHARMGKB_ASSOCIATION_TYPE"; + private static final String PHARMGKB_LEVEL_OVERRIDE_KEY = "PHARMGKB_LEVEL_OVERRIDE"; + private static final String PHARMGKB_LEVEL_MODIFIERS_KEY = "PHARMGKB_LEVEL_MODIFIERS"; + private static 
/**
 * Creates a builder that reads PharmGKB files below {@code inputDir}.
 *
 * @param inputDir   pharmacogenomics input folder; the PharmGKB files are expected in its
 *                   {@code PHARMGKB_DATA} subfolder
 * @param serializer serializer that receives the generated chemicals
 */
public PharmGKBBuilder(Path inputDir, CellBaseFileSerializer serializer) {
    super(serializer);

    this.inputDir = inputDir;
    this.pharmGKBDir = inputDir.resolve(PHARMGKB_DATA);
}

/**
 * Parses the PharmGKB chemical, clinical annotation and gene files and serializes one entry per
 * chemical under the {@code PHARMACOGENOMICS_DATA} name.
 *
 * @throws Exception if an input folder is missing or a file cannot be parsed or written
 */
@Override
public void parse() throws Exception {
    // Check input folder
    FileUtils.checkDirectory(inputDir);

    // PharmGKB
    FileUtils.checkDirectory(pharmGKBDir);
    logger.info("Parsing {} files and building the data models...", PHARMGKB_NAME);

    // Parse chemical file
    Map<String, PharmaChemical> chemicalsMap = parseChemicalFile();

    // Parse clinical annotation files
    parseClinicalAnnotationFiles(chemicalsMap);

    // Parse gene file
    parseGeneFile(chemicalsMap);

    logger.info("Parsing {} files finished.", PHARMGKB_NAME);

    // Generate the pharmacogenomics JSON file
    logger.info("Writing {} JSON file to {} ...", PHARMACOGENOMICS_DATA, serializer.getOutdir());
    try {
        int counter = 0;
        for (PharmaChemical pharmaChemical : chemicalsMap.values()) {
            ((CellBaseFileSerializer) serializer).serialize(pharmaChemical, PHARMACOGENOMICS_DATA);
            if (++counter % 1000 == 0) {
                logger.info("\t\t {} chemicals/drugs written.", counter);
            }
        }
    } finally {
        // Close the serializer even if writing one of the chemicals fails
        serializer.close();
    }
    logger.info("Writing {} JSON file done!", PHARMACOGENOMICS_DATA);
}

/**
 * Reads the chemicals TSV file and builds one {@link PharmaChemical} per data line.
 *
 * <p>Lines with fewer columns than the parser reads, or without ID or name, are skipped with a
 * warning instead of aborting the build.
 *
 * @return chemicals indexed by name (not by ID), because the clinical annotation files refer to
 *         drugs by name; a later line with the same name replaces the earlier one
 * @throws IOException if the chemicals file cannot be read
 */
private Map<String, PharmaChemical> parseChemicalFile() throws IOException {
    // Columns 0 to 8 are read below
    final int minColumns = 9;

    Path chemicalsFile = pharmGKBDir.resolve(CHEMICALS_BASENAME).resolve(CHEMICALS_TSV_FILENAME);
    Map<String, PharmaChemical> chemicalsMap = new HashMap<>();
    try (BufferedReader br = FileUtils.newBufferedReader(chemicalsFile)) {
        // Skip first line, i.e. the header line
        String line = br.readLine();
        while ((line = br.readLine()) != null) {
            String[] fields = line.split("\t", -1);

            // Sanity check
            if (fields.length < minColumns) {
                logger.warn("Chemical line has {} columns, at least {} are expected; line skipped: {}", fields.length,
                        minColumns, line);
                continue;
            }
            if (StringUtils.isEmpty(fields[0]) || StringUtils.isEmpty(fields[1])) {
                logger.warn("Chemical ID or name is missing in chemical line: {}", line);
                continue;
            }

            // 0                      1     2              3            4               5     6                 7       8
            // PharmGKB Accession ID  Name  Generic Names  Trade Names  Brand Mixtures  Type  Cross-references  SMILES  InChI
            // Columns 9 onwards (dosing guideline, annotation counts, top levels, RxNorm/ATC/PubChem
            // identifiers, ...) are not used here
            PharmaChemical pharmaChemical = new PharmaChemical()
                    .setId(fields[0])
                    .setSource(PHARMGKB_NAME)
                    .setName(fields[1])
                    .setSmiles(fields[7])
                    .setInChI(fields[8]);

            // Generic Names
            if (StringUtils.isNotEmpty(fields[2])) {
                pharmaChemical.setGenericNames(stringFieldToList(fields[2]));
            }

            // Trade Names
            if (StringUtils.isNotEmpty(fields[3])) {
                pharmaChemical.setTradeNames(stringFieldToList(fields[3]));
            }

            // Brand Mixtures
            if (StringUtils.isNotEmpty(fields[4])) {
                pharmaChemical.setTradeMixtures(stringFieldToList(fields[4]));
            }

            // Types
            if (StringUtils.isNotEmpty(fields[5])) {
                pharmaChemical.setTypes(Arrays.stream(fields[5].split(",")).map(String::trim).collect(Collectors.toList()));
            }

            // We need to keep the name not the ID to map by drug name in the clinical annotation method
            PharmaChemical previous = chemicalsMap.put(pharmaChemical.getName(), pharmaChemical);
            if (previous != null) {
                logger.warn("Chemical name '{}' is duplicated: {} replaces {}", pharmaChemical.getName(),
                        pharmaChemical.getId(), previous.getId());
            }
        }
    }
    logger.info("Number of Chemical items read {}", chemicalsMap.size());

    return chemicalsMap;
}
+ * @param chemicalsMap + * @throws IOException + */ + private void parseClinicalAnnotationFiles(Map chemicalsMap) throws IOException { + Map variantAnnotationMap = new HashMap<>(); + Map> drugToVariantAnnotationIdMap = new HashMap<>(); + + Map> variantMap = parseVariantFile(); + + // clinical_annotations.tsv + try (BufferedReader br = FileUtils.newBufferedReader(pharmGKBDir.resolve(CLINICAL_ANNOTATIONS_BASENAME) + .resolve(CLINICAL_ANNOTATIONS_TSV_FILENAME))) { + // Skip first line, i.e. the header line + String line = br.readLine(); + while ((line = br.readLine()) != null) { + String[] fields = line.split("\t", -1); + + // Sanity check + if (StringUtils.isEmpty(fields[0])) { + logger.warn("Clinical annotation ID is missing in clinical annotations line: {}", line); + continue; + } + + // 0 1 2 3 4 5 6 + // Clinical Annotation ID Variant/Haplotypes Gene Level of Evidence Level Override Level Modifiers Score + // 7 8 9 10 11 12 13 + // Phenotype Category PMID Count Evidence Count Drug(s) Phenotype(s) Latest History Date (YYYY-MM-DD) URL + // 14 + // Specialty Population + PharmaVariantAnnotation pharmaVariantAnnotation = new PharmaVariantAnnotation() + .setConfidence(fields[3]) + .setScore(fields[6]) + .setUrl(fields[13]) + .setPopulation(fields[14]); + + // Variant or haplotypes + if (StringUtils.isNotEmpty(fields[1])) { + if (isHaplotype(fields[1])) { + // Haplotype + pharmaVariantAnnotation.setHaplotypes(getHaplotypeList(fields[1])); + } else { + // Variant + pharmaVariantAnnotation.setVariantId(fields[1]); + } + } + + // Genes + if (StringUtils.isNotEmpty(fields[2])) { + pharmaVariantAnnotation.setGeneNames(Arrays.asList(fields[2].split(";"))); + } + + if (StringUtils.isNotEmpty(fields[7])) { + pharmaVariantAnnotation.setPhenotypeTypes(Arrays.asList(fields[7].split(";"))); + } + + if (StringUtils.isNotEmpty(fields[11])) { + pharmaVariantAnnotation.setPhenotypes(Arrays.asList(fields[11].split(";"))); + } + + Map attributes = new HashMap<>(); + 
attributes.put(PHARMGKB_ID_KEY, fields[0]); + attributes.put(PHARMGKB_LEVEL_OVERRIDE_KEY, fields[4]); + attributes.put(PHARMGKB_LEVEL_MODIFIERS_KEY, fields[5]); + attributes.put(PHARMGKB_LAST_UPDATE_DATE_KEY, fields[12]); + pharmaVariantAnnotation.setAttributes(attributes); + + // Add some fields from the variant map + if (variantMap.containsKey(pharmaVariantAnnotation.getVariantId())) { + pharmaVariantAnnotation.setLocation((String) variantMap.get(pharmaVariantAnnotation.getVariantId()).get(LOCATION_KEY)); + pharmaVariantAnnotation + .setChromosome((String) variantMap.get(pharmaVariantAnnotation.getVariantId()).get(CHROMOSOME_KEY)); + pharmaVariantAnnotation.setPosition((int) variantMap.get(pharmaVariantAnnotation.getVariantId()).get(POSITION_KEY)); + } else { + logger.warn("Variant {} from clinical annotation not found in the variant map, so chromosome and position are not set", + pharmaVariantAnnotation.getVariantId()); + } + + // Add the annotation to the annotationMap by annotation ID + variantAnnotationMap.put(fields[0], pharmaVariantAnnotation); + + // Process the drug names to update the drugToClinicalAnnotationId map + // This will be used at the end of the method to update the chemical map + if (StringUtils.isNotEmpty(fields[10])) { + // Drugs are separated by semicolon + String[] drugs = fields[10].split(";"); + for (String drug : drugs) { + if (!drugToVariantAnnotationIdMap.containsKey(drug)) { + // Add the drug to the map + drugToVariantAnnotationIdMap.put(drug, new ArrayList<>()); + } + // Add the clinical annotation ID to that drug + drugToVariantAnnotationIdMap.get(drug).add(fields[0]); + } + } + } + } + + // Update the clinical annotation map by parsing the clinical annotation evidences + parseClinicalAnnotationEvidenceFile(variantAnnotationMap); + + // Update the clinical annotation map by parsing the clinical annotation alleles + parseClinicalAnnotationAlleleFile(variantAnnotationMap); + + // Update chemicals map by adding the clinical annotation 
+ for (Map.Entry> entry : drugToVariantAnnotationIdMap.entrySet()) { + if (chemicalsMap.containsKey(entry.getKey())) { + for (String variantAnnotationId : entry.getValue()) { + chemicalsMap.get(entry.getKey()).getVariants().add(variantAnnotationMap.get(variantAnnotationId)); + } + } else { + logger.warn("Drug '{}' not found in the chemicals map", entry.getKey()); + } + } + } + + private Map> parseVariantFile() throws IOException { + Map> variantMap = new HashMap<>(); + // Parse the variant file (i.e., variants.tsv) + Path varPath = pharmGKBDir.resolve(VARIANTS_BASENAME).resolve(VARIANTS_TSV_FILENAME); + try (BufferedReader br = FileUtils.newBufferedReader(varPath)) { + // Skip first line, i.e. the header line + String line = br.readLine(); + while ((line = br.readLine()) != null) { + String[] fields = line.split("\t", -1); + String variantName = fields[1]; + + // Sanity check + if (StringUtils.isEmpty(variantName)) { + logger.warn("Variant name is missing in variant line: {}", line); + continue; + } + + if (variantMap.containsKey(variantName)) { + logger.warn("Variant name is duplicated in variant line: {}", line); + continue; + } + + // 0 1 2 3 4 5 6 + // Variant ID Variant Name Gene IDs Gene Symbols Location Variant Annotation count Clinical Annotation count + // 7 8 9 10 + // Level 1/2 Clinical Annotation count Guideline Annotation count Label Annotation count Synonyms + String location = fields[4]; + if (StringUtils.isEmpty(location)) { + logger.warn("Location is missing for Variant name {}", variantName); + continue; + } + if (!location.startsWith("NC_")) { + logger.warn("Unknown location {}, it has to be a RefSeq ID", location); + continue; + } + Map attrMap = new HashMap<>(); + String[] splits = location.split("[_\\.:]"); + try { + int chrom = Integer.parseInt(splits[1]); + if (chrom >= 1 && chrom <= 22) { + attrMap.put(CHROMOSOME_KEY, String.valueOf(chrom)); + } else if (chrom == 23) { + attrMap.put(CHROMOSOME_KEY, "X"); + } else if (chrom == 24) { + 
attrMap.put(CHROMOSOME_KEY, "Y"); + } else if (chrom == 12920) { + attrMap.put(CHROMOSOME_KEY, "MT"); + } else { + logger.warn("Unknown chromosome {}", chrom); + continue; + } + } catch (NumberFormatException e) { + logger.warn("Error computing chromosome from location {}: {}", location, e.getMessage()); + continue; + } + try { + int position = Integer.parseInt(splits[3]); + attrMap.put(POSITION_KEY, position); + } catch (NumberFormatException e) { + logger.warn("Error computing chromosome position from location {}: {}", location, e.getMessage()); + continue; + } + attrMap.put(LOCATION_KEY, attrMap.get(CHROMOSOME_KEY) + ":" + attrMap.get(POSITION_KEY)); + + // Add it to the variant map + variantMap.put(variantName, attrMap); + } + } + logger.info("Number of variants = {}", variantMap.size()); + + return variantMap; + } + + private void parseClinicalAnnotationEvidenceFile(Map variantAnnotationMap) throws IOException { + // For CellBase, variant annotation correponds to the PharmGKB clinical annotation + // Processing clinical annotation evidences implies to process the variant annotation, guideline annotations, + // drug label annotations, phenotype annotations and functional analysis annotations + Map variantAssociationMap = new HashMap<>(); + Map guidelineAnnotationsMap = parseGuidelineAnnotationFiles(); + Map drugLabelAnnotationsMap = parseDrugLabelAnnotationFile(); + + // Parse study parameters and update the variant, phenotype and functional annotations with the parsed study parameters + parseVariantAnnotationFile(variantAssociationMap); + parsePhenotypeAnnotationFile(variantAssociationMap); + parseFunctionalAnnotationFile(variantAssociationMap); + parseStudyParameterFile(variantAssociationMap); + + // Parse the clinical annotation alleles file (i.e., clinical_ann_alleles.tsv) + Path evidencesPath = pharmGKBDir.resolve(CLINICAL_ANNOTATIONS_BASENAME).resolve(CLINICAL_ANN_EVIDENCE_TSV_FILENAME); + try (BufferedReader br = 
FileUtils.newBufferedReader(evidencesPath)) { + // Skip first line, i.e. the header line + String line = br.readLine(); + while ((line = br.readLine()) != null) { + String[] fields = line.split("\t", -1); + String clinicalAnnotationId = fields[0]; + + // Sanity check + if (StringUtils.isEmpty(clinicalAnnotationId)) { + logger.warn("Clinical annotation ID is missing in clinical annotation evidence line: {}", line); + continue; + } + + // 0 1 2 3 4 5 6 + // Clinical Annotation ID Evidence ID Evidence Type Evidence URL PMID Summary Score + String evidenceId = fields[1]; + String evidenceType = fields[2]; + PharmaClinicalEvidence evidence = new PharmaClinicalEvidence() + .setType(evidenceType) + .setUrl(fields[3]) + .setPubmed(fields[4]) + .setSummary(fields[5]) + .setScore(fields[6]); + + switch (evidenceType) { + case VARIANT_ANNOTATION_EVIDENCE_TYPE: + case PHENOTYPE_ANNOTATION_EVIDENCE_TYPE: + case FUNCTIONAL_ANNOTATION_EVIDENCE_TYPE: { + if (variantAssociationMap.containsKey(evidenceId)) { + evidence.getVariantAssociations().add(variantAssociationMap.get(evidenceId)); + } else { + logger.warn("Evidence ID '{}' of type '{}' not found in the variant association map", evidenceId, evidenceType); + } + break; + } + case GUIDELINE_ANNOTATION_EVIDENCE_TYPE: { + if (guidelineAnnotationsMap.containsKey(evidenceId)) { + evidence.getGuidelineAnnotations().add(guidelineAnnotationsMap.get(evidenceId)); + } else { + logger.warn("Evidence ID '{}' of type '{}' not found in the variant annotations map", + evidenceId, evidenceType); + } + break; + } + case DRUG_LABEL_ANNOTATION_EVIDENCE_TYPE: { + if (drugLabelAnnotationsMap.containsKey(evidenceId)) { + evidence.getDrugLabelAnnotations().add(drugLabelAnnotationsMap.get(evidenceId)); + } else { + logger.warn("Evidence ID '{}' of type '{}' not found in the drug label annotations map", + evidenceId, evidenceType); + } + break; + } + default: { + logger.warn("Unknown evidence type '{}': this evidence is skipped. 
Valid evidence types are: {}", + evidenceType, + StringUtils.join( + Arrays.asList(VARIANT_ANNOTATION_EVIDENCE_TYPE, GUIDELINE_ANNOTATION_EVIDENCE_TYPE, + DRUG_LABEL_ANNOTATION_EVIDENCE_TYPE, FUNCTIONAL_ANNOTATION_EVIDENCE_TYPE, + PHENOTYPE_ANNOTATION_EVIDENCE_TYPE), ",")); + break; + } + } + + Map attributes = new HashMap<>(); + attributes.put(PHARMGKB_ID_KEY, fields[0]); + evidence.setAttributes(attributes); + + // Add evidence to clinical annotation + if (variantAnnotationMap.containsKey(clinicalAnnotationId)) { + variantAnnotationMap.get(clinicalAnnotationId).getEvidences().add(evidence); + } else { + logger.warn("Clinical annotation ID {} from clinical annotation evidence not found in clinical annotations", + clinicalAnnotationId); + } + } + } + } + + private void parseClinicalAnnotationAlleleFile(Map variantAnnotationMap) throws IOException { + // Parse the clinical annotation alleles file (i.e., clinical_ann_alleles.tsv) + Path allelesPath = pharmGKBDir.resolve(CLINICAL_ANNOTATIONS_BASENAME).resolve(CLINICAL_ANN_ALLELES_TSV_FILENAME); + try (BufferedReader br = FileUtils.newBufferedReader(allelesPath)) { + // Skip first line, i.e. 
the header line + String line = br.readLine(); + while ((line = br.readLine()) != null) { + String[] fields = line.split("\t", -1); + // For CellBase, variant annotation is equivalent to PharmGKB clinical annotation + String variantAnnotationId = fields[0]; + + // Sanity check + if (StringUtils.isEmpty(variantAnnotationId)) { + logger.warn("Clinical annotation ID is missing in clinical annotation alleles line: {}", line); + continue; + } + + // 0 1 2 3 + // Clinical Annotation ID Genotype/Allele Annotation Text Allele Function + PharmaClinicalAllele clinicalAllele = new PharmaClinicalAllele() + .setAllele(fields[1]) + .setAnnotation(fields[2]) + .setDescription(fields[3]); + + Map attributes = new HashMap<>(); + attributes.put(PHARMGKB_ID_KEY, variantAnnotationId); + clinicalAllele.setAttributes(attributes); + + // Add allele to clinical annotation + if (variantAnnotationMap.containsKey(variantAnnotationId)) { + variantAnnotationMap.get(variantAnnotationId).getAlleles().add(clinicalAllele); + } else { + logger.warn("Clinical annotation ID {} from clinical annotation alleles file not found in the clinical annotations map", + variantAnnotationId); + } + } + } + } + + private void parseVariantAnnotationFile(Map variantAssociationMap) throws IOException { + // For CellBase, variant association corresponds to PharmGKB variant annotation + // Parse the variant annotation file (i.e., var_drug_ann.tsv) + Path varDrugPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(VARIANT_ANNOTATIONS_TSV_FILENAME); + int counter = 0; + try (BufferedReader br = FileUtils.newBufferedReader(varDrugPath)) { + // Skip first line, i.e. 
the header line + String line = br.readLine(); + while ((line = br.readLine()) != null) { + String[] fields = line.split("\t", -1); + String variantAnnotationId = fields[0]; + + // Sanity check + if (StringUtils.isEmpty(variantAnnotationId)) { + logger.warn("Variant annotation ID is missing in variant annotations line: {}", line); + continue; + } + + // 0 1 2 3 4 5 + // Variant Annotation ID Variant/Haplotypes Gene Drug(s) PMID Phenotype Category + // 6 7 8 9 10 + // Significance Notes Sentence Alleles Specialty Population + PharmaVariantAssociation variantAssociation = new PharmaVariantAssociation() + .setGeneName(fields[2]) + .setPubmed(fields[4]) + .setPhenotypeType(fields[5]) + .setSignificance(fields[6]) + .setDiscussion(fields[7]) + .setDescription(fields[8]) + .setAlleles(fields[9]) + .setPopulation(fields[10]); + + // Variant or haplotypes + if (StringUtils.isNotEmpty(fields[1])) { + if (isHaplotype(fields[1])) { + // Haplotype + variantAssociation.setHaplotypes(getHaplotypeList(fields[1])); + } else { + // Variant + variantAssociation.setVariantId(fields[1]); + } + } + + Map attributes = new HashMap<>(); + attributes.put(PHARMGKB_ID_KEY, fields[0]); + attributes.put(PHARMGKB_ASSOCIATION_TYPE_KEY, VARIANT_ANNOTATION_EVIDENCE_TYPE); + variantAssociation.setAttributes(attributes); + + if (StringUtils.isNotEmpty(fields[3])) { + variantAssociation.setDrugs(stringFieldToList(fields[3])); + } + + // Add the annotation to the variantAnnotationMap by variant and gene + variantAssociationMap.put(variantAnnotationId, variantAssociation); + counter++; + } + } + logger.info("Number of variant annotations = {}", counter); + } + + private Map parseGuidelineAnnotationFiles() throws IOException { + Map guidelineAnnotationMap = new HashMap<>(); + + ObjectMapper mapper = new ObjectMapper(); + ObjectReader objectReader = mapper.readerFor(PharmaGuidelineAnnotation.class); + + // Parse the guideline annotations JSON files + Path guidelinesPath = 
pharmGKBDir.resolve(GUIDELINE_ANNOTATIONS_BASENAME); + FileUtils.checkDirectory(guidelinesPath); + for (File file : Objects.requireNonNull(guidelinesPath.toFile().listFiles())) { + if (file.getName().endsWith("json")) { + PharmaGuidelineAnnotation guidelineAnnotation = objectReader.readValue(file); + if (guidelineAnnotation.getGuideline() != null + && StringUtils.isEmpty(guidelineAnnotation.getGuideline().getId())) { + logger.warn("Guideline ID is missing for guideline filename: {}", file.getName()); + continue; + } + // Add the guideline annotation to the map by guideline ID (= Evidence ID) + guidelineAnnotationMap.put(guidelineAnnotation.getGuideline().getId(), guidelineAnnotation); + } + } + logger.info("Number of guideline annotations = {}", guidelineAnnotationMap.size()); + + return guidelineAnnotationMap; + } + + private Map parseDrugLabelAnnotationFile() throws IOException { + Map drugLabelAnnotationMap = new HashMap<>(); + // Parse the drug labels annotations file (i.e., drugLabels.tsv) + Path drugLabelPath = pharmGKBDir.resolve(DRUG_LABELS_BASENAME).resolve(DRUG_LABELS_TSV_FILENAME); + try (BufferedReader br = FileUtils.newBufferedReader(drugLabelPath)) { + // Skip first line, i.e. 
the header line + String line = br.readLine(); + while ((line = br.readLine()) != null) { + String[] fields = line.split("\t", -1); + String drugLabelId = fields[0]; + + // Sanity check + if (StringUtils.isEmpty(drugLabelId)) { + logger.warn("PharmGKB ID is missing in drug label line: {}", line); + continue; + } + + // 0 1 2 3 4 5 6 7 + // PharmGKB ID Name Source Biomarker Flag Testing Level Has Prescribing Info Has Dosing Info Has Alternate Drug + // 8 9 10 11 12 13 + // Cancer Genome Prescribing Chemicals Genes Variants/Haplotypes Latest History Date (YYYY-MM-DD) + PharmaDrugLabelAnnotation labelAnnotation = new PharmaDrugLabelAnnotation() + .setName(fields[1]) + .setSource(fields[2]) + .setBiomarkerFlag(fields[3]) + .setTestingLevel(fields[4]) + .setPrescribingInfo(fields[5]) + .setDosingInfo(fields[6]) + .setAlternateDrug(fields[7]) + .setCancerGenome(fields[8]); + + Map attributes = new HashMap<>(); + attributes.put(PHARMGKB_ID_KEY, drugLabelId); + labelAnnotation.setAttributes(attributes); + + // Add the drug label annotation to the map by ParhmGKB (= Evidence ID) + drugLabelAnnotationMap.put(drugLabelId, labelAnnotation); + } + } + logger.info("Number of drug label annotations = {}", drugLabelAnnotationMap.size()); + + return drugLabelAnnotationMap; + } + + private void parsePhenotypeAnnotationFile(Map variantAssociationMap) throws IOException { + // Parse the variant annotation file (i.e., var_pheno_ann.tsv) + Path varDrugPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(PHENOTYPE_ANNOTATIONS_TSV_FILENAME); + int counter = 0; + try (BufferedReader br = FileUtils.newBufferedReader(varDrugPath)) { + // Skip first line, i.e. 
the header line + String line = br.readLine(); + while ((line = br.readLine()) != null) { + String[] fields = line.split("\t", -1); + String variantAnnotationId = fields[0]; + + // Sanity check + if (StringUtils.isEmpty(variantAnnotationId)) { + logger.warn("Variant annotation ID is missing in phenotype annotations line: {}", line); + continue; + } + + // 0 1 2 3 4 5 6 7 8 + // Variant Annotation ID Variant/Haplotypes Gene Drug(s) PMID Phenotype Category Significance Notes Sentence + // 9 10 ..... + // Alleles Specialty Population ..... + PharmaVariantAssociation variantAssociation = new PharmaVariantAssociation() + .setGeneName(fields[2]) + .setPubmed(fields[4]) + .setPhenotypeType(fields[5]) + .setSignificance(fields[6]) + .setDiscussion(fields[7]) + .setDescription(fields[8]) + .setAlleles(fields[9]) + .setPopulation(fields[10]); + + // Variant or haplotypes + if (StringUtils.isNotEmpty(fields[1])) { + if (isHaplotype(fields[1])) { + // Haplotype + variantAssociation.setHaplotypes(getHaplotypeList(fields[1])); + } else { + // Variant + variantAssociation.setVariantId(fields[1]); + } + } + + Map attributes = new HashMap<>(); + attributes.put(PHARMGKB_ID_KEY, variantAnnotationId); + attributes.put(PHARMGKB_ASSOCIATION_TYPE_KEY, PHENOTYPE_ANNOTATION_EVIDENCE_TYPE); + variantAssociation.setAttributes(attributes); + + if (StringUtils.isNotEmpty(fields[3])) { + variantAssociation.setDrugs(stringFieldToList(fields[3])); + } + + // Add the annotation to the variantAnnotationMap by variant and gene + variantAssociationMap.put(variantAnnotationId, variantAssociation); + counter++; + } + } + logger.info("Number of phenotype annotations = {}", counter); + } + + private void parseFunctionalAnnotationFile(Map variantAssociationMap) throws IOException { + // Parse the variant annotation file (i.e., var_fa_ann.tsv) + Path varDrugPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(FUNCTIONAL_ANNOTATIONS_TSV_FILENAME); + int counter = 0; + try (BufferedReader br = 
FileUtils.newBufferedReader(varDrugPath)) { + // Skip first line, i.e. the header line + String line = br.readLine(); + while ((line = br.readLine()) != null) { + String[] fields = line.split("\t", -1); + String variantAnnotationId = fields[0]; + + // Sanity check + if (StringUtils.isEmpty(variantAnnotationId)) { + logger.warn("Variant annotation ID is missing in variant annotations line: {}", line); + continue; + } + + // 0 1 2 3 4 5 + // Variant Annotation ID Variant/Haplotypes Gene Drug(s) PMID Phenotype Category + // 6 7 8 9 10 11 ..... + // Significance Notes Sentence Alleles Specialty Population Assay type ..... + PharmaVariantAssociation variantAssociation = new PharmaVariantAssociation() + .setGeneName(fields[2]) + .setPubmed(fields[4]) + .setPhenotypeType(fields[5]) + .setSignificance(fields[6]) + .setDiscussion(fields[7]) + .setDescription(fields[8]) + .setAlleles(fields[9]) + .setPopulation(fields[10]) + .setAssayType(fields[11]); + + // Variant or haplotypes + if (StringUtils.isNotEmpty(fields[1])) { + if (isHaplotype(fields[1])) { + // Haplotype + variantAssociation.setHaplotypes(getHaplotypeList(fields[1])); + } else { + // Variant + variantAssociation.setVariantId(fields[1]); + } + } + + Map attributes = new HashMap<>(); + attributes.put(PHARMGKB_ID_KEY, variantAnnotationId); + attributes.put(PHARMGKB_ASSOCIATION_TYPE_KEY, FUNCTIONAL_ANNOTATION_EVIDENCE_TYPE); + variantAssociation.setAttributes(attributes); + + if (StringUtils.isNotEmpty(fields[3])) { + variantAssociation.setDrugs(stringFieldToList(fields[3])); + } + + // Add the annotation to the variantAnnotationMap by variant and gene + variantAssociationMap.put(variantAnnotationId, variantAssociation); + counter++; + } + } + logger.info("Number of variant annotations = {}", counter); + } + + private void parseStudyParameterFile(Map variantAssociationMap) throws IOException { + Map> studyParametersMap = new HashMap<>(); + // Parse the study parameters file (i.e., study_parameters.tsv) + Path 
studyParamsPath = pharmGKBDir.resolve(VARIANT_ANNOTATIONS_BASENAME).resolve(STUDY_PARAMETERS_TSV_FILENAME); + try (BufferedReader br = FileUtils.newBufferedReader(studyParamsPath)) { + // Skip first line, i.e. the header line + String line = br.readLine(); + while ((line = br.readLine()) != null) { + String[] fields = line.split("\t", -1); + String variantAnnotationId = fields[1]; + + // Sanity check + if (StringUtils.isEmpty(variantAnnotationId)) { + logger.warn("Variant annotation ID is missing in study parameters line: {}", line); + continue; + } + + // 0 1 2 3 4 5 + // Study Parameters ID Variant Annotation ID Study Type Study Cases Study Controls Characteristics + // 6 7 8 9 + // Characteristics Type Frequency In Cases Allele Of Frequency In Cases Frequency In Controls + // 10 11 12 13 14 15 + // Allele Of Frequency In Controls P Value Ratio Stat Type Ratio Stat Confidence Interval Start Confidence Interval Stop + // 16 + // Biogeographical Groups + PharmaStudyParameters studyParams = new PharmaStudyParameters() + .setStudyType(fields[2]) + .setStudyCases(fields[3]) + .setStudyControls(fields[4]) + .setCharacteristics(fields[5]) + .setCharacteristicsType(fields[6]) + .setFrequencyInCases(fields[7]) + .setAlleleOfFrequencyInCases(fields[8]) + .setFrequencyInControls(fields[9]) + .setAlleleOfFrequencyInControls(fields[10]) + .setpValue(fields[11]) + .setRatioStatType(fields[12]) + .setRatioStat(fields[13]) + .setConfidenceIntervalStart(fields[14]) + .setConfidenceIntervalStop(fields[15]) + .setBiogeographicalGroups(fields[16]); + + Map attributes = new HashMap<>(); + attributes.put(PHARMGKB_ID_KEY, variantAnnotationId); + studyParams.setAttributes(attributes); + + // Add the study parameters map + if (!studyParametersMap.containsKey(variantAnnotationId)) { + studyParametersMap.put(variantAnnotationId, new ArrayList<>()); + } + studyParametersMap.get(variantAnnotationId).add(studyParams); + } + } + logger.info("Number of study parameters lines = {}", 
studyParametersMap.size()); + + for (Map.Entry> entry : studyParametersMap.entrySet()) { + if (variantAssociationMap.containsKey(entry.getKey())) { + variantAssociationMap.get(entry.getKey()).setStudyParameters(entry.getValue()); + } else { + logger.warn("Study parameters with variant annotation ID {} not found in variant association map", entry.getKey()); + } + } + } + + private void parseGeneFile(Map chemicalsMap) throws IOException { + // To relate genes with chemicals we will take the relationships from: + // 1. From guidelines (from the members 'relatedGenes' and 'relatedChemicals') + // 2. From the file relationships.tsv (from the relationship Gene - Chemical) + + // Create the PharmGKB gene ID map by chemical name + Map> pgkbGeneIdMapByChemicalName = new HashMap<>(); + + // Create and populate guideline annotations map by PharmGKB gene ID + List guidelineAnnotations = new ArrayList<>(parseGuidelineAnnotationFiles().values()); + Map> guidelineAnnotationMapByPgkbGeneId = new HashMap<>(); + for (PharmaGuidelineAnnotation guidelineAnnotation : guidelineAnnotations) { + if (guidelineAnnotation.getGuideline() != null + && CollectionUtils.isNotEmpty(guidelineAnnotation.getGuideline().getRelatedGenes())) { + for (BasicObject relatedGene : guidelineAnnotation.getGuideline().getRelatedGenes()) { + if (StringUtils.isNotEmpty(relatedGene.getId())) { + String pgkbGeneId = relatedGene.getId(); + if (StringUtils.isNotEmpty(pgkbGeneId)) { + // Populate the guideline annotation map by PharmGKB gene ID + if (!guidelineAnnotationMapByPgkbGeneId.containsKey(pgkbGeneId)) { + guidelineAnnotationMapByPgkbGeneId.put(pgkbGeneId, new ArrayList<>()); + } + guidelineAnnotationMapByPgkbGeneId.get(pgkbGeneId).add(guidelineAnnotation); + + // Populate the PharmGKB gene ID map by chemical names + if (CollectionUtils.isNotEmpty(guidelineAnnotation.getGuideline().getRelatedChemicals())) { + for (BasicObject relatedChemical : guidelineAnnotation.getGuideline().getRelatedChemicals()) { + 
String chemicalName = relatedChemical.getName(); + if (StringUtils.isNotEmpty(chemicalName)) { + if (!pgkbGeneIdMapByChemicalName.containsKey(chemicalName)) { + pgkbGeneIdMapByChemicalName.put(chemicalName, new HashSet<>()); + } + pgkbGeneIdMapByChemicalName.get(chemicalName).add(pgkbGeneId); + } + } + } + } + } + } + } + } + + // Parse the genes file (i.e., genes.tsv) + Map geneAnnotationMapByPgkbGeneId = new HashMap<>(); + Path genesPath = pharmGKBDir.resolve(GENES_BASENAME).resolve(GENES_TSV_FILENAME); + try (BufferedReader br = FileUtils.newBufferedReader(genesPath)) { + // Skip first line, i.e. the header line + String line = br.readLine(); + while ((line = br.readLine()) != null) { + String[] fields = line.split("\t", -1); + String pgkbGeneId = fields[0]; + + // Sanity check + if (StringUtils.isEmpty(pgkbGeneId)) { + logger.warn("PharmGKB accession ID is missing in genes file line: {}", line); + continue; + } + // 0 1 2 3 4 5 6 7 8 + // PharmGKB Accession Id NCBI Gene ID HGNC ID Ensembl Id Name Symbol Alternate Names Alternate Symbols Is VIP + // 9 10 11 12 13 + // Has Variant Annotation Cross-references Has CPIC Dosing Guideline Chromosome Chromosomal Start - GRCh37 + // 14 15 16 + // Chromosomal Stop - GRCh37 Chromosomal Start - GRCh38 Chromosomal Stop - GRCh38 + PharmaGeneAnnotation geneAnnotation = new PharmaGeneAnnotation() + .setId(pgkbGeneId) + .setName(fields[4]); + + List xrefs = new ArrayList<>(); + if (StringUtils.isNotEmpty(fields[1])) { + xrefs.add(new Xref(fields[1], "NCBI", "NCBI gene ID")); + } + if (StringUtils.isNotEmpty(fields[2])) { + xrefs.add(new Xref(fields[2], "HGNC", "HGNC gene ID")); + } + if (StringUtils.isNotEmpty(fields[3])) { + xrefs.add(new Xref(fields[3], "Ensembl", "Ensembl gene ID")); + } + if (StringUtils.isNotEmpty(fields[5])) { + xrefs.add(new Xref(fields[5], "HGNC", "HGNC gene symbol")); + } + if (CollectionUtils.isNotEmpty(xrefs)) { + geneAnnotation.setXrefs(xrefs); + } + + if (StringUtils.isNotEmpty(fields[9])) { + 
geneAnnotation.setHasVariantAnnotation(fields[9].toLowerCase(Locale.ROOT).equals("yes")); + } + + // Set guidelines by getting them from the guideline annotations map + if (guidelineAnnotationMapByPgkbGeneId.containsKey(pgkbGeneId)) { + geneAnnotation.setGuidelineAnnotations(guidelineAnnotationMapByPgkbGeneId.get(pgkbGeneId)); + } + + Map attributes = new HashMap<>(); + attributes.put(PHARMGKB_IS_VIP_KEY, fields[8]); + geneAnnotation.setAttributes(attributes); + + // Add to the map + if (geneAnnotationMapByPgkbGeneId.containsKey(pgkbGeneId)) { + logger.warn("PharmGKB gene ID {} is duplicated in the PharmGKB file {}", pgkbGeneId, GENES_TSV_FILENAME); + } else { + geneAnnotationMapByPgkbGeneId.put(pgkbGeneId, geneAnnotation); + } + } + } + + // Parse the chemical-gene relationships and update the PharmGKB gene ID map byh chemical name + // In addtion, updata the gene annotation map with additional fields (e.g., evidences, pubmeds...) + parseChemicalGeneRelationships(pgkbGeneIdMapByChemicalName, geneAnnotationMapByPgkbGeneId); + + // Finally, update the chemical map with the gene annotation + for (Map.Entry entry : chemicalsMap.entrySet()) { + String chemicalName = entry.getKey(); + if (pgkbGeneIdMapByChemicalName.containsKey(chemicalName)) { + for (String pgkbGeneId : pgkbGeneIdMapByChemicalName.get(chemicalName)) { + if (geneAnnotationMapByPgkbGeneId.containsKey(pgkbGeneId)) { + entry.getValue().getGenes().add(geneAnnotationMapByPgkbGeneId.get(pgkbGeneId)); + } + } + } + } + + logger.info("Number of parsed genes = {}", geneAnnotationMapByPgkbGeneId.size()); + } + + private void parseChemicalGeneRelationships(Map> pgkbGeneIdMapByChemicalName, + Map geneAnnotationMapByPgkbGeneId) throws IOException { + int counter = 0; + // Parse the genes file (i.e., relationships.tsv) + Path relationshipsPath = pharmGKBDir.resolve(RELATIONSHIPS_BASENAME).resolve(RELATIONSHIPS_TSV_FILENAME); + try (BufferedReader br = FileUtils.newBufferedReader(relationshipsPath)) { + // Skip first 
line, i.e. the header line + String line = br.readLine(); + while ((line = br.readLine()) != null) { + String[] fields = line.split("\t", -1); + + // 0 1 2 3 4 5 6 7 8 0 10 + // Entity1_id Entity1_name Entity1_type Entity2_id Entity2_name Entity2_type Evidence Association PK PD PMIDs + String pgkbGeneId = fields[0]; + String entity1Type = fields[2]; + String chemicalName = fields[4]; + String entity2Type = fields[5]; + if (StringUtils.isNotEmpty(pgkbGeneId) && StringUtils.isNotEmpty(entity1Type) && StringUtils.isNotEmpty(chemicalName) + && StringUtils.isNotEmpty(entity2Type) && entity1Type.equals(GENE_ENTITY) && entity2Type.equals(CHEMICAL_ENTITY)) { + if (!pgkbGeneIdMapByChemicalName.containsKey(chemicalName)) { + pgkbGeneIdMapByChemicalName.put(chemicalName, new HashSet<>()); + } + pgkbGeneIdMapByChemicalName.get(chemicalName).add(pgkbGeneId); + + // Update gene annotation map + if (geneAnnotationMapByPgkbGeneId.containsKey(pgkbGeneId)) { + if (StringUtils.isNotEmpty(fields[6])) { + geneAnnotationMapByPgkbGeneId.get(pgkbGeneId).setEvidences(Arrays.asList(fields[6].split(",", -1))); + } + if (StringUtils.isNotEmpty(fields[7])) { + geneAnnotationMapByPgkbGeneId.get(pgkbGeneId).setConfidence(fields[7]); + } + if (StringUtils.isNotEmpty(fields[10])) { + geneAnnotationMapByPgkbGeneId.get(pgkbGeneId).setPubmed(Arrays.asList(fields[10].split(";", -1))); + } + } else { + logger.warn("PhamGKB gene ID {} found in the file {} but not in the file {}", pgkbGeneId, + RELATIONSHIPS_TSV_FILENAME, GENES_TSV_FILENAME); + } + counter++; + } + } + } + logger.info("Number of parsed {}-{} relationships = {}", GENE_ENTITY, CHEMICAL_ENTITY, counter); + } + + private List stringFieldToList(String field) { + if (field.startsWith("\"")) { + return Arrays + .stream(field.replace("\"\"\"", "\"").replace("\"\"", "\"").replace("\", \"", "\",\"").split("\",\"")) + .map(s -> s.replace("\"", "").trim()) + .collect(Collectors.toList()); + } else { + if (field.contains(", ")) { + return Arrays + 
.stream(field.replace(", ", ",").split(",")) + .map(String::trim) + .collect(Collectors.toList()); + } else { + return Collections.singletonList(field); + } + } + } + + private boolean isHaplotype(String value) { + return (!value.startsWith("rs") && value.contains("*")); + } + + private List getHaplotypeList(String value) { + return Arrays.stream(value.split(",")).map(s -> s.trim()).collect(Collectors.toList()); + } +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index e178a3b4af..a4ade6603e 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -28,8 +28,8 @@ import org.opencb.cellbase.core.config.CellBaseConfiguration; import org.opencb.cellbase.core.config.SpeciesConfiguration; import org.opencb.cellbase.core.exception.CellBaseException; -import org.opencb.cellbase.lib.EtlCommons; import org.opencb.cellbase.core.utils.SpeciesUtils; +import org.opencb.cellbase.lib.EtlCommons; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java index 1ddba56aa9..17022cae4b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java @@ -93,4 +93,9 @@ public List downloadPubMed() throws IOException, CellBaseException PubMedDownloadManager manager = new PubMedDownloadManager(species, assembly, outputDirectory, configuration); return manager.download(); } + + public List downloadPharmKGB() throws IOException, CellBaseException, InterruptedException { + PharmGKBDownloadManager manager = new 
PharmGKBDownloadManager(species, assembly, outputDirectory, configuration); + return manager.download(); + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java new file mode 100644 index 0000000000..274f6c62a7 --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/PharmGKBDownloadManager.java @@ -0,0 +1,88 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.lib.download; + +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.DownloadProperties; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.commons.exec.Command; +import org.opencb.commons.utils.FileUtils; + +import java.io.IOException; +import java.net.URL; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import static org.opencb.cellbase.lib.EtlCommons.*; + +public class PharmGKBDownloadManager extends AbstractDownloadManager { + + public PharmGKBDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) + throws IOException, CellBaseException { + super(species, assembly, targetDirectory, configuration); + } + + @Override + public List download() throws IOException, InterruptedException { + logger.info("Downloading PharmGKB files..."); + DownloadProperties.URLProperties pharmGKB = configuration.getDownload().getPharmGKB(); + Path pharmgkbDownloadFolder = downloadFolder.resolve(PHARMACOGENOMICS_DATA).resolve(PHARMGKB_DATA); + Files.createDirectories(pharmgkbDownloadFolder); + + List urls = new ArrayList<>(); + List downloadFiles = new ArrayList<>(); + for (String url : pharmGKB.getFiles()) { + urls.add(url); + + Path downloadedFileName = Paths.get(new URL(url).getPath()).getFileName(); + Path downloadedFilePath = pharmgkbDownloadFolder.resolve(downloadedFileName); + logger.info("Downloading file {} to {}", url, downloadedFilePath); + DownloadFile downloadFile = downloadFile(url, downloadedFilePath.toString()); + downloadFiles.add(downloadFile); + + // Unzip downloaded file + unzip(downloadedFilePath.getParent(), downloadedFileName.toString(), Collections.emptyList(), + pharmgkbDownloadFolder.resolve(downloadedFileName.toString().split("\\.")[0])); + } + + // Save versions + 
saveVersionData(PHARMACOGENOMICS_DATA, PHARMGKB_NAME, pharmGKB.getVersion(), getTimeStamp(), urls, + pharmgkbDownloadFolder.resolve(PHARMGKB_VERSION_FILENAME)); + + return downloadFiles; + } + + private void unzip(Path inPath, String zipFilename, List outFilenames, Path outPath) throws IOException { + // Check zip file exists + FileUtils.checkFile(inPath.resolve(zipFilename)); + + // Unzip files if output dir does NOT exist + if (!outPath.toFile().exists()) { + logger.info("Unzipping {} into {}", zipFilename, outPath); + Command cmd = new Command("unzip -d " + outPath + " " + inPath.resolve(zipFilename)); + cmd.run(); + // Check if expected files exist + for (String outFilename : outFilenames) { + FileUtils.checkFile(outPath.resolve(outFilename)); + } + } + } +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MongoDBAdaptorFactory.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MongoDBAdaptorFactory.java index 4245bb9da2..e120e0ae51 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MongoDBAdaptorFactory.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MongoDBAdaptorFactory.java @@ -91,6 +91,10 @@ public PublicationMongoDBAdaptor getPublicationMongoDBAdaptor() { return new PublicationMongoDBAdaptor(mongoDatastore); } + public PharmacogenomicsMongoDBAdaptor getPharmacogenomicsMongoDBAdaptor() { + return new PharmacogenomicsMongoDBAdaptor(mongoDatastore); + } + @Override public String toString() { final StringBuilder sb = new StringBuilder("MongoDBAdaptorFactory{"); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PharmacogenomicsMongoDBAdaptor.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PharmacogenomicsMongoDBAdaptor.java new file mode 100644 index 0000000000..aabf539eac --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PharmacogenomicsMongoDBAdaptor.java @@ -0,0 +1,136 @@ +/* + * Copyright 
2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.cellbase.lib.impl.core; + +import com.mongodb.client.model.Filters; +import org.bson.Document; +import org.bson.conversions.Bson; +import org.opencb.biodata.models.pharma.PharmaChemical; +import org.opencb.cellbase.core.api.PharmaChemicalQuery; +import org.opencb.cellbase.core.api.query.ProjectionQueryOptions; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.result.CellBaseDataResult; +import org.opencb.cellbase.lib.iterator.CellBaseIterator; +import org.opencb.cellbase.lib.iterator.CellBaseMongoDBIterator; +import org.opencb.commons.datastore.core.QueryOptions; +import org.opencb.commons.datastore.core.QueryParam; +import org.opencb.commons.datastore.mongodb.GenericDocumentComplexConverter; +import org.opencb.commons.datastore.mongodb.MongoDBCollection; +import org.opencb.commons.datastore.mongodb.MongoDBIterator; +import org.opencb.commons.datastore.mongodb.MongoDataStore; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** + * Created by jtarraga on 9/4/23. 
+ */ +public class PharmacogenomicsMongoDBAdaptor extends CellBaseDBAdaptor + implements CellBaseCoreDBAdaptor { + + private static final GenericDocumentComplexConverter CONVERTER; + + static { + CONVERTER = new GenericDocumentComplexConverter<>(PharmaChemical.class); + } + + public PharmacogenomicsMongoDBAdaptor(MongoDataStore mongoDataStore) { + super(mongoDataStore); + + this.init(); + } + + private void init() { + mongoDBCollectionByRelease = buildCollectionByReleaseMap("pharmacogenomics"); + + logger.debug("PharmacogenomicsMongoDBAdaptor initialised"); + } + + @Override + public CellBaseDataResult aggregationStats(PharmaChemicalQuery query) { + logger.error("Not implemented yet"); + return null; + } + + @Override + public List> info(List ids, ProjectionQueryOptions queryOptions, int dataRelease, + String token) throws CellBaseException { + List> results = new ArrayList<>(); + Bson projection = getProjection(queryOptions); + for (String id : ids) { + List orBsonList = new ArrayList<>(ids.size()); + orBsonList.add(Filters.eq("id", id)); + orBsonList.add(Filters.eq("name", id)); + Bson query = Filters.or(orBsonList); + MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, dataRelease); + results.add(new CellBaseDataResult<>(mongoDBCollection.find(query, projection, CONVERTER, new QueryOptions()))); + } + return results; + } + + @Override + public CellBaseIterator iterator(PharmaChemicalQuery query) throws CellBaseException { + Bson bson = parseQuery(query); + QueryOptions queryOptions = query.toQueryOptions(); + Bson projection = getProjection(query); + MongoDBIterator iterator; + MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, query.getDataRelease()); + iterator = mongoDBCollection.iterator(null, bson, projection, CONVERTER, queryOptions); + return new CellBaseMongoDBIterator<>(iterator); + } + + @Override + public CellBaseDataResult distinct(PharmaChemicalQuery query) throws 
CellBaseException { + Bson bsonDocument = parseQuery(query); + MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, query.getDataRelease()); + return new CellBaseDataResult<>(mongoDBCollection.distinct(query.getFacet(), bsonDocument, String.class)); + } + + @Override + public CellBaseDataResult groupBy(PharmaChemicalQuery query) throws CellBaseException { + throw new CellBaseException("Not implemented yet"); + } + + public Bson parseQuery(PharmaChemicalQuery pharmaQuery) { + List andBsonList = new ArrayList<>(); + boolean visited = false; + try { + for (Map.Entry entry : pharmaQuery.toObjectMap().entrySet()) { + String dotNotationName = entry.getKey(); + Object value = entry.getValue(); + switch (dotNotationName) { + case "dataRelease": + case "token": + // do nothing + break; + default: + createAndOrQuery(value, dotNotationName, QueryParam.Type.STRING, andBsonList); + break; + } + } + } catch (IllegalAccessException e) { + e.printStackTrace(); + } + logger.debug("pharmacogenomics parsed query: " + andBsonList); + if (andBsonList.size() > 0) { + return Filters.and(andBsonList); + } else { + return new Document(); + } + } +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/CellBaseManagerFactory.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/CellBaseManagerFactory.java index f89120edb6..ba6e90e150 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/CellBaseManagerFactory.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/CellBaseManagerFactory.java @@ -44,6 +44,7 @@ public class CellBaseManagerFactory { private Map ontologyManagers; private FileManager fileManager; private PublicationManager publicationManager; + private Map pharmacogenomicsManagers; private Map dataReleaseManagers; @@ -65,6 +66,7 @@ public CellBaseManagerFactory(CellBaseConfiguration configuration) { tfManagers = new HashMap<>(); ontologyManagers = new HashMap<>(); 
dataReleaseManagers = new HashMap<>(); + pharmacogenomicsManagers = new HashMap<>(); } private String getMultiKey(String species, String assembly) { @@ -353,4 +355,23 @@ public PublicationManager getPublicationManager() throws CellBaseException { } return publicationManager; } + + public PharmacogenomicsManager getPharmacogenomicsManager(String species) throws CellBaseException { + if (species == null) { + throw new CellBaseException("Species is required."); + } + SpeciesConfiguration.Assembly assembly = SpeciesUtils.getDefaultAssembly(configuration, species); + return getPharmacogenomicsManager(species, assembly.getName()); + } + + public PharmacogenomicsManager getPharmacogenomicsManager(String species, String assembly) throws CellBaseException { + String multiKey = getMultiKey(species, assembly); + if (!pharmacogenomicsManagers.containsKey(multiKey)) { + if (!validateSpeciesAssembly(species, assembly)) { + throw new CellBaseException("Invalid species " + species + " or assembly " + assembly); + } + pharmacogenomicsManagers.put(multiKey, new PharmacogenomicsManager(species, assembly, configuration)); + } + return pharmacogenomicsManagers.get(multiKey); + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/PharmacogenomicsManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/PharmacogenomicsManager.java new file mode 100644 index 0000000000..72c564945d --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/PharmacogenomicsManager.java @@ -0,0 +1,57 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.cellbase.lib.managers; + +import org.opencb.biodata.models.pharma.PharmaChemical; +import org.opencb.cellbase.core.api.PharmaChemicalQuery; +import org.opencb.cellbase.core.api.query.ProjectionQueryOptions; +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.result.CellBaseDataResult; +import org.opencb.cellbase.lib.impl.core.CellBaseCoreDBAdaptor; +import org.opencb.cellbase.lib.impl.core.PharmacogenomicsMongoDBAdaptor; + +import java.util.List; + +public class PharmacogenomicsManager extends AbstractManager implements AggregationApi { + + private PharmacogenomicsMongoDBAdaptor pharmacogenomicsDBAdaptor; + + public PharmacogenomicsManager(String species, CellBaseConfiguration configuration) throws CellBaseException { + this(species, null, configuration); + } + + public PharmacogenomicsManager(String species, String assembly, CellBaseConfiguration configuration) throws CellBaseException { + super(species, assembly, configuration); + + this.init(); + } + + private void init() { + pharmacogenomicsDBAdaptor = dbAdaptorFactory.getPharmacogenomicsMongoDBAdaptor(); + } + + @Override + public CellBaseCoreDBAdaptor getDBAdaptor() { + return pharmacogenomicsDBAdaptor; + } + + public List> info(List ids, ProjectionQueryOptions query, int dataRelease, + String token) throws CellBaseException { + return pharmacogenomicsDBAdaptor.info(ids, query, dataRelease, token); + } +} diff --git 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/VariantManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/VariantManager.java index 0bcc0a6237..ce3e2057c3 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/VariantManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/VariantManager.java @@ -43,7 +43,10 @@ import org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryOptions; -import java.util.*; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Set; import java.util.concurrent.ExecutionException; import java.util.stream.Collectors; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java index 8f2021e475..81592920bb 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java @@ -19,6 +19,7 @@ import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; import org.opencb.biodata.models.core.*; +import org.opencb.biodata.models.pharma.PharmaChemical; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.VariantBuilder; import org.opencb.biodata.models.variant.annotation.ConsequenceTypeMappings; @@ -36,6 +37,7 @@ import org.opencb.cellbase.core.result.CellBaseDataResult; import org.opencb.cellbase.lib.managers.*; import org.opencb.cellbase.lib.variant.VariantAnnotationUtils; +import org.opencb.cellbase.lib.variant.annotation.futures.FuturePharmacogenomicsAnnotator; import org.opencb.cellbase.lib.variant.hgvs.HgvsCalculator; import org.opencb.commons.datastore.core.QueryOptions; import org.slf4j.Logger; @@ 
-68,6 +70,7 @@ public class VariantAnnotationCalculator { private ClinicalManager clinicalManager; private RepeatsManager repeatsManager; private ProteinManager proteinManager; + private PharmacogenomicsManager pharmacogenomicsManager; private int dataRelease; private String token; private Set annotatorSet; @@ -101,6 +104,7 @@ public VariantAnnotationCalculator(String species, String assembly, int dataRele this.proteinManager = cellbaseManagerFactory.getProteinManager(species, assembly); this.clinicalManager = cellbaseManagerFactory.getClinicalManager(species, assembly); this.repeatsManager = cellbaseManagerFactory.getRepeatsManager(species, assembly); + this.pharmacogenomicsManager = cellbaseManagerFactory.getPharmacogenomicsManager(species, assembly); // Check data release this.dataRelease = cellbaseManagerFactory.getDataReleaseManager(species, assembly).checkDataRelease(dataRelease); @@ -505,6 +509,14 @@ private List runAnnotationProcess(List normalizedVar spliceScoreFuture = CACHED_THREAD_POOL.submit(futureSpliceScoreAnnotator); } + FuturePharmacogenomicsAnnotator futurePharmacogenomicsAnnotator = null; + Future>> pharmacogenomicsFuture = null; + if (annotatorSet.contains("pharmacogenomics")) { + futurePharmacogenomicsAnnotator = new FuturePharmacogenomicsAnnotator(normalizedVariantList, QueryOptions.empty(), dataRelease, + pharmacogenomicsManager, logger); + pharmacogenomicsFuture = CACHED_THREAD_POOL.submit(futurePharmacogenomicsAnnotator); + } + // We iterate over all variants to get the rest of the annotations and to create the VariantAnnotation objects Queue variantBuffer = new LinkedList<>(); long startTime = System.currentTimeMillis(); @@ -640,10 +652,12 @@ private List runAnnotationProcess(List normalizedVar if (futureCytobandAnnotator != null) { futureCytobandAnnotator.processResults(cytobandFuture, variantAnnotationList); } - if (futureSpliceScoreAnnotator != null) { futureSpliceScoreAnnotator.processResults(spliceScoreFuture, variantAnnotationList); 
} + if (futurePharmacogenomicsAnnotator != null) { + futurePharmacogenomicsAnnotator.processResults(pharmacogenomicsFuture, variantAnnotationList); + } // Not needed with newCachedThreadPool // fixedThreadPool.shutdown(); @@ -1150,7 +1164,7 @@ private Set getAnnotatorSet(QueryOptions queryOptions) { } else { // 'expression' removed in CB 5.0 annotatorSet = new HashSet<>(Arrays.asList("variation", "traitAssociation", "conservation", "functionalScore", - "consequenceType", "geneDisease", "drugInteraction", "geneConstraints", "mirnaTargets", + "consequenceType", "geneDisease", "drugInteraction", "geneConstraints", "mirnaTargets", "pharmacogenomics", "cancerGeneAssociation", "cancerHotspots", "populationFrequencies", "repeats", "cytoband", "hgvs")); List excludeList = queryOptions.getAsStringList("exclude"); excludeList.forEach(annotatorSet::remove); @@ -1960,4 +1974,3 @@ public VariantNormalizer getNormalizer() { return normalizer; } } - diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/futures/FuturePharmacogenomicsAnnotator.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/futures/FuturePharmacogenomicsAnnotator.java new file mode 100644 index 0000000000..7760dbfe03 --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/futures/FuturePharmacogenomicsAnnotator.java @@ -0,0 +1,211 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.lib.variant.annotation.futures; + +import org.apache.commons.collections4.CollectionUtils; +import org.apache.commons.lang3.StringUtils; +import org.opencb.biodata.models.pharma.*; +import org.opencb.biodata.models.variant.Variant; +import org.opencb.biodata.models.variant.avro.Pharmacogenomics; +import org.opencb.biodata.models.variant.avro.PharmacogenomicsAlleles; +import org.opencb.biodata.models.variant.avro.PharmacogenomicsClinicalAnnotation; +import org.opencb.biodata.models.variant.avro.VariantAnnotation; +import org.opencb.cellbase.core.api.PharmaChemicalQuery; +import org.opencb.cellbase.core.result.CellBaseDataResult; +import org.opencb.cellbase.lib.managers.PharmacogenomicsManager; +import org.opencb.commons.datastore.core.QueryOptions; +import org.slf4j.Logger; + +import java.util.*; +import java.util.concurrent.*; +import java.util.stream.Collectors; + +public class FuturePharmacogenomicsAnnotator implements Callable>> { + private PharmacogenomicsManager pharmacogenomicsManager; + + private List variantList; + private QueryOptions queryOptions; + private int dataRelease; + + private Logger logger; + + public FuturePharmacogenomicsAnnotator(List variantList, QueryOptions queryOptions, int dataRelease, + PharmacogenomicsManager pharmacogenomicsManager, Logger logger) { + this.pharmacogenomicsManager = pharmacogenomicsManager; + + this.variantList = variantList; + this.queryOptions = queryOptions; + this.dataRelease = dataRelease; + + this.logger = logger; + } + + @Override + public List> call() throws Exception { + long startTime = System.currentTimeMillis(); + + List> cellBaseDataResultList = new ArrayList<>(variantList.size()); + + logger.debug("Pharmacogenomics queries..."); + // Want to return only one CellBaseDataResult object per Variant + List includes = new ArrayList<>(); + includes.add("id"); + includes.add("name"); + includes.add("source"); + includes.add("types"); + includes.add("smiles"); + 
includes.add("inChI"); + includes.add("variants.variantId"); + includes.add("variants.geneName"); + includes.add("variants.chromosome"); + includes.add("variants.position"); + includes.add("variants.phenotypes"); + includes.add("variants.phenotypeType"); + includes.add("variants.confidence"); + includes.add("variants.score"); + includes.add("variants.url"); + includes.add("variants.evidences.pubmed"); + includes.add("variants.evidences.variantAssociations.description"); + includes.add("variants.evidences.variantAssociations.discussion"); + includes.add("variants.alleles"); + logger.info("Pharmacogenomics variant annotation/search includes: {}", StringUtils.join(includes, ",")); + for (Variant variant : variantList) { + PharmaChemicalQuery query = new PharmaChemicalQuery(); + query.setLocations(Collections.singletonList(variant.getChromosome() + ":" + variant.getStart())); + query.setDataRelease(dataRelease); + query.setIncludes(includes); + cellBaseDataResultList.add(pharmacogenomicsManager.search(query)); + } + logger.info("Pharmacogenomics queries performance in {} ms for {} variants", System.currentTimeMillis() - startTime, + variantList.size()); + return cellBaseDataResultList; + } + + public void processResults(Future>> pharmaFuture, + List variantAnnotationList) + throws InterruptedException, ExecutionException { + List> pharmaChemicalCellBaseDataResults; + try { + pharmaChemicalCellBaseDataResults = pharmaFuture.get(30, TimeUnit.SECONDS); + } catch (TimeoutException e) { + pharmaFuture.cancel(true); + throw new ExecutionException("Unable to finish pharmacogenomics query on time", e); + } + + if (CollectionUtils.isNotEmpty(pharmaChemicalCellBaseDataResults)) { + for (int i = 0; i < variantAnnotationList.size(); i++) { + CellBaseDataResult pharmaChemicalResult = pharmaChemicalCellBaseDataResults.get(i); + if (pharmaChemicalResult != null && CollectionUtils.isNotEmpty(pharmaChemicalResult.getResults())) { + List pharmacogenomicsList = new ArrayList<>(); + for 
(PharmaChemical pharmaChemical : pharmaChemicalResult.getResults()) { + Pharmacogenomics pharmacogenomics = new Pharmacogenomics(); + // Basic annotation fields + pharmacogenomics.setId(pharmaChemical.getId()); + pharmacogenomics.setName(pharmaChemical.getName()); + pharmacogenomics.setSource(pharmaChemical.getSource()); + pharmacogenomics.setTypes(pharmaChemical.getTypes()); + pharmacogenomics.setSmiles(pharmaChemical.getSmiles()); + pharmacogenomics.setInChI(pharmaChemical.getInChI()); + + // Clinical annotation fields + if (CollectionUtils.isNotEmpty(pharmaChemical.getVariants())) { + String varAnnotChrom = variantAnnotationList.get(i).getChromosome(); + int varAnnotStart = variantAnnotationList.get(i).getStart(); + + List resultClinicalAnnotations = new ArrayList<>(); + + // We must filter out those annotations based on different alternate alleles + // 1. Construct the HOM ALT genotype + final String queryAllele = + variantAnnotationList.get(i).getAlternate() + variantAnnotationList.get(i).getAlternate(); + for (PharmaVariantAnnotation pharmaVariantAnnotation : pharmaChemical.getVariants()) { + // 2. Check the variant is the same + if (!varAnnotChrom.equals(pharmaVariantAnnotation.getChromosome()) + || varAnnotStart != pharmaVariantAnnotation.getPosition()) { + continue; + } + + // 3. Check if the 'alleles' contains the alternate homozygous genotype, or 'null' or '*', + // otherwise go to next annotation + if (CollectionUtils.isNotEmpty(pharmaVariantAnnotation.getAlleles())) { + boolean found = false; + for (PharmaClinicalAllele allele : pharmaVariantAnnotation.getAlleles()) { + if (allele.getAllele().equalsIgnoreCase(queryAllele) + || allele.getAllele().contains("null") + || allele.getAllele().contains("*")) { + found = true; + break; + } + } + if (!found) { + continue; + } + } + + // 4. 
Create, build and add the annotation + PharmacogenomicsClinicalAnnotation resultClinicalAnnotation = new PharmacogenomicsClinicalAnnotation(); + resultClinicalAnnotation.setVariantId(pharmaVariantAnnotation.getVariantId()); + + resultClinicalAnnotation.setGeneNames(pharmaVariantAnnotation.getGeneNames()); + resultClinicalAnnotation.setPhenotypes(pharmaVariantAnnotation.getPhenotypes()); + resultClinicalAnnotation.setPhenotypeTypes(pharmaVariantAnnotation.getPhenotypeTypes()); + resultClinicalAnnotation.setConfidence(pharmaVariantAnnotation.getConfidence()); + resultClinicalAnnotation.setScore(pharmaVariantAnnotation.getScore()); + resultClinicalAnnotation.setUrl(pharmaVariantAnnotation.getUrl()); + + if (CollectionUtils.isNotEmpty(pharmaVariantAnnotation.getEvidences())) { + Set pubmeds = new LinkedHashSet<>(); + Set summaries = new LinkedHashSet<>(); + for (PharmaClinicalEvidence evidence : pharmaVariantAnnotation.getEvidences()) { + if (StringUtils.isNotEmpty(evidence.getPubmed())) { + pubmeds.add(evidence.getPubmed()); + } + if (CollectionUtils.isNotEmpty(evidence.getVariantAssociations())) { + for (PharmaVariantAssociation variantAssociation : evidence.getVariantAssociations()) { + summaries.add(variantAssociation.getDescription()); + summaries.add(variantAssociation.getDiscussion()); + } + } + } + resultClinicalAnnotation.setPubmed(new ArrayList<>(pubmeds)); + resultClinicalAnnotation.setSummary(String.join(" ", summaries)); + } + + if (CollectionUtils.isNotEmpty(pharmaVariantAnnotation.getAlleles())) { + resultClinicalAnnotation.setAlleles(pharmaVariantAnnotation.getAlleles().stream() + .map(a -> new PharmacogenomicsAlleles(a.getAllele(), a.getAnnotation(), a.getDescription())) + .collect(Collectors.toList()) + ); + } + // Add pharmacogenomics clinical annotation to the list + resultClinicalAnnotations.add(resultClinicalAnnotation); + } + // Set pharmacogenomics clinical annotation + pharmacogenomics.setAnnotations(resultClinicalAnnotations); + } + // Add 
pharmacogenomics to the list if at least one annotation for the same variant has been found + if (CollectionUtils.isNotEmpty(pharmacogenomics.getAnnotations())) { + pharmacogenomicsList.add(pharmacogenomics); + } + } + // Set the pharmacogenomics data in the variant annotation + variantAnnotationList.get(i).setPharmacogenomics(pharmacogenomicsList); + } + } + } + } +} diff --git a/cellbase-lib/src/main/resources/mongodb-indexes.json b/cellbase-lib/src/main/resources/mongodb-indexes.json index b2b83cc2f4..de81c7b83b 100644 --- a/cellbase-lib/src/main/resources/mongodb-indexes.json +++ b/cellbase-lib/src/main/resources/mongodb-indexes.json @@ -130,3 +130,18 @@ {"collection": "splice_score", "fields": {"chromosome": 1, "position": 1}, "options": {"background": true}} {"collection": "pubmed", "fields": {"medlineCitation.pmid.content": 1}, "options": {"background": true}} + +{"collection": "pharmacogenomics", "fields": {"_chunkIds": 1}, "options": {"background": true}} +{"collection": "pharmacogenomics", "fields": {"id": 1}, "options": {"background": true}} +{"collection": "pharmacogenomics", "fields": {"name": 1}, "options": {"background": true}} +{"collection": "pharmacogenomics", "fields": {"source": 1}, "options": {"background": true}} +{"collection": "pharmacogenomics", "fields": {"types": 1}, "options": {"background": true}} +{"collection": "pharmacogenomics", "fields": {"variants.variantId": 1}, "options": {"background": true}} +{"collection": "pharmacogenomics", "fields": {"variants.location": 1}, "options": {"background": true}} +{"collection": "pharmacogenomics", "fields": {"variants.chromosome": 1, "variants.position": 1}, "options": {"background": true}} +{"collection": "pharmacogenomics", "fields": {"variants.haplotypes": 1}, "options": {"background": true}} +{"collection": "pharmacogenomics", "fields": {"variants.geneNames": 1}, "options": {"background": true}} +{"collection": "pharmacogenomics", "fields": {"variants.phenotypes": 1}, "options": 
{"background": true}} +{"collection": "pharmacogenomics", "fields": {"variants.phenotypeType": 1}, "options": {"background": true}} +{"collection": "pharmacogenomics", "fields": {"variants.confidence": 1}, "options": {"background": true}} +{"collection": "pharmacogenomics", "fields": {"variants.evidences.pubmed": 1}, "options": {"background": true}} diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/PharmGKBBuilderTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/PharmGKBBuilderTest.java new file mode 100644 index 0000000000..d94e322fd7 --- /dev/null +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/PharmGKBBuilderTest.java @@ -0,0 +1,35 @@ +package org.opencb.cellbase.lib.builders; + +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.*; + +class PharmGKBBuilderTest { + + @Test + public void parseGenericNames() { + String genericName = "\"\"\"(2R,3R,11bR)-9-methoxy-3-(2-methylpropyl)-2,3,4,6,7,11b-hexahydro-1H-benzo[a]quinolizine-2,10-diol\"\", \"\"10-o-desmethyl-alpha-htbz\"\"\""; + List names = stringFieldToList(genericName); + assertTrue(names.size() == 2); + assertTrue(names.contains("(2R,3R,11bR)-9-methoxy-3-(2-methylpropyl)-2,3,4,6,7,11b-hexahydro-1H-benzo[a]quinolizine-2,10-diol")); + assertTrue(names.contains("10-o-desmethyl-alpha-htbz")); + } + + private List stringFieldToList(String field) { + if (field.startsWith("\"")) { + return Arrays.stream(field.replace("\"\"\"", "\"").replace("\"\"", "\"").replace("\", \"", "\",\"").split("\",\"")) + .map(s -> s.replace("\"", "").trim()).collect(Collectors.toList()); + } else { + if (field.contains(", ")) { + return Arrays.stream(field.replace(", ", ",").split(",")).map(String::trim).collect(Collectors.toList()); + } else { + return Collections.singletonList(field); + } + } + } +} \ No newline at end of 
file diff --git a/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/clinical/ClinicalWSServer.java b/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/clinical/ClinicalWSServer.java index 8e057f8b19..3800d3bbbe 100644 --- a/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/clinical/ClinicalWSServer.java +++ b/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/clinical/ClinicalWSServer.java @@ -36,12 +36,10 @@ import static org.opencb.cellbase.core.ParamConstants.*; -/** - * Created by fjlopez on 06/12/16. - */ -@Path("/{apiVersion}/{species}/clinical") + +@Path("/{apiVersion}/{species}/clinical/variant") @Produces(MediaType.APPLICATION_JSON) -@Api(value = "Clinical", description = "Clinical RESTful Web Services API") +@Api(value = "Clinical Variants", description = "Clinical RESTful Web Services API") public class ClinicalWSServer extends GenericRestWSServer { private ClinicalManager clinicalManager; @@ -66,7 +64,7 @@ public ClinicalWSServer(@PathParam("apiVersion") @ApiParam(name = "apiVersion", } @GET - @Path("/variant/search") + @Path("/search") @ApiOperation(httpMethod = "GET", notes = "No more than 1000 objects are allowed to be returned at a time. 
" + DOT_NOTATION_NOTE, value = "Retrieves all clinical variants", response = Variant.class, responseContainer = "QueryResponse") @@ -127,7 +125,7 @@ public Response getAll() { } @GET - @Path("/variant/alleleOriginLabels") + @Path("/alleleOriginLabels") @ApiOperation(httpMethod = "GET", notes = "", value = "Retrieves all available allele origin labels", response = Variant.class, responseContainer = "QueryResponse") @@ -140,7 +138,7 @@ public Response getAlleleOriginLabels() { } @GET - @Path("/variant/modeInheritanceLabels") + @Path("/modeInheritanceLabels") @ApiOperation(httpMethod = "GET", notes = "", value = "Retrieves all available mode of inheritance labels", response = Variant.class, responseContainer = "QueryResponse") @@ -153,7 +151,7 @@ public Response getModeInheritanceLabels() { } @GET - @Path("/variant/clinsigLabels") + @Path("/clinsigLabels") @ApiOperation(httpMethod = "GET", notes = "", value = "Retrieves all available clinical significance labels", response = Variant.class, responseContainer = "QueryResponse") @@ -166,7 +164,7 @@ public Response getClinicalSignificanceLabels() { } @GET - @Path("/variant/consistencyLabels") + @Path("/consistencyLabels") @ApiOperation(httpMethod = "GET", notes = "", value = "Retrieves all available consistency labels", response = Variant.class, responseContainer = "QueryResponse") @@ -179,7 +177,7 @@ public Response getConsistencyLabels() { } @GET - @Path("/variant/type") + @Path("/type") @ApiOperation(httpMethod = "GET", notes = "", value = "Retrieves all available variant types", response = Variant.class, responseContainer = "QueryResponse") diff --git a/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/clinical/PharmacogenomicsWSServer.java b/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/clinical/PharmacogenomicsWSServer.java new file mode 100644 index 0000000000..983a45f739 --- /dev/null +++ 
b/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/clinical/PharmacogenomicsWSServer.java
@@ -0,0 +1,217 @@
+/*
+ * Copyright 2015-2020 OpenCB
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.opencb.cellbase.server.rest.clinical;
+
+import io.swagger.annotations.*;
+import org.opencb.biodata.models.pharma.PharmaChemical;
+import org.opencb.cellbase.core.api.PharmaChemicalQuery;
+import org.opencb.cellbase.core.api.query.QueryException;
+import org.opencb.cellbase.core.exception.CellBaseException;
+import org.opencb.cellbase.core.result.CellBaseDataResult;
+import org.opencb.cellbase.core.utils.SpeciesUtils;
+import org.opencb.cellbase.lib.managers.ClinicalManager;
+import org.opencb.cellbase.lib.managers.PharmacogenomicsManager;
+import org.opencb.cellbase.server.rest.GenericRestWSServer;
+
+import javax.servlet.http.HttpServletRequest;
+import javax.ws.rs.*;
+import javax.ws.rs.core.Context;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+import javax.ws.rs.core.UriInfo;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+
+import static org.opencb.cellbase.core.ParamConstants.*;
+
+
+/**
+ * REST endpoints for pharmacogenomics (chemical/drug) data, rooted at
+ * {@code /{apiVersion}/{species}/clinical/pharmacogenomics}.
+ *
+ * <p>Each endpoint builds a {@link PharmaChemicalQuery} from {@code uriParams}, delegates to the
+ * {@link PharmacogenomicsManager} and converts any exception into an error response.
+ */
+@Path("/{apiVersion}/{species}/clinical/pharmacogenomics")
+@Produces(MediaType.APPLICATION_JSON)
+@Api(value = "Clinical Pharmacogenomics", description = "Clinical RESTful Web Services API")
+public class PharmacogenomicsWSServer extends GenericRestWSServer {
+
+    // NOTE(review): no endpoint in this class reads clinicalManager; it is still obtained in the constructor.
+    private ClinicalManager clinicalManager;
+    private PharmacogenomicsManager pharmacogenomicsManager;
+
+    /**
+     * Creates the resource for a request and obtains the managers for the requested species and assembly.
+     *
+     * @param apiVersion  API version path segment
+     * @param species     species path segment
+     * @param assembly    assembly query parameter; when null or empty (it is "" when omitted because of its
+     *                    {@code @DefaultValue}), the default assembly configured for the species is used
+     * @param dataRelease data release query parameter; declared for the API but not read in this constructor
+     * @param token       data access token query parameter; declared for the API but not read in this constructor
+     * @param uriInfo     request URI context, forwarded to the superclass
+     * @param hsr         servlet request, forwarded to the superclass
+     * @throws QueryException    declared by this constructor; presumably raised by the superclass - TODO confirm
+     * @throws IOException       declared by this constructor; presumably raised by the superclass - TODO confirm
+     * @throws CellBaseException declared by this constructor; exact origin not visible here - TODO confirm
+     */
+    public PharmacogenomicsWSServer(
+            @PathParam("apiVersion") @ApiParam(name = "apiVersion", value = VERSION_DESCRIPTION, defaultValue = DEFAULT_VERSION)
+                    String apiVersion,
+            @PathParam("species") @ApiParam(name = "species", value = SPECIES_DESCRIPTION) String species,
+            @ApiParam(name = "assembly", value = ASSEMBLY_DESCRIPTION) @DefaultValue("") @QueryParam("assembly") String assembly,
+            @ApiParam(name = "dataRelease", value = DATA_RELEASE_DESCRIPTION) @DefaultValue("0") @QueryParam("dataRelease")
+                    int dataRelease,
+            @ApiParam(name = "token", value = DATA_ACCESS_TOKEN_DESCRIPTION) @DefaultValue("") @QueryParam("token") String token,
+            @Context UriInfo uriInfo, @Context HttpServletRequest hsr)
+            throws QueryException, IOException, CellBaseException {
+        super(apiVersion, species, uriInfo, hsr);
+        // An omitted query parameter arrives as "" (not null), so both cases must fall back to the default assembly.
+        if (assembly == null || assembly.isEmpty()) {
+            assembly = SpeciesUtils.getDefaultAssembly(cellBaseConfiguration, species).getName();
+        }
+
+        clinicalManager = cellBaseManagerFactory.getClinicalManager(species, assembly);
+        pharmacogenomicsManager = cellBaseManagerFactory.getPharmacogenomicsManager(species, assembly);
+    }
+
+    /**
+     * Searches chemicals/drugs using the filters supplied as query parameters.
+     *
+     * @return an OK response wrapping the manager's search result, or an error response if anything fails
+     */
+    @GET
+    @Path("/search")
+    @ApiOperation(httpMethod = "GET", notes = "No more than 1000 objects are allowed to be returned at a time. "
+            + DOT_NOTATION_NOTE,
+            value = "Retrieves all chemicals/drugs", response = PharmaChemical.class, responseContainer = "QueryResponse")
+    @ApiImplicitParams({
+            @ApiImplicitParam(name = "count", value = COUNT_DESCRIPTION,
+                    required = false, dataType = "boolean", paramType = "query", defaultValue = "false",
+                    allowableValues = "false,true"),
+            @ApiImplicitParam(name = "name", dataType = "java.util.List", paramType = "query",
+                    value = "List of chemical/drug names, e.g.: warfarin. In order to get the list of chemical or drug names,"
+                            + " please, call the endpoint pharmacogenomics/distinct?field=names"),
+            @ApiImplicitParam(name = "type", dataType = "java.util.List", paramType = "query",
+                    value = "List of chemical/drug types, e.g.: Drug,Metabolite. In order to get the list of chemical or drug types,"
+                            + " please, call the endpoint pharmacogenomics/distinct?field=types"),
+            @ApiImplicitParam(name = "variant", dataType = "java.util.List", paramType = "query",
+                    value = "List of variants (dbSNP IDs), e.g.: rs1429376,rs11191561. In order to get the list of variant IDs,"
+                            + " please, call the endpoint pharmacogenomics/distinct?field=variants.variantId"),
+            @ApiImplicitParam(name = "haplotype", dataType = "java.util.List", paramType = "query",
+                    value = "List of haplotypes, e.g.: CYP2A6*1. In order to get the list of haplotypes,"
+                            + " please, call the endpoint pharmacogenomics/distinct?field=variants.haplotypes"),
+            @ApiImplicitParam(name = "geneName", dataType = "java.util.List", paramType = "query",
+                    value = "List of gene names, e.g.: NT5C2,VKORC1. In order to get the list of gene names,"
+                            + " please, call the endpoint pharmacogenomics/distinct?field=variants.geneNames"),
+            @ApiImplicitParam(name = "location", dataType = "java.util.List", paramType = "query",
+                    value = "List of chromosomic coordinates in the format: chromosome:position, e.g.: 10:103109774"),
+            @ApiImplicitParam(name = "phenotype", dataType = "java.util.List", paramType = "query",
+                    value = "List of phenotypes, e.g.: Hemorrhage,Thrombosis. In order to get the list of phenotype values,"
+                            + " please, call the endpoint pharmacogenomics/distinct?field=variants.phenotypes"),
+            @ApiImplicitParam(name = "phenotypeType", dataType = "java.util.List", paramType = "query",
+                    value = "List of phenotype categories (i.e., association phenotype), e.g.: Dosage,Toxicity. In order to get the"
+                            + " list of phenotype category values, please, call the endpoint"
+                            + " pharmacogenomics/distinct?field=variants.phenotypeTypes"),
+            @ApiImplicitParam(name = "confidence", dataType = "java.util.List", paramType = "query",
+                    value = "List of confidence values. Valid values: 1A, 1B, 2A, 2B, 3, 4"),
+            @ApiImplicitParam(name = "pubmedId", dataType = "java.util.List", paramType = "query",
+                    value = "List of evidence PubMed IDs, e.g.: 14765194"),
+            @ApiImplicitParam(name = "exclude", value = EXCLUDE_DESCRIPTION,
+                    required = false, dataType = "java.util.List", paramType = "query"),
+            @ApiImplicitParam(name = "include", value = INCLUDE_DESCRIPTION,
+                    required = false, dataType = "java.util.List", paramType = "query"),
+            @ApiImplicitParam(name = "sort", value = SORT_DESCRIPTION,
+                    required = false, dataType = "java.util.List", paramType = "query"),
+            @ApiImplicitParam(name = "order", value = ORDER_DESCRIPTION,
+                    required = false, dataType = "string", paramType = "query",
+                    defaultValue = "", allowableValues = "ASCENDING,DESCENDING"),
+            @ApiImplicitParam(name = "limit", value = LIMIT_DESCRIPTION,
+                    required = false, defaultValue = DEFAULT_LIMIT, dataType = "integer", paramType = "query"),
+            @ApiImplicitParam(name = "skip", value = SKIP_DESCRIPTION,
+                    required = false, defaultValue = DEFAULT_SKIP, dataType = "integer", paramType = "query")
+    })
+    public Response getAll() {
+        try {
+            PharmaChemicalQuery query = new PharmaChemicalQuery(uriParams);
+            CellBaseDataResult<PharmaChemical> queryResults = pharmacogenomicsManager.search(query);
+            return createOkResponse(queryResults);
+        } catch (Exception e) {
+            return createErrorResponse(e);
+        }
+    }
+
+    /**
+     * Returns the records of the requested chemicals/drugs.
+     *
+     * @param chemicals comma-separated chemical/drug names taken from the path
+     * @return an OK response wrapping the list of results returned by the manager, or an error response if anything fails
+     */
+    @GET
+    @Path("/{chemicals}/info")
+    @ApiOperation(httpMethod = "GET", value = "Get information about the specified chemical(s) or drug(s)",
+            response = PharmaChemical.class, responseContainer = "QueryResponse")
+    @ApiImplicitParams({
+            @ApiImplicitParam(name = "exclude", value = EXCLUDE_DESCRIPTION,
+                    required = false, dataType = "java.util.List", paramType = "query"),
+            @ApiImplicitParam(name = "include", value = INCLUDE_DESCRIPTION,
+                    required = false, dataType = "java.util.List", paramType = "query")
+    })
+    public Response getInfo(@PathParam("chemicals") @ApiParam(name = "chemicals", value = "Chemical/drug names", required = true)
+                                    String chemicals) {
+        try {
+            PharmaChemicalQuery pharmaQuery = new PharmaChemicalQuery(uriParams);
+            List<CellBaseDataResult<PharmaChemical>> queryResults = pharmacogenomicsManager.info(
+                    Arrays.asList(chemicals.split(",")), pharmaQuery, getDataRelease(), getToken());
+            return createOkResponse(queryResults);
+        } catch (Exception e) {
+            return createErrorResponse(e);
+        }
+    }
+
+    /**
+     * Returns the distinct values stored for one field.
+     *
+     * @param field name of the field whose distinct values are requested, e.g. {@code variants.location}
+     * @return an OK response wrapping the manager's distinct result, or an error response if anything fails
+     */
+    @GET
+    @Path("/distinct")
+    @ApiOperation(httpMethod = "GET", notes = "Gets a unique list of values, e.g. variants.location",
+            value = "Get a unique list of values for a given field.")
+    @ApiImplicitParams({
+            @ApiImplicitParam(name = "type", value = "List of types",
+                    required = false, dataType = "java.util.List", paramType = "query"),
+            // NOTE(review): /search names its gene filter "geneName"; confirm which name PharmaChemicalQuery expects.
+            @ApiImplicitParam(name = "gene", value = "List of gene names",
+                    required = false, dataType = "java.util.List", paramType = "query")
+    })
+    public Response getUniqueValues(@QueryParam("field") @ApiParam(name = "field", required = true,
+            value = "Name of column to return, e.g. variants.location") String field) {
+        try {
+            copyToFacet("field", field);
+            PharmaChemicalQuery query = new PharmaChemicalQuery(uriParams);
+            CellBaseDataResult<String> queryResults = pharmacogenomicsManager.distinct(query);
+            return createOkResponse(queryResults);
+        } catch (Exception e) {
+            return createErrorResponse(e);
+        }
+    }
+
+}