diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 409c66ba1e..7a6605dacd 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -125,13 +125,14 @@ download: # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2021-07.xml.gz # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-02.xml.gz # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-11.xml.gz - host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2023-12.xml.gz - version: "2023-12-01" + host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/RCV_xml_old_format/ClinVarFullRelease_2024-05.xml.gz + version: 2024-05 clinvarVariation: # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2021-07.xml.gz # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-02.xml.gz # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-11.xml.gz - host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2023-12.xml.gz + host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/VCV_xml_old_format/ClinVarVariationRelease_2024-05.xml.gz + version: 2024-05 clinvarSummary: host: http://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz version: "2023-12-01" @@ -158,10 +159,10 @@ download: genomicSuperDups: host: http://hgdownload.cse.ucsc.edu/goldenPath gwasCatalog: -# host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv -# version: "1.0.2 associations_e106_r2022-05-17" - host: ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/2023/12/21/gwas-catalog-associations.tsv - version: "23-12-21" + #host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv + host: "https://ftp.ebi.ac.uk/pub/databases/gwas/releases/2024/05/20/gwas-catalog-associations_ontology-annotated.tsv" + #version: "1.0.2 associations_e106_r2022-05-17" + version: "2024-05-20" hpo: ## Downlaod manually from here now: https://hpo.jax.org/app/data/annotations host: https://ci.monarchinitiative.org/view/hpo/job/hpo.annotations/lastSuccessfulBuild/artifact/rare-diseases/util/annotation/phenotype_to_genes.txt diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 6330cb71a3..693c8da9ab 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -56,9 +56,9 @@ public class EtlCommons { public static final String PHARMGKB_VERSION_FILENAME = "pharmgkbVersion.json"; public static final String CLINICAL_VARIANTS_FOLDER = "clinicalVariant"; - public static final String CLINVAR_VERSION = "2022.11"; - public static final String CLINVAR_DATE = "2022-11"; - public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2022-11.xml.gz"; + public static final String CLINVAR_VERSION = "2024-05"; + public static final String CLINVAR_DATE = "2024-05"; + public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2024-05.xml.gz"; public static final String CLINVAR_EFO_FILE = "ClinVar_Traits_EFO_Names.csv"; public static final String CLINVAR_SUMMARY_FILE = "variant_summary.txt.gz"; public static final String CLINVAR_VARIATION_ALLELE_FILE = "variation_allele.txt.gz"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java index a31bd8d5e6..8b88f821f6 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java @@ -210,7 +210,7 @@ private void printSummary() { } private boolean updateRocksDB(SequenceLocation sequenceLocation, String variationId, String[] lineFields, - String mateVariantString, Map traitsToEfoTermsMap) + String mateVariantString, Map traitsToEfoTermsMap) throws RocksDBException, IOException { // More than one variant being returned from the normalisation process would mean it's and MNV which has been // decomposed @@ -266,13 +266,34 @@ private boolean updateRocksDB(AlleleLocationData alleleLocationData, PublicSetTy } // parse RCVs - String accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc(); - String clinicalSignficanceDescription = publicSet.getReferenceClinVarAssertion() - .getClinicalSignificance() - .getDescription(); - String reviewStatusName = publicSet.getReferenceClinVarAssertion().getClinicalSignificance() - .getReviewStatus().name(); - List getObservedIn = publicSet.getReferenceClinVarAssertion().getObservedIn(); + String accession = null; + try { + accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc(); + } catch (Exception e) { + logger.warn("Error getting accession. Ignore error and leave it as null.", e); + } + String clinicalSignficanceDescription = null; + try { + clinicalSignficanceDescription = publicSet.getReferenceClinVarAssertion() + .getClinicalSignificance() + .getDescription(); + } catch (Exception e) { + logger.warn("Error getting clinical significance description. Ignore error and leave it as null.", e); + } + String reviewStatusName = null; + try { + reviewStatusName = publicSet.getReferenceClinVarAssertion().getClinicalSignificance() + .getReviewStatus().name(); + } catch (Exception e) { + logger.warn("Error getting review status name. Ignore error and leave it as null.", e); + } + List getObservedIn = null; + try { + getObservedIn = publicSet.getReferenceClinVarAssertion().getObservedIn(); + } catch (Exception e) { + logger.warn("Error getting observed in. Ignore error and leave it as null.", e); + } + addNewEntries(variantAnnotation, publicSet, alleleLocationData.getAlleleId(), mateVariantString, clinicalHaplotypeString, traitsToEfoTermsMap, accession, clinicalSignficanceDescription, reviewStatusName, getObservedIn); @@ -388,7 +409,7 @@ private void addNewEntries(VariantAnnotation variantAnnotation, PublicSetType pu Map traitsToEfoTermsMap, String accession, String clinicalSignficanceDescription, String reviewStatusName, List getObservedIn) - throws JsonProcessingException { + throws JsonProcessingException { List additionalProperties = new ArrayList<>(3); EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, CLINVAR_VERSION, CLINVAR_DATE); @@ -544,7 +565,7 @@ private ModeOfInheritance getModeOfInheritance(String modeOfInheritance) { private List getGenomicFeature(PublicSetType publicSet, String alleleId) { if (publicSet.getReferenceClinVarAssertion().getMeasureSet() != null) { return getGenomicFeature(publicSet.getReferenceClinVarAssertion().getMeasureSet()); - // No measureSet means there must be genotypeSet + // No measureSet means there must be genotypeSet } else if (publicSet.getReferenceClinVarAssertion().getGenotypeSet() != null) { for (MeasureSetType measureSet : publicSet.getReferenceClinVarAssertion().getGenotypeSet().getMeasureSet()) { if (measureSet.getMeasure() != null) { @@ -596,7 +617,7 @@ private List getHeritableTrait(PublicSetType publicSet, Map 0) { logger.warn("ClinVar record found " + publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc() + " with no preferred trait provided. Arbitrarily selecting first one: {}", trait.getName() .get(0).getElementValue().getValue()); return trait.getName().get(0).getElementValue().getValue(); - // No trait name provided at all + // No trait name provided at all } else { throw new IllegalArgumentException("ClinVar record found " + publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc() diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java index bbe33017fd..2b34f86a50 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java @@ -74,13 +74,12 @@ public abstract class ClinicalIndexer { protected VariantNormalizer normalizer; public ClinicalIndexer(Path genomeSequenceFilePath) throws IOException { - // Forcing decomposition here in all cases - assuming the way CellBase stores clinical variants from here - // onwards will be decomposed and Adaptors will deal with phased/no-phased queries + // Use the same OpenCGA normalization parameters VariantNormalizer.VariantNormalizerConfig variantNormalizerConfig = (new VariantNormalizer.VariantNormalizerConfig()) .setReuseVariants(true) - .setNormalizeAlleles(false) - .setDecomposeMNVs(true); + .setNormalizeAlleles(true) + .setDecomposeMNVs(false); if (genomeSequenceFilePath != null) { logger.info("Enabling left aligning by using sequence at {}", genomeSequenceFilePath.toString()); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java index f8d2f16d15..a26d18c60c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java @@ -41,7 +41,7 @@ public class CosmicIndexer extends ClinicalIndexer { private Pattern mutationGRCh37GenomePositionPattern; private Pattern snvPattern; - private static final String COSMIC_VERSION = "v95"; + private static final String COSMIC_VERSION = "v99"; private static final int GENE_NAMES_COLUMN = 0; private static final int HGNC_COLUMN = 3; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/GwasIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/GwasIndexer.java index 2b4f2e4d8b..0fe3b0f115 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/GwasIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/GwasIndexer.java @@ -31,6 +31,7 @@ import java.nio.file.Path; import java.text.NumberFormat; import java.util.*; +import java.util.stream.Collectors; public class GwasIndexer extends ClinicalIndexer { @@ -46,6 +47,8 @@ public class GwasIndexer extends ClinicalIndexer { private int gwasLinesNotFoundInDbsnp; private int invalidVariantRecords; + private int lineCounter = 0; + public GwasIndexer(Path gwasFile, Path dbSnpTabixFile, Path genomeSequenceFilePath, String assembly, RocksDB rdb) throws IOException { super(genomeSequenceFilePath); @@ -56,36 +59,31 @@ public GwasIndexer(Path gwasFile, Path dbSnpTabixFile, Path genomeSequenceFilePa } public void index() throws RocksDBException, IOException { - logger.info("Parsing GWAS catalog file ..."); - - BufferedReader inputReader = null; - TabixReader dbsnpTabixReader = null; - - try { - logger.info("Opening GWAS catalog file " + gwasFile + " ..."); - inputReader = new BufferedReader(new FileReader(gwasFile.toFile())); + try (BufferedReader inputReader = new BufferedReader(new FileReader(gwasFile.toFile())); + TabixReader dbsnpTabixReader = new TabixReader(dbSnpTabixFile.toString())) { logger.info("Ignoring GWAS catalog file header line ..."); - String line = inputReader.readLine(); + inputReader.readLine(); + ++lineCounter; + Map chromosomeMap = buildChromosomeMap(dbsnpTabixReader); Map gwasMap = new HashMap<>(); - logger.info("Opening dbSNP tabix file " + dbSnpTabixFile + " ..."); - dbsnpTabixReader = new TabixReader(dbSnpTabixFile.toString()); long processedGwasLines = 0; - logger.info("Parsing GWAS catalog file ..."); + logger.info("Parsing GWAS catalog file {} ...", gwasFile); + String line; while ((line = inputReader.readLine()) != null) { + ++lineCounter; if (!line.isEmpty()) { processedGwasLines++; if (processedGwasLines % 10000 == 0) { logger.info("{} lines parsed", processedGwasLines); } - processGwasCatalogLine(line.split("\t"), dbsnpTabixReader, gwasMap); + processGwasCatalogLine(line.split("\t"), dbsnpTabixReader, gwasMap, chromosomeMap); } } - dbsnpTabixReader.close(); logger.info("Updating clinical variant annotation..."); long counter = 0; @@ -118,16 +116,9 @@ public void index() throws RocksDBException, IOException { rdb.put(entry.getKey().getBytes(), jsonObjectWriter.writeValueAsBytes(variantAnnotation)); } this.printSummary(processedGwasLines, gwasMap); - } catch (RocksDBException | IOException e) { + } catch (RocksDBException | IOException e) { logger.error("Error reading/writing from/to the RocksDB index while indexing GWAS catalog file"); throw e; - } finally { - if (inputReader != null) { - inputReader.close(); - } - if (dbsnpTabixReader != null) { - dbsnpTabixReader.close(); - } } } @@ -184,13 +175,14 @@ significant digit (for example, a published p-value of 4.8 x 10-7 is rounded to 37 GENOTYPING_TECHNOLOGY* +: Genotyping technology/ies used in this study, with additional array information (ex. Immunochip or Exome array) in brackets. */ - private void processGwasCatalogLine(String[] values, TabixReader dbsnpTabixReader, Map gwasMap) { + private void processGwasCatalogLine(String[] values, TabixReader dbsnpTabixReader, Map gwasMap, + Map chromosomeMap) throws IOException { Integer start = parseStart(values); if (start != null) { String chromosome = parseChromosome(values[11]); if (StringUtils.isNotEmpty(chromosome)) { String snpId = "rs" + values[23].trim(); - String[] refAndAlt = getRefAndAltFromDbsnp(chromosome, start, snpId, dbsnpTabixReader); + String[] refAndAlt = getRefAndAltFromDbsnp(chromosome, start, snpId, dbsnpTabixReader, chromosomeMap); if (refAndAlt != null) { // Create variant Variant variant; @@ -270,21 +262,27 @@ private void processGwasCatalogLine(String[] values, TabixReader dbsnpTabixReade // Scores management GwasAssociationStudyTraitScores scores = new GwasAssociationStudyTraitScores(); - try { - scores.setPValue(Double.parseDouble(values[27])); - } catch (NumberFormatException e) { -// logger.warn(e.getMessage() + ". Parsing pValue: " + values[27]); + if (StringUtils.isNotEmpty(values[27])) { + try { + scores.setPValue(Double.parseDouble(values[27])); + } catch (NumberFormatException e) { + logger.warn(e.getMessage() + ". Parsing pValue: " + values[27]); + } } - try { - scores.setPValueMlog(Double.parseDouble(values[28])); - } catch (NumberFormatException e) { -// logger.warn(e.getMessage() + ". Parsing pValue mlog: " + values[28]); + if (StringUtils.isNotEmpty(values[28])) { + try { + scores.setPValueMlog(Double.parseDouble(values[28])); + } catch (NumberFormatException e) { + logger.warn(e.getMessage() + ". Parsing pValue mlog: " + values[28]); + } } scores.setPValueText(values[29]); - try { - scores.setOrBeta(Double.parseDouble(values[30])); - } catch (NumberFormatException e) { -// logger.warn(e.getMessage() + ". Parsing Odd or beta: " + values[30]); + if (StringUtils.isNotEmpty(values[30])) { + try { + scores.setOrBeta(Double.parseDouble(values[30])); + } catch (NumberFormatException e) { + logger.warn(e.getMessage() + ". Parsing Odd or beta: " + values[30]); + } } scores.setPercentCI(values[31]); @@ -301,15 +299,15 @@ private void processGwasCatalogLine(String[] values, TabixReader dbsnpTabixReade gwasMap.put(key, gwas); } } else { -// logger.warn("Variant not found in dbSNP " + snpId + ". Line: " + StringUtils.join(values, "\t\t\t")); + logger.warn("dbSNP {} not found. Line: {}", snpId, lineCounter); gwasLinesNotFoundInDbsnp++; } } else { -// logger.warn("Invalid chromosome " + chromosome + ". Line: " + StringUtils.join(values, "\t\t\t")); + logger.warn("Invalid chromosome {}. Line: {}", chromosome, lineCounter); invalidChromosome++; } } else { -// logger.warn("Invalid position " + start + ". Line: " + StringUtils.join(values, "\t\t\t")); + logger.warn("Invalid position {}. Line: {}", start, lineCounter); invalidStartRecords++; } } @@ -342,6 +340,39 @@ private String parseChromosome(String chromosome) { return transformedChromosome; } + private Map buildChromosomeMap(TabixReader dbsnpTabixReader) { + List chroms = dbsnpTabixReader.getChromosomes().stream().filter(name -> name.startsWith("NC_")) + .collect(Collectors.toList()); + + Map chromMap = new HashMap<>(); + for (int i = 1; i < 22; i++) { + chromMap.put(Integer.toString(i), Integer.toString(i)); + } + chromMap.put("X", "X"); + chromMap.put("Y", "Y"); + chromMap.put("MT", "MT"); + + for (String chrom : chroms) { + String[] split = chrom.split("[_\\.]"); + int value = Integer.parseInt(split[1]); + switch (value) { + case 23: + chromMap.put("X", chrom); + break; + case 24: + chromMap.put("Y", chrom); + break; + case 12920: + chromMap.put("MT", chrom); + break; + default: + chromMap.put(Integer.toString(value), chrom); + break; + } + } + return chromMap; + } + private Float parseFloat(String value) { Float riskAlleleFrequency = null; if (NumberUtils.isNumber(value)) { @@ -350,29 +381,33 @@ private Float parseFloat(String value) { return riskAlleleFrequency; } - private String[] getRefAndAltFromDbsnp(String chromosome, Integer start, String snpId, TabixReader dbsnpTabixReader) { + private String[] getRefAndAltFromDbsnp(String chromosome, Integer start, String snpId, TabixReader dbsnpTabixReader, + Map chromosomeMap) throws IOException { + boolean found = false; + Set foundSnpIds = new HashSet<>(); String[] refAndAlt = null; - TabixReader.Iterator dbsnpIterator = dbsnpTabixReader.query(chromosome + ":" + start + "-" + start); - try { - String dbSnpRecord = dbsnpIterator.next(); - boolean found = false; - while (dbSnpRecord != null && !found) { - String[] dbsnpFields = dbSnpRecord.split("\t"); - - if (snpId.equalsIgnoreCase(dbsnpFields[2])) { - refAndAlt = new String[2]; - refAndAlt[REF] = dbsnpFields[3]; - refAndAlt[ALT] = dbsnpFields[4]; - found = true; - } - - dbSnpRecord = dbsnpIterator.next(); + String query = chromosomeMap.get(chromosome) + ":" + start + "-" + start; + TabixReader.Iterator dbsnpIterator = dbsnpTabixReader.query(query); + String dbSnpRecord = null; + dbSnpRecord = dbsnpIterator.next(); + while (dbSnpRecord != null && !found) { + String[] dbsnpFields = dbSnpRecord.split("\t"); + + if (snpId.equalsIgnoreCase(dbsnpFields[2])) { + refAndAlt = new String[2]; + refAndAlt[REF] = dbsnpFields[3]; + refAndAlt[ALT] = dbsnpFields[4]; + found = true; + } else { + foundSnpIds.add(dbsnpFields[2]); } - } catch (IOException e) { - logger.warn("Error reading position '" + chromosome + ":" + start + "' in dbSNP: " + e.getMessage()); - } + dbSnpRecord = dbsnpIterator.next(); + } + if (!found) { + logger.warn("dbSNP {} not found from query {}. Found: {}", snpId, query, foundSnpIds); + } return refAndAlt; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java index eb1f28db2d..bb9e0c36e4 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java @@ -81,8 +81,10 @@ public List downloadClinical() throws IOException, InterruptedExce url = configuration.getDownload().getClinvarVariationAllele().getHost(); downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.CLINVAR_VARIATION_ALLELE_FILE).toString())); clinvarUrls.add(url); - saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, CLINVAR_NAME, getClinVarVersion(), getTimeStamp(), clinvarUrls, - clinicalFolder.resolve("clinvarVersion.json")); + saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, CLINVAR_NAME, configuration.getDownload().getClinvar() + .getVersion(), getTimeStamp(), clinvarUrls, clinicalFolder.resolve("clinvarVersion.json")); + + logger.info("\t\tDone"); // Gwas catalog logger.info("\t\tDownloading GWAS catalog file ..."); @@ -91,6 +93,7 @@ public List downloadClinical() throws IOException, InterruptedExce downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.GWAS_FILE).toString())); saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, GWAS_NAME, gwasCatalog.getVersion(), getTimeStamp(), Collections.singletonList(url), clinicalFolder.resolve("gwasVersion.json")); + logger.info("\t\tDone"); // List hgvsList = getDocmHgvsList(); // if (!hgvsList.isEmpty()) { @@ -236,10 +239,4 @@ private List getDocmHgvsList() throws IOException { return hgvsList; } - - private String getClinVarVersion() { - // ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2015-12.xml.gz - return configuration.getDownload().getClinvar().getHost().split("_")[1].split("\\.")[0]; - } - } diff --git a/pom.xml b/pom.xml index 1c619a886b..7aab94cee9 100644 --- a/pom.xml +++ b/pom.xml @@ -25,6 +25,7 @@ ${project.version} 5.2.1-SNAPSHOT 3.2.1-SNAPSHOT + 0.1.0 9.4.51.v20230217