From 0eb898e776206bbe075569c41220ca6ea49b1351 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 31 Jul 2024 09:59:49 +0200 Subject: [PATCH] lib: improve gene (Ensembl/RefSeq) builder by supporting multi-species (e.g., mmusculus), #TASK-6426, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 76 ++++-- .../org/opencb/cellbase/lib/EtlCommons.java | 34 ++- .../lib/builders/AbstractBuilder.java | 7 +- .../lib/builders/EnsemblGeneBuilder.java | 153 +++++++----- .../builders/EnsemblGeneBuilderIndexer.java | 26 ++- .../cellbase/lib/builders/GeneBuilder.java | 66 +++++- .../lib/builders/GeneBuilderIndexer.java | 81 ++++--- .../lib/builders/RefSeqGeneBuilder.java | 217 +++++++++--------- .../builders/RefSeqGeneBuilderIndexer.java | 6 +- 9 files changed, 414 insertions(+), 252 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 129b31e78..aff9c5cc8 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -47,8 +47,10 @@ import static org.opencb.cellbase.lib.EtlCommons.*; import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_DONE_LOG_MESSAGE; import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_LOG_MESSAGE; +import static org.opencb.cellbase.lib.builders.EnsemblGeneBuilder.ENSEMBL_GENE_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.builders.GenomeSequenceFastaBuilder.GENOME_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.builders.ProteinBuilder.OUTPUT_PROTEIN_OUTPUT_FILENAME; +import static org.opencb.cellbase.lib.builders.RefSeqGeneBuilder.REFSEQ_GENE_OUTPUT_FILENAME; import static 
org.opencb.cellbase.lib.builders.RegulatoryFeatureBuilder.*; import static org.opencb.cellbase.lib.builders.RepeatsBuilder.REPEATS_OUTPUT_FILENAME; import static org.opencb.cellbase.lib.download.GenomeDownloadManager.GENOME_INFO_FILENAME; @@ -69,8 +71,6 @@ public class BuildCommandExecutor extends CommandExecutor { private boolean flexibleGTFParsing; - private static final String DATA_ALREADY_BUILT = "{} data has already been built."; - public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildCommandOptions) { super(buildCommandOptions.commonOptions.logLevel, buildCommandOptions.commonOptions.conf); @@ -234,8 +234,49 @@ private AbstractBuilder buildGenomeSequence() throws CellBaseException { } private AbstractBuilder buildGene() throws CellBaseException { - return new GeneBuilder(downloadFolder.resolve(GENE_DATA), buildFolder.resolve(GENE_DATA), speciesConfiguration, flexibleGTFParsing, - configuration); + logger.info(BUILDING_LOG_MESSAGE, getDataName(GENE_DATA)); + + // Sanity check + Path geneDownloadPath = downloadFolder.resolve(GENE_DATA); + Path geneBuildPath = buildFolder.resolve(GENE_DATA); + + List versionFiles = new ArrayList<>(Arrays.asList( + geneDownloadPath.resolve(ENSEMBL_DATA).resolve(getDataVersionFilename(ENSEMBL_DATA)), + geneDownloadPath.resolve(REFSEQ_DATA).resolve(getDataVersionFilename(REFSEQ_DATA)))); + List dataList = GeneBuilder.getCommonDataSources(speciesConfiguration, configuration); + for (String data : dataList) { + Path versionFile; + switch (data) { + case MIRTARBASE_DATA: + versionFile = downloadFolder.resolve(REGULATION_DATA).resolve(MIRTARBASE_DATA).resolve(getDataVersionFilename(data)); + break; + case MIRBASE_DATA: + versionFile = downloadFolder.resolve(REGULATION_DATA).resolve(MIRBASE_DATA).resolve(getDataVersionFilename(data)); + break; + default: + versionFile = downloadFolder.resolve(GERP_DATA).resolve(getDataVersionFilename(data)); + break; + } + versionFiles.add(versionFile); + } + + List filesToCheck = 
new ArrayList<>(Arrays.asList(geneBuildPath.resolve(ENSEMBL_GENE_OUTPUT_FILENAME), + geneBuildPath.resolve(REFSEQ_GENE_OUTPUT_FILENAME))); + for (Path versionFile : versionFiles) { + filesToCheck.add(geneBuildPath.resolve(versionFile.getFileName())); + } + filesToCheck.addAll(versionFiles); + + if (AbstractBuilder.existFiles(filesToCheck)) { + logger.warn(DATA_ALREADY_BUILT, getDataName(ENSEMBL_DATA) + " and " + getDataName(REFSEQ_DATA) + " genes"); + return null; + } + + // Check the version files of the data sources used to build the genes, and + // copy them into the gene build folder + copyVersionFiles(versionFiles, geneBuildPath); + + return new GeneBuilder(geneDownloadPath, geneBuildPath, speciesConfiguration, flexibleGTFParsing, configuration); } private AbstractBuilder buildRepeats() throws CellBaseException { @@ -403,25 +444,8 @@ private Path getFastaReferenceGenome() throws CellBaseException { SpeciesUtils.getSpeciesShortname(speciesConfiguration), assembly.getName(), null); String fastaFilename = Paths.get(ensemblUrl).getFileName().toString(); Path gzFastaPath = downloadFolder.resolve(GENOME_DATA).resolve(fastaFilename); - Path fastaPath = downloadFolder.resolve(GENOME_DATA).resolve(fastaFilename.replace(GZ_EXTENSION, "")); - if (!fastaPath.toFile().exists()) { - // Gunzip - logger.info("Gunzip file: {}", gzFastaPath); - try { - List params = Arrays.asList("--keep", gzFastaPath.toString()); - EtlCommons.runCommandLineProcess(null, "gunzip", params, null); - } catch (IOException e) { - throw new CellBaseException("Error executing gunzip in FASTA file " + gzFastaPath, e); - } catch (InterruptedException e) { - // Restore interrupted state... 
- Thread.currentThread().interrupt(); - throw new CellBaseException("Error executing gunzip in FASTA file " + gzFastaPath, e); - } - } - if (!fastaPath.toFile().exists()) { - throw new CellBaseException("FASTA file " + fastaPath + " does not exist after executing gunzip"); - } - return fastaPath; + + return EtlCommons.getFastaPath(gzFastaPath); } private AbstractBuilder buildSplice() throws IOException, CellBaseException { @@ -484,7 +508,11 @@ private void checkVersionFiles(List versionPaths) throws CellBaseException private void copyVersionFiles(List versionPaths, Path targetPath) throws CellBaseException { // Check version files before copying them checkVersionFiles(versionPaths); - if (!targetPath.toFile().exists()) { + copyFiles(versionPaths, targetPath); + } + + private void copyFiles(List versionPaths, Path targetPath) throws CellBaseException { + if (!Files.exists(targetPath)) { try { Files.createDirectories(targetPath); } catch (IOException e) { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 11af71249..e94acaf4b 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -93,13 +93,11 @@ public final class EtlCommons { // Gene public static final String GENE_DATA = "gene"; - public static final String ENSEMBL_GENE_BASENAME = "ensemblGene"; public static final String GENE_ANNOTATION_DATA = "gene_annotation"; public static final String GENE_DISEASE_ANNOTATION_DATA = "gene_disease_annotation"; // RefSeq public static final String REFSEQ_DATA = "refseq"; - public static final String REFSEQ_GENE_BASENAME = "refSeqGene"; // Must match the configuration file public static final String REFSEQ_GENOMIC_GTF_FILE_ID = "GENOMIC_GTF"; public static final String REFSEQ_GENOMIC_FNA_FILE_ID = "GENOMIC_FNA"; @@ -508,7 +506,7 @@ public static boolean 
runCommandLineProcess(File workingDirectory, String binPat ProcessBuilder builder = getProcessBuilder(workingDirectory, binPath, args, logFilePath); - LOGGER.debug("Executing command: {}", StringUtils.join(builder.command(), " ")); + LOGGER.info("Executing command: {}", StringUtils.join(builder.command(), " ")); Process process = builder.start(); process.waitFor(); @@ -541,6 +539,34 @@ private static ProcessBuilder getProcessBuilder(File workingDirectory, String bi return builder; } + public static Path getFastaPath(Path gzFastaPath) throws CellBaseException { + // Sanity check + if (!Files.exists(gzFastaPath)) { + throw new CellBaseException("Gzipped FASTA file " + gzFastaPath + " does not exist"); + } + + // Check FASTA and unzip if necessary + Path fastaPath = gzFastaPath.getParent().resolve(gzFastaPath.getFileName().toString().replace(GZ_EXTENSION, "")); + if (!fastaPath.toFile().exists()) { + // Gunzip + LOGGER.info("Gunzip file {}", gzFastaPath); + try { + List params = Arrays.asList("--keep", gzFastaPath.toString()); + EtlCommons.runCommandLineProcess(null, "gunzip", params, null); + } catch (IOException e) { + throw new CellBaseException("Error executing gunzip in FASTA file " + gzFastaPath, e); + } catch (InterruptedException e) { + // Restore interrupted state... 
+ Thread.currentThread().interrupt(); + throw new CellBaseException("Error executing gunzip in FASTA file " + gzFastaPath, e); + } + } + if (!fastaPath.toFile().exists()) { + throw new CellBaseException("FASTA file " + fastaPath + " does not exist after executing gunzip"); + } + return fastaPath; + } + public static boolean isMissing(String string) { return !((string != null) && !string.isEmpty() && !string.replace(" ", "") @@ -736,7 +762,7 @@ private static List getRepeatsDataList(CellBaseConfiguration configurati return dataList; } - private static boolean isDataSupported(DownloadProperties.URLProperties props, String prefix) { + public static boolean isDataSupported(DownloadProperties.URLProperties props, String prefix) { for (String key : props.getFiles().keySet()) { if (key.startsWith(prefix)) { return true; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java index 8359f26e8..550197c76 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AbstractBuilder.java @@ -61,6 +61,9 @@ public abstract class AbstractBuilder { public static final String PARSING_LOG_MESSAGE = "Parsing {} ..."; public static final String PARSING_DONE_LOG_MESSAGE = "Parsing done."; + public static final String SKIPPING_INDEX_DATA_LOG_MESSAGE = "Skipping index for data '{}': it is not supported for species '{}'."; + public static final String DATA_ALREADY_BUILT = "'{}' data has already been built."; + protected AbstractBuilder(CellBaseSerializer serializer) { logger = LoggerFactory.getLogger(this.getClass()); @@ -80,7 +83,7 @@ public void disconnect() { } } - protected String getConfigurationFileIdPrefix(String scientificSpecies) { + protected static String getConfigurationFileIdPrefix(String scientificSpecies) { String prefix = ""; if 
(StringUtils.isNotEmpty(scientificSpecies) && !scientificSpecies.equals("Homo sapiens") && scientificSpecies.contains(" ")) { char c = scientificSpecies.charAt(0); @@ -94,6 +97,8 @@ protected File checkFile(DownloadProperties.URLProperties props, String fileId, String filename = Paths.get(props.getFiles().get(fileId)).getFileName().toString(); if (filename.contains(MANUAL_PREFIX)) { filename = filename.replace(MANUAL_PREFIX, ""); + } else if (filename.contains(SCRIPT_PREFIX)) { + filename = filename.split("@")[1]; } Path filePath = targetPath.resolve(filename); if (!Files.exists(filePath)) { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java index e29cba82b..044d9bc23 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilder.java @@ -31,7 +31,6 @@ import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.models.DataSource; import org.opencb.cellbase.core.serializer.CellBaseSerializer; -import org.opencb.cellbase.lib.EtlCommons; import org.rocksdb.RocksDBException; import java.io.File; @@ -53,31 +52,28 @@ public class EnsemblGeneBuilder extends AbstractBuilder { private final Map transcriptDict; private final Map exonDict; - private Path gtfFile; - private Path proteinFastaFile; - private Path cDnaFastaFile; - private Path geneDescriptionFile; - private Path xrefsFile; - private Path hgncFile; - private Path maneFile; - private Path lrgFile; - private Path uniprotIdMappingFile; - private Path tfbsFile; - private Path tabixFile; - private Path geneExpressionFile; - private Path geneDrugFile; - private Path hpoFile; - private Path disgenetFile; - private Path genomeSequenceFilePath; - private Path gnomadFile; - private Path geneOntologyAnnotationFile; - private Path miRBaseFile; - 
private Path miRTarBaseFile; - private Path cancerGeneCensusFile; - private Path cancerHostpotFile; - private Path ensemblCanonicalFile; -// private Path tso500File; -// private Path eglhHaemOncFile; + private Path gtfFile = null; + private Path proteinFastaFile = null; + private Path cDnaFastaFile = null; + private Path geneDescriptionFile = null; + private Path xrefsFile = null; + private Path hgncFile = null; + private Path maneFile = null; + private Path lrgFile = null; + private Path uniprotIdMappingFile = null; + private Path tfbsFile = null; + private Path tabixFile = null; + private Path geneExpressionFile = null; + private Path geneDrugFile = null; + private Path hpoFile = null; + private Path genomeSequenceFilePath = null; + private Path gnomadFile = null; + private Path geneOntologyAnnotationFile = null; + private Path miRBaseFile = null; + private Path miRTarBaseFile = null; + private Path cancerGeneCensusFile = null; + private Path cancerHostpotFile = null; + private Path ensemblCanonicalFile = null; // source for genes is either ensembl or refseq private final String SOURCE = ParamConstants.QueryParams.ENSEMBL.key(); @@ -92,6 +88,11 @@ public class EnsemblGeneBuilder extends AbstractBuilder { private String feature; private Gtf nextGtfToReturn; + private boolean isHSapiens = false; + + public static final String ENSEMBL_GENE_BASENAME = "ensemblGene"; + public static final String ENSEMBL_GENE_OUTPUT_FILENAME = ENSEMBL_GENE_BASENAME + ".json.gz"; + public EnsemblGeneBuilder(Path downloadPath, SpeciesConfiguration speciesConfiguration, boolean flexibleGTFParsing, CellBaseConfiguration configuration, CellBaseSerializer serializer) { super(serializer); @@ -103,6 +104,10 @@ public EnsemblGeneBuilder(Path downloadPath, SpeciesConfiguration speciesConfigu transcriptDict = new HashMap<>(250000); exonDict = new HashMap<>(8000000); + + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + isHSapiens = true; + } } public void check() throws 
Exception { @@ -134,27 +139,68 @@ public void check() throws Exception { xrefsFile = checkFile(props, ENSEMBL_XREFS_FILE_ID, downloadPath.getParent(), "Ensembl Xrefs").toPath(); ensemblCanonicalFile = checkFile(props, ENSEMBL_CANONICAL_FILE_ID, downloadPath.getParent(), "Ensembl Canonical").toPath(); - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { -// tso500File = checkFile(props, ENSEMBL_TSO500_FILE_ID, downloadPath.getParent(), "Ensembl TSO 500").toPath(); -// eglhHaemOncFile = checkFile(props, ENSEMBL_HAEM_ONC_TRANSCRIPTS_FILE_ID, downloadPath.getParent(), "EGLH Haem Onc").toPath(); + // Check common files + String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); + if (isHSapiens || isDataSupported(configuration.getDownload().getManeSelect(), prefixId)) { maneFile = checkFiles(MANE_SELECT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(MANE_SELECT_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getLrg(), prefixId)) { lrgFile = checkFiles(LRG_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(LRG_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getHgnc(), prefixId)) { hgncFile = checkFiles(HGNC_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(HGNC_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getCancerHotspot(), prefixId)) { cancerHostpotFile = checkFiles(CANCER_HOTSPOT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(CANCER_HOTSPOT_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || 
isDataSupported(configuration.getDownload().getDgidb(), prefixId)) { geneDrugFile = checkFiles(DGIDB_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(DGIDB_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getGeneUniprotXref(), prefixId)) { uniprotIdMappingFile = checkFiles(UNIPROT_XREF_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(UNIPROT_XREF_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getGeneExpressionAtlas(), prefixId)) { geneExpressionFile = checkFiles(GENE_EXPRESSION_ATLAS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(GENE_EXPRESSION_ATLAS_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getHpo(), prefixId)) { hpoFile = checkFiles(HPO_DISEASE_DATA, downloadPath.getParent(), 1).get(0).toPath(); - disgenetFile = checkFiles(DISGENET_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(HPO_DISEASE_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getGnomadConstraints(), prefixId)) { gnomadFile = checkFiles(GNOMAD_CONSTRAINTS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(GNOMAD_CONSTRAINTS_DATA), speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getGoAnnotation(), prefixId)) { geneOntologyAnnotationFile = checkFiles(GO_ANNOTATION_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(GO_ANNOTATION_DATA), 
speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getCancerHotspot(), prefixId)) { cancerGeneCensusFile = checkFiles(CANCER_GENE_CENSUS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(CANCER_GENE_CENSUS_DATA), speciesConfiguration.getScientificName()); } // Check regulation files // Motif features - List files = checkFiles(ensemblGeneLabel, MOTIF_FEATURES_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA), - 2); + List files = checkFiles(ensemblGeneLabel, MOTIF_FEATURES_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA) + .resolve(MOTIF_FEATURES_DATA), 2); if (files.get(0).getName().endsWith("tbi")) { tabixFile = files.get(0).toPath(); tfbsFile = files.get(1).toPath(); @@ -162,36 +208,28 @@ public void check() throws Exception { tabixFile = files.get(1).toPath(); tfbsFile = files.get(0).toPath(); } + // mirbase - miRBaseFile = checkFiles(MIRBASE_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA), 1).get(0).toPath(); + if (isHSapiens || isDataSupported(configuration.getDownload().getMirbase(), prefixId)) { + miRBaseFile = checkFiles(MIRBASE_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA) + .resolve(MIRBASE_DATA), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(MIRBASE_DATA), speciesConfiguration.getScientificName()); + } // mirtarbase - miRTarBaseFile = checkFiles(MIRTARBASE_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA), 1).get(0).toPath(); + if (isHSapiens || isDataSupported(configuration.getDownload().getMiRTarBase(), prefixId)) { + miRTarBaseFile = checkFiles(MIRTARBASE_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA) + .resolve(MIRTARBASE_DATA), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, getDataName(MIRTARBASE_DATA), 
speciesConfiguration.getScientificName()); + } // Check genome FASTA file Path genomeDownloadPath = downloadPath.getParent().getParent().resolve(GENOME_DATA); String genomeGzFilename = Paths.get(((DataSource) dataSourceReader.readValue(genomeDownloadPath .resolve(getDataVersionFilename(GENOME_DATA)).toFile())).getUrls().get(0)).getFileName().toString(); - genomeSequenceFilePath = genomeDownloadPath.resolve(genomeGzFilename); - if (Files.exists(genomeSequenceFilePath)) { - // Need to be gunzip-ed - logger.info("Gunzip file: {}", genomeSequenceFilePath); - try { - EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(genomeSequenceFilePath.toString()), null); - } catch (IOException e) { - throw new CellBaseException("Error executing gunzip in FASTA file " + genomeSequenceFilePath, e); - } catch (InterruptedException e) { - // Restore interrupted state... - Thread.currentThread().interrupt(); - throw new CellBaseException("Error executing gunzip in FASTA file " + genomeSequenceFilePath, e); - } - } - String genomeFilename = genomeGzFilename.replace(GZ_EXTENSION, ""); - genomeSequenceFilePath = genomeDownloadPath.resolve(genomeFilename); - if (!Files.exists(genomeSequenceFilePath)) { - throw new CellBaseException("Genome FASTA file " + genomeSequenceFilePath.getFileName() + " does not exist at " - + genomeSequenceFilePath.getParent()); - } + genomeSequenceFilePath = getFastaPath(genomeDownloadPath.resolve(genomeGzFilename)); logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE, ensemblGeneLabel); checked = true; @@ -209,10 +247,9 @@ public void parse() throws Exception { EnsemblGeneBuilderIndexer indexer = new EnsemblGeneBuilderIndexer(serializer.getOutdir()); try { // process files and put values in rocksdb - indexer.index(geneDescriptionFile, xrefsFile, hgncFile, maneFile, lrgFile, uniprotIdMappingFile, - proteinFastaFile, cDnaFastaFile, speciesConfiguration.getScientificName(), geneExpressionFile, - geneDrugFile, hpoFile, disgenetFile, 
gnomadFile, geneOntologyAnnotationFile, miRBaseFile, - miRTarBaseFile, cancerGeneCensusFile, cancerHostpotFile, ensemblCanonicalFile); + indexer.index(geneDescriptionFile, xrefsFile, hgncFile, maneFile, lrgFile, uniprotIdMappingFile, proteinFastaFile, + cDnaFastaFile, speciesConfiguration.getScientificName(), geneExpressionFile, geneDrugFile, hpoFile, gnomadFile, + geneOntologyAnnotationFile, miRBaseFile, miRTarBaseFile, cancerGeneCensusFile, cancerHostpotFile, ensemblCanonicalFile); TabixReader tabixReader = null; if (!Files.exists(tfbsFile) || !Files.exists(tabixFile)) { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java index d46ebef22..4841f5ffe 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/EnsemblGeneBuilderIndexer.java @@ -71,8 +71,8 @@ public EnsemblGeneBuilderIndexer(Path geneDirectoryPath) { public void index(Path geneDescriptionFile, Path xrefsFile, Path hgncFile, Path maneFile, Path lrgFile, Path uniprotIdMappingFile, Path proteinFastaFile, Path cDnaFastaFile, String species, Path geneExpressionFile, Path geneDrugFile, Path hpoFile, - Path disgenetFile, Path gnomadFile, Path geneOntologyAnnotationFile, Path miRBaseFile, Path miRTarBaseFile, - Path cancerGeneGensusFile, Path cancerHostpotFile, Path canonicalFile) + Path gnomadFile, Path geneOntologyAnnotationFile, Path miRBaseFile, Path miRTarBaseFile, Path cancerGeneGensusFile, + Path cancerHostpotFile, Path canonicalFile) throws IOException, RocksDBException, FileFormatException, CellBaseException { indexDescriptions(geneDescriptionFile); indexXrefs(xrefsFile, uniprotIdMappingFile); @@ -83,7 +83,7 @@ public void index(Path geneDescriptionFile, Path xrefsFile, Path hgncFile, Path indexCdnaSequences(cDnaFastaFile); indexExpression(species, 
geneExpressionFile); indexDrugs(geneDrugFile); - indexDiseases(hpoFile, disgenetFile); + indexDiseases(hpoFile); indexConstraints(gnomadFile); indexOntologyAnnotations(geneOntologyAnnotationFile); indexMiRBase(species, miRBaseFile); @@ -91,8 +91,6 @@ public void index(Path geneDescriptionFile, Path xrefsFile, Path hgncFile, Path indexCancerGeneCensus(cancerGeneGensusFile); indexCancerHotspot(cancerHostpotFile); indexCanonical(canonicalFile); -// indexTSO500(tso500File); -// indexEGLHHaemOnc(eglhHaemOncFile); } private void indexDescriptions(Path geneDescriptionFile) throws IOException, RocksDBException { @@ -202,6 +200,10 @@ public List getXrefs(String id) throws RocksDBException, IOException { } private void indexExpression(String species, Path geneExpressionFile) throws IOException, RocksDBException { + if (geneExpressionFile == null) { + return; + } + Map> geneExpressionMap = new HashMap<>(); if (geneExpressionFile != null && Files.exists(geneExpressionFile) && Files.size(geneExpressionFile) > 0 && species != null) { @@ -253,7 +255,11 @@ public List getExpression(String id) throws RocksDBException, IOExce } private void indexConstraints(Path gnomadFile) throws IOException, RocksDBException { - if (gnomadFile != null && Files.exists(gnomadFile) && Files.size(gnomadFile) > 0) { + if (gnomadFile == null) { + return; + } + + if (Files.exists(gnomadFile) && Files.size(gnomadFile) > 0) { logger.info("Loading OE scores from '{}'", gnomadFile); InputStream inputStream = Files.newInputStream(gnomadFile); BufferedReader br = new BufferedReader(new InputStreamReader(new GZIPInputStream(inputStream))); @@ -309,6 +315,10 @@ private void addConstraint(List constraints, String name, String val } private void indexOntologyAnnotations(Path goaFile) throws IOException, RocksDBException { + if (goaFile == null) { + return; + } + Map> annotations = new HashMap<>(); if (goaFile != null && Files.exists(goaFile) && Files.size(goaFile) > 0) { logger.info("Loading GO annotation from 
'{}'", goaFile); @@ -329,6 +339,10 @@ public List getOntologyAnnotations(String id) thr } private void indexMiRBase(String species, Path miRBaseFile) throws IOException { + if (miRBaseFile == null) { + return; + } + logger.info(PARSING_LOG_MESSAGE, miRBaseFile); MirBaseCallback callback = new MirBaseCallback(rocksdb, rocksDbManager); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java index a5dda27e3..44b7e587f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java @@ -21,12 +21,19 @@ import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; +import java.nio.file.Files; import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.*; +import static org.opencb.cellbase.lib.builders.EnsemblGeneBuilder.ENSEMBL_GENE_BASENAME; +import static org.opencb.cellbase.lib.builders.RefSeqGeneBuilder.REFSEQ_GENE_BASENAME; +import static org.opencb.cellbase.lib.builders.RefSeqGeneBuilder.REFSEQ_GENE_OUTPUT_FILENAME; public class GeneBuilder extends AbstractBuilder { + private Path downloadPath; private EnsemblGeneBuilder ensemblGeneBuilder; private RefSeqGeneBuilder refSeqGeneBuilder; @@ -34,15 +41,15 @@ public GeneBuilder(Path downloadPath, Path buildPath, SpeciesConfiguration speci CellBaseConfiguration configuration) throws CellBaseException { super(null); + this.downloadPath = downloadPath; + // Create Ensembl gene builder - CellBaseJsonFileSerializer ensemblGeneSerializer = new CellBaseJsonFileSerializer(buildPath.resolve(ENSEMBL_DATA), - ENSEMBL_GENE_BASENAME); + CellBaseJsonFileSerializer ensemblGeneSerializer = new CellBaseJsonFileSerializer(buildPath, ENSEMBL_GENE_BASENAME); this.ensemblGeneBuilder = 
new EnsemblGeneBuilder(downloadPath.resolve(ENSEMBL_DATA), speciesConfiguration, flexibleGTFParsing, configuration, ensemblGeneSerializer); // Create RefSeq gene builder - CellBaseJsonFileSerializer refSeqGeneSerializer = new CellBaseJsonFileSerializer(buildPath.resolve(REFSEQ_DATA), - REFSEQ_GENE_BASENAME); + CellBaseJsonFileSerializer refSeqGeneSerializer = new CellBaseJsonFileSerializer(buildPath, REFSEQ_GENE_BASENAME); this.refSeqGeneBuilder = new RefSeqGeneBuilder(downloadPath.resolve(REFSEQ_DATA), speciesConfiguration, configuration, refSeqGeneSerializer); } @@ -57,15 +64,58 @@ public void check() throws Exception { @Override public void parse() throws Exception { - logger.info(BUILDING_LOG_MESSAGE, getDataName(GENE_DATA)); - // Check folders and files before building check(); -// // Build Ensembl/RefSeq genes + // Build Ensembl genes ensemblGeneBuilder.parse(); - refSeqGeneBuilder.parse(); + + // Build RefSeq genes + if (!Files.exists(downloadPath.resolve(REFSEQ_DATA).resolve(REFSEQ_GENE_OUTPUT_FILENAME))) { + refSeqGeneBuilder.parse(); + } else { + logger.info(DATA_ALREADY_BUILT, getDataName(REFSEQ_DATA) + " gene"); + } + logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(GENE_DATA)); } + + public static List getCommonDataSources(SpeciesConfiguration speciesConfiguration, CellBaseConfiguration configuration) { + List dataList = new ArrayList<>(); + + boolean isHSapiens = false; + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + isHSapiens = true; + } + + String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); + + if (isHSapiens || isDataSupported(configuration.getDownload().getManeSelect(), prefixId)) { + dataList.add(MANE_SELECT_DATA); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getLrg(), prefixId)) { + dataList.add(LRG_DATA); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getCancerHotspot(), prefixId)) { + dataList.add(CANCER_HOTSPOT_DATA); + } + if 
(isHSapiens || isDataSupported(configuration.getDownload().getDgidb(), prefixId)) { + dataList.add(DGIDB_DATA); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getHpo(), prefixId)) { + dataList.add(HPO_DISEASE_DATA); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getCancerHotspot(), prefixId)) { + dataList.add(CANCER_GENE_CENSUS_DATA); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getMiRTarBase(), prefixId)) { + dataList.add(MIRTARBASE_DATA); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getMirbase(), prefixId)) { + dataList.add(MIRBASE_DATA); + } + + return dataList; + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java index 1f56afe56..7b980ffd7 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilderIndexer.java @@ -44,8 +44,8 @@ import java.util.*; import java.util.stream.Collectors; -import static org.opencb.cellbase.lib.EtlCommons.DISGENET_DATA; import static org.opencb.cellbase.lib.EtlCommons.ENSEMBL_DATA; +import static org.opencb.cellbase.lib.EtlCommons.HPO_DISEASE_DATA; import static org.opencb.cellbase.lib.builders.AbstractBuilder.PARSING_DONE_LOG_MESSAGE; import static org.opencb.cellbase.lib.builders.AbstractBuilder.PARSING_LOG_MESSAGE; @@ -69,8 +69,6 @@ public class GeneBuilderIndexer { protected final String DRUGS_SUFFIX = "_drug"; protected final String DISEASE_SUFFIX = "_disease"; protected final String MIRTARBASE_SUFFIX = "_mirtarbase"; -// protected final String TSO500_SUFFIX = "_tso500"; -// protected final String EGLH_HAEMONC_SUFFIX = "_eglh_haemonc"; public GeneBuilderIndexer(Path genePath) { this.init(genePath); @@ -101,6 +99,10 @@ public String getCdnaFasta(String id) throws RocksDBException { } protected void 
indexProteinSequences(Path proteinFastaFile) throws IOException, FileFormatException, RocksDBException { + if (proteinFastaFile == null) { + return; + } + logger.info(PARSING_LOG_MESSAGE, proteinFastaFile); FastaReader fastaReader = new FastaReader(proteinFastaFile); Fasta fasta; @@ -116,6 +118,10 @@ protected String getProteinFasta(String id) throws RocksDBException { } protected void indexHgncIdMapping(Path hgncMappingFile) throws IOException, RocksDBException { + if (hgncMappingFile == null) { + return; + } + logger.info(PARSING_LOG_MESSAGE, hgncMappingFile); try (BufferedReader bufferedReader = FileUtils.newBufferedReader(hgncMappingFile)) { String line = bufferedReader.readLine(); @@ -135,6 +141,10 @@ public String getHgncId(String id) throws RocksDBException { } protected void indexManeMapping(Path maneMappingFile, String referenceId) throws IOException, RocksDBException { + if (maneMappingFile == null) { + return; + } + logger.info(PARSING_LOG_MESSAGE, maneMappingFile); int idColumn = referenceId.equalsIgnoreCase(ENSEMBL_DATA) ? 
7 : 5; @@ -161,6 +171,10 @@ public String getMane(String id, String field) throws RocksDBException { } protected void indexLrgMapping(Path lrgMappingFile, String referenceId) throws IOException, RocksDBException { + if (lrgMappingFile == null) { + return; + } + logger.info(PARSING_LOG_MESSAGE, lrgMappingFile); // # Last modified: 30-03-2021@22:00:06 @@ -189,6 +203,10 @@ public String getLrg(String id, String field) throws RocksDBException { } protected void indexCancerGeneCensus(Path cgcFile) throws IOException, RocksDBException { + if (cgcFile == null) { + return; + } + logger.info(PARSING_LOG_MESSAGE, cgcFile); Map tissuesMap = new HashMap<>(); @@ -313,6 +331,10 @@ public List getCancerGeneCensus(String geneName) throws R } public void indexCancerHotspot(Path cancerHotspot) throws IOException, RocksDBException { + if (cancerHotspot == null) { + return; + } + logger.info(PARSING_LOG_MESSAGE, cancerHotspot); // Store all cancer hotspot (different gene and aminoacid position) for each gene in the same key @@ -497,6 +519,10 @@ protected void close() throws IOException { } protected void indexDrugs(Path geneDrugFile) throws IOException, RocksDBException { + if (geneDrugFile == null) { + return; + } + logger.info(PARSING_LOG_MESSAGE, geneDrugFile); String currentGene = ""; @@ -561,49 +587,32 @@ protected void indexDrugs(Path geneDrugFile) throws IOException, RocksDBExceptio logger.info(PARSING_DONE_LOG_MESSAGE, geneDrugFile); } - protected void indexDiseases(Path hpoFilePath, Path disgenetFilePath) throws IOException, RocksDBException { + protected void indexDiseases(Path hpoFilePath) throws IOException, RocksDBException { + if (hpoFilePath == null) { + return; + } Map> geneDiseaseAssociationMap = new HashMap<>(50000); String line; // HPO -// logger.info(PARSING_LOG_MESSAGE, hpoFilePath); -// try (BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFilePath)) { -// // Skip first header line -// bufferedReader.readLine(); -// while ((line = 
bufferedReader.readLine()) != null) { -// String[] fields = line.split("\t"); -// String omimId = fields[6]; -// String geneSymbol = fields[3]; -// String hpoId = fields[0]; -// String diseaseName = fields[1]; -// GeneTraitAssociation disease = -// new GeneTraitAssociation(omimId, diseaseName, hpoId, 0f, 0, new ArrayList<>(), new ArrayList<>(), HPO_DATA); -// addValueToMapElement(geneDiseaseAssociationMap, geneSymbol, disease); -// } -// } -// logger.info(PARSING_DONE_LOG_MESSAGE, hpoFilePath); - - // DisGeNet - logger.info(PARSING_LOG_MESSAGE, disgenetFilePath); - try (BufferedReader bufferedReader = FileUtils.newBufferedReader(disgenetFilePath)) { + logger.info(PARSING_LOG_MESSAGE, hpoFilePath); + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(hpoFilePath)) { // Skip first header line bufferedReader.readLine(); while ((line = bufferedReader.readLine()) != null) { String[] fields = line.split("\t"); - String diseaseId = fields[4]; - String diseaseName = fields[5]; - String score = fields[9]; - String numberOfPubmeds = fields[13].trim(); - String numberOfSNPs = fields[14]; - String source = fields[15]; - GeneTraitAssociation disease = new GeneTraitAssociation(diseaseId, diseaseName, "", Float.parseFloat(score), - Integer.parseInt(numberOfPubmeds), Arrays.asList(numberOfSNPs), Arrays.asList(source), DISGENET_DATA); - addValueToMapElement(geneDiseaseAssociationMap, fields[1], disease); + String omimId = fields[6]; + String geneSymbol = fields[3]; + String hpoId = fields[0]; + String diseaseName = fields[1]; + GeneTraitAssociation disease = + new GeneTraitAssociation(omimId, diseaseName, hpoId, 0f, 0, new ArrayList<>(), new ArrayList<>(), HPO_DISEASE_DATA); + addValueToMapElement(geneDiseaseAssociationMap, geneSymbol, disease); } } - logger.info(PARSING_DONE_LOG_MESSAGE, disgenetFilePath); + logger.info(PARSING_DONE_LOG_MESSAGE, hpoFilePath); for (Map.Entry> entry : geneDiseaseAssociationMap.entrySet()) { rocksDbManager.update(rocksdb, entry.getKey() +
DISEASE_SUFFIX, entry.getValue()); @@ -611,6 +620,10 @@ protected void indexDiseases(Path hpoFilePath, Path disgenetFilePath) throws IOE } protected void indexMiRTarBase(Path miRTarBaseFile) throws IOException, RocksDBException { + if (miRTarBaseFile == null) { + return; + } + MiRTarBaseIndexer miRTarBaseIndexer = new MiRTarBaseIndexer(); Map> result = miRTarBaseIndexer.index(miRTarBaseFile); for (Map.Entry> entry : result.entrySet()) { diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java index 3248e2f5d..b470b2cb2 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilder.java @@ -45,20 +45,17 @@ public class RefSeqGeneBuilder extends AbstractBuilder { private Map transcriptDict; private Map exonDict; - private Path gtfFile; - private Path fastaFile; - private Path proteinFastaFile; - private Path cdnaFastaFile; - private Path maneFile; - private Path lrgFile; - private Path disgenetFile; - private Path hpoFile; - private Path geneDrugFile; - private Path miRTarBaseFile; - private Path cancerGeneCensusFile; - private Path cancerHotspot; -// private Path tso500File; -// private Path eglhHaemOncFile; + private Path gtfFile = null; + private Path fastaFile = null; + private Path proteinFastaFile = null; + private Path cdnaFastaFile = null; + private Path maneFile = null; + private Path lrgFile = null; + private Path hpoFile = null; + private Path geneDrugFile = null; + private Path miRTarBaseFile = null; + private Path cancerGeneCensusFile = null; + private Path cancerHotspot = null; private SpeciesConfiguration speciesConfiguration; private static final Map REFSEQ_CHROMOSOMES = new HashMap<>(); private static final String KNOWN_STATUS = "KNOWN"; @@ -70,6 +67,15 @@ public class RefSeqGeneBuilder extends AbstractBuilder { 
// sometimes there are two stop codons (eg NM_018159.4). Only parse the first one, skip the second private boolean seenStopCodon = false; + private boolean isHSapiens = false; + + private static final String ENSEMBL = "ensembl"; + private static final String TRANSCRIPT_ID = "transcript_id"; + private static final String EXON_NUMBER = "exon_number"; + + public static final String REFSEQ_GENE_BASENAME = "refSeqGene"; + public static final String REFSEQ_GENE_OUTPUT_FILENAME = REFSEQ_GENE_BASENAME + ".json.gz"; + public RefSeqGeneBuilder(Path downloadPath, SpeciesConfiguration speciesConfiguration, CellBaseConfiguration configuration, CellBaseSerializer serializer) { super(serializer); @@ -80,6 +86,10 @@ public RefSeqGeneBuilder(Path downloadPath, SpeciesConfiguration speciesConfigur transcriptDict = new HashMap<>(250000); exonDict = new HashMap<>(8000000); + + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + isHSapiens = true; + } } public void check() throws Exception { @@ -109,43 +119,49 @@ public void check() throws Exception { // Check genome FASTA file String genomeGzFilename = Paths.get(props.getFiles().get(prefixId + REFSEQ_GENOMIC_FNA_FILE_ID)).getFileName().toString(); - fastaFile = downloadPath.resolve(genomeGzFilename); - if (Files.exists(fastaFile)) { - // Need to be gunzip-ed - logger.info("Gunzip file: {}", fastaFile); - try { - EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(fastaFile.toString()), null); - } catch (IOException e) { - throw new CellBaseException("Error executing gunzip in FASTA file " + fastaFile, e); - } catch (InterruptedException e) { - // Restore interrupted state... 
- Thread.currentThread().interrupt(); - throw new CellBaseException("Error executing gunzip in FASTA file " + fastaFile, e); - } - } - String genomeFilename = genomeGzFilename.replace(GZ_EXTENSION, ""); - fastaFile = downloadPath.resolve(genomeFilename); - if (!Files.exists(fastaFile)) { - throw new CellBaseException("Genome FASTA file " + fastaFile.getFileName() + " does not exist at " + fastaFile.getParent()); - } + Path fastaGzFile = downloadPath.resolve(genomeGzFilename); + fastaFile = EtlCommons.getFastaPath(fastaGzFile); // Check common files - props = configuration.getDownload().getEnsembl().getUrl(); - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { -// tso500File = checkFile(props, ENSEMBL_TSO500_FILE_ID, downloadPath.getParent(), "Ensembl TSO 500").toPath(); -// eglhHaemOncFile = checkFile(props, ENSEMBL_HAEM_ONC_TRANSCRIPTS_FILE_ID, downloadPath.getParent(), "EGLH Haem Onc").toPath(); + if (isHSapiens || isDataSupported(configuration.getDownload().getManeSelect(), prefixId)) { maneFile = checkFiles(MANE_SELECT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, MANE_SELECT_DATA, speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getLrg(), prefixId)) { lrgFile = checkFiles(LRG_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, LRG_DATA, speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getCancerHotspot(), prefixId)) { cancerHotspot = checkFiles(CANCER_HOTSPOT_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, CANCER_HOTSPOT_DATA, speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getDgidb(), prefixId)) { geneDrugFile = checkFiles(DGIDB_DATA, downloadPath.getParent(), 
1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, DGIDB_DATA, speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getHpo(), prefixId)) { hpoFile = checkFiles(HPO_DISEASE_DATA, downloadPath.getParent(), 1).get(0).toPath(); - disgenetFile = checkFiles(DISGENET_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, HPO_DISEASE_DATA, speciesConfiguration.getScientificName()); + } + if (isHSapiens || isDataSupported(configuration.getDownload().getCancerHotspot(), prefixId)) { cancerGeneCensusFile = checkFiles(CANCER_GENE_CENSUS_DATA, downloadPath.getParent(), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, CANCER_GENE_CENSUS_DATA, speciesConfiguration.getScientificName()); } // Check regulation files // mirtarbase - miRTarBaseFile = checkFiles(MIRTARBASE_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA), 1).get(0).toPath(); + if (isHSapiens || isDataSupported(configuration.getDownload().getMiRTarBase(), prefixId)) { + miRTarBaseFile = checkFiles(MIRTARBASE_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA) + .resolve(MIRTARBASE_DATA), 1).get(0).toPath(); + } else { + logger.info(SKIPPING_INDEX_DATA_LOG_MESSAGE, MIRTARBASE_DATA, speciesConfiguration.getScientificName()); + } logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE, refSeqGeneLabel); checked = true; @@ -163,41 +179,41 @@ public void parse() throws Exception { // Index protein sequences for later logger.info("Indexing gene annotation for {} ...", getDataName(REFSEQ_DATA)); RefSeqGeneBuilderIndexer indexer = new RefSeqGeneBuilderIndexer(gtfFile.getParent()); - indexer.index(maneFile, lrgFile, proteinFastaFile, cdnaFastaFile, geneDrugFile, hpoFile, disgenetFile, miRTarBaseFile, - cancerGeneCensusFile, cancerHotspot); + indexer.index(maneFile, lrgFile, proteinFastaFile, cdnaFastaFile, geneDrugFile, hpoFile, 
miRTarBaseFile, cancerGeneCensusFile, + cancerHotspot); logger.info("Indexing done for {}", getDataName(REFSEQ_DATA)); logger.info(PARSING_LOG_MESSAGE, gtfFile); - GtfReader gtfReader = new GtfReader(gtfFile); - - Gtf gtf; - while ((gtf = gtfReader.read()) != null) { - String chromosome = getSequenceName(gtf.getSequenceName()); - switch (gtf.getFeature()) { - case "gene": - // we've finished the previous transcript, store xrefs - addXrefs(transcript, geneDbxrefs, exonDbxrefs); - parseGene(gtf, chromosome, indexer); - break; - case "transcript": - break; - case "exon": - parseExon(gtf, chromosome, fastaIndex, indexer); - break; - case "CDS": - parseCDS(gtf, indexer); - break; - case "start_codon": - seenStopCodon = false; - break; - case "stop_codon": - if (!seenStopCodon) { - parseStopCodon(gtf); - seenStopCodon = true; - } - break; - default: - throw new RuntimeException("Unexpected feature type: " + gtf.getFeature()); + try (GtfReader gtfReader = new GtfReader(gtfFile)) { + Gtf gtf; + while ((gtf = gtfReader.read()) != null) { + String chromosome = getSequenceName(gtf.getSequenceName()); + switch (gtf.getFeature()) { + case "gene": + // we've finished the previous transcript, store xrefs + addXrefs(transcript, geneDbxrefs, exonDbxrefs); + parseGene(gtf, chromosome, indexer); + break; + case "transcript": + break; + case "exon": + parseExon(gtf, chromosome, fastaIndex, indexer); + break; + case "CDS": + parseCDS(gtf, indexer); + break; + case "start_codon": + seenStopCodon = false; + break; + case "stop_codon": + if (!seenStopCodon) { + parseStopCodon(gtf); + seenStopCodon = true; + } + break; + default: + throw new CellBaseException("Error parsing: unexpected feature type: " + gtf.getFeature()); + } } } @@ -208,7 +224,6 @@ public void parse() throws Exception { store(); // Close - gtfReader.close(); serializer.close(); if (fastaIndex != null) { fastaIndex.close(); @@ -239,7 +254,6 @@ private void addXrefs(Transcript transcript, Set geneDbxrefs, Set ex return; } 
exonDbxrefs.addAll(geneDbxrefs); -// transcript.setXrefs(new ArrayList<>(exonDbxrefs)); transcript.getXrefs().addAll(exonDbxrefs); transcript.getXrefs().add(new Xref(transcript.getName(), "hgnc_symbol", "HGNC Symbol")); @@ -278,8 +292,9 @@ private void parseGene(Gtf gtf, String chromosome, RefSeqGeneBuilderIndexer inde geneDbxrefs = parseXrefs(gtf); } - private void parseExon(Gtf gtf, String chromosome, FastaIndex fastaIndex, RefSeqGeneBuilderIndexer indexer) throws RocksDBException { - String transcriptId = gtf.getAttributes().get("transcript_id"); + private void parseExon(Gtf gtf, String chromosome, FastaIndex fastaIndex, RefSeqGeneBuilderIndexer indexer) throws RocksDBException, + CellBaseException { + String transcriptId = gtf.getAttributes().get(TRANSCRIPT_ID); // new transcript if (!transcriptDict.containsKey(transcriptId)) { @@ -303,7 +318,7 @@ private void parseExon(Gtf gtf, String chromosome, FastaIndex fastaIndex, RefSeq if (fastaIndex != null) { exonSequence = fastaIndex.query(gtf.getSequenceName(), gtf.getStart(), gtf.getEnd()); } - String exonNumber = gtf.getAttributes().get("exon_number"); + String exonNumber = gtf.getAttributes().get(EXON_NUMBER); // RefSeq does not provide Exon IDs, we are using transcript ID and exon numbers String exonId = transcriptId + "_" + exonNumber; @@ -325,14 +340,14 @@ private void parseExon(Gtf gtf, String chromosome, FastaIndex fastaIndex, RefSeq } } - private void parseCDS(Gtf gtf, RefSeqGeneBuilderIndexer indexer) throws RocksDBException { - String exonNumber = gtf.getAttributes().get("exon_number"); + private void parseCDS(Gtf gtf, RefSeqGeneBuilderIndexer indexer) throws RocksDBException, CellBaseException { + String exonNumber = gtf.getAttributes().get(EXON_NUMBER); if (StringUtils.isEmpty(exonNumber)) { // this CDS doesn't know which exon it belongs to. 
skip return; } - transcript = transcriptDict.get(gtf.getAttributes().get("transcript_id")); + transcript = transcriptDict.get(gtf.getAttributes().get(TRANSCRIPT_ID)); String exonId = transcript.getId() + "_" + exonNumber; Exon exon = exonDict.get(exonId); @@ -458,12 +473,12 @@ private void parseCDS(Gtf gtf, RefSeqGeneBuilderIndexer indexer) throws RocksDBE } private void parseStopCodon(Gtf gtf) { - String exonNumber = gtf.getAttributes().get("exon_number"); + String exonNumber = gtf.getAttributes().get(EXON_NUMBER); if (StringUtils.isEmpty(exonNumber)) { // some codons don't have an exon number, discard return; } - Transcript transcript = transcriptDict.get(gtf.getAttributes().get("transcript_id")); + transcript = transcriptDict.get(gtf.getAttributes().get(TRANSCRIPT_ID)); String exonId = transcript.getId() + "_" + exonNumber; Exon exon = exonDict.get(exonId); @@ -564,14 +579,14 @@ private void parseStopCodon(Gtf gtf) { } } - private Set parseXrefs(Gtf gtf) { + private Set parseXrefs(Gtf gtf) throws CellBaseException { Set xrefSet = new HashSet<>(); String xrefs = gtf.getAttributes().get("db_xref"); if (StringUtils.isNotEmpty(xrefs)) { for (String xrefString : xrefs.split(",")) { String[] dbxrefParts = xrefString.split(":", 2); if (dbxrefParts.length != 2) { - throw new RuntimeException("Bad xref, expected colon: " + xrefString); + throw new CellBaseException("Error parsing Xrefs: bad xref, expected colon: " + xrefString); } String dbName = dbxrefParts[0].toLowerCase(); String id = dbxrefParts[1]; @@ -580,7 +595,7 @@ private Set parseXrefs(Gtf gtf) { dbName = "hgnc_id"; dbDisplayName = "HGNC ID"; } - if ("ensembl".equalsIgnoreCase(dbName)) { + if (ENSEMBL.equalsIgnoreCase(dbName)) { if (id.startsWith("ENST")) { dbName = "ensembl_transcript"; dbDisplayName = "Ensembl transcript"; @@ -601,7 +616,6 @@ private Transcript getTranscript(Gtf gtf, String chromosome, String transcriptId Map gtfAttributes = gtf.getAttributes(); String name = gene.getName(); -// String 
biotype = gtfAttributes.get("gbkey"); String biotype = gtfAttributes.get("transcript_biotype"); if ("mRNA".equals(biotype)) { biotype = "protein_coding"; @@ -612,7 +626,7 @@ private Transcript getTranscript(Gtf gtf, String chromosome, String transcriptId new ArrayList<>(), new ArrayList<>(), new ArrayList<>(), new HashSet<>(), new TranscriptAnnotation()); // Add MANE Select mappings, with this we can know which Ensembl and Refseq transcripts match according to MANE - for (String suffix: Arrays.asList("ensembl", "ensembl_protein")) { + for (String suffix: Arrays.asList(ENSEMBL, "ensembl_protein")) { String maneRefSeq = indexer.getMane(transcriptId, suffix); if (StringUtils.isNotEmpty(maneRefSeq)) { transcript.getXrefs().add(new Xref(maneRefSeq, "mane_select_" + suffix, @@ -621,7 +635,7 @@ private Transcript getTranscript(Gtf gtf, String chromosome, String transcriptId } // Add LRG mappings, with this we can know which Ensembl and Refseq transcripts match according to LRG - String lrgRefSeq = indexer.getLrg(transcriptId, "ensembl"); + String lrgRefSeq = indexer.getLrg(transcriptId, ENSEMBL); if (StringUtils.isNotEmpty(lrgRefSeq)) { transcript.getXrefs().add(new Xref(lrgRefSeq, "lrg_ensembl", "LRG Ensembl")); } @@ -637,15 +651,6 @@ private Transcript getTranscript(Gtf gtf, String chromosome, String transcriptId if (StringUtils.isNotEmpty(lrg)) { transcript.getFlags().add("LRG"); } - // 3. 
TSO500 and EGLH HaemOnc -// String tso500Flag = indexer.getTSO500(transcriptId.split("\\.")[0]); -// if (StringUtils.isNotEmpty(tso500Flag)) { -// transcript.getFlags().add(tso500Flag); -// } -// String eglhHaemOncFlag = indexer.getEGLHHaemOnc(transcriptId.split("\\.")[0]); -// if (StringUtils.isNotEmpty(eglhHaemOncFlag)) { -// transcript.getFlags().add(eglhHaemOncFlag); -// } gene.getTranscripts().add(transcript); @@ -654,7 +659,7 @@ private Transcript getTranscript(Gtf gtf, String chromosome, String transcriptId } private String getGeneId(Gtf gtf) throws CellBaseException { - // db_xref "GeneID:100287102"; + // Splitting the db_xref, e.g.: "GeneID:100287102" String xrefString = gtf.getAttributes().get("db_xref"); String[] xrefs = xrefString.split(","); for (String xref : xrefs) { @@ -667,11 +672,11 @@ private String getGeneId(Gtf gtf) throws CellBaseException { throw new CellBaseException("Didn't find geneId for db_xref:" + xrefString); } - private String getSequenceName(String fullSequenceName) { + private String getSequenceName(String fullSequenceName) throws CellBaseException { String[] sequenceNameParts = fullSequenceName.split("\\."); if (sequenceNameParts.length != 2) { - throw new RuntimeException("bad chromosome: " + fullSequenceName); + throw new CellBaseException("Invalid sequence name: bad chromosome: " + fullSequenceName); } // just get the first part, e.g. 
NC_000024.11 @@ -683,20 +688,6 @@ private String getSequenceName(String fullSequenceName) { return fullSequenceName; } -// private void setAnnotationFiles(Path refSeqDirectoryPath) { -// Path geneDirectoryPath = refSeqDirectoryPath.getParent().resolve("gene"); -// maneFile = geneDirectoryPath.resolve("MANE.GRCh38.v1.0.summary.txt.gz"); -// lrgFile = geneDirectoryPath.resolve("list_LRGs_transcripts_xrefs.txt"); -// geneDrugFile = geneDirectoryPath.resolve("dgidb.tsv"); -// disgenetFile = geneDirectoryPath.resolve("all_gene_disease_associations.tsv.gz"); -// hpoFile = geneDirectoryPath.resolve("phenotype_to_genes.txt"); -// cancerGeneCensus = geneDirectoryPath.resolve("cancer-gene-census.tsv"); -// cancerHotspot = geneDirectoryPath.resolve("hotspots_v2.xls"); -// tso500File = geneDirectoryPath.resolve("TSO500_transcripts.txt"); -// eglhHaemOncFile = geneDirectoryPath.resolve("EGLH_HaemOnc_transcripts.txt"); -// miRTarBaseFile = refSeqDirectoryPath.getParent().resolve("regulation/hsa_MTI.xlsx"); -// } - static { REFSEQ_CHROMOSOMES.put("NC_000001", "1"); REFSEQ_CHROMOSOMES.put("NC_000002", "2"); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java index 596c8b61c..6a4fe69fc 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/RefSeqGeneBuilderIndexer.java @@ -32,18 +32,16 @@ public RefSeqGeneBuilderIndexer(Path refSeqDirectoryPath) { } public void index(Path maneFile, Path lrgFile, Path proteinFastaFile, Path cDnaFastaFile, Path geneDrugFile, Path hpoFilePath, - Path disgenetFile, Path miRTarBaseFile, Path cancerGeneGensus, Path cancerHotspot) + Path miRTarBaseFile, Path cancerGeneGensus, Path cancerHotspot) throws IOException, RocksDBException, FileFormatException, CellBaseException { indexManeMapping(maneFile, 
REFSEQ_DATA); indexLrgMapping(lrgFile, REFSEQ_DATA); indexProteinSequences(proteinFastaFile); indexCdnaSequences(cDnaFastaFile); indexDrugs(geneDrugFile); - indexDiseases(hpoFilePath, disgenetFile); + indexDiseases(hpoFilePath); indexMiRTarBase(miRTarBaseFile); indexCancerGeneCensus(cancerGeneGensus); indexCancerHotspot(cancerHotspot); -// indexTSO500(tso500File); -// indexEGLHHaemOnc(eglhHaemOncFile); } }