diff --git a/cellbase-app/app/scripts/ensembl-scripts/gene_extra_info.pl b/cellbase-app/app/scripts/ensembl-scripts/gene_extra_info.pl index d227b4c1c..22b6a825b 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/gene_extra_info.pl +++ b/cellbase-app/app/scripts/ensembl-scripts/gene_extra_info.pl @@ -18,7 +18,7 @@ #################################################################### ##docker run -it --mount type=bind,source=/tmp,target=/tmp opencb/cellbase-builder:6.2.0-SNAPSHOT /opt/cellbase/scripts/ensembl-scripts/gene_extra_info.pl -s "Mus musculus" -o /tmp -# USAGE: ./gene_extra_info.pl --species "Homo sapiens" --outdir ../../appl_db/ird_v1/hsa ... +# USAGE: ./gene_extra_info.pl --species "Homo sapiens" --assembly "GRCh38" --outdir ../../appl_db/ird_v1/hsa ... ## Parsing command line GetOptions ('species=s' => \$species, 'assembly=s' => \$assembly, 'outdir=s' => \$outdir, 'phylo=s' => \$phylo, diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 356c637ca..faa3e3b2f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -26,7 +26,10 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; import static org.opencb.cellbase.lib.EtlCommons.*; @@ -75,7 +78,6 @@ public List download() throws IOException, InterruptedException, C downloadFiles.add(downloadDrugData(geneDownloadPath)); downloadFiles.add(downloadGeneUniprotXref(geneDownloadPath)); downloadFiles.add(downloadGeneExpressionAtlas(geneDownloadPath)); - downloadFiles.add(downloadGeneDiseaseAnnotation(geneDownloadPath)); downloadFiles.add(downloadGnomadConstraints(geneDownloadPath)); downloadFiles.add(downloadGO(geneDownloadPath)); logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_ANNOTATION_DATA)); @@ -83,18 +85,26 @@ public List download() throws IOException, InterruptedException, C // Save data sources manually downloaded if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { // HPO - saveDataSource(HPO_DISEASE_DATA, configuration.getDownload().getHpo().getVersion(), getTimeStamp(), - Collections.singletonList(getManualUrl(configuration.getDownload().getHpo(), HPO_FILE_ID)), - geneDownloadPath.resolve(getDataVersionFilename(HPO_DISEASE_DATA))); - logger.warn("{} must be downloaded manually; the version file {} was created at {}", getDataName(HPO_DISEASE_DATA), - getDataVersionFilename(HPO_DISEASE_DATA), geneDownloadPath); + if (Files.exists(geneDownloadPath.resolve(getDataVersionFilename(HPO_DISEASE_DATA)))) { + logger.warn("The version file {} already exists", getDataVersionFilename(HPO_DISEASE_DATA)); + } else { + saveDataSource(HPO_DISEASE_DATA, configuration.getDownload().getHpo().getVersion(), getTimeStamp(), + Collections.singletonList(getManualUrl(configuration.getDownload().getHpo(), HPO_FILE_ID)), + geneDownloadPath.resolve(getDataVersionFilename(HPO_DISEASE_DATA))); + logger.warn("{} must be downloaded manually; the version file {} was created at {}", getDataName(HPO_DISEASE_DATA), + getDataVersionFilename(HPO_DISEASE_DATA), geneDownloadPath); + } // Cancer gene census - saveDataSource(CANCER_GENE_CENSUS_DATA, configuration.getDownload().getCancerGeneCensus().getVersion(), getTimeStamp(), - Collections.singletonList(getManualUrl(configuration.getDownload().getCancerGeneCensus(), CANCER_GENE_CENSUS_FILE_ID)), - geneDownloadPath.resolve(getDataVersionFilename(CANCER_GENE_CENSUS_DATA))); - logger.warn("{} must be downloaded manually; the version file {} was created at {}", getDataName(CANCER_GENE_CENSUS_DATA), - getDataVersionFilename(CANCER_GENE_CENSUS_DATA), geneDownloadPath); + if (Files.exists(geneDownloadPath.resolve(getDataVersionFilename(CANCER_GENE_CENSUS_DATA)))) { + logger.warn("The version file {} already exists", getDataVersionFilename(CANCER_GENE_CENSUS_DATA)); + } else { + saveDataSource(CANCER_GENE_CENSUS_DATA, configuration.getDownload().getCancerGeneCensus().getVersion(), getTimeStamp(), + Collections.singletonList(getManualUrl(configuration.getDownload().getCancerGeneCensus(), + CANCER_GENE_CENSUS_FILE_ID)), geneDownloadPath.resolve(getDataVersionFilename(CANCER_GENE_CENSUS_DATA))); + logger.warn("{} must be downloaded manually; the version file {} was created at {}", getDataName(CANCER_GENE_CENSUS_DATA), + getDataVersionFilename(CANCER_GENE_CENSUS_DATA), geneDownloadPath); + } } logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_DATA)); @@ -106,6 +116,11 @@ private List downloadEnsemblData(Path ensemblDownloadPath) throws // Check if the species is supported if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), GENE_DATA)) { + // Already downloaded ? + if (isAlreadyDownloaded(ensemblDownloadPath.resolve(getDataVersionFilename(ENSEMBL_DATA)), getDataName(ENSEMBL_DATA))) { + return downloadFiles; + } + logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataName(ENSEMBL_DATA), getDataCategory(ENSEMBL_DATA)); DownloadProperties.EnsemblProperties ensemblConfig = configuration.getDownload().getEnsembl(); @@ -135,7 +150,8 @@ private List downloadRefSeq(Path refSeqDownloadPath) throws IOExce if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), GENE_DATA)) { // GTF, DNA, RNA, Protein String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); - if (configuration.getDownload().getGenomicSuperDups().getFiles().containsKey(prefixId + REFSEQ_GENOMIC_GTF_FILE_ID)) { + if (configuration.getDownload().getGenomicSuperDups().getFiles().containsKey(prefixId + REFSEQ_GENOMIC_GTF_FILE_ID) + && !isAlreadyDownloaded(refSeqDownloadPath.resolve(getDataVersionFilename(REFSEQ_DATA)), getDataName(REFSEQ_DATA))) { logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataName(REFSEQ_DATA), getDataCategory(REFSEQ_DATA)); DownloadProperties.URLProperties refSeqConfig = configuration.getDownload().getRefSeq(); @@ -155,6 +171,14 @@ private List downloadRefSeq(Path refSeqDownloadPath) throws IOExce } public void downloadEnsemblCanonical(Path geneDownloadPath) throws IOException, CellBaseException { + String ensemblCanonicalScript = "ensembl_canonical.pl"; + String ensemblCanonicalFilename = "ensembl_canonical.txt"; + + if (Files.exists(geneDownloadPath.resolve(ensemblCanonicalFilename))) { + logger.warn("File {} already exists, skipping running the Perl script {}", ensemblCanonicalFilename, ensemblCanonicalScript); + return; + } + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(ENSEMBL_CANONICAL_DATA)); String dockerImage = "opencb/cellbase-builder:" + GitRepositoryState.get().getBuildVersion(); @@ -165,7 +189,7 @@ public void downloadEnsemblCanonical(Path geneDownloadPath) throws IOException, geneDownloadPath.toAbsolutePath().toString(), "/tmp"); // Params - String params = "/opt/cellbase/scripts/ensembl-scripts/ensembl_canonical.pl" + String params = "/opt/cellbase/scripts/ensembl-scripts/" + ensemblCanonicalScript + " --species \"" + speciesConfiguration.getId() + "\"" + " --assembly \"" + assemblyConfiguration.getName() + "\"" + " --outdir \"" + outputBinding.getValue() + "\""; @@ -173,13 +197,24 @@ public void downloadEnsemblCanonical(Path geneDownloadPath) throws IOException, // Execute perl script in docker DockerUtils.run(dockerImage, null, outputBinding, params, null); } catch (Exception e) { - throw new CellBaseException("Error executing Perl script from Docker " + dockerImage, e); + logger.error("{}", e.getStackTrace()); +// throw new CellBaseException("Error executing Perl script from Docker " + dockerImage, e); } logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(ENSEMBL_CANONICAL_DATA)); } public void downloadGeneExtraInfo(Path geneDownloadPath) throws IOException, CellBaseException { + String geneExtraInfoScript = "gene_extra_info.pl"; + String descriptionFilename = "description.txt"; + String xrefsFilename = "xrefs.txt"; + + if (Files.exists(geneDownloadPath.resolve(descriptionFilename)) && Files.exists(geneDownloadPath.resolve(xrefsFilename))) { + logger.warn("Files {} and {} already exist, skipping running the Perl script {}", descriptionFilename, xrefsFilename, + geneExtraInfoScript); + return; + } + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_EXTRA_INFO_DATA)); String dockerImage = "opencb/cellbase-builder:" + GitRepositoryState.get().getBuildVersion(); @@ -190,14 +225,16 @@ public void downloadGeneExtraInfo(Path geneDownloadPath) throws IOException, Cel geneDownloadPath.toAbsolutePath().toString(), "/tmp"); // Params - String params = "/opt/cellbase/scripts/ensembl-scripts/gene_extra_info.pl" - + " --species \"" + speciesConfiguration.getId() + "\"" + String params = "/opt/cellbase/scripts/ensembl-scripts/" + geneExtraInfoScript + + " --species \"" + speciesConfiguration.getScientificName() + "\"" + + " --assembly \"" + assemblyConfiguration.getName() + "\"" + " --outdir \"" + outputBinding.getValue() + "\""; // Execute perl script in docker DockerUtils.run(dockerImage, null, outputBinding, params, null); } catch (Exception e) { - throw new CellBaseException("Error executing Perl script from Docker " + dockerImage, e); + logger.error("{}", e.getStackTrace()); +// throw new CellBaseException("Error executing Perl script from Docker " + dockerImage, e); } logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_EXTRA_INFO_DATA)); @@ -207,7 +244,9 @@ private DownloadFile downloadMane(Path geneDownloadPath) throws IOException, Int DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME) + && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(MANE_SELECT_DATA)), + getDataName(MANE_SELECT_DATA))) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MANE_SELECT_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getManeSelect(), MANE_SELECT_FILE_ID, @@ -236,7 +275,8 @@ private DownloadFile downloadHgnc(Path geneDownloadPath) throws IOException, Int DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME) + && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(HGNC_DATA)), getDataName(HGNC_DATA))) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(HGNC_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getHgnc(), HGNC_FILE_ID, HGNC_DATA, geneDownloadPath); @@ -250,7 +290,9 @@ private DownloadFile downloadCancerHotspot(Path geneDownloadPath) throws IOExcep DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME) + && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(CANCER_HOTSPOT_DATA)), + getDataName(CANCER_HOTSPOT_DATA))) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CANCER_HOTSPOT_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCancerHotspot(), CANCER_HOTSPOT_FILE_ID, @@ -265,7 +307,8 @@ private DownloadFile downloadDrugData(Path geneDownloadPath) throws IOException, DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME) + && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(DGIDB_DATA)), getDataName(DGIDB_DATA))) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(DGIDB_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDgidb(), DGIDB_FILE_ID, DGIDB_DATA, geneDownloadPath); @@ -280,7 +323,9 @@ private DownloadFile downloadGeneUniprotXref(Path geneDownloadPath) throws IOExc // Check if the species is supported String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); - if (configuration.getDownload().getGeneUniprotXref().getFiles().containsKey(prefixId + UNIPROT_XREF_FILE_ID)) { + if (configuration.getDownload().getGeneUniprotXref().getFiles().containsKey(prefixId + UNIPROT_XREF_FILE_ID) + && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(UNIPROT_XREF_DATA)), + getDataName(UNIPROT_XREF_DATA))) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(UNIPROT_XREF_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGeneUniprotXref(), @@ -295,7 +340,9 @@ private DownloadFile downloadGeneExpressionAtlas(Path geneDownloadPath) throws I DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME) + && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(GENE_EXPRESSION_ATLAS_DATA)), + getDataName(GENE_EXPRESSION_ATLAS_DATA))) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_EXPRESSION_ATLAS_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGeneExpressionAtlas(), @@ -306,27 +353,13 @@ private DownloadFile downloadGeneExpressionAtlas(Path geneDownloadPath) throws I return downloadFile; } - private DownloadFile downloadGeneDiseaseAnnotation(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { - DownloadFile downloadFile = null; - - // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_DISEASE_ANNOTATION_DATA)); - - // DisGeNet - downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDisgenet(), - DISGENET_FILE_ID, DISGENET_DATA, geneDownloadPath); - - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_DISEASE_ANNOTATION_DATA)); - } - return downloadFile; - } - private DownloadFile downloadGnomadConstraints(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException { DownloadFile downloadFile = null; // Check if the species is supported - if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) { + if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME) + && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(GNOMAD_CONSTRAINTS_DATA)), + getDataName(GNOMAD_CONSTRAINTS_DATA))) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GNOMAD_CONSTRAINTS_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGnomadConstraints(), @@ -342,7 +375,9 @@ private DownloadFile downloadGO(Path geneDownloadPath) throws IOException, Inter // Check if the species is supported String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName()); - if (configuration.getDownload().getGoAnnotation().getFiles().containsKey(prefixId + GO_ANNOTATION_FILE_ID)) { + if (configuration.getDownload().getGoAnnotation().getFiles().containsKey(prefixId + GO_ANNOTATION_FILE_ID) + && !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(GO_ANNOTATION_DATA)), + getDataName(GO_ANNOTATION_DATA))) { logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GO_ANNOTATION_DATA)); downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGoAnnotation(),