From d10931d21f3c566b1e7fe1c85487084223b836a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Jul 2024 11:34:03 +0200 Subject: [PATCH] lib: improve genome and conservation downloaders by checking if data is already downloaded, and fix Sonar issues, #TASK-5575, #TASK-5576 --- .../lib/download/AbstractDownloadManager.java | 8 + .../download/ConservationDownloadManager.java | 147 +++++++++++------- .../lib/download/GenomeDownloadManager.java | 9 +- 3 files changed, 106 insertions(+), 58 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java index b2a098f7e..975e182cb 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/AbstractDownloadManager.java @@ -334,6 +334,14 @@ public static void writeDownloadLogFile(Path downloadFolder, List writer.writeValue(new File(downloadFolder + "/download_log.json"), downloadFiles); } + public boolean isAlreadyDownloaded(Path path, String dataName) { + if (Files.exists(path)) { + logger.info(DATA_ALREADY_DOWNLOADED, path.getFileName(), dataName); + return true; + } + return false; + } + private boolean validateDownloadFile(DownloadFile downloadFile, String outputFileName, String outputFileLog) { long expectedFileSize = getExpectedFileSize(outputFileLog); long actualFileSize = FileUtils.sizeOf(new File(outputFileName)); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ConservationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ConservationDownloadManager.java index f9a33b5c9..64be42ed7 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ConservationDownloadManager.java +++
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ConservationDownloadManager.java @@ -54,14 +54,26 @@ public List downloadConservation() throws IOException, Interrupted // Check if the species is supported if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), CONSERVATION_DATA)) { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); // Create folders Path conservationFolder = downloadFolder.resolve(CONSERVATION_DATA); Files.createDirectories(conservationFolder); - Files.createDirectories(conservationFolder.resolve(GERP_DATA)); - Files.createDirectories(conservationFolder.resolve(PHASTCONS_DATA)); - Files.createDirectories(conservationFolder.resolve(PHYLOP_DATA)); + Path gerpFolder = Files.createDirectories(conservationFolder.resolve(GERP_DATA)); + Path phastConsFolder = Files.createDirectories(conservationFolder.resolve(PHASTCONS_DATA)); + Path phyloPFolder = Files.createDirectories(conservationFolder.resolve(PHYLOP_DATA)); + + // Already downloaded ? 
+ boolean downloadGerp = !isAlreadyDownloaded(gerpFolder.resolve(getDataVersionFilename(GERP_DATA)), getDataName(GERP_DATA)); + boolean downloadPhastCons = !isAlreadyDownloaded(phastConsFolder.resolve(getDataVersionFilename(PHASTCONS_DATA)), + getDataName(PHASTCONS_DATA)); + boolean downloadPhyloP = !isAlreadyDownloaded(phyloPFolder.resolve(getDataVersionFilename(PHYLOP_DATA)), + getDataName(PHYLOP_DATA)); + + if (!downloadGerp && !downloadPhastCons && !downloadPhyloP) { + return new ArrayList<>(); + } + + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); // Download data String filename; @@ -80,30 +92,39 @@ public List downloadConservation() throws IOException, Interrupted String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "M"}; for (String chromosome : chromosomes) { - logger.info(DOWNLOADING_LOG_MESSAGE, "phastConst " + chromosome); - String phastConsUrl = phastconsHost + configuration.getDownload().getPhastCons().getFiles().get(PHASTCONS_FILE_ID) - + "chr" + chromosome + ".phastCons470way.wigFix.gz"; - filename = Paths.get(phastConsUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(PHASTCONS_DATA).resolve(filename); - downloadFiles.add(downloadFile(phastConsUrl, outputPath.toString())); - phastconsUrls.add(phastConsUrl); - - logger.info(DOWNLOADING_LOG_MESSAGE, "phyloP " + chromosome); - String phyloPUrl = phylopHost + configuration.getDownload().getPhylop().getFiles().get(PHYLOP_FILE_ID) - + "chr" + chromosome + ".phyloP470way.wigFix.gz"; - filename = Paths.get(phyloPUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(PHYLOP_DATA).resolve(filename); - downloadFiles.add(downloadFile(phyloPUrl, outputPath.toString())); - phyloPUrls.add(phyloPUrl); + if (downloadPhastCons) { + logger.info(DOWNLOADING_LOG_MESSAGE, getChromDownloadMessage(getDataName(PHASTCONS_DATA), chromosome)); + String 
phastConsUrl = phastconsHost + configuration.getDownload().getPhastCons().getFiles().get(PHASTCONS_FILE_ID) + + "chr" + chromosome + ".phastCons470way.wigFix.gz"; + filename = Paths.get(phastConsUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(PHASTCONS_DATA).resolve(filename); + downloadFiles.add(downloadFile(phastConsUrl, outputPath.toString())); + phastconsUrls.add(phastConsUrl); + logger.info(OK_LOG_MESSAGE); + } + + if (downloadPhyloP) { + logger.info(DOWNLOADING_LOG_MESSAGE, getChromDownloadMessage(getDataName(PHYLOP_DATA), chromosome)); + String phyloPUrl = phylopHost + configuration.getDownload().getPhylop().getFiles().get(PHYLOP_FILE_ID) + + "chr" + chromosome + ".phyloP470way.wigFix.gz"; + filename = Paths.get(phyloPUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(PHYLOP_DATA).resolve(filename); + downloadFiles.add(downloadFile(phyloPUrl, outputPath.toString())); + phyloPUrls.add(phyloPUrl); + logger.info(OK_LOG_MESSAGE); + } } // 2. 
Gerp - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GERP_DATA)); - gerpUrl = configuration.getDownload().getGerp().getHost() - + configuration.getDownload().getGerp().getFiles().get(GERP_FILE_ID); - filename = Paths.get(gerpUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(GERP_DATA).resolve(filename); - downloadFiles.add(downloadFile(gerpUrl, outputPath.toString())); + if (downloadGerp) { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GERP_DATA)); + gerpUrl = configuration.getDownload().getGerp().getHost() + + configuration.getDownload().getGerp().getFiles().get(GERP_FILE_ID); + filename = Paths.get(gerpUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(GERP_DATA).resolve(filename); + downloadFiles.add(downloadFile(gerpUrl, outputPath.toString())); + logger.info(OK_LOG_MESSAGE); + } } // Mouse @@ -114,43 +135,63 @@ public List downloadConservation() throws IOException, Interrupted String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "X", "Y", "M"}; for (String chromosome : chromosomes) { - logger.info(DOWNLOADING_LOG_MESSAGE, "phastConst " + chromosome); - String phastConsUrl = phastconsHost - + configuration.getDownload().getPhastCons().getFiles().get(prefixId + PHASTCONS_FILE_ID) - + "chr" + chromosome + ".phastCons35way.wigFix.gz"; - filename = Paths.get(phastConsUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(PHASTCONS_DATA).resolve(filename); - downloadFiles.add(downloadFile(phastConsUrl, outputPath.toString())); - phastconsUrls.add(phastConsUrl); - - logger.info(DOWNLOADING_LOG_MESSAGE, "phyloP " + chromosome); - String phyloPUrl = phylopHost + configuration.getDownload().getPhylop().getFiles().get(prefixId + PHYLOP_FILE_ID) - + "chr" + chromosome + ".phyloP35way.wigFix.gz"; - filename = Paths.get(phyloPUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(PHYLOP_DATA).resolve(filename); 
- downloadFiles.add(downloadFile(phyloPUrl, outputPath.toString())); - phyloPUrls.add(phyloPUrl); + if (downloadPhastCons) { + logger.info(DOWNLOADING_LOG_MESSAGE, getChromDownloadMessage(getDataName(PHASTCONS_DATA), chromosome)); + String phastConsUrl = phastconsHost + + configuration.getDownload().getPhastCons().getFiles().get(prefixId + PHASTCONS_FILE_ID) + + "chr" + chromosome + ".phastCons35way.wigFix.gz"; + filename = Paths.get(phastConsUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(PHASTCONS_DATA).resolve(filename); + downloadFiles.add(downloadFile(phastConsUrl, outputPath.toString())); + phastconsUrls.add(phastConsUrl); + logger.info(OK_LOG_MESSAGE); + } + + if (downloadPhyloP) { + logger.info(DOWNLOADING_LOG_MESSAGE, getChromDownloadMessage(getDataName(PHYLOP_DATA), chromosome)); + String phyloPUrl = phylopHost + configuration.getDownload().getPhylop().getFiles().get(prefixId + PHYLOP_FILE_ID) + + "chr" + chromosome + ".phyloP35way.wigFix.gz"; + filename = Paths.get(phyloPUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(PHYLOP_DATA).resolve(filename); + downloadFiles.add(downloadFile(phyloPUrl, outputPath.toString())); + phyloPUrls.add(phyloPUrl); + logger.info(OK_LOG_MESSAGE); + } } // 2. 
Gerp - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GERP_DATA)); - gerpUrl = configuration.getDownload().getGerp().getHost() - + configuration.getDownload().getGerp().getFiles().get(prefixId + GERP_FILE_ID); - filename = Paths.get(gerpUrl).getFileName().toString(); - outputPath = conservationFolder.resolve(GERP_DATA).resolve(filename); - downloadFiles.add(downloadFile(gerpUrl, outputPath.toString())); + if (downloadGerp) { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GERP_DATA)); + gerpUrl = configuration.getDownload().getGerp().getHost() + + configuration.getDownload().getGerp().getFiles().get(prefixId + GERP_FILE_ID); + filename = Paths.get(gerpUrl).getFileName().toString(); + outputPath = conservationFolder.resolve(GERP_DATA).resolve(filename); + downloadFiles.add(downloadFile(gerpUrl, outputPath.toString())); + logger.info(OK_LOG_MESSAGE); + } } // Save data version - saveDataSource(PHASTCONS_DATA, configuration.getDownload().getPhastCons().getVersion(), getTimeStamp(), phastconsUrls, - conservationFolder.resolve(getDataVersionFilename(PHASTCONS_DATA))); - saveDataSource(PHYLOP_DATA, configuration.getDownload().getPhylop().getVersion(), getTimeStamp(), phyloPUrls, - conservationFolder.resolve(getDataVersionFilename(PHYLOP_DATA))); - saveDataSource(GERP_DATA, configuration.getDownload().getGerp().getVersion(), getTimeStamp(), - Collections.singletonList(gerpUrl), conservationFolder.resolve(getDataVersionFilename(GERP_DATA))); + if (downloadPhastCons) { + saveDataSource(PHASTCONS_DATA, configuration.getDownload().getPhastCons().getVersion(), getTimeStamp(), phastconsUrls, + phastConsFolder.resolve(getDataVersionFilename(PHASTCONS_DATA))); + } + if (downloadPhyloP) { + saveDataSource(PHYLOP_DATA, configuration.getDownload().getPhylop().getVersion(), getTimeStamp(), phyloPUrls, + phyloPFolder.resolve(getDataVersionFilename(PHYLOP_DATA))); + } + if (downloadGerp) { + saveDataSource(GERP_DATA, configuration.getDownload().getGerp().getVersion(), 
getTimeStamp(), + Collections.singletonList(gerpUrl), gerpFolder.resolve(getDataVersionFilename(GERP_DATA))); + } + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); } return downloadFiles; } + private String getChromDownloadMessage(String dataName, String chromosome) { + return dataName + ", chrom. " + chromosome; + } + } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index fa3741172..e5ad0d882 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -51,8 +51,8 @@ public List download() throws IOException, InterruptedException, C public List downloadReferenceGenome() throws IOException, InterruptedException, CellBaseException { Path genomeVersionFilePath = sequenceFolder.resolve(getDataVersionFilename(GENOME_DATA)); - if (Files.exists(genomeVersionFilePath)) { - logger.info(DATA_ALREADY_DOWNLOADED, genomeVersionFilePath.getFileName(), getDataName(GENOME_DATA)); + // Already downloaded + if (isAlreadyDownloaded(genomeVersionFilePath, getDataName(GENOME_DATA))) { return new ArrayList<>(); } @@ -76,8 +76,8 @@ public List downloadReferenceGenome() throws IOException, Interrup public void downloadGenomeInfo() throws IOException, CellBaseException { String genomeInfoFilename = "genome_info.json"; - if (Files.exists(sequenceFolder.resolve(genomeInfoFilename))) { - logger.info(DATA_ALREADY_DOWNLOADED, genomeInfoFilename, getDataName(GENOME_INFO_DATA)); + // Already downloaded + if (isAlreadyDownloaded(sequenceFolder.resolve(genomeInfoFilename), getDataName(GENOME_INFO_DATA))) { return; } @@ -105,5 +105,4 @@ public void downloadGenomeInfo() throws IOException, CellBaseException { logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENOME_INFO_DATA)); } - }