Skip to content

Commit

Permalink
lib: improve genome and conservation downloaders by checking if data …
Browse files Browse the repository at this point in the history
…is already downloaded, and fix Sonar issues, #TASK-5575, #TASK-5576
  • Loading branch information
jtarraga committed Jul 24, 2024
1 parent 1d171d5 commit d10931d
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 58 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,14 @@ public static void writeDownloadLogFile(Path downloadFolder, List<DownloadFile>
writer.writeValue(new File(downloadFolder + "/download_log.json"), downloadFiles);
}

/**
 * Tells whether the given file is already present on disk, logging a notice when it is.
 *
 * @param path     File whose existence is checked
 * @param dataName Data name reported in the log message
 * @return {@code true} if {@code path} exists; {@code false} otherwise
 */
public boolean isAlreadyDownloaded(Path path, String dataName) {
    boolean found = Files.exists(path);
    if (found) {
        // Only log when the download is going to be skipped
        logger.info(DATA_ALREADY_DOWNLOADED, path.getFileName(), dataName);
    }
    return found;
}

private boolean validateDownloadFile(DownloadFile downloadFile, String outputFileName, String outputFileLog) {
long expectedFileSize = getExpectedFileSize(outputFileLog);
long actualFileSize = FileUtils.sizeOf(new File(outputFileName));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,26 @@ public List<DownloadFile> downloadConservation() throws IOException, Interrupted

// Check if the species is supported
if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), CONSERVATION_DATA)) {
logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CONSERVATION_DATA));

// Create folders
Path conservationFolder = downloadFolder.resolve(CONSERVATION_DATA);
Files.createDirectories(conservationFolder);
Files.createDirectories(conservationFolder.resolve(GERP_DATA));
Files.createDirectories(conservationFolder.resolve(PHASTCONS_DATA));
Files.createDirectories(conservationFolder.resolve(PHYLOP_DATA));
Path gerpFolder = Files.createDirectories(conservationFolder.resolve(GERP_DATA));
Path phastConsFolder = Files.createDirectories(conservationFolder.resolve(PHASTCONS_DATA));
Path phyloPFolder = Files.createDirectories(conservationFolder.resolve(PHYLOP_DATA));

// Already downloaded ?
boolean downloadGerp = !isAlreadyDownloaded(gerpFolder.resolve(getDataVersionFilename(GERP_DATA)), getDataName(GERP_DATA));
boolean downloadPhastCons = !isAlreadyDownloaded(phastConsFolder.resolve(getDataVersionFilename(PHASTCONS_DATA)),
getDataName(PHASTCONS_DATA));
boolean downloadPhyloP = !isAlreadyDownloaded(phyloPFolder.resolve(getDataVersionFilename(PHYLOP_DATA)),
getDataName(PHYLOP_DATA));

if (!downloadGerp && !downloadPhastCons && !downloadPhyloP) {
return new ArrayList<>();
}

logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CONSERVATION_DATA));

// Download data
String filename;
Expand All @@ -80,30 +92,39 @@ public List<DownloadFile> downloadConservation() throws IOException, Interrupted
String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
"15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "M"};
for (String chromosome : chromosomes) {
logger.info(DOWNLOADING_LOG_MESSAGE, "phastConst " + chromosome);
String phastConsUrl = phastconsHost + configuration.getDownload().getPhastCons().getFiles().get(PHASTCONS_FILE_ID)
+ "chr" + chromosome + ".phastCons470way.wigFix.gz";
filename = Paths.get(phastConsUrl).getFileName().toString();
outputPath = conservationFolder.resolve(PHASTCONS_DATA).resolve(filename);
downloadFiles.add(downloadFile(phastConsUrl, outputPath.toString()));
phastconsUrls.add(phastConsUrl);

logger.info(DOWNLOADING_LOG_MESSAGE, "phyloP " + chromosome);
String phyloPUrl = phylopHost + configuration.getDownload().getPhylop().getFiles().get(PHYLOP_FILE_ID)
+ "chr" + chromosome + ".phyloP470way.wigFix.gz";
filename = Paths.get(phyloPUrl).getFileName().toString();
outputPath = conservationFolder.resolve(PHYLOP_DATA).resolve(filename);
downloadFiles.add(downloadFile(phyloPUrl, outputPath.toString()));
phyloPUrls.add(phyloPUrl);
if (downloadPhastCons) {
logger.info(DOWNLOADING_LOG_MESSAGE, getChromDownloadMessage(getDataName(PHASTCONS_DATA), chromosome));
String phastConsUrl = phastconsHost + configuration.getDownload().getPhastCons().getFiles().get(PHASTCONS_FILE_ID)
+ "chr" + chromosome + ".phastCons470way.wigFix.gz";
filename = Paths.get(phastConsUrl).getFileName().toString();
outputPath = conservationFolder.resolve(PHASTCONS_DATA).resolve(filename);
downloadFiles.add(downloadFile(phastConsUrl, outputPath.toString()));
phastconsUrls.add(phastConsUrl);
logger.info(OK_LOG_MESSAGE);
}

if (downloadPhyloP) {
logger.info(DOWNLOADING_LOG_MESSAGE, getChromDownloadMessage(getDataName(PHYLOP_DATA), chromosome));
String phyloPUrl = phylopHost + configuration.getDownload().getPhylop().getFiles().get(PHYLOP_FILE_ID)
+ "chr" + chromosome + ".phyloP470way.wigFix.gz";
filename = Paths.get(phyloPUrl).getFileName().toString();
outputPath = conservationFolder.resolve(PHYLOP_DATA).resolve(filename);
downloadFiles.add(downloadFile(phyloPUrl, outputPath.toString()));
phyloPUrls.add(phyloPUrl);
logger.info(OK_LOG_MESSAGE);
}
}

// 2. Gerp
logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GERP_DATA));
gerpUrl = configuration.getDownload().getGerp().getHost()
+ configuration.getDownload().getGerp().getFiles().get(GERP_FILE_ID);
filename = Paths.get(gerpUrl).getFileName().toString();
outputPath = conservationFolder.resolve(GERP_DATA).resolve(filename);
downloadFiles.add(downloadFile(gerpUrl, outputPath.toString()));
if (downloadGerp) {
logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GERP_DATA));
gerpUrl = configuration.getDownload().getGerp().getHost()
+ configuration.getDownload().getGerp().getFiles().get(GERP_FILE_ID);
filename = Paths.get(gerpUrl).getFileName().toString();
outputPath = conservationFolder.resolve(GERP_DATA).resolve(filename);
downloadFiles.add(downloadFile(gerpUrl, outputPath.toString()));
logger.info(OK_LOG_MESSAGE);
}
}

// Mouse
Expand All @@ -114,43 +135,63 @@ public List<DownloadFile> downloadConservation() throws IOException, Interrupted
String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
"15", "16", "17", "18", "19", "X", "Y", "M"};
for (String chromosome : chromosomes) {
logger.info(DOWNLOADING_LOG_MESSAGE, "phastConst " + chromosome);
String phastConsUrl = phastconsHost
+ configuration.getDownload().getPhastCons().getFiles().get(prefixId + PHASTCONS_FILE_ID)
+ "chr" + chromosome + ".phastCons35way.wigFix.gz";
filename = Paths.get(phastConsUrl).getFileName().toString();
outputPath = conservationFolder.resolve(PHASTCONS_DATA).resolve(filename);
downloadFiles.add(downloadFile(phastConsUrl, outputPath.toString()));
phastconsUrls.add(phastConsUrl);

logger.info(DOWNLOADING_LOG_MESSAGE, "phyloP " + chromosome);
String phyloPUrl = phylopHost + configuration.getDownload().getPhylop().getFiles().get(prefixId + PHYLOP_FILE_ID)
+ "chr" + chromosome + ".phyloP35way.wigFix.gz";
filename = Paths.get(phyloPUrl).getFileName().toString();
outputPath = conservationFolder.resolve(PHYLOP_DATA).resolve(filename);
downloadFiles.add(downloadFile(phyloPUrl, outputPath.toString()));
phyloPUrls.add(phyloPUrl);
if (downloadPhastCons) {
logger.info(DOWNLOADING_LOG_MESSAGE, getChromDownloadMessage(getDataName(PHASTCONS_DATA), chromosome));
String phastConsUrl = phastconsHost
+ configuration.getDownload().getPhastCons().getFiles().get(prefixId + PHASTCONS_FILE_ID)
+ "chr" + chromosome + ".phastCons35way.wigFix.gz";
filename = Paths.get(phastConsUrl).getFileName().toString();
outputPath = conservationFolder.resolve(PHASTCONS_DATA).resolve(filename);
downloadFiles.add(downloadFile(phastConsUrl, outputPath.toString()));
phastconsUrls.add(phastConsUrl);
logger.info(OK_LOG_MESSAGE);
}

if (downloadPhyloP) {
logger.info(DOWNLOADING_LOG_MESSAGE, getChromDownloadMessage(getDataName(PHYLOP_DATA), chromosome));
String phyloPUrl = phylopHost + configuration.getDownload().getPhylop().getFiles().get(prefixId + PHYLOP_FILE_ID)
+ "chr" + chromosome + ".phyloP35way.wigFix.gz";
filename = Paths.get(phyloPUrl).getFileName().toString();
outputPath = conservationFolder.resolve(PHYLOP_DATA).resolve(filename);
downloadFiles.add(downloadFile(phyloPUrl, outputPath.toString()));
phyloPUrls.add(phyloPUrl);
logger.info(OK_LOG_MESSAGE);
}
}

// 2. Gerp
logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GERP_DATA));
gerpUrl = configuration.getDownload().getGerp().getHost()
+ configuration.getDownload().getGerp().getFiles().get(prefixId + GERP_FILE_ID);
filename = Paths.get(gerpUrl).getFileName().toString();
outputPath = conservationFolder.resolve(GERP_DATA).resolve(filename);
downloadFiles.add(downloadFile(gerpUrl, outputPath.toString()));
if (downloadGerp) {
logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GERP_DATA));
gerpUrl = configuration.getDownload().getGerp().getHost()
+ configuration.getDownload().getGerp().getFiles().get(prefixId + GERP_FILE_ID);
filename = Paths.get(gerpUrl).getFileName().toString();
outputPath = conservationFolder.resolve(GERP_DATA).resolve(filename);
downloadFiles.add(downloadFile(gerpUrl, outputPath.toString()));
logger.info(OK_LOG_MESSAGE);
}
}

// Save data version
saveDataSource(PHASTCONS_DATA, configuration.getDownload().getPhastCons().getVersion(), getTimeStamp(), phastconsUrls,
conservationFolder.resolve(getDataVersionFilename(PHASTCONS_DATA)));
saveDataSource(PHYLOP_DATA, configuration.getDownload().getPhylop().getVersion(), getTimeStamp(), phyloPUrls,
conservationFolder.resolve(getDataVersionFilename(PHYLOP_DATA)));
saveDataSource(GERP_DATA, configuration.getDownload().getGerp().getVersion(), getTimeStamp(),
Collections.singletonList(gerpUrl), conservationFolder.resolve(getDataVersionFilename(GERP_DATA)));
if (downloadPhastCons) {
saveDataSource(PHASTCONS_DATA, configuration.getDownload().getPhastCons().getVersion(), getTimeStamp(), phastconsUrls,
phastConsFolder.resolve(getDataVersionFilename(PHASTCONS_DATA)));
}
if (downloadPhyloP) {
saveDataSource(PHYLOP_DATA, configuration.getDownload().getPhylop().getVersion(), getTimeStamp(), phyloPUrls,
phyloPFolder.resolve(getDataVersionFilename(PHYLOP_DATA)));
}
if (downloadGerp) {
saveDataSource(GERP_DATA, configuration.getDownload().getGerp().getVersion(), getTimeStamp(),
Collections.singletonList(gerpUrl), gerpFolder.resolve(getDataVersionFilename(GERP_DATA)));
}

logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(CONSERVATION_DATA));
}
return downloadFiles;
}

/**
 * Builds the label used in download log messages for a single chromosome.
 *
 * @param dataName   Data name placed first in the label
 * @param chromosome Chromosome identifier placed after the ", chrom. " separator
 * @return The text {@code <dataName>, chrom. <chromosome>}
 */
private String getChromDownloadMessage(String dataName, String chromosome) {
    final String separator = ", chrom. ";
    return String.join(separator, dataName, chromosome);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ public List<DownloadFile> download() throws IOException, InterruptedException, C
public List<DownloadFile> downloadReferenceGenome() throws IOException, InterruptedException, CellBaseException {
Path genomeVersionFilePath = sequenceFolder.resolve(getDataVersionFilename(GENOME_DATA));

if (Files.exists(genomeVersionFilePath)) {
logger.info(DATA_ALREADY_DOWNLOADED, genomeVersionFilePath.getFileName(), getDataName(GENOME_DATA));
// Already downloaded
if (isAlreadyDownloaded(genomeVersionFilePath, getDataName(GENOME_DATA))) {
return new ArrayList<>();
}

Expand All @@ -76,8 +76,8 @@ public List<DownloadFile> downloadReferenceGenome() throws IOException, Interrup
public void downloadGenomeInfo() throws IOException, CellBaseException {
String genomeInfoFilename = "genome_info.json";

if (Files.exists(sequenceFolder.resolve(genomeInfoFilename))) {
logger.info(DATA_ALREADY_DOWNLOADED, genomeInfoFilename, getDataName(GENOME_INFO_DATA));
// Already downloaded
if (isAlreadyDownloaded(sequenceFolder.resolve(genomeInfoFilename), getDataName(GENOME_INFO_DATA))) {
return;
}

Expand Down Expand Up @@ -105,5 +105,4 @@ public void downloadGenomeInfo() throws IOException, CellBaseException {

logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENOME_INFO_DATA));
}

}

0 comments on commit d10931d

Please sign in to comment.