Skip to content

Commit

Permalink
lib: improve variation downloader by checking if data is already downloaded, #TASK-5575, #TASK-5564
Browse files Browse the repository at this point in the history
  • Loading branch information
jtarraga committed Jul 24, 2024
1 parent 6fc7129 commit 8ed0e0d
Showing 1 changed file with 29 additions and 23 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import static org.opencb.cellbase.lib.EtlCommons.*;
Expand All @@ -45,34 +44,41 @@ public List<DownloadFile> downloadVariation() throws IOException, InterruptedExc
List<DownloadFile> downloadFiles = new ArrayList<>();

// Check if species is supported
if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), VARIATION_DATA)) {
logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(VARIATION_DATA));

// Human variation data is not downloaded from Ensembl because it is already included in CellBase.
if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), VARIATION_DATA)
&& !speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) {
Path variationFolder = downloadFolder.resolve(VARIATION_DATA);
Files.createDirectories(variationFolder);

// We do not need to download human variation data from Ensembl. It is already included in the CellBase.
if (!speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) {
logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, speciesShortName + ".vcf.gz");
String fileName = variationFolder.resolve(speciesShortName + ".gtf.gz").toString();
String url = ensemblHostUrl + "/" + ensemblRelease + "/variation/vcf/" + speciesShortName + "/"
+ speciesShortName + ".vcf.gz";
downloadFiles.add(downloadFile(url, fileName));
logger.info(OK_LOG_MESSAGE);
saveDataSource(VARIATION_DATA, ensemblVersion, getTimeStamp(), Collections.singletonList(url),
variationFolder.resolve(getDataVersionFilename(VARIATION_DATA)));

fileName = variationFolder.resolve(speciesShortName + "_structural_variations.gtf.gz").toString();
url = ensemblHostUrl + "/" + ensemblRelease + "/variation/vcf/" + speciesShortName + "/"
+ speciesShortName + "_structural_variations.vcf.gz";
downloadFiles.add(downloadFile(url, fileName));
logger.info(OK_LOG_MESSAGE);
saveDataSource(VARIATION_DATA, ensemblVersion, getTimeStamp(), Collections.singletonList(url),
variationFolder.resolve(getDataVersionFilename(VARIATION_DATA)));
if (isAlreadyDownloaded(downloadFolder.resolve(getDataVersionFilename(VARIATION_DATA)), getDataName(VARIATION_DATA))) {
return new ArrayList<>();
}

logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(VARIATION_DATA));

List<String> urls = new ArrayList<>();

String fileName = variationFolder.resolve(speciesShortName + ".gtf.gz").toString();
String url = ensemblHostUrl + "/" + ensemblRelease + "/variation/vcf/" + speciesShortName + "/"
+ speciesShortName + ".vcf.gz";
logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, fileName);
downloadFiles.add(downloadFile(url, fileName));
urls.add(url);
logger.info(OK_LOG_MESSAGE);

fileName = variationFolder.resolve(speciesShortName + "_structural_variations.gtf.gz").toString();
url = ensemblHostUrl + "/" + ensemblRelease + "/variation/vcf/" + speciesShortName + "/"
+ speciesShortName + "_structural_variations.vcf.gz";
logger.info(DOWNLOADING_FROM_TO_LOG_MESSAGE, url, fileName);
downloadFiles.add(downloadFile(url, fileName));
urls.add(url);
logger.info(OK_LOG_MESSAGE);

saveDataSource(VARIATION_DATA, ensemblVersion, getTimeStamp(), urls, variationFolder.resolve(
getDataVersionFilename(VARIATION_DATA)));

logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(VARIATION_DATA));
}

return downloadFiles;
}
}

0 comments on commit 8ed0e0d

Please sign in to comment.