Skip to content

Commit

Permalink
lib: improve gene downloader, removing DISGENET, fixing sonar issues…
Browse files Browse the repository at this point in the history
…, #TASK-5575, #TASK-5564
  • Loading branch information
jtarraga committed Jul 24, 2024
1 parent e48d27d commit 642935a
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 43 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
####################################################################
##docker run -it --mount type=bind,source=/tmp,target=/tmp opencb/cellbase-builder:6.2.0-SNAPSHOT /opt/cellbase/scripts/ensembl-scripts/gene_extra_info.pl -s "Mus musculus" -o /tmp

# USAGE: ./gene_extra_info.pl --species "Homo sapiens" --outdir ../../appl_db/ird_v1/hsa ...
# USAGE: ./gene_extra_info.pl --species "Homo sapiens" --assembly "GRCh38" --outdir ../../appl_db/ird_v1/hsa ...

## Parsing command line
GetOptions ('species=s' => \$species, 'assembly=s' => \$assembly, 'outdir=s' => \$outdir, 'phylo=s' => \$phylo,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import static org.opencb.cellbase.lib.EtlCommons.*;

Expand Down Expand Up @@ -75,26 +78,33 @@ public List<DownloadFile> download() throws IOException, InterruptedException, C
downloadFiles.add(downloadDrugData(geneDownloadPath));
downloadFiles.add(downloadGeneUniprotXref(geneDownloadPath));
downloadFiles.add(downloadGeneExpressionAtlas(geneDownloadPath));
downloadFiles.add(downloadGeneDiseaseAnnotation(geneDownloadPath));
downloadFiles.add(downloadGnomadConstraints(geneDownloadPath));
downloadFiles.add(downloadGO(geneDownloadPath));
logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_ANNOTATION_DATA));

// Save data sources manually downloaded
if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) {
// HPO
saveDataSource(HPO_DISEASE_DATA, configuration.getDownload().getHpo().getVersion(), getTimeStamp(),
Collections.singletonList(getManualUrl(configuration.getDownload().getHpo(), HPO_FILE_ID)),
geneDownloadPath.resolve(getDataVersionFilename(HPO_DISEASE_DATA)));
logger.warn("{} must be downloaded manually; the version file {} was created at {}", getDataName(HPO_DISEASE_DATA),
getDataVersionFilename(HPO_DISEASE_DATA), geneDownloadPath);
if (Files.exists(geneDownloadPath.resolve(getDataVersionFilename(HPO_DISEASE_DATA)))) {
logger.warn("The version file {} already exists", getDataVersionFilename(HPO_DISEASE_DATA));
} else {
saveDataSource(HPO_DISEASE_DATA, configuration.getDownload().getHpo().getVersion(), getTimeStamp(),
Collections.singletonList(getManualUrl(configuration.getDownload().getHpo(), HPO_FILE_ID)),
geneDownloadPath.resolve(getDataVersionFilename(HPO_DISEASE_DATA)));
logger.warn("{} must be downloaded manually; the version file {} was created at {}", getDataName(HPO_DISEASE_DATA),
getDataVersionFilename(HPO_DISEASE_DATA), geneDownloadPath);
}

// Cancer gene census
saveDataSource(CANCER_GENE_CENSUS_DATA, configuration.getDownload().getCancerGeneCensus().getVersion(), getTimeStamp(),
Collections.singletonList(getManualUrl(configuration.getDownload().getCancerGeneCensus(), CANCER_GENE_CENSUS_FILE_ID)),
geneDownloadPath.resolve(getDataVersionFilename(CANCER_GENE_CENSUS_DATA)));
logger.warn("{} must be downloaded manually; the version file {} was created at {}", getDataName(CANCER_GENE_CENSUS_DATA),
getDataVersionFilename(CANCER_GENE_CENSUS_DATA), geneDownloadPath);
if (Files.exists(geneDownloadPath.resolve(getDataVersionFilename(CANCER_GENE_CENSUS_DATA)))) {
logger.warn("The version file {} already exists", getDataVersionFilename(CANCER_GENE_CENSUS_DATA));
} else {
saveDataSource(CANCER_GENE_CENSUS_DATA, configuration.getDownload().getCancerGeneCensus().getVersion(), getTimeStamp(),
Collections.singletonList(getManualUrl(configuration.getDownload().getCancerGeneCensus(),
CANCER_GENE_CENSUS_FILE_ID)), geneDownloadPath.resolve(getDataVersionFilename(CANCER_GENE_CENSUS_DATA)));
logger.warn("{} must be downloaded manually; the version file {} was created at {}", getDataName(CANCER_GENE_CENSUS_DATA),
getDataVersionFilename(CANCER_GENE_CENSUS_DATA), geneDownloadPath);
}
}

logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_DATA));
Expand All @@ -106,6 +116,11 @@ private List<DownloadFile> downloadEnsemblData(Path ensemblDownloadPath) throws

// Check if the species is supported
if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), GENE_DATA)) {
// Already downloaded ?
if (isAlreadyDownloaded(ensemblDownloadPath.resolve(getDataVersionFilename(ENSEMBL_DATA)), getDataName(ENSEMBL_DATA))) {
return downloadFiles;
}

logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataName(ENSEMBL_DATA), getDataCategory(ENSEMBL_DATA));
DownloadProperties.EnsemblProperties ensemblConfig = configuration.getDownload().getEnsembl();

Expand Down Expand Up @@ -135,7 +150,8 @@ private List<DownloadFile> downloadRefSeq(Path refSeqDownloadPath) throws IOExce
if (SpeciesUtils.hasData(configuration, speciesConfiguration.getScientificName(), GENE_DATA)) {
// GTF, DNA, RNA, Protein
String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName());
if (configuration.getDownload().getGenomicSuperDups().getFiles().containsKey(prefixId + REFSEQ_GENOMIC_GTF_FILE_ID)) {
if (configuration.getDownload().getGenomicSuperDups().getFiles().containsKey(prefixId + REFSEQ_GENOMIC_GTF_FILE_ID)
&& !isAlreadyDownloaded(refSeqDownloadPath.resolve(getDataVersionFilename(REFSEQ_DATA)), getDataName(REFSEQ_DATA))) {
logger.info(CATEGORY_DOWNLOADING_LOG_MESSAGE, getDataName(REFSEQ_DATA), getDataCategory(REFSEQ_DATA));

DownloadProperties.URLProperties refSeqConfig = configuration.getDownload().getRefSeq();
Expand All @@ -155,6 +171,14 @@ private List<DownloadFile> downloadRefSeq(Path refSeqDownloadPath) throws IOExce
}

public void downloadEnsemblCanonical(Path geneDownloadPath) throws IOException, CellBaseException {
String ensemblCanonicalScript = "ensembl_canonical.pl";
String ensemblCanonicalFilename = "ensembl_canonical.txt";

if (Files.exists(geneDownloadPath.resolve(ensemblCanonicalFilename))) {
logger.warn("File {} already exists, skipping running the Perl script {}", ensemblCanonicalFilename, ensemblCanonicalScript);
return;
}

logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(ENSEMBL_CANONICAL_DATA));

String dockerImage = "opencb/cellbase-builder:" + GitRepositoryState.get().getBuildVersion();
Expand All @@ -165,21 +189,32 @@ public void downloadEnsemblCanonical(Path geneDownloadPath) throws IOException,
geneDownloadPath.toAbsolutePath().toString(), "/tmp");

// Params
String params = "/opt/cellbase/scripts/ensembl-scripts/ensembl_canonical.pl"
String params = "/opt/cellbase/scripts/ensembl-scripts/" + ensemblCanonicalScript
+ " --species \"" + speciesConfiguration.getId() + "\""
+ " --assembly \"" + assemblyConfiguration.getName() + "\""
+ " --outdir \"" + outputBinding.getValue() + "\"";

// Execute perl script in docker
DockerUtils.run(dockerImage, null, outputBinding, params, null);
} catch (Exception e) {
throw new CellBaseException("Error executing Perl script from Docker " + dockerImage, e);
logger.error("{}", e.getStackTrace());
// throw new CellBaseException("Error executing Perl script from Docker " + dockerImage, e);
}

logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(ENSEMBL_CANONICAL_DATA));
}

public void downloadGeneExtraInfo(Path geneDownloadPath) throws IOException, CellBaseException {
String geneExtraInfoScript = "gene_extra_info.pl";
String descriptionFilename = "description.txt";
String xrefsFilename = "xrefs.txt";

if (Files.exists(geneDownloadPath.resolve(descriptionFilename)) && Files.exists(geneDownloadPath.resolve(xrefsFilename))) {
logger.warn("Files {} and {} already exist, skipping running the Perl script {}", descriptionFilename, xrefsFilename,
geneExtraInfoScript);
return;
}

logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_EXTRA_INFO_DATA));

String dockerImage = "opencb/cellbase-builder:" + GitRepositoryState.get().getBuildVersion();
Expand All @@ -190,14 +225,16 @@ public void downloadGeneExtraInfo(Path geneDownloadPath) throws IOException, Cel
geneDownloadPath.toAbsolutePath().toString(), "/tmp");

// Params
String params = "/opt/cellbase/scripts/ensembl-scripts/gene_extra_info.pl"
+ " --species \"" + speciesConfiguration.getId() + "\""
String params = "/opt/cellbase/scripts/ensembl-scripts/" + geneExtraInfoScript
+ " --species \"" + speciesConfiguration.getScientificName() + "\""
+ " --assembly \"" + assemblyConfiguration.getName() + "\""
+ " --outdir \"" + outputBinding.getValue() + "\"";

// Execute perl script in docker
DockerUtils.run(dockerImage, null, outputBinding, params, null);
} catch (Exception e) {
throw new CellBaseException("Error executing Perl script from Docker " + dockerImage, e);
logger.error("{}", e.getStackTrace());
// throw new CellBaseException("Error executing Perl script from Docker " + dockerImage, e);
}

logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_EXTRA_INFO_DATA));
Expand All @@ -207,7 +244,9 @@ private DownloadFile downloadMane(Path geneDownloadPath) throws IOException, Int
DownloadFile downloadFile = null;

// Check if the species is supported
if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) {
if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)
&& !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(MANE_SELECT_DATA)),
getDataName(MANE_SELECT_DATA))) {
logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(MANE_SELECT_DATA));

downloadFile = downloadAndSaveDataSource(configuration.getDownload().getManeSelect(), MANE_SELECT_FILE_ID,
Expand Down Expand Up @@ -236,7 +275,8 @@ private DownloadFile downloadHgnc(Path geneDownloadPath) throws IOException, Int
DownloadFile downloadFile = null;

// Check if the species is supported
if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) {
if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)
&& !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(HGNC_DATA)), getDataName(HGNC_DATA))) {
logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(HGNC_DATA));

downloadFile = downloadAndSaveDataSource(configuration.getDownload().getHgnc(), HGNC_FILE_ID, HGNC_DATA, geneDownloadPath);
Expand All @@ -250,7 +290,9 @@ private DownloadFile downloadCancerHotspot(Path geneDownloadPath) throws IOExcep
DownloadFile downloadFile = null;

// Check if the species is supported
if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) {
if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)
&& !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(CANCER_HOTSPOT_DATA)),
getDataName(CANCER_HOTSPOT_DATA))) {
logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(CANCER_HOTSPOT_DATA));

downloadFile = downloadAndSaveDataSource(configuration.getDownload().getCancerHotspot(), CANCER_HOTSPOT_FILE_ID,
Expand All @@ -265,7 +307,8 @@ private DownloadFile downloadDrugData(Path geneDownloadPath) throws IOException,
DownloadFile downloadFile = null;

// Check if the species is supported
if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) {
if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)
&& !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(DGIDB_DATA)), getDataName(DGIDB_DATA))) {
logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(DGIDB_DATA));

downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDgidb(), DGIDB_FILE_ID, DGIDB_DATA, geneDownloadPath);
Expand All @@ -280,7 +323,9 @@ private DownloadFile downloadGeneUniprotXref(Path geneDownloadPath) throws IOExc

// Check if the species is supported
String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName());
if (configuration.getDownload().getGeneUniprotXref().getFiles().containsKey(prefixId + UNIPROT_XREF_FILE_ID)) {
if (configuration.getDownload().getGeneUniprotXref().getFiles().containsKey(prefixId + UNIPROT_XREF_FILE_ID)
&& !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(UNIPROT_XREF_DATA)),
getDataName(UNIPROT_XREF_DATA))) {
logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(UNIPROT_XREF_DATA));

downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGeneUniprotXref(),
Expand All @@ -295,7 +340,9 @@ private DownloadFile downloadGeneExpressionAtlas(Path geneDownloadPath) throws I
DownloadFile downloadFile = null;

// Check if the species is supported
if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) {
if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)
&& !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(GENE_EXPRESSION_ATLAS_DATA)),
getDataName(GENE_EXPRESSION_ATLAS_DATA))) {
logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_EXPRESSION_ATLAS_DATA));

downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGeneExpressionAtlas(),
Expand All @@ -306,27 +353,13 @@ private DownloadFile downloadGeneExpressionAtlas(Path geneDownloadPath) throws I
return downloadFile;
}

/**
 * Downloads the gene-disease annotation (DisGeNET) data source.
 *
 * <p>The download is attempted only when the configured species' scientific name equals
 * {@code HOMO_SAPIENS_NAME}; in that case the work is delegated to {@code downloadAndSaveDataSource}
 * using the DisGeNET download configuration, {@code DISGENET_FILE_ID} and {@code DISGENET_DATA}.
 * For any other species nothing is downloaded.
 *
 * @param geneDownloadPath path forwarded to {@code downloadAndSaveDataSource}
 *                         (presumably the gene download directory; confirm against the caller)
 * @return the {@code DownloadFile} returned by {@code downloadAndSaveDataSource}, or {@code null}
 *         when the species is not Homo sapiens
 * @throws IOException declared by this method; presumably propagated from the download helper
 * @throws InterruptedException declared by this method; presumably propagated from the download helper
 * @throws CellBaseException declared by this method; presumably propagated from the download helper
 */
private DownloadFile downloadGeneDiseaseAnnotation(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException {
DownloadFile downloadFile = null;

// Check if the species is supported
if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) {
logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_DISEASE_ANNOTATION_DATA));

// DisGeNet
downloadFile = downloadAndSaveDataSource(configuration.getDownload().getDisgenet(),
DISGENET_FILE_ID, DISGENET_DATA, geneDownloadPath);

logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_DISEASE_ANNOTATION_DATA));
}
return downloadFile;
}

private DownloadFile downloadGnomadConstraints(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException {
DownloadFile downloadFile = null;

// Check if the species is supported
if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) {
if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)
&& !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(GNOMAD_CONSTRAINTS_DATA)),
getDataName(GNOMAD_CONSTRAINTS_DATA))) {
logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GNOMAD_CONSTRAINTS_DATA));

downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGnomadConstraints(),
Expand All @@ -342,7 +375,9 @@ private DownloadFile downloadGO(Path geneDownloadPath) throws IOException, Inter

// Check if the species is supported
String prefixId = getConfigurationFileIdPrefix(speciesConfiguration.getScientificName());
if (configuration.getDownload().getGoAnnotation().getFiles().containsKey(prefixId + GO_ANNOTATION_FILE_ID)) {
if (configuration.getDownload().getGoAnnotation().getFiles().containsKey(prefixId + GO_ANNOTATION_FILE_ID)
&& !isAlreadyDownloaded(geneDownloadPath.resolve(getDataVersionFilename(GO_ANNOTATION_DATA)),
getDataName(GO_ANNOTATION_DATA))) {
logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GO_ANNOTATION_DATA));

downloadFile = downloadAndSaveDataSource(configuration.getDownload().getGoAnnotation(),
Expand Down

0 comments on commit 642935a

Please sign in to comment.