From 1d171d528d5b3bc27fba8281839456bdbdcb3cff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Jul 2024 09:09:58 +0200 Subject: [PATCH] lib: update GeneDownloadManager to call the script gene_extra_info.pl, #TASK-5575, #TASK-5564 --- .../ensembl-scripts/ensembl_canonical.pl | 17 +++++++- .../scripts/ensembl-scripts/genome_info.pl | 2 +- .../org/opencb/cellbase/lib/EtlCommons.java | 4 ++ .../lib/download/GeneDownloadManager.java | 40 +++++++++++++++---- 4 files changed, 53 insertions(+), 10 deletions(-) diff --git a/cellbase-app/app/scripts/ensembl-scripts/ensembl_canonical.pl b/cellbase-app/app/scripts/ensembl-scripts/ensembl_canonical.pl index 9be361a55..bed648e2d 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/ensembl_canonical.pl +++ b/cellbase-app/app/scripts/ensembl-scripts/ensembl_canonical.pl @@ -36,7 +36,12 @@ $query->formatter("TSV"); -open (ENSEMBL_CANONICAL, ">$outdir/ensembl_canonical.txt") || die "Cannot open ensembl_canonical.txt file"; +# Open the file for writing +open(my $fh, '>', "$outdir/ensembl_canonical.txt") or die "Cannot open ensembl_canonical.txt file: $!"; + +# Save the original stdout +my $original_stdout = *STDOUT; +open(STDOUT, '>&', $fh) or die "Can't redirect STDOUT: $!"; my $query_runner = BioMart::QueryRunner->new(); @@ -44,5 +49,13 @@ $query_runner->uniqueRowsOnly(1); $query_runner->execute($query); #$query_runner->printHeader(); -print ENSEMBL_CANONICAL $query_runner->printResults(); +#print ENSEMBL_CANONICAL $query_runner->printResults(); +# Call printResults which prints to STDOUT (now redirected to the file) +$query_runner->printResults(); #$query_runner->printFooter(); + +# Restore the original stdout +open(STDOUT, '>&', $original_stdout) or die "Can't restore STDOUT: $!"; + +# Close the filehandle +close($fh) or die "Failed to close file: $!"; \ No newline at end of file diff --git a/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl b/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl index 9cbc01a4c..8ecf3d7c8 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl +++ b/cellbase-app/app/scripts/ensembl-scripts/genome_info.pl @@ -143,7 +143,7 @@ sub print_parameters { print "Parameters: "; - print "species: $species, outfile: $outfile, "; + print "species: $species, assembly: $assembly, outfile: $outfile, "; print "ensembl-registry: $ENSEMBL_REGISTRY, "; print "ensembl-host: $ENSEMBL_HOST, ensembl-port: $ENSEMBL_PORT, "; print "ensembl-user: $ENSEMBL_USER, verbose: $verbose, help: $help"; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 35cbba0e9..dcec1f6de 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -110,6 +110,8 @@ public final class EtlCommons { public static final String REFSEQ_RNA_FNA_FILE_ID = "RNA_FNA"; // Gene annotation + public static final String ENSEMBL_CANONICAL_DATA = "ensembl_canonical"; + public static final String GENE_EXTRA_INFO_DATA = "gene_extra_info"; // - MANE Select public static final String MANE_SELECT_DATA = "MANE Select"; // Must match the configuration file @@ -354,6 +356,8 @@ public final class EtlCommons { dataNamesMap.put(GENOME_DATA, "Genome"); dataNamesMap.put(GENOME_INFO_DATA, "Genome Info"); dataNamesMap.put(GENE_DATA, "Gene"); + dataNamesMap.put(ENSEMBL_CANONICAL_DATA, "Ensembl canonical"); + dataNamesMap.put(GENE_EXTRA_INFO_DATA, "Gene extra info"); dataNamesMap.put(GENE_ANNOTATION_DATA, "Gene Annotation"); dataNamesMap.put(MANE_SELECT_DATA, "MANE Select"); dataNamesMap.put(LRG_DATA, "LRG"); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 3e2f104b8..356c637ca 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -58,7 +58,10 @@ public List download() throws IOException, InterruptedException, C downloadFiles.addAll(downloadEnsemblData(ensemblDownloadPath)); // Ensembl canonical - downloadEnsemblCanonical(); + downloadEnsemblCanonical(geneDownloadPath); + + // Gene extra info + downloadGeneExtraInfo(geneDownloadPath); // RefSeq downloadFiles.addAll(downloadRefSeq(refSeqDownloadPath)); @@ -151,20 +154,43 @@ private List downloadRefSeq(Path refSeqDownloadPath) throws IOExce return downloadFiles; } - public void downloadEnsemblCanonical() throws IOException, CellBaseException { - logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENOME_INFO_DATA)); - Path sequenceFolder = downloadFolder.resolve(GENOME_DATA); - Files.createDirectories(sequenceFolder); + public void downloadEnsemblCanonical(Path geneDownloadPath) throws IOException, CellBaseException { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(ENSEMBL_CANONICAL_DATA)); String dockerImage = "opencb/cellbase-builder:" + GitRepositoryState.get().getBuildVersion(); try { // Build command line to run Perl script via docker image // Output binding AbstractMap.SimpleEntry outputBinding = new AbstractMap.SimpleEntry<>( - sequenceFolder.toAbsolutePath().toString(), "/tmp"); + geneDownloadPath.toAbsolutePath().toString(), "/tmp"); // Params String params = "/opt/cellbase/scripts/ensembl-scripts/ensembl_canonical.pl" + + " --species \"" + speciesConfiguration.getId() + "\"" + + " --assembly \"" + assemblyConfiguration.getName() + "\"" + + " --outdir \"" + outputBinding.getValue() + "\""; + + // Execute perl script in docker + DockerUtils.run(dockerImage, null, outputBinding, params, null); + } catch (Exception e) { + throw new CellBaseException("Error executing Perl script from Docker " + dockerImage, e); + } + + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(ENSEMBL_CANONICAL_DATA)); + } + + public void downloadGeneExtraInfo(Path geneDownloadPath) throws IOException, CellBaseException { + logger.info(DOWNLOADING_LOG_MESSAGE, getDataName(GENE_EXTRA_INFO_DATA)); + + String dockerImage = "opencb/cellbase-builder:" + GitRepositoryState.get().getBuildVersion(); + try { + // Build command line to run Perl script via docker image + // Output binding + AbstractMap.SimpleEntry outputBinding = new AbstractMap.SimpleEntry<>( + geneDownloadPath.toAbsolutePath().toString(), "/tmp"); + + // Params + String params = "/opt/cellbase/scripts/ensembl-scripts/gene_extra_info.pl" + " --species \"" + speciesConfiguration.getId() + "\"" + " --outdir \"" + outputBinding.getValue() + "\""; @@ -174,7 +200,7 @@ public void downloadEnsemblCanonical() throws IOException, CellBaseException { throw new CellBaseException("Error executing Perl script from Docker " + dockerImage, e); } - logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENOME_INFO_DATA)); + logger.info(DOWNLOADING_DONE_LOG_MESSAGE, getDataName(GENE_EXTRA_INFO_DATA)); } private DownloadFile downloadMane(Path geneDownloadPath) throws IOException, InterruptedException, CellBaseException {