Skip to content

Commit

Permalink
lib: improve gene (Ensembl/RefSeq) builder by supporting multi-species (e.g., mmusculus), #TASK-6426, #TASK-5564
Browse files Browse the repository at this point in the history
  • Loading branch information
jtarraga committed Jul 31, 2024
1 parent 7f77dec commit 0eb898e
Show file tree
Hide file tree
Showing 9 changed files with 414 additions and 252 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,10 @@
import static org.opencb.cellbase.lib.EtlCommons.*;
import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_DONE_LOG_MESSAGE;
import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_LOG_MESSAGE;
import static org.opencb.cellbase.lib.builders.EnsemblGeneBuilder.ENSEMBL_GENE_OUTPUT_FILENAME;
import static org.opencb.cellbase.lib.builders.GenomeSequenceFastaBuilder.GENOME_OUTPUT_FILENAME;
import static org.opencb.cellbase.lib.builders.ProteinBuilder.OUTPUT_PROTEIN_OUTPUT_FILENAME;
import static org.opencb.cellbase.lib.builders.RefSeqGeneBuilder.REFSEQ_GENE_OUTPUT_FILENAME;
import static org.opencb.cellbase.lib.builders.RegulatoryFeatureBuilder.*;
import static org.opencb.cellbase.lib.builders.RepeatsBuilder.REPEATS_OUTPUT_FILENAME;
import static org.opencb.cellbase.lib.download.GenomeDownloadManager.GENOME_INFO_FILENAME;
Expand All @@ -69,8 +71,6 @@ public class BuildCommandExecutor extends CommandExecutor {

private boolean flexibleGTFParsing;

private static final String DATA_ALREADY_BUILT = "{} data has already been built.";

public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildCommandOptions) {
super(buildCommandOptions.commonOptions.logLevel, buildCommandOptions.commonOptions.conf);

Expand Down Expand Up @@ -234,8 +234,49 @@ private AbstractBuilder buildGenomeSequence() throws CellBaseException {
}

/**
 * Prepares the builder for the gene data (Ensembl and RefSeq).
 * <p>
 * The method gathers the version files of every data source involved in the gene build, and skips the build when
 * the two gene output files, the downloaded version files and their copies in the gene build folder all exist.
 * Otherwise, the version files are copied into the gene build folder and a new {@link GeneBuilder} is returned.
 *
 * @return the gene builder, or {@code null} when the gene data has already been built
 * @throws CellBaseException if the version files cannot be checked or copied, or the builder cannot be created
 */
private AbstractBuilder buildGene() throws CellBaseException {
    logger.info(BUILDING_LOG_MESSAGE, getDataName(GENE_DATA));

    // Sanity check
    Path geneDownloadPath = downloadFolder.resolve(GENE_DATA);
    Path geneBuildPath = buildFolder.resolve(GENE_DATA);

    // Ensembl and RefSeq keep their version files in their own subfolders of the gene download folder
    List<Path> versionFiles = new ArrayList<>(Arrays.asList(
            geneDownloadPath.resolve(ENSEMBL_DATA).resolve(getDataVersionFilename(ENSEMBL_DATA)),
            geneDownloadPath.resolve(REFSEQ_DATA).resolve(getDataVersionFilename(REFSEQ_DATA))));
    for (String data : GeneBuilder.getCommonDataSources(speciesConfiguration, configuration)) {
        Path versionFile;
        switch (data) {
            case MIRTARBASE_DATA:
            case MIRBASE_DATA:
                // Both miRNA sources are looked up in their own subfolder of the regulation download folder
                versionFile = downloadFolder.resolve(REGULATION_DATA).resolve(data).resolve(getDataVersionFilename(data));
                break;
            default:
                // NOTE(review): the remaining gene data sources are assumed to keep their version files directly
                // in the gene download folder; the previous code looked for them in the GERP (conservation)
                // folder, which looks like a copy/paste slip - confirm against the gene download manager
                versionFile = geneDownloadPath.resolve(getDataVersionFilename(data));
                break;
        }
        versionFiles.add(versionFile);
    }

    // The build is considered done only if the outputs, the copied version files and the downloaded version
    // files are all present
    List<Path> filesToCheck = new ArrayList<>(Arrays.asList(geneBuildPath.resolve(ENSEMBL_GENE_OUTPUT_FILENAME),
            geneBuildPath.resolve(REFSEQ_GENE_OUTPUT_FILENAME)));
    for (Path versionFile : versionFiles) {
        filesToCheck.add(geneBuildPath.resolve(versionFile.getFileName()));
    }
    filesToCheck.addAll(versionFiles);

    if (AbstractBuilder.existFiles(filesToCheck)) {
        logger.warn(DATA_ALREADY_BUILT, getDataName(ENSEMBL_DATA) + " and " + getDataName(REFSEQ_DATA) + " genes");
        return null;
    }

    copyVersionFiles(versionFiles, geneBuildPath);

    return new GeneBuilder(geneDownloadPath, geneBuildPath, speciesConfiguration, flexibleGTFParsing, configuration);
}

private AbstractBuilder buildRepeats() throws CellBaseException {
Expand Down Expand Up @@ -403,25 +444,8 @@ private Path getFastaReferenceGenome() throws CellBaseException {
SpeciesUtils.getSpeciesShortname(speciesConfiguration), assembly.getName(), null);
String fastaFilename = Paths.get(ensemblUrl).getFileName().toString();
Path gzFastaPath = downloadFolder.resolve(GENOME_DATA).resolve(fastaFilename);
Path fastaPath = downloadFolder.resolve(GENOME_DATA).resolve(fastaFilename.replace(GZ_EXTENSION, ""));
if (!fastaPath.toFile().exists()) {
// Gunzip
logger.info("Gunzip file: {}", gzFastaPath);
try {
List<String> params = Arrays.asList("--keep", gzFastaPath.toString());
EtlCommons.runCommandLineProcess(null, "gunzip", params, null);
} catch (IOException e) {
throw new CellBaseException("Error executing gunzip in FASTA file " + gzFastaPath, e);
} catch (InterruptedException e) {
// Restore interrupted state...
Thread.currentThread().interrupt();
throw new CellBaseException("Error executing gunzip in FASTA file " + gzFastaPath, e);
}
}
if (!fastaPath.toFile().exists()) {
throw new CellBaseException("FASTA file " + fastaPath + " does not exist after executing gunzip");
}
return fastaPath;

return EtlCommons.getFastaPath(gzFastaPath);
}

private AbstractBuilder buildSplice() throws IOException, CellBaseException {
Expand Down Expand Up @@ -484,7 +508,11 @@ private void checkVersionFiles(List<Path> versionPaths) throws CellBaseException
private void copyVersionFiles(List<Path> versionPaths, Path targetPath) throws CellBaseException {
// Check version files before copying them
checkVersionFiles(versionPaths);
if (!targetPath.toFile().exists()) {
copyFiles(versionPaths, targetPath);
}

private void copyFiles(List<Path> versionPaths, Path targetPath) throws CellBaseException {
if (!Files.exists(targetPath)) {
try {
Files.createDirectories(targetPath);
} catch (IOException e) {
Expand Down
34 changes: 30 additions & 4 deletions cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java
Original file line number Diff line number Diff line change
Expand Up @@ -93,13 +93,11 @@ public final class EtlCommons {

// Gene
public static final String GENE_DATA = "gene";
public static final String ENSEMBL_GENE_BASENAME = "ensemblGene";
public static final String GENE_ANNOTATION_DATA = "gene_annotation";
public static final String GENE_DISEASE_ANNOTATION_DATA = "gene_disease_annotation";

// RefSeq
public static final String REFSEQ_DATA = "refseq";
public static final String REFSEQ_GENE_BASENAME = "refSeqGene";
// Must match the configuration file
public static final String REFSEQ_GENOMIC_GTF_FILE_ID = "GENOMIC_GTF";
public static final String REFSEQ_GENOMIC_FNA_FILE_ID = "GENOMIC_FNA";
Expand Down Expand Up @@ -508,7 +506,7 @@ public static boolean runCommandLineProcess(File workingDirectory, String binPat

ProcessBuilder builder = getProcessBuilder(workingDirectory, binPath, args, logFilePath);

LOGGER.debug("Executing command: {}", StringUtils.join(builder.command(), " "));
LOGGER.info("Executing command: {}", StringUtils.join(builder.command(), " "));
Process process = builder.start();
process.waitFor();

Expand Down Expand Up @@ -541,6 +539,34 @@ private static ProcessBuilder getProcessBuilder(File workingDirectory, String bi
return builder;
}

/**
 * Returns the path of the uncompressed FASTA file that corresponds to the given gzipped FASTA file, running
 * {@code gunzip --keep} first when the uncompressed file is not already present in the same folder.
 *
 * @param gzFastaPath path to the gzipped FASTA file; it must exist
 * @return path to the uncompressed FASTA file, a sibling of {@code gzFastaPath}
 * @throws CellBaseException if the gzipped file does not exist, gunzip cannot be executed or is interrupted, or
 *                           the uncompressed FASTA file is still missing afterwards
 */
public static Path getFastaPath(Path gzFastaPath) throws CellBaseException {
    // Sanity check
    if (!Files.exists(gzFastaPath)) {
        throw new CellBaseException("Gzipped FASTA file " + gzFastaPath + " does not exist");
    }

    // Strip only the trailing extension, which is what gunzip removes; an occurrence of the extension in the
    // middle of the file name must be preserved
    String gzFilename = gzFastaPath.getFileName().toString();
    String fastaFilename = gzFilename.endsWith(GZ_EXTENSION)
            ? gzFilename.substring(0, gzFilename.length() - GZ_EXTENSION.length())
            : gzFilename;
    // resolveSibling also works for a relative path without parent, where getParent() would return null
    Path fastaPath = gzFastaPath.resolveSibling(fastaFilename);

    // Check FASTA and unzip if necessary
    if (!Files.exists(fastaPath)) {
        // Gunzip
        LOGGER.info("Gunzip file {}", gzFastaPath);
        try {
            // --keep: leave the gzipped file in place
            List<String> params = Arrays.asList("--keep", gzFastaPath.toString());
            EtlCommons.runCommandLineProcess(null, "gunzip", params, null);
        } catch (IOException e) {
            throw new CellBaseException("Error executing gunzip in FASTA file " + gzFastaPath, e);
        } catch (InterruptedException e) {
            // Restore interrupted state...
            Thread.currentThread().interrupt();
            throw new CellBaseException("Error executing gunzip in FASTA file " + gzFastaPath, e);
        }
    }
    if (!Files.exists(fastaPath)) {
        throw new CellBaseException("FASTA file " + fastaPath + " does not exist after executing gunzip");
    }
    return fastaPath;
}

public static boolean isMissing(String string) {
return !((string != null) && !string.isEmpty()
&& !string.replace(" ", "")
Expand Down Expand Up @@ -736,7 +762,7 @@ private static List<String> getRepeatsDataList(CellBaseConfiguration configurati
return dataList;
}

private static boolean isDataSupported(DownloadProperties.URLProperties props, String prefix) {
public static boolean isDataSupported(DownloadProperties.URLProperties props, String prefix) {
for (String key : props.getFiles().keySet()) {
if (key.startsWith(prefix)) {
return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ public abstract class AbstractBuilder {
public static final String PARSING_LOG_MESSAGE = "Parsing {} ...";
public static final String PARSING_DONE_LOG_MESSAGE = "Parsing done.";

public static final String SKIPPING_INDEX_DATA_LOG_MESSAGE = "Skipping index for data '{}': it is not supported for species '{}'.";
public static final String DATA_ALREADY_BUILT = "'{}' data has already been built.";

protected AbstractBuilder(CellBaseSerializer serializer) {
logger = LoggerFactory.getLogger(this.getClass());

Expand All @@ -80,7 +83,7 @@ public void disconnect() {
}
}

protected String getConfigurationFileIdPrefix(String scientificSpecies) {
protected static String getConfigurationFileIdPrefix(String scientificSpecies) {
String prefix = "";
if (StringUtils.isNotEmpty(scientificSpecies) && !scientificSpecies.equals("Homo sapiens") && scientificSpecies.contains(" ")) {
char c = scientificSpecies.charAt(0);
Expand All @@ -94,6 +97,8 @@ protected File checkFile(DownloadProperties.URLProperties props, String fileId,
String filename = Paths.get(props.getFiles().get(fileId)).getFileName().toString();
if (filename.contains(MANUAL_PREFIX)) {
filename = filename.replace(MANUAL_PREFIX, "");
} else if (filename.contains(SCRIPT_PREFIX)) {
filename = filename.split("@")[1];
}
Path filePath = targetPath.resolve(filename);
if (!Files.exists(filePath)) {
Expand Down
Loading

0 comments on commit 0eb898e

Please sign in to comment.