Skip to content

Commit

Permalink
lib: improve genome builder by checking files, and fixing sonar issu…
Browse files Browse the repository at this point in the history
…es, #TASK-5576, #TASK-5564
  • Loading branch information
jtarraga committed Jul 25, 2024
1 parent e17e51d commit 733cade
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 43 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@
import java.util.List;

import static org.opencb.cellbase.lib.EtlCommons.*;
import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_DONE_LOG_MESSAGE;
import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_LOG_MESSAGE;
import static org.opencb.cellbase.lib.builders.GenomeSequenceFastaBuilder.GENOME_OUTPUT_FILENAME;
import static org.opencb.cellbase.lib.download.GenomeDownloadManager.GENOME_INFO_FILENAME;


public class BuildCommandExecutor extends CommandExecutor {
Expand All @@ -61,10 +65,6 @@ public class BuildCommandExecutor extends CommandExecutor {

private boolean flexibleGTFParsing;

private static final List<String> VALID_SOURCES_TO_BUILD = Arrays.asList(GENOME_DATA, GENE_DATA, VARIATION_FUNCTIONAL_SCORE_DATA,
MISSENSE_VARIATION_SCORE_DATA, REGULATION_DATA, PROTEIN_DATA, CONSERVATION_DATA, CLINICAL_VARIANT_DATA, REPEATS_DATA,
ONTOLOGY_DATA, SPLICE_SCORE_DATA, PUBMED_DATA, PHARMACOGENOMICS_DATA);

public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildCommandOptions) {
super(buildCommandOptions.commonOptions.logLevel, buildCommandOptions.commonOptions.conf);

Expand Down Expand Up @@ -115,10 +115,8 @@ public void execute() throws CellBaseException {
throw new CellBaseException("Download folder not found '" + spShortName + "_" + spAssembly + "/download'");
}
buildFolder = outputDirectory.resolve(spFolder + "/generated_json");
if (!buildFolder.toFile().exists()) {
if (!Files.exists(buildFolder)) {
Files.createDirectories(buildFolder);
}
if (!Files.exists(buildFolder)) {
Files.createDirectories(buildFolder);
}

// Check data sources
Expand Down Expand Up @@ -170,9 +168,11 @@ public void execute() throws CellBaseException {
+ "Valid values are: " + StringUtils.join(speciesConfiguration.getData(), ",")
+ ". You can use data parameter 'all' to download everything");
}

parser.parse();
parser.disconnect();
if (parser != null) {
parser.parse();
parser.disconnect();
logger.info(BUILDING_DONE_LOG_MESSAGE);
}
}
} catch (InterruptedException e) {
// Restore interrupted state...
Expand All @@ -184,16 +184,47 @@ public void execute() throws CellBaseException {
}

private AbstractBuilder buildGenomeSequence() throws CellBaseException {
logger.info(BUILDING_LOG_MESSAGE, getDataName(GENOME_DATA));

Path genomeDownloadFolder = downloadFolder.resolve(GENOME_DATA);
Path genomeBuildFolder = buildFolder.resolve(GENOME_DATA);

if (Files.exists(genomeBuildFolder.resolve(GENOME_OUTPUT_FILENAME))
&& Files.exists(genomeBuildFolder.resolve(GENOME_INFO_FILENAME))
&& Files.exists(genomeBuildFolder.resolve(getDataVersionFilename(GENOME_DATA)))) {
logger.warn("{} data has been already built", getDataName(GENOME_DATA));
return null;
}

// Sanity check
Path genomeVersionPath = downloadFolder.resolve(GENOME_DATA).resolve(getDataVersionFilename(GENOME_DATA));
copyVersionFiles(Collections.singletonList(genomeVersionPath), buildFolder.resolve(GENOME_DATA));
if (!Files.exists(genomeDownloadFolder.resolve(GENOME_INFO_FILENAME))) {
throw new CellBaseException("Genome info file " + GENOME_INFO_FILENAME + " does not exist at " + genomeDownloadFolder);
}

// Get FASTA path
Path fastaPath = getFastaReferenceGenome();
// Copy files if necessary
if (!Files.exists(genomeBuildFolder.resolve(getDataVersionFilename(GENOME_DATA)))) {
Path genomeVersionPath = genomeDownloadFolder.resolve(getDataVersionFilename(GENOME_DATA));
copyVersionFiles(Collections.singletonList(genomeVersionPath), buildFolder.resolve(GENOME_DATA));
}

if (!Files.exists(genomeBuildFolder.resolve(GENOME_INFO_FILENAME))) {
try {
Files.copy(genomeDownloadFolder.resolve(GENOME_INFO_FILENAME), genomeBuildFolder.resolve(GENOME_INFO_FILENAME));
} catch (IOException e) {
throw new CellBaseException("Error copying file " + GENOME_INFO_FILENAME, e);
}
}

// Create serializer and return the genome builder
CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder.resolve(GENOME_DATA), GENOME_DATA);
return new GenomeSequenceFastaBuilder(fastaPath, serializer);
// Parse file
if (!Files.exists(genomeBuildFolder.resolve(GENOME_OUTPUT_FILENAME))) {
// Get FASTA path
Path fastaPath = getFastaReferenceGenome();

// Create serializer and return the genome builder
CellBaseSerializer serializer = new CellBaseJsonFileSerializer(genomeBuildFolder, GENOME_DATA);
return new GenomeSequenceFastaBuilder(fastaPath, serializer);
}
return null;
}

private AbstractBuilder buildGene() throws CellBaseException {
Expand Down Expand Up @@ -279,8 +310,8 @@ private AbstractBuilder buildConservation() throws CellBaseException {
Path conservationDownloadPath = downloadFolder.resolve(CONSERVATION_DATA);
Path conservationBuildPath = buildFolder.resolve(CONSERVATION_DATA);
copyVersionFiles(Arrays.asList(conservationDownloadPath.resolve(getDataVersionFilename(GERP_DATA)),
conservationDownloadPath.resolve(getDataVersionFilename(PHASTCONS_DATA)),
conservationDownloadPath.resolve(getDataVersionFilename(PHYLOP_DATA))), conservationBuildPath);
conservationDownloadPath.resolve(getDataVersionFilename(PHASTCONS_DATA)),
conservationDownloadPath.resolve(getDataVersionFilename(PHYLOP_DATA))), conservationBuildPath);

int conservationChunkSize = MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE;
CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(conservationBuildPath);
Expand Down Expand Up @@ -324,7 +355,8 @@ private Path getFastaReferenceGenome() throws CellBaseException {
// Gunzip
logger.info("Gunzip file: {}", fastaPath);
try {
EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(fastaPath.toString()), null);
List<String> params = Arrays.asList("--keep", fastaPath.toString());
EtlCommons.runCommandLineProcess(null, "gunzip", params, null);
} catch (IOException e) {
throw new CellBaseException("Error executing gunzip in FASTA file " + fastaPath, e);
} catch (InterruptedException e) {
Expand Down Expand Up @@ -387,7 +419,7 @@ private void checkVersionFiles(List<Path> versionPaths) throws CellBaseException
}
try {
DataSource dataSource = dataSourceReader.readValue(versionPath.toFile());
if (org.apache.commons.lang3.StringUtils.isEmpty(dataSource.getVersion())) {
if (StringUtils.isEmpty(dataSource.getVersion())) {
throw new CellBaseException("Version missing version in file " + versionPath + ": a version must be specified in the"
+ " file");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ public abstract class AbstractBuilder {
public static final String CHECKING_BEFORE_BUILDING_LOG_MESSAGE = "Checking files before building {} ...";
public static final String CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE = "Checking {} done!";

public static final String BUILDING_LOG_MESSAGE = "Building {} ...";
public static final String BUILDING_LOG_MESSAGE = "Building {} data ...";
public static final String BUILDING_DONE_LOG_MESSAGE = "Building done.";

public static final String CATEGORY_BUILDING_LOG_MESSAGE = "Building {}/{} ...";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,50 +16,54 @@

package org.opencb.cellbase.lib.builders;

import org.apache.commons.lang3.StringUtils;
import org.opencb.biodata.models.core.GenomeSequenceChunk;
import org.opencb.cellbase.core.exception.CellBaseException;
import org.opencb.cellbase.core.serializer.CellBaseSerializer;
import org.opencb.cellbase.lib.EtlCommons;
import org.opencb.commons.utils.FileUtils;

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.Path;

import static org.opencb.cellbase.lib.EtlCommons.*;

public class GenomeSequenceFastaBuilder extends AbstractBuilder {

private Path genomeReferenceFastaFile;

private static final int CHUNK_SIZE = 2000;
public static final String GENOME_OUTPUT_FILENAME = EtlCommons.GENOME_DATA + ".json.gz";

/**
 * Creates a builder for the given reference genome FASTA file.
 *
 * @param genomeReferenceFastaFile path kept in this builder; {@code parse()} opens it for reading
 * @param serializer               serializer handed to the {@code AbstractBuilder} constructor
 */
public GenomeSequenceFastaBuilder(Path genomeReferenceFastaFile, CellBaseSerializer serializer) {
super(serializer);
this.genomeReferenceFastaFile = genomeReferenceFastaFile;
}

@Override
public void parse() {
public void parse() throws CellBaseException {
logger.info(PARSING_LOG_MESSAGE, genomeReferenceFastaFile);

try {
try (BufferedReader br = FileUtils.newBufferedReader(genomeReferenceFastaFile)) {
String sequenceName = null;
String sequenceType = "";
String sequenceAssembly = null;
String line;
StringBuilder sequenceStringBuilder = new StringBuilder();

// Preparing input and output files
BufferedReader br;
br = FileUtils.newBufferedReader(genomeReferenceFastaFile);


while ((line = br.readLine()) != null) {

if (!line.startsWith(">")) {
sequenceStringBuilder.append(line);
} else {
// new chromosome, save data
if (sequenceStringBuilder.length() > 0) {
if (!sequenceName.contains("PATCH") && !sequenceName.contains("HSCHR") && !sequenceName.contains("contig")) {
System.out.println(sequenceName);
serializeGenomeSequence(sequenceName, sequenceType, sequenceAssembly, sequenceStringBuilder.toString());
}
if (sequenceStringBuilder.length() > 0 && StringUtils.isNotEmpty(sequenceName) && !sequenceName.contains("PATCH")
&& !sequenceName.contains("HSCHR") && !sequenceName.contains("contig")) {
serializeGenomeSequence(sequenceName, sequenceType, sequenceAssembly, sequenceStringBuilder.toString());
}

// initialize data structures
Expand All @@ -75,18 +79,17 @@ public void parse() {
}
}
// Last chromosome must be processed
if (!sequenceName.contains("PATCH") && !sequenceName.contains("HSCHR") && !sequenceName.contains("contig")) {
if (StringUtils.isNotEmpty(sequenceName) && !sequenceName.contains("PATCH") && !sequenceName.contains("HSCHR")
&& !sequenceName.contains("contig")) {
serializeGenomeSequence(sequenceName, sequenceType, sequenceAssembly, sequenceStringBuilder.toString());
}

br.close();
} catch (IOException e) {
e.printStackTrace();
}
logger.info(PARSING_DONE_LOG_MESSAGE);
}

private void serializeGenomeSequence(String chromosome, String sequenceType, String sequenceAssembly, String sequence)
throws IOException {
private void serializeGenomeSequence(String chromosome, String sequenceType, String sequenceAssembly, String sequence) {
int chunk = 0;
int start = 1;
int end = CHUNK_SIZE - 1;
Expand All @@ -100,11 +103,10 @@ private void serializeGenomeSequence(String chromosome, String sequenceType, Str
genomeSequenceChunk = new GenomeSequenceChunk(chromosome, chromosome + "_" + 0 + "_" + chunkIdSuffix, start,
sequence.length() - 1, sequenceType, sequenceAssembly, chunkSequence);
serializer.serialize(genomeSequenceChunk);
start += CHUNK_SIZE - 1;
} else {
while (start < sequence.length()) {
if (chunk % 10000 == 0) {
System.out.println("Chr:" + chromosome + " chunkId:" + chunk);
logger.info("Chr: {}, chunkId: {}", chromosome, chunk);
}
// First chunk of the chromosome
if (start == 1) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ public class GenomeDownloadManager extends AbstractDownloadManager {

private Path sequenceFolder;

public static final String GENOME_INFO_FILENAME = "genome_info.json";

public GenomeDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration)
throws IOException, CellBaseException {
super(species, assembly, targetDirectory, configuration);
Expand Down Expand Up @@ -79,10 +81,8 @@ public List<DownloadFile> downloadReferenceGenome() throws IOException, Interrup
}

public void downloadGenomeInfo() throws IOException, CellBaseException {
String genomeInfoFilename = "genome_info.json";

// Already downloaded
if (isAlreadyDownloaded(sequenceFolder.resolve(genomeInfoFilename), getDataName(GENOME_INFO_DATA))) {
if (isAlreadyDownloaded(sequenceFolder.resolve(GENOME_INFO_FILENAME), getDataName(GENOME_INFO_DATA))) {
return;
}

Expand All @@ -100,7 +100,7 @@ public void downloadGenomeInfo() throws IOException, CellBaseException {
String params = "/opt/cellbase/scripts/ensembl-scripts/genome_info.pl"
+ " --species \"" + speciesConfiguration.getScientificName() + "\""
+ " --assembly \"" + assemblyConfiguration.getName() + "\""
+ " --outfile \"" + outputBinding.getValue() + "/" + genomeInfoFilename + "\"";
+ " --outfile \"" + outputBinding.getValue() + "/" + GENOME_INFO_FILENAME + "\"";

// Execute perl script in docker
DockerUtils.run(dockerImage, null, outputBinding, params, null);
Expand Down

0 comments on commit 733cade

Please sign in to comment.