Skip to content

Commit

Permalink
lib: add support for multi-species, checks and log messages in regula…
Browse files Browse the repository at this point in the history
…tion builder, #TASK-5576, #TASK-5564
  • Loading branch information
jtarraga committed Jul 26, 2024
1 parent 847f835 commit b0d1c67
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_DONE_LOG_MESSAGE;
import static org.opencb.cellbase.lib.builders.AbstractBuilder.BUILDING_LOG_MESSAGE;
import static org.opencb.cellbase.lib.builders.GenomeSequenceFastaBuilder.GENOME_OUTPUT_FILENAME;
import static org.opencb.cellbase.lib.builders.RegulatoryFeatureBuilder.*;
import static org.opencb.cellbase.lib.builders.RepeatsBuilder.REPEATS_OUTPUT_FILENAME;
import static org.opencb.cellbase.lib.download.GenomeDownloadManager.GENOME_INFO_FILENAME;

Expand All @@ -67,6 +68,8 @@ public class BuildCommandExecutor extends CommandExecutor {

private boolean flexibleGTFParsing;

private static final String DATA_ALREADY_BUILT = "{} data has already been built.";

public BuildCommandExecutor(AdminCliOptionsParser.BuildCommandOptions buildCommandOptions) {
super(buildCommandOptions.commonOptions.logLevel, buildCommandOptions.commonOptions.conf);

Expand Down Expand Up @@ -194,7 +197,7 @@ private AbstractBuilder buildGenomeSequence() throws CellBaseException {
if (Files.exists(genomeBuildFolder.resolve(GENOME_OUTPUT_FILENAME))
&& Files.exists(genomeBuildFolder.resolve(GENOME_INFO_FILENAME))
&& Files.exists(genomeBuildFolder.resolve(getDataVersionFilename(GENOME_DATA)))) {
logger.warn("{} data has been already built", getDataName(GENOME_DATA));
logger.warn(DATA_ALREADY_BUILT, getDataName(GENOME_DATA));
return null;
}

Expand Down Expand Up @@ -241,13 +244,12 @@ private AbstractBuilder buildRepeats() throws CellBaseException {
Path repeatsDownloadPath = downloadFolder.resolve(REPEATS_DATA);
Path repeatsBuildPath = buildFolder.resolve(REPEATS_DATA);
List<String> dataList = EtlCommons.getDataList(REPEATS_DATA, configuration, speciesConfiguration);
List<Path> filesToCheck = new ArrayList<>();
filesToCheck.add(repeatsBuildPath.resolve(REPEATS_OUTPUT_FILENAME));
List<Path> filesToCheck = new ArrayList<>(Arrays.asList(repeatsBuildPath.resolve(REPEATS_OUTPUT_FILENAME)));
for (String data : dataList) {
filesToCheck.add(repeatsBuildPath.resolve(getDataVersionFilename(data)));
}
if (AbstractBuilder.existFiles(filesToCheck)) {
logger.warn("{} data has been already built", getDataName(REPEATS_DATA));
logger.warn(DATA_ALREADY_BUILT, getDataName(REPEATS_DATA));
return null;
}
for (String data : dataList) {
Expand Down Expand Up @@ -300,11 +302,23 @@ private AbstractBuilder buildRevel() throws CellBaseException {
}

private AbstractBuilder buildRegulation() throws CellBaseException {
logger.info(BUILDING_LOG_MESSAGE, getDataName(REGULATION_DATA));

// Sanity check
Path regulationDownloadPath = downloadFolder.resolve(REGULATION_DATA);
Path regulationBuildPath = buildFolder.resolve(REGULATION_DATA);
copyVersionFiles(Arrays.asList(regulationDownloadPath.resolve(getDataVersionFilename(REGULATORY_BUILD_DATA)),
regulationDownloadPath.resolve(getDataVersionFilename(MOTIF_FEATURES_DATA))), regulationBuildPath);
List<Path> filesToCheck = Arrays.asList(regulationBuildPath.resolve(REGULATORY_REGION_OUTPUT_FILENAME),
regulationBuildPath.resolve(REGULATORY_PFM_OUTPUT_FILENAME),
regulationBuildPath.resolve(getDataVersionFilename(REGULATORY_BUILD_DATA)),
regulationBuildPath.resolve(getDataVersionFilename(MOTIF_FEATURES_DATA)));
if (AbstractBuilder.existFiles(filesToCheck)) {
logger.warn(DATA_ALREADY_BUILT, getDataName(REGULATION_DATA));
return null;
}

copyVersionFiles(Arrays.asList(regulationDownloadPath.resolve(REGULATORY_BUILD_DATA).resolve(getDataVersionFilename(
REGULATORY_BUILD_DATA)), regulationDownloadPath.resolve(MOTIF_FEATURES_DATA).resolve(getDataVersionFilename(
MOTIF_FEATURES_DATA))), regulationBuildPath);

// Create the file serializer and the regulatory feature builder
CellBaseSerializer serializer = new CellBaseJsonFileSerializer(regulationBuildPath, REGULATORY_REGION_BASENAME);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -263,8 +263,6 @@ public final class EtlCommons {

// Regulation
public static final String REGULATION_DATA = "regulation";
public static final String REGULATORY_PFM_BASENAME = "regulatory_pfm";
public static final String REGULATORY_REGION_BASENAME = "regulatory_region";
// Regulatory build (see Ensembl files: regulatory build file)
public static final String REGULATORY_BUILD_DATA = "regulatory_build";
// Motif features (see Ensembl files)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ public abstract class AbstractBuilder {
public static final String PARSING_LOG_MESSAGE = "Parsing {} ...";
public static final String PARSING_DONE_LOG_MESSAGE = "Parsing done.";

public AbstractBuilder(CellBaseSerializer serializer) {
protected AbstractBuilder(CellBaseSerializer serializer) {
logger = LoggerFactory.getLogger(this.getClass());

this.serializer = serializer;
Expand All @@ -75,7 +75,7 @@ public void disconnect() {
try {
serializer.close();
} catch (Exception e) {
logger.error("Error closing serializer:\n" + StringUtils.join(e.getStackTrace(), "\n"));
logger.error("Error closing serializer. Stack trace: {}", e.getStackTrace());
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,33 +45,41 @@
public class RegulatoryFeatureBuilder extends AbstractBuilder {

private Path regulationPath;

private Set<Gff2> regulatoryFeatureSet;

public static final String REGULATORY_REGION_BASENAME = "regulatory_region";
public static final String REGULATORY_REGION_OUTPUT_FILENAME = REGULATORY_REGION_BASENAME + ".json.gz";
public static final String REGULATORY_PFM_BASENAME = "regulatory_pfm";
public static final String REGULATORY_PFM_OUTPUT_FILENAME = REGULATORY_PFM_BASENAME + ".json.gz";

/**
 * Creates a regulatory feature builder.
 *
 * @param regulationPath Path kept by this builder; {@link #parse()} checks it as a directory and resolves the
 *                       regulatory build and motif features data under it
 * @param serializer     Serializer handed over to the {@link AbstractBuilder} constructor
 */
public RegulatoryFeatureBuilder(Path regulationPath, CellBaseSerializer serializer) {
super(serializer);
this.regulationPath = regulationPath;
}

@Override
public void parse() throws Exception {
logger.info(BUILDING_LOG_MESSAGE, getDataName(REGULATION_DATA));

// Sanity check
checkDirectory(regulationPath, getDataName(REGULATION_DATA));

DataSource dataSource;
List<File> regulatoryFiles;
List<File> motifFeaturesFiles;

// Check build regulatory files
DataSource dataSource = dataSourceReader.readValue(regulationPath.resolve(getDataVersionFilename(REGULATORY_BUILD_DATA)).toFile());
List<File> regulatoryFiles = checkFiles(dataSource, regulationPath, getDataCategory(REGULATORY_BUILD_DATA) + "/"
dataSource = dataSourceReader.readValue(regulationPath.resolve(REGULATORY_BUILD_DATA)
.resolve(getDataVersionFilename(REGULATORY_BUILD_DATA)).toFile());
regulatoryFiles = checkFiles(dataSource, regulationPath.resolve(REGULATORY_BUILD_DATA), getDataCategory(REGULATORY_BUILD_DATA) + "/"
+ getDataName(REGULATORY_BUILD_DATA));
if (regulatoryFiles.size() != 1) {
throw new CellBaseException("One " + getDataName(REGULATORY_BUILD_DATA) + " file is expected, but currently there are "
+ regulatoryFiles.size() + " files");
}

// Check motif features files
dataSource = dataSourceReader.readValue(regulationPath.resolve(getDataVersionFilename(MOTIF_FEATURES_DATA)).toFile());
List<File> motifFeaturesFiles = checkFiles(dataSource, regulationPath, getDataCategory(MOTIF_FEATURES_DATA) + "/"
dataSource = dataSourceReader.readValue(regulationPath.resolve(MOTIF_FEATURES_DATA)
.resolve(getDataVersionFilename(MOTIF_FEATURES_DATA)).toFile());
motifFeaturesFiles = checkFiles(dataSource, regulationPath.resolve(MOTIF_FEATURES_DATA), getDataCategory(MOTIF_FEATURES_DATA) + "/"
+ getDataName(MOTIF_FEATURES_DATA));
if (motifFeaturesFiles.size() != 2) {
throw new CellBaseException("Two " + getDataName(MOTIF_FEATURES_DATA) + " files are expected, but currently there are "
Expand All @@ -84,8 +92,6 @@ public void parse() throws Exception {

// Parse regulatory build features
parseGffFile(regulatoryFiles.get(0).toPath());

logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(REGULATION_DATA));
}

protected void parseGffFile(Path regulatoryFeatureFile) throws IOException, NoSuchMethodException, FileFormatException {
Expand All @@ -110,19 +116,20 @@ protected void parseGffFile(Path regulatoryFeatureFile) throws IOException, NoSu
}
serializer.close();

logger.info(PARSING_DONE_LOG_MESSAGE, regulatoryFeatureFile);
logger.info(PARSING_DONE_LOG_MESSAGE);
}

private void loadPfmMatrices(Path motifGffFile, Path buildFolder) throws IOException, NoSuchMethodException, FileFormatException,
InterruptedException {
Path regulatoryPfmPath = buildFolder.resolve(REGULATORY_PFM_BASENAME + ".json.gz");
Path regulatoryPfmPath = buildFolder.resolve(REGULATORY_PFM_OUTPUT_FILENAME);
logger.info("Downloading and building PFM matrices in {} from {} ...", regulatoryPfmPath, motifGffFile);
if (Files.exists(regulatoryPfmPath)) {
logger.info("{} is already built", regulatoryPfmPath);
return;
}

Set<String> motifIds = new HashSet<>();
logger.info(PARSING_LOG_MESSAGE, motifGffFile);
try (Gff2Reader motifsFeatureReader = new Gff2Reader(motifGffFile)) {
Gff2 tfbsMotifFeature;
Pattern filePattern = Pattern.compile("ENSPFM(\\d+)");
Expand All @@ -133,6 +140,7 @@ private void loadPfmMatrices(Path motifGffFile, Path buildFolder) throws IOExcep
}
}
}
logger.info(PARSING_DONE_LOG_MESSAGE);

ObjectMapper mapper = new ObjectMapper();
CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, REGULATORY_PFM_BASENAME, true);
Expand Down

0 comments on commit b0d1c67

Please sign in to comment.