Skip to content

Commit

Permalink
lib: update builder to build AlphaMissense predictions, #TASK-5419, #…
Browse files Browse the repository at this point in the history
…TASK-5388
  • Loading branch information
jtarraga committed Dec 21, 2023
1 parent ecfee15 commit 967d4cc
Show file tree
Hide file tree
Showing 4 changed files with 275 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ public class BuildCommandOptions {

@Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to build: genome, genome_info, "
+ "gene, variation, variation_functional_score, regulation, protein, ppi, conservation, drug, "
+ "clinical_variants, repeats, svs, splice_score, pubmed. 'all' builds everything.", required = true, arity = 1)
+ "clinical_variants, repeats, svs, splice_score, pubmed and alphamissense; and 'all' builds everything.", required = true, arity = 1)
public String data;

@Parameter(names = {"-s", "--species"}, description = "Name of the species to be built, valid formats include 'Homo sapiens' or 'hsapiens'", required = false, arity = 1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,9 @@ public void execute() {
case EtlCommons.PHARMACOGENOMICS_DATA:
parser = buildPharmacogenomics();
break;
case EtlCommons.ALPHAMISSENSE_DATA:
parser = buildAlphaMissense();
break;
default:
logger.error("Build option '" + buildCommandOptions.data + "' is not valid");
break;
Expand Down Expand Up @@ -437,4 +440,23 @@ private CellBaseBuilder buildPharmacogenomics() throws IOException {
CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(outFolder);
return new PharmGKBBuilder(inFolder, serializer);
}

/**
 * Prepares the AlphaMissense build: copies the version file (when present) from the download folder to the
 * build folder and creates the builder that parses the downloaded AlphaMissense file.
 *
 * The file to parse is taken from the first entry of the AlphaMissense "files" list in the download
 * configuration; only its file name is used, resolved against the download folder.
 *
 * @return builder that writes its JSON output into the build folder under the AlphaMissense data name
 * @throws IOException if the build folder cannot be created or the version file cannot be copied
 */
private CellBaseBuilder buildAlphaMissense() throws IOException {
    // The previous implementation created an unrelated "pubmed" output folder here (copy-paste leftover);
    // its only useful side effect was making sure the build folder itself exists, which is kept explicitly.
    Files.createDirectories(buildFolder);

    logger.info("Copying AlphaMissense version file...");
    Path versionFile = downloadFolder.resolve(EtlCommons.ALPHAMISSENSE_VERSION_FILENAME);
    if (versionFile.toFile().exists()) {
        Files.copy(versionFile, buildFolder.resolve(EtlCommons.ALPHAMISSENSE_VERSION_FILENAME),
                StandardCopyOption.REPLACE_EXISTING);
    }

    String alphaMissenseFilename = new File(configuration.getDownload().getAlphaMissense().getFiles().get(0)).getName();
    File alphaMissenseFile = downloadFolder.resolve(alphaMissenseFilename).toFile();
    CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, EtlCommons.ALPHAMISSENSE_DATA);
    return new AlphaMissenseBuilder(alphaMissenseFile, serializer);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
/*
* Copyright 2015-2020 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.opencb.cellbase.lib.builders;

import com.fasterxml.jackson.databind.MapperFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectReader;
import com.fasterxml.jackson.databind.ObjectWriter;
import org.apache.commons.lang3.StringUtils;
import org.opencb.biodata.models.core.ProteinSubstitutionPrediction;
import org.opencb.biodata.models.core.ProteinSubstitutionScore;
import org.opencb.cellbase.core.serializer.CellBaseFileSerializer;
import org.opencb.cellbase.lib.builders.utils.RocksDBUtils;
import org.opencb.commons.utils.FileUtils;
import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksIterator;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class AlphaMissenseBuilder extends CellBaseBuilder {

private File alphaMissenseFile;
private CellBaseFileSerializer fileSerializer;

private RocksDB rdb;

private String AA_CHANGE_PATTERN = "^([A-Z])(\\d+)([A-Z])$";
private Pattern aaChangePattern = Pattern.compile(AA_CHANGE_PATTERN);

private static ObjectMapper mapper;
private static ObjectReader predictionReader;
private static ObjectWriter jsonObjectWriter;

static {
mapper = new ObjectMapper();
mapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true);
predictionReader = mapper.readerFor(ProteinSubstitutionPrediction.class);
jsonObjectWriter = mapper.writer();
}

public AlphaMissenseBuilder(File alphaMissenseFile, CellBaseFileSerializer serializer) {
super(serializer);

this.fileSerializer = serializer;
this.alphaMissenseFile = alphaMissenseFile;

logger = LoggerFactory.getLogger(AlphaMissenseBuilder.class);
}

@Override
public void parse() throws Exception {
logger.info("Parsing AlphaMissense file: {} ...", alphaMissenseFile.getName());

// Sanity check
FileUtils.checkFile(alphaMissenseFile.toPath());

Object[] dbConnection = RocksDBUtils.getDBConnection(serializer.getOutdir().resolve("rdb.idx").toString(), true);
rdb = (RocksDB) dbConnection[0];
Options dbOption = (Options) dbConnection[1];
String dbLocation = (String) dbConnection[2];

// AlphaMissense file reader
BufferedReader br = FileUtils.newBufferedReader(alphaMissenseFile.toPath());
String line;
int counter = 0;
while ((line = br.readLine()) != null) {
if (!line.startsWith("#")) {
// 0 1 2 3 4 5 6 7 8 9
// CHROM POS REF ALT genome uniprot_id transcript_id protein_variant am_pathogenicity am_class
String[] split = line.split("\t", -1);

String transcriptId;
String uniprotId;
int position;
String aaReference;
String aaAlternate;

if (StringUtils.isNotEmpty(split[6])) {
transcriptId = split[6];
} else {
logger.warn("Missing field 'transcript_id', skipping line: {}", line);
return;
}
if (StringUtils.isNotEmpty(split[5])) {
uniprotId = split[5];
} else {
logger.warn("Missing field 'uniprot_id', skipping line: {}", line);
return;
}
if (StringUtils.isNotEmpty(split[7])) {
Matcher matcher = aaChangePattern.matcher(split[7]);
if (matcher.matches()) {
aaReference = matcher.group(1);
position = Integer.parseInt(matcher.group(2));
aaAlternate = matcher.group(3);
} else {
logger.warn("Error parsing field 'protein_variant' = {}, skipping line: {}", split[7], line);
return;
}
} else {
logger.warn("Missing field 'protein_variant', skipping line: {}", line);
return;
}

// Create protein substitution score
ProteinSubstitutionScore score = new ProteinSubstitutionScore();
score.setAaAlternate(aaAlternate);
if (StringUtils.isNotEmpty(split[8])) {
score.setScore(Double.parseDouble(split[8]));
}
if (StringUtils.isNotEmpty(split[9])) {
score.setEffect(split[9]);
}

// Creating and/or updating protein substitution prediction
ProteinSubstitutionPrediction prediction;
String key = transcriptId + "_" + uniprotId + "_" + position + "_" + aaReference;
byte[] dbContent = rdb.get(key.getBytes());
if (dbContent == null) {
prediction = new ProteinSubstitutionPrediction(transcriptId, uniprotId, position, aaReference, "AlphaMissense",
Collections.singletonList(score));
} else {
prediction = predictionReader.readValue(dbContent);
prediction.getScores().add(score);
}
rdb.put(key.getBytes(), jsonObjectWriter.writeValueAsBytes(prediction));

// Log messages
counter++;
if (counter % 10000 == 0) {
logger.info("{} AlphaMissense predictions parsed", counter);
}
}
}

// Serialize/write the saved variant polygenic scores in the RocksDB
serializeRDB(rdb);
RocksDBUtils.closeIndex(rdb, dbOption, dbLocation);
serializer.close();

logger.info("Parsed AlphaMissense file: {}. Done!", alphaMissenseFile.getName());
}

private void serializeRDB(RocksDB rdb) throws IOException {
// DO NOT change the name of the rocksIterator variable - for some unexplainable reason Java VM crashes if it's
// named "iterator"
RocksIterator rocksIterator = rdb.newIterator();

logger.info("Reading from RocksDB index and serializing to {}.json.gz", serializer.getOutdir().resolve(serializer.getFileName()));
int counter = 0;
for (rocksIterator.seekToFirst(); rocksIterator.isValid(); rocksIterator.next()) {
logger.info("variant = {}", new String(rocksIterator.key()));
ProteinSubstitutionPrediction prediction = predictionReader.readValue(rocksIterator.value());
serializer.serialize(prediction);
counter++;
if (counter % 10000 == 0) {
logger.info("{} written", counter);
}
}
serializer.close();
logger.info("Done.");
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/*
* Copyright 2015-2020 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.opencb.cellbase.lib.builders.utils;

import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Static helpers to open and dispose the temporary RocksDB indexes used by the builders.
 */
public class RocksDBUtils {

    /**
     * Closes the database, disposes its options and deletes the database directory from disk.
     * Every argument may be null, in which case the corresponding step is skipped.
     *
     * @param rdb        database to close
     * @param dbOption   options the database was opened with
     * @param dbLocation path of the database directory; it is deleted recursively when it exists
     * @throws IOException if the database directory cannot be deleted
     */
    public static void closeIndex(RocksDB rdb, Options dbOption, String dbLocation) throws IOException {
        if (rdb != null) {
            rdb.close();
        }
        if (dbOption != null) {
            dbOption.dispose();
        }
        if (dbLocation != null && Files.exists(Paths.get(dbLocation))) {
            org.apache.commons.io.FileUtils.deleteDirectory(new File(dbLocation));
        }
    }

    /**
     * Opens the RocksDB database stored at the given location. The database is opened for writing (and created
     * when missing) if {@code forceCreate} is true or the location does not exist yet; otherwise it is opened
     * read-only.
     *
     * @param dbLocation  path of the database directory
     * @param forceCreate true to open the database for writing even when the location already exists
     * @return array with, in this order: the {@link RocksDB} instance, the {@link Options} used to open it,
     *         the database location (String) and whether the database was opened for writing (Boolean)
     * @throws RocksDBException if the database cannot be opened
     */
    public static Object[] getDBConnection(String dbLocation, boolean forceCreate) throws RocksDBException {
        boolean indexingNeeded = forceCreate || !Files.exists(Paths.get(dbLocation));
        // a static method that loads the RocksDB C++ library.
        RocksDB.loadLibrary();
        // the Options class contains a set of configurable DB options
        // that determines the behavior of a database.
        Options options = new Options().setCreateIfMissing(true);

        // options.setMaxBackgroundCompactions(4);
        // options.setMaxBackgroundFlushes(1);
        // options.setCompressionType(CompressionType.NO_COMPRESSION);
        // options.setMaxOpenFiles(-1);
        // options.setIncreaseParallelism(4);
        // options.setCompactionStyle(CompactionStyle.LEVEL);
        // options.setLevelCompactionDynamicLevelBytes(true);

        RocksDB db;
        try {
            // a factory method that returns a RocksDB instance
            if (indexingNeeded) {
                db = RocksDB.open(options, dbLocation);
            } else {
                db = RocksDB.openReadOnly(options, dbLocation);
            }
        } catch (RocksDBException e) {
            // The caller never receives the options when opening fails, so release the native handle here
            options.dispose();
            throw e;
        }

        return new Object[]{db, options, dbLocation, indexingNeeded};
    }
}

0 comments on commit 967d4cc

Please sign in to comment.