-
Notifications
You must be signed in to change notification settings - Fork 53
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
lib: update builder to build AlphaMissense predictions, #TASK-5419, #…
…TASK-5388
- Loading branch information
Showing
4 changed files
with
275 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
184 changes: 184 additions & 0 deletions
184
cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/AlphaMissenseBuilder.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,184 @@ | ||
/* | ||
* Copyright 2015-2020 OpenCB | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.opencb.cellbase.lib.builders; | ||
|
||
import com.fasterxml.jackson.databind.MapperFeature; | ||
import com.fasterxml.jackson.databind.ObjectMapper; | ||
import com.fasterxml.jackson.databind.ObjectReader; | ||
import com.fasterxml.jackson.databind.ObjectWriter; | ||
import org.apache.commons.lang3.StringUtils; | ||
import org.opencb.biodata.models.core.ProteinSubstitutionPrediction; | ||
import org.opencb.biodata.models.core.ProteinSubstitutionScore; | ||
import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; | ||
import org.opencb.cellbase.lib.builders.utils.RocksDBUtils; | ||
import org.opencb.commons.utils.FileUtils; | ||
import org.rocksdb.Options; | ||
import org.rocksdb.RocksDB; | ||
import org.rocksdb.RocksIterator; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.File; | ||
import java.io.IOException; | ||
import java.util.Collections; | ||
import java.util.regex.Matcher; | ||
import java.util.regex.Pattern; | ||
|
||
public class AlphaMissenseBuilder extends CellBaseBuilder { | ||
|
||
private File alphaMissenseFile; | ||
private CellBaseFileSerializer fileSerializer; | ||
|
||
private RocksDB rdb; | ||
|
||
private String AA_CHANGE_PATTERN = "^([A-Z])(\\d+)([A-Z])$"; | ||
private Pattern aaChangePattern = Pattern.compile(AA_CHANGE_PATTERN); | ||
|
||
private static ObjectMapper mapper; | ||
private static ObjectReader predictionReader; | ||
private static ObjectWriter jsonObjectWriter; | ||
|
||
static { | ||
mapper = new ObjectMapper(); | ||
mapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); | ||
predictionReader = mapper.readerFor(ProteinSubstitutionPrediction.class); | ||
jsonObjectWriter = mapper.writer(); | ||
} | ||
|
||
public AlphaMissenseBuilder(File alphaMissenseFile, CellBaseFileSerializer serializer) { | ||
super(serializer); | ||
|
||
this.fileSerializer = serializer; | ||
this.alphaMissenseFile = alphaMissenseFile; | ||
|
||
logger = LoggerFactory.getLogger(AlphaMissenseBuilder.class); | ||
} | ||
|
||
@Override | ||
public void parse() throws Exception { | ||
logger.info("Parsing AlphaMissense file: {} ...", alphaMissenseFile.getName()); | ||
|
||
// Sanity check | ||
FileUtils.checkFile(alphaMissenseFile.toPath()); | ||
|
||
Object[] dbConnection = RocksDBUtils.getDBConnection(serializer.getOutdir().resolve("rdb.idx").toString(), true); | ||
rdb = (RocksDB) dbConnection[0]; | ||
Options dbOption = (Options) dbConnection[1]; | ||
String dbLocation = (String) dbConnection[2]; | ||
|
||
// AlphaMissense file reader | ||
BufferedReader br = FileUtils.newBufferedReader(alphaMissenseFile.toPath()); | ||
String line; | ||
int counter = 0; | ||
while ((line = br.readLine()) != null) { | ||
if (!line.startsWith("#")) { | ||
// 0 1 2 3 4 5 6 7 8 9 | ||
// CHROM POS REF ALT genome uniprot_id transcript_id protein_variant am_pathogenicity am_class | ||
String[] split = line.split("\t", -1); | ||
|
||
String transcriptId; | ||
String uniprotId; | ||
int position; | ||
String aaReference; | ||
String aaAlternate; | ||
|
||
if (StringUtils.isNotEmpty(split[6])) { | ||
transcriptId = split[6]; | ||
} else { | ||
logger.warn("Missing field 'transcript_id', skipping line: {}", line); | ||
return; | ||
} | ||
if (StringUtils.isNotEmpty(split[5])) { | ||
uniprotId = split[5]; | ||
} else { | ||
logger.warn("Missing field 'uniprot_id', skipping line: {}", line); | ||
return; | ||
} | ||
if (StringUtils.isNotEmpty(split[7])) { | ||
Matcher matcher = aaChangePattern.matcher(split[7]); | ||
if (matcher.matches()) { | ||
aaReference = matcher.group(1); | ||
position = Integer.parseInt(matcher.group(2)); | ||
aaAlternate = matcher.group(3); | ||
} else { | ||
logger.warn("Error parsing field 'protein_variant' = {}, skipping line: {}", split[7], line); | ||
return; | ||
} | ||
} else { | ||
logger.warn("Missing field 'protein_variant', skipping line: {}", line); | ||
return; | ||
} | ||
|
||
// Create protein substitution score | ||
ProteinSubstitutionScore score = new ProteinSubstitutionScore(); | ||
score.setAaAlternate(aaAlternate); | ||
if (StringUtils.isNotEmpty(split[8])) { | ||
score.setScore(Double.parseDouble(split[8])); | ||
} | ||
if (StringUtils.isNotEmpty(split[9])) { | ||
score.setEffect(split[9]); | ||
} | ||
|
||
// Creating and/or updating protein substitution prediction | ||
ProteinSubstitutionPrediction prediction; | ||
String key = transcriptId + "_" + uniprotId + "_" + position + "_" + aaReference; | ||
byte[] dbContent = rdb.get(key.getBytes()); | ||
if (dbContent == null) { | ||
prediction = new ProteinSubstitutionPrediction(transcriptId, uniprotId, position, aaReference, "AlphaMissense", | ||
Collections.singletonList(score)); | ||
} else { | ||
prediction = predictionReader.readValue(dbContent); | ||
prediction.getScores().add(score); | ||
} | ||
rdb.put(key.getBytes(), jsonObjectWriter.writeValueAsBytes(prediction)); | ||
|
||
// Log messages | ||
counter++; | ||
if (counter % 10000 == 0) { | ||
logger.info("{} AlphaMissense predictions parsed", counter); | ||
} | ||
} | ||
} | ||
|
||
// Serialize/write the saved variant polygenic scores in the RocksDB | ||
serializeRDB(rdb); | ||
RocksDBUtils.closeIndex(rdb, dbOption, dbLocation); | ||
serializer.close(); | ||
|
||
logger.info("Parsed AlphaMissense file: {}. Done!", alphaMissenseFile.getName()); | ||
} | ||
|
||
private void serializeRDB(RocksDB rdb) throws IOException { | ||
// DO NOT change the name of the rocksIterator variable - for some unexplainable reason Java VM crashes if it's | ||
// named "iterator" | ||
RocksIterator rocksIterator = rdb.newIterator(); | ||
|
||
logger.info("Reading from RocksDB index and serializing to {}.json.gz", serializer.getOutdir().resolve(serializer.getFileName())); | ||
int counter = 0; | ||
for (rocksIterator.seekToFirst(); rocksIterator.isValid(); rocksIterator.next()) { | ||
logger.info("variant = {}", new String(rocksIterator.key())); | ||
ProteinSubstitutionPrediction prediction = predictionReader.readValue(rocksIterator.value()); | ||
serializer.serialize(prediction); | ||
counter++; | ||
if (counter % 10000 == 0) { | ||
logger.info("{} written", counter); | ||
} | ||
} | ||
serializer.close(); | ||
logger.info("Done."); | ||
} | ||
} |
68 changes: 68 additions & 0 deletions
68
cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/utils/RocksDBUtils.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
/* | ||
* Copyright 2015-2020 OpenCB | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.opencb.cellbase.lib.builders.utils; | ||
|
||
import org.rocksdb.Options; | ||
import org.rocksdb.RocksDB; | ||
import org.rocksdb.RocksDBException; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
import java.nio.file.Files; | ||
import java.nio.file.Paths; | ||
|
||
public class RocksDBUtils { | ||
|
||
public static void closeIndex(RocksDB rdb, Options dbOption, String dbLocation) throws IOException { | ||
if (rdb != null) { | ||
rdb.close(); | ||
} | ||
if (dbOption != null) { | ||
dbOption.dispose(); | ||
} | ||
if (dbLocation != null && Files.exists(Paths.get(dbLocation))) { | ||
org.apache.commons.io.FileUtils.deleteDirectory(new File(dbLocation)); | ||
} | ||
} | ||
|
||
public static Object[] getDBConnection(String dbLocation, boolean forceCreate) throws RocksDBException { | ||
boolean indexingNeeded = forceCreate || !Files.exists(Paths.get(dbLocation)); | ||
// a static method that loads the RocksDB C++ library. | ||
RocksDB.loadLibrary(); | ||
// the Options class contains a set of configurable DB options | ||
// that determines the behavior of a database. | ||
Options options = new Options().setCreateIfMissing(true); | ||
|
||
// options.setMaxBackgroundCompactions(4); | ||
// options.setMaxBackgroundFlushes(1); | ||
// options.setCompressionType(CompressionType.NO_COMPRESSION); | ||
// options.setMaxOpenFiles(-1); | ||
// options.setIncreaseParallelism(4); | ||
// options.setCompactionStyle(CompactionStyle.LEVEL); | ||
// options.setLevelCompactionDynamicLevelBytes(true); | ||
|
||
RocksDB db; | ||
// a factory method that returns a RocksDB instance | ||
if (indexingNeeded) { | ||
db = RocksDB.open(options, dbLocation); | ||
} else { | ||
db = RocksDB.openReadOnly(options, dbLocation); | ||
} | ||
|
||
return new Object[]{db, options, dbLocation, indexingNeeded}; | ||
} | ||
} |