Skip to content

Commit

Permalink
Merge branch 'repr-domains' into develop
Browse files Browse the repository at this point in the history
# Conflicts:
#	core/jms-implementation/support-mini-x86-32/work/template/interpro.zip
  • Loading branch information
matthiasblum committed Jan 11, 2024
2 parents 397ac78 + 13c2a25 commit ab25abd
Show file tree
Hide file tree
Showing 13 changed files with 288 additions and 10 deletions.

Large diffs are not rendered by default.

Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package uk.ac.ebi.interpro.scan.management.model.implementations;
import uk.ac.ebi.interpro.scan.model.*;

import java.util.HashSet;
import java.util.Set;

/**
 * A match location together with the set of sequence positions it covers
 * and the rank of the member database that produced it.
 *
 * <p>The covered positions are computed once, at construction time, from the
 * location's fragments (both fragment bounds are inclusive).
 */
public class Domain {
    private final Location location;
    private final Set<Integer> residues;
    private final int databaseRank;

    /**
     * Builds a domain and expands every fragment of {@code location} into
     * the individual positions {@code start..end}.
     *
     * @param location     the match location this domain wraps
     * @param databaseRank rank value associated with the originating database
     */
    public Domain(Location location, int databaseRank) {
        this.location = location;
        this.residues = new HashSet<>();
        this.databaseRank = databaseRank;

        // Location is used as a raw type here, so the fragments have to go
        // through a typed local before they can be iterated.
        Set<LocationFragment> parts = location.getLocationFragments();
        for (LocationFragment part : parts) {
            int position = part.getStart();
            while (position <= part.getEnd()) {
                this.residues.add(position);
                position++;
            }
        }
    }

    /** @return the wrapped location */
    public Location getLocation() {
        return location;
    }

    /** @return the positions covered by this domain (the internal set, not a copy) */
    public Set<Integer> getResidues() {
        return residues;
    }

    /** @return the database rank given at construction */
    public int getDatabaseRank() {
        return databaseRank;
    }

    /**
     * Tells whether this domain and {@code other} share enough positions.
     *
     * @param other     the domain to compare against
     * @param threshold minimum shared fraction, measured against the smaller
     *                  of the two domains
     * @return {@code true} when at least one position is shared and the
     *         shared fraction is greater than or equal to {@code threshold}
     */
    public boolean overlaps(Domain other, double threshold) {
        Set<Integer> theirs = other.getResidues();

        int shared = 0;
        for (Integer position : this.residues) {
            if (theirs.contains(position)) {
                shared++;
            }
        }

        if (shared == 0) {
            return false;
        }

        int smallest = Math.min(this.residues.size(), theirs.size());
        double fraction = (double) shared / smallest;
        return fraction >= threshold;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package uk.ac.ebi.interpro.scan.management.model.implementations;

import java.util.*;

/**
 * Enumerates every subset of graph nodes whose members are all pairwise
 * connected (including the empty subset and all singletons).
 *
 * <p>The graph is an adjacency map: {@code graph.get(a)} is the set of nodes
 * that {@code a} is connected to. Two nodes may share a subset only when each
 * one lists the other.
 */
public class DomainResolver {
    private final Map<Integer, Set<Integer>> graph;

    /**
     * @param graph adjacency map; the keys are the nodes to combine
     */
    public DomainResolver(Map<Integer, Set<Integer>> graph) {
        this.graph = graph;
    }

    /**
     * Computes all valid subsets.
     *
     * <p>Every call returns a fresh list, so calling this method repeatedly
     * neither duplicates results nor alters a previously returned list.
     *
     * @return all subsets of the graph's keys whose members are pairwise
     *         connected; always contains at least the empty set
     */
    public List<Set<Integer>> resolve() {
        List<Set<Integer>> results = new ArrayList<>();
        List<Integer> nodes = new ArrayList<>(this.graph.keySet());
        collect(new ArrayList<>(), nodes, 0, results);
        return results;
    }

    /**
     * Depth-first enumeration: for the node at {@code index}, first explore
     * the subsets that include it (only if it is compatible with everything
     * already selected), then the subsets that leave it out.
     */
    private void collect(List<Integer> selected, List<Integer> nodes, int index,
                         List<Set<Integer>> results) {
        if (index == nodes.size()) {
            results.add(new HashSet<>(selected));
            return;
        }

        int node = nodes.get(index);

        // "selected" is always a valid subset, so only the new node needs checking.
        if (isCompatible(selected, node)) {
            selected.add(node);
            collect(selected, nodes, index + 1, results);
            selected.remove(selected.size() - 1); // backtrack (removal by index)
        }

        collect(selected, nodes, index + 1, results);
    }

    /** Returns true when {@code node} is connected, in both directions, to every selected node. */
    private boolean isCompatible(List<Integer> selected, int node) {
        for (int member : selected) {
            if (member != node && !(linked(member, node) && linked(node, member))) {
                return false;
            }
        }
        return true;
    }

    /** Returns true when {@code from} lists {@code to} as a neighbour; a missing adjacency set means no edges. */
    private boolean linked(int from, int to) {
        Set<Integer> neighbours = this.graph.get(from);
        return neighbours != null && neighbours.contains(to);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
public class PrepareForOutputStep extends Step {

private static final Logger LOGGER = LogManager.getLogger(PrepareForOutputStep.class.getName());
private static final int MAX_NUM_DOMAINS_BY_GROUP = 20;
private static final double DOMAIN_OVERLAP_THRESHOLD = 0.3;

//DAOs
private ProteinDAO proteinDAO;
Expand All @@ -50,6 +52,7 @@ public class PrepareForOutputStep extends Step {
private ConcurrentHashMap<String, List<String>> entry2GoTermsMap;
private ConcurrentHashMap<String, List<String>> entry2PathwayMap;
private ConcurrentHashMap<String, List<String>> pathwayMap;
private ConcurrentHashMap<String, Integer> domainsMap;

Random random = new Random();

Expand Down Expand Up @@ -153,6 +156,8 @@ public void execute(StepInstance stepInstance, String temporaryFileDirectory) {
getEntry2GoTermsMap();
}

getDomainsMap();

//proceed to rest of functionality
Utilities.verboseLog(1100, "Pre-marshall the proteins ...");
simulateMarshalling(stepInstance, "p", temporaryFileDirectory, mapToGO, mapToInterPro);
Expand Down Expand Up @@ -397,6 +402,7 @@ private void simulateMarshalling(StepInstance stepInstance, String sequenceType,
}

totalWaitTime = 0;
ArrayList<Domain> domains = new ArrayList<>();
for (String signatureLibraryName : signatureLibraryNames) {
final String dbKey = proteinKey + signatureLibraryName;

Expand Down Expand Up @@ -450,6 +456,16 @@ private void simulateMarshalling(StepInstance stepInstance, String sequenceType,
pantherMatch.setGoXRefs(goXrefs);
}

if (this.domainsMap.containsKey(match.getSignature().getAccession())) {
int databaseRank = this.domainsMap.get(match.getSignature().getAccession());
Set<Location> locations = match.getLocations();
if (locations != null) {
for (Location location: locations) {
domains.add(new Domain(location, databaseRank));
}
}
}

Entry simpleEntry = match.getSignature().getEntry();
if (simpleEntry != null) {
if (mapToInterPro) {
Expand All @@ -473,6 +489,10 @@ private void simulateMarshalling(StepInstance stepInstance, String sequenceType,
}
}

if (domains.size() > 0) {
selectRepresentativeDomains(domains);
}

//TODO Temp check what breaks if you dont do pre-marshalling
//String xmlProtein = writer.marshal(protein);

Expand Down Expand Up @@ -862,6 +882,25 @@ public void getEntry2GoTermsMap(){
}
}

/**
 * Loads the accession-to-rank lookup from {@code domains.json} (located
 * under {@link #getEntryKVPath()}) into {@code domainsMap}.
 *
 * <p>Does nothing when the map is already populated. If the file cannot be
 * read or parsed, the error is logged and an empty map is installed, so that
 * later lookups on {@code domainsMap} find no entries instead of failing on
 * a null reference.
 */
public void getDomainsMap() {
    if (domainsMap != null) {
        return;
    }

    try {
        File file = new File(this.getEntryKVPath() + "/domains.json");
        // try-with-resources: the stream is closed on success and on failure.
        try (FileInputStream is = new FileInputStream(file)) {
            ObjectMapper mapper = new ObjectMapper();
            mapper.configure(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES, true);
            Map<String, Integer> jsonMap = mapper.readValue(is, new TypeReference<Map<String, Integer>>() {});
            domainsMap = new ConcurrentHashMap<>(jsonMap);
        }
    } catch (Exception ex) {
        // Best effort, as before: keep running, but report through the logger.
        LOGGER.error("Could not load domains.json; no representative domains will be selected", ex);
    }

    if (domainsMap == null) {
        domainsMap = new ConcurrentHashMap<>();
    }
}

public Entry updateEntryXrefs(Entry entry) {
String entryAc = entry.getAccession();
Set<GoXref> goXrefs = (Set<GoXref>) getGoXrefsByEntryAc(entryAc); //entry2GoXrefsMap.get(entryAc);
Expand Down Expand Up @@ -1050,4 +1089,117 @@ private String getTryCountStats(ArrayList<Pair<Integer, Integer>> observedTryCou

return " tryCounts:" + tryCount + " maxTryCount:" + maxTryCount + " maxtotalWaitTime: " + maxtotalWaitTime;
}

/**
 * Flags the representative domains among {@code domains}.
 *
 * <p>Domains are sorted by position and split into groups of overlapping
 * locations. Within each group, the longest domains (at most
 * {@code MAX_NUM_DOMAINS_BY_GROUP}) are kept as candidates, and every
 * combination of candidates that do not overlap each other by
 * {@code DOMAIN_OVERLAP_THRESHOLD} or more is evaluated. The combination
 * covering the most positions wins; ties go to the combination with more
 * rank-0 domains. The locations of the winning domains are marked with
 * {@code setRepresentative(true)}.
 *
 * <p>The list passed in is sorted in place.
 *
 * @param domains candidate domains of one protein; an empty list is a no-op
 */
private void selectRepresentativeDomains(ArrayList<Domain> domains) {
    if (domains == null || domains.isEmpty()) {
        return;
    }

    // Order by start, then end, so that groupDomains can sweep left to right.
    // Integer.compare avoids the overflow risk of comparing by subtraction.
    domains.sort((d1, d2) -> {
        int byStart = Integer.compare(d1.getLocation().getStart(), d2.getLocation().getStart());
        if (byStart != 0) {
            return byStart;
        }
        return Integer.compare(d1.getLocation().getEnd(), d2.getLocation().getEnd());
    });

    ArrayList<ArrayList<Domain>> groups = groupDomains(domains);

    for (ArrayList<Domain> allDomainsInGroup : groups) {
        // Longest first; equal lengths are ordered by ascending database rank.
        allDomainsInGroup.sort((d1, d2) -> {
            int bySize = Integer.compare(d2.getResidues().size(), d1.getResidues().size());
            if (bySize != 0) {
                return bySize;
            }
            return Integer.compare(d1.getDatabaseRank(), d2.getDatabaseRank());
        });

        // Cap the candidates: the resolver enumerates subsets, which grows exponentially.
        List<Domain> bestDomainsInGroup = allDomainsInGroup.subList(0, Math.min(MAX_NUM_DOMAINS_BY_GROUP, allDomainsInGroup.size()));
        int numCandidates = bestDomainsInGroup.size();

        // Compatibility graph: start fully connected (every node linked to every OTHER node).
        Map<Integer, Set<Integer>> graph = new HashMap<>();
        for (int i = 0; i < numCandidates; i++) {
            Set<Integer> edges = new HashSet<>();

            for (int j = 0; j < numCandidates; j++) {
                if (i != j) {
                    // Must be j: adding i here left every node linked only to itself,
                    // so no two domains could ever be selected together.
                    edges.add(j);
                }
            }

            graph.put(i, edges);
        }

        // Drop the edge between any two domains that overlap too much.
        for (int i = 0; i < numCandidates; i++) {
            Domain domainA = bestDomainsInGroup.get(i);

            for (int j = i + 1; j < numCandidates; j++) {
                Domain domainB = bestDomainsInGroup.get(j);

                if (domainA.overlaps(domainB, DOMAIN_OVERLAP_THRESHOLD)) {
                    graph.get(i).remove(j);
                    graph.get(j).remove(i);
                }
            }
        }

        // Every subset of mutually compatible candidates.
        List<Set<Integer>> subgroups = new DomainResolver(graph).resolve();

        int maxCoverage = 0;
        int maxPfams = 0;
        List<Domain> bestSubgroup = null;
        for (Set<Integer> subgroup : subgroups) {
            Set<Integer> coverage = new HashSet<>();
            int numPfams = 0;
            List<Domain> candidate = new ArrayList<>();

            for (int index : subgroup) {
                Domain domain = bestDomainsInGroup.get(index);
                coverage.addAll(domain.getResidues());
                // Rank 0 is counted separately and used as the tie-breaker
                // (presumably Pfam, going by the counter's name).
                if (domain.getDatabaseRank() == 0) {
                    numPfams++;
                }

                candidate.add(domain);
            }

            int sizeCoverage = coverage.size();
            if (sizeCoverage > maxCoverage || (sizeCoverage == maxCoverage && numPfams > maxPfams)) {
                maxCoverage = sizeCoverage;
                maxPfams = numPfams;
                bestSubgroup = candidate;
            }
        }

        if (bestSubgroup != null) {
            for (Domain domain : bestSubgroup) {
                domain.getLocation().setRepresentative(true);
            }
        }
    }
}

/**
 * Splits domains into groups of overlapping or touching locations.
 *
 * <p>A domain joins the current group when its start is less than or equal
 * to the largest end seen so far in that group; otherwise it opens a new
 * group. The sweep is a single left-to-right pass, so {@code domains} is
 * expected to be ordered by location start already.
 *
 * @param domains domains ordered by location start
 * @return the groups in input order; empty when {@code domains} is null or empty
 */
private ArrayList<ArrayList<Domain>> groupDomains(ArrayList<Domain> domains) {
    ArrayList<ArrayList<Domain>> groups = new ArrayList<>();
    if (domains == null || domains.isEmpty()) {
        // Nothing to group; avoids failing on domains.get(0) below.
        return groups;
    }

    ArrayList<Domain> group = new ArrayList<>();
    Domain domain = domains.get(0);
    group.add(domain);
    int stop = domain.getLocation().getEnd(); // right-most end of the current group

    for (int i = 1; i < domains.size(); i++) {
        domain = domains.get(i);
        int start = domain.getLocation().getStart();

        if (start <= stop) {
            // Still inside the current group: extend its right edge if needed.
            group.add(domain);
            stop = Math.max(stop, domain.getLocation().getEnd());
        } else {
            // Gap found: close the current group and start a new one.
            groups.add(group);
            group = new ArrayList<>();
            group.add(domain);
            stop = domain.getLocation().getEnd();
        }
    }

    groups.add(group);
    return groups;
}
}
2 changes: 1 addition & 1 deletion core/model/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@
<configuration>
<target>
<property name="project.build.schema.dir" value="${project.build.directory}/schemas/"/>
<property name="latest.interpro.schema.version" value="4.6"/>
<property name="latest.interpro.schema.version" value="4.7"/>
<mkdir dir="${project.build.schema.dir}"/>
<taskdef name="schemagen"
classname="com.sun.tools.jxc.SchemaGenTask"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import java.util.Set;

@Entity
@XmlType(name = "FunFamHmmer3MatchType")
@XmlType(name = "FunFamHmmer3MatchType", namespace = "https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas")
public class FunFamHmmer3Match extends Match<FunFamHmmer3Match.FunFamHmmer3Location> {
@Column(nullable = false)
private double evalue;
Expand Down Expand Up @@ -69,6 +69,7 @@ public void setScore(double score) {
this.score = score;
}

@XmlType(name = "FunFamHmmer3Location", namespace = "https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas")
public static class FunFamHmmer3Location extends Hmmer3Match.Hmmer3Location {
@Column(nullable = false, name = "hmmer3_seq_start")
@JsonIgnore
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ public abstract class Location<T extends LocationFragment> implements Serializab
// 'end' is reserved word in SQL.
private int end;

@Column(nullable = false)
private boolean representative = false;

@ManyToOne(cascade = CascadeType.PERSIST, optional = false)
@JsonBackReference
private Match match;
Expand Down Expand Up @@ -205,6 +208,16 @@ public void addLocationFragment(T locationFragment) {
this.end = end;
}
}

@XmlAttribute(name = "representative", required = true)
public boolean getRepresentative() {
return representative;
}

public void setRepresentative(boolean representative) {
this.representative = representative;
}

/**
* Ensure sub-classes of AbstractLocation are represented correctly in XML.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ public int hashCode() {
.toHashCode();
}

@XmlType(name = "PathwayDatabase", namespace = "https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas")
public enum PathwayDatabase {

META_CYC('t', "MetaCyc"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@
<hmmer3-match score="0.035" evalue="3.7E-9">
<signature name="B12-binding" ac="PF02310"/>
<locations>
<hmmer3-location env-start="2" env-end="108" hmm-end="104" hmm-start="1" hmm-length="103" hmm-bounds="INCOMPLETE" evalue="3.7E-9" score="3.0" end="107" start="3" post-processed="true">
<hmmer3-location env-start="2" env-end="108" hmm-end="104" hmm-start="1" hmm-length="103" hmm-bounds="INCOMPLETE" evalue="3.7E-9" score="3.0" end="107" start="3" post-processed="true" representative="false">
<location-fragments>
<hmmer3-location-fragment end="107" start="3" dc-status="CONTINUOUS"/>
</location-fragments>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
<protein xmlns="https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas
https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas/interproscan-model-3.0.xsd">
https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas/interproscan-model-4.7.xsd">
<sequence md5="58F1C327155B01E0EE4C95CE0067973C">MEESVNVEYADEDEDEIEEYEEEEEEEEEESAEGAAGSSVSDVAISATEKLVASEVPEDAVAADTNVRQRVTARVEELKARYTRRMSLFELTGIVAESFNLLCRGRLPLVADAADPALDSELKVVVRELEEGVCPIVIEKNGEFLAPGDFDPECLKYHLNYMTDLWKSQGRM</sequence>
<xref db="UniParc" id="UPI0001C4B400" name="UPI0001C4B400"/>
<matches>
Expand All @@ -59,7 +59,7 @@
<locations>
<hmmer2-location start="7" end="172"
score="294.5" evalue="6.3E-86"
hmm-start="1" hmm-end="186" hmm-length="185" hmm-bounds="INCOMPLETE">
hmm-start="1" hmm-end="186" hmm-length="185" hmm-bounds="INCOMPLETE" representative="false">
<location-fragments>
<hmmer2-location-fragment start="7" end="172" dc-status="CONTINUOUS"/>
</location-fragments>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@

<!--TODO: Read this from Maven properties-->
<bean id="schema" class="org.springframework.core.io.FileSystemResource">
<constructor-arg value="target/test-classes/xsd/interproscan-model-3.0.xsd"/>
<constructor-arg value="target/test-classes/xsd/interproscan-model-4.7.xsd"/>
</bean>

<!-- Alternative declaration if don't need to set any additional properties:
Expand Down
Loading

0 comments on commit ab25abd

Please sign in to comment.