diff --git a/core/management/src/main/java/uk/ac/ebi/interpro/scan/management/model/implementations/PrepareForOutputStep.java b/core/management/src/main/java/uk/ac/ebi/interpro/scan/management/model/implementations/PrepareForOutputStep.java
index b64c61803..d97ec3e48 100644
--- a/core/management/src/main/java/uk/ac/ebi/interpro/scan/management/model/implementations/PrepareForOutputStep.java
+++ b/core/management/src/main/java/uk/ac/ebi/interpro/scan/management/model/implementations/PrepareForOutputStep.java
@@ -50,6 +50,7 @@ public class PrepareForOutputStep extends Step {
     private ConcurrentHashMap> entry2GoTermsMap;
     private ConcurrentHashMap> entry2PathwayMap;
     private ConcurrentHashMap> pathwayMap;
+    private ConcurrentHashMap<String, Boolean> domainsMap;
 
     Random random = new Random();
 
@@ -153,6 +154,8 @@ public void execute(StepInstance stepInstance, String temporaryFileDirectory) {
             getEntry2GoTermsMap();
         }
 
+        getDomainsMap();
+
         //proceed to rest of functionality
         Utilities.verboseLog(1100, "Pre-marshall the proteins ...");
         simulateMarshalling(stepInstance, "p", temporaryFileDirectory, mapToGO, mapToInterPro);
@@ -397,6 +400,7 @@ private void simulateMarshalling(StepInstance stepInstance, String sequenceType,
                 }
                 totalWaitTime = 0;
 
+                ArrayList<Domain> domains = new ArrayList<>();
                 for (String signatureLibraryName : signatureLibraryNames) {
                     final String dbKey = proteinKey + signatureLibraryName;
 
@@ -450,6 +454,16 @@ private void simulateMarshalling(StepInstance stepInstance, String sequenceType,
                                 pantherMatch.setGoXRefs(goXrefs);
                             }
 
+                            if (this.domainsMap.containsKey(match.getSignature().getAccession())) {
+                                boolean isPfam = this.domainsMap.get(match.getSignature().getAccession());
+                                Set<Location> locations = match.getLocations();
+                                if (locations != null) {
+                                    for (Location location: locations) {
+                                        domains.add(new Domain(location, isPfam));
+                                    }
+                                }
+                            }
+
                             Entry simpleEntry = match.getSignature().getEntry();
                             if (simpleEntry != null) {
                                 if (mapToInterPro) {
@@ -473,6 +487,10 @@ private void simulateMarshalling(StepInstance stepInstance, String sequenceType,
                     }
                 }
 
+                if (domains.size() > 0) {
+                    selectRepresentativeDomains(domains);
+                }
+
                 //TODO Temp check what breaks if you dont do pre-marshalling
                 //String xmlProtein = writer.marshal(protein);
 
@@ -862,6 +880,31 @@ public void getEntry2GoTermsMap(){
         }
     }
 
+    /**
+     * Loads the signature-accession lookup from {@code domains.json} under the entry KV path.
+     * The file is read on the first call only; later calls return immediately.
+     * <p>
+     * A failed load is reported and leaves an empty map, so the accession lookups made while
+     * pre-marshalling find no domain instead of dereferencing a null map.
+     */
+    public void getDomainsMap() {
+        if (domainsMap != null) {
+            return;
+        }
+
+        // try-with-resources closes the stream on every path, including a parse failure
+        try (FileInputStream is = new FileInputStream(new File(this.getEntryKVPath() + "/domains.json"))) {
+            ObjectMapper mapper = new ObjectMapper();
+            mapper.configure(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES, true);
+            Map<String, Boolean> jsonMap = mapper.readValue(is, new TypeReference<>() {});
+            domainsMap = new ConcurrentHashMap<>(jsonMap);
+        } catch (Exception ex) {
+            ex.printStackTrace();
+            // keep the step usable: with no lookup data, no match is treated as a domain
+            domainsMap = new ConcurrentHashMap<>();
+        }
+    }
+
     public Entry updateEntryXrefs(Entry entry) {
         String entryAc = entry.getAccession();
         Set goXrefs = (Set) getGoXrefsByEntryAc(entryAc); //entry2GoXrefsMap.get(entryAc);
@@ -1050,4 +1093,137 @@ private String getTryCountStats(ArrayList> observedTryCou
         return " tryCounts:" + tryCount + " maxTryCount:" + maxTryCount + " maxtotalWaitTime: " + maxtotalWaitTime;
     }
 
+
+    /**
+     * Marks the representative domains among {@code domains}.
+     * <p>
+     * The domains are sorted by start (longer one first on equal starts) and split into groups
+     * by {@link #groupDomains}. Within a group, every pair for which
+     * {@code overlaps(other, 0.3)} holds is unlinked in a graph handed to
+     * {@link DomainResolver}. Of the sub-groups it returns, the one covering the most residues
+     * wins, ties going to the one with more Pfam domains, and the locations of its domains are
+     * flagged as representative.
+     *
+     * @param domains candidate domains; sorted in place
+     */
+    private void selectRepresentativeDomains(ArrayList<Domain> domains) {
+        Collections.sort(domains, new Comparator<Domain>() {
+            @Override
+            public int compare(Domain d1, Domain d2) {
+                // Integer.compare cannot overflow the way a subtraction can
+                int byStart = Integer.compare(d1.getLocation().getStart(), d2.getLocation().getStart());
+                if (byStart != 0) {
+                    return byStart;
+                }
+                return Integer.compare(d2.getLocation().getEnd(), d1.getLocation().getEnd());
+            }
+        });
+
+        ArrayList<ArrayList<Domain>> groups = groupDomains(domains);
+
+        for (ArrayList<Domain> group : groups) {
+            // Start from a complete graph: each node is linked to every other node
+            Map<Integer, Set<Integer>> graph = new HashMap<>();
+
+            for (int i = 0; i < group.size(); i++) {
+                Set<Integer> edges = new HashSet<>();
+
+                for (int j = 0; j < group.size(); j++) {
+                    if (i != j) {
+                        // the neighbour is j; adding i only gave each node a self-loop
+                        edges.add(j);
+                    }
+                }
+
+                graph.put(i, edges);
+            }
+
+            // Unlink overlapping pairs, leaving edges only between compatible domains
+            for (int i = 0; i < group.size(); i++) {
+                Domain domainA = group.get(i);
+
+                for (int j = i + 1; j < group.size(); j++) {
+                    Domain domainB = group.get(j);
+
+                    if (domainA.overlaps(domainB, 0.3)) {
+                        graph.get(i).remove(j);
+                        graph.get(j).remove(i);
+                    }
+                }
+            }
+
+            List<Set<Integer>> subgroups = new DomainResolver(graph).resolve();
+
+            int maxCoverage = 0;
+            int maxPfams = 0;
+            List<Domain> bestSubgroup = null;
+            for (Set<Integer> subgroup: subgroups) {
+                Set<Integer> coverage = new HashSet<>();
+                int numPfams = 0;
+                List<Domain> candidate = new ArrayList<>();
+
+                for (int i: subgroup) {
+                    Domain domain = group.get(i);
+                    coverage.addAll(domain.getResidues());
+                    if (domain.isPfam()) {
+                        numPfams++;
+                    }
+
+                    candidate.add(domain);
+                }
+
+                int sizeCoverage = coverage.size();
+                if (sizeCoverage > maxCoverage || (sizeCoverage == maxCoverage && numPfams > maxPfams)) {
+                    maxCoverage = sizeCoverage;
+                    maxPfams = numPfams;
+                    bestSubgroup = candidate;
+                }
+            }
+
+            if (bestSubgroup != null) {
+                for (Domain domain: bestSubgroup) {
+                    domain.getLocation().setRepresentative(true);
+                }
+            }
+        }
+    }
+
+    /**
+     * Splits domains sorted by ascending start into groups of overlapping domains.
+     * A domain joins the current group when it starts at or before the furthest end reached by
+     * that group; otherwise it opens a new group.
+     *
+     * @param domains domains sorted by ascending start
+     * @return the groups, in input order; empty when {@code domains} is empty
+     */
+    private ArrayList<ArrayList<Domain>> groupDomains(ArrayList<Domain> domains) {
+        ArrayList<ArrayList<Domain>> groups = new ArrayList<>();
+        if (domains.isEmpty()) {
+            return groups;
+        }
+
+        ArrayList<Domain> group = new ArrayList<>();
+        Domain domain = domains.get(0);
+        group.add(domain);
+        int stop = domain.getLocation().getEnd();
+
+        for (int i = 1; i < domains.size(); i++) {
+            domain = domains.get(i);
+            int start = domain.getLocation().getStart();
+
+            if (start <= stop) {
+                group.add(domain);
+                // a later member may reach further than the first; track the furthest end
+                stop = Math.max(stop, domain.getLocation().getEnd());
+            } else {
+                groups.add(group);
+                group = new ArrayList<>();
+                group.add(domain);
+                stop = domain.getLocation().getEnd();
+            }
+        }
+
+        groups.add(group);
+        return groups;
+    }
 }