From 62f1176fca9c0fa21fa21fbab54857c7d3ed8499 Mon Sep 17 00:00:00 2001 From: Ezequiel Valencia Date: Fri, 6 Sep 2024 13:41:36 -0400 Subject: [PATCH 01/16] State Machine and Engine Refactor and Return Values The state machine and engine now return appropriate objects regarding their state, and have been slightly refactored such that it's easier to read and use in tests. --- .../SimulationDispatcherEngine.java | 343 +++-- .../dispatcher/SimulationStateMachine.java | 1212 ++++++++--------- 2 files changed, 741 insertions(+), 814 deletions(-) diff --git a/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationDispatcherEngine.java b/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationDispatcherEngine.java index 7ee67d4c84..1af3550fc9 100644 --- a/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationDispatcherEngine.java +++ b/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationDispatcherEngine.java @@ -1,175 +1,168 @@ -/* - * Copyright (C) 1999-2011 University of Connecticut Health Center - * - * Licensed under the MIT License (the "License"). - * You may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at: - * - * http://www.opensource.org/licenses/mit-license.php - */ - -package cbit.vcell.message.server.dispatcher; -import java.sql.SQLException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; - -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.vcell.util.DataAccessException; -import org.vcell.util.document.KeyValue; -import org.vcell.util.document.User; -import org.vcell.util.document.VCellServerID; - -import cbit.rmi.event.WorkerEvent; -import cbit.vcell.message.VCMessageSession; -import cbit.vcell.message.VCMessagingException; -import cbit.vcell.message.messages.StatusMessage; -import cbit.vcell.server.SimulationJobStatus; -import cbit.vcell.server.SimulationJobStatus.SchedulerStatus; -import cbit.vcell.server.UpdateSynchronizationException; -import cbit.vcell.solver.Simulation; -import cbit.vcell.solver.SimulationInfo; -import cbit.vcell.solver.VCSimulationIdentifier; -import cbit.vcell.solver.server.SimulationMessage; - -/** - * Insert the type's description here. - * Creation date: (10/18/2001 4:31:11 PM) - * @author: Jim Schaff - */ -public class SimulationDispatcherEngine { - public static final Logger lg = LogManager.getLogger(SimulationDispatcherEngine.class); - - private HashMap> simStateMachineHash = new HashMap>(); - - /** - * Scheduler constructor comment. 
- */ - public SimulationDispatcherEngine() { - } - - /** - * reset simulation state time stamps in case of transient error in getting running status - */ - void resetTimeStamps( ) { - long now = System.currentTimeMillis(); - for (List lst : simStateMachineHash.values()) { - for (SimulationStateMachine ssm: lst) { - ssm.setSolverProcessTimestamp(now); - } - } - } - - public SimulationStateMachine getSimulationStateMachine(KeyValue simulationKey, int jobIndex) { - List stateMachineList = simStateMachineHash.get(simulationKey); - if (stateMachineList==null){ - stateMachineList = new ArrayList(); - simStateMachineHash.put(simulationKey,stateMachineList); - } - for (SimulationStateMachine stateMachine : stateMachineList){ - if (stateMachine.getJobIndex() == jobIndex){ - return stateMachine; - } - } - SimulationStateMachine newStateMachine = new SimulationStateMachine(simulationKey, jobIndex); - stateMachineList.add(newStateMachine); - return newStateMachine; - } - - public void onDispatch(Simulation simulation, SimulationJobStatus simJobStatus, SimulationDatabase simulationDatabase, VCMessageSession dispatcherQueueSession) throws VCMessagingException, DataAccessException, SQLException{ - KeyValue simulationKey = simJobStatus.getVCSimulationIdentifier().getSimulationKey(); - SimulationStateMachine simStateMachine = getSimulationStateMachine(simulationKey, simJobStatus.getJobIndex()); - - simStateMachine.onDispatch(simulation, simJobStatus, simulationDatabase, dispatcherQueueSession); - } - - public void onStartRequest(VCSimulationIdentifier vcSimID, User user, int simulationScanCount, SimulationDatabase simulationDatabase, VCMessageSession session, VCMessageSession dispatcherQueueSession) throws VCMessagingException, DataAccessException, SQLException { - KeyValue simKey = vcSimID.getSimulationKey(); - - User.SpecialUser myUser = simulationDatabase.getUser(user.getName()); - boolean isAdmin = Arrays.asList(myUser.getMySpecials()).contains(User.SPECIAL_CLAIM.admins); - - 
SimulationInfo simulationInfo = null; - try { - simulationInfo = simulationDatabase.getSimulationInfo(user, simKey); - } catch (DataAccessException ex) { - if (lg.isWarnEnabled()) lg.warn("Bad simulation " + vcSimID); - StatusMessage message = new StatusMessage(new SimulationJobStatus(VCellServerID.getSystemServerID(), vcSimID, -1, null, - SchedulerStatus.FAILED, 0, SimulationMessage.workerFailure("Failed to dispatch simulation: "+ ex.getMessage()), null, null), user.getName(), null, null); - message.sendToClient(session); - return; - } - if (simulationInfo == null) { - if (lg.isWarnEnabled()) lg.warn("Can't start, simulation [" + vcSimID + "] doesn't exist in database"); - StatusMessage message = new StatusMessage(new SimulationJobStatus(VCellServerID.getSystemServerID(), vcSimID, -1, null, - SchedulerStatus.FAILED, 0, SimulationMessage.workerFailure("Can't start, simulation [" + vcSimID + "] doesn't exist"), null, null), user.getName(), null, null); - message.sendToClient(session); - return; - } - - if (!isAdmin && simulationScanCount > Integer.parseInt(cbit.vcell.resource.PropertyLoader.getRequiredProperty(cbit.vcell.resource.PropertyLoader.maxJobsPerScan))) { - if (lg.isWarnEnabled()) lg.warn("Too many simulations (" + simulationScanCount + ") for parameter scan." 
+ vcSimID); - StatusMessage message = new StatusMessage(new SimulationJobStatus(VCellServerID.getSystemServerID(), vcSimID, -1, null, - SchedulerStatus.FAILED, 0, SimulationMessage.workerFailure("Too many simulations (" + simulationScanCount + ") for parameter scan."), null, null), user.getName(), null, null); - message.sendToClient(session); - return; - } - - for (int jobIndex = 0; jobIndex < simulationScanCount; jobIndex++){ - SimulationStateMachine simStateMachine = getSimulationStateMachine(simKey, jobIndex); - try { - simStateMachine.onStartRequest(user, vcSimID, simulationDatabase, session); - }catch (UpdateSynchronizationException e){ - simStateMachine.onStartRequest(user, vcSimID, simulationDatabase, session); - } - } - } - - - public void onStopRequest(VCSimulationIdentifier vcSimID, User user, SimulationDatabase simulationDatabase, VCMessageSession session) throws DataAccessException, VCMessagingException, SQLException { - KeyValue simKey = vcSimID.getSimulationKey(); - - SimulationJobStatus[] allActiveSimJobStatusArray = simulationDatabase.getActiveJobs(VCellServerID.getSystemServerID()); - ArrayList simJobStatusArray = new ArrayList(); - for (SimulationJobStatus activeSimJobStatus : allActiveSimJobStatusArray){ - if (activeSimJobStatus.getVCSimulationIdentifier().getSimulationKey().equals(vcSimID.getSimulationKey())){ - simJobStatusArray.add(activeSimJobStatus); - } - } - for (SimulationJobStatus simJobStatus : simJobStatusArray){ - SimulationStateMachine simStateMachine = getSimulationStateMachine(simKey, simJobStatus.getJobIndex()); - try { - simStateMachine.onStopRequest(user, simJobStatus, simulationDatabase, session); - }catch (UpdateSynchronizationException e){ - simStateMachine.onStopRequest(user, simJobStatus, simulationDatabase, session); - } - } - } - - - public void onWorkerEvent(WorkerEvent workerEvent, SimulationDatabase simulationDatabase, VCMessageSession session) { - try { - KeyValue simKey = 
workerEvent.getVCSimulationDataIdentifier().getSimulationKey(); - int jobIndex = workerEvent.getJobIndex(); - SimulationStateMachine simStateMachine = getSimulationStateMachine(simKey, jobIndex); - simStateMachine.onWorkerEvent(workerEvent, simulationDatabase, session); - } catch (Exception ex) { - lg.error(ex.getMessage(),ex); - } - } - - - public void onSystemAbort(SimulationJobStatus jobStatus, String failureMessage, SimulationDatabase simulationDatabase, VCMessageSession session) { - try { - KeyValue simKey = jobStatus.getVCSimulationIdentifier().getSimulationKey(); - int jobIndex = jobStatus.getJobIndex(); - SimulationStateMachine simStateMachine = getSimulationStateMachine(simKey, jobIndex); - simStateMachine.onSystemAbort(jobStatus, failureMessage, simulationDatabase, session); - } catch (Exception ex) { - lg.error(ex.getMessage(),ex); - } - } - -} +package cbit.vcell.message.server.dispatcher; + +import cbit.rmi.event.WorkerEvent; +import cbit.vcell.message.VCMessageSession; +import cbit.vcell.message.VCMessagingException; +import cbit.vcell.message.messages.StatusMessage; +import cbit.vcell.server.SimulationJobStatus; +import cbit.vcell.server.SimulationJobStatus.SchedulerStatus; +import cbit.vcell.server.UpdateSynchronizationException; +import cbit.vcell.solver.Simulation; +import cbit.vcell.solver.SimulationInfo; +import cbit.vcell.solver.VCSimulationIdentifier; +import cbit.vcell.solver.server.SimulationMessage; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.vcell.util.DataAccessException; +import org.vcell.util.document.KeyValue; +import org.vcell.util.document.User; +import org.vcell.util.document.VCellServerID; + +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; + +public class SimulationDispatcherEngine { + public static final Logger lg = LogManager.getLogger(SimulationDispatcherEngine.class); + + private HashMap> 
simStateMachineHash = new HashMap>(); + + /** + * reset simulation state time stamps in case of transient error in getting running status + */ + void resetTimeStamps( ) { + long now = System.currentTimeMillis(); + for (List lst : simStateMachineHash.values()) { + for (SimulationStateMachine ssm: lst) { + ssm.setSolverProcessTimestamp(now); + } + } + } + + public SimulationDispatcherEngine() { + } + + public SimulationStateMachine getSimulationStateMachine(KeyValue simulationKey, int jobIndex) { + List stateMachineList = simStateMachineHash.get(simulationKey); + if (stateMachineList==null){ + stateMachineList = new ArrayList(); + simStateMachineHash.put(simulationKey,stateMachineList); + } + for (SimulationStateMachine stateMachine : stateMachineList){ + if (stateMachine.getJobIndex() == jobIndex){ + return stateMachine; + } + } + SimulationStateMachine newStateMachine = new SimulationStateMachine(simulationKey, jobIndex); + stateMachineList.add(newStateMachine); + return newStateMachine; + } + + public void onDispatch(Simulation simulation, SimulationJobStatus simJobStatus, SimulationDatabase simulationDatabase, VCMessageSession dispatcherQueueSession) throws VCMessagingException, DataAccessException, SQLException { + KeyValue simulationKey = simJobStatus.getVCSimulationIdentifier().getSimulationKey(); + SimulationStateMachine simStateMachine = getSimulationStateMachine(simulationKey, simJobStatus.getJobIndex()); + + simStateMachine.onDispatch(simulation, simJobStatus, simulationDatabase, dispatcherQueueSession); + } + + public ArrayList onStartRequest(VCSimulationIdentifier vcSimID, User user, int simulationScanCount, SimulationDatabase simulationDatabase, VCMessageSession session, VCMessageSession dispatcherQueueSession) throws VCMessagingException, DataAccessException, SQLException { + KeyValue simKey = vcSimID.getSimulationKey(); + + User.SpecialUser myUser = simulationDatabase.getUser(user.getName()); + boolean isAdmin = 
Arrays.asList(myUser.getMySpecials()).contains(User.SPECIAL_CLAIM.admins); + + SimulationInfo simulationInfo = null; + SimulationJobStatus simJobStatus = null; + ArrayList status = new ArrayList<>(); + try { + simulationInfo = simulationDatabase.getSimulationInfo(user, simKey); + } catch (DataAccessException ex) { + if (lg.isWarnEnabled()) lg.warn("Bad simulation " + vcSimID); + simJobStatus = new SimulationJobStatus(VCellServerID.getSystemServerID(), vcSimID, -1, null, + SchedulerStatus.FAILED, 0, SimulationMessage.workerFailure("Failed to dispatch simulation: "+ ex.getMessage()), null, null); + StatusMessage message = new StatusMessage(simJobStatus, user.getName(), null, null); + message.sendToClient(session); + status.add(message); + return status; + } + if (simulationInfo == null) { + if (lg.isWarnEnabled()) lg.warn("Can't start, simulation [" + vcSimID + "] doesn't exist in database"); + simJobStatus = new SimulationJobStatus(VCellServerID.getSystemServerID(), vcSimID, -1, null, + SchedulerStatus.FAILED, 0, SimulationMessage.workerFailure("Can't start, simulation [" + vcSimID + "] doesn't exist"), null, null); + StatusMessage message = new StatusMessage(simJobStatus, user.getName(), null, null); + message.sendToClient(session); + status.add(message); + return status; + } + + if (!isAdmin && simulationScanCount > Integer.parseInt(cbit.vcell.resource.PropertyLoader.getRequiredProperty(cbit.vcell.resource.PropertyLoader.maxJobsPerScan))) { + if (lg.isWarnEnabled()) lg.warn("Too many simulations (" + simulationScanCount + ") for parameter scan." 
+ vcSimID); + simJobStatus = new SimulationJobStatus(VCellServerID.getSystemServerID(), vcSimID, -1, null, + SchedulerStatus.FAILED, 0, SimulationMessage.workerFailure("Too many simulations (" + simulationScanCount + ") for parameter scan."), null, null); + StatusMessage message = new StatusMessage(simJobStatus, user.getName(), null, null); + message.sendToClient(session); + status.add(message); + + return status; + } + + for (int simulationJobIndex = 0; simulationJobIndex < simulationScanCount; simulationJobIndex++){ + SimulationStateMachine simStateMachine = getSimulationStateMachine(simKey, simulationJobIndex); + try { + status.add(simStateMachine.onStartRequest(user, vcSimID, simulationDatabase, session)); + }catch (UpdateSynchronizationException e){ + status.add(simStateMachine.onStartRequest(user, vcSimID, simulationDatabase, session)); + } + } + return status; + } + + + public ArrayList onStopRequest(VCSimulationIdentifier vcSimID, User user, SimulationDatabase simulationDatabase, VCMessageSession session) throws DataAccessException, VCMessagingException, SQLException { + KeyValue simKey = vcSimID.getSimulationKey(); + + SimulationJobStatus[] allActiveSimJobStatusArray = simulationDatabase.getActiveJobs(VCellServerID.getSystemServerID()); + ArrayList simJobStatusArray = new ArrayList(); + for (SimulationJobStatus activeSimJobStatus : allActiveSimJobStatusArray){ + if (activeSimJobStatus.getVCSimulationIdentifier().getSimulationKey().equals(vcSimID.getSimulationKey())){ + simJobStatusArray.add(activeSimJobStatus); + } + } + ArrayList stoppedSimulations = new ArrayList<>(); + for (SimulationJobStatus simJobStatus : simJobStatusArray){ + SimulationStateMachine simStateMachine = getSimulationStateMachine(simKey, simJobStatus.getJobIndex()); + try { + stoppedSimulations.add(simStateMachine.onStopRequest(user, simJobStatus, simulationDatabase, session)); + }catch (UpdateSynchronizationException e){ + stoppedSimulations.add(simStateMachine.onStopRequest(user, 
simJobStatus, simulationDatabase, session)); + } + } + return stoppedSimulations; + } + + + public void onWorkerEvent(WorkerEvent workerEvent, SimulationDatabase simulationDatabase, VCMessageSession session) { + try { + KeyValue simKey = workerEvent.getVCSimulationDataIdentifier().getSimulationKey(); + int jobIndex = workerEvent.getJobIndex(); + SimulationStateMachine simStateMachine = getSimulationStateMachine(simKey, jobIndex); + simStateMachine.onWorkerEvent(workerEvent, simulationDatabase, session); + } catch (Exception ex) { + lg.error(ex.getMessage(),ex); + } + } + + + public void onSystemAbort(SimulationJobStatus jobStatus, String failureMessage, SimulationDatabase simulationDatabase, VCMessageSession session) { + try { + KeyValue simKey = jobStatus.getVCSimulationIdentifier().getSimulationKey(); + int jobIndex = jobStatus.getJobIndex(); + SimulationStateMachine simStateMachine = getSimulationStateMachine(simKey, jobIndex); + simStateMachine.onSystemAbort(jobStatus, failureMessage, simulationDatabase, session); + } catch (Exception ex) { + lg.error(ex.getMessage(),ex); + } + } +} diff --git a/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationStateMachine.java b/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationStateMachine.java index 3df01647e5..53155b9533 100644 --- a/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationStateMachine.java +++ b/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationStateMachine.java @@ -1,659 +1,593 @@ package cbit.vcell.message.server.dispatcher; -import java.sql.SQLException; -import java.util.Arrays; -import java.util.Date; - -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.vcell.util.DataAccessException; -import org.vcell.util.document.KeyValue; -import org.vcell.util.document.User; -import org.vcell.util.document.VCellServerID; - import cbit.rmi.event.WorkerEvent; import 
cbit.vcell.field.FieldDataIdentifierSpec; -import cbit.vcell.message.VCMessage; -import cbit.vcell.message.VCMessageSession; -import cbit.vcell.message.VCMessagingConstants; -import cbit.vcell.message.VCMessagingException; -import cbit.vcell.message.VCellTopic; +import cbit.vcell.message.*; import cbit.vcell.message.messages.MessageConstants; import cbit.vcell.message.messages.SimulationTaskMessage; import cbit.vcell.message.messages.StatusMessage; import cbit.vcell.message.messages.WorkerEventMessage; -import cbit.vcell.message.server.htc.HtcProxy; -import cbit.vcell.message.server.htc.HtcProxy.MemLimitResults; import cbit.vcell.messaging.server.SimulationTask; import cbit.vcell.mongodb.VCMongoMessage; -import cbit.vcell.server.HtcJobID; -import cbit.vcell.server.RunningStateInfo; -import cbit.vcell.server.SimulationExecutionStatus; -import cbit.vcell.server.SimulationJobStatus; -import cbit.vcell.server.SimulationJobStatus.SchedulerStatus; -import cbit.vcell.server.SimulationQueueEntryStatus; -import cbit.vcell.server.SimulationStatus; -import cbit.vcell.server.UpdateSynchronizationException; -import cbit.vcell.solver.Simulation; -import cbit.vcell.solver.SimulationJob; -import cbit.vcell.solver.SolverDescription; -import cbit.vcell.solver.VCSimulationDataIdentifier; -import cbit.vcell.solver.VCSimulationIdentifier; +import cbit.vcell.resource.PropertyLoader; +import cbit.vcell.server.*; +import cbit.vcell.solver.*; import cbit.vcell.solver.server.SimulationMessage; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.vcell.util.DataAccessException; +import cbit.vcell.server.SimulationJobStatus.SchedulerStatus; +import cbit.vcell.server.SimulationJobStatus.SimulationQueueID; +import org.vcell.util.document.KeyValue; +import org.vcell.util.document.User; +import org.vcell.util.document.VCellServerID; + +import java.sql.SQLException; +import java.util.Arrays; +import java.util.Date; public class SimulationStateMachine { 
- public static final Logger lg = LogManager.getLogger(SimulationStateMachine.class); - - // bitmapped counter so that allows 3 retries for each request (but preserves ordinal nature) - // bits 0-3: retry count - // bits 4-31: submit - // max retries must be less than 15. - public static final int TASKID_USERCOUNTER_MASK = SimulationStatus.TASKID_USERCOUNTER_MASK; - public static final int TASKID_RETRYCOUNTER_MASK = SimulationStatus.TASKID_RETRYCOUNTER_MASK; - public static final int TASKID_USERINCREMENT = SimulationStatus.TASKID_USERINCREMENT; - - public static final int PRIORITY_LOW = 0; - public static final int PRIORITY_DEFAULT = 5; - public static final int PRIORITY_HIGH = 9; - - private final KeyValue simKey; - private final int jobIndex; - - /** - * in memory storage of last time information about this job was received or status was unknown due - * to transient failure or system restart - */ - private long solverProcessTimestamp; - - public SimulationStateMachine(KeyValue simKey, int jobIndex){ - this.simKey = simKey; - this.jobIndex = jobIndex; - updateSolverProcessTimestamp(); - } - - /* - public SimulationStateMachine(SimulationJobStatus[] simJobStatus) { - this(simJobStatus[0].getVCSimulationIdentifier().getSimulationKey(),simJobStatus[0].getJobIndex()); - } - */ - - /** - * set in memory last update time to now - */ - private void updateSolverProcessTimestamp( ) { - solverProcessTimestamp = System.currentTimeMillis(); - } - - /** - * set to specified time (for mass setting) - * @param solverProcessTimestamp - */ - void setSolverProcessTimestamp(long solverProcessTimestamp) { - this.solverProcessTimestamp = solverProcessTimestamp; - } - - public KeyValue getSimKey() { - return simKey; - } - - public int getJobIndex() { - return jobIndex; - } - -// public List getStateMachineTransitions() { -// return stateMachineTransitions; -// } - -// public String show(){ -// StringBuffer buffer = new StringBuffer(); -// buffer.append("SimulationStateMachine for 
SimID='"+simKey+"', jobIndex="+jobIndex+"\n"); -// for (StateMachineTransition stateMachineTransition : stateMachineTransitions){ -// buffer.append(stateMachineTransition+"\n"); -// } -// return buffer.toString(); -// } - -// private void addStateMachineTransition(StateMachineTransition stateMachineTransition){ -// stateMachineTransitions.add(stateMachineTransition); -// } - - /** - * return last time a status update was received in memory - * @return time since information last changed about this task - */ - long getSolverProcessTimestamp() { - return solverProcessTimestamp; - } - - public synchronized void onWorkerEvent(WorkerEvent workerEvent, SimulationDatabase simulationDatabase, VCMessageSession session) throws DataAccessException, VCMessagingException, SQLException { - updateSolverProcessTimestamp(); - WorkerEventMessage workerEventMessage = new WorkerEventMessage(workerEvent); - VCMongoMessage.sendWorkerEvent(workerEventMessage); - - String userName = workerEvent.getUserName(); // as the filter of the client - int workerEventTaskID = workerEvent.getTaskID(); - - if (lg.isTraceEnabled()) lg.trace("onWorkerEventMessage[" + workerEvent.getEventTypeID() + "," + workerEvent.getSimulationMessage() + "][simid=" + workerEvent.getVCSimulationDataIdentifier() + ",job=" + jobIndex + ",task=" + workerEventTaskID + "]"); - - VCSimulationDataIdentifier vcSimDataID = workerEvent.getVCSimulationDataIdentifier(); - if (vcSimDataID == null) { - VCMongoMessage.sendInfo("onWorkerEvent() ignoring WorkerEvent - no SimID in message): "+workerEvent.show()); - return; - } - KeyValue simKey = vcSimDataID.getSimulationKey(); - SimulationJobStatus oldSimulationJobStatus = simulationDatabase.getLatestSimulationJobStatus(simKey, jobIndex); - - if (oldSimulationJobStatus == null){ - VCMongoMessage.sendInfo("onWorkerEvent() ignoring WorkerEvent, no current SimulationJobStatus: "+workerEvent.show()); - return; - } - if (oldSimulationJobStatus == null || 
oldSimulationJobStatus.getSchedulerStatus().isDone() || oldSimulationJobStatus.getTaskID() > workerEventTaskID){ - VCMongoMessage.sendInfo("onWorkerEvent() ignoring outdated WorkerEvent, (currState="+oldSimulationJobStatus.getSchedulerStatus().getDescription()+"): "+workerEvent.show()); - return; - } - int taskID = oldSimulationJobStatus.getTaskID(); - SchedulerStatus oldSchedulerStatus = oldSimulationJobStatus.getSchedulerStatus(); - - // - // status information (initialized as if new record) - // - Date startDate = null; - Date lastUpdateDate = null; - Date endDate = null; - boolean hasData = false; - HtcJobID htcJobID = null; - String computeHost = null; - VCellServerID vcServerID = VCellServerID.getSystemServerID(); - Date submitDate = null; - Date queueDate = null; - int queuePriority = PRIORITY_DEFAULT; - SimulationJobStatus.SimulationQueueID simQueueID = SimulationJobStatus.SimulationQueueID.QUEUE_ID_WAITING; - - - // - // update using previously stored status (if available). - // - SimulationExecutionStatus oldSimExeStatus = oldSimulationJobStatus.getSimulationExecutionStatus(); - if (oldSimExeStatus!=null && oldSimExeStatus.getStartDate()!=null){ - startDate = oldSimExeStatus.getStartDate(); - } - if (oldSimExeStatus!=null && oldSimExeStatus.getLatestUpdateDate()!=null){ - lastUpdateDate = oldSimExeStatus.getLatestUpdateDate(); - } - if (oldSimExeStatus!=null && oldSimExeStatus.getEndDate()!=null){ - endDate = oldSimExeStatus.getEndDate(); - } - if (oldSimExeStatus!=null && oldSimExeStatus.hasData()){ - hasData = true; - } - if (oldSimExeStatus!=null && oldSimExeStatus.getComputeHost()!=null){ - computeHost = oldSimExeStatus.getComputeHost(); - } - if (oldSimExeStatus!=null && oldSimExeStatus.getHtcJobID()!=null){ - htcJobID = oldSimExeStatus.getHtcJobID(); - } - vcServerID = oldSimulationJobStatus.getServerID(); - submitDate = oldSimulationJobStatus.getSubmitDate(); - SimulationQueueEntryStatus oldQueueStatus = 
oldSimulationJobStatus.getSimulationQueueEntryStatus(); - if (oldQueueStatus!=null && oldQueueStatus.getQueueDate()!=null){ - queueDate = oldQueueStatus.getQueueDate(); - } - if (oldQueueStatus!=null){ - queuePriority = oldQueueStatus.getQueuePriority(); - } - if (oldQueueStatus!=null && oldQueueStatus.getQueueID()!=null){ - simQueueID = oldQueueStatus.getQueueID(); - } - - // - // update using new information from event - // - if (workerEvent.getHtcJobID()!=null){ - htcJobID = workerEvent.getHtcJobID(); - } - if (workerEvent.getHostName()!=null){ - computeHost = workerEvent.getHostName(); - } - SimulationMessage workerEventSimulationMessage = workerEvent.getSimulationMessage(); - if (workerEventSimulationMessage.getHtcJobId()!=null){ - htcJobID = workerEventSimulationMessage.getHtcJobId(); - } - - - SimulationJobStatus newJobStatus = null; - - if (workerEvent.isAcceptedEvent()) { - // - // job message accepted by HtcSimulationWorker and sent to Scheduler (PBS/SGE/SLURM) (with a htcJobID) ... previous state should be "WAITING" - // - if (oldSchedulerStatus.isWaiting() || oldSchedulerStatus.isQueued()) { - // new queue status - SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(queueDate, queuePriority, SimulationJobStatus.SimulationQueueID.QUEUE_ID_NULL); - - // new exe status - lastUpdateDate = new Date(); - startDate = lastUpdateDate; - endDate = null; - SimulationExecutionStatus newExeStatus = new SimulationExecutionStatus(startDate, computeHost, lastUpdateDate, endDate, hasData, htcJobID); - - newJobStatus = new SimulationJobStatus(vcServerID, vcSimDataID.getVcSimID(), jobIndex, submitDate, SchedulerStatus.DISPATCHED, - taskID, workerEventSimulationMessage, newQueueStatus, newExeStatus); - } - - } else if (workerEvent.isStartingEvent()) { - // only update database when the job event changes from started to running. The later progress event will not be recorded. 
- if ( oldSchedulerStatus.isWaiting() || oldSchedulerStatus.isQueued() || oldSchedulerStatus.isDispatched() || oldSchedulerStatus.isRunning()) { - // new queue status - SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(queueDate, queuePriority, SimulationJobStatus.SimulationQueueID.QUEUE_ID_NULL); - - // new exe status - lastUpdateDate = new Date(); - if (startDate == null){ - startDate = lastUpdateDate; - } - SimulationExecutionStatus newExeStatus = new SimulationExecutionStatus(startDate, computeHost, lastUpdateDate, endDate, hasData, htcJobID); - - newJobStatus = new SimulationJobStatus(vcServerID, vcSimDataID.getVcSimID(), jobIndex, submitDate, SchedulerStatus.RUNNING, - taskID, workerEventSimulationMessage, newQueueStatus, newExeStatus); - } - - } else if (workerEvent.isNewDataEvent()) { - if (oldSchedulerStatus.isWaiting() || oldSchedulerStatus.isQueued() || oldSchedulerStatus.isDispatched() || oldSchedulerStatus.isRunning()){ - - if (!oldSchedulerStatus.isRunning() || simQueueID != SimulationJobStatus.SimulationQueueID.QUEUE_ID_NULL || hasData==false){ - - // new queue status - SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(queueDate, queuePriority, SimulationJobStatus.SimulationQueueID.QUEUE_ID_NULL); - - // new exe status - if (startDate == null){ - startDate = lastUpdateDate; - } - hasData = true; - SimulationExecutionStatus newExeStatus = new SimulationExecutionStatus(startDate, computeHost, lastUpdateDate, endDate, hasData, htcJobID); - - newJobStatus = new SimulationJobStatus(vcServerID, vcSimDataID.getVcSimID(), jobIndex, submitDate, SchedulerStatus.RUNNING, - taskID, workerEventSimulationMessage, newQueueStatus, newExeStatus); - } - } - - } else if (workerEvent.isProgressEvent() || workerEvent.isWorkerAliveEvent()) { - if (oldSchedulerStatus.isWaiting() || oldSchedulerStatus.isQueued() || oldSchedulerStatus.isDispatched() || oldSchedulerStatus.isRunning()){ - - - if (!oldSchedulerStatus.isRunning() 
|| simQueueID != SimulationJobStatus.SimulationQueueID.QUEUE_ID_NULL){ - // new queue status - SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(queueDate, queuePriority, SimulationJobStatus.SimulationQueueID.QUEUE_ID_NULL); - - // new exe status - if (startDate == null){ - startDate = lastUpdateDate; - } - SimulationExecutionStatus newExeStatus = new SimulationExecutionStatus(startDate, computeHost, lastUpdateDate, endDate, hasData, htcJobID); - - newJobStatus = new SimulationJobStatus(vcServerID, vcSimDataID.getVcSimID(), jobIndex, submitDate, SchedulerStatus.RUNNING, - taskID, workerEventSimulationMessage, newQueueStatus, newExeStatus); - - }else if (oldSchedulerStatus.isRunning()){ - if (oldSimExeStatus != null) { -// Date latestUpdate = oldSimExeStatus.getLatestUpdateDate(); -// if (System.currentTimeMillis() - latestUpdate.getTime() >= MessageConstants.INTERVAL_PING_SERVER_MS * 3 / 5) { - // new queue status - SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(queueDate, queuePriority, SimulationJobStatus.SimulationQueueID.QUEUE_ID_NULL); - SimulationExecutionStatus newExeStatus = new SimulationExecutionStatus(startDate, computeHost, lastUpdateDate, endDate, hasData, htcJobID); - - newJobStatus = new SimulationJobStatus(vcServerID, vcSimDataID.getVcSimID(), jobIndex, submitDate, SchedulerStatus.RUNNING, - taskID, workerEventSimulationMessage, newQueueStatus, newExeStatus); - } -// } - } - } - - } else if (workerEvent.isCompletedEvent()) { - if (oldSchedulerStatus.isWaiting() || oldSchedulerStatus.isQueued() || oldSchedulerStatus.isDispatched() || oldSchedulerStatus.isRunning()){ - // new queue status - SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(queueDate, queuePriority, SimulationJobStatus.SimulationQueueID.QUEUE_ID_NULL); - - // new exe status - endDate = new Date(); - hasData = true; - - SimulationExecutionStatus newExeStatus = new SimulationExecutionStatus(startDate, 
computeHost, lastUpdateDate, endDate, hasData, htcJobID); - - newJobStatus = new SimulationJobStatus(vcServerID, vcSimDataID.getVcSimID(), jobIndex, submitDate, SchedulerStatus.COMPLETED, - taskID, workerEventSimulationMessage, newQueueStatus, newExeStatus); - - } - - } else if (workerEvent.isFailedEvent()) { - if (oldSchedulerStatus.isWaiting() || oldSchedulerStatus.isQueued() || oldSchedulerStatus.isDispatched() || oldSchedulerStatus.isRunning()){ - // new queue status - SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(queueDate, queuePriority, SimulationJobStatus.SimulationQueueID.QUEUE_ID_NULL); - - // new exe status - endDate = new Date(); - - SimulationExecutionStatus newExeStatus = new SimulationExecutionStatus(startDate, computeHost, lastUpdateDate, endDate, hasData, htcJobID); - - newJobStatus = new SimulationJobStatus(vcServerID, vcSimDataID.getVcSimID(), jobIndex, submitDate, SchedulerStatus.FAILED, - taskID, workerEventSimulationMessage, newQueueStatus, newExeStatus); - - } - } else if (workerEvent.isWorkerExitErrorEvent()) { - if (oldSchedulerStatus.isWaiting() || oldSchedulerStatus.isQueued() || oldSchedulerStatus.isDispatched() || oldSchedulerStatus.isRunning()){ - // new queue status - SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(queueDate, queuePriority, SimulationJobStatus.SimulationQueueID.QUEUE_ID_NULL); - - // new exe status - endDate = new Date(); - - SimulationExecutionStatus newExeStatus = new SimulationExecutionStatus(startDate, computeHost, lastUpdateDate, endDate, hasData, htcJobID); - - SimulationMessage simulationMessage = SimulationMessage.workerFailure("solver stopped unexpectedly, "+workerEventSimulationMessage.getDisplayMessage()); - - newJobStatus = new SimulationJobStatus(vcServerID, vcSimDataID.getVcSimID(), jobIndex, submitDate, SchedulerStatus.FAILED, - taskID, simulationMessage, newQueueStatus, newExeStatus); - - } - } - if (newJobStatus!=null){ - if 
(!newJobStatus.compareEqual(oldSimulationJobStatus) || workerEvent.isProgressEvent() || workerEvent.isNewDataEvent()) { - Double progress = workerEvent.getProgress(); - Double timepoint = workerEvent.getTimePoint(); - RunningStateInfo runningStateInfo = null; - if (progress != null && timepoint != null){ - runningStateInfo = new RunningStateInfo(progress,timepoint); - } - simulationDatabase.updateSimulationJobStatus(newJobStatus,runningStateInfo); - StatusMessage msgForClient = new StatusMessage(newJobStatus, userName, progress, timepoint); - msgForClient.sendToClient(session); - if (lg.isTraceEnabled()) lg.trace("Send status to client: " + msgForClient); - } else { - simulationDatabase.updateSimulationJobStatus(newJobStatus); - StatusMessage msgForClient = new StatusMessage(newJobStatus, userName, null, null); - msgForClient.sendToClient(session); - if (lg.isTraceEnabled()) lg.trace("Send status to client: " + msgForClient); - } - }else if (workerEvent.isProgressEvent() || workerEvent.isNewDataEvent()){ - Double progress = workerEvent.getProgress(); - Double timepoint = workerEvent.getTimePoint(); - RunningStateInfo runningStateInfo = null; - if (progress!=null && timepoint!=null){ - runningStateInfo = new RunningStateInfo(progress,timepoint); - } - simulationDatabase.updateSimulationJobStatus(oldSimulationJobStatus,runningStateInfo); - StatusMessage msgForClient = new StatusMessage(oldSimulationJobStatus, userName, progress, timepoint); - msgForClient.sendToClient(session); - if (lg.isTraceEnabled()) lg.trace("Send status to client: " + msgForClient); - }else{ - VCMongoMessage.sendInfo("onWorkerEvent() ignoring WorkerEvent (currState="+oldSchedulerStatus.getDescription()+"): "+workerEvent.show()); - } + public static final Logger lg = LogManager.getLogger(SimulationStateMachine.class); + + // bitmapped counter so that allows 3 retries for each request (but preserves ordinal nature) + // bits 0-3: retry count + // bits 4-31: submit + // max retries must be less 
than 15. + public static final int TASKID_USERCOUNTER_MASK = SimulationStatus.TASKID_USERCOUNTER_MASK; + public static final int TASKID_RETRYCOUNTER_MASK = SimulationStatus.TASKID_RETRYCOUNTER_MASK; + public static final int TASKID_USERINCREMENT = SimulationStatus.TASKID_USERINCREMENT; + + public static final int PRIORITY_LOW = 0; + public static final int PRIORITY_DEFAULT = 5; + public static final int PRIORITY_HIGH = 9; + + private final KeyValue simKey; + private final int jobIndex; + + /** + * in memory storage of last time information about this job was received or status was unknown due + * to transient failure or system restart + */ + private long solverProcessTimestamp; + + private class CurrentState { + + public Date startDate; + public Date lastUpdateDate; + public Date endDate; + public boolean hasData; + public HtcJobID htcJobID; + public String computeHost; + public VCellServerID vcServerID; + public Date submitDate; + public Date queueDate; + public int queuePriority; + public SimulationJobStatus.SimulationQueueID simQueueID; + + public CurrentState(SimulationExecutionStatus oldSimExeStatus, + SimulationQueueEntryStatus oldQueueStatus, + SimulationJobStatus oldSimulationJobStatus){ + boolean isOldExeNull = oldSimExeStatus == null; + boolean isOldQueueNull = oldQueueStatus == null; + // + // status information (initialized as if new record) + // + startDate = !isOldExeNull && oldSimExeStatus.getStartDate()!=null ? oldSimExeStatus.getStartDate() :null; + lastUpdateDate = !isOldExeNull && oldSimExeStatus.getLatestUpdateDate()!=null ? oldSimExeStatus.getLatestUpdateDate() : null; + endDate = !isOldExeNull && oldSimExeStatus.getEndDate()!=null ? oldSimExeStatus.getEndDate() : null; + hasData = !isOldExeNull && oldSimExeStatus.hasData(); + htcJobID = !isOldExeNull && oldSimExeStatus.getHtcJobID()!=null ? oldSimExeStatus.getHtcJobID() : null; + computeHost = !isOldExeNull && oldSimExeStatus.getComputeHost()!=null ? 
oldSimExeStatus.getComputeHost() : null; + vcServerID = oldSimulationJobStatus.getServerID(); + submitDate = oldSimulationJobStatus.getSubmitDate(); + queueDate = !isOldQueueNull && oldQueueStatus.getQueueDate() != null ? oldQueueStatus.getQueueDate() : null; + queuePriority = !isOldQueueNull ? oldQueueStatus.getQueuePriority() : PRIORITY_DEFAULT; + simQueueID = !isOldQueueNull && oldQueueStatus.getQueueID()!=null ? oldQueueStatus.getQueueID() : SimulationJobStatus.SimulationQueueID.QUEUE_ID_WAITING; + + + } + } + + public SimulationStateMachine(KeyValue simKey, int jobIndex){ + this.simKey = simKey; + this.jobIndex = jobIndex; + updateSolverProcessTimestamp(); + } + + private void updateSolverProcessTimestamp( ) { + solverProcessTimestamp = System.currentTimeMillis(); + } + + /** + * set to specified time (for mass setting) + * @param solverProcessTimestamp + */ + void setSolverProcessTimestamp(long solverProcessTimestamp) { + this.solverProcessTimestamp = solverProcessTimestamp; + } + + public KeyValue getSimKey() { + return simKey; + } + + public int getJobIndex() { + return jobIndex; + } + + + long getSolverProcessTimestamp() { + return solverProcessTimestamp; + } + + protected boolean isWorkerEventOkay(WorkerEvent workerEvent, SimulationDatabase simulationDatabase) throws SQLException, DataAccessException { + VCSimulationDataIdentifier vcSimDataID = workerEvent.getVCSimulationDataIdentifier(); + int workerEventTaskID = workerEvent.getTaskID(); + if (vcSimDataID == null) { + VCMongoMessage.sendInfo("onWorkerEvent() ignoring WorkerEvent - no SimID in message): "+workerEvent.show()); + return false; + } + KeyValue simKey = vcSimDataID.getSimulationKey(); + SimulationJobStatus oldSimulationJobStatus = simulationDatabase.getLatestSimulationJobStatus(simKey, jobIndex); + + if (oldSimulationJobStatus == null){ + VCMongoMessage.sendInfo("onWorkerEvent() ignoring WorkerEvent, no current SimulationJobStatus: "+workerEvent.show()); + return false; + } + if 
(oldSimulationJobStatus.getSchedulerStatus().isDone() || oldSimulationJobStatus.getTaskID() > workerEventTaskID){ + VCMongoMessage.sendInfo("onWorkerEvent() ignoring outdated WorkerEvent, (currState="+oldSimulationJobStatus.getSchedulerStatus().getDescription()+"): "+workerEvent.show()); + return false; + } + return true; + } + + private SimulationJobStatus produceStateFromWorkerEvent( + WorkerEvent workerEvent, + SimulationJobStatus oldSimulationJobStatus){ + + SimulationExecutionStatus oldSimExeStatus = oldSimulationJobStatus.getSimulationExecutionStatus(); + SimulationQueueEntryStatus oldQueueStatus = oldSimulationJobStatus.getSimulationQueueEntryStatus(); + SimulationJobStatus.SchedulerStatus oldSchedulerStatus = oldSimulationJobStatus.getSchedulerStatus(); + VCSimulationDataIdentifier vcSimDataID = workerEvent.getVCSimulationDataIdentifier(); + + int taskID = oldSimulationJobStatus.getTaskID(); + + CurrentState currentState = new CurrentState(oldSimExeStatus, oldQueueStatus, oldSimulationJobStatus); + + // + // status information (initialized as if new record) + // + Date startDate = currentState.startDate; + Date lastUpdateDate = currentState.lastUpdateDate; + Date endDate = currentState.endDate; + boolean hasData = currentState.hasData; + HtcJobID htcJobID = currentState.htcJobID; + String computeHost = currentState.computeHost; + VCellServerID vcServerID = currentState.vcServerID; + Date submitDate = currentState.submitDate; + Date queueDate = currentState.queueDate; + int queuePriority = currentState.queuePriority; + SimulationJobStatus.SimulationQueueID simQueueID = currentState.simQueueID; + + // + // update using new information from event + // + if (workerEvent.getHtcJobID()!=null){ + htcJobID = workerEvent.getHtcJobID(); + } + if (workerEvent.getHostName()!=null){ + computeHost = workerEvent.getHostName(); + } + SimulationMessage workerEventSimulationMessage = workerEvent.getSimulationMessage(); + if (workerEventSimulationMessage.getHtcJobId()!=null){ 
+ htcJobID = workerEventSimulationMessage.getHtcJobId(); + } + + + SimulationJobStatus newJobStatus = null; + + if (workerEvent.isAcceptedEvent()) { + // + // job message accepted by HtcSimulationWorker and sent to Scheduler (PBS/SGE/SLURM) (with a htcJobID) ... previous state should be "WAITING" + // + if (oldSchedulerStatus.isWaiting() || oldSchedulerStatus.isQueued()) { + // new queue status + SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(queueDate, queuePriority, SimulationQueueID.QUEUE_ID_NULL); + + // new exe status + lastUpdateDate = new Date(); + startDate = lastUpdateDate; + endDate = null; + SimulationExecutionStatus newExeStatus = new SimulationExecutionStatus(startDate, computeHost, lastUpdateDate, endDate, hasData, htcJobID); + + newJobStatus = new SimulationJobStatus(vcServerID, vcSimDataID.getVcSimID(), jobIndex, submitDate, SchedulerStatus.DISPATCHED, + taskID, workerEventSimulationMessage, newQueueStatus, newExeStatus); + } + + } else if (workerEvent.isStartingEvent()) { + // only update database when the job event changes from started to running. The later progress event will not be recorded. 
+ if ( oldSchedulerStatus.isWaiting() || oldSchedulerStatus.isQueued() || oldSchedulerStatus.isDispatched() || oldSchedulerStatus.isRunning()) { + // new queue status + SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(queueDate, queuePriority, SimulationQueueID.QUEUE_ID_NULL); + + // new exe status + lastUpdateDate = new Date(); + if (startDate == null){ + startDate = lastUpdateDate; + } + SimulationExecutionStatus newExeStatus = new SimulationExecutionStatus(startDate, computeHost, lastUpdateDate, endDate, hasData, htcJobID); + + newJobStatus = new SimulationJobStatus(vcServerID, vcSimDataID.getVcSimID(), jobIndex, submitDate, SchedulerStatus.RUNNING, + taskID, workerEventSimulationMessage, newQueueStatus, newExeStatus); + } + + } else if (workerEvent.isNewDataEvent()) { + if (oldSchedulerStatus.isWaiting() || oldSchedulerStatus.isQueued() || oldSchedulerStatus.isDispatched() || oldSchedulerStatus.isRunning()){ + + if (!oldSchedulerStatus.isRunning() || simQueueID != SimulationQueueID.QUEUE_ID_NULL || hasData==false){ + + // new queue status + SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(queueDate, queuePriority, SimulationQueueID.QUEUE_ID_NULL); + + // new exe status + if (startDate == null){ + startDate = lastUpdateDate; + } + hasData = true; + SimulationExecutionStatus newExeStatus = new SimulationExecutionStatus(startDate, computeHost, lastUpdateDate, endDate, hasData, htcJobID); + + newJobStatus = new SimulationJobStatus(vcServerID, vcSimDataID.getVcSimID(), jobIndex, submitDate, SchedulerStatus.RUNNING, + taskID, workerEventSimulationMessage, newQueueStatus, newExeStatus); + } + } + + } else if (workerEvent.isProgressEvent() || workerEvent.isWorkerAliveEvent()) { + if (oldSchedulerStatus.isWaiting() || oldSchedulerStatus.isQueued() || oldSchedulerStatus.isDispatched() || oldSchedulerStatus.isRunning()){ + + + if (!oldSchedulerStatus.isRunning() || simQueueID != SimulationQueueID.QUEUE_ID_NULL){ + // new 
queue status + SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(queueDate, queuePriority, SimulationQueueID.QUEUE_ID_NULL); + + // new exe status + if (startDate == null){ + startDate = lastUpdateDate; + } + SimulationExecutionStatus newExeStatus = new SimulationExecutionStatus(startDate, computeHost, lastUpdateDate, endDate, hasData, htcJobID); + + newJobStatus = new SimulationJobStatus(vcServerID, vcSimDataID.getVcSimID(), jobIndex, submitDate, SchedulerStatus.RUNNING, + taskID, workerEventSimulationMessage, newQueueStatus, newExeStatus); + + }else if (oldSchedulerStatus.isRunning()){ + if (oldSimExeStatus != null) { + // new queue status + SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(queueDate, queuePriority, SimulationQueueID.QUEUE_ID_NULL); + SimulationExecutionStatus newExeStatus = new SimulationExecutionStatus(startDate, computeHost, lastUpdateDate, endDate, hasData, htcJobID); + + newJobStatus = new SimulationJobStatus(vcServerID, vcSimDataID.getVcSimID(), jobIndex, submitDate, SchedulerStatus.RUNNING, + taskID, workerEventSimulationMessage, newQueueStatus, newExeStatus); + } + } + } + + } else if (workerEvent.isCompletedEvent()) { + if (oldSchedulerStatus.isWaiting() || oldSchedulerStatus.isQueued() || oldSchedulerStatus.isDispatched() || oldSchedulerStatus.isRunning()){ + // new queue status + SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(queueDate, queuePriority, SimulationQueueID.QUEUE_ID_NULL); + // new exe status + endDate = new Date(); + hasData = true; + + SimulationExecutionStatus newExeStatus = new SimulationExecutionStatus(startDate, computeHost, lastUpdateDate, endDate, hasData, htcJobID); + newJobStatus = new SimulationJobStatus(vcServerID, vcSimDataID.getVcSimID(), jobIndex, submitDate, SchedulerStatus.COMPLETED, + taskID, workerEventSimulationMessage, newQueueStatus, newExeStatus); + } + + } else if (workerEvent.isFailedEvent()) { + if 
(oldSchedulerStatus.isWaiting() || oldSchedulerStatus.isQueued() || oldSchedulerStatus.isDispatched() || oldSchedulerStatus.isRunning()){ + // new queue status + SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(queueDate, queuePriority, SimulationQueueID.QUEUE_ID_NULL); + // new exe status + endDate = new Date(); + + SimulationExecutionStatus newExeStatus = new SimulationExecutionStatus(startDate, computeHost, lastUpdateDate, endDate, hasData, htcJobID); + newJobStatus = new SimulationJobStatus(vcServerID, vcSimDataID.getVcSimID(), jobIndex, submitDate, SchedulerStatus.FAILED, + taskID, workerEventSimulationMessage, newQueueStatus, newExeStatus); + + } + } else if (workerEvent.isWorkerExitErrorEvent()) { + if (oldSchedulerStatus.isWaiting() || oldSchedulerStatus.isQueued() || oldSchedulerStatus.isDispatched() || oldSchedulerStatus.isRunning()){ + // new queue status + SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(queueDate, queuePriority, SimulationQueueID.QUEUE_ID_NULL); + // new exe status + endDate = new Date(); + SimulationExecutionStatus newExeStatus = new SimulationExecutionStatus(startDate, computeHost, lastUpdateDate, endDate, hasData, htcJobID); + + SimulationMessage simulationMessage = SimulationMessage.workerFailure("solver stopped unexpectedly, "+workerEventSimulationMessage.getDisplayMessage()); + newJobStatus = new SimulationJobStatus(vcServerID, vcSimDataID.getVcSimID(), jobIndex, submitDate, SchedulerStatus.FAILED, + taskID, simulationMessage, newQueueStatus, newExeStatus); + + } + } + + return newJobStatus; + } + + public synchronized void onWorkerEvent(WorkerEvent workerEvent, SimulationDatabase simulationDatabase, VCMessageSession session) throws DataAccessException, VCMessagingException, SQLException { + updateSolverProcessTimestamp(); + WorkerEventMessage workerEventMessage = new WorkerEventMessage(workerEvent); + VCMongoMessage.sendWorkerEvent(workerEventMessage); + + String userName = 
workerEvent.getUserName(); // as the filter of the client + int workerEventTaskID = workerEvent.getTaskID(); + + if (lg.isTraceEnabled()) lg.trace("onWorkerEventMessage[" + workerEvent.getEventTypeID() + "," + workerEvent.getSimulationMessage() + "][simid=" + workerEvent.getVCSimulationDataIdentifier() + ",job=" + jobIndex + ",task=" + workerEventTaskID + "]"); + + if (!isWorkerEventOkay(workerEvent, simulationDatabase)){ + return; + } + + VCSimulationDataIdentifier vcSimDataID = workerEvent.getVCSimulationDataIdentifier(); + KeyValue simKey = vcSimDataID.getSimulationKey(); + SimulationJobStatus oldSimulationJobStatus = simulationDatabase.getLatestSimulationJobStatus(simKey, jobIndex); + + SchedulerStatus oldSchedulerStatus = oldSimulationJobStatus.getSchedulerStatus(); + SimulationJobStatus newJobStatus = produceStateFromWorkerEvent(workerEvent, oldSimulationJobStatus); + + if (newJobStatus!=null){ + if (!newJobStatus.compareEqual(oldSimulationJobStatus) || workerEvent.isProgressEvent() || workerEvent.isNewDataEvent()) { + Double progress = workerEvent.getProgress(); + Double timepoint = workerEvent.getTimePoint(); + RunningStateInfo runningStateInfo = null; + if (progress != null && timepoint != null){ + runningStateInfo = new RunningStateInfo(progress,timepoint); + } + simulationDatabase.updateSimulationJobStatus(newJobStatus,runningStateInfo); + StatusMessage msgForClient = new StatusMessage(newJobStatus, userName, progress, timepoint); + + msgForClient.sendToClient(session); + if (lg.isTraceEnabled()) lg.trace("Send status to client: " + msgForClient); + } else { + simulationDatabase.updateSimulationJobStatus(newJobStatus); + StatusMessage msgForClient = new StatusMessage(newJobStatus, userName, null, null); + msgForClient.sendToClient(session); + if (lg.isTraceEnabled()) lg.trace("Send status to client: " + msgForClient); + } + }else if (workerEvent.isProgressEvent() || workerEvent.isNewDataEvent()){ + Double progress = workerEvent.getProgress(); + Double 
timepoint = workerEvent.getTimePoint(); + RunningStateInfo runningStateInfo = null; + if (progress!=null && timepoint!=null){ + runningStateInfo = new RunningStateInfo(progress,timepoint); + } + simulationDatabase.updateSimulationJobStatus(oldSimulationJobStatus,runningStateInfo); + StatusMessage msgForClient = new StatusMessage(oldSimulationJobStatus, userName, progress, timepoint); + // TODO: Implement messaging to client + msgForClient.sendToClient(session); + if (lg.isTraceEnabled()) lg.trace("Send status to client: " + msgForClient); + }else{ + VCMongoMessage.sendInfo("onWorkerEvent() ignoring WorkerEvent (currState="+oldSchedulerStatus.getDescription()+"): "+workerEvent.show()); + } // addStateMachineTransition(new StateMachineTransition(new WorkerStateMachineEvent(taskID, workerEvent), oldSimulationJobStatus, newJobStatus)); - } - - public synchronized void onStartRequest(User user, VCSimulationIdentifier vcSimID, SimulationDatabase simulationDatabase, VCMessageSession session) throws VCMessagingException, DataAccessException, SQLException { - - if (!user.equals(vcSimID.getOwner())) { - lg.error(user + " is not authorized to start simulation (key=" + simKey + ")"); - StatusMessage message = new StatusMessage(new SimulationJobStatus(VCellServerID.getSystemServerID(), vcSimID, 0, null, - SchedulerStatus.FAILED, 0, SimulationMessage.workerFailure("You are not authorized to start this simulation!"), null, null), user.getName(), null, null); - message.sendToClient(session); - VCMongoMessage.sendInfo("onStartRequest("+vcSimID.getID()+") ignoring start simulation request - wrong user): simID="+vcSimID); - return; - } - - SimulationJobStatus newJobStatus = saveSimulationStartRequest(vcSimID, jobIndex, simulationDatabase); -// addStateMachineTransition(new StateMachineTransition(new StartStateMachineEvent(newTaskID), oldSimulationJobStatus, newJobStatus)); - - StatusMessage message = new StatusMessage(newJobStatus, user.getName(), null, null); - 
message.sendToClient(session); - } - - public static SimulationJobStatus saveSimulationStartRequest(VCSimulationIdentifier vcSimID, int jobIndex, SimulationDatabase simulationDatabase) throws DataAccessException, SQLException { - // - // get latest simulation job task (if any). - // - SimulationJobStatus oldSimulationJobStatus = simulationDatabase.getLatestSimulationJobStatus(vcSimID.getSimulationKey(), jobIndex); - int oldTaskID = -1; - if (oldSimulationJobStatus != null){ - oldTaskID = oldSimulationJobStatus.getTaskID(); - } - // if already started by another thread - if (oldSimulationJobStatus != null && !oldSimulationJobStatus.getSchedulerStatus().isDone()) { - VCMongoMessage.sendInfo("onStartRequest("+ vcSimID.getID()+") ignoring start simulation request - (currentSimJobStatus:"+oldSimulationJobStatus.getSchedulerStatus().getDescription()+"): simID="+ vcSimID); - throw new RuntimeException("Can't start, simulation[" + vcSimID + "] job [" + jobIndex + "] task [" + oldTaskID + "] is running already ("+oldSimulationJobStatus.getSchedulerStatus().getDescription()+")"); - } - - int newTaskID; - - if (oldTaskID > -1){ - // calculate new task - newTaskID = (oldTaskID & SimulationStatus.TASKID_USERCOUNTER_MASK) + SimulationStatus.TASKID_USERINCREMENT; - }else{ - // first task, start with 0 - newTaskID = 0; - } - - Date currentDate = new Date(); - // new queue status - SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(currentDate, PRIORITY_DEFAULT, SimulationJobStatus.SimulationQueueID.QUEUE_ID_WAITING); - - // new exe status - Date lastUpdateDate = new Date(); - String computeHost = null; - Date startDate = null; - Date endDate = null; - HtcJobID htcJobID = null; - boolean hasData = false; - - SimulationExecutionStatus newExeStatus = new SimulationExecutionStatus(startDate, computeHost, lastUpdateDate, endDate, hasData, htcJobID); - - VCellServerID vcServerID = VCellServerID.getSystemServerID(); - Date submitDate = currentDate; - - 
SimulationJobStatus newJobStatus = new SimulationJobStatus(vcServerID, vcSimID, jobIndex, submitDate, SchedulerStatus.WAITING, - newTaskID, SimulationMessage.MESSAGE_JOB_WAITING, newQueueStatus, newExeStatus); - - simulationDatabase.insertSimulationJobStatus(newJobStatus); - return newJobStatus; - } - - - public synchronized void onDispatch(Simulation simulation, SimulationJobStatus oldSimulationJobStatus, SimulationDatabase simulationDatabase, VCMessageSession session) throws VCMessagingException, DataAccessException, SQLException { - updateSolverProcessTimestamp(); - VCSimulationIdentifier vcSimID = oldSimulationJobStatus.getVCSimulationIdentifier(); - int taskID = oldSimulationJobStatus.getTaskID(); - - if (!oldSimulationJobStatus.getSchedulerStatus().isWaiting()) { - VCMongoMessage.sendInfo("onDispatch("+vcSimID.getID()+") Can't start, simulation[" + vcSimID + "] job [" + jobIndex + "] task [" + taskID + "] is already dispatched ("+oldSimulationJobStatus.getSchedulerStatus().getDescription()+")"); - throw new RuntimeException("Can't start, simulation[" + vcSimID + "] job [" + jobIndex + "] task [" + taskID + "] is already dispatched ("+oldSimulationJobStatus.getSchedulerStatus().getDescription()+")"); - } - - FieldDataIdentifierSpec[] fieldDataIdentifierSpecs = simulationDatabase.getFieldDataIdentifierSpecs(simulation); - //Check if user wants long running sims activated in SlurmProxy.generateScript(...) 
- //only happens if user is allowed to be power user (entry in vc_specialusers table) and - //has checked the 'timeoutDisabledCheckBox' in SolverTaskDescriptionAdvancedPanel on the client-side GUI - boolean isPowerUser = simulation.getSolverTaskDescription().isTimeoutDisabled();//Set from GUI - if(isPowerUser) {//Check if user allowed to be power user for 'special1' long running sims (see User.SPECIALS and vc_specialusers table) - User.SpecialUser myUser = simulationDatabase.getUser(simulation.getVersion().getOwner().getName()); - //'powerUsers' (previously called 'special1') assigned to users by request to allow long running sims - isPowerUser = isPowerUser && Arrays.asList(myUser.getMySpecials()).contains(User.SPECIAL_CLAIM.powerUsers); - } - SimulationTask simulationTask = new SimulationTask(new SimulationJob(simulation, jobIndex, fieldDataIdentifierSpecs), taskID,null,isPowerUser); - - double requiredMemMB = simulationTask.getEstimatedMemorySizeMB(); - //SimulationStateMachine ultimately instantiated from {vcellroot}/docker/build/Dockerfile-sched-dev by way of cbit.vcell.message.server.dispatcher.SimulationDispatcher - String vcellUserid = simulationTask.getUser().getName(); - KeyValue simID = simulationTask.getSimulationInfo().getSimulationVersion().getVersionKey(); - SolverDescription solverDescription = simulationTask.getSimulation().getSolverTaskDescription().getSolverDescription(); - - MemLimitResults allowableMemMB = HtcProxy.getMemoryLimit(vcellUserid,simID,solverDescription, requiredMemMB, isPowerUser); - - final SimulationJobStatus newSimJobStatus; - if (requiredMemMB > allowableMemMB.getMemLimit()) { - // - // fail the simulation - // - Date currentDate = new Date(); - // new queue status - SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(currentDate, PRIORITY_DEFAULT, SimulationJobStatus.SimulationQueueID.QUEUE_ID_NULL); - SimulationExecutionStatus newSimExeStatus = new SimulationExecutionStatus(null, null, new Date(), null, 
false, null); - newSimJobStatus = new SimulationJobStatus(VCellServerID.getSystemServerID(),vcSimID,jobIndex, - oldSimulationJobStatus.getSubmitDate(),SchedulerStatus.FAILED,taskID, - SimulationMessage.jobFailed("simulation required "+requiredMemMB+"MB of memory, only "+allowableMemMB.getMemLimit()+"MB allowed from "+allowableMemMB.getMemLimitSource()), - newQueueStatus,newSimExeStatus); - - simulationDatabase.updateSimulationJobStatus(newSimJobStatus); - - StatusMessage message = new StatusMessage(newSimJobStatus, simulation.getVersion().getOwner().getName(), null, null); - message.sendToClient(session); - - }else{ - // - // dispatch the simulation, new queue status - // - Date currentDate = new Date(); - SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(currentDate, PRIORITY_DEFAULT, SimulationJobStatus.SimulationQueueID.QUEUE_ID_SIMULATIONJOB); - SimulationExecutionStatus newSimExeStatus = new SimulationExecutionStatus(null, null, new Date(), null, false, null); - newSimJobStatus = new SimulationJobStatus(VCellServerID.getSystemServerID(),vcSimID,jobIndex, - oldSimulationJobStatus.getSubmitDate(),SchedulerStatus.DISPATCHED,taskID, - SimulationMessage.MESSAGE_JOB_DISPATCHED, - newQueueStatus,newSimExeStatus); - - SimulationTaskMessage simTaskMessage = new SimulationTaskMessage(simulationTask); - simTaskMessage.sendSimulationTask(session); - - simulationDatabase.updateSimulationJobStatus(newSimJobStatus); - - StatusMessage message = new StatusMessage(newSimJobStatus, simulation.getVersion().getOwner().getName(), null, null); - message.sendToClient(session); - - } + } + + public synchronized StatusMessage onStartRequest(User user, VCSimulationIdentifier vcSimID, SimulationDatabase simulationDatabase, VCMessageSession session) throws VCMessagingException, DataAccessException, SQLException { + + StatusMessage statusMessage; + if (!user.equals(vcSimID.getOwner())) { + lg.error(user + " is not authorized to start simulation (key=" + simKey + 
")"); + SimulationJobStatus simulationJobStatus = new SimulationJobStatus(VCellServerID.getSystemServerID(), vcSimID, 0, null, + SchedulerStatus.FAILED, 0, SimulationMessage.workerFailure("You are not authorized to start this simulation!"), null, null); + statusMessage = new StatusMessage(simulationJobStatus, user.getName(), null, null); + VCMongoMessage.sendInfo("onStartRequest("+vcSimID.getID()+") ignoring start simulation request - wrong user): simID="+vcSimID); + statusMessage.sendToClient(session); + return statusMessage; + } + SimulationJobStatus newJobStatus = saveSimulationStartRequest(vcSimID, jobIndex, simulationDatabase); + statusMessage = new StatusMessage(newJobStatus, user.getName(), null, null); + statusMessage.sendToClient(session); + return statusMessage; + } + + public static SimulationJobStatus saveSimulationStartRequest(VCSimulationIdentifier vcSimID, int jobIndex, SimulationDatabase simulationDatabase) throws DataAccessException, SQLException { + // + // get latest simulation job task (if any). 
+ // + SimulationJobStatus oldSimulationJobStatus = simulationDatabase.getLatestSimulationJobStatus(vcSimID.getSimulationKey(), jobIndex); + int oldTaskID = -1; + if (oldSimulationJobStatus != null){ + oldTaskID = oldSimulationJobStatus.getTaskID(); + } + // if already started by another thread + if (oldSimulationJobStatus != null && !oldSimulationJobStatus.getSchedulerStatus().isDone()) { + VCMongoMessage.sendInfo("onStartRequest("+ vcSimID.getID()+") ignoring start simulation request - (currentSimJobStatus:"+oldSimulationJobStatus.getSchedulerStatus().getDescription()+"): simID="+ vcSimID); + throw new RuntimeException("Can't start, simulation[" + vcSimID + "] job [" + jobIndex + "] task [" + oldTaskID + "] is running already ("+oldSimulationJobStatus.getSchedulerStatus().getDescription()+")"); + } + + int newTaskID; + + if (oldTaskID > -1){ + // calculate new task + newTaskID = (oldTaskID & SimulationStatus.TASKID_USERCOUNTER_MASK) + SimulationStatus.TASKID_USERINCREMENT; + }else{ + // first task, start with 0 + newTaskID = 0; + } + + Date currentDate = new Date(); + // new queue status + SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(currentDate, PRIORITY_DEFAULT, SimulationQueueID.QUEUE_ID_WAITING); + + // new exe status + Date lastUpdateDate = new Date(); + boolean hasData = false; + + SimulationExecutionStatus newExeStatus = new SimulationExecutionStatus(null, null, lastUpdateDate, null, hasData, null); + + VCellServerID vcServerID = VCellServerID.getSystemServerID(); + Date submitDate = currentDate; + + SimulationJobStatus newJobStatus = new SimulationJobStatus(vcServerID, vcSimID, jobIndex, submitDate, SchedulerStatus.WAITING, + newTaskID, SimulationMessage.MESSAGE_JOB_WAITING, newQueueStatus, newExeStatus); + + simulationDatabase.insertSimulationJobStatus(newJobStatus); + return newJobStatus; + } + + + public synchronized void onDispatch(Simulation simulation, SimulationJobStatus oldSimulationJobStatus, SimulationDatabase 
simulationDatabase, VCMessageSession session) throws VCMessagingException, DataAccessException, SQLException { + updateSolverProcessTimestamp(); + VCSimulationIdentifier vcSimID = oldSimulationJobStatus.getVCSimulationIdentifier(); + int taskID = oldSimulationJobStatus.getTaskID(); + + if (!oldSimulationJobStatus.getSchedulerStatus().isWaiting()) { + VCMongoMessage.sendInfo("onDispatch("+vcSimID.getID()+") Can't start, simulation[" + vcSimID + "] job [" + jobIndex + "] task [" + taskID + "] is already dispatched ("+oldSimulationJobStatus.getSchedulerStatus().getDescription()+")"); + throw new RuntimeException("Can't start, simulation[" + vcSimID + "] job [" + jobIndex + "] task [" + taskID + "] is already dispatched ("+oldSimulationJobStatus.getSchedulerStatus().getDescription()+")"); + } + + FieldDataIdentifierSpec[] fieldDataIdentifierSpecs = simulationDatabase.getFieldDataIdentifierSpecs(simulation); + //Check if user wants long running sims activated in SlurmProxy.generateScript(...) 
+ //only happens if user is allowed to be power user (entry in vc_specialusers table) and + //has checked the 'timeoutDisabledCheckBox' in SolverTaskDescriptionAdvancedPanel on the client-side GUI + boolean isPowerUser = simulation.getSolverTaskDescription().isTimeoutDisabled();//Set from GUI + if(isPowerUser) {//Check if user allowed to be power user for 'special1' long running sims (see User.SPECIALS and vc_specialusers table) + User.SpecialUser myUser = simulationDatabase.getUser(simulation.getVersion().getOwner().getName()); + //'powerUsers' (previously called 'special1') assigned to users by request to allow long running sims + isPowerUser = isPowerUser && Arrays.asList(myUser.getMySpecials()).contains(User.SPECIAL_CLAIM.powerUsers); + } + SimulationTask simulationTask = new SimulationTask(new SimulationJob(simulation, jobIndex, fieldDataIdentifierSpecs), taskID,null,isPowerUser); + + double estimatedMemMB = simulationTask.getEstimatedMemorySizeMB(); + double htcMinMemoryMB = Integer.parseInt(PropertyLoader.getRequiredProperty(PropertyLoader.htcMinMemoryMB)); + double htcMaxMemoryMB = Integer.parseInt(PropertyLoader.getRequiredProperty(PropertyLoader.htcMaxMemoryMB)); + double requestedMemoryMB = Math.max(estimatedMemMB, htcMinMemoryMB); + + final SimulationJobStatus newSimJobStatus; + if (requestedMemoryMB > htcMaxMemoryMB) { + // + // fail the simulation + // + Date currentDate = new Date(); + // new queue status + SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(currentDate, PRIORITY_DEFAULT, SimulationQueueID.QUEUE_ID_NULL); + SimulationExecutionStatus newSimExeStatus = new SimulationExecutionStatus(null, null, new Date(), null, false, null); + newSimJobStatus = new SimulationJobStatus(VCellServerID.getSystemServerID(),vcSimID,jobIndex, + oldSimulationJobStatus.getSubmitDate(), SchedulerStatus.FAILED,taskID, + SimulationMessage.jobFailed("simulation required "+estimatedMemMB+"MB of memory, only "+htcMaxMemoryMB+"MB allowed"), + 
newQueueStatus,newSimExeStatus); + + simulationDatabase.updateSimulationJobStatus(newSimJobStatus); + + StatusMessage message = new StatusMessage(newSimJobStatus, simulation.getVersion().getOwner().getName(), null, null); + message.sendToClient(session); + + }else{ + // + // dispatch the simulation, new queue status + // + Date currentDate = new Date(); + SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(currentDate, PRIORITY_DEFAULT, SimulationQueueID.QUEUE_ID_SIMULATIONJOB); + SimulationExecutionStatus newSimExeStatus = new SimulationExecutionStatus(null, null, new Date(), null, false, null); + newSimJobStatus = new SimulationJobStatus(VCellServerID.getSystemServerID(),vcSimID,jobIndex, + oldSimulationJobStatus.getSubmitDate(), SchedulerStatus.DISPATCHED,taskID, + SimulationMessage.MESSAGE_JOB_DISPATCHED, + newQueueStatus,newSimExeStatus); + + SimulationTaskMessage simTaskMessage = new SimulationTaskMessage(simulationTask); + simTaskMessage.sendSimulationTask(session); + + simulationDatabase.updateSimulationJobStatus(newSimJobStatus); + + StatusMessage message = new StatusMessage(newSimJobStatus, simulation.getVersion().getOwner().getName(), null, null); + message.sendToClient(session); + + } // addStateMachineTransition(new StateMachineTransition(new DispatchStateMachineEvent(taskID), oldSimulationJobStatus, newSimJobStatus)); - } - - public synchronized void onStopRequest(User user, SimulationJobStatus simJobStatus, SimulationDatabase simulationDatabase, VCMessageSession session) throws VCMessagingException, DataAccessException, SQLException { - updateSolverProcessTimestamp(); - - if (!user.equals(simJobStatus.getVCSimulationIdentifier().getOwner())) { - lg.error(user + " is not authorized to stop simulation (key=" + simKey + ")"); - StatusMessage message = new StatusMessage(new SimulationJobStatus(VCellServerID.getSystemServerID(), simJobStatus.getVCSimulationIdentifier(), 0, null, - SchedulerStatus.FAILED, 0, 
SimulationMessage.workerFailure("You are not authorized to stop this simulation!"), null, null), user.getName(), null, null); - message.sendToClient(session); - VCMongoMessage.sendInfo("onStopRequest("+simJobStatus.getVCSimulationIdentifier()+") ignoring stop simulation request - wrong user)"); - return; - } - - // stop latest task if active - SchedulerStatus schedulerStatus = simJobStatus.getSchedulerStatus(); - int taskID = simJobStatus.getTaskID(); - - if (schedulerStatus.isActive()){ - SimulationQueueEntryStatus simQueueEntryStatus = simJobStatus.getSimulationQueueEntryStatus(); - SimulationExecutionStatus simExeStatus = simJobStatus.getSimulationExecutionStatus(); - SimulationJobStatus newJobStatus = new SimulationJobStatus(simJobStatus.getServerID(),simJobStatus.getVCSimulationIdentifier(),jobIndex,simJobStatus.getSubmitDate(), - SchedulerStatus.STOPPED,taskID,SimulationMessage.solverStopped("simulation stopped by user"),simQueueEntryStatus,simExeStatus); - - // - // send stopSimulation to serviceControl topic - // - if (lg.isTraceEnabled()) lg.trace("send " + MessageConstants.MESSAGE_TYPE_STOPSIMULATION_VALUE + " to " + VCellTopic.ServiceControlTopic.getName() + " topic"); - VCMessage msg = session.createMessage(); - msg.setStringProperty(VCMessagingConstants.MESSAGE_TYPE_PROPERTY, MessageConstants.MESSAGE_TYPE_STOPSIMULATION_VALUE); - msg.setLongProperty(MessageConstants.SIMKEY_PROPERTY, Long.parseLong(simKey + "")); - msg.setIntProperty(MessageConstants.JOBINDEX_PROPERTY, jobIndex); - msg.setIntProperty(MessageConstants.TASKID_PROPERTY, taskID); - msg.setStringProperty(VCMessagingConstants.USERNAME_PROPERTY, user.getName()); - if (simExeStatus.getHtcJobID()!=null){ - msg.setStringProperty(MessageConstants.HTCJOBID_PROPERTY, simExeStatus.getHtcJobID().toDatabase()); - } - session.sendTopicMessage(VCellTopic.ServiceControlTopic, msg); - - simulationDatabase.updateSimulationJobStatus(newJobStatus); -// addStateMachineTransition(new StateMachineTransition(new 
StopStateMachineEvent(taskID), simJobStatus, newJobStatus)); - - // update client - StatusMessage message = new StatusMessage(newJobStatus, user.getName(), null, null); - message.sendToClient(session); - } - } - - public synchronized void onSystemAbort(SimulationJobStatus oldJobStatus, String failureMessage, SimulationDatabase simulationDatabase, VCMessageSession session) throws VCMessagingException, UpdateSynchronizationException, DataAccessException, SQLException { - updateSolverProcessTimestamp(); - - int taskID = oldJobStatus.getTaskID(); - - // - // status information (initialized as if new record) - // - Date startDate = null; - boolean hasData = false; - HtcJobID htcJobID = null; - String computeHost = null; - VCellServerID vcServerID = VCellServerID.getSystemServerID(); - Date submitDate = null; - Date queueDate = null; - int queuePriority = PRIORITY_DEFAULT; - - - // - // update using previously stored status (if available). - // - SimulationExecutionStatus oldSimExeStatus = oldJobStatus.getSimulationExecutionStatus(); - if (oldSimExeStatus!=null && oldSimExeStatus.getStartDate()!=null){ - startDate = oldSimExeStatus.getStartDate(); - } - if (oldSimExeStatus!=null && oldSimExeStatus.hasData()){ - hasData = true; - } - if (oldSimExeStatus!=null && oldSimExeStatus.getComputeHost()!=null){ - computeHost = oldSimExeStatus.getComputeHost(); - } - if (oldSimExeStatus!=null && oldSimExeStatus.getHtcJobID()!=null){ - htcJobID = oldSimExeStatus.getHtcJobID(); - } - vcServerID = oldJobStatus.getServerID(); - submitDate = oldJobStatus.getSubmitDate(); - SimulationQueueEntryStatus oldQueueStatus = oldJobStatus.getSimulationQueueEntryStatus(); - if (oldQueueStatus!=null && oldQueueStatus.getQueueDate()!=null){ - queueDate = oldQueueStatus.getQueueDate(); - } - if (oldQueueStatus!=null){ - queuePriority = oldQueueStatus.getQueuePriority(); - } - - SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(queueDate, queuePriority, 
SimulationJobStatus.SimulationQueueID.QUEUE_ID_NULL); - - Date endDate = new Date(); - Date lastUpdateDate = new Date(); - - SimulationExecutionStatus newExeStatus = new SimulationExecutionStatus(startDate, computeHost, lastUpdateDate, endDate, hasData, htcJobID); - - SimulationJobStatus newJobStatus = new SimulationJobStatus(vcServerID, oldJobStatus.getVCSimulationIdentifier(), jobIndex, submitDate, SchedulerStatus.FAILED, - taskID, SimulationMessage.jobFailed(failureMessage), newQueueStatus, newExeStatus); - - simulationDatabase.updateSimulationJobStatus(newJobStatus); + } + + public synchronized StatusMessage onStopRequest(User user, SimulationJobStatus simJobStatus, SimulationDatabase simulationDatabase, VCMessageSession session) throws VCMessagingException, DataAccessException, SQLException { + updateSolverProcessTimestamp(); + + StatusMessage statusMessage; + if (!user.equals(simJobStatus.getVCSimulationIdentifier().getOwner())) { + lg.error(user + " is not authorized to stop simulation (key=" + simKey + ")"); + SimulationJobStatus simulationJobStatus = new SimulationJobStatus(VCellServerID.getSystemServerID(), simJobStatus.getVCSimulationIdentifier(), 0, null, + SchedulerStatus.FAILED, 0, SimulationMessage.workerFailure("You are not authorized to stop this simulation!"), null, null); + + VCMongoMessage.sendInfo("onStopRequest("+simJobStatus.getVCSimulationIdentifier()+") ignoring stop simulation request - wrong user)"); + statusMessage = new StatusMessage(simulationJobStatus, user.getName(), null, null); + statusMessage.sendToClient(session); + return statusMessage; + } + + // stop latest task if active + SchedulerStatus schedulerStatus = simJobStatus.getSchedulerStatus(); + int taskID = simJobStatus.getTaskID(); + + if (schedulerStatus.isActive()){ + SimulationQueueEntryStatus simQueueEntryStatus = simJobStatus.getSimulationQueueEntryStatus(); + SimulationExecutionStatus simExeStatus = simJobStatus.getSimulationExecutionStatus(); + SimulationJobStatus 
newJobStatus = new SimulationJobStatus(simJobStatus.getServerID(),simJobStatus.getVCSimulationIdentifier(),jobIndex,simJobStatus.getSubmitDate(), + SchedulerStatus.STOPPED,taskID,SimulationMessage.solverStopped("simulation stopped by user"),simQueueEntryStatus,simExeStatus); + + + if (lg.isTraceEnabled()) lg.trace("send " + MessageConstants.MESSAGE_TYPE_STOPSIMULATION_VALUE + " to " + VCellTopic.ServiceControlTopic.getName() + " topic"); + SimulationJobStatus simulationJobStatusRecord = new SimulationJobStatus( + null, new VCSimulationIdentifier(simKey, user), jobIndex, simJobStatus.getSubmitDate(), + SchedulerStatus.STOPPED, taskID, simJobStatus.getSimulationMessage(), simQueueEntryStatus, simExeStatus + ); + + // + // send stopSimulation to serviceControl topic + // + VCMessage msg = session.createMessage(); + msg.setStringProperty(VCMessagingConstants.MESSAGE_TYPE_PROPERTY, MessageConstants.MESSAGE_TYPE_STOPSIMULATION_VALUE); + msg.setLongProperty(MessageConstants.SIMKEY_PROPERTY, Long.parseLong(simKey + "")); + msg.setIntProperty(MessageConstants.JOBINDEX_PROPERTY, jobIndex); + msg.setIntProperty(MessageConstants.TASKID_PROPERTY, taskID); + msg.setStringProperty(VCMessagingConstants.USERNAME_PROPERTY, user.getName()); + if (simExeStatus.getHtcJobID()!=null){ + msg.setStringProperty(MessageConstants.HTCJOBID_PROPERTY, simExeStatus.getHtcJobID().toDatabase()); + } + session.sendTopicMessage(VCellTopic.ServiceControlTopic, msg); + + simulationDatabase.updateSimulationJobStatus(newJobStatus); + statusMessage = new StatusMessage(simulationJobStatusRecord, user.getName(), null, null); + statusMessage.sendToClient(session); + + return statusMessage; + } + return null; + } + + public synchronized void onSystemAbort(SimulationJobStatus oldJobStatus, String failureMessage, SimulationDatabase simulationDatabase, VCMessageSession session) throws VCMessagingException, UpdateSynchronizationException, DataAccessException, SQLException { + updateSolverProcessTimestamp(); + + 
int taskID = oldJobStatus.getTaskID(); + + // + // update using previously stored status (if available). + // + CurrentState currentState = new CurrentState(oldJobStatus.getSimulationExecutionStatus(), oldJobStatus.getSimulationQueueEntryStatus(), oldJobStatus); + + SimulationQueueEntryStatus newQueueStatus = new SimulationQueueEntryStatus(currentState.queueDate, currentState.queuePriority, SimulationQueueID.QUEUE_ID_NULL); + + Date endDate = new Date(); + Date lastUpdateDate = new Date(); + + SimulationExecutionStatus newExeStatus = new SimulationExecutionStatus(currentState.startDate, currentState.computeHost, lastUpdateDate, endDate, currentState.hasData, currentState.htcJobID); + + SimulationJobStatus newJobStatus = new SimulationJobStatus(currentState.vcServerID, oldJobStatus.getVCSimulationIdentifier(), jobIndex, currentState.submitDate, SchedulerStatus.FAILED, + taskID, SimulationMessage.jobFailed(failureMessage), newQueueStatus, newExeStatus); + + simulationDatabase.updateSimulationJobStatus(newJobStatus); // addStateMachineTransition(new StateMachineTransition(new AbortStateMachineEvent(taskID, failureMessage), oldJobStatus, newJobStatus)); - String userName = VCMessagingConstants.USERNAME_PROPERTY_VALUE_ALL; - StatusMessage msgForClient = new StatusMessage(newJobStatus, userName, null, null); - msgForClient.sendToClient(session); - if (lg.isTraceEnabled()) lg.trace("Send status to client: " + msgForClient); - } - -// public int getLatestKnownTaskID() { -// int taskID = -1; -// for (StateMachineTransition transition : stateMachineTransitions){ -// if (transition.event.taskID!=null && transition.event.taskID>taskID){ -// taskID = transition.event.taskID; -// } -// if (transition.newSimJobStatus!=null && transition.newSimJobStatus.getTaskID()>taskID){ -// taskID = transition.newSimJobStatus.getTaskID(); -// } -// } -// return taskID; -// } -// + String userName = VCMessagingConstants.USERNAME_PROPERTY_VALUE_ALL; + StatusMessage msgForClient = new 
StatusMessage(newJobStatus, userName, null, null); + msgForClient.sendToClient(session); + if (lg.isTraceEnabled()) lg.trace("Send status to client: " + msgForClient); + } + } From 6d03b1259070f59d98fa5c2a5f605d8b5dd37630 Mon Sep 17 00:00:00 2001 From: Ezequiel Valencia Date: Fri, 6 Sep 2024 13:42:26 -0400 Subject: [PATCH 02/16] Separate Dispatcher from Its Main Method --- .../dispatcher/SimulationDispatcher.java | 188 ++++++++---------- .../dispatcher/SimulationDispatcherMain.java | 79 ++++++++ 2 files changed, 164 insertions(+), 103 deletions(-) create mode 100644 vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationDispatcherMain.java diff --git a/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationDispatcher.java b/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationDispatcher.java index 09bbce31fb..d612019a4d 100644 --- a/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationDispatcher.java +++ b/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationDispatcher.java @@ -29,21 +29,22 @@ import cbit.vcell.modeldb.AdminDBTopLevel; import cbit.vcell.modeldb.DatabaseServerImpl; import cbit.vcell.mongodb.VCMongoMessage; -import cbit.vcell.resource.OperatingSystemInfo; import cbit.vcell.resource.PropertyLoader; import cbit.vcell.server.*; import cbit.vcell.server.SimulationJobStatus.SchedulerStatus; import cbit.vcell.solver.Simulation; import cbit.vcell.solver.VCSimulationIdentifier; import com.google.gson.Gson; -import com.google.inject.Guice; -import com.google.inject.Injector; +import org.apache.logging.log4j.Level; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.core.LoggerContext; +import org.apache.logging.log4j.core.appender.WriterAppender; +import org.apache.logging.log4j.core.config.Configuration; +import org.apache.logging.log4j.core.config.LoggerConfig; import 
org.vcell.db.ConnectionFactory; import org.vcell.db.DatabaseService; import org.vcell.db.KeyFactory; -import org.vcell.dependency.server.VCellServerModule; import org.vcell.util.DataAccessException; import org.vcell.util.PermissionException; import org.vcell.util.document.KeyValue; @@ -51,6 +52,7 @@ import org.vcell.util.document.VCellServerID; import org.vcell.util.exe.ExecutableException; +import java.io.StringWriter; import java.sql.SQLException; import java.text.SimpleDateFormat; import java.util.*; @@ -71,7 +73,9 @@ public class SimulationDispatcher { /** * minutes between zombie kill runs */ - public static final int ZOMBIE_MINUTES = 1; + public static final int ZOMBIE_MINUTES = 1; + // changed only for testing + static int INITIAL_ZOMBIE_DELAY = 0; /** * minutes between queue flushing */ @@ -79,7 +83,7 @@ public class SimulationDispatcher { /** * queue flush wait time */ - public static final long QUEUE_FLUSH_WAITIME = MessageConstants.MINUTE_IN_MS*5; + public final static long QUEUE_FLUSH_WAITIME = MessageConstants.MINUTE_IN_MS*5; private final VCMessagingService vcMessagingService_int; private final VCMessagingService vcMessagingService_sim; @@ -89,16 +93,17 @@ public class SimulationDispatcher { private final VCQueueConsumer simRequestConsumer_int; private final VCRpcMessageHandler rpcMessageHandler_int; - private final SimulationDispatcherEngine simDispatcherEngine = new SimulationDispatcherEngine(); + protected final SimulationDispatcherEngine simDispatcherEngine = new SimulationDispatcherEngine(); - private final DispatchThread dispatchThread; - private final SimulationMonitor simMonitor; + protected final DispatchThread dispatchThread; + protected final SimulationMonitor simMonitor; private final VCMessageSession dispatcherQueueSession_int; private final VCMessageSession clientStatusTopicSession_int; private final VCMessageSession simMonitorThreadSession_sim; private final HtcProxy htcProxy; public static Logger lg = 
LogManager.getLogger(SimulationDispatcher.class); + public final SimulationService simServiceImpl; public class SimulationServiceImpl implements SimulationService { @@ -144,8 +149,8 @@ public SimulationStatus startSimulation(User user, VCSimulationIdentifier vcSimu // wake up dispatcher thread if (dispatchThread!=null){ try { - synchronized (dispatchThread.notifyObject){ - dispatchThread.notifyObject.notify(); + synchronized (dispatchThread.dispatcherNotifyObject){ + dispatchThread.dispatcherNotifyObject.notify(); } }catch (IllegalMonitorStateException e){ lg.error("failed to notify dispatchThread",e); @@ -238,7 +243,8 @@ private void reloadSpecialUsers() { } public class DispatchThread extends Thread { - Object notifyObject = new Object(); + final Object dispatcherNotifyObject = new Object(); + final Object finishListener = new Object(); //used for tests public DispatchThread() { super(); @@ -316,7 +322,7 @@ public void run() { tempSimulationMap.put(simKey, sim); } if (lg.isDebugEnabled()) { - lg.debug("dispatching simKey="+vcSimID+", jobId="+jobStatus.getJobIndex()+", taskId="+jobStatus.getTaskID()); + lg.debug("dispatching simKey={}, jobId={}, taskId={}", vcSimID, jobStatus.getJobIndex(), jobStatus.getTaskID()); } simDispatcherEngine.onDispatch(sim, jobStatus, simulationDatabase, dispatcherQueueSession_int); bDispatchedAnyJobs = true; @@ -331,14 +337,19 @@ public void run() { } catch (Exception ex) { lg.error(ex.getMessage(), ex); } + finally { + synchronized (finishListener){ + finishListener.notify(); + } + } // if there are no messages or no qualified jobs or exceptions, sleep for a few seconds while // this will be interrupted if there is a start request. 
if (!bDispatchedAnyJobs){ - synchronized (notifyObject) { + synchronized (dispatcherNotifyObject) { try { long waitTime = 5 * MessageConstants.SECOND_IN_MS; - notifyObject.wait(waitTime); + dispatcherNotifyObject.wait(waitTime); } catch (InterruptedException ex) { lg.debug("Dispatch thread wait interrupted", ex); } @@ -354,30 +365,34 @@ public void run() { } class SimulationMonitor implements ThreadFactory, RejectedExecutionHandler { - private ScheduledThreadPoolExecutor executor; + protected final ScheduledThreadPoolExecutor executor; private int threadCount; + ZombieKiller initialZombieKiller = new ZombieKiller(); + QueueFlusher initialQueueFlusher = new QueueFlusher(); /** * synchronizes {@link SimulationDispatcher#onWorkerEventMessage(VCMessage, VCMessageSession)} and * {@link QueueFlusher#flushWorkerEventQueue()} */ - Object notifyObject = new Object(); + final Object monitorNotifyObject = new Object(); public SimulationMonitor( ) { threadCount = 1; executor = new ScheduledThreadPoolExecutor(2,this,this); - executor.scheduleAtFixedRate(new ZombieKiller( ), 0, ZOMBIE_MINUTES, TimeUnit.MINUTES); - executor.scheduleAtFixedRate(new QueueFlusher( ), 1,FLUSH_QUEUE_MINUTES,TimeUnit.MINUTES); + executor.scheduleAtFixedRate(initialZombieKiller, INITIAL_ZOMBIE_DELAY, ZOMBIE_MINUTES, TimeUnit.MINUTES); + executor.scheduleAtFixedRate(initialQueueFlusher, 1,FLUSH_QUEUE_MINUTES,TimeUnit.MINUTES); } /** * find and kill zombie processes */ class ZombieKiller implements Runnable { + public static final String noJob = "no jobStatus found in database for running htc job"; + public static final String newJobFound = "newer task found in database for running htc job"; + public static final String jobIsAlreadyDone = "jobStatus Done in database for running htc job"; @Override public void run() { try { traceThread(this); - Map runningJobs = htcProxy.getRunningJobs(); for (HtcJobInfo htcJobInfo : runningJobs.keySet()){ try { @@ -390,13 +405,13 @@ public void run() { String 
failureMessage = null; boolean killJob = false; if (simJobStatus==null){ - failureMessage = "no jobStatus found in database for running htc job"; + failureMessage = noJob; killJob = true; }else if (simTaskInfo.taskId < simJobStatus.getTaskID()){ - failureMessage = "newer task found in database for running htc job"; + failureMessage = newJobFound; killJob = true; }else if (simJobStatus.getSchedulerStatus().isDone()){ - failureMessage = "jobStatus Done in database for running htc job"; + failureMessage = jobIsAlreadyDone; if (simJobStatus.getSimulationExecutionStatus()==null){ killJob = true; }else{ @@ -409,9 +424,8 @@ public void run() { } if (killJob && HtcProxy.isMySimulationJob(htcJobInfo)){ if (lg.isWarnEnabled()) { - lg.warn("killing " + htcJobInfo + ", " + failureMessage); + lg.warn("killing {}; {}; {}", htcJobInfo, failureMessage, simJobStatus); } - VCMongoMessage.sendZombieJob(simJobStatus,failureMessage,htcJobInfo.getHtcJobID()); htcProxy.killJobSafe(htcJobInfo); } }catch (Exception e){ @@ -431,7 +445,10 @@ public void run() { /** * flush message queue */ - class QueueFlusher implements Runnable { + class QueueFlusher implements Runnable { + protected final static String timeOutFailure = "failed: timed out"; + protected final static String unreferencedFailure = "failed: unreferenced simulation"; + protected final Object finishListener = new Object(); //used for tests public void run() { try { traceThread(this); @@ -449,20 +466,24 @@ public void run() { abortStalledOrUnreferencedSimulationTasks(messageFlushTimeMS); } catch (Exception e1) { lg.error(e1.getMessage(), e1); + } finally { + synchronized (finishListener){ + finishListener.notify(); + } } } private void flushWorkerEventQueue() throws VCMessagingException{ VCMessage message = simMonitorThreadSession_sim.createObjectMessage(VCMongoMessage.getServiceStartupTime()); message.setStringProperty(VCMessagingConstants.MESSAGE_TYPE_PROPERTY,MessageConstants.MESSAGE_TYPE_FLUSH_VALUE); - synchronized 
(notifyObject) { + synchronized (monitorNotifyObject) { simMonitorThreadSession_sim.sendQueueMessage(VCellQueue.WorkerEventQueue, message, false, MessageConstants.MINUTE_IN_MS*5L); try { long startWaitTime = System.currentTimeMillis(); - notifyObject.wait(QUEUE_FLUSH_WAITIME); + monitorNotifyObject.wait(QUEUE_FLUSH_WAITIME); long endWaitTime = System.currentTimeMillis(); long elapsedFlushTime = endWaitTime-startWaitTime; - VCMongoMessage.sendInfo("flushed worker event queue: elapsedTime="+(elapsedFlushTime/1000.0)+" s"); + lg.info("flushed worker event queue: elapsedTime={} s", elapsedFlushTime / 1000.0); if (elapsedFlushTime >= QUEUE_FLUSH_WAITIME){ throw new VCMessagingException("worker event queue flush timed out (>"+QUEUE_FLUSH_WAITIME+" s), considerable message backlog?"); } @@ -514,11 +535,11 @@ private void abortStalledOrUnreferencedSimulationTasks(long messageFlushTimeMS) boolean bUnreferencedSimulation = unreferencedSimKeys.contains(activeJobStatus.getVCSimulationIdentifier().getSimulationKey()); if (bTimedOutSimulation || bUnreferencedSimulation){ - String failureMessage = (bTimedOutSimulation) ? ("failed: timed out") : ("failed: unreferenced simulation"); - lg.info("obsolete job detected at timestampMS="+currentTimeMS+", status=(" + activeJobStatus + ")"); + String failureMessage = (bTimedOutSimulation) ? 
timeOutFailure : unreferencedFailure; + lg.info("obsolete job detected at timestampMS={}, status={}", currentTimeMS, activeJobStatus); //SimulationStateMachine simStateMachine = simDispatcherEngine.getSimulationStateMachine(activeJobStatus.getVCSimulationIdentifier().getSimulationKey(), activeJobStatus.getJobIndex()); // lg.debug(simStateMachine.show()); - VCMongoMessage.sendObsoleteJob(activeJobStatus,failureMessage); + lg.warn("{} {}", activeJobStatus, failureMessage); simDispatcherEngine.onSystemAbort(activeJobStatus, failureMessage, simulationDatabase, clientStatusTopicSession_int); if (activeJobStatus.getSimulationExecutionStatus()!=null && activeJobStatus.getSimulationExecutionStatus().getHtcJobID()!=null){ HtcJobID htcJobId = activeJobStatus.getSimulationExecutionStatus().getHtcJobID(); @@ -547,25 +568,38 @@ public Thread newThread(Runnable r) { } } - /** - * Scheduler constructor comment. - */ - public SimulationDispatcher() throws Exception { + public static SimulationDispatcher simulationDispatcherCreator(SimulationDatabase simulationDatabase, VCMessagingService messagingServiceInternal, + VCMessagingService messagingServiceSim, HtcProxy htcProxy, boolean startDispatcher){ + return new SimulationDispatcher(simulationDatabase, messagingServiceInternal, messagingServiceSim, htcProxy, startDispatcher); + } + + public static SimulationDispatcher simulationDispatcherCreator() throws SQLException, DataAccessException { ConnectionFactory conFactory = DatabaseService.getInstance().createConnectionFactory(); KeyFactory keyFactory = conFactory.getKeyFactory(); DatabaseServerImpl databaseServerImpl = new DatabaseServerImpl(conFactory, keyFactory); AdminDBTopLevel adminDbTopLevel = new AdminDBTopLevel(conFactory); - this.simulationDatabase = new SimulationDatabaseDirect(adminDbTopLevel, databaseServerImpl, true); + SimulationDatabase simulationDatabase = new SimulationDatabaseDirect(adminDbTopLevel, databaseServerImpl, true); - this.vcMessagingService_int = new 
VCMessagingServiceActiveMQ(); + VCMessagingService vcMessagingServiceInternal = new VCMessagingServiceActiveMQ(); String jmshost_int = PropertyLoader.getRequiredProperty(PropertyLoader.jmsIntHostInternal); int jmsport_int = Integer.parseInt(PropertyLoader.getRequiredProperty(PropertyLoader.jmsIntPortInternal)); - this.vcMessagingService_int.setConfiguration(new ServerMessagingDelegate(), jmshost_int, jmsport_int); + vcMessagingServiceInternal.setConfiguration(new ServerMessagingDelegate(), jmshost_int, jmsport_int); - this.vcMessagingService_sim = new VCMessagingServiceActiveMQ(); + VCMessagingService vcMessagingServiceSim = new VCMessagingServiceActiveMQ(); String jmshost_sim = PropertyLoader.getRequiredProperty(PropertyLoader.jmsSimHostInternal); int jmsport_sim = Integer.parseInt(PropertyLoader.getRequiredProperty(PropertyLoader.jmsSimPortInternal)); - this.vcMessagingService_sim.setConfiguration(new ServerMessagingDelegate(), jmshost_sim, jmsport_sim); + vcMessagingServiceSim.setConfiguration(new ServerMessagingDelegate(), jmshost_sim, jmsport_sim); + + return SimulationDispatcher.simulationDispatcherCreator(simulationDatabase, + vcMessagingServiceInternal, vcMessagingServiceSim, SlurmProxy.createRemoteProxy(), true); + } + + private SimulationDispatcher(SimulationDatabase simulationDatabase, VCMessagingService messagingServiceInternal, + VCMessagingService messagingServiceSim, HtcProxy htcProxy, boolean startDispatcher){ + this.simulationDatabase = simulationDatabase; + this.vcMessagingService_int = messagingServiceInternal; + this.vcMessagingService_sim = messagingServiceSim; + QueueListener workerEventListener = new QueueListener() { public void onQueueMessage(VCMessage vcMessage, VCMessageSession session) throws RollbackException { onWorkerEventMessage(vcMessage, session); @@ -579,7 +613,7 @@ public void onQueueMessage(VCMessage vcMessage, VCMessageSession session) throws // // set up consumer for Simulation Request (non-blocking RPC) messages // - 
SimulationService simServiceImpl = new SimulationServiceImpl(); + simServiceImpl = new SimulationServiceImpl(); VCMessageSelector simRequestSelector = null; threadName = "Sim Request Consumer"; @@ -591,17 +625,17 @@ public void onQueueMessage(VCMessage vcMessage, VCMessageSession session) throws this.dispatcherQueueSession_int = this.vcMessagingService_int.createProducerSession(); this.clientStatusTopicSession_int = this.vcMessagingService_int.createProducerSession(); - this.dispatchThread = new DispatchThread(); - this.dispatchThread.start(); this.simMonitorThreadSession_sim = this.vcMessagingService_sim.createProducerSession(); - this.simMonitor = new SimulationMonitor(); - this.htcProxy = SlurmProxy.createRemoteProxy(); - } + this.htcProxy = htcProxy; + // Wait until all resources are created to start separate threads - public void init() { - + this.simMonitor = new SimulationMonitor(); + this.dispatchThread = new DispatchThread(); + if (startDispatcher){ + this.dispatchThread.start(); + } } @@ -617,8 +651,8 @@ private void onWorkerEventMessage(VCMessage vcMessage, VCMessageSession session) if (vcMessage.propertyExists(VCMessagingConstants.MESSAGE_TYPE_PROPERTY) && vcMessage.getStringProperty(VCMessagingConstants.MESSAGE_TYPE_PROPERTY).equals(MessageConstants.MESSAGE_TYPE_FLUSH_VALUE)){ if (simMonitor!=null){ try { - synchronized (simMonitor.notifyObject){ - simMonitor.notifyObject.notify(); + synchronized (simMonitor.monitorNotifyObject){ + simMonitor.monitorNotifyObject.notify(); } }catch (IllegalMonitorStateException e){ lg.warn(e); @@ -657,57 +691,5 @@ private void traceThread(Object source) { " commencing run cycle at " + new SimpleDateFormat("k:m:s").format(new Date( )) ); } } - - /** - * Starts the application. 
- * @param args an array of command-line arguments - */ - public static void main(java.lang.String[] args) { - - if (args.length != 0) { - System.out.println("No arguments expected: " + SimulationDispatcher.class.getName()); - System.exit(1); - } - - try { - OperatingSystemInfo.getInstance(); - PropertyLoader.loadProperties(REQUIRED_SERVICE_PROPERTIES); - - Injector injector = Guice.createInjector(new VCellServerModule()); - - SimulationDispatcher simulationDispatcher = injector.getInstance(SimulationDispatcher.class); - simulationDispatcher.init(); - - } catch (Throwable e) { - lg.error("uncaught exception initializing SimulationDispatcher: "+e.getLocalizedMessage(), e); - System.exit(1); - } - } - - - private static final String REQUIRED_SERVICE_PROPERTIES[] = { - PropertyLoader.vcellServerIDProperty, - PropertyLoader.installationRoot, - PropertyLoader.dbConnectURL, - PropertyLoader.dbDriverName, - PropertyLoader.dbUserid, - PropertyLoader.dbPasswordFile, - PropertyLoader.userTimezone, - PropertyLoader.mongodbHostInternal, - PropertyLoader.mongodbPortInternal, - PropertyLoader.mongodbDatabase, - PropertyLoader.jmsIntHostInternal, - PropertyLoader.jmsIntPortInternal, - PropertyLoader.jmsSimHostInternal, - PropertyLoader.jmsSimPortInternal, - PropertyLoader.jmsUser, - PropertyLoader.jmsPasswordFile, - PropertyLoader.htcUser, - PropertyLoader.jmsBlobMessageUseMongo, - PropertyLoader.maxJobsPerScan, - PropertyLoader.maxOdeJobsPerUser, - PropertyLoader.maxPdeJobsPerUser, - PropertyLoader.slurm_partition - }; } diff --git a/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationDispatcherMain.java b/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationDispatcherMain.java new file mode 100644 index 0000000000..a23560f10b --- /dev/null +++ b/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationDispatcherMain.java @@ -0,0 +1,79 @@ +/* + * Copyright (C) 1999-2011 University of Connecticut Health Center + * + * 
Licensed under the MIT License (the "License"). + * You may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.opensource.org/licenses/mit-license.php + */ + +package cbit.vcell.message.server.dispatcher; + +import cbit.vcell.resource.OperatingSystemInfo; +import cbit.vcell.resource.PropertyLoader; +import com.google.inject.Guice; +import com.google.inject.Injector; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.vcell.dependency.server.VCellServerModule; + +/** + * Insert the type's description here. + * Creation date: (10/18/2001 4:31:11 PM) + * @author: Jim Schaff + */ +public class SimulationDispatcherMain { + public static Logger lg = LogManager.getLogger(SimulationDispatcher.class); + /** + * Starts the application. + * @param args an array of command-line arguments + */ + public static void main(String[] args) { + + if (args.length != 0) { + System.out.println("No arguments expected: " + SimulationDispatcherMain.class.getName()); + System.exit(1); + } + + try { + OperatingSystemInfo.getInstance(); + PropertyLoader.loadProperties(REQUIRED_SERVICE_PROPERTIES); + + Injector injector = Guice.createInjector(new VCellServerModule()); + + SimulationDispatcher simulationDispatcher = SimulationDispatcher.simulationDispatcherCreator(); + injector.injectMembers(simulationDispatcher); + } catch (Throwable e) { + lg.error("uncaught exception initializing SimulationDispatcher: "+e.getLocalizedMessage(), e); + System.exit(1); + } + } + + + private static final String REQUIRED_SERVICE_PROPERTIES[] = { + PropertyLoader.vcellServerIDProperty, + PropertyLoader.installationRoot, + PropertyLoader.dbConnectURL, + PropertyLoader.dbDriverName, + PropertyLoader.dbUserid, + PropertyLoader.dbPasswordFile, + PropertyLoader.userTimezone, + PropertyLoader.mongodbHostInternal, + PropertyLoader.mongodbPortInternal, + PropertyLoader.mongodbDatabase, + 
PropertyLoader.jmsIntHostInternal, + PropertyLoader.jmsIntPortInternal, + PropertyLoader.jmsSimHostInternal, + PropertyLoader.jmsSimPortInternal, + PropertyLoader.jmsUser, + PropertyLoader.jmsPasswordFile, + PropertyLoader.htcUser, + PropertyLoader.jmsBlobMessageUseMongo, + PropertyLoader.maxJobsPerScan, + PropertyLoader.maxOdeJobsPerUser, + PropertyLoader.maxPdeJobsPerUser, + PropertyLoader.slurm_partition + }; + +} From 811a2061c0ca8a95caf4cc651280b87835e030fb Mon Sep 17 00:00:00 2001 From: Ezequiel Valencia Date: Fri, 6 Sep 2024 13:44:34 -0400 Subject: [PATCH 03/16] HTC Memory Request Bug Fixed Previously the amount of memory that could be requested was not bounded, for whatever the user requested was always allowed. Later in the process the requests would be rejected by slurm, but they still were made. Now they are no longer made if the request is above the maximum set. --- .../cbit/vcell/resource/PropertyLoader.java | 3 + .../vcell/message/server/htc/HtcProxy.java | 226 ++---------------- .../server/htc/slurm/SlurmProxyTest.java | 6 +- .../V_REL_274633859_0_0.slurm.sub | 2 +- .../cvode/V_REL_274630682_0_0.slurm.sub | 2 +- .../V_REL_274514696_0_0.slurm.sub | 2 +- .../gibson/V_REL_274635122_0_0.slurm.sub | 2 +- .../V_REL_274641698_0_0.slurm.sub | 2 +- .../langevin/V_REL_274672135_0_0.slurm.sub | 2 +- .../V_REL_274641196_0_0.slurm.sub | 2 +- .../nfsim/V_REL_274642453_0_0.slurm.sub | 2 +- .../V_REL_274631114_0_0.slurm.sub | 2 +- .../smoldyn/V_REL_274630052_0_0.slurm.sub | 2 +- 13 files changed, 33 insertions(+), 222 deletions(-) diff --git a/vcell-core/src/main/java/cbit/vcell/resource/PropertyLoader.java b/vcell-core/src/main/java/cbit/vcell/resource/PropertyLoader.java index a8d6c5243c..6f9db95278 100644 --- a/vcell-core/src/main/java/cbit/vcell/resource/PropertyLoader.java +++ b/vcell-core/src/main/java/cbit/vcell/resource/PropertyLoader.java @@ -80,6 +80,9 @@ public static void setConfigProvider(VCellConfigProvider configProvider) { public static final
String htcPbsHome = record("vcell.htc.pbs.home",ValueType.GEN); public static final String htcSgeHome = record("vcell.htc.sge.home",ValueType.GEN); public static final String htcNodeList = record("vcell.htc.nodelist",ValueType.GEN); + public static final String htcMinMemoryMB = record("vcell.htc.memory.min.mb", ValueType.INT); // minimum memory request in MB, currently 4g + public static final String htcMaxMemoryMB = record("vcell.htc.memory.max.mb", ValueType.INT); // maximum memory request in MB + public static final String htcPowerUserMemoryFloorMB = record("vcell.htc.memory.pu.floor.mb", ValueType.INT); // MIN memory allowed if declared to be a power user, currently 50g (Previously Existing Value) public static final String htc_vcellfvsolver_docker_name = record("vcell.htc.vcellfvsolver.docker.name",ValueType.GEN); public static final String htc_vcellfvsolver_solver_list = record("vcell.htc.vcellfvsolver.solver.list",ValueType.GEN); diff --git a/vcell-server/src/main/java/cbit/vcell/message/server/htc/HtcProxy.java b/vcell-server/src/main/java/cbit/vcell/message/server/htc/HtcProxy.java index d1131ad19a..3cfb5ca9bd 100644 --- a/vcell-server/src/main/java/cbit/vcell/message/server/htc/HtcProxy.java +++ b/vcell-server/src/main/java/cbit/vcell/message/server/htc/HtcProxy.java @@ -219,7 +219,7 @@ public static SimTaskInfo getSimTaskInfoFromSimJobName(String simJobName) throws } public static String createHtcSimJobName(SimTaskInfo simTaskInfo) { - return HTC_SIMULATION_JOB_NAME_PREFIX+simTaskInfo.simId.toString()+"_"+simTaskInfo.jobIndex+"_"+simTaskInfo.taskId; + return simulationJobNamePrefix()+simTaskInfo.simId.toString()+"_"+simTaskInfo.jobIndex+"_"+simTaskInfo.taskId; } public static String toUnixStyleText(String javaString) throws IOException { @@ -242,8 +242,8 @@ public static String toUnixStyleText(String javaString) throws IOException { public abstract String getSubmissionFileExtension(); public static class MemLimitResults { - private static final long 
FALLBACK_MEM_LIMIT_MB=4096; // MAX memory allowed if not set in limitFile, currently 4g - private static final long POWER_USER_MEMORY_FLOOR=51200; // MIN memory allowed if declared to be a power user, currently 50g + private static final long FALLBACK_MEM_LIMIT_MB= Integer.parseInt(PropertyLoader.getRequiredProperty(PropertyLoader.htcMinMemoryMB)); // MAX memory allowed if not set in limitFile, currently 4g + private static final long POWER_USER_MEMORY_FLOOR=Integer.parseInt(PropertyLoader.getRequiredProperty(PropertyLoader.htcPowerUserMemoryFloorMB)); // MIN memory allowed if declared to be a power user, currently 50g private long memLimit; private String memLimitSource; public MemLimitResults(long memLimit, String memLimitSource) { @@ -257,223 +257,27 @@ public long getMemLimit() { public String getMemLimitSource() { return memLimitSource; } - private static MemLimitResults getFallbackMemLimitMB(SolverDescription solverDescription,double estimatedMemSizeMB, boolean isPowerUser) { - Long result = null; - String source = null; - try { - List solverMemLimits = Files.readAllLines(Paths.get(new File("/"+System.getProperty(PropertyLoader.htcLogDirInternal)+"/slurmMinMem.txt").getAbsolutePath())); - for (Iterator iterator = solverMemLimits.iterator(); iterator.hasNext();) { - String solverAndLimit = iterator.next().trim(); - if(solverAndLimit.length()==0 || solverAndLimit.startsWith("//")) { - continue; - } - StringTokenizer st = new StringTokenizer(solverAndLimit,":"); - String limitSolver = st.nextToken(); - if(limitSolver.equalsIgnoreCase("all") && result == null) {//use all if there is not solver matching name in slurmMinMem.txt - result = Long.parseLong(st.nextToken()); - source = "used slurmMinMem.txt all"; - }else if(solverDescription != null && limitSolver.equals(solverDescription.name())) {//use matching solver mem limit from file - result = Long.parseLong(st.nextToken()); - source = "used slurmMinMem.txt "+solverDescription.name(); - break; - } - } - if(result 
== null) {//empty slurmMinMem.txt - result = FALLBACK_MEM_LIMIT_MB; - source = "Empty used FALLBACK_MEM_LIMIT_MB"; - } - } catch (Exception e) { - LG.debug(e); - result = FALLBACK_MEM_LIMIT_MB; - source = "Exception "+e.getClass().getSimpleName()+" used FALLBACK_MEM_LIMIT_MB"; - } - if(estimatedMemSizeMB > result) {//Use estimated if bigger - result = (long)estimatedMemSizeMB; - source = "used Estimated"; + private static MemLimitResults getJobRequestedMemoryLimit(SolverDescription solverDescription, double estimatedMemSizeMB, boolean isPowerUser) { + long batchJobMemoryLimit = FALLBACK_MEM_LIMIT_MB; + String detailedMessage = "default memory limit"; + + if(estimatedMemSizeMB > batchJobMemoryLimit) {//Use estimated if bigger + batchJobMemoryLimit = (long)estimatedMemSizeMB; + detailedMessage = "used Estimated"; } - if (isPowerUser && result < POWER_USER_MEMORY_FLOOR){ - result = (long)POWER_USER_MEMORY_FLOOR; - source = "poweruser's memory override"; + if (isPowerUser && batchJobMemoryLimit < POWER_USER_MEMORY_FLOOR){ + batchJobMemoryLimit = POWER_USER_MEMORY_FLOOR; + detailedMessage = "poweruser's memory override"; } - return new MemLimitResults(result, source); + return new MemLimitResults(batchJobMemoryLimit, detailedMessage); } } public static final boolean bDebugMemLimit = false; public static MemLimitResults getMemoryLimit(String vcellUserid, KeyValue simID, SolverDescription solverDescription ,double estimatedMemSizeMB, boolean isPowerUser) { - return MemLimitResults.getFallbackMemLimitMB(solverDescription, estimatedMemSizeMB*1.5, isPowerUser); -// boolean bUseEstimate = estimatedMemSizeMB >= MemLimitResults.getFallbackMemLimitMB(solverDescription); -// return new MemLimitResults((bUseEstimate?(long)estimatedMemSizeMB:MemLimitResults.getFallbackMemLimitMB(solverDescription)), (bUseEstimate?"used Estimated":"used FALLBACK_MEM_LIMIT")); -// //One of 5 limits are returned (ordered from highest to lowest priority): -// // MemoryMax:PerSimulation Has 
PropertyLoader.simPerUserMemoryLimitFile, specific user AND simID MATCHED in file (userid MemLimitMb simID) -// // MemoryMax:PerUser Has PropertyLoader.simPerUserMemoryLimitFile, specific user (but not simID) MATCHED in file (userid MemLimitMb '*') -// // MemoryMax:PerSolver Has PropertyLoader.simPerUserMemoryLimitFile, specific solverDescription (but not simID or user) MATCHED in file (solverName MemLimitMb '*') -// // MemoryMax:SimulationTask.getEstimatedMemorySizeMB() Has PropertyLoader.simPerUserMemoryLimitFile, no user or sim MATCHED in file ('defaultSimMemoryLimitMb' MemLimitMb '*') -// // estimated > MemoryMax:AllUsersMemLimit -// // MemoryMax:AllUsersMemLimit(defaultSimMemoryLimitMb) Has PropertyLoader.simPerUserMemoryLimitFile, no user or sim MATCHED in file ('defaultSimMemoryLimitMb' MemLimitMb '*') -// // estimated < MemoryMax:AllUsersMemLimit -// // MemoryMax:HtcProxy.MemLimitResults.FALLBACK_MEM_LIMIT No PropertyLoader.simPerUserMemoryLimitFile -// // estimated < FALLBACK -// -// Long defaultSimMemoryLimitMbFromFile = null; -// File memLimitFile = null; -// try { -// //${vcellroot}/docker/swarm/serverconfig-uch.sh->VCELL_SIMDATADIR_EXTERNAL=/share/apps/vcell3/users -// //${vcellroot}/docker/swarm/serverconfig-uch.sh-> VCELL_SIMDATADIR_HOST=/opt/vcelldata/users -// //${vcellroot}/docker/swarm/docker-compose.yml-> Volume map "${VCELL_SIMDATADIR_HOST}:/simdata" -// Long perUserMemMax = null; -// Long perSimMemMax = null; -// Long perSolverMax = null; -// String memLimitFileDirVal = System.getProperty(PropertyLoader.primarySimDataDirInternalProperty); -// String memLimitFileVal = System.getProperty(PropertyLoader.simPerUserMemoryLimitFile); -// if(memLimitFileDirVal != null && memLimitFileVal != null) { -// memLimitFile = new File(memLimitFileDirVal,memLimitFileVal); -// } -// if(memLimitFile != null && memLimitFile.exists()) { -// List perUserLimits = Files.readAllLines(Paths.get(memLimitFile.getAbsolutePath())); -// for (Iterator iterator = 
perUserLimits.iterator(); iterator.hasNext();) { -// String userAndLimit = iterator.next().trim(); -// if(userAndLimit.length()==0 || userAndLimit.startsWith("//")) { -// if(bDebugMemLimit){LG.trace("-----skipped '"+userAndLimit+"'");} -// continue; -// } -//// LG.trace("-----"+userAndLimit); -// -// StringTokenizer st = new StringTokenizer(userAndLimit); -// String limitUserid = st.nextToken(); -// if(limitUserid.equals(vcellUserid) || (solverDescription != null && limitUserid.equals(solverDescription.name()))) {//check user -// long memLimit = 0; -// try { -// memLimit = Long.parseLong(st.nextToken()); -// } catch (Exception e) { -// if(bDebugMemLimit){LG.debug("-----ERROR '"+userAndLimit+"' token memlimit not parsed");} -// //bad line in limit file, continue processing other lines -// //lg.debug(e); -// continue; -// } -// if(solverDescription != null && limitUserid.equals(solverDescription.name())) { -// perSolverMax = memLimit; -// if(bDebugMemLimit){LG.debug("-----"+"MATCH Solver "+userAndLimit);} -// continue; -// } -// //get simid -// String simSpecifier = null; -// try { -// simSpecifier = st.nextToken(); -// //check token is '*' or long -// if(!simSpecifier.equals("*") && Long.valueOf(simSpecifier).longValue() < 0 ) { -// throw new Exception(" token 'simSpecifier' expected to be '*' or simID"); -// } -// } catch (Exception e) { -// if(bDebugMemLimit){LG.debug("-----ERROR '"+userAndLimit+"' "+e.getClass().getName()+" "+e.getMessage());} -// //bad line in limit file, continue processing other lines -// //lg.debug(e); -// continue; -// } -// // * means all sims for that user, don't set if sim specific limit is already set -// if(simSpecifier.equals("*") && perSimMemMax == null) { -// perUserMemMax = memLimit;// use this unless overriden by specific simid -// if(bDebugMemLimit){LG.debug("-----"+"MATCH USER "+userAndLimit);} -// } -// //Set sim specific limit, set even if * limit has been set -// if(simID != null && simID.toString().equals(simSpecifier)) { -// 
perSimMemMax = memLimit;// use sim limit -// if(bDebugMemLimit){LG.debug("-----"+"MATCH SIM "+userAndLimit);} -// } -// }else if(limitUserid.equals("defaultSimMemoryLimitMb")) {//Master sim mem limit -// try { -// defaultSimMemoryLimitMbFromFile = Long.parseLong(st.nextToken()); -// if(bDebugMemLimit){LG.debug("-----"+"MATCH DEFAULT "+userAndLimit);} -// } catch (Exception e) { -// if(bDebugMemLimit){LG.debug("-----ERROR '"+userAndLimit+"' "+e.getClass().getName()+" "+e.getMessage());} -// //bad line in limit file, continue processing other lines -// //LG.debug(e); -// continue; -// } -// }else { -// if(bDebugMemLimit){LG.debug("-----"+"NO MATCH "+userAndLimit);} -// } -// } -// if(perUserMemMax != null || perSimMemMax != null) { -// long finalMax = (perSimMemMax!=null?perSimMemMax:perUserMemMax); -// if(bDebugMemLimit){LG.debug("Set memory limit for user '"+vcellUserid+"' to "+finalMax + (perSimMemMax!=null?" for simID="+simID:""));} -// return new MemLimitResults(finalMax, -// (perSimMemMax!=null? 
-// "MemoryMax(FILE PerSimulation):"+simID+",User='"+vcellUserid+"' from "+memLimitFile.getAbsolutePath(): -// "MemoryMax(FILE PerUser):'"+vcellUserid+"' from "+memLimitFile.getAbsolutePath())); -// }else if(perSolverMax != null) { -// if(perSolverMax == 0) {//Use estimated size always if solver had 0 for memory limit -// return new MemLimitResults( -// Math.max((long)Math.ceil(estimatedMemSizeMB*1.5), -// (defaultSimMemoryLimitMbFromFile!=null?defaultSimMemoryLimitMbFromFile:MemLimitResults.FALLBACK_MEM_LIMIT_MB)), -// "MemoryMax(FILE PerSolver ESTIMATED):'"+solverDescription.name()+"' from "+memLimitFile.getAbsolutePath()); -// }else { -// return new MemLimitResults(perSolverMax, "MemoryMax(FILE PerSolver):'"+solverDescription.name()+"' from "+memLimitFile.getAbsolutePath()); -// } -// } -// }else { -// if(bDebugMemLimit){LG.debug("-----MemLimitFile "+(memLimitFile==null?"not defined":memLimitFile.getAbsolutePath()+" not exist"));} -// } -// } catch (Exception e) { -// //ignore, try defaults -// LG.error(e); -// } -//// long estimatedMemSizeMBL = (long)Math.ceil(estimatedMemSizeMB*1.5); -// boolean bHasMemLimitFile = defaultSimMemoryLimitMbFromFile!=null; -// long maxAllowedMem = (bHasMemLimitFile?defaultSimMemoryLimitMbFromFile:MemLimitResults.FALLBACK_MEM_LIMIT_MB); -//// boolean bUseEstimated = (estimatedMemSizeMBL <= maxAllowedMem); -//// return new MemLimitResults(maxAllowedMem, -//// (bUseEstimated? -//// "MemoryMax(ESTIMATED):SimulationTask.getEstimatedMemorySizeMB()="+estimatedMemSizeMBL: -//// (bHasMemLimitFile? -//// "MemoryMax(FILE AllUsers):AllUsersMemLimit(defaultSimMemoryLimitMb) from "+memLimitFile.getAbsolutePath(): -//// "MemoryMax(HARDCODE):HtcProxy.MemLimitResults.FALLBACK_MEM_LIMIT_MB"))); -// return new MemLimitResults(maxAllowedMem, -// (bHasMemLimitFile? 
-// "MemoryMax(FILE AllUsers):AllUsersMemLimit(defaultSimMemoryLimitMb) from "+memLimitFile.getAbsolutePath(): -// "MemoryMax(HARDCODE):HtcProxy.MemLimitResults.FALLBACK_MEM_LIMIT_MB")); + return MemLimitResults.getJobRequestedMemoryLimit(solverDescription, estimatedMemSizeMB*1.5, isPowerUser); } -// public static boolean isStochMultiTrial(SimulationTask simTask) { -// return simTask.getSimulationJob().getSimulation().getSolverTaskDescription().getSolverDescription() == SolverDescription.StochGibson && -// simTask.getSimulationJob().getSimulation().getSolverTaskDescription().getStochOpt() != null && -// !simTask.getSimulationJob().getSimulation().getSolverTaskDescription().getStochOpt().isHistogram() && -// simTask.getSimulationJob().getSimulation().getSolverTaskDescription().getStochOpt().getNumOfTrials() > 1; -// -// } } - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/vcell-server/src/test/java/cbit/vcell/message/server/htc/slurm/SlurmProxyTest.java b/vcell-server/src/test/java/cbit/vcell/message/server/htc/slurm/SlurmProxyTest.java index 7b7955546f..dda5334a7a 100644 --- a/vcell-server/src/test/java/cbit/vcell/message/server/htc/slurm/SlurmProxyTest.java +++ b/vcell-server/src/test/java/cbit/vcell/message/server/htc/slurm/SlurmProxyTest.java @@ -79,7 +79,11 @@ public void setup() setProperty(PropertyLoader.htc_vcellsolvers_docker_name, "ghcr.io/virtualcell/vcell-solvers:v0.8.1.2"); setProperty(PropertyLoader.htc_vcellbatch_solver_list, "RungeKuttaFehlberg,HybridMilstein,StochGibson,Langevin,AdamsMoulton,Smoldyn,MovingBoundary,SundialsPDE,CombinedSundials,NFSim"); setProperty(PropertyLoader.htc_vcellbatch_docker_name, "ghcr.io/virtualcell/vcell-batch:7.6.0.43"); - } + + setProperty(PropertyLoader.htcPowerUserMemoryFloorMB, "51200"); + setProperty(PropertyLoader.htcMinMemoryMB, "4096"); + setProperty(PropertyLoader.htcMaxMemoryMB, "81920"); + } @AfterEach public void teardown() { diff --git 
a/vcell-server/src/test/resources/slurm_fixtures/adams_moulton/V_REL_274633859_0_0.slurm.sub b/vcell-server/src/test/resources/slurm_fixtures/adams_moulton/V_REL_274633859_0_0.slurm.sub index 009d8ed87a..915146b753 100644 --- a/vcell-server/src/test/resources/slurm_fixtures/adams_moulton/V_REL_274633859_0_0.slurm.sub +++ b/vcell-server/src/test/resources/slurm_fixtures/adams_moulton/V_REL_274633859_0_0.slurm.sub @@ -8,7 +8,7 @@ #SBATCH --mem=4096M #SBATCH --no-kill #SBATCH --no-requeue -# VCell SlurmProxy memory limit source='Exception NoSuchFileException used FALLBACK_MEM_LIMIT_MB' +# VCell SlurmProxy memory limit source=default memory limit TMPDIR=/scratch/vcell diff --git a/vcell-server/src/test/resources/slurm_fixtures/cvode/V_REL_274630682_0_0.slurm.sub b/vcell-server/src/test/resources/slurm_fixtures/cvode/V_REL_274630682_0_0.slurm.sub index 3e4a33399d..8ff7e5e13f 100644 --- a/vcell-server/src/test/resources/slurm_fixtures/cvode/V_REL_274630682_0_0.slurm.sub +++ b/vcell-server/src/test/resources/slurm_fixtures/cvode/V_REL_274630682_0_0.slurm.sub @@ -8,7 +8,7 @@ #SBATCH --mem=4096M #SBATCH --no-kill #SBATCH --no-requeue -# VCell SlurmProxy memory limit source='Exception NoSuchFileException used FALLBACK_MEM_LIMIT_MB' +# VCell SlurmProxy memory limit source=default memory limit TMPDIR=/scratch/vcell diff --git a/vcell-server/src/test/resources/slurm_fixtures/finite_volume/V_REL_274514696_0_0.slurm.sub b/vcell-server/src/test/resources/slurm_fixtures/finite_volume/V_REL_274514696_0_0.slurm.sub index 4ac98ad675..6390257360 100644 --- a/vcell-server/src/test/resources/slurm_fixtures/finite_volume/V_REL_274514696_0_0.slurm.sub +++ b/vcell-server/src/test/resources/slurm_fixtures/finite_volume/V_REL_274514696_0_0.slurm.sub @@ -8,7 +8,7 @@ #SBATCH --mem=4096M #SBATCH --no-kill #SBATCH --no-requeue -# VCell SlurmProxy memory limit source='Exception NoSuchFileException used FALLBACK_MEM_LIMIT_MB' +# VCell SlurmProxy memory limit source=default memory limit 
TMPDIR=/scratch/vcell diff --git a/vcell-server/src/test/resources/slurm_fixtures/gibson/V_REL_274635122_0_0.slurm.sub b/vcell-server/src/test/resources/slurm_fixtures/gibson/V_REL_274635122_0_0.slurm.sub index 742352b14c..1d84678455 100644 --- a/vcell-server/src/test/resources/slurm_fixtures/gibson/V_REL_274635122_0_0.slurm.sub +++ b/vcell-server/src/test/resources/slurm_fixtures/gibson/V_REL_274635122_0_0.slurm.sub @@ -8,7 +8,7 @@ #SBATCH --mem=4096M #SBATCH --no-kill #SBATCH --no-requeue -# VCell SlurmProxy memory limit source='Exception NoSuchFileException used FALLBACK_MEM_LIMIT_MB' +# VCell SlurmProxy memory limit source=default memory limit TMPDIR=/scratch/vcell diff --git a/vcell-server/src/test/resources/slurm_fixtures/gibson_milstein/V_REL_274641698_0_0.slurm.sub b/vcell-server/src/test/resources/slurm_fixtures/gibson_milstein/V_REL_274641698_0_0.slurm.sub index c44c6bbb6d..a55a11e7b4 100644 --- a/vcell-server/src/test/resources/slurm_fixtures/gibson_milstein/V_REL_274641698_0_0.slurm.sub +++ b/vcell-server/src/test/resources/slurm_fixtures/gibson_milstein/V_REL_274641698_0_0.slurm.sub @@ -8,7 +8,7 @@ #SBATCH --mem=4096M #SBATCH --no-kill #SBATCH --no-requeue -# VCell SlurmProxy memory limit source='Exception NoSuchFileException used FALLBACK_MEM_LIMIT_MB' +# VCell SlurmProxy memory limit source=default memory limit TMPDIR=/scratch/vcell diff --git a/vcell-server/src/test/resources/slurm_fixtures/langevin/V_REL_274672135_0_0.slurm.sub b/vcell-server/src/test/resources/slurm_fixtures/langevin/V_REL_274672135_0_0.slurm.sub index f61745b830..1944ad5f24 100644 --- a/vcell-server/src/test/resources/slurm_fixtures/langevin/V_REL_274672135_0_0.slurm.sub +++ b/vcell-server/src/test/resources/slurm_fixtures/langevin/V_REL_274672135_0_0.slurm.sub @@ -8,7 +8,7 @@ #SBATCH --mem=4096M #SBATCH --no-kill #SBATCH --no-requeue -# VCell SlurmProxy memory limit source='Exception NoSuchFileException used FALLBACK_MEM_LIMIT_MB' +# VCell SlurmProxy memory limit source=default 
memory limit TMPDIR=/scratch/vcell diff --git a/vcell-server/src/test/resources/slurm_fixtures/moving_boundary/V_REL_274641196_0_0.slurm.sub b/vcell-server/src/test/resources/slurm_fixtures/moving_boundary/V_REL_274641196_0_0.slurm.sub index 7ac68f8a5e..6814f4c465 100644 --- a/vcell-server/src/test/resources/slurm_fixtures/moving_boundary/V_REL_274641196_0_0.slurm.sub +++ b/vcell-server/src/test/resources/slurm_fixtures/moving_boundary/V_REL_274641196_0_0.slurm.sub @@ -8,7 +8,7 @@ #SBATCH --mem=4096M #SBATCH --no-kill #SBATCH --no-requeue -# VCell SlurmProxy memory limit source='Exception NoSuchFileException used FALLBACK_MEM_LIMIT_MB' +# VCell SlurmProxy memory limit source=default memory limit TMPDIR=/scratch/vcell diff --git a/vcell-server/src/test/resources/slurm_fixtures/nfsim/V_REL_274642453_0_0.slurm.sub b/vcell-server/src/test/resources/slurm_fixtures/nfsim/V_REL_274642453_0_0.slurm.sub index a16c0ffde0..5066f18768 100644 --- a/vcell-server/src/test/resources/slurm_fixtures/nfsim/V_REL_274642453_0_0.slurm.sub +++ b/vcell-server/src/test/resources/slurm_fixtures/nfsim/V_REL_274642453_0_0.slurm.sub @@ -8,7 +8,7 @@ #SBATCH --mem=4096M #SBATCH --no-kill #SBATCH --no-requeue -# VCell SlurmProxy memory limit source='Exception NoSuchFileException used FALLBACK_MEM_LIMIT_MB' +# VCell SlurmProxy memory limit source=default memory limit TMPDIR=/scratch/vcell diff --git a/vcell-server/src/test/resources/slurm_fixtures/runge_kutta_fehlberg/V_REL_274631114_0_0.slurm.sub b/vcell-server/src/test/resources/slurm_fixtures/runge_kutta_fehlberg/V_REL_274631114_0_0.slurm.sub index ada20fa438..c60f96d905 100644 --- a/vcell-server/src/test/resources/slurm_fixtures/runge_kutta_fehlberg/V_REL_274631114_0_0.slurm.sub +++ b/vcell-server/src/test/resources/slurm_fixtures/runge_kutta_fehlberg/V_REL_274631114_0_0.slurm.sub @@ -8,7 +8,7 @@ #SBATCH --mem=4096M #SBATCH --no-kill #SBATCH --no-requeue -# VCell SlurmProxy memory limit source='Exception NoSuchFileException used 
FALLBACK_MEM_LIMIT_MB' +# VCell SlurmProxy memory limit source=default memory limit TMPDIR=/scratch/vcell diff --git a/vcell-server/src/test/resources/slurm_fixtures/smoldyn/V_REL_274630052_0_0.slurm.sub b/vcell-server/src/test/resources/slurm_fixtures/smoldyn/V_REL_274630052_0_0.slurm.sub index 8fb09c12e8..ae6b433cb7 100644 --- a/vcell-server/src/test/resources/slurm_fixtures/smoldyn/V_REL_274630052_0_0.slurm.sub +++ b/vcell-server/src/test/resources/slurm_fixtures/smoldyn/V_REL_274630052_0_0.slurm.sub @@ -8,7 +8,7 @@ #SBATCH --mem=4096M #SBATCH --no-kill #SBATCH --no-requeue -# VCell SlurmProxy memory limit source='Exception NoSuchFileException used FALLBACK_MEM_LIMIT_MB' +# VCell SlurmProxy memory limit source=default memory limit TMPDIR=/scratch/vcell From 189dbc4880e9b55bce90a075e6512eab00ba3d72 Mon Sep 17 00:00:00 2001 From: Ezequiel Valencia Date: Fri, 6 Sep 2024 14:10:31 -0400 Subject: [PATCH 04/16] Simulation Control Tests --- .../dispatcher/DispatcherTestUtils.java | 165 +++++++++++ .../server/dispatcher/MockHtcProxy.java | 94 +++++++ .../dispatcher/MockMessagingService.java | 52 ++++ .../server/dispatcher/MockSimulationDB.java | 218 +++++++++++++++ .../dispatcher/MockVCMessageSession.java | 106 +++++++ .../dispatcher/SimulationDispatcherTest.java | 209 ++++++++++++++ .../SimulationStateMachineTest.java | 259 ++++++++++++++++++ .../src/test/resources/log4j2-test.xml | 21 ++ 8 files changed, 1124 insertions(+) create mode 100644 vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/DispatcherTestUtils.java create mode 100644 vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockHtcProxy.java create mode 100644 vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockMessagingService.java create mode 100644 vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockSimulationDB.java create mode 100644 vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockVCMessageSession.java create mode 100644 
vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/SimulationDispatcherTest.java create mode 100644 vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/SimulationStateMachineTest.java create mode 100644 vcell-server/src/test/resources/log4j2-test.xml diff --git a/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/DispatcherTestUtils.java b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/DispatcherTestUtils.java new file mode 100644 index 0000000000..2167e1f5ee --- /dev/null +++ b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/DispatcherTestUtils.java @@ -0,0 +1,165 @@ +package cbit.vcell.message.server.dispatcher; + +import cbit.vcell.geometry.Geometry; +import cbit.vcell.mapping.MathSymbolMapping; +import cbit.vcell.math.*; +import cbit.vcell.mathmodel.MathModel; +import cbit.vcell.parser.ExpressionBindingException; +import cbit.vcell.resource.PropertyLoader; +import cbit.vcell.server.HtcJobID; +import cbit.vcell.server.SimulationExecutionStatus; +import cbit.vcell.server.SimulationJobStatus; +import cbit.vcell.solver.MeshSpecification; +import cbit.vcell.solver.Simulation; +import cbit.vcell.solver.VCSimulationIdentifier; +import cbit.vcell.solver.server.SimulationMessage; +import org.vcell.util.DataAccessException; +import org.vcell.util.ISize; +import org.vcell.util.document.*; + +import java.beans.PropertyVetoException; +import java.sql.SQLException; +import java.time.Instant; +import java.util.Date; + +public class DispatcherTestUtils { + private static String previousServerID = ""; + private static String previousHtcMax = ""; + private static String previousHtcMin = ""; + private static String previousHtcPowerFloor = ""; + private static String previousMongoBlob = ""; + private static String previousJMSIntHostProperty = ""; + private static String previousJMSIntPortProperty = ""; + private static String previousSimJMSIntHostProperty = ""; + private static String 
previousSimJMSIntPortProperty = ""; + private static String previousHTCHost = ""; + private static String previousHTCUser = ""; + private static String previousHTCUserKeyFile = ""; + private static String previousMaxJobsPerScan = ""; + private static String previousOdeJobsPerUser = ""; + private static String previousPdeJobsPerUser = ""; + + public static final VCellServerID testVCellServerID = VCellServerID.getServerID("test"); + public static final MockVCMessageSession testMessageSession = new MockVCMessageSession(); + public static final int jobIndex = 0; + public static final int taskID = 0; + public static final KeyValue simKey = new KeyValue("0"); + public static User alice = new User("Alice", new KeyValue("0")); + public static User bob = new User("Bob", new KeyValue("1")); + public static final VCSimulationIdentifier simID = new VCSimulationIdentifier(simKey, alice); + public static final HtcJobID htcJobID = new HtcJobID("2", HtcJobID.BatchSystemType.SLURM); + + public static void setRequiredProperties(){ + previousServerID = PropertyLoader.getProperty(PropertyLoader.vcellServerIDProperty, ""); + PropertyLoader.setProperty(PropertyLoader.vcellServerIDProperty, testVCellServerID.toString()); + + previousHtcMax = PropertyLoader.getProperty(PropertyLoader.htcMaxMemoryMB, ""); + PropertyLoader.setProperty(PropertyLoader.htcMaxMemoryMB, "4096"); + + previousHtcMin = PropertyLoader.getProperty(PropertyLoader.htcMinMemoryMB, ""); + PropertyLoader.setProperty(PropertyLoader.htcMinMemoryMB, "1024"); + + previousHtcPowerFloor = PropertyLoader.getProperty(PropertyLoader.htcPowerUserMemoryFloorMB, ""); + PropertyLoader.setProperty(PropertyLoader.htcPowerUserMemoryFloorMB, "51200"); + + previousMongoBlob = PropertyLoader.getProperty(PropertyLoader.jmsBlobMessageUseMongo, ""); + PropertyLoader.setProperty(PropertyLoader.jmsBlobMessageUseMongo, ""); + + previousJMSIntHostProperty = PropertyLoader.getProperty(PropertyLoader.jmsIntHostInternal, ""); + 
PropertyLoader.setProperty(PropertyLoader.jmsIntHostInternal, "host"); + + previousJMSIntPortProperty = PropertyLoader.getProperty(PropertyLoader.jmsIntPortInternal, ""); + PropertyLoader.setProperty(PropertyLoader.jmsIntPortInternal, "80"); + + previousSimJMSIntHostProperty = PropertyLoader.getProperty(PropertyLoader.jmsSimHostInternal, ""); + PropertyLoader.setProperty(PropertyLoader.jmsSimHostInternal, "host"); + + previousSimJMSIntPortProperty = PropertyLoader.getProperty(PropertyLoader.jmsSimPortInternal, ""); + PropertyLoader.setProperty(PropertyLoader.jmsSimPortInternal, "80"); + + previousHTCHost = PropertyLoader.getProperty(PropertyLoader.htcHosts, ""); + PropertyLoader.setProperty(PropertyLoader.htcHosts, "host"); + + previousHTCUser = PropertyLoader.getProperty(PropertyLoader.htcUser, ""); + PropertyLoader.setProperty(PropertyLoader.htcUser, "user"); + + previousHTCUserKeyFile = PropertyLoader.getProperty(PropertyLoader.htcUserKeyFile, ""); + PropertyLoader.setProperty(PropertyLoader.htcUserKeyFile, "keyFile"); + + previousMaxJobsPerScan = PropertyLoader.getProperty(PropertyLoader.maxJobsPerScan, ""); + PropertyLoader.setProperty(PropertyLoader.maxJobsPerScan, "100"); + + previousPdeJobsPerUser = PropertyLoader.getProperty(PropertyLoader.maxPdeJobsPerUser, ""); + PropertyLoader.setProperty(PropertyLoader.maxPdeJobsPerUser, "100"); + + previousOdeJobsPerUser = PropertyLoader.getProperty(PropertyLoader.maxOdeJobsPerUser, ""); + PropertyLoader.setProperty(PropertyLoader.maxOdeJobsPerUser, "100"); + + PropertyLoader.setProperty(PropertyLoader.mongodbDatabase, "fakehost"); + } + + public static void restoreRequiredProperties(){ + PropertyLoader.setProperty(PropertyLoader.vcellServerIDProperty, previousServerID); + PropertyLoader.setProperty(PropertyLoader.htcMaxMemoryMB, previousHtcMax); + PropertyLoader.setProperty(PropertyLoader.htcMinMemoryMB, previousHtcMin); + PropertyLoader.setProperty(PropertyLoader.htcPowerUserMemoryFloorMB, previousHtcPowerFloor); + 
PropertyLoader.setProperty(PropertyLoader.jmsBlobMessageUseMongo, previousMongoBlob); + PropertyLoader.setProperty(PropertyLoader.jmsIntPortInternal, previousJMSIntPortProperty); + PropertyLoader.setProperty(PropertyLoader.jmsIntHostInternal, previousJMSIntHostProperty); + PropertyLoader.setProperty(PropertyLoader.jmsSimPortInternal, previousSimJMSIntPortProperty); + PropertyLoader.setProperty(PropertyLoader.jmsSimHostInternal, previousSimJMSIntHostProperty); + PropertyLoader.setProperty(PropertyLoader.htcHosts, previousHTCHost); + PropertyLoader.setProperty(PropertyLoader.htcUser, previousHTCUser); + PropertyLoader.setProperty(PropertyLoader.htcUserKeyFile, previousHTCUserKeyFile); + PropertyLoader.setProperty(PropertyLoader.maxJobsPerScan, previousMaxJobsPerScan); + PropertyLoader.setProperty(PropertyLoader.maxOdeJobsPerUser, previousOdeJobsPerUser); + PropertyLoader.setProperty(PropertyLoader.maxPdeJobsPerUser, previousPdeJobsPerUser); + } + + public static Simulation createMockSimulation(int iSizeX, int iSizeY, int iSizeZ) throws PropertyVetoException, MathException, ExpressionBindingException { /* Builds a temporary-version Simulation over a "Test" MathDescription with two volume variables and a mesh sampled at the requested size. */ + VolVariable volVariable = new VolVariable("t", new Variable.Domain(new CompartmentSubDomain("t", 1))); + VolVariable volVariable2 = new VolVariable("b", new Variable.Domain(new CompartmentSubDomain("b", 2))); + MathSymbolMapping mathSymbolMapping = new MathSymbolMapping(); + Geometry geometry = new Geometry("T", 3); + MathModel mathModel = new MathModel(new Version("Test", alice)); + MathDescription mathDescription = new MathDescription("Test", mathSymbolMapping); + mathDescription.setGeometry(geometry); /* reuse the Geometry the MeshSpecification is built from, so both refer to one object */ + Simulation simulation = new Simulation(SimulationVersion.createTempSimulationVersion(), + mathDescription, mathModel); + MeshSpecification meshSpecification = new MeshSpecification(geometry); + meshSpecification.setSamplingSize(new ISize(iSizeX, iSizeY, iSizeZ)); + simulation.setMeshSpecification(meshSpecification); + mathDescription.setAllVariables(new
Variable[]{volVariable, volVariable2}); + return simulation; + } + + public static void insertOrUpdateStatus(KeyValue simKey, int jobIndex, int taskID, User user, SimulationJobStatus.SchedulerStatus status, SimulationDatabase simulationDB) throws SQLException, DataAccessException { /* Writes a status stamped with the current time: inserted when the DB reports none for this sim/job, otherwise sent as an update. */ + SimulationJobStatus jobStatus = simulationDB.getLatestSimulationJobStatus(simKey, jobIndex); + VCSimulationIdentifier simID = new VCSimulationIdentifier(simKey, user); + SimulationJobStatus simulationJobStatus = new SimulationJobStatus(testVCellServerID, simID, jobIndex, Date.from(Instant.now()), status, taskID, + SimulationMessage.workerAccepted("accepted"), null, + new SimulationExecutionStatus(Date.from(Instant.now()), "", + Date.from(Instant.now()), Date.from(Instant.now()), false, htcJobID)); + if (jobStatus == null){ + simulationDB.insertSimulationJobStatus(simulationJobStatus); + } else { + simulationDB.updateSimulationJobStatus(simulationJobStatus); + } + } + + public static void insertOrUpdateStatus(KeyValue simKey, int jobIndex, int taskID, User user, SimulationDatabase simulationDB) throws SQLException, DataAccessException { + insertOrUpdateStatus(simKey, jobIndex, taskID, user, SimulationJobStatus.SchedulerStatus.RUNNING, simulationDB); + } + + public static void insertOrUpdateStatus(SimulationDatabase simulationDatabase, SimulationJobStatus.SchedulerStatus status) throws SQLException, DataAccessException { + insertOrUpdateStatus(simKey, jobIndex, taskID, alice, status, simulationDatabase); + } + + /** + Defaults to a running status.
+ */ + public static void insertOrUpdateStatus(SimulationDatabase simulationDatabase) throws SQLException, DataAccessException { + insertOrUpdateStatus(simKey, jobIndex, taskID, alice, simulationDatabase); + } + +} diff --git a/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockHtcProxy.java b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockHtcProxy.java new file mode 100644 index 0000000000..10a419ab69 --- /dev/null +++ b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockHtcProxy.java @@ -0,0 +1,94 @@ +package cbit.vcell.message.server.dispatcher; + +import cbit.vcell.message.server.cmd.CommandService; +import cbit.vcell.message.server.htc.HtcException; +import cbit.vcell.message.server.htc.HtcJobNotFoundException; +import cbit.vcell.message.server.htc.HtcJobStatus; +import cbit.vcell.message.server.htc.HtcProxy; +import cbit.vcell.message.server.htc.slurm.SlurmJobStatus; +import cbit.vcell.messaging.server.SimulationTask; +import cbit.vcell.server.HtcJobID; +import cbit.vcell.server.SimulationJobStatus; +import cbit.vcell.simdata.PortableCommand; +import cbit.vcell.solvers.ExecutableCommand; +import org.vcell.util.DataAccessException; +import org.vcell.util.exe.ExecutableException; + +import java.io.File; +import java.io.IOException; +import java.sql.SQLException; +import java.util.*; + +public class MockHtcProxy extends HtcProxy { + private final MockSimulationDB mockSimulationDB; + public MockHtcProxy(CommandService commandService, String htcUser, MockSimulationDB mockSimulationDB) { + super(commandService, htcUser); + this.mockSimulationDB = mockSimulationDB; + } + public final ArrayList jobsKilledSafely = new ArrayList<>(); + public final ArrayList jobsKilledUnsafely = new ArrayList<>(); + + @Override + public void killJobSafe(HtcJobInfo htcJobInfo) throws ExecutableException, HtcJobNotFoundException, HtcException { + jobsKilledSafely.add(htcJobInfo); + } + + @Override + public void killJobUnsafe(HtcJobID 
htcJobId) throws ExecutableException, HtcJobNotFoundException, HtcException { + jobsKilledUnsafely.add(htcJobId); + } + + @Override + public void killJobs(String htcJobSubstring) throws ExecutableException, HtcJobNotFoundException, HtcException { + + } + + @Override + public Map getJobStatus(List requestedHtcJobInfos) throws ExecutableException, IOException { + return Map.of(); + } + + @Override + public HtcJobID submitJob(String jobName, File sub_file_internal, File sub_file_external, ExecutableCommand.Container commandSet, int ncpus, double memSize, Collection postProcessingCommands, SimulationTask simTask, File primaryUserDirExternal) throws ExecutableException { + return null; + } + + @Override + public HtcJobID submitOptimizationJob(String jobName, File sub_file_internal, File sub_file_external, File optProblemInputFile, File optProblemOutputFile, File optReportFile) throws ExecutableException { + return null; + } + + @Override + public HtcProxy cloneThreadsafe() { + return null; + } + + @Override + public Map getRunningJobs() throws ExecutableException, IOException { + HashMap map = new HashMap<>(); + SimulationJobStatus[] statuses; + try { + statuses = mockSimulationDB.getActiveJobs(DispatcherTestUtils.testVCellServerID); + } catch (DataAccessException | SQLException e) { + throw new RuntimeException(e); + } + for (SimulationJobStatus status : statuses){ + if (status.getSchedulerStatus().isRunning()){ + HtcJobInfo jobInfo = new HtcJobInfo(DispatcherTestUtils.htcJobID, HtcProxy.createHtcSimJobName(new SimTaskInfo(status.getVCSimulationIdentifier().getSimulationKey(), status.getJobIndex(), status.getTaskID()))); + HtcJobStatus jobStatus = new HtcJobStatus(SlurmJobStatus.RUNNING); + map.put(jobInfo, jobStatus); + } + } + return map; + } + + @Override + public PartitionStatistics getPartitionStatistics() throws HtcException, ExecutableException, IOException { + return new PartitionStatistics(1, 20, 100); + } + + @Override + public String 
getSubmissionFileExtension() { + return ""; + } +} diff --git a/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockMessagingService.java b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockMessagingService.java new file mode 100644 index 0000000000..b2a634c996 --- /dev/null +++ b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockMessagingService.java @@ -0,0 +1,52 @@ +package cbit.vcell.message.server.dispatcher; + +import cbit.vcell.message.*; + +import java.util.ArrayList; +import java.util.List; + +public class MockMessagingService implements VCMessagingService { + + public ArrayList messagingConsumers = new ArrayList<>(); + public final MockVCMessageSession mockVCMessageSession = new MockVCMessageSession(); + + @Override + public VCMessageSession createProducerSession() { + return mockVCMessageSession; + } + + @Override + public void addMessageConsumer(VCMessagingConsumer vcMessagingConsumer) { + messagingConsumers.add(vcMessagingConsumer); + } + + @Override + public void removeMessageConsumer(VCMessagingConsumer vcMessagingConsumer) { + + } + + @Override + public List getMessageConsumers() { + return List.of(); + } + + @Override + public void close() throws VCMessagingException { + + } + + @Override + public VCMessageSelector createSelector(String clientMessageFilter) { + return null; + } + + @Override + public VCMessagingDelegate getDelegate() { + return null; + } + + @Override + public void setConfiguration(VCMessagingDelegate delegate, String jmshost, int jmsport) { + + } +} diff --git a/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockSimulationDB.java b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockSimulationDB.java new file mode 100644 index 0000000000..3069df8d26 --- /dev/null +++ b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockSimulationDB.java @@ -0,0 +1,218 @@ +package cbit.vcell.message.server.dispatcher; + +import 
cbit.vcell.field.FieldDataIdentifierSpec; +import cbit.vcell.messaging.db.SimulationRequirements; +import cbit.vcell.server.*; +import cbit.vcell.solver.Simulation; +import cbit.vcell.solver.SimulationInfo; +import org.vcell.util.DataAccessException; +import org.vcell.util.ObjectNotFoundException; +import org.vcell.util.document.*; + +import java.math.BigDecimal; +import java.sql.SQLException; +import java.time.Instant; +import java.util.*; + +public class MockSimulationDB implements SimulationDatabase{ + + private HashMap<String, ArrayList<SimulationJobStatus>> dbTable = new HashMap<>(); /* simulation key string -> every status stored for that simulation */ + + public static User.SpecialUser specialAdmin = new User.SpecialUser("Tom", new KeyValue("999"), new User.SPECIAL_CLAIM[User.SPECIAL_CLAIM.admins.ordinal()]); /* NOTE(review): the claims array is sized by admins.ordinal() and its elements are never set; presumably new User.SPECIAL_CLAIM[]{User.SPECIAL_CLAIM.admins} was intended - confirm */ + + private final HashMap<String, User> users = new HashMap<>(){ + {put(specialAdmin.getName(), specialAdmin); put(DispatcherTestUtils.alice.getName(), DispatcherTestUtils.alice);} + }; + + private final HashMap<String, Simulation> simulations = new HashMap<>(); + + private final Set<KeyValue> unreferencedSimKeys = new HashSet<>(); + + // Return a latest simulation that differs in one of these ways + public enum BadLatestSimulation{ + HIGHER_TASK_ID, + RETURN_NULL, + IS_DONE, + DO_NOTHING + } + + public BadLatestSimulation badLatestSimulation = BadLatestSimulation.DO_NOTHING; + + + @Override + public SimulationJobStatus getLatestSimulationJobStatus(KeyValue simKey, int jobIndex) throws DataAccessException, SQLException { /* Picks the stored status with the newest submit date for (simKey, jobIndex), then applies the badLatestSimulation fault mode; returns null when nothing is stored. */ + ArrayList<SimulationJobStatus> simList = dbTable.get(simKey.toString()); + if (simList == null){ + return null; + } + SimulationJobStatus latestSim = null; + for (SimulationJobStatus jobStatus : simList){ + boolean equalJobIndex = jobStatus.getJobIndex() == jobIndex; + boolean isLatestSimNull = latestSim == null; + if ((equalJobIndex && isLatestSimNull) || (!isLatestSimNull && equalJobIndex && jobStatus.getSubmitDate().after(latestSim.getSubmitDate()))){ /* replace the candidate only with a strictly later submission */ + latestSim = jobStatus; + } + } + switch (latestSim == null ? BadLatestSimulation.RETURN_NULL : badLatestSimulation){ /* no entry for this job index: answer null in every mode instead of dereferencing null below */ + case RETURN_NULL -> { + return null; + } case HIGHER_TASK_ID -> { + SimulationJobStatus
simulationJobStatus = new SimulationJobStatus(latestSim.getServerID(), latestSim.getVCSimulationIdentifier(), latestSim.getJobIndex(), + latestSim.getSubmitDate(), latestSim.getSchedulerStatus(), latestSim.getTaskID() + 1, latestSim.getSimulationMessage(), latestSim.getSimulationQueueEntryStatus(), latestSim.getSimulationExecutionStatus()); + return simulationJobStatus; + } case IS_DONE -> { /* same job, but reported as COMPLETED and without an execution status */ + return new SimulationJobStatus(latestSim.getServerID(), latestSim.getVCSimulationIdentifier(), latestSim.getJobIndex(), latestSim.getSubmitDate(), SimulationJobStatus.SchedulerStatus.COMPLETED, + latestSim.getTaskID(), latestSim.getSimulationMessage(), latestSim.getSimulationQueueEntryStatus(), null); + }default -> { + return latestSim; + } + } + + } + + @Override + public void insertSimulationJobStatus(SimulationJobStatus simulationJobStatus) throws DataAccessException, SQLException { /* Appends the status to the list kept for its simulation key, creating the list on first use. */ + String simKey = simulationJobStatus.getVCSimulationIdentifier().getSimulationKey().toString(); + if (dbTable.containsKey(simKey)){ + dbTable.get(simKey).add(simulationJobStatus); + } else { + dbTable.put(simKey, new ArrayList<>(){{add(simulationJobStatus);}}); + } + } + + @Override + public SimulationJobStatus[] getActiveJobs(VCellServerID vcellServerID) throws DataAccessException, SQLException { /* Returns every stored status whose scheduler status is active; vcellServerID is not consulted. */ + ArrayList<SimulationJobStatus> allActiveJobs = new ArrayList<>(); + for (ArrayList<SimulationJobStatus> jobStatuses : dbTable.values()){ + for (SimulationJobStatus jobStatus: jobStatuses){ + if (jobStatus.getSchedulerStatus().isActive()){ + allActiveJobs.add(jobStatus); + } + } + } + return allActiveJobs.toArray(new SimulationJobStatus[]{}); + } + + @Override + public SimulationJobStatus[] queryJobs(SimpleJobStatusQuerySpec simStatusQuerySpec) throws ObjectNotFoundException, DataAccessException { + throw new ObjectNotFoundException(""); + } + + @Override + public Map<KeyValue, SimulationRequirements> getSimulationRequirements(Collection<KeyValue> simKeys) throws SQLException { + HashMap<KeyValue, SimulationRequirements> map = new HashMap<>(); + for (KeyValue simKey : simKeys){ + map.put(simKey, new
SimulationRequirements(simKey, 3)); + } + return map; + } + + @Override + public void updateSimulationJobStatus(SimulationJobStatus newSimulationJobStatus) throws DataAccessException, UpdateSynchronizationException, SQLException { + updateSimulationJobStatus(newSimulationJobStatus, null); + } + + @Override + public void updateSimulationJobStatus(SimulationJobStatus newSimulationJobStatus, StateInfo runningStateInfo) throws DataAccessException, UpdateSynchronizationException, SQLException { + String simKey = newSimulationJobStatus.getVCSimulationIdentifier().getSimulationKey().toString(); + ArrayList jobStatuses = dbTable.get(simKey); + for (int i = 0; i < jobStatuses.size(); i++){ + SimulationJobStatus jobStatus = jobStatuses.get(i); + boolean sameJob = jobStatus.getJobIndex() == newSimulationJobStatus.getJobIndex(); + if (sameJob){ + jobStatuses.set(i,newSimulationJobStatus); + break; + } + } + } + + @Override + public KeyValue[] getSimulationKeysFromBiomodel(KeyValue biomodelKey) throws SQLException, DataAccessException { + throw new SQLException(); + } + + @Override + public Simulation getSimulation(User user, KeyValue simKey) throws DataAccessException { + return simulations.get(simKey.toString() + user.getName()); + } + + @Override + public FieldDataIdentifierSpec[] getFieldDataIdentifierSpecs(Simulation sim) throws DataAccessException { + return new FieldDataIdentifierSpec[0]; + } + + @Override + public Set getUnreferencedSimulations() throws SQLException { + return unreferencedSimKeys; + } + + @Override + public User.SpecialUser getUser(String username) throws DataAccessException, SQLException { + User user = users.get(username); + if (user instanceof User.SpecialUser){ + return (User.SpecialUser) user; + } + User.SpecialUser specialUser = new User.SpecialUser(user.getName(), user.getID(), new User.SPECIAL_CLAIM[]{}); + return specialUser; + } + + @Override + public Map> getSpecialUsers() throws DataAccessException, SQLException { + Map> map = new 
HashMap<>(); + Map subMap = new HashMap<>(); + subMap.put(specialAdmin, "f"); + map.put(User.SPECIAL_CLAIM.admins, subMap); + return map; + } + + @Override + public SimulationInfo getSimulationInfo(User user, KeyValue simKey) throws ObjectNotFoundException, DataAccessException { + return mockSimulationInfo(user, simKey); + } + + @Override + public SimulationStatus[] getSimulationStatus(KeyValue[] simKeys) throws ObjectNotFoundException, DataAccessException { + return new SimulationStatus[0]; + } + + @Override + public SimulationStatus getSimulationStatus(KeyValue simulationKey) throws ObjectNotFoundException, DataAccessException { + SimulationJobStatus status = dbTable.get(simulationKey.toString()).get(0); + SimulationStatus simulationStatus = new SimulationStatus(new SimulationJobStatus[]{status}); + return simulationStatus; + } + + @Override + public SimpleJobStatus[] getSimpleJobStatus(User user, SimpleJobStatusQuerySpec simStatusQuerySpec) throws ObjectNotFoundException, DataAccessException { + throw new ObjectNotFoundException(""); + } + + + private SimulationInfo mockSimulationInfo(User user, KeyValue simKey){ + KeyValue versionKey = new KeyValue("22"); + KeyValue versionBranchPoint = new KeyValue("23"); + VersionFlag versionFlag = VersionFlag.fromInt(0); + KeyValue parentSimulationRef = new KeyValue("24"); + SimulationVersion simulationVersion = new SimulationVersion(versionKey, "Mock Sim Info", user, null, + versionBranchPoint, new BigDecimal(22), Date.from(Instant.now()), versionFlag, "Version annot", + parentSimulationRef); + SimulationInfo simulationInfo = new SimulationInfo(simKey, simulationVersion, VCellSoftwareVersion.fromString("50")); + return simulationInfo; + } + + public void resetDataBase(){ + dbTable = new HashMap<>(); + badLatestSimulation = BadLatestSimulation.DO_NOTHING; + unreferencedSimKeys.clear(); + simulations.clear(); + } + + public void insertSimulation(User user, Simulation sim){ + simulations.put(sim.getKey().toString() + 
user.getName(), sim); + } + + public void insertUnreferencedSimKey(KeyValue k){ + unreferencedSimKeys.add(k); + } + +} diff --git a/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockVCMessageSession.java b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockVCMessageSession.java new file mode 100644 index 0000000000..8a35e48096 --- /dev/null +++ b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockVCMessageSession.java @@ -0,0 +1,106 @@ +package cbit.vcell.message.server.dispatcher; + +import cbit.vcell.message.*; +import cbit.vcell.message.jms.VCMessageJms; +import org.apache.activemq.command.ActiveMQMessage; +import org.apache.activemq.command.ActiveMQObjectMessage; +import org.apache.activemq.command.ActiveMQTextMessage; +import org.vcell.util.document.UserLoginInfo; + +import javax.jms.JMSException; +import javax.jms.ObjectMessage; +import javax.jms.TextMessage; +import java.io.Serializable; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.Queue; + +public class MockVCMessageSession implements VCMessageSession { + + + public MockVCMessageSession(){ } + + private final HashMap> topics = new HashMap<>(){{ + put(VCellTopic.ClientStatusTopic, new LinkedList<>()); + put(VCellTopic.ServiceControlTopic, new LinkedList<>()); + }}; + private final HashMap> queues = new HashMap<>(){{ + put(VCellQueue.WorkerEventQueue, new LinkedList<>()); + put(VCellQueue.DbRequestQueue, new LinkedList<>()); + put(VCellQueue.DataRequestQueue, new LinkedList<>()); + put(VCellQueue.SimReqQueue, new LinkedList<>()); + put(VCellQueue.SimJobQueue, new LinkedList<>()); + }}; + + @Override + public Object sendRpcMessage(VCellQueue queue, VCRpcRequest vcRpcRequest, boolean returnRequired, long timeoutMS, String[] specialProperties, Object[] specialValues, UserLoginInfo userLoginInfo) throws VCMessagingException, VCMessagingInvocationTargetException { + return null; + } + + @Override + public void 
sendQueueMessage(VCellQueue queue, VCMessage message, Boolean persistent, Long clientTimeoutMS) throws VCMessagingException { + queues.get(queue).add(message); + } + + @Override + public void sendTopicMessage(VCellTopic topic, VCMessage message) throws VCMessagingException { + topics.get(topic).add(message); + } + + @Override + public void rollback() { + + } + + @Override + public void commit() { + + } + + @Override + public VCMessage createTextMessage(String text) { + TextMessage textMessage = new ActiveMQTextMessage(); + try { + textMessage.setText(text); + } catch (JMSException e) { + throw new RuntimeException(e); + } + return new VCMessageJms(textMessage, null); + } + + @Override + public VCMessage createMessage() { + return new VCMessageJms(new ActiveMQMessage(), null, null); + } + + @Override + public VCMessage createObjectMessage(Serializable object) { + ObjectMessage objectMessage = new ActiveMQObjectMessage(); + try { + objectMessage.setObjectProperty(VCMessageJms.BLOB_MESSAGE_FILE_NAME, ""); + } catch (JMSException e) { + throw new RuntimeException(e); + } + return new VCMessageJms(objectMessage, object, null); + + } + + @Override + public VCMessagingDelegate getDelegate() { + return null; + } + + @Override + public void close() { + + } + + public VCMessage getTopicMessage(VCellTopic vCellTopic){ + return topics.get(vCellTopic).remove(); + } + + public VCMessage getQueueMessage(VCellQueue vCellQueue){ + return queues.get(vCellQueue).remove(); + } + +} diff --git a/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/SimulationDispatcherTest.java b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/SimulationDispatcherTest.java new file mode 100644 index 0000000000..dc352ad062 --- /dev/null +++ b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/SimulationDispatcherTest.java @@ -0,0 +1,209 @@ +package cbit.vcell.message.server.dispatcher; + +import cbit.vcell.math.MathException; +import 
cbit.vcell.message.VCMessagingConstants; +import cbit.vcell.message.VCellTopic; +import cbit.vcell.message.messages.MessageConstants; +import cbit.vcell.parser.ExpressionBindingException; +import cbit.vcell.server.SimulationJobStatus; +import cbit.vcell.server.SimulationStatus; +import cbit.vcell.solver.Simulation; +import org.apache.logging.log4j.Level; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.core.Logger; +import org.apache.logging.log4j.core.LoggerContext; +import org.apache.logging.log4j.core.appender.WriterAppender; +import org.apache.logging.log4j.core.config.Configuration; +import org.apache.logging.log4j.core.config.Configurator; +import org.apache.logging.log4j.core.config.LoggerConfig; +import org.apache.logging.log4j.spi.ExtendedLogger; +import org.junit.jupiter.api.*; +import org.vcell.util.DataAccessException; +import org.vcell.util.document.User; + +import java.beans.PropertyVetoException; +import java.io.IOException; +import java.io.StringWriter; +import java.sql.SQLException; + +@Tag("Fast") +public class SimulationDispatcherTest { + public static ExtendedLogger lg = LoggerContext.getContext().getLogger(SimulationDispatcher.class); + private final static User testUser = DispatcherTestUtils.alice; + private MockSimulationDB mockSimulationDB = new MockSimulationDB(); + private final MockMessagingService mockMessagingServiceInternal = new MockMessagingService(); + private final MockMessagingService mockMessagingServiceSim = new MockMessagingService(); + private final MockHtcProxy mockHtcProxy = new MockHtcProxy(null, "htcUser", mockSimulationDB); + private static StringWriter logOutPut; + private static WriterAppender appender; + + @BeforeAll + public static void setSystemProperties(){ + DispatcherTestUtils.setRequiredProperties(); + + logOutPut = new StringWriter(); + appender = WriterAppender.newBuilder().setTarget(logOutPut).setName("Simulation Dispatcher Test").build(); + LoggerContext context = 
LoggerContext.getContext(false); + Configuration configuration = context.getConfiguration(); + configuration.addLoggerAppender((Logger) lg, appender); + } + + @AfterAll + public static void restoreSystemProperties() throws IOException { /* undoes setSystemProperties: restores the saved properties, then stops the appender and closes its writer */ + DispatcherTestUtils.restoreRequiredProperties(); + appender.stop(); + logOutPut.close(); + } + + //################# Test Simulation Service Impl ####################### + // All the get functions within SimulationDispatcher seem to be exercising the DB and not simulation control, so they are not tested + + @Test + public void onStartRequestTest() throws DataAccessException, SQLException { /* after a start request, job 0 of the test simulation should be stored as waiting */ + SimulationDispatcher simulationDispatcher = SimulationDispatcher.simulationDispatcherCreator(mockSimulationDB, mockMessagingServiceInternal, + mockMessagingServiceSim, mockHtcProxy, false); + SimulationStatus simStatus = simulationDispatcher.simServiceImpl.startSimulation(testUser, DispatcherTestUtils.simID, 1); + SimulationJobStatus jobStatus = mockSimulationDB.getLatestSimulationJobStatus(DispatcherTestUtils.simKey, 0); + Assertions.assertTrue(jobStatus.getSchedulerStatus().isWaiting()); + } + + @Test + public void onStopRequestTest() throws DataAccessException, SQLException { /* a running job that is stopped should be stored as stopped, and a stop message should appear on the service-control topic */ + DispatcherTestUtils.insertOrUpdateStatus(mockSimulationDB); + SimulationDispatcher simulationDispatcher = SimulationDispatcher.simulationDispatcherCreator(mockSimulationDB, mockMessagingServiceInternal, + mockMessagingServiceSim, mockHtcProxy, false); + SimulationStatus simStatus = simulationDispatcher.simServiceImpl.stopSimulation(testUser, DispatcherTestUtils.simID); + SimulationJobStatus jobStatus = mockSimulationDB.getLatestSimulationJobStatus(DispatcherTestUtils.simKey, 0); + Assertions.assertTrue(jobStatus.getSchedulerStatus().isStopped()); + + String s = mockMessagingServiceInternal.mockVCMessageSession.getTopicMessage(VCellTopic.ServiceControlTopic).getStringProperty(VCMessagingConstants.MESSAGE_TYPE_PROPERTY); +
Assertions.assertEquals(MessageConstants.MESSAGE_TYPE_STOPSIMULATION_VALUE, s); + } + + + //###################### Test Dispatcher Thread ########################### + @Test + public void dispatcherThreadTest() throws SQLException, DataAccessException, InterruptedException, PropertyVetoException, MathException, ExpressionBindingException { + DispatcherTestUtils.insertOrUpdateStatus(mockSimulationDB, SimulationJobStatus.SchedulerStatus.WAITING); + SimulationDispatcher simulationDispatcher = SimulationDispatcher.simulationDispatcherCreator(mockSimulationDB, mockMessagingServiceInternal, + mockMessagingServiceSim, mockHtcProxy, true); + SimulationDispatcher.DispatchThread thread = simulationDispatcher.dispatchThread; + synchronized (thread.dispatcherNotifyObject){ + thread.dispatcherNotifyObject.notify(); + } + SimulationJobStatus jobStatus = mockSimulationDB.getLatestSimulationJobStatus(DispatcherTestUtils.simKey, 0); + Assertions.assertTrue(jobStatus.getSchedulerStatus().isWaiting(), "Still waiting."); + + synchronized (thread.finishListener){ + thread.finishListener.wait(); + } + + jobStatus = mockSimulationDB.getLatestSimulationJobStatus(DispatcherTestUtils.simKey, 0); + Assertions.assertTrue(jobStatus.getSchedulerStatus().isFailed(), "Simulation gets aborted since theres no simulation in DB."); + + Simulation mockSimulation = DispatcherTestUtils.createMockSimulation(20, 20, 20); + mockSimulationDB.insertSimulation(DispatcherTestUtils.alice, mockSimulation); + DispatcherTestUtils.insertOrUpdateStatus(mockSimulation.getKey(), DispatcherTestUtils.jobIndex, DispatcherTestUtils.taskID, DispatcherTestUtils.alice, + SimulationJobStatus.SchedulerStatus.WAITING, mockSimulationDB); + synchronized (thread.dispatcherNotifyObject){ + thread.dispatcherNotifyObject.notify(); + } + synchronized (thread.finishListener){ + thread.finishListener.wait(); + } + + jobStatus = mockSimulationDB.getLatestSimulationJobStatus(mockSimulation.getKey(), 0); + 
Assertions.assertTrue(jobStatus.getSchedulerStatus().isDispatched(), "Dispatches"); + } + + + + //###################### Test Simulation Monitor ########################## + + @Test + public void zombieKillerTest() throws SQLException, DataAccessException, InterruptedException, IOException { + SimulationDispatcher.INITIAL_ZOMBIE_DELAY = 10; + SimulationDispatcher simulationDispatcher = SimulationDispatcher.simulationDispatcherCreator(mockSimulationDB, mockMessagingServiceInternal, + mockMessagingServiceSim, mockHtcProxy, false); + DispatcherTestUtils.insertOrUpdateStatus(mockSimulationDB); + mockHtcProxy.jobsKilledSafely.clear(); + + mockSimulationDB.badLatestSimulation = MockSimulationDB.BadLatestSimulation.HIGHER_TASK_ID; + SimulationDispatcher.SimulationMonitor.ZombieKiller zombieKiller = simulationDispatcher.simMonitor.initialZombieKiller; + zombieKiller.run(); + Assertions.assertTrue(logOutPut.toString().contains(SimulationDispatcher.SimulationMonitor.ZombieKiller.newJobFound)); + Assertions.assertEquals(1, mockHtcProxy.jobsKilledSafely.size()); + + mockSimulationDB.badLatestSimulation = MockSimulationDB.BadLatestSimulation.RETURN_NULL; + zombieKiller.run(); + Assertions.assertTrue(logOutPut.toString().contains(SimulationDispatcher.SimulationMonitor.ZombieKiller.noJob)); + Assertions.assertEquals(2, mockHtcProxy.jobsKilledSafely.size()); + + mockSimulationDB.badLatestSimulation = MockSimulationDB.BadLatestSimulation.IS_DONE; + zombieKiller.run(); + Assertions.assertTrue(logOutPut.toString().contains(SimulationDispatcher.SimulationMonitor.ZombieKiller.jobIsAlreadyDone)); + Assertions.assertEquals(3, mockHtcProxy.jobsKilledSafely.size()); + } + + @Test + public void queueFlusherTest() throws SQLException, DataAccessException, InterruptedException { + SimulationDispatcher simulationDispatcher = SimulationDispatcher.simulationDispatcherCreator(mockSimulationDB, mockMessagingServiceInternal, + mockMessagingServiceSim, mockHtcProxy, false); + 
DispatcherTestUtils.insertOrUpdateStatus(mockSimulationDB); + + SimulationDispatcher.SimulationMonitor simMonitor = simulationDispatcher.simMonitor; + SimulationDispatcher.SimulationMonitor.QueueFlusher queueFlusher = simMonitor.initialQueueFlusher; + SimulationStateMachine sm = simulationDispatcher.simDispatcherEngine.getSimulationStateMachine(DispatcherTestUtils.simKey, DispatcherTestUtils.jobIndex); + sm.setSolverProcessTimestamp(0); + Thread queueThread = new Thread(queueFlusher); + queueThread.start(); + int retries = 0; + while (queueThread.getState() != Thread.State.TIMED_WAITING){ + if (retries == 10){ + break; + } + Thread.sleep(500); + retries += 1; + } + synchronized (simMonitor.monitorNotifyObject){ + simMonitor.monitorNotifyObject.notify(); + } + synchronized (queueFlusher.finishListener){ + queueFlusher.finishListener.wait(); + } + + SimulationJobStatus status = mockSimulationDB.getLatestSimulationJobStatus(DispatcherTestUtils.simKey, DispatcherTestUtils.jobIndex); + Assertions.assertTrue(status.getSchedulerStatus().isFailed()); + Assertions.assertTrue(mockHtcProxy.jobsKilledUnsafely.contains(status.getSimulationExecutionStatus().getHtcJobID())); + Assertions.assertTrue(logOutPut.toString().contains(SimulationDispatcher.SimulationMonitor.QueueFlusher.timeOutFailure)); + + // reset for next test + simulationDispatcher.simDispatcherEngine.resetTimeStamps(); + mockHtcProxy.jobsKilledUnsafely.clear(); + mockSimulationDB.resetDataBase(); + + mockSimulationDB.insertUnreferencedSimKey(DispatcherTestUtils.simKey); + DispatcherTestUtils.insertOrUpdateStatus(mockSimulationDB); + queueThread = new Thread(queueFlusher); + queueThread.start(); + retries = 0; + while (queueThread.getState() != Thread.State.TIMED_WAITING){ + if (retries == 10){ + break; + } + Thread.sleep(500); + retries += 1; + } + synchronized (simMonitor.monitorNotifyObject){ + simMonitor.monitorNotifyObject.notify(); + } + synchronized (queueFlusher.finishListener){ + 
queueFlusher.finishListener.wait(); + } + status = mockSimulationDB.getLatestSimulationJobStatus(DispatcherTestUtils.simKey, DispatcherTestUtils.jobIndex); + Assertions.assertTrue(status.getSchedulerStatus().isFailed()); + Assertions.assertTrue(mockHtcProxy.jobsKilledUnsafely.contains(status.getSimulationExecutionStatus().getHtcJobID())); + Assertions.assertTrue(logOutPut.toString().contains(SimulationDispatcher.SimulationMonitor.QueueFlusher.unreferencedFailure)); + } + +} diff --git a/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/SimulationStateMachineTest.java b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/SimulationStateMachineTest.java new file mode 100644 index 0000000000..c181b0762a --- /dev/null +++ b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/SimulationStateMachineTest.java @@ -0,0 +1,259 @@ +package cbit.vcell.message.server.dispatcher; + +import cbit.rmi.event.WorkerEvent; +import cbit.vcell.math.MathException; +import cbit.vcell.message.VCMessagingException; +import cbit.vcell.message.VCellTopic; +import cbit.vcell.message.messages.StatusMessage; +import cbit.vcell.parser.ExpressionBindingException; +import cbit.vcell.server.SimulationJobStatus; +import cbit.vcell.solver.Simulation; +import cbit.vcell.solver.VCSimulationIdentifier; +import cbit.vcell.solver.server.SimulationMessage; +import org.junit.jupiter.api.*; +import org.vcell.util.DataAccessException; +import org.vcell.util.document.KeyValue; +import org.vcell.util.document.User; +import org.vcell.util.document.VCellServerID; + +import java.beans.PropertyVetoException; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.NoSuchElementException; + +@Tag("Fast") +public class SimulationStateMachineTest { + private static final User testUser = DispatcherTestUtils.alice; + private static final MockVCMessageSession testMessageSession = new MockVCMessageSession(); + private static final int jobIndex = 
DispatcherTestUtils.jobIndex; + private static final int taskID = DispatcherTestUtils.taskID; + private static final KeyValue simKey = DispatcherTestUtils.simKey; + private static final VCSimulationIdentifier simID = DispatcherTestUtils.simID; + + private MockSimulationDB simulationDB; + private SimulationStateMachine stateMachine; + + @BeforeAll + public static void setSystemProperties(){ + DispatcherTestUtils.setRequiredProperties(); + } + + @AfterAll + public static void restoreSystemProperties(){ + DispatcherTestUtils.restoreRequiredProperties(); + } + + @BeforeEach + public void setUp(){ + simulationDB = new MockSimulationDB(); + stateMachine = new SimulationStateMachine(simKey, jobIndex); + } + + private record ChangedStateValues( + VCSimulationIdentifier simID, + SimulationJobStatus.SchedulerStatus schedulerStatus, + int workerEventJob, + int taskID, + String changesResult + ){ } + + private WorkerEvent createWorkerEvent(ChangedStateValues w){ + SimulationMessage acceptedSimulationMessage = SimulationMessage.workerAccepted("accepted"); + return new WorkerEvent(w.workerEventJob, simKey, + w.simID, jobIndex, "", + w.taskID, null, null, + acceptedSimulationMessage); + } + + private SimulationJobStatus getLatestJobSubmission() throws SQLException, DataAccessException { + return simulationDB.getLatestSimulationJobStatus(simKey, jobIndex); + } + + private SimulationJobStatus getClientTopicMessage(){ + return (SimulationJobStatus) testMessageSession.getTopicMessage(VCellTopic.ClientStatusTopic).getObjectContent(); + } + + @Test + public void workerEventRejectionsTest() throws SQLException, DataAccessException { /* Each listed case must be rejected by isWorkerEventOkay; afterwards every not-done scheduler status must be accepted. */ + int taskID = 16; + + ArrayList<ChangedStateValues> changedValues = new ArrayList<>(){{ + add(new ChangedStateValues(simID, SimulationJobStatus.SchedulerStatus.RUNNING, WorkerEvent.JOB_WORKER_ALIVE, taskID, "No old status.")); // no old status failure + add(new ChangedStateValues(simID, SimulationJobStatus.SchedulerStatus.COMPLETED, WorkerEvent.JOB_WORKER_ALIVE, taskID, "Work
is already done.")); // work is done failure + add(new ChangedStateValues(simID, SimulationJobStatus.SchedulerStatus.RUNNING, WorkerEvent.JOB_WORKER_ALIVE, 0, "Task ID is lower")); // old status has higher number taskID failure + }}; + + for (int i = 0; i < changedValues.size(); i++){ + ChangedStateValues workerEventChangedValues = changedValues.get(i); + if (i > 0) { /* only case 0 runs against an empty DB; cases 1 and 2 need their scheduler status stored first, otherwise case 1 merely repeats the no-old-status rejection */ + DispatcherTestUtils.insertOrUpdateStatus(simKey, jobIndex, taskID, testUser, workerEventChangedValues.schedulerStatus, simulationDB); + } + WorkerEvent workerEvent = createWorkerEvent(workerEventChangedValues); + Assertions.assertFalse(stateMachine.isWorkerEventOkay(workerEvent, simulationDB), workerEventChangedValues.changesResult); + } + + ChangedStateValues passingWorkerValues= new ChangedStateValues(simID, null, WorkerEvent.JOB_WORKER_ALIVE , taskID, ""); + WorkerEvent passingWorkerEvent = createWorkerEvent(passingWorkerValues); + + for (SimulationJobStatus.SchedulerStatus passingStatus: SimulationJobStatus.SchedulerStatus.values()){ + if (!passingStatus.isDone()){ + DispatcherTestUtils.insertOrUpdateStatus(simKey, jobIndex, taskID, testUser, passingStatus, simulationDB); + Assertions.assertTrue(stateMachine.isWorkerEventOkay(passingWorkerEvent, simulationDB)); + } + } + + } + + @Test + public void stateShouldTransitionToFailure() throws SQLException, DataAccessException, VCMessagingException, PropertyVetoException, MathException, ExpressionBindingException { + ArrayList<ChangedStateValues> changedValues = new ArrayList<>(){{ + add(new ChangedStateValues(simID, null, WorkerEvent.JOB_FAILURE, taskID, "The current worker has failed.")); + add(new ChangedStateValues(simID, null, WorkerEvent.JOB_WORKER_EXIT_ERROR, taskID, "The current worker exited with an error.")); + }}; + + for (ChangedStateValues changedValue : changedValues){ + DispatcherTestUtils.insertOrUpdateStatus(simulationDB); + stateMachine.onWorkerEvent(createWorkerEvent(changedValue), simulationDB, testMessageSession); + SimulationJobStatus result =
getLatestJobSubmission(); + Assertions.assertTrue(result.getSchedulerStatus().isFailed(), changedValue.changesResult); + Assertions.assertTrue(getClientTopicMessage().getSchedulerStatus().isFailed(), changedValue.changesResult); + } + + simulationDB = new MockSimulationDB(); + StatusMessage statusMessage = stateMachine.onStartRequest(DispatcherTestUtils.bob, simID, simulationDB, testMessageSession); + Assertions.assertTrue(statusMessage.getSimulationJobStatus().getSchedulerStatus().isFailed(), "Different from initial user that owns the simulation"); + + SimulationJobStatus jobStatus = getLatestJobSubmission(); + Assertions.assertNull(jobStatus, "If it fails on start request, there should be nothing in the DB."); + Assertions.assertTrue(getClientTopicMessage().getSchedulerStatus().isFailed(), "Only the client receives start request failure status."); + + DispatcherTestUtils.insertOrUpdateStatus(simulationDB); + Assertions.assertThrows(RuntimeException.class, + () -> {stateMachine.onStartRequest(testUser, simID, simulationDB, testMessageSession);}, + "Can't start simulation job unless previous is done."); + Assertions.assertThrows(NoSuchElementException.class,() -> getClientTopicMessage().getSchedulerStatus().isFailed(), "No message sent to client."); + + + DispatcherTestUtils.insertOrUpdateStatus(simulationDB); + jobStatus = getLatestJobSubmission(); + stateMachine.onSystemAbort(jobStatus, "Test Abort", simulationDB, testMessageSession); + jobStatus = getLatestJobSubmission(); + Assertions.assertTrue(jobStatus.getSchedulerStatus().isFailed()); + Assertions.assertTrue(getClientTopicMessage().getSchedulerStatus().isFailed(), "On abort client gets failed status."); + +// + Simulation memoryIntensiveSimulation = DispatcherTestUtils.createMockSimulation(900, 900, 900); + + DispatcherTestUtils.insertOrUpdateStatus(simulationDB); + Assertions.assertThrows(RuntimeException.class, + () -> {stateMachine.onDispatch(memoryIntensiveSimulation, getLatestJobSubmission(), 
simulationDB, testMessageSession);}, + "Can't dispatch simulation that is already running."); + Assertions.assertThrows(NoSuchElementException.class, () -> getClientTopicMessage().getSchedulerStatus().isFailed(), "Client receives failure because simulation is already running."); + + DispatcherTestUtils.insertOrUpdateStatus(simKey, jobIndex, taskID, testUser, SimulationJobStatus.SchedulerStatus.WAITING, simulationDB); + stateMachine.onDispatch(memoryIntensiveSimulation, getLatestJobSubmission(), simulationDB, testMessageSession); + jobStatus = getLatestJobSubmission(); + Assertions.assertTrue(jobStatus.getSchedulerStatus().isFailed(), "Memory size too large"); + Assertions.assertTrue(getClientTopicMessage().getSchedulerStatus().isFailed(), "Failed because of memory size."); + + DispatcherTestUtils.insertOrUpdateStatus(simulationDB); + statusMessage = stateMachine.onStopRequest(DispatcherTestUtils.bob, getLatestJobSubmission(), simulationDB, testMessageSession); + Assertions.assertTrue(statusMessage.getSimulationJobStatus().getSchedulerStatus().isFailed(), "Stopping as another user."); + Assertions.assertTrue(getClientTopicMessage().getSchedulerStatus().isFailed(), "Can't stop as another user."); + } + + @Test + public void stateShouldTransitionToWaiting() throws SQLException, VCMessagingException, DataAccessException { + stateMachine.onStartRequest(testUser, simID, simulationDB, testMessageSession); + SimulationJobStatus jobStatus = getLatestJobSubmission(); + Assertions.assertTrue(jobStatus.getSchedulerStatus().isWaiting(), "Just started new task."); + Assertions.assertTrue(getClientTopicMessage().getSchedulerStatus().isWaiting()); + } + + @Test + public void stateShouldTransitionToDispatched() throws SQLException, DataAccessException, VCMessagingException, PropertyVetoException, MathException, ExpressionBindingException { + DispatcherTestUtils.insertOrUpdateStatus(simKey, jobIndex, taskID, testUser, SimulationJobStatus.SchedulerStatus.WAITING, simulationDB); + 
WorkerEvent acceptedWorker = createWorkerEvent(new ChangedStateValues(simID, null, WorkerEvent.JOB_ACCEPTED, taskID, "Worker just got accepted")); + stateMachine.onWorkerEvent(acceptedWorker, simulationDB, testMessageSession); + SimulationJobStatus jobStatus = getLatestJobSubmission(); + Assertions.assertTrue(jobStatus.getSchedulerStatus().isDispatched(), "Job recently got accepted, only works if previous state was waiting."); + Assertions.assertTrue(getClientTopicMessage().getSchedulerStatus().isDispatched()); + + DispatcherTestUtils.insertOrUpdateStatus(simulationDB); + stateMachine.onWorkerEvent(acceptedWorker, simulationDB, testMessageSession); + jobStatus = getLatestJobSubmission(); + Assertions.assertTrue(jobStatus.getSchedulerStatus().isRunning(), "The state has not changed from running, because something that is running can not be dispatched."); + + + DispatcherTestUtils.insertOrUpdateStatus(simKey, jobIndex, taskID, testUser, SimulationJobStatus.SchedulerStatus.WAITING, simulationDB); + Simulation simulation = DispatcherTestUtils.createMockSimulation(50, 50, 50); + stateMachine.onDispatch(simulation, getLatestJobSubmission(), simulationDB, testMessageSession); + jobStatus = getLatestJobSubmission(); + Assertions.assertTrue(jobStatus.getSchedulerStatus().isDispatched()); + Assertions.assertTrue(getClientTopicMessage().getSchedulerStatus().isDispatched()); + } + + @Test + public void stateShouldTransitionToRunning() throws SQLException, DataAccessException, VCMessagingException { + for (int workerStatus: WorkerEvent.ALL_JOB_EVENTS){ + WorkerEvent workerEvent = createWorkerEvent(new ChangedStateValues(simID, null, workerStatus, taskID, "")); + DispatcherTestUtils.insertOrUpdateStatus(simKey, jobIndex, taskID, testUser, SimulationJobStatus.SchedulerStatus.WAITING, simulationDB); + stateMachine.onWorkerEvent(workerEvent, simulationDB, testMessageSession); + SimulationJobStatus jobStatus = getLatestJobSubmission(); + if (workerEvent.isProgressEvent() || 
workerEvent.isNewDataEvent() || workerEvent.isStartingEvent() || workerEvent.isWorkerAliveEvent()){ + Assertions.assertTrue(jobStatus.getSchedulerStatus().isRunning()); + Assertions.assertTrue(getClientTopicMessage().getSchedulerStatus().isRunning()); + } else { + Assertions.assertFalse(jobStatus.getSchedulerStatus().isRunning()); + try { + Assertions.assertFalse(getClientTopicMessage().getSchedulerStatus().isRunning()); + } catch (NoSuchElementException ignored){} + } + } + } + + @Test + public void stateShouldTransitionToCompleted() throws SQLException, VCMessagingException, DataAccessException { + for (int workerStatus : WorkerEvent.ALL_JOB_EVENTS){ + WorkerEvent workerEvent = createWorkerEvent(new ChangedStateValues(simID, SimulationJobStatus.SchedulerStatus.RUNNING, workerStatus, taskID, "")); + DispatcherTestUtils.insertOrUpdateStatus(simulationDB); + stateMachine.onWorkerEvent(workerEvent, simulationDB, testMessageSession); + SimulationJobStatus jobStatus = getLatestJobSubmission(); + if (workerEvent.isCompletedEvent()){ + Assertions.assertTrue(jobStatus.getSchedulerStatus().isCompleted()); + Assertions.assertTrue(getClientTopicMessage().getSchedulerStatus().isCompleted()); + } else { + Assertions.assertFalse(jobStatus.getSchedulerStatus().isCompleted()); + try { + Assertions.assertFalse(getClientTopicMessage().getSchedulerStatus().isCompleted()); + } catch (NoSuchElementException ignored){} + } + } + } + + @Test + public void stateShouldTransitionToStopped() throws SQLException, DataAccessException, VCMessagingException { + + for (SimulationJobStatus.SchedulerStatus status : SimulationJobStatus.SchedulerStatus.values()){ + DispatcherTestUtils.insertOrUpdateStatus(simKey,jobIndex, taskID,testUser, status, simulationDB); + if (status.isActive()){ + stateMachine.onStopRequest(testUser, getLatestJobSubmission(), simulationDB, testMessageSession); + Assertions.assertTrue(getLatestJobSubmission().getSchedulerStatus().isStopped(), ""); + 
Assertions.assertTrue(getClientTopicMessage().getSchedulerStatus().isStopped()); + } else { + StatusMessage statusMessage = stateMachine.onStopRequest(testUser, getLatestJobSubmission(), simulationDB, testMessageSession); + Assertions.assertNull(statusMessage); + try { + Assertions.assertFalse(getClientTopicMessage().getSchedulerStatus().isCompleted()); + } catch (NoSuchElementException ignored){} + } + } + } + + @Test + public void stateShouldTransitionToQueued(){ + System.out.print("Not used in state machine"); + } + + +} diff --git a/vcell-server/src/test/resources/log4j2-test.xml b/vcell-server/src/test/resources/log4j2-test.xml new file mode 100644 index 0000000000..7d78860d47 --- /dev/null +++ b/vcell-server/src/test/resources/log4j2-test.xml @@ -0,0 +1,21 @@ + + + + + + + + + + + + + + + + + + + + + From f2cce4a145d9a521cf2a93a262d741441fb3556d Mon Sep 17 00:00:00 2001 From: Ezequiel Valencia Date: Fri, 6 Sep 2024 14:13:11 -0400 Subject: [PATCH 05/16] Worker Event States In List for Easy Test Access --- vcell-core/src/main/java/cbit/rmi/event/WorkerEvent.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vcell-core/src/main/java/cbit/rmi/event/WorkerEvent.java b/vcell-core/src/main/java/cbit/rmi/event/WorkerEvent.java index 3c5c4a2392..660d7e9ce6 100644 --- a/vcell-core/src/main/java/cbit/rmi/event/WorkerEvent.java +++ b/vcell-core/src/main/java/cbit/rmi/event/WorkerEvent.java @@ -19,6 +19,8 @@ import cbit.vcell.solver.VCSimulationIdentifier; import cbit.vcell.solver.server.SimulationMessage; +import java.util.ArrayList; + /** * Insert the type's description here. 
* Creation date: (2/5/2004 12:35:20 PM) @@ -36,6 +38,10 @@ public class WorkerEvent extends MessageEvent { public static final int JOB_WORKER_EXIT_NORMAL = 1015; public static final int JOB_WORKER_EXIT_ERROR = 1016; + public static final ArrayList ALL_JOB_EVENTS = new ArrayList<>(){{add(JOB_ACCEPTED); + add(JOB_STARTING); add(JOB_DATA); add(JOB_PROGRESS); add(JOB_FAILURE); add(JOB_COMPLETED); add(JOB_WORKER_ALIVE); + add(JOB_WORKER_EXIT_NORMAL); add(JOB_WORKER_EXIT_ERROR);}}; + private VCSimulationIdentifier vcSimulationIdentifier = null; private int jobIndex = -1; private String hostName = null; From eec507695621f44cefe3a92de05c37e7ac23a015 Mon Sep 17 00:00:00 2001 From: Ezequiel Valencia Date: Fri, 6 Sep 2024 14:13:41 -0400 Subject: [PATCH 06/16] Updated Entrypoint And Env Variables Docker --- docker/build/Dockerfile-sched-dev | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/docker/build/Dockerfile-sched-dev b/docker/build/Dockerfile-sched-dev index cc9827cb94..47f831de56 100644 --- a/docker/build/Dockerfile-sched-dev +++ b/docker/build/Dockerfile-sched-dev @@ -68,7 +68,10 @@ ENV softwareVersion=SOFTWARE-VERSION-NOT-SET \ maxOdeJobsPerUser="max-ode-jobs-per-user-not-set" \ vcell_ssh_cmd_cmdtimeout="cmdSrvcSshCmdTimeoutMS-not-set" \ vcell_ssh_cmd_restoretimeout="cmdSrvcSshCmdRestoreTimeoutFactor-not-set" \ - maxPdeJobsPerUser="max-pde-jobs-per-user-not-set" + maxPdeJobsPerUser="max-pde-jobs-per-user-not-set" \ + htcMinMemoryMB="htc-min-memory-not-set" \ + htcMaxMemoryMB="htc-max-memory-not-set" \ + htcPowerUserMemoryFloorMB="htc-power-user-memory-floor-not-set" ENV dbpswdfile=/run/secrets/dbpswd \ jmspswdfile=/run/secrets/jmspswd \ @@ -119,4 +122,7 @@ ENTRYPOINT java \ -Dvcell.server.maxPdeJobsPerUser=${maxPdeJobsPerUser} \ -Dvcell.ssh.cmd.cmdtimeout=${vcell_ssh_cmd_cmdtimeout} \ -Dvcell.ssh.cmd.restoretimeout=${vcell_ssh_cmd_restoretimeout} \ - -cp "./lib/*" cbit.vcell.message.server.dispatcher.SimulationDispatcher + 
-Dvcell.htc.memory.min.mb=${htcMinMemoryMB} \ + -Dvcell.htc.memory.max.mb=${htcMaxMemoryMB} \ + -Dvcell.htc.memory.pu.floor.mb=${htcPowerUserMemoryFloorMB} \ + -cp "./lib/*" cbit.vcell.message.server.dispatcher.SimulationDispatcherMain From 3050b8ad750d13290b4875bd699c05b6fbfa6de0 Mon Sep 17 00:00:00 2001 From: Ezequiel Valencia Date: Fri, 6 Sep 2024 14:40:27 -0400 Subject: [PATCH 07/16] MockDB Uses TreeMap With User Comparator --- .../vcell/message/server/dispatcher/MockSimulationDB.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockSimulationDB.java b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockSimulationDB.java index 3069df8d26..c9a03cbb5d 100644 --- a/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockSimulationDB.java +++ b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockSimulationDB.java @@ -157,9 +157,9 @@ public User.SpecialUser getUser(String username) throws DataAccessException, SQL } @Override - public Map> getSpecialUsers() throws DataAccessException, SQLException { - Map> map = new HashMap<>(); - Map subMap = new HashMap<>(); + public TreeMap> getSpecialUsers() throws DataAccessException, SQLException { + TreeMap> map = new TreeMap<>(); + TreeMap subMap = new TreeMap<>(new User.UserNameComparator()); subMap.put(specialAdmin, "f"); map.put(User.SPECIAL_CLAIM.admins, subMap); return map; From 73934e678b2885c59dbc8762e129bed78ec3b823 Mon Sep 17 00:00:00 2001 From: Ezequiel Valencia Date: Fri, 6 Sep 2024 15:40:01 -0400 Subject: [PATCH 08/16] Jim Suggestions --- .../server/dispatcher/SimulationStateMachine.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationStateMachine.java b/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationStateMachine.java index 53155b9533..cb1b5c2c8f 
100644 --- a/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationStateMachine.java +++ b/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationStateMachine.java @@ -421,8 +421,12 @@ public static SimulationJobStatus saveSimulationStartRequest(VCSimulationIdentif // new exe status Date lastUpdateDate = new Date(); boolean hasData = false; + String computeHost = null; + Date startDate = null; + Date endDate = null; + HtcJobID htcJobID = null; - SimulationExecutionStatus newExeStatus = new SimulationExecutionStatus(null, null, lastUpdateDate, null, hasData, null); + SimulationExecutionStatus newExeStatus = new SimulationExecutionStatus(startDate, computeHost, lastUpdateDate, endDate, hasData, htcJobID); VCellServerID vcServerID = VCellServerID.getSystemServerID(); Date submitDate = currentDate; @@ -533,10 +537,6 @@ public synchronized StatusMessage onStopRequest(User user, SimulationJobStatus s if (lg.isTraceEnabled()) lg.trace("send " + MessageConstants.MESSAGE_TYPE_STOPSIMULATION_VALUE + " to " + VCellTopic.ServiceControlTopic.getName() + " topic"); - SimulationJobStatus simulationJobStatusRecord = new SimulationJobStatus( - null, new VCSimulationIdentifier(simKey, user), jobIndex, simJobStatus.getSubmitDate(), - SchedulerStatus.STOPPED, taskID, simJobStatus.getSimulationMessage(), simQueueEntryStatus, simExeStatus - ); // // send stopSimulation to serviceControl topic @@ -553,7 +553,7 @@ null, new VCSimulationIdentifier(simKey, user), jobIndex, simJobStatus.getSubmit session.sendTopicMessage(VCellTopic.ServiceControlTopic, msg); simulationDatabase.updateSimulationJobStatus(newJobStatus); - statusMessage = new StatusMessage(simulationJobStatusRecord, user.getName(), null, null); + statusMessage = new StatusMessage(newJobStatus, user.getName(), null, null); statusMessage.sendToClient(session); return statusMessage; From f02bcaa25c7591257b4c624e30907983f30feb83 Mon Sep 17 00:00:00 2001 From: Ezequiel Valencia Date: Tue, 10 Sep 
2024 08:05:42 -0400 Subject: [PATCH 09/16] Explicit Description of Simulation Dispatcher Tests --- .../dispatcher/SimulationDispatcherTest.java | 41 +++++++++++++------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/SimulationDispatcherTest.java b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/SimulationDispatcherTest.java index dc352ad062..d9800d8fad 100644 --- a/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/SimulationDispatcherTest.java +++ b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/SimulationDispatcherTest.java @@ -30,7 +30,7 @@ public class SimulationDispatcherTest { public static ExtendedLogger lg = LoggerContext.getContext().getLogger(SimulationDispatcher.class); private final static User testUser = DispatcherTestUtils.alice; - private MockSimulationDB mockSimulationDB = new MockSimulationDB(); + private final MockSimulationDB mockSimulationDB = new MockSimulationDB(); private final MockMessagingService mockMessagingServiceInternal = new MockMessagingService(); private final MockMessagingService mockMessagingServiceSim = new MockMessagingService(); private final MockHtcProxy mockHtcProxy = new MockHtcProxy(null, "htcUser", mockSimulationDB); @@ -82,8 +82,9 @@ public void onStopRequestTest() throws DataAccessException, SQLException { //###################### Test Dispatcher Thread ########################### + @Test - public void dispatcherThreadTest() throws SQLException, DataAccessException, InterruptedException, PropertyVetoException, MathException, ExpressionBindingException { + public void dispatcherThreadFailsJobsWithNoSimulationReference() throws SQLException, DataAccessException, InterruptedException { DispatcherTestUtils.insertOrUpdateStatus(mockSimulationDB, SimulationJobStatus.SchedulerStatus.WAITING); SimulationDispatcher simulationDispatcher = SimulationDispatcher.simulationDispatcherCreator(mockSimulationDB, 
mockMessagingServiceInternal, mockMessagingServiceSim, mockHtcProxy, true); @@ -91,6 +92,8 @@ public void dispatcherThreadTest() throws SQLException, DataAccessException, Int synchronized (thread.dispatcherNotifyObject){ thread.dispatcherNotifyObject.notify(); } + + // Check that the simulation is in waiting, for the dispatcher hasn't consumed it's request yet SimulationJobStatus jobStatus = mockSimulationDB.getLatestSimulationJobStatus(DispatcherTestUtils.simKey, 0); Assertions.assertTrue(jobStatus.getSchedulerStatus().isWaiting(), "Still waiting."); @@ -98,9 +101,18 @@ public void dispatcherThreadTest() throws SQLException, DataAccessException, Int thread.finishListener.wait(); } + // Makes sure that requests that have no simulation reference within the DB are failed jobStatus = mockSimulationDB.getLatestSimulationJobStatus(DispatcherTestUtils.simKey, 0); Assertions.assertTrue(jobStatus.getSchedulerStatus().isFailed(), "Simulation gets aborted since theres no simulation in DB."); + } + + @Test + public void dispatcherThreadDispatchesWaitingJobsWithSimulationsIn() throws SQLException, DataAccessException, InterruptedException, PropertyVetoException, MathException, ExpressionBindingException { + SimulationDispatcher simulationDispatcher = SimulationDispatcher.simulationDispatcherCreator(mockSimulationDB, mockMessagingServiceInternal, + mockMessagingServiceSim, mockHtcProxy, true); + SimulationDispatcher.DispatchThread thread = simulationDispatcher.dispatchThread; + // Create and insert simulation. 
Then ensure that this simulation has it's job status changed to dispatched Simulation mockSimulation = DispatcherTestUtils.createMockSimulation(20, 20, 20); mockSimulationDB.insertSimulation(DispatcherTestUtils.alice, mockSimulation); DispatcherTestUtils.insertOrUpdateStatus(mockSimulation.getKey(), DispatcherTestUtils.jobIndex, DispatcherTestUtils.taskID, DispatcherTestUtils.alice, @@ -112,14 +124,15 @@ public void dispatcherThreadTest() throws SQLException, DataAccessException, Int thread.finishListener.wait(); } - jobStatus = mockSimulationDB.getLatestSimulationJobStatus(mockSimulation.getKey(), 0); + SimulationJobStatus jobStatus = mockSimulationDB.getLatestSimulationJobStatus(mockSimulation.getKey(), 0); Assertions.assertTrue(jobStatus.getSchedulerStatus().isDispatched(), "Dispatches"); } //###################### Test Simulation Monitor ########################## - + // Rig the mock simulation DB to return a simulation job status that features some misbehavior of what's expected, + // which prompts for removal by the zombie killer @Test public void zombieKillerTest() throws SQLException, DataAccessException, InterruptedException, IOException { SimulationDispatcher.INITIAL_ZOMBIE_DELAY = 10; @@ -146,7 +159,7 @@ public void zombieKillerTest() throws SQLException, DataAccessException, Interru } @Test - public void queueFlusherTest() throws SQLException, DataAccessException, InterruptedException { + public void queueFlusherKillIdleJobs() throws SQLException, DataAccessException, InterruptedException { SimulationDispatcher simulationDispatcher = SimulationDispatcher.simulationDispatcherCreator(mockSimulationDB, mockMessagingServiceInternal, mockMessagingServiceSim, mockHtcProxy, false); DispatcherTestUtils.insertOrUpdateStatus(mockSimulationDB); @@ -177,16 +190,20 @@ public void queueFlusherTest() throws SQLException, DataAccessException, Interru Assertions.assertTrue(mockHtcProxy.jobsKilledUnsafely.contains(status.getSimulationExecutionStatus().getHtcJobID())); 
Assertions.assertTrue(logOutPut.toString().contains(SimulationDispatcher.SimulationMonitor.QueueFlusher.timeOutFailure)); - // reset for next test - simulationDispatcher.simDispatcherEngine.resetTimeStamps(); - mockHtcProxy.jobsKilledUnsafely.clear(); - mockSimulationDB.resetDataBase(); + } + + @Test + public void queueFlusherKillsNoLongerReferencedSims() throws SQLException, DataAccessException, InterruptedException { + SimulationDispatcher simulationDispatcher = SimulationDispatcher.simulationDispatcherCreator(mockSimulationDB, mockMessagingServiceInternal, + mockMessagingServiceSim, mockHtcProxy, false); + SimulationDispatcher.SimulationMonitor simMonitor = simulationDispatcher.simMonitor; + SimulationDispatcher.SimulationMonitor.QueueFlusher queueFlusher = simMonitor.initialQueueFlusher; mockSimulationDB.insertUnreferencedSimKey(DispatcherTestUtils.simKey); DispatcherTestUtils.insertOrUpdateStatus(mockSimulationDB); - queueThread = new Thread(queueFlusher); + Thread queueThread = new Thread(queueFlusher); queueThread.start(); - retries = 0; + int retries = 0; while (queueThread.getState() != Thread.State.TIMED_WAITING){ if (retries == 10){ break; @@ -200,7 +217,7 @@ public void queueFlusherTest() throws SQLException, DataAccessException, Interru synchronized (queueFlusher.finishListener){ queueFlusher.finishListener.wait(); } - status = mockSimulationDB.getLatestSimulationJobStatus(DispatcherTestUtils.simKey, DispatcherTestUtils.jobIndex); + SimulationJobStatus status = mockSimulationDB.getLatestSimulationJobStatus(DispatcherTestUtils.simKey, DispatcherTestUtils.jobIndex); Assertions.assertTrue(status.getSchedulerStatus().isFailed()); Assertions.assertTrue(mockHtcProxy.jobsKilledUnsafely.contains(status.getSimulationExecutionStatus().getHtcJobID())); Assertions.assertTrue(logOutPut.toString().contains(SimulationDispatcher.SimulationMonitor.QueueFlusher.unreferencedFailure)); From f17b6aef9632613737b0e10ec0c1306f192a5e6c Mon Sep 17 00:00:00 2001 From: Ezequiel 
Valencia Date: Tue, 8 Oct 2024 09:07:15 -0400 Subject: [PATCH 10/16] Build Webapp Docker Image for Minikube Remote --- docker/build/build.sh | 2 + webapp-ng/Dockerfile-webapp-remote | 24 ++++++++++ webapp-ng/angular.json | 29 ++++++++++++ webapp-ng/package.json | 1 + .../src/environments/environment.remote.ts | 44 +++++++++++++++++++ 5 files changed, 100 insertions(+) create mode 100644 webapp-ng/Dockerfile-webapp-remote create mode 100644 webapp-ng/src/environments/environment.remote.ts diff --git a/docker/build/build.sh b/docker/build/build.sh index 600c90beca..e63d040cd8 100755 --- a/docker/build/build.sh +++ b/docker/build/build.sh @@ -130,6 +130,8 @@ build_webapp() { if [[ $? -ne 0 ]]; then echo "failed to build prod"; exit 1; fi build_webapp_common island if [[ $? -ne 0 ]]; then echo "failed to build island"; exit 1; fi + build_webapp_common remote + if [[ $? -ne 0 ]]; then echo "failed to build remote"; exit 1; fi } build_batch() { diff --git a/webapp-ng/Dockerfile-webapp-remote b/webapp-ng/Dockerfile-webapp-remote new file mode 100644 index 0000000000..ded448c15f --- /dev/null +++ b/webapp-ng/Dockerfile-webapp-remote @@ -0,0 +1,24 @@ +# Build stage +FROM node:20.11-alpine3.19 AS build + +RUN apk update && apk add git + +RUN mkdir -p /app + +WORKDIR /app + +COPY package.json . +COPY package-lock.json . + +RUN npm install --legacy-peer-deps + +COPY . . 
+ +RUN npm run build_remote + +# ----------------- + +FROM nginx:1.17.1-alpine +COPY --from=build /app/dist/login-demo /usr/share/nginx/html +COPY ./nginx-custom.conf /etc/nginx/conf.d/default.conf +EXPOSE 80 diff --git a/webapp-ng/angular.json b/webapp-ng/angular.json index 3392559567..024c5f9592 100644 --- a/webapp-ng/angular.json +++ b/webapp-ng/angular.json @@ -139,6 +139,32 @@ "maximumWarning": "6kb" } ] + }, + "configuration_remote": { + "fileReplacements": [ + { + "replace": "src/environments/environment.ts", + "with": "src/environments/environment.remote.ts" + } + ], + "optimization": true, + "outputHashing": "all", + "sourceMap": false, + "namedChunks": false, + "extractLicenses": true, + "vendorChunk": false, + "buildOptimizer": true, + "budgets": [ + { + "type": "initial", + "maximumWarning": "2mb", + "maximumError": "5mb" + }, + { + "type": "anyComponentStyle", + "maximumWarning": "6kb" + } + ] } }, @@ -161,6 +187,9 @@ }, "configuration_island": { "buildTarget": "login-demo:build:configuration_island" + }, + "configuration_remote": { + "buildTarget": "login-demo:build:configuration_remote" } } }, diff --git a/webapp-ng/package.json b/webapp-ng/package.json index debb8a544e..46cc4e5224 100644 --- a/webapp-ng/package.json +++ b/webapp-ng/package.json @@ -9,6 +9,7 @@ "build_dev": "ng build -c configuration_dev", "build_stage": "ng build -c configuration_stage", "build_island": "ng build -c configuration_island", + "build_remote": "ng build -c configuration_remote", "test": "ng test", "test:ci": "ng test --no-watch --no-progress --browsers=ChromeHeadlessCI", "lint": "ng lint", diff --git a/webapp-ng/src/environments/environment.remote.ts b/webapp-ng/src/environments/environment.remote.ts new file mode 100644 index 0000000000..30153365c1 --- /dev/null +++ b/webapp-ng/src/environments/environment.remote.ts @@ -0,0 +1,44 @@ +import config from '../../auth_config.json'; + +const { domain, clientId, authorizationParams: { audience }, apiUri, errorPath } = config 
as { + domain: string; + clientId: string; + authorizationParams: { + audience?: string; + }, + apiUri: string; + errorPath: string; +}; + +export const environment = { + production: true, + auth: { + domain, + clientId, + authorizationParams: { + audience: `${audience}`, + redirect_uri: window.location.origin, + }, + errorPath, + }, + apiUri: `${apiUri}`, + httpInterceptor: { + allowedList: [ + { + // uri: `${config.apiUri}/api/*`, + // uri: `${apiUri}/api/*`, + // uri: '/api/*', + uri: 'https://minikube-remote/api/*', + + // allowAnonymous: true, + tokenOptions: { + authorizationParams: { + audience: `${audience}`, + scope: 'openid profile email' + } + } + }, + ], + + }, +}; From 9e863fe4d0bc43850b5829ad3cdb91b52d6600e3 Mon Sep 17 00:00:00 2001 From: Ezequiel Valencia Date: Wed, 9 Oct 2024 10:37:39 -0400 Subject: [PATCH 11/16] Grab Min Memory Property When Required --- .../java/cbit/vcell/message/server/htc/HtcProxy.java | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vcell-server/src/main/java/cbit/vcell/message/server/htc/HtcProxy.java b/vcell-server/src/main/java/cbit/vcell/message/server/htc/HtcProxy.java index 3cfb5ca9bd..9b0245a154 100644 --- a/vcell-server/src/main/java/cbit/vcell/message/server/htc/HtcProxy.java +++ b/vcell-server/src/main/java/cbit/vcell/message/server/htc/HtcProxy.java @@ -242,8 +242,6 @@ public static String toUnixStyleText(String javaString) throws IOException { public abstract String getSubmissionFileExtension(); public static class MemLimitResults { - private static final long FALLBACK_MEM_LIMIT_MB= Integer.parseInt(PropertyLoader.getRequiredProperty(PropertyLoader.htcMinMemoryMB)); // MAX memory allowed if not set in limitFile, currently 4g - private static final long POWER_USER_MEMORY_FLOOR=Integer.parseInt(PropertyLoader.getRequiredProperty(PropertyLoader.htcPowerUserMemoryFloorMB)); // MIN memory allowed if declared to be a power user, currently 50g private long memLimit; private String memLimitSource; 
public MemLimitResults(long memLimit, String memLimitSource) { @@ -258,15 +256,16 @@ public String getMemLimitSource() { return memLimitSource; } private static MemLimitResults getJobRequestedMemoryLimit(SolverDescription solverDescription, double estimatedMemSizeMB, boolean isPowerUser) { - long batchJobMemoryLimit = FALLBACK_MEM_LIMIT_MB; + long batchJobMemoryLimit = Integer.parseInt(PropertyLoader.getRequiredProperty(PropertyLoader.htcMinMemoryMB)); // MAX memory allowed if not set in limitFile, currently 4g String detailedMessage = "default memory limit"; if(estimatedMemSizeMB > batchJobMemoryLimit) {//Use estimated if bigger batchJobMemoryLimit = (long)estimatedMemSizeMB; detailedMessage = "used Estimated"; } - if (isPowerUser && batchJobMemoryLimit < POWER_USER_MEMORY_FLOOR){ - batchJobMemoryLimit = POWER_USER_MEMORY_FLOOR; + long powerUserMemory = Integer.parseInt(PropertyLoader.getRequiredProperty(PropertyLoader.htcPowerUserMemoryFloorMB)); // MIN memory allowed if declared to be a power user, currently 50g + if (isPowerUser && batchJobMemoryLimit < powerUserMemory){ + batchJobMemoryLimit = powerUserMemory; detailedMessage = "poweruser's memory override"; } From c3b69779d78b2cdaea75143f4c96ccb6a4387057 Mon Sep 17 00:00:00 2001 From: Ezequiel Valencia Date: Wed, 9 Oct 2024 10:38:51 -0400 Subject: [PATCH 12/16] Add Required Properties for Submit and Post-Processor --- docker/build/Dockerfile-submit-dev | 3 +++ .../vcell/message/server/batch/sim/HtcSimulationWorker.java | 5 ++++- .../vcell/message/server/batch/sim/SolverPostprocessor.java | 6 +++++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/docker/build/Dockerfile-submit-dev b/docker/build/Dockerfile-submit-dev index 9df249c39b..6e6c2b0b43 100644 --- a/docker/build/Dockerfile-submit-dev +++ b/docker/build/Dockerfile-submit-dev @@ -170,4 +170,7 @@ ENTRYPOINT java \ -Dvcell.simdatadir.archive.external=${simdatadir_archive_external} \ -Dvcell.ssh.cmd.cmdtimeout=${vcell_ssh_cmd_cmdtimeout} \ 
-Dvcell.ssh.cmd.restoretimeout=${vcell_ssh_cmd_restoretimeout} \ + -Dvcell.htc.memory.min.mb=${htcMinMemoryMB} \ + -Dvcell.htc.memory.max.mb=${htcMaxMemoryMB} \ + -Dvcell.htc.memory.pu.floor.mb=${htcPowerUserMemoryFloorMB} \ -cp "./lib/*" cbit.vcell.message.server.batch.sim.HtcSimulationWorker diff --git a/vcell-server/src/main/java/cbit/vcell/message/server/batch/sim/HtcSimulationWorker.java b/vcell-server/src/main/java/cbit/vcell/message/server/batch/sim/HtcSimulationWorker.java index 1ab3d64dfd..f2f73a7796 100644 --- a/vcell-server/src/main/java/cbit/vcell/message/server/batch/sim/HtcSimulationWorker.java +++ b/vcell-server/src/main/java/cbit/vcell/message/server/batch/sim/HtcSimulationWorker.java @@ -678,7 +678,10 @@ public static void main(String[] args) throws IOException { PropertyLoader.slurm_qos, PropertyLoader.slurm_partition_pu, PropertyLoader.slurm_reservation_pu, - PropertyLoader.slurm_qos_pu + PropertyLoader.slurm_qos_pu, + PropertyLoader.htcMinMemoryMB, + PropertyLoader.htcMaxMemoryMB, + PropertyLoader.htcPowerUserMemoryFloorMB }; diff --git a/vcell-server/src/main/java/cbit/vcell/message/server/batch/sim/SolverPostprocessor.java b/vcell-server/src/main/java/cbit/vcell/message/server/batch/sim/SolverPostprocessor.java index 8cd7c2667e..5adc524047 100644 --- a/vcell-server/src/main/java/cbit/vcell/message/server/batch/sim/SolverPostprocessor.java +++ b/vcell-server/src/main/java/cbit/vcell/message/server/batch/sim/SolverPostprocessor.java @@ -146,7 +146,11 @@ private static Exception runPostprocessingCommands(String filename, Logger lg) { } private static final String POST_PROCESSOR_PROPERTIES[] = { PropertyLoader.primarySimDataDirInternalProperty, - PropertyLoader.secondarySimDataDirInternalProperty + PropertyLoader.secondarySimDataDirInternalProperty, + PropertyLoader.mongodbDatabase, + PropertyLoader.jmsSimHostInternal, + PropertyLoader.jmsSimPortInternal, + PropertyLoader.jmsBlobMessageUseMongo }; } From a0f93eba3f0c49c8cd3d5fb56fdc63e9a5eec0e8 
Mon Sep 17 00:00:00 2001 From: jcschaff Date: Wed, 9 Oct 2024 10:55:18 -0400 Subject: [PATCH 13/16] update SlurmProxyTest fixtures for minor changes in script comment --- .../slurm_fixtures/adams_moulton/V_REL_274633859_0_0.slurm.sub | 2 +- .../slurm_fixtures/cvode/V_REL_274630682_0_0.slurm.sub | 2 +- .../slurm_fixtures/finite_volume/V_REL_274514696_0_0.slurm.sub | 2 +- .../slurm_fixtures/gibson/V_REL_274635122_0_0.slurm.sub | 2 +- .../gibson_milstein/V_REL_274641698_0_0.slurm.sub | 2 +- .../slurm_fixtures/langevin/V_REL_274672135_0_0.slurm.sub | 2 +- .../moving_boundary/V_REL_274641196_0_0.slurm.sub | 2 +- .../slurm_fixtures/nfsim/V_REL_274642453_0_0.slurm.sub | 2 +- .../runge_kutta_fehlberg/V_REL_274631114_0_0.slurm.sub | 2 +- .../slurm_fixtures/smoldyn/V_REL_274630052_0_0.slurm.sub | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/vcell-server/src/test/resources/slurm_fixtures/adams_moulton/V_REL_274633859_0_0.slurm.sub b/vcell-server/src/test/resources/slurm_fixtures/adams_moulton/V_REL_274633859_0_0.slurm.sub index 915146b753..e9cb49f785 100644 --- a/vcell-server/src/test/resources/slurm_fixtures/adams_moulton/V_REL_274633859_0_0.slurm.sub +++ b/vcell-server/src/test/resources/slurm_fixtures/adams_moulton/V_REL_274633859_0_0.slurm.sub @@ -8,7 +8,7 @@ #SBATCH --mem=4096M #SBATCH --no-kill #SBATCH --no-requeue -# VCell SlurmProxy memory limit source=default memory limit +# VCell SlurmProxy memory limit source='default memory limit' TMPDIR=/scratch/vcell diff --git a/vcell-server/src/test/resources/slurm_fixtures/cvode/V_REL_274630682_0_0.slurm.sub b/vcell-server/src/test/resources/slurm_fixtures/cvode/V_REL_274630682_0_0.slurm.sub index 8ff7e5e13f..1b2591f59d 100644 --- a/vcell-server/src/test/resources/slurm_fixtures/cvode/V_REL_274630682_0_0.slurm.sub +++ b/vcell-server/src/test/resources/slurm_fixtures/cvode/V_REL_274630682_0_0.slurm.sub @@ -8,7 +8,7 @@ #SBATCH --mem=4096M #SBATCH --no-kill #SBATCH --no-requeue -# VCell SlurmProxy memory 
limit source=default memory limit +# VCell SlurmProxy memory limit source='default memory limit' TMPDIR=/scratch/vcell diff --git a/vcell-server/src/test/resources/slurm_fixtures/finite_volume/V_REL_274514696_0_0.slurm.sub b/vcell-server/src/test/resources/slurm_fixtures/finite_volume/V_REL_274514696_0_0.slurm.sub index 6390257360..c253addba5 100644 --- a/vcell-server/src/test/resources/slurm_fixtures/finite_volume/V_REL_274514696_0_0.slurm.sub +++ b/vcell-server/src/test/resources/slurm_fixtures/finite_volume/V_REL_274514696_0_0.slurm.sub @@ -8,7 +8,7 @@ #SBATCH --mem=4096M #SBATCH --no-kill #SBATCH --no-requeue -# VCell SlurmProxy memory limit source=default memory limit +# VCell SlurmProxy memory limit source='default memory limit' TMPDIR=/scratch/vcell diff --git a/vcell-server/src/test/resources/slurm_fixtures/gibson/V_REL_274635122_0_0.slurm.sub b/vcell-server/src/test/resources/slurm_fixtures/gibson/V_REL_274635122_0_0.slurm.sub index 1d84678455..858d70d464 100644 --- a/vcell-server/src/test/resources/slurm_fixtures/gibson/V_REL_274635122_0_0.slurm.sub +++ b/vcell-server/src/test/resources/slurm_fixtures/gibson/V_REL_274635122_0_0.slurm.sub @@ -8,7 +8,7 @@ #SBATCH --mem=4096M #SBATCH --no-kill #SBATCH --no-requeue -# VCell SlurmProxy memory limit source=default memory limit +# VCell SlurmProxy memory limit source='default memory limit' TMPDIR=/scratch/vcell diff --git a/vcell-server/src/test/resources/slurm_fixtures/gibson_milstein/V_REL_274641698_0_0.slurm.sub b/vcell-server/src/test/resources/slurm_fixtures/gibson_milstein/V_REL_274641698_0_0.slurm.sub index a55a11e7b4..cbdb1552d7 100644 --- a/vcell-server/src/test/resources/slurm_fixtures/gibson_milstein/V_REL_274641698_0_0.slurm.sub +++ b/vcell-server/src/test/resources/slurm_fixtures/gibson_milstein/V_REL_274641698_0_0.slurm.sub @@ -8,7 +8,7 @@ #SBATCH --mem=4096M #SBATCH --no-kill #SBATCH --no-requeue -# VCell SlurmProxy memory limit source=default memory limit +# VCell SlurmProxy memory limit 
source='default memory limit' TMPDIR=/scratch/vcell diff --git a/vcell-server/src/test/resources/slurm_fixtures/langevin/V_REL_274672135_0_0.slurm.sub b/vcell-server/src/test/resources/slurm_fixtures/langevin/V_REL_274672135_0_0.slurm.sub index 1944ad5f24..3058343356 100644 --- a/vcell-server/src/test/resources/slurm_fixtures/langevin/V_REL_274672135_0_0.slurm.sub +++ b/vcell-server/src/test/resources/slurm_fixtures/langevin/V_REL_274672135_0_0.slurm.sub @@ -8,7 +8,7 @@ #SBATCH --mem=4096M #SBATCH --no-kill #SBATCH --no-requeue -# VCell SlurmProxy memory limit source=default memory limit +# VCell SlurmProxy memory limit source='default memory limit' TMPDIR=/scratch/vcell diff --git a/vcell-server/src/test/resources/slurm_fixtures/moving_boundary/V_REL_274641196_0_0.slurm.sub b/vcell-server/src/test/resources/slurm_fixtures/moving_boundary/V_REL_274641196_0_0.slurm.sub index 6814f4c465..4945567251 100644 --- a/vcell-server/src/test/resources/slurm_fixtures/moving_boundary/V_REL_274641196_0_0.slurm.sub +++ b/vcell-server/src/test/resources/slurm_fixtures/moving_boundary/V_REL_274641196_0_0.slurm.sub @@ -8,7 +8,7 @@ #SBATCH --mem=4096M #SBATCH --no-kill #SBATCH --no-requeue -# VCell SlurmProxy memory limit source=default memory limit +# VCell SlurmProxy memory limit source='default memory limit' TMPDIR=/scratch/vcell diff --git a/vcell-server/src/test/resources/slurm_fixtures/nfsim/V_REL_274642453_0_0.slurm.sub b/vcell-server/src/test/resources/slurm_fixtures/nfsim/V_REL_274642453_0_0.slurm.sub index 5066f18768..c7f19aa926 100644 --- a/vcell-server/src/test/resources/slurm_fixtures/nfsim/V_REL_274642453_0_0.slurm.sub +++ b/vcell-server/src/test/resources/slurm_fixtures/nfsim/V_REL_274642453_0_0.slurm.sub @@ -8,7 +8,7 @@ #SBATCH --mem=4096M #SBATCH --no-kill #SBATCH --no-requeue -# VCell SlurmProxy memory limit source=default memory limit +# VCell SlurmProxy memory limit source='default memory limit' TMPDIR=/scratch/vcell diff --git 
a/vcell-server/src/test/resources/slurm_fixtures/runge_kutta_fehlberg/V_REL_274631114_0_0.slurm.sub b/vcell-server/src/test/resources/slurm_fixtures/runge_kutta_fehlberg/V_REL_274631114_0_0.slurm.sub index c60f96d905..095da3b7b5 100644 --- a/vcell-server/src/test/resources/slurm_fixtures/runge_kutta_fehlberg/V_REL_274631114_0_0.slurm.sub +++ b/vcell-server/src/test/resources/slurm_fixtures/runge_kutta_fehlberg/V_REL_274631114_0_0.slurm.sub @@ -8,7 +8,7 @@ #SBATCH --mem=4096M #SBATCH --no-kill #SBATCH --no-requeue -# VCell SlurmProxy memory limit source=default memory limit +# VCell SlurmProxy memory limit source='default memory limit' TMPDIR=/scratch/vcell diff --git a/vcell-server/src/test/resources/slurm_fixtures/smoldyn/V_REL_274630052_0_0.slurm.sub b/vcell-server/src/test/resources/slurm_fixtures/smoldyn/V_REL_274630052_0_0.slurm.sub index ae6b433cb7..087bc864e5 100644 --- a/vcell-server/src/test/resources/slurm_fixtures/smoldyn/V_REL_274630052_0_0.slurm.sub +++ b/vcell-server/src/test/resources/slurm_fixtures/smoldyn/V_REL_274630052_0_0.slurm.sub @@ -8,7 +8,7 @@ #SBATCH --mem=4096M #SBATCH --no-kill #SBATCH --no-requeue -# VCell SlurmProxy memory limit source=default memory limit +# VCell SlurmProxy memory limit source='default memory limit' TMPDIR=/scratch/vcell From 2266176eb67d92c4504e0a9b366fc98b3ed0d295 Mon Sep 17 00:00:00 2001 From: Ezequiel Valencia Date: Thu, 10 Oct 2024 09:14:49 -0400 Subject: [PATCH 14/16] Ensure Power Users get Extra Memory --- .../server/dispatcher/SimulationStateMachine.java | 4 ++++ .../server/dispatcher/DispatcherTestUtils.java | 15 ++++++++++++--- .../server/dispatcher/MockSimulationDB.java | 4 +++- .../dispatcher/SimulationStateMachineTest.java | 8 ++++++++ 4 files changed, 27 insertions(+), 4 deletions(-) diff --git a/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationStateMachine.java b/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationStateMachine.java index 
cb1b5c2c8f..d99139e348 100644 --- a/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationStateMachine.java +++ b/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationStateMachine.java @@ -466,6 +466,10 @@ public synchronized void onDispatch(Simulation simulation, SimulationJobStatus o double htcMaxMemoryMB = Integer.parseInt(PropertyLoader.getRequiredProperty(PropertyLoader.htcMaxMemoryMB)); double requestedMemoryMB = Math.max(estimatedMemMB, htcMinMemoryMB); + if (isPowerUser){ + htcMaxMemoryMB = Integer.parseInt(PropertyLoader.getRequiredProperty(PropertyLoader.htcPowerUserMemoryFloorMB)); + } + final SimulationJobStatus newSimJobStatus; if (requestedMemoryMB > htcMaxMemoryMB) { // diff --git a/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/DispatcherTestUtils.java b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/DispatcherTestUtils.java index 2167e1f5ee..1f536bbaa3 100644 --- a/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/DispatcherTestUtils.java +++ b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/DispatcherTestUtils.java @@ -13,11 +13,13 @@ import cbit.vcell.solver.Simulation; import cbit.vcell.solver.VCSimulationIdentifier; import cbit.vcell.solver.server.SimulationMessage; +import org.joda.time.DateTime; import org.vcell.util.DataAccessException; import org.vcell.util.ISize; import org.vcell.util.document.*; import java.beans.PropertyVetoException; +import java.math.BigDecimal; import java.sql.SQLException; import java.time.Instant; import java.util.Date; @@ -116,15 +118,18 @@ public static void restoreRequiredProperties(){ PropertyLoader.setProperty(PropertyLoader.maxPdeJobsPerUser, previousPdeJobsPerUser); } - public static Simulation createMockSimulation(int iSizeX, int iSizeY, int iSizeZ) throws PropertyVetoException, MathException, ExpressionBindingException { + public static Simulation createMockSimulation(int iSizeX, int iSizeY, int iSizeZ, 
User user) throws PropertyVetoException, MathException, ExpressionBindingException { VolVariable volVariable = new VolVariable("t", new Variable.Domain(new CompartmentSubDomain("t", 1))); VolVariable volVariable2 = new VolVariable("b", new Variable.Domain(new CompartmentSubDomain("b", 2))); MathSymbolMapping mathSymbolMapping = new MathSymbolMapping(); Geometry geometry = new Geometry("T", 3); - MathModel mathModel = new MathModel(new Version("Test", alice)); + MathModel mathModel = new MathModel(new Version("Test", user)); MathDescription mathDescription = new MathDescription("Test", mathSymbolMapping); mathDescription.setGeometry(new Geometry("T", 3)); - Simulation simulation = new Simulation(SimulationVersion.createTempSimulationVersion(), + SimulationVersion simulationVersion = new SimulationVersion(new KeyValue("5"), "Test", user, + new GroupAccessNone(), null, new BigDecimal("2"), Date.from(Instant.now()), VersionFlag.fromInt(1), + "", new KeyValue("3")); + Simulation simulation = new Simulation(simulationVersion, mathDescription, mathModel); MeshSpecification meshSpecification = new MeshSpecification(geometry); meshSpecification.setSamplingSize(new ISize(iSizeX, iSizeY, iSizeZ)); @@ -133,6 +138,10 @@ public static Simulation createMockSimulation(int iSizeX, int iSizeY, int iSizeZ return simulation; } + public static Simulation createMockSimulation(int iSizeX, int iSizeY, int iSizeZ) throws PropertyVetoException, MathException, ExpressionBindingException { + return createMockSimulation(iSizeX, iSizeY, iSizeZ, alice); + } + public static void insertOrUpdateStatus(KeyValue simKey, int jobIndex, int taskID, User user, SimulationJobStatus.SchedulerStatus status, SimulationDatabase simulationDB) throws SQLException, DataAccessException { SimulationJobStatus jobStatus = simulationDB.getLatestSimulationJobStatus(simKey, jobIndex); VCSimulationIdentifier simID = new VCSimulationIdentifier(simKey, user); diff --git 
a/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockSimulationDB.java b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockSimulationDB.java index c9a03cbb5d..4f2af94a82 100644 --- a/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockSimulationDB.java +++ b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockSimulationDB.java @@ -19,9 +19,11 @@ public class MockSimulationDB implements SimulationDatabase{ private HashMap> dbTable = new HashMap<>(); public static User.SpecialUser specialAdmin = new User.SpecialUser("Tom", new KeyValue("999"), new User.SPECIAL_CLAIM[User.SPECIAL_CLAIM.admins.ordinal()]); + public static User.SpecialUser specialUser = new User.SpecialUser("Tim", new KeyValue("2"), new User.SPECIAL_CLAIM[]{User.SpecialUser.SPECIAL_CLAIM.powerUsers}); private final HashMap users = new HashMap<>(){ - {put(specialAdmin.getName(), specialAdmin); put(DispatcherTestUtils.alice.getName(), DispatcherTestUtils.alice);} + {put(specialAdmin.getName(), specialAdmin); put(DispatcherTestUtils.alice.getName(), DispatcherTestUtils.alice); + put(specialUser.getName(), specialUser);} }; private final HashMap simulations = new HashMap<>(); diff --git a/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/SimulationStateMachineTest.java b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/SimulationStateMachineTest.java index c181b0762a..0cdaa52532 100644 --- a/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/SimulationStateMachineTest.java +++ b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/SimulationStateMachineTest.java @@ -191,6 +191,14 @@ public void stateShouldTransitionToDispatched() throws SQLException, DataAccessE jobStatus = getLatestJobSubmission(); Assertions.assertTrue(jobStatus.getSchedulerStatus().isDispatched()); Assertions.assertTrue(getClientTopicMessage().getSchedulerStatus().isDispatched()); + + 
DispatcherTestUtils.insertOrUpdateStatus(simKey, jobIndex, taskID, MockSimulationDB.specialUser, SimulationJobStatus.SchedulerStatus.WAITING, simulationDB); + simulation = DispatcherTestUtils.createMockSimulation(900, 900, 900, MockSimulationDB.specialUser); + simulation.getSolverTaskDescription().setTimeoutDisabled(true); + stateMachine.onDispatch(simulation, getLatestJobSubmission(), simulationDB, testMessageSession); + jobStatus = getLatestJobSubmission(); + Assertions.assertTrue(jobStatus.getSchedulerStatus().isDispatched()); + Assertions.assertTrue(getClientTopicMessage().getSchedulerStatus().isDispatched()); } @Test From acd44a5210aa4b242f39fb6dd57565e037af56d0 Mon Sep 17 00:00:00 2001 From: Ezequiel Valencia Date: Fri, 11 Oct 2024 07:41:44 -0400 Subject: [PATCH 15/16] Include Power User Memory Limit and Test --- docker/build/Dockerfile-sched-dev | 4 +++- docker/build/Dockerfile-submit-dev | 7 ++++++- .../java/cbit/vcell/resource/PropertyLoader.java | 1 + .../server/batch/sim/HtcSimulationWorker.java | 3 ++- .../server/dispatcher/SimulationDispatcherMain.java | 4 +++- .../server/dispatcher/SimulationStateMachine.java | 6 +++--- .../server/dispatcher/DispatcherTestUtils.java | 5 +++++ .../message/server/dispatcher/MockSimulationDB.java | 4 ++-- .../dispatcher/SimulationStateMachineTest.java | 13 ++++++++++--- 9 files changed, 35 insertions(+), 12 deletions(-) diff --git a/docker/build/Dockerfile-sched-dev b/docker/build/Dockerfile-sched-dev index 47f831de56..08e0386838 100644 --- a/docker/build/Dockerfile-sched-dev +++ b/docker/build/Dockerfile-sched-dev @@ -71,7 +71,8 @@ ENV softwareVersion=SOFTWARE-VERSION-NOT-SET \ maxPdeJobsPerUser="max-pde-jobs-per-user-not-set" \ htcMinMemoryMB="htc-min-memory-not-set" \ htcMaxMemoryMB="htc-max-memory-not-set" \ - htcPowerUserMemoryFloorMB="htc-power-user-memory-floor-not-set" + htcPowerUserMemoryFloorMB="htc-power-user-memory-floor-not-set" \ + htcPowerUserMemoryMaxMB="htc-power-user-memory-max-not-set" ENV 
dbpswdfile=/run/secrets/dbpswd \ jmspswdfile=/run/secrets/jmspswd \ @@ -125,4 +126,5 @@ ENTRYPOINT java \ -Dvcell.htc.memory.min.mb=${htcMinMemoryMB} \ -Dvcell.htc.memory.max.mb=${htcMaxMemoryMB} \ -Dvcell.htc.memory.pu.floor.mb=${htcPowerUserMemoryFloorMB} \ + -Dvcell.htc.memory.pu.max.mb=${htcPowerUserMemoryMaxMB} \ -cp "./lib/*" cbit.vcell.message.server.dispatcher.SimulationDispatcherMain diff --git a/docker/build/Dockerfile-submit-dev b/docker/build/Dockerfile-submit-dev index 6e6c2b0b43..438ebad206 100644 --- a/docker/build/Dockerfile-submit-dev +++ b/docker/build/Dockerfile-submit-dev @@ -91,7 +91,11 @@ ENV softwareVersion=SOFTWARE-VERSION-NOT-SET \ vcell_ssh_cmd_cmdtimeout="cmdSrvcSshCmdTimeoutMS-not-set" \ vcell_ssh_cmd_restoretimeout="cmdSrvcSshCmdRestoreTimeoutFactor-not-set" \ simdatadir_archive_external="simdatadir_archive_external-not-set" \ - simdatadir_archive_internal="simdatadir_archive_internal-not-set" + simdatadir_archive_internal="simdatadir_archive_internal-not-set" \ + htcMinMemoryMB="htc-min-memory-not-set" \ + htcMaxMemoryMB="htc-max-memory-not-set" \ + htcPowerUserMemoryFloorMB="htc-power-user-memory-floor-not-set" \ + htcPowerUserMemoryMaxMB="htc-power-user-memory-max-not-set" ENV jmspswdfile=/run/secrets/jmspswd \ jmsrestpswdfile=/run/secrets/jmsrestpswd \ @@ -173,4 +177,5 @@ ENTRYPOINT java \ -Dvcell.htc.memory.min.mb=${htcMinMemoryMB} \ -Dvcell.htc.memory.max.mb=${htcMaxMemoryMB} \ -Dvcell.htc.memory.pu.floor.mb=${htcPowerUserMemoryFloorMB} \ + -Dvcell.htc.memory.pu.max.mb=${htcPowerUserMemoryMaxMB} \ -cp "./lib/*" cbit.vcell.message.server.batch.sim.HtcSimulationWorker diff --git a/vcell-core/src/main/java/cbit/vcell/resource/PropertyLoader.java b/vcell-core/src/main/java/cbit/vcell/resource/PropertyLoader.java index 6f9db95278..2c5539362c 100644 --- a/vcell-core/src/main/java/cbit/vcell/resource/PropertyLoader.java +++ b/vcell-core/src/main/java/cbit/vcell/resource/PropertyLoader.java @@ -83,6 +83,7 @@ public static void 
setConfigProvider(VCellConfigProvider configProvider) { public static final String htcMinMemoryMB = record("vcell.htc.memory.min.mb", ValueType.INT); // minimum memory request in MB, currently 4g public static final String htcMaxMemoryMB = record("vcell.htc.memory.max.mb", ValueType.INT); // maximum memory request in MB public static final String htcPowerUserMemoryFloorMB = record("vcell.htc.memory.pu.floor.mb", ValueType.INT); // MIN memory allowed if declared to be a power user, currently 50g (Previously Existing Value) + public static final String htcPowerUserMemoryMaxMB = record("vcell.htc.memory.pu.max.mb", ValueType.INT); // MAX memory allowed if declared to be a power user public static final String htc_vcellfvsolver_docker_name = record("vcell.htc.vcellfvsolver.docker.name",ValueType.GEN); public static final String htc_vcellfvsolver_solver_list = record("vcell.htc.vcellfvsolver.solver.list",ValueType.GEN); diff --git a/vcell-server/src/main/java/cbit/vcell/message/server/batch/sim/HtcSimulationWorker.java b/vcell-server/src/main/java/cbit/vcell/message/server/batch/sim/HtcSimulationWorker.java index f2f73a7796..63575003d7 100644 --- a/vcell-server/src/main/java/cbit/vcell/message/server/batch/sim/HtcSimulationWorker.java +++ b/vcell-server/src/main/java/cbit/vcell/message/server/batch/sim/HtcSimulationWorker.java @@ -681,7 +681,8 @@ public static void main(String[] args) throws IOException { PropertyLoader.slurm_qos_pu, PropertyLoader.htcMinMemoryMB, PropertyLoader.htcMaxMemoryMB, - PropertyLoader.htcPowerUserMemoryFloorMB + PropertyLoader.htcPowerUserMemoryFloorMB, + PropertyLoader.htcPowerUserMemoryMaxMB }; diff --git a/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationDispatcherMain.java b/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationDispatcherMain.java index a23560f10b..7de4d645b7 100644 --- a/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationDispatcherMain.java +++ 
b/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationDispatcherMain.java @@ -73,7 +73,9 @@ public static void main(String[] args) { PropertyLoader.maxJobsPerScan, PropertyLoader.maxOdeJobsPerUser, PropertyLoader.maxPdeJobsPerUser, - PropertyLoader.slurm_partition + PropertyLoader.slurm_partition, + PropertyLoader.htcPowerUserMemoryMaxMB, + PropertyLoader.htcMaxMemoryMB }; } diff --git a/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationStateMachine.java b/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationStateMachine.java index d99139e348..1f318017d6 100644 --- a/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationStateMachine.java +++ b/vcell-server/src/main/java/cbit/vcell/message/server/dispatcher/SimulationStateMachine.java @@ -464,11 +464,11 @@ public synchronized void onDispatch(Simulation simulation, SimulationJobStatus o double estimatedMemMB = simulationTask.getEstimatedMemorySizeMB(); double htcMinMemoryMB = Integer.parseInt(PropertyLoader.getRequiredProperty(PropertyLoader.htcMinMemoryMB)); double htcMaxMemoryMB = Integer.parseInt(PropertyLoader.getRequiredProperty(PropertyLoader.htcMaxMemoryMB)); - double requestedMemoryMB = Math.max(estimatedMemMB, htcMinMemoryMB); - if (isPowerUser){ - htcMaxMemoryMB = Integer.parseInt(PropertyLoader.getRequiredProperty(PropertyLoader.htcPowerUserMemoryFloorMB)); + htcMinMemoryMB = Integer.parseInt(PropertyLoader.getRequiredProperty(PropertyLoader.htcPowerUserMemoryFloorMB)); + htcMaxMemoryMB = Integer.parseInt(PropertyLoader.getRequiredProperty(PropertyLoader.htcPowerUserMemoryMaxMB)); } + double requestedMemoryMB = Math.max(estimatedMemMB, htcMinMemoryMB); final SimulationJobStatus newSimJobStatus; if (requestedMemoryMB > htcMaxMemoryMB) { diff --git a/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/DispatcherTestUtils.java 
b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/DispatcherTestUtils.java index 1f536bbaa3..379e34d114 100644 --- a/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/DispatcherTestUtils.java +++ b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/DispatcherTestUtils.java @@ -29,6 +29,7 @@ public class DispatcherTestUtils { private static String previousHtcMax = ""; private static String previousHtcMin = ""; private static String previousHtcPowerFloor = ""; + private static String previousHtcPowerMax = ""; private static String previousMongoBlob = ""; private static String previousJMSIntHostProperty = ""; private static String previousJMSIntPortProperty = ""; @@ -64,6 +65,9 @@ public static void setRequiredProperties(){ previousHtcPowerFloor = PropertyLoader.getProperty(PropertyLoader.htcPowerUserMemoryFloorMB, ""); PropertyLoader.setProperty(PropertyLoader.htcPowerUserMemoryFloorMB, "51200"); + previousHtcPowerMax = PropertyLoader.getProperty(PropertyLoader.htcPowerUserMemoryMaxMB, ""); + PropertyLoader.setProperty(PropertyLoader.htcPowerUserMemoryMaxMB, "64000"); + previousMongoBlob = PropertyLoader.getProperty(PropertyLoader.jmsBlobMessageUseMongo, ""); PropertyLoader.setProperty(PropertyLoader.jmsBlobMessageUseMongo, ""); @@ -105,6 +109,7 @@ public static void restoreRequiredProperties(){ PropertyLoader.setProperty(PropertyLoader.htcMaxMemoryMB, previousHtcMax); PropertyLoader.setProperty(PropertyLoader.htcMinMemoryMB, previousHtcMin); PropertyLoader.setProperty(PropertyLoader.htcPowerUserMemoryFloorMB, previousHtcPowerFloor); + PropertyLoader.setProperty(PropertyLoader.htcPowerUserMemoryMaxMB, previousHtcPowerMax); PropertyLoader.setProperty(PropertyLoader.jmsBlobMessageUseMongo, previousMongoBlob); PropertyLoader.setProperty(PropertyLoader.jmsIntPortInternal, previousJMSIntPortProperty); PropertyLoader.setProperty(PropertyLoader.jmsIntHostInternal, previousJMSIntHostProperty); diff --git 
a/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockSimulationDB.java b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockSimulationDB.java index 4f2af94a82..30fe536545 100644 --- a/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockSimulationDB.java +++ b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/MockSimulationDB.java @@ -19,11 +19,11 @@ public class MockSimulationDB implements SimulationDatabase{ private HashMap> dbTable = new HashMap<>(); public static User.SpecialUser specialAdmin = new User.SpecialUser("Tom", new KeyValue("999"), new User.SPECIAL_CLAIM[User.SPECIAL_CLAIM.admins.ordinal()]); - public static User.SpecialUser specialUser = new User.SpecialUser("Tim", new KeyValue("2"), new User.SPECIAL_CLAIM[]{User.SpecialUser.SPECIAL_CLAIM.powerUsers}); + public static User.SpecialUser powerUser = new User.SpecialUser("Tim", new KeyValue("2"), new User.SPECIAL_CLAIM[]{User.SpecialUser.SPECIAL_CLAIM.powerUsers}); private final HashMap users = new HashMap<>(){ {put(specialAdmin.getName(), specialAdmin); put(DispatcherTestUtils.alice.getName(), DispatcherTestUtils.alice); - put(specialUser.getName(), specialUser);} + put(powerUser.getName(), powerUser);} }; private final HashMap simulations = new HashMap<>(); diff --git a/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/SimulationStateMachineTest.java b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/SimulationStateMachineTest.java index 0cdaa52532..754624a7ae 100644 --- a/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/SimulationStateMachineTest.java +++ b/vcell-server/src/test/java/cbit/vcell/message/server/dispatcher/SimulationStateMachineTest.java @@ -14,7 +14,6 @@ import org.vcell.util.DataAccessException; import org.vcell.util.document.KeyValue; import org.vcell.util.document.User; -import org.vcell.util.document.VCellServerID; import java.beans.PropertyVetoException; import 
java.sql.SQLException; @@ -143,6 +142,8 @@ public void stateShouldTransitionToFailure() throws SQLException, DataAccessExce // Simulation memoryIntensiveSimulation = DispatcherTestUtils.createMockSimulation(900, 900, 900); + Simulation powerMemoryIntensiveSimulation = DispatcherTestUtils.createMockSimulation(9000, 9000, 5000, MockSimulationDB.powerUser); + powerMemoryIntensiveSimulation.getSolverTaskDescription().setTimeoutDisabled(true); DispatcherTestUtils.insertOrUpdateStatus(simulationDB); Assertions.assertThrows(RuntimeException.class, @@ -156,6 +157,12 @@ public void stateShouldTransitionToFailure() throws SQLException, DataAccessExce Assertions.assertTrue(jobStatus.getSchedulerStatus().isFailed(), "Memory size too large"); Assertions.assertTrue(getClientTopicMessage().getSchedulerStatus().isFailed(), "Failed because of memory size."); + DispatcherTestUtils.insertOrUpdateStatus(simKey, jobIndex, taskID, MockSimulationDB.powerUser, SimulationJobStatus.SchedulerStatus.WAITING, simulationDB); + stateMachine.onDispatch(powerMemoryIntensiveSimulation, getLatestJobSubmission(), simulationDB, testMessageSession); + jobStatus = getLatestJobSubmission(); + Assertions.assertTrue(jobStatus.getSchedulerStatus().isFailed(), "Memory size too large"); + Assertions.assertTrue(getClientTopicMessage().getSchedulerStatus().isFailed(), "Failed because of memory size."); + DispatcherTestUtils.insertOrUpdateStatus(simulationDB); statusMessage = stateMachine.onStopRequest(DispatcherTestUtils.bob, getLatestJobSubmission(), simulationDB, testMessageSession); Assertions.assertTrue(statusMessage.getSimulationJobStatus().getSchedulerStatus().isFailed(), "Stopping as another user."); @@ -192,8 +199,8 @@ public void stateShouldTransitionToDispatched() throws SQLException, DataAccessE Assertions.assertTrue(jobStatus.getSchedulerStatus().isDispatched()); Assertions.assertTrue(getClientTopicMessage().getSchedulerStatus().isDispatched()); - DispatcherTestUtils.insertOrUpdateStatus(simKey, 
jobIndex, taskID, MockSimulationDB.specialUser, SimulationJobStatus.SchedulerStatus.WAITING, simulationDB); - simulation = DispatcherTestUtils.createMockSimulation(900, 900, 900, MockSimulationDB.specialUser); + DispatcherTestUtils.insertOrUpdateStatus(simKey, jobIndex, taskID, MockSimulationDB.powerUser, SimulationJobStatus.SchedulerStatus.WAITING, simulationDB); + simulation = DispatcherTestUtils.createMockSimulation(900, 900, 900, MockSimulationDB.powerUser); simulation.getSolverTaskDescription().setTimeoutDisabled(true); stateMachine.onDispatch(simulation, getLatestJobSubmission(), simulationDB, testMessageSession); jobStatus = getLatestJobSubmission(); From d3744b8cfb1dfca38d920258b37fc57bcad0005a Mon Sep 17 00:00:00 2001 From: Ezequiel Valencia Date: Fri, 11 Oct 2024 07:42:37 -0400 Subject: [PATCH 16/16] Update Zeke Compile ReadMe --- docker/swarm/README_zeke_stack_on_linux.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/swarm/README_zeke_stack_on_linux.md b/docker/swarm/README_zeke_stack_on_linux.md index f18809c8c5..97b7d0bf65 100644 --- a/docker/swarm/README_zeke_stack_on_linux.md +++ b/docker/swarm/README_zeke_stack_on_linux.md @@ -11,7 +11,7 @@ 2. ```bash pushd ../build - ./build.sh --skip-maven --skip-singularity --skip-sudo all localhost:5000/virtualcell dev_zeke + ./build.sh --skip-maven --skip-sudo all localhost:5000/virtualcell dev_zeke popd ```