From 5ff2ae284b68ac084267f6b2f1d99f8baf2e3d51 Mon Sep 17 00:00:00 2001 From: jorgee Date: Wed, 23 Oct 2024 07:29:13 +0200 Subject: [PATCH] include job status report when failure and no exit code found Signed-off-by: jorgee --- .../executor/AbstractGridExecutor.groovy | 56 +++++++++++++++++++ .../nextflow/executor/BridgeExecutor.groovy | 5 ++ .../nextflow/executor/CondorExecutor.groovy | 5 ++ .../nextflow/executor/FluxExecutor.groovy | 5 ++ .../nextflow/executor/GridTaskHandler.groovy | 6 +- .../executor/HyperQueueExecutor.groovy | 5 ++ .../nextflow/executor/LsfExecutor.groovy | 10 ++++ .../nextflow/executor/MoabExecutor.groovy | 5 ++ .../nextflow/executor/NqsiiExecutor.groovy | 9 +++ .../nextflow/executor/OarExecutor.groovy | 5 ++ .../nextflow/executor/PbsExecutor.groovy | 5 ++ .../nextflow/executor/SgeExecutor.groovy | 9 +++ .../nextflow/executor/SlurmExecutor.groovy | 17 ++++++ 13 files changed, 140 insertions(+), 2 deletions(-) diff --git a/modules/nextflow/src/main/groovy/nextflow/executor/AbstractGridExecutor.groovy b/modules/nextflow/src/main/groovy/nextflow/executor/AbstractGridExecutor.groovy index 306352bbb8..38dd905e79 100644 --- a/modules/nextflow/src/main/groovy/nextflow/executor/AbstractGridExecutor.groovy +++ b/modules/nextflow/src/main/groovy/nextflow/executor/AbstractGridExecutor.groovy @@ -274,6 +274,15 @@ abstract class AbstractGridExecutor extends Executor { */ protected abstract List getKillCommand() + /** + * The command to be used to report the status of a single grid job + * + * @param jobId Id of the submitted job + * @param queue Queue where the job has been submitted + * @return The command line to be used to report the job status, or {@code null} when the executor provides no job-specific report command + */ + abstract List queueJobStatusReportCommand(jobId, queue) + /** * Status as returned by the grid engine */ @@ -425,5 +434,52 @@ abstract class AbstractGridExecutor extends Executor { // Instead, it is the command wrapper script that is launched run within a container process. 
return isFusionEnabled() } + + /** + * Dump the status report of a job. + * + * @param jobId Native id of the job + * @param queue Queue where the job was submitted + * @return The job status report, or the queue status dump when no job-specific command is available or the command fails + */ + String dumpJobStatusReport(jobId, queue){ + List cmd = queueJobStatusReportCommand(jobId, queue) + if( !cmd ) { + // No job-specific status report command: fall back to dumping the queue status as before + return dumpQueueStatus() + } + try { + log.trace "[${name.toUpperCase()}] getting job $jobId status report > cmd: ${cmd.join(' ')}" + + final buf = new StringBuilder() + final process = new ProcessBuilder(cmd).redirectErrorStream(true).start() + final consumer = process.consumeProcessOutputStream(buf) + process.waitForOrKill(60_000) + final exit = process.exitValue(); consumer.join() // <-- make sure sync with the output consume #1045 + final result = buf.toString() + + if( exit == 0 ) { + log.trace "[${name.toUpperCase()}] getting job $jobId status report> cmd exit: $exit" + return result + } + else { + def m = """\ + [${name.toUpperCase()}] job $jobId status report cannot be fetched. 
+ - cmd executed: ${cmd.join(' ')} + - exit status : $exit + - output : + """.stripIndent(true) + m += result.indent(' ') + log.warn1(m, firstOnly: true) + return dumpQueueStatus() + } + + } + catch( Exception e ) { + log.warn "[${name.toUpperCase()}] failed to retrieve $jobId status report -- See the log file for details.", e + return dumpQueueStatus() + } + + } } diff --git a/modules/nextflow/src/main/groovy/nextflow/executor/BridgeExecutor.groovy b/modules/nextflow/src/main/groovy/nextflow/executor/BridgeExecutor.groovy index 0c086ee124..acb88b833b 100644 --- a/modules/nextflow/src/main/groovy/nextflow/executor/BridgeExecutor.groovy +++ b/modules/nextflow/src/main/groovy/nextflow/executor/BridgeExecutor.groovy @@ -123,6 +123,11 @@ class BridgeExecutor extends AbstractGridExecutor { @Override protected List getKillCommand() { ['ccc_mdel'] } + @Override + List queueJobStatusReportCommand(Object jobId, Object queue) { + return ['ccc_mstat','-H', jobId.toString()] + } + @Override protected List queueStatusCommand(Object queue) { diff --git a/modules/nextflow/src/main/groovy/nextflow/executor/CondorExecutor.groovy b/modules/nextflow/src/main/groovy/nextflow/executor/CondorExecutor.groovy index 6e7dcafd88..a154475688 100644 --- a/modules/nextflow/src/main/groovy/nextflow/executor/CondorExecutor.groovy +++ b/modules/nextflow/src/main/groovy/nextflow/executor/CondorExecutor.groovy @@ -104,6 +104,11 @@ class CondorExecutor extends AbstractGridExecutor { ['condor_rm'] } + @Override + List queueJobStatusReportCommand(Object jobId, Object queue) { + return null /* no job-specific report command: dumpJobStatusReport falls back to the queue status dump */ + } + @Override protected List queueStatusCommand(Object queue) { ["condor_q", "-nobatch"] diff --git a/modules/nextflow/src/main/groovy/nextflow/executor/FluxExecutor.groovy b/modules/nextflow/src/main/groovy/nextflow/executor/FluxExecutor.groovy index b6184dde07..b94593f220 100644 --- a/modules/nextflow/src/main/groovy/nextflow/executor/FluxExecutor.groovy +++ 
b/modules/nextflow/src/main/groovy/nextflow/executor/FluxExecutor.groovy @@ -140,6 +140,11 @@ class FluxExecutor extends AbstractGridExecutor { @Override protected List getKillCommand() { ['flux', 'job', 'cancel'] } + @Override + List queueJobStatusReportCommand(Object jobId, Object queue) { + ['flux', 'job', 'info', jobId.toString()] + } + @Override protected List queueStatusCommand(Object queue) { diff --git a/modules/nextflow/src/main/groovy/nextflow/executor/GridTaskHandler.groovy b/modules/nextflow/src/main/groovy/nextflow/executor/GridTaskHandler.groovy index 11d310b600..3ef310ab79 100644 --- a/modules/nextflow/src/main/groovy/nextflow/executor/GridTaskHandler.groovy +++ b/modules/nextflow/src/main/groovy/nextflow/executor/GridTaskHandler.groovy @@ -83,6 +83,8 @@ class GridTaskHandler extends TaskHandler implements FusionAwareTask { BatchCleanup batch + private boolean failureDetected /* NOTE(review): not referenced by any other line of this patch -- confirm this field is needed */ + /** only for testing purpose */ protected GridTaskHandler() {} @@ -372,8 +374,8 @@ class GridTaskHandler extends TaskHandler implements FusionAwareTask { def errMessage = [] errMessage << "Failed to get exit status for process ${this} -- exitStatusReadTimeoutMillis: $exitStatusReadTimeoutMillis; delta: $delta" // -- dump current queue stats - errMessage << "Current queue status:" - errMessage << executor.dumpQueueStatus()?.indent('> ') + errMessage << "Current $jobId status:" + errMessage << executor.dumpJobStatusReport(jobId, queue)?.indent('> ') // -- dump directory listing errMessage << "Content of workDir: ${task.workDir}" errMessage << workDirList?.indent('> ') diff --git a/modules/nextflow/src/main/groovy/nextflow/executor/HyperQueueExecutor.groovy b/modules/nextflow/src/main/groovy/nextflow/executor/HyperQueueExecutor.groovy index b3e6c9ca16..9728358025 100644 --- a/modules/nextflow/src/main/groovy/nextflow/executor/HyperQueueExecutor.groovy +++ b/modules/nextflow/src/main/groovy/nextflow/executor/HyperQueueExecutor.groovy @@ -98,6 +98,11 @@ class HyperQueueExecutor 
extends AbstractGridExecutor { return TupleHelper.listOf('hq', 'job', 'cancel') } + @Override + List queueJobStatusReportCommand(Object jobId, Object queue) { + return ['hq', 'job', 'info', jobId.toString() ] + } + @Override protected List killTaskCommand(def jobId) { final result = getKillCommand() diff --git a/modules/nextflow/src/main/groovy/nextflow/executor/LsfExecutor.groovy b/modules/nextflow/src/main/groovy/nextflow/executor/LsfExecutor.groovy index e5f425a2ac..0a7ffa8d12 100644 --- a/modules/nextflow/src/main/groovy/nextflow/executor/LsfExecutor.groovy +++ b/modules/nextflow/src/main/groovy/nextflow/executor/LsfExecutor.groovy @@ -191,6 +191,16 @@ class LsfExecutor extends AbstractGridExecutor implements TaskArrayExecutor { @Override protected List getKillCommand() { ['bkill'] } + @Override + List queueJobStatusReportCommand(Object jobId, Object queue) { + final result = ['bjobs', '-w'] /* -w: same option as queueStatusCommand, used there so the printed job id is not truncated */ + + if( queue ) + result << '-q' << queue.toString() + result << jobId.toString() + return result + } + @Override protected List queueStatusCommand( queue ) { // note: use the `-w` option to avoid that the printed jobid may be truncated when exceed 7 digits diff --git a/modules/nextflow/src/main/groovy/nextflow/executor/MoabExecutor.groovy b/modules/nextflow/src/main/groovy/nextflow/executor/MoabExecutor.groovy index 1271a07a41..4fd2b5430d 100644 --- a/modules/nextflow/src/main/groovy/nextflow/executor/MoabExecutor.groovy +++ b/modules/nextflow/src/main/groovy/nextflow/executor/MoabExecutor.groovy @@ -125,6 +125,11 @@ class MoabExecutor extends AbstractGridExecutor { @Override protected List getKillCommand() { ['mjobctl', '-c'] } + @Override + List queueJobStatusReportCommand(Object jobId, Object queue) { + return ['checkjob', '-v', jobId.toString()] + } + @Override protected List queueStatusCommand(Object queue) { return ['showq', '--xml', '-w', "user="+System.getProperty('user.name')] diff --git a/modules/nextflow/src/main/groovy/nextflow/executor/NqsiiExecutor.groovy 
b/modules/nextflow/src/main/groovy/nextflow/executor/NqsiiExecutor.groovy index 7cc887d7f7..8c1fe9749d 100755 --- a/modules/nextflow/src/main/groovy/nextflow/executor/NqsiiExecutor.groovy +++ b/modules/nextflow/src/main/groovy/nextflow/executor/NqsiiExecutor.groovy @@ -113,6 +113,15 @@ class NqsiiExecutor extends AbstractGridExecutor { @Override protected List getKillCommand() { ['qdel'] } + @Override + List queueJobStatusReportCommand(Object jobId, Object queue) { + def result = ['qstat', '-J', jobId.toString()] + if( queue ) + result << '-q' << queue.toString() + + return result + } + @Override protected List queueStatusCommand(Object queue) { def result = ['qstat'] diff --git a/modules/nextflow/src/main/groovy/nextflow/executor/OarExecutor.groovy b/modules/nextflow/src/main/groovy/nextflow/executor/OarExecutor.groovy index f737b6b9f1..3be0beafea 100644 --- a/modules/nextflow/src/main/groovy/nextflow/executor/OarExecutor.groovy +++ b/modules/nextflow/src/main/groovy/nextflow/executor/OarExecutor.groovy @@ -140,6 +140,11 @@ class OarExecutor extends AbstractGridExecutor { @Override protected List getKillCommand() { ['oardel'] } + @Override + List queueJobStatusReportCommand(Object jobId, Object queue) { + return ['oarstat', '-j', jobId.toString(), '-f'] + } + @Override protected List queueStatusCommand(Object queue) { // To have a parsable list of jobs in queue by user diff --git a/modules/nextflow/src/main/groovy/nextflow/executor/PbsExecutor.groovy b/modules/nextflow/src/main/groovy/nextflow/executor/PbsExecutor.groovy index b25f020648..359de46523 100644 --- a/modules/nextflow/src/main/groovy/nextflow/executor/PbsExecutor.groovy +++ b/modules/nextflow/src/main/groovy/nextflow/executor/PbsExecutor.groovy @@ -136,6 +136,11 @@ class PbsExecutor extends AbstractGridExecutor implements TaskArrayExecutor { @Override protected List getKillCommand() { ['qdel'] } + @Override + List queueJobStatusReportCommand(Object jobId, Object queue) { + return ['tracejob', 
jobId.toString()] /* NOTE(review): tracejob presumably needs read access to the PBS log files -- confirm it is usable by regular users on submit hosts */ + } + @Override protected List queueStatusCommand(Object queue) { String cmd = 'qstat -f -1' diff --git a/modules/nextflow/src/main/groovy/nextflow/executor/SgeExecutor.groovy b/modules/nextflow/src/main/groovy/nextflow/executor/SgeExecutor.groovy index a04633b8ec..314b5cf117 100644 --- a/modules/nextflow/src/main/groovy/nextflow/executor/SgeExecutor.groovy +++ b/modules/nextflow/src/main/groovy/nextflow/executor/SgeExecutor.groovy @@ -136,6 +136,15 @@ class SgeExecutor extends AbstractGridExecutor implements TaskArrayExecutor { @Override protected List getKillCommand() { ['qdel'] } + @Override + List queueJobStatusReportCommand(Object jobId, Object queue) { + def result = ['qstat', '-j', jobId.toString()] + if( queue ) + result << '-q' << queue.toString() + + return result + } + @Override protected List queueStatusCommand(Object queue) { def result = ['qstat'] diff --git a/modules/nextflow/src/main/groovy/nextflow/executor/SlurmExecutor.groovy b/modules/nextflow/src/main/groovy/nextflow/executor/SlurmExecutor.groovy index 08d7e08576..a68141172a 100644 --- a/modules/nextflow/src/main/groovy/nextflow/executor/SlurmExecutor.groovy +++ b/modules/nextflow/src/main/groovy/nextflow/executor/SlurmExecutor.groovy @@ -154,6 +154,23 @@ class SlurmExecutor extends AbstractGridExecutor implements TaskArrayExecutor { @Override protected List getKillCommand() { ['scancel'] } + @Override + List queueJobStatusReportCommand(jobId, queue) { + + // Report the job id, state, node list and reason (squeue output format '%i %T %N %R') + final result = ['squeue', '-o', '%i %T %N %R', '-j', jobId.toString()] + if (queue) + result << '-p' << queue.toString() + + final user = System.getProperty('user.name') + if (user) + result << '-u' << user + else + log.debug "Cannot retrieve current user" + + return result + } + @Override protected List queueStatusCommand(Object queue) {