Skip to content

Commit

Permalink
include job status report when failure and no exit code found
Browse files Browse the repository at this point in the history
Signed-off-by: jorgee <[email protected]>
  • Loading branch information
jorgee committed Oct 23, 2024
1 parent 24d595c commit 5ff2ae2
Show file tree
Hide file tree
Showing 13 changed files with 140 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,15 @@ abstract class AbstractGridExecutor extends Executor {
*/
protected abstract List<String> getKillCommand()

/**
* The command to be used to report the status of a single grid job.
* <p>
* When an implementation returns {@code null} or an empty list,
* {@code dumpJobStatusReport} falls back to dumping the whole queue status.
*
* @param jobId Id of the submitted job
* @param queue Queue where job has been submitted
* @return The command line to be used to report the job status
*/
abstract List<String> queueJobStatusReportCommand(jobId, queue)

/**
* Status as returned by the grid engine
*/
Expand Down Expand Up @@ -425,5 +434,52 @@ abstract class AbstractGridExecutor extends Executor {
// Instead, it is the command wrapper script that is launched within a container process.
return isFusionEnabled()
}

/**
 * Dump the status report of a single job.
 * <p>
 * The report command is obtained from {@code queueJobStatusReportCommand}. The
 * method falls back to {@code dumpQueueStatus()} when no such command is provided,
 * when the command exits with a non-zero status, or when any exception is raised
 * while running it.
 *
 * @param jobId Native id of the job
 * @param queue Queue where the job has been submitted
 * @return The job status report, or the queue status dump when the report cannot be fetched
 */
String dumpJobStatusReport(jobId, queue){
    final List reportCmd = queueJobStatusReportCommand(jobId, queue)
    // no job-specific report command: dump the queue status as previously
    if( !reportCmd )
        return dumpQueueStatus()

    try {
        final cmdLine = reportCmd.join(' ')
        log.trace "[${name.toUpperCase()}] getting job $jobId status report > cmd: ${cmdLine}"

        // stderr is merged into stdout so that a single buffer holds the whole output
        final output = new StringBuilder()
        final proc = new ProcessBuilder(reportCmd).redirectErrorStream(true).start()
        final reader = proc.consumeProcessOutputStream(output)
        // wait at most 60 seconds, then kill the process
        proc.waitForOrKill(60_000)
        final status = proc.exitValue()
        // make sure to sync with the output consumer thread #1045
        reader.join()
        final report = output.toString()

        if( status != 0 ) {
            // the text block below is kept flush-left on purpose: its content is emitted as written
            def warning = """\
[${name.toUpperCase()}] job $jobId status report cannot be fetched.
- cmd executed: ${cmdLine}
- exit status : $status
- output :
""".stripIndent(true)
            warning += report.indent(' ')
            log.warn1(warning, firstOnly: true)
            return dumpQueueStatus()
        }

        log.trace "[${name.toUpperCase()}] getting job $jobId status report> cmd exit: $status"
        return report
    }
    catch( Exception e ) {
        log.warn "[${name.toUpperCase()}] failed to retrieve $jobId status report -- See the log file for details.", e
        return dumpQueueStatus()
    }
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,11 @@ class BridgeExecutor extends AbstractGridExecutor {
/**
 * @return A new mutable list holding the kill command, i.e. {@code ccc_mdel}
 */
@Override
protected List<String> getKillCommand() {
    return ['ccc_mdel']
}

/**
 * Build the {@code ccc_mstat -H <jobId>} command line. The queue is not used.
 *
 * @param jobId Id of the submitted job
 * @param queue Queue where the job has been submitted (ignored)
 * @return The command line reporting the job status
 */
@Override
List<String> queueJobStatusReportCommand(Object jobId, Object queue) {
    final id = jobId.toString()
    return ['ccc_mstat', '-H', id]
}

@Override
protected List<String> queueStatusCommand(Object queue) {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,11 @@ class CondorExecutor extends AbstractGridExecutor {
['condor_rm']
}

/**
 * No job-specific status report command is defined for this executor.
 *
 * @param jobId Id of the submitted job (ignored)
 * @param queue Queue where the job has been submitted (ignored)
 * @return Always {@code null}
 */
@Override
List<String> queueJobStatusReportCommand(Object jobId, Object queue) {
    // no dedicated command is available
    null
}

@Override
protected List<String> queueStatusCommand(Object queue) {
["condor_q", "-nobatch"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,11 @@ class FluxExecutor extends AbstractGridExecutor {
/**
 * @return A new mutable list holding the kill command, i.e. {@code flux job cancel}
 */
@Override
protected List<String> getKillCommand() {
    return ['flux', 'job', 'cancel']
}

/**
 * Build the {@code flux job info <jobId> eventlog} command line. The queue is not used.
 * <p>
 * {@code flux job info} expects a key after the job id; without it the command
 * only prints its usage and exits with an error, so the {@code eventlog} key is
 * requested explicitly.
 *
 * @param jobId Id of the submitted job
 * @param queue Queue where the job has been submitted (ignored)
 * @return The command line reporting the job status
 */
@Override
List<String> queueJobStatusReportCommand(Object jobId, Object queue) {
    return ['flux', 'job', 'info', jobId.toString(), 'eventlog']
}

@Override
protected List<String> queueStatusCommand(Object queue) {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ class GridTaskHandler extends TaskHandler implements FusionAwareTask {

BatchCleanup batch

// NOTE(review): this flag is neither read nor written in the visible hunks -- confirm where it is used
private boolean failureDetected

/** No-argument constructor, only for testing purposes */
protected GridTaskHandler() {}

Expand Down Expand Up @@ -372,8 +374,8 @@ class GridTaskHandler extends TaskHandler implements FusionAwareTask {
def errMessage = []
errMessage << "Failed to get exit status for process ${this} -- exitStatusReadTimeoutMillis: $exitStatusReadTimeoutMillis; delta: $delta"
// -- dump current queue stats
errMessage << "Current queue status:"
errMessage << executor.dumpQueueStatus()?.indent('> ')
errMessage << "Current $jobId status:"
errMessage << executor.dumpJobStatusReport(jobId, queue)?.indent('> ')
// -- dump directory listing
errMessage << "Content of workDir: ${task.workDir}"
errMessage << workDirList?.indent('> ')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,11 @@ class HyperQueueExecutor extends AbstractGridExecutor {
return TupleHelper.listOf('hq', 'job', 'cancel')
}

/**
 * Build the {@code hq job info <jobId>} command line. The queue is not used.
 *
 * @param jobId Id of the submitted job
 * @param queue Queue where the job has been submitted (ignored)
 * @return The command line reporting the job status
 */
@Override
List<String> queueJobStatusReportCommand(Object jobId, Object queue) {
    final id = jobId.toString()
    return ['hq', 'job', 'info', id]
}

@Override
protected List<String> killTaskCommand(def jobId) {
final result = getKillCommand()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,16 @@ class LsfExecutor extends AbstractGridExecutor implements TaskArrayExecutor {
/**
 * @return A new mutable list holding the kill command, i.e. {@code bkill}
 */
@Override
protected List<String> getKillCommand() {
    return ['bkill']
}

/**
 * Build the {@code bjobs -w [-q <queue>] <jobId>} command line.
 *
 * @param jobId Id of the submitted job
 * @param queue Queue where the job has been submitted; the {@code -q} option is omitted when not provided
 * @return The command line reporting the job status
 */
@Override
List<String> queueJobStatusReportCommand(Object jobId, Object queue) {
    final cmd = ['bjobs', '-w']
    if( queue ) {
        cmd.add('-q')
        cmd.add(queue.toString())
    }
    // the job id always comes last
    cmd.add(jobId.toString())
    return cmd
}

@Override
protected List<String> queueStatusCommand( queue ) {
// note: use the `-w` option so that the printed job id is not truncated when it exceeds 7 digits
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,11 @@ class MoabExecutor extends AbstractGridExecutor {
/**
 * @return A new mutable list holding the kill command, i.e. {@code mjobctl -c}
 */
@Override
protected List<String> getKillCommand() {
    return ['mjobctl', '-c']
}

/**
 * Build the {@code checkjob -v <jobId>} command line. The queue is not used.
 *
 * @param jobId Id of the submitted job
 * @param queue Queue where the job has been submitted (ignored)
 * @return The command line reporting the job status
 */
@Override
List<String> queueJobStatusReportCommand(Object jobId, Object queue) {
    final id = jobId.toString()
    return ['checkjob', '-v', id]
}

@Override
protected List<String> queueStatusCommand(Object queue) {
return ['showq', '--xml', '-w', "user="+System.getProperty('user.name')]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,15 @@ class NqsiiExecutor extends AbstractGridExecutor {
/**
 * @return A new mutable list holding the kill command, i.e. {@code qdel}
 */
@Override
protected List<String> getKillCommand() {
    return ['qdel']
}

/**
 * Build the {@code qstat -J <jobId> [-q <queue>]} command line.
 *
 * @param jobId Id of the submitted job
 * @param queue Queue where the job has been submitted; the {@code -q} option is omitted when not provided
 * @return The command line reporting the job status
 */
@Override
List<String> queueJobStatusReportCommand(Object jobId, Object queue) {
    final cmd = ['qstat', '-J', jobId.toString()]
    if( queue ) {
        cmd.add('-q')
        cmd.add(queue.toString())
    }
    return cmd
}

@Override
protected List<String> queueStatusCommand(Object queue) {
def result = ['qstat']
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,11 @@ class OarExecutor extends AbstractGridExecutor {
/**
 * @return A new mutable list holding the kill command, i.e. {@code oardel}
 */
@Override
protected List<String> getKillCommand() {
    return ['oardel']
}

/**
 * Build the {@code oarstat -j <jobId> -f} command line. The queue is not used.
 *
 * @param jobId Id of the submitted job
 * @param queue Queue where the job has been submitted (ignored)
 * @return The command line reporting the job status
 */
@Override
List<String> queueJobStatusReportCommand(Object jobId, Object queue) {
    final id = jobId.toString()
    return ['oarstat', '-j', id, '-f']
}

@Override
protected List<String> queueStatusCommand(Object queue) {
// To have a parsable list of jobs in queue by user
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,11 @@ class PbsExecutor extends AbstractGridExecutor implements TaskArrayExecutor {
/**
 * @return A new mutable list holding the kill command, i.e. {@code qdel}
 */
@Override
protected List<String> getKillCommand() {
    return ['qdel']
}

/**
 * Build the {@code tracejob <jobId>} command line. The queue is not used.
 *
 * @param jobId Id of the submitted job
 * @param queue Queue where the job has been submitted (ignored)
 * @return The command line reporting the job status
 */
@Override
List<String> queueJobStatusReportCommand(Object jobId, Object queue) {
    final id = jobId.toString()
    return ['tracejob', id]
}

@Override
protected List<String> queueStatusCommand(Object queue) {
String cmd = 'qstat -f -1'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,15 @@ class SgeExecutor extends AbstractGridExecutor implements TaskArrayExecutor {
/**
 * @return A new mutable list holding the kill command, i.e. {@code qdel}
 */
@Override
protected List<String> getKillCommand() {
    return ['qdel']
}

/**
 * Build the {@code qstat -j <jobId> [-q <queue>]} command line.
 *
 * @param jobId Id of the submitted job
 * @param queue Queue where the job has been submitted; the {@code -q} option is omitted when not provided
 * @return The command line reporting the job status
 */
@Override
List<String> queueJobStatusReportCommand(Object jobId, Object queue) {
    final cmd = ['qstat', '-j', jobId.toString()]
    if( queue ) {
        cmd.add('-q')
        cmd.add(queue.toString())
    }
    return cmd
}

@Override
protected List<String> queueStatusCommand(Object queue) {
def result = ['qstat']
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,23 @@ class SlurmExecutor extends AbstractGridExecutor implements TaskArrayExecutor {
/**
 * @return A new mutable list holding the kill command, i.e. {@code scancel}
 */
@Override
protected List<String> getKillCommand() {
    return ['scancel']
}

/**
 * Build the {@code squeue} command line reporting the job id, status, node list
 * and reason for the given job.
 *
 * @param jobId Id of the submitted job
 * @param queue Queue where the job has been submitted; the {@code -p} option is omitted when not provided
 * @return The command line reporting the job status
 */
@Override
List<String> queueJobStatusReportCommand(jobId, queue) {
    // output format: job id, status, node list and reason
    final cmd = ['squeue', '-o', '%i %T %N %R', '-j', jobId.toString()]
    if( queue ) {
        cmd.add('-p')
        cmd.add(queue.toString())
    }

    // restrict the report to the current user when it can be determined
    final userName = System.getProperty('user.name')
    if( !userName ) {
        log.debug "Cannot retrieve current user"
    }
    else {
        cmd.add('-u')
        cmd.add(userName)
    }

    return cmd
}

@Override
protected List<String> queueStatusCommand(Object queue) {

Expand Down

0 comments on commit 5ff2ae2

Please sign in to comment.