Skip to content

Commit

Permalink
deal with more failures and null things; log more needful things
Browse files Browse the repository at this point in the history
  • Loading branch information
jaredjennings committed May 11, 2021
1 parent 1f909f3 commit 6738e9b
Showing 1 changed file with 47 additions and 33 deletions.
80 changes: 47 additions & 33 deletions app/org/thp/cortex/services/K8sJobRunnerSrv.scala
Original file line number Diff line number Diff line change
@@ -1,30 +1,22 @@
package org.thp.cortex.services

import java.util.concurrent.TimeUnit
import java.nio.charset.StandardCharsets
import java.nio.file._

import scala.concurrent.duration.FiniteDuration
import scala.concurrent.{ExecutionContext, Future}
import scala.util.Try
import scala.util.{Try, Success, Failure}
import scala.collection.JavaConverters._

import play.api.libs.json.Json
import play.api.{Configuration, Logger}

import akka.actor.ActorSystem
import io.fabric8.kubernetes.client.{DefaultKubernetesClient, ConfigBuilder, Watcher}
import io.fabric8.kubernetes.api.model.batch.{Job => KJob, JobBuilder => KJobBuilder}
import io.fabric8.kubernetes.client.{DefaultKubernetesClient}
import io.fabric8.kubernetes.api.model.batch.{JobBuilder => KJobBuilder}
import io.fabric8.kubernetes.api.model.{PersistentVolumeClaimVolumeSourceBuilder}
// import com.spotify.docker.client.DockerClient.LogsParam
// import com.spotify.docker.client.messages.HostConfig.Bind
// import com.spotify.docker.client.messages.{ContainerConfig, HostConfig}
// import com.spotify.docker.client.{DefaultDockerClient, DockerClient}
import javax.inject.{Inject, Singleton}
import org.thp.cortex.models._

import org.elastic4play.utils.RichFuture

@Singleton
class K8sJobRunnerSrv(
client: DefaultKubernetesClient,
Expand Down Expand Up @@ -58,29 +50,27 @@ class K8sJobRunnerSrv(
}.get

def run(jobDirectory: Path, dockerImage: String, job: Job, timeout: Option[FiniteDuration])(implicit ec: ExecutionContext): Future[Unit] = {
// Spicy meatball: under Kubernetes, executions can fail for reasons
// other than bad inputs, for example, if a node dies. So we maybe
// can't say, "Kubernetes, only do this once! I will do any
// necessary retrying." But there may be quotas on analyzer usage,
// and retrying outside Cortex may use them up.

val cacertsFile = jobDirectory.resolve("input").resolve("cacerts")

val relativeJobDirectory = jobBaseDirectory.relativize(jobDirectory).toString()
// make the default longer than likely values, but still not infinite
val timeout_or_default = timeout getOrElse new FiniteDuration(8, TimeUnit.HOURS)
// https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
// FIXME: this collapses case, jeopardizing the uniqueness of the
// identifier.
val kname = "_".r.replaceAllIn(job.id.map(_.toLower), "-")
// identifier. LDH: lowercase, digits, hyphens.
val ldh_jobid = "_".r.replaceAllIn(job.id.map(_.toLower), "-")
val kjobName = "neuron-job-" + ldh_jobid
val pvcvs = new PersistentVolumeClaimVolumeSourceBuilder()
.withClaimName(persistentVolumeClaimName)
.withReadOnly(false)
.build();
val kjob1 = new KJobBuilder()
.withApiVersion("batch/v1")
.withNewMetadata()
.withName(kname)
.withName(kjobName)
.withLabels(Map(
"cortex-job-id" -> job.id,
"cortex-worker-id" -> job.workerId(),
"cortex-job" -> "true").asJava)
"cortex-neuron-job" -> "true").asJava)
.endMetadata()
.withNewSpec()
.withNewTemplate()
Expand All @@ -95,7 +85,7 @@ class K8sJobRunnerSrv(
.withArgs("/job")
.addNewEnv()
.withName("CORTEX_JOB_FOLDER")
.withValue(jobBaseDirectory.relativize(jobDirectory).toString())
.withValue(relativeJobDirectory)
.endEnv();
val kjob2 = if (Files.exists(cacertsFile)) {
kjob1.addNewEnv()
Expand Down Expand Up @@ -124,20 +114,44 @@ class K8sJobRunnerSrv(
.endTemplate()
.endSpec()
.build();
logger.info(s"Constructed k8s Job ${kjob3.getMetadata().getName()}\n")

val execution = Future {
val created_kjob = client.batch().jobs().create(kjob3)
logger.info(s"Created k8s Job ${created_kjob.getMetadata().getName()}")
// FIXME: use the given timeout value
val ended_kjob = client.batch().jobs().withName(kname)
.waitUntilCondition(j => (j.getStatus().getFailed() > 0 || j.getStatus().getSucceeded() > 0),
5, TimeUnit.MINUTES );
()
val created_env = created_kjob
.getSpec().getTemplate().getSpec().getContainers().get(0)
.getEnv().asScala;
logger.info(
s"Created Kubernetes Job ${created_kjob.getMetadata().getName()}\n" +
s" timeout: ${timeout_or_default.toString}\n" +
s" image : $dockerImage\n" +
s" mount : pvc ${persistentVolumeClaimName} subdir ${relativeJobDirectory} as /job" +
created_env.map(ev => s"\n env : ${ev.getName()} = ${ev.getValue()}").mkString)
val ended_kjob = client.batch().jobs().withLabel("cortex-job-id", job.id)
.waitUntilCondition((x => Option(x).flatMap(j =>
Option(j.getStatus).flatMap(s =>
Some(s.getConditions.asScala.map(_.getType).filter(t =>
t.equals("Complete") || t.equals("Failed")).nonEmpty)))
getOrElse false),
timeout_or_default.length, timeout_or_default.unit);
if(ended_kjob != null) {
logger.info(s"Kubernetes Job ${ended_kjob.getMetadata().getName()} " +
s"(for job ${job.id}) status is now ${ended_kjob.getStatus().toString()}")
} else {
logger.info(s"Kubernetes Job for ${job.id} no longer exists")
}
}.andThen {
case r =>
val foo_kjob = client.batch().jobs().withName(kname).get()
logger.info(s"k8s Job ${foo_kjob.getMetadata().getUid()} status ${foo_kjob.getStatus().toString()}")
// let's find the job by the attribute we know is fundamentally
// unique, rather than one constructed from it
case Success(r) =>
val deleted = client.batch().jobs().withLabel("cortex-job-id", job.id).delete()
if(deleted) {
logger.info(s"Deleted Kubernetes Job for job ${job.id}")
} else {
logger.info(s"While trying to delete Kubernetes Job for ${job.id}, the job was not found; this is OK")
}
Future {}
case Failure(t) =>
logger.warn(s"Some problem happened; not deleting Kubernetes Job for job ${job.id}")
Future {}
}
execution
Expand Down

0 comments on commit 6738e9b

Please sign in to comment.