Skip to content

Commit

Permalink
[JENKINS-69277] relaunch an agent if the connection is broken (#152)
Browse files Browse the repository at this point in the history
* [JENKINS-69277] relaunch an agent if the connection is broken

Even though this is a OnceRetentionStrategy we expect a durable task to
be able to run to completion even if the connection is broken.  So we
relaunch the agent if the agent is offline and it is not done.

We are not forcing a reconnect, as this allows any existing connect
operation (which could be the first one, to continue running).

* [JENKINS-69277] avoid spotbugs warning

Whilst spotbugs complained about the lack of synchronization on
terminating we do not want to put the connect call inside the
synchronized block as it may be long running, or cause some form of
re-entry on a different thread.

whilst the locking is not perfect, we may try to relaunch and agent just
as it was done, we have set the computer to not accept any tasks so the
agent should become idle, and be terminated (either in the current
termination block) or the next time the check runs.

* check if launch is supported early on

* [JENKINS-69277] Add a unit test to cover OnceRetentionStrategy

iAdds a unit test to cover both the happy case of a cloud agent using
OnceRetentionStrategy and the unhappy case where the agent dies and
needs restarting.

---------

Co-authored-by: Carroll Chiou <[email protected]>
  • Loading branch information
jtnord and car-roll authored Apr 16, 2024
1 parent c8635e9 commit 6802fe0
Show file tree
Hide file tree
Showing 3 changed files with 267 additions and 1 deletion.
27 changes: 27 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,33 @@
<artifactId>ssh-slaves</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.jenkins-ci.plugins.workflow</groupId>
<artifactId>workflow-job</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.jenkins-ci.plugins.workflow</groupId>
<artifactId>workflow-durable-task-step</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.jenkins-ci.plugins.workflow</groupId>
<artifactId>workflow-cps</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest</artifactId>
<version>2.2</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.awaitility</groupId>
<artifactId>awaitility</artifactId>
<version>4.2.0</version>
<scope>test</scope>
</dependency>
</dependencies>
<dependencyManagement>
<dependencies>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,19 @@ public long check(final AbstractCloudComputer c) {
done(c);
}
}

// if the agent is not done and it is offline we should trigger a relaunch
// do not do this whilst we hold the lock, as connect is documented only as "This method may return immediately"
// so lets not presume any asynchronous nature and instead that this could be a long blocking op.
boolean shouldRestart = false;
synchronized (this) {
if (!terminating && c.isOffline() && c.isLaunchSupported() && !disabled) {

Check warning on line 81 in src/main/java/org/jenkinsci/plugins/durabletask/executors/OnceRetentionStrategy.java

View check run for this annotation

ci.jenkins.io / Code Coverage

Partially covered line

Line 81 is only partially covered, 4 branches are missing
shouldRestart = true;
}
}
if (shouldRestart) {

Check warning on line 85 in src/main/java/org/jenkinsci/plugins/durabletask/executors/OnceRetentionStrategy.java

View check run for this annotation

ci.jenkins.io / Code Coverage

Partially covered line

Line 85 is only partially covered, one branch is missing
LOGGER.log(Level.FINE, "Attempting relaunch of {0}", c.getName());
c.connect(false);
}
// Return one because we want to check every minute if idle.
return 1;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
package org.jenkinsci.plugins.durabletask.executors;

import java.io.File;
import java.io.IOException;
import java.time.Duration;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.jenkinsci.plugins.durabletask.BourneShellScript;
import org.jenkinsci.plugins.durabletask.WindowsBatchScript;
import org.jenkinsci.plugins.workflow.cps.CpsFlowDefinition;
import org.jenkinsci.plugins.workflow.job.WorkflowJob;
import org.jenkinsci.plugins.workflow.job.WorkflowRun;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import org.jvnet.hudson.test.BuildWatcher;
import org.jvnet.hudson.test.Issue;
import org.jvnet.hudson.test.JenkinsRule;
import org.jvnet.hudson.test.LoggerRule;
import org.jvnet.hudson.test.SimpleCommandLauncher;
import hudson.Functions;
import hudson.model.Computer;
import hudson.model.LoadStatistics;
import hudson.model.TaskListener;
import hudson.model.Descriptor.FormException;
import hudson.model.labels.LabelAtom;
import hudson.remoting.RequestAbortedException;
import hudson.remoting.Which;
import hudson.slaves.AbstractCloudComputer;
import hudson.slaves.AbstractCloudSlave;
import hudson.slaves.Cloud;
import hudson.slaves.NodeProvisioner.PlannedNode;
import hudson.util.RemotingDiagnostics;
import jenkins.model.Jenkins;

import static org.awaitility.Awaitility.await;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.arrayWithSize;
import static org.hamcrest.Matchers.empty;
import static org.hamcrest.Matchers.is;

public class OnceRetentionStrategyTest {

private final static Logger LOGGER = Logger.getLogger(OnceRetentionStrategyTest.class.getName());

@Rule
public LoggerRule lr = new LoggerRule().record(LOGGER, Level.ALL);

@Rule
public JenkinsRule jr = new JenkinsRule();

@Rule
public TemporaryFolder temp = new TemporaryFolder();

@Rule
public BuildWatcher bw = new BuildWatcher();

@BeforeClass
public static void setupStatics() {
LoadStatistics.CLOCK = 100; // 100ms for the LoadStatistics so we have quick provisioning
// deadConnectionsShouldReLaunched seems to fail always when run as part of the suite - but passes always when run individually without the binary wrapper
// (because the ping command and its parent cmd.exe get killed :-o )
WindowsBatchScript.USE_BINARY_WRAPPER=true;
BourneShellScript.USE_BINARY_WRAPPER=true;
}

@Test
public void withoutRestartNodesAreCleaned() throws Exception {
MySimpleCloud sc = new MySimpleCloud("simples", temp.newFolder());

jr.jenkins.clouds.add(sc);
jr.jenkins.setNumExecutors(0);
//jr.jenkins.getLabel("simples").nodeProvisioner. = new ImmediateNodeProvisioner();

WorkflowJob foo = jr.jenkins.createProject(WorkflowJob.class, "foo");
foo.setDefinition(new CpsFlowDefinition(String.join(System.lineSeparator(),
"node('simples') {",
Functions.isWindows() ? " bat 'echo hello'" : " sh 'echo hello'",
"}"),
true));
WorkflowRun run = foo.scheduleBuild2(0).waitForStart();

jr.waitForCompletion(run);
jr.assertBuildStatusSuccess(run);

// agent should have been removed and we should have no agents but this happens on a different Thread..
await().atMost(Duration.ofSeconds(5)).until(() -> jr.jenkins.getNodes().isEmpty());
assertThat(jr.jenkins.getNodes(), is(empty()));
assertThat(jr.jenkins.getComputers(), arrayWithSize(1)); // the Jenkins computer itself
}


@Test
@Issue("JENKINS-69277")
public void deadConnectionsShouldReLaunched() throws Exception {
MySimpleCloud sc = new MySimpleCloud("simples", temp.newFolder());

jr.jenkins.clouds.add(sc);
jr.jenkins.setNumExecutors(0);
//jr.jenkins.getLabel("simples").nodeProvisioner. = new ImmediateNodeProvisioner();

WorkflowJob foo = jr.jenkins.createProject(WorkflowJob.class, "foo");
foo.setDefinition(new CpsFlowDefinition(String.join(System.lineSeparator(),
"node('simples') {",
Functions.isWindows() ? " bat 'echo hello & ping -w 1000 -n 30 localhost'"
: " sh 'echo hello & sleep 30'",
"}"),
true));
WorkflowRun run = foo.scheduleBuild2(0).waitForStart();
LOGGER.log(Level.FINE, "waiting for hello");

jr.waitForMessage("hello", run);
LOGGER.log(Level.FINE, "killing da agent");

killAgent();
// retention strategy will kick in but may take up to 60 seconds.
jr.waitForCompletion(run);
jr.assertBuildStatusSuccess(run);

// agent should have been removed and we should have no agents but this happens on a different Thread..
await().atMost(Duration.ofSeconds(5)).until(() -> jr.jenkins.getNodes().isEmpty());
assertThat(jr.jenkins.getNodes(), is(empty()));
assertThat(jr.jenkins.getComputers(), arrayWithSize(1)); // the Jenkins computer itself
}

private void killAgent() throws IOException, InterruptedException {
Computer[] computers = Jenkins.get().getComputers();
for (Computer c : computers) {
if (c instanceof MySimpleCloudComputer) {
LOGGER.log(Level.FINE, "Asking {0} to commit suicide", c);
try {
RemotingDiagnostics.executeGroovy("Runtime.getRuntime().halt(1)", c.getChannel());
} catch (RequestAbortedException ignored) {
// we have just asked the Computer to commit suicide so this is expected.
}
}
}
}

public static class MySimpleCloud extends Cloud {

private final LabelAtom label;
private final File agentRootDir;
private int count = 0;

public MySimpleCloud(String name, File agentRootDir) {
super(name);
this.agentRootDir = agentRootDir;
label = Jenkins.get().getLabelAtom(name);
}

@Override
public boolean canProvision(CloudState state) {
boolean retVal = state.getLabel().matches(Collections.singleton(label));
LOGGER.log(Level.FINE, "SimpleCloud.canProvision({0},{1}) -> {2}", new Object[] {state.getLabel(), state.getAdditionalPlannedCapacity(), retVal});
return retVal;
}

@Override
public Collection<PlannedNode> provision(CloudState state, int excessWorkload) {
LOGGER.log(Level.FINE, "SimpleCloud.provision(({0}, {1}), {2}", new Object[] {state.getLabel(), state.getAdditionalPlannedCapacity(), excessWorkload});
Collection<PlannedNode> retVal = new HashSet<PlannedNode>();
for (int i =0; i < excessWorkload - state.getAdditionalPlannedCapacity(); ++i) {
String agentName;
synchronized (this) {
if (count != 0) {
// sometimes we end up with 2 agents due to the lovely cloud API.
LOGGER.log(Level.FINE, "not provisioning another agent as we have already provisioned.");
return Collections.emptyList();
}
agentName = "cloud-"+name+"-"+(++count);
}

PlannedNode n = new PlannedNode(agentName,
Computer.threadPoolForRemoting.submit( () -> new MySimpleCloudSlave(agentName, new File(agentRootDir, agentName), label)),
1);
LOGGER.log(Level.FINE, "SimpleCloud.provision() -> Added planned node for {0}", name);
retVal.add(n);
}
return retVal;
}
}

public static class MySimpleCloudSlave extends AbstractCloudSlave {

private final LabelAtom label;

public MySimpleCloudSlave(String name, File remoteFS, LabelAtom label) throws FormException, IOException {
super(name,
remoteFS.getAbsolutePath(),
new SimpleCommandLauncher("java -Xmx512m -Djava.awt.headless=true -jar " + Which.jarFile(hudson.remoting.Launcher.class) + " -jar-cache " + remoteFS + " -workDir " + remoteFS));
this.setRetentionStrategy(new OnceRetentionStrategy(1));
LOGGER.log(Level.FINE, "SimpleCloudSlave()");
this.label = label;
}

@Override
public AbstractCloudComputer createComputer() {
LOGGER.log(Level.FINE, "SimpleCloudSlave.createComputer()");
return new MySimpleCloudComputer(this);
}

@Override
protected void _terminate(TaskListener listener) throws IOException, InterruptedException {
// nothing to do here
}

@Override
public Set<LabelAtom> getAssignedLabels() {
return new HashSet<LabelAtom>(Arrays.asList(label, getSelfLabel()));
}
}

public static class MySimpleCloudComputer extends AbstractCloudComputer<MySimpleCloudSlave> {

public MySimpleCloudComputer(MySimpleCloudSlave slave) {
super(slave);
}
}

}

0 comments on commit 6802fe0

Please sign in to comment.