Skip to content

Commit

Permalink
RSDK-8966 - Fix bug preventing removal of offline remotes (#4454)
Browse files Browse the repository at this point in the history
Co-authored-by: Cheuk <[email protected]>
  • Loading branch information
Kschappacher and cheukt authored Oct 25, 2024
1 parent 781bde2 commit e84feb2
Show file tree
Hide file tree
Showing 4 changed files with 187 additions and 11 deletions.
8 changes: 8 additions & 0 deletions resource/graph_node.go
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,14 @@ func (w *GraphNode) transitionTo(state NodeState) {
return
}

// if state of a node is [NodeStateRemoving] it cannot transition to [NodeStateUnhealthy] until it is removed.
// currently this is the only hard blocked transition
// note this does not block SwapResource from transitioning a removing resource to ready
if w.state == NodeStateRemoving && state == NodeStateUnhealthy {
w.logger.Debug("node cannot transition from [NodeStateRemoving] to [NodeStateUnhealthy], blocking transition")
return
}

if !w.canTransitionTo(state) && w.logger != nil {
w.logger.Warnw("unexpected resource state transition", "from", w.state.String(), "to", state.String())
}
Expand Down
40 changes: 31 additions & 9 deletions resource/graph_node_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,24 +132,34 @@ func lifecycleTest(t *testing.T, node *resource.GraphNode, initialDeps []string)
state, transitionedAt = toState, toTransitionedAt
}

// mark it as [NodeStateUnhealthy]
ourErr := errors.New("whoops")
node.LogAndSetLastError(ourErr)
_, err := node.Resource()
test.That(t, err, test.ShouldNotBeNil)
test.That(t, err.Error(), test.ShouldContainSubstring, "whoops")

verifyStateTransition(t, node, resource.NodeStateUnhealthy)

// mark it for removal
test.That(t, node.MarkedForRemoval(), test.ShouldBeFalse)
node.MarkForRemoval()
test.That(t, node.MarkedForRemoval(), test.ShouldBeTrue)

_, err := node.Resource()
test.That(t, err, test.ShouldNotBeNil)
test.That(t, err.Error(), test.ShouldContainSubstring, "pending removal")

verifyStateTransition(t, node, resource.NodeStateRemoving)

ourErr := errors.New("whoops")
// Attempt to change status to [NodeStateUnhealthy]
ourErr = errors.New("whoops")
node.LogAndSetLastError(ourErr)
status := node.ResourceStatus()
// Ensure that error is set and node stays in [NodeStateUnhealthy]
// since state transition [NodeStateUnhealthy] -> [NodeStateRemoving] is blocked
test.That(t, status.Error.Error(), test.ShouldContainSubstring, "whoops")
test.That(t, node.MarkedForRemoval(), test.ShouldBeTrue)

_, err = node.Resource()
test.That(t, err, test.ShouldNotBeNil)
test.That(t, err.Error(), test.ShouldContainSubstring, "whoops")
test.That(t, err.Error(), test.ShouldContainSubstring, "pending removal")

verifyStateTransition(t, node, resource.NodeStateUnhealthy)
verifyStateTransition(t, node, resource.NodeStateRemoving)

test.That(t, node.UnresolvedDependencies(), test.ShouldResemble, initialDeps)

Expand Down Expand Up @@ -341,3 +351,15 @@ func TestClose(t *testing.T) {
t.Fatal("node took too long to close, might be a deadlock")
}
}

// TestTransitionToBlocking ensures a node marked removing cannot transition to [NodeStateUnhealthy] state.
func TestTransitionToBlocking(t *testing.T) {
node := withTestLogger(t, resource.NewUninitializedNode())
// Set state removing
node.MarkForRemoval()
test.That(t, node.MarkedForRemoval(), test.ShouldBeTrue)
// Attempt to set state to [NodeStateUnhealthy]
node.LogAndSetLastError(errors.New("Its error time"))
// Node should stay still be in state removing
test.That(t, node.MarkedForRemoval(), test.ShouldBeTrue)
}
146 changes: 146 additions & 0 deletions robot/impl/local_robot_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4284,3 +4284,149 @@ func TestMaintenanceConfig(t *testing.T) {
test.That(t, sensorUnBlocked, test.ShouldNotBeNil)
})
}

func TestRemovingOfflineRemote(t *testing.T) {
logger, _ := logging.NewObservedTestLogger(t)
ctx := context.Background()

motorName := "remoteMotorFoo"
motorResourceName := resource.NewName(motor.API, motorName)

remoteCfg := &config.Config{
Components: []resource.Config{
{
Name: motorName,
Model: resource.DefaultModelFamily.WithModel("fake"),
API: motor.API,
ConvertedAttributes: &fakemotor.Config{},
},
},
}

remoteRobot := setupLocalRobot(t, ctx, remoteCfg, logger.Sublogger("remote"))
remoteOptions, _, remoteAddr := robottestutils.CreateBaseOptionsAndListener(t)
err := remoteRobot.StartWeb(ctx, remoteOptions)
test.That(t, err, test.ShouldBeNil)

// Set up a local main robot which is connected to the remote.
mainRobotCfg := &config.Config{
Remotes: []config.Remote{
{
Name: "remote",
Address: remoteAddr,
// These values dictate how quickly we'll observe the remote going offline. And how
// quickly we'll observe it coming back online.
ConnectionCheckInterval: 10 * time.Millisecond,
ReconnectInterval: 10 * time.Millisecond,
},
},
}
mainRobotI := setupLocalRobot(t, ctx, mainRobotCfg, logger.Sublogger("main"))
// We'll manually access the resource manager to move the test forward.
mainRobot := mainRobotI.(*localRobot)

// Grab the RobotClient resource graph node from the main robot that is connected to the
// remote. We'll use this to know when the main robot observes the remote has gone offline.
mainToRemoteClientRes, _ := mainRobot.RemoteByName("remote")
test.That(t, mainToRemoteClientRes, test.ShouldNotBeNil)
mainToRemoteClient := mainToRemoteClientRes.(*client.RobotClient)
test.That(t, mainToRemoteClient.Connected(), test.ShouldBeTrue)

// Stop the remote's web server. Wait for the main robot to observe there's a connection problem.
logger.Info("Stopping web")
remoteRobot.StopWeb()
testutils.WaitForAssertion(t, func(tb testing.TB) {
tb.Helper()
test.That(tb, mainToRemoteClient.Connected(), test.ShouldBeFalse)
})

// Reconfigure the main robot with the offline remote removed
mainRobot.Reconfigure(ctx, &config.Config{})

// Ensure that the remote has been removed correctly
findRemote, ok := mainRobot.RemoteByName("remote")
test.That(t, findRemote, test.ShouldBeEmpty)
test.That(t, ok, test.ShouldBeFalse)

// Ensure that motor is removed
names := mainRobot.ResourceNames()
test.That(t, names, test.ShouldNotContain, motorResourceName)
}

// TestRemovingOfflineRemotes tests a case where a robot's reconfigure loop is modifying
// a resource graph node at the same time as the complete config loop. In this case the remote is
// marked to be removed by the reconfig loop and then changed to [NodeStateUnhealthy] by the complete config
// loop. This caused the remote that should have been removed to be stay on the robot
// and continue to try and reconnect. We recreate that scenario and ensure that our fix
// prevents that behavior and removes the remote correctly.
func TestRemovingOfflineRemotes(t *testing.T) {
// Close the robot to stop the background workers from processing any messages to triggerConfig
r := setupLocalRobot(t, context.Background(), &config.Config{}, logging.NewTestLogger(t))
r.Close(context.Background())
localRobot := r.(*localRobot)

// Create a context that we can cancel to similuate the remote connection timeout
ctxCompleteConfig, cancelCompleteConfig := context.WithCancel(context.Background())
defer cancelCompleteConfig()

// This cancel is used to ensure the goroutine is cleaned up properly after the test
ctxReconfig, cancelReconfig := context.WithCancel(context.Background())
defer cancelReconfig()

// Create a remote graph node and manually add it to the graph
// This is to avoid calling reconfigure and blocking on trying to connect to the remote
remoteName := fromRemoteNameToRemoteNodeName("remoteOffline")
configRemote := config.Remote{
Name: "remoteOffline",
Address: "123.123.123.123",
}
configRemote.Validate("")
node := resource.NewConfiguredGraphNode(
resource.Config{
ConvertedAttributes: &configRemote,
}, nil, builtinModel)
// Set node to [NodeStateUnhealthy]
node.LogAndSetLastError(errors.New("Its so bad plz help"))
localRobot.manager.resources.AddNode(remoteName, node)

// Set up a wait group to ensure go routines do not leak after test ends
var wg sync.WaitGroup
// Spin up the two competing go routines
wg.Add(1)
go func() {
defer wg.Done()
localRobot.manager.completeConfig(ctxCompleteConfig, localRobot, false)
}()

// Ensure that complete config grabs the lock
time.Sleep(1 * time.Second)
wg.Add(1)
go func() {
defer wg.Done()
r.Reconfigure(ctxReconfig, &config.Config{})
}()

// Sleep needed to ensure reconfig is waiting on complete cofig to release the lock
// and that complete config is hanging on trying to dial the remote
time.Sleep(2 * time.Second)

// Ensure that the remote is not marked for removal while trying to connect to the remote
remote, ok := localRobot.manager.resources.Node(remoteName)
test.That(t, ok, test.ShouldBeTrue)
test.That(t, remote.MarkedForRemoval(), test.ShouldBeTrue)

// Simulate a timeout by canceling the context while trying to connect to the remote
cancelCompleteConfig()

// Ensure that the remote is removed from the robot
testutils.WaitForAssertion(t, func(tb testing.TB) {
tb.Helper()
remote, ok := r.RemoteByName("remoteOffline")
test.That(tb, ok, test.ShouldBeFalse)
test.That(tb, remote, test.ShouldBeNil)
})

// Wait for both goroutines to complete before finishing test
cancelReconfig()
wg.Wait()
}
4 changes: 2 additions & 2 deletions robot/impl/resource_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -237,10 +237,10 @@ func (manager *resourceManager) updateRemoteResourceNames(
// it is possible that we are switching to a new remote with a identical resource name(s), so we may
// need to create these resource clients.
if !recreateAllClients {
// ticker event, likely no changes to remote resources, skip closing and readding duplicate name resource clients
// ticker event, likely no changes to remote resources, skip closing and re-adding duplicate name resource clients
continue
}
// reconfiguration attempt, remote could have changed, so close all duplicate name remote resource clients and readd new ones later
// reconfiguration attempt, remote could have changed, so close all duplicate name remote resource clients and re-add new ones later
resLogger.CDebugw(ctx, "attempting to remove remote resource")
if err := manager.markChildrenForUpdate(resName); err != nil {
resLogger.CErrorw(ctx,
Expand Down

0 comments on commit e84feb2

Please sign in to comment.