From e84feb2f9b37d4095869e52a078bed4b9ea9c14a Mon Sep 17 00:00:00 2001
From: Kurt S <56745262+Kschappacher@users.noreply.github.com>
Date: Fri, 25 Oct 2024 14:28:26 -0400
Subject: [PATCH] RSDK-8966 - Fix bug preventing removal of offline remotes (#4454)

Co-authored-by: Cheuk <90270663+cheukt@users.noreply.github.com>
---
 resource/graph_node.go         |   8 ++
 resource/graph_node_test.go    |  40 +++++++--
 robot/impl/local_robot_test.go | 146 +++++++++++++++++++++++++++++++++
 robot/impl/resource_manager.go |   4 +-
 4 files changed, 187 insertions(+), 11 deletions(-)

diff --git a/resource/graph_node.go b/resource/graph_node.go
index 5828060a380..fdbd963db30 100644
--- a/resource/graph_node.go
+++ b/resource/graph_node.go
@@ -524,6 +524,14 @@ func (w *GraphNode) transitionTo(state NodeState) {
 		return
 	}
 
+	// If a node is in [NodeStateRemoving], it cannot transition to [NodeStateUnhealthy] until it is removed.
+	// Currently this is the only hard-blocked transition.
+	// Note that this does not block SwapResource from transitioning a removing resource back to ready.
+	if w.state == NodeStateRemoving && state == NodeStateUnhealthy {
+		w.logger.Debug("node cannot transition from [NodeStateRemoving] to [NodeStateUnhealthy], blocking transition")
+		return
+	}
+
 	if !w.canTransitionTo(state) && w.logger != nil {
 		w.logger.Warnw("unexpected resource state transition", "from", w.state.String(), "to", state.String())
 	}
diff --git a/resource/graph_node_test.go b/resource/graph_node_test.go
index bd89d08ae85..282eb97e063 100644
--- a/resource/graph_node_test.go
+++ b/resource/graph_node_test.go
@@ -132,24 +132,34 @@ func lifecycleTest(t *testing.T, node *resource.GraphNode, initialDeps []string)
 		state, transitionedAt = toState, toTransitionedAt
 	}
 
+	// mark it as [NodeStateUnhealthy]
+	ourErr := errors.New("whoops")
+	node.LogAndSetLastError(ourErr)
+	_, err := node.Resource()
+	test.That(t, err, test.ShouldNotBeNil)
+	test.That(t, err.Error(), test.ShouldContainSubstring, "whoops")
+
+	verifyStateTransition(t, node, resource.NodeStateUnhealthy)
+
 	// mark it for removal
 	test.That(t, node.MarkedForRemoval(), test.ShouldBeFalse)
 	node.MarkForRemoval()
 	test.That(t, node.MarkedForRemoval(), test.ShouldBeTrue)
 
-	_, err := node.Resource()
-	test.That(t, err, test.ShouldNotBeNil)
-	test.That(t, err.Error(), test.ShouldContainSubstring, "pending removal")
-
-	verifyStateTransition(t, node, resource.NodeStateRemoving)
-
-	ourErr := errors.New("whoops")
+	// Attempt to transition the node to [NodeStateUnhealthy]
+	ourErr = errors.New("whoops")
 	node.LogAndSetLastError(ourErr)
+	status := node.ResourceStatus()
+	// Ensure that the error is recorded but the node stays in [NodeStateRemoving],
+	// since the [NodeStateRemoving] -> [NodeStateUnhealthy] transition is blocked.
+	test.That(t, status.Error.Error(), test.ShouldContainSubstring, "whoops")
+	test.That(t, node.MarkedForRemoval(), test.ShouldBeTrue)
+
 	_, err = node.Resource()
 	test.That(t, err, test.ShouldNotBeNil)
-	test.That(t, err.Error(), test.ShouldContainSubstring, "whoops")
+	test.That(t, err.Error(), test.ShouldContainSubstring, "pending removal")
 
-	verifyStateTransition(t, node, resource.NodeStateUnhealthy)
+	verifyStateTransition(t, node, resource.NodeStateRemoving)
 
 	test.That(t, node.UnresolvedDependencies(), test.ShouldResemble, initialDeps)
 
@@ -341,3 +351,15 @@ func TestClose(t *testing.T) {
 		t.Fatal("node took too long to close, might be a deadlock")
 	}
 }
+
+// TestTransitionToBlocking ensures a node marked for removal cannot transition to the [NodeStateUnhealthy] state.
+func TestTransitionToBlocking(t *testing.T) {
+	node := withTestLogger(t, resource.NewUninitializedNode())
+	// Set the state to removing
+	node.MarkForRemoval()
+	test.That(t, node.MarkedForRemoval(), test.ShouldBeTrue)
+	// Attempt to set the state to [NodeStateUnhealthy]
+	node.LogAndSetLastError(errors.New("it's error time"))
+	// The node should still be in the removing state
+	test.That(t, node.MarkedForRemoval(), test.ShouldBeTrue)
+}
diff --git a/robot/impl/local_robot_test.go b/robot/impl/local_robot_test.go
index b8acda30b62..cebb2795eca 100644
--- a/robot/impl/local_robot_test.go
+++ b/robot/impl/local_robot_test.go
@@ -4284,3 +4284,149 @@ func TestMaintenanceConfig(t *testing.T) {
 		test.That(t, sensorUnBlocked, test.ShouldNotBeNil)
 	})
 }
+
+func TestRemovingOfflineRemote(t *testing.T) {
+	logger, _ := logging.NewObservedTestLogger(t)
+	ctx := context.Background()
+
+	motorName := "remoteMotorFoo"
+	motorResourceName := resource.NewName(motor.API, motorName)
+
+	remoteCfg := &config.Config{
+		Components: []resource.Config{
+			{
+				Name:                motorName,
+				Model:               resource.DefaultModelFamily.WithModel("fake"),
+				API:                 motor.API,
+				ConvertedAttributes: &fakemotor.Config{},
+			},
+		},
+	}
+
+	remoteRobot := setupLocalRobot(t, ctx, remoteCfg, logger.Sublogger("remote"))
+	remoteOptions, _, remoteAddr := robottestutils.CreateBaseOptionsAndListener(t)
+	err := remoteRobot.StartWeb(ctx, remoteOptions)
+	test.That(t, err, test.ShouldBeNil)
+
+	// Set up a local main robot which is connected to the remote.
+	mainRobotCfg := &config.Config{
+		Remotes: []config.Remote{
+			{
+				Name:    "remote",
+				Address: remoteAddr,
+				// These values dictate how quickly we'll observe the remote going offline
+				// and how quickly we'll observe it coming back online.
+				ConnectionCheckInterval: 10 * time.Millisecond,
+				ReconnectInterval:       10 * time.Millisecond,
+			},
+		},
+	}
+	mainRobotI := setupLocalRobot(t, ctx, mainRobotCfg, logger.Sublogger("main"))
+	// We'll manually access the resource manager to move the test forward.
+	mainRobot := mainRobotI.(*localRobot)
+
+	// Grab the RobotClient resource graph node from the main robot that is connected to the
+	// remote. We'll use this to know when the main robot observes the remote has gone offline.
+	mainToRemoteClientRes, _ := mainRobot.RemoteByName("remote")
+	test.That(t, mainToRemoteClientRes, test.ShouldNotBeNil)
+	mainToRemoteClient := mainToRemoteClientRes.(*client.RobotClient)
+	test.That(t, mainToRemoteClient.Connected(), test.ShouldBeTrue)
+
+	// Stop the remote's web server. Wait for the main robot to observe there's a connection problem.
+	logger.Info("Stopping web")
+	remoteRobot.StopWeb()
+	testutils.WaitForAssertion(t, func(tb testing.TB) {
+		tb.Helper()
+		test.That(tb, mainToRemoteClient.Connected(), test.ShouldBeFalse)
+	})
+
+	// Reconfigure the main robot with the offline remote removed.
+	mainRobot.Reconfigure(ctx, &config.Config{})
+
+	// Ensure that the remote has been removed correctly.
+	findRemote, ok := mainRobot.RemoteByName("remote")
+	test.That(t, findRemote, test.ShouldBeEmpty)
+	test.That(t, ok, test.ShouldBeFalse)
+
+	// Ensure that the motor is removed as well.
+	names := mainRobot.ResourceNames()
+	test.That(t, names, test.ShouldNotContain, motorResourceName)
+}
+
+// TestRemovingOfflineRemotes tests a case where a robot's reconfigure loop modifies a resource
+// graph node at the same time as the completeConfig loop. In this case the remote is marked for
+// removal by the reconfigure loop and then changed to [NodeStateUnhealthy] by the completeConfig loop.
+// This caused the remote that should have been removed to stay on the robot and continue trying
+// to reconnect. We recreate that scenario and ensure that our fix prevents that behavior and
+// removes the remote correctly.
+func TestRemovingOfflineRemotes(t *testing.T) {
+	// Close the robot to stop the background workers from processing any messages sent to triggerConfig.
+	r := setupLocalRobot(t, context.Background(), &config.Config{}, logging.NewTestLogger(t))
+	r.Close(context.Background())
+	localRobot := r.(*localRobot)
+
+	// Create a context that we can cancel to simulate the remote connection timeout.
+	ctxCompleteConfig, cancelCompleteConfig := context.WithCancel(context.Background())
+	defer cancelCompleteConfig()
+
+	// This cancel is used to ensure the goroutine is cleaned up properly after the test.
+	ctxReconfig, cancelReconfig := context.WithCancel(context.Background())
+	defer cancelReconfig()
+
+	// Create a remote graph node and manually add it to the graph.
+	// This avoids calling Reconfigure and blocking on trying to connect to the remote.
+	remoteName := fromRemoteNameToRemoteNodeName("remoteOffline")
+	configRemote := config.Remote{
+		Name:    "remoteOffline",
+		Address: "123.123.123.123",
+	}
+	configRemote.Validate("")
+	node := resource.NewConfiguredGraphNode(
+		resource.Config{
+			ConvertedAttributes: &configRemote,
+		}, nil, builtinModel)
+	// Set the node to [NodeStateUnhealthy].
+	node.LogAndSetLastError(errors.New("it's so bad, please help"))
+	localRobot.manager.resources.AddNode(remoteName, node)
+
+	// Set up a wait group to ensure goroutines do not leak after the test ends.
+	var wg sync.WaitGroup
+	// Spin up the two competing goroutines.
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		localRobot.manager.completeConfig(ctxCompleteConfig, localRobot, false)
+	}()
+
+	// Ensure that completeConfig grabs the lock first.
+	time.Sleep(1 * time.Second)
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		r.Reconfigure(ctxReconfig, &config.Config{})
+	}()
+
+	// Sleep to ensure Reconfigure is waiting on completeConfig to release the lock
+	// and that completeConfig is hanging while trying to dial the remote.
+	time.Sleep(2 * time.Second)
+
+	// Ensure that the remote is marked for removal even while completeConfig is still trying to connect to it.
+	remote, ok := localRobot.manager.resources.Node(remoteName)
+	test.That(t, ok, test.ShouldBeTrue)
+	test.That(t, remote.MarkedForRemoval(), test.ShouldBeTrue)
+
+	// Simulate a timeout by canceling the context while trying to connect to the remote.
+	cancelCompleteConfig()
+
+	// Ensure that the remote is removed from the robot.
+	testutils.WaitForAssertion(t, func(tb testing.TB) {
+		tb.Helper()
+		remote, ok := r.RemoteByName("remoteOffline")
+		test.That(tb, ok, test.ShouldBeFalse)
+		test.That(tb, remote, test.ShouldBeNil)
+	})
+
+	// Wait for both goroutines to complete before finishing the test.
+	cancelReconfig()
+	wg.Wait()
+}
diff --git a/robot/impl/resource_manager.go b/robot/impl/resource_manager.go
index 3b7320e957d..abffd34a1f1 100644
--- a/robot/impl/resource_manager.go
+++ b/robot/impl/resource_manager.go
@@ -237,10 +237,10 @@ func (manager *resourceManager) updateRemoteResourceNames(
 		// it is possible that we are switching to a new remote with a identical resource name(s), so we may
 		// need to create these resource clients.
 		if !recreateAllClients {
-			// ticker event, likely no changes to remote resources, skip closing and readding duplicate name resource clients
+			// ticker event, likely no changes to remote resources, skip closing and re-adding duplicate name resource clients
 			continue
 		}
-		// reconfiguration attempt, remote could have changed, so close all duplicate name remote resource clients and readd new ones later
+		// reconfiguration attempt, remote could have changed, so close all duplicate name remote resource clients and re-add new ones later
 		resLogger.CDebugw(ctx, "attempting to remove remote resource")
 		if err := manager.markChildrenForUpdate(resName); err != nil {
 			resLogger.CErrorw(ctx,
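
Reviewer note: the sketch below is a minimal, self-contained illustration of the transition guard this patch adds to transitionTo. The node type, state constants, and helper methods here are simplified stand-ins rather than the actual resource.GraphNode API; only the blocked [NodeStateRemoving] -> [NodeStateUnhealthy] edge mirrors the change above.

package main

import (
	"errors"
	"fmt"
)

// NodeState is a simplified stand-in for the lifecycle states referenced by the patch.
type NodeState int

const (
	NodeStateUnknown NodeState = iota
	NodeStateUnhealthy
	NodeStateRemoving
)

func (s NodeState) String() string {
	switch s {
	case NodeStateUnhealthy:
		return "Unhealthy"
	case NodeStateRemoving:
		return "Removing"
	default:
		return "Unknown"
	}
}

// node is a toy stand-in for a resource graph node.
type node struct {
	state   NodeState
	lastErr error
}

// transitionTo applies the guard introduced by the patch: once a node is Removing,
// a later error report must not demote it back to Unhealthy, otherwise the pending
// removal is silently dropped and the node lingers in the graph.
func (n *node) transitionTo(next NodeState) {
	if n.state == NodeStateRemoving && next == NodeStateUnhealthy {
		fmt.Printf("blocked transition %s -> %s\n", n.state, next)
		return
	}
	n.state = next
}

// markForRemoval and logAndSetLastError loosely mimic the methods exercised by the new tests.
func (n *node) markForRemoval() { n.transitionTo(NodeStateRemoving) }

func (n *node) logAndSetLastError(err error) {
	n.lastErr = err
	n.transitionTo(NodeStateUnhealthy)
}

func main() {
	n := &node{state: NodeStateUnhealthy}
	n.markForRemoval()
	// A late health-check failure (for example, the completeConfig loop failing to
	// dial an offline remote) must not cancel the pending removal.
	n.logAndSetLastError(errors.New("cannot reach remote"))
	fmt.Println("final state:", n.state) // prints "Removing"
}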