Skip to content

Commit

Permalink
RSDK-9222 Serve restart_status HTTP endpoint (#4554)
Browse files Browse the repository at this point in the history
  • Loading branch information
benjirewis authored Nov 18, 2024
1 parent 8f47d2c commit 069b244
Show file tree
Hide file tree
Showing 3 changed files with 111 additions and 36 deletions.
117 changes: 81 additions & 36 deletions robot/impl/local_robot.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@ type localRobot struct {
// when a local tarball is updated.
localModuleVersions map[string]semver.Version
ftdc *ftdc.FTDC

// whether the robot is actively reconfiguring
reconfiguring atomic.Bool
}

// ExportResourcesAsDot exports the resource graph as a DOT representation for
Expand Down Expand Up @@ -1191,44 +1194,17 @@ func (r *localRobot) applyLocalModuleVersions(cfg *config.Config) {
}

func (r *localRobot) reconfigure(ctx context.Context, newConfig *config.Config, forceSync bool) {
// Maintenance config can be configured to block reconfigure based off of a sensor reading
// These sensors can be configured on the main robot, or a remote
// In situations where there are conflicting sensor names the following behavior happens
// Main robot and remote share sensor name -> main robot sensor is chosen
// Only remote has the sensor name -> remote sensor is read
// Multiple remotes share a sensor name -> conflict error is returned and reconfigure happens
// To specify a specific remote sensor use the name format remoteName:sensorName to specify a remote sensor
if newConfig.MaintenanceConfig != nil {
name, err := resource.NewFromString(newConfig.MaintenanceConfig.SensorName)
if err != nil {
r.logger.Warnf("sensor_name %s in maintenance config is not in a supported format", newConfig.MaintenanceConfig.SensorName)
} else {
sensorComponent, err := robot.ResourceFromRobot[sensor.Sensor](r, name)
if err != nil {
r.logger.Warnf("%s, Starting reconfiguration", err.Error())
} else {
canReconfigure, err := r.checkMaintenanceSensorReadings(ctx, newConfig.MaintenanceConfig.MaintenanceAllowedKey, sensorComponent)
if !canReconfigure {
if err != nil {
r.logger.CErrorw(ctx, "error reading maintenance sensor", "error", err)
} else {
r.logger.Info("maintenance_allowed_key found from readings on maintenance sensor. Skipping reconfiguration.")
}
diff, err := config.DiffConfigs(*r.Config(), *newConfig, false)
if err != nil {
r.logger.CErrorw(ctx, "error diffing the configs", "error", err)
}
// NetworkEqual checks if Cloud/Auth/Network are equal between configs
if diff != nil && !diff.NetworkEqual {
r.logger.Info("Machine reconfiguration skipped but Cloud/Auth/Network config section contain changes and will be applied.")
}
return
}
r.logger.Info("maintenance_allowed_key found from readings on maintenance sensor. Starting reconfiguration")
}
}
if !r.reconfigureAllowed(ctx, newConfig, true) {
return
}

// If reconfigure is allowed, assume we are reconfiguring until this function
// returns.
r.reconfiguring.Store(true)
defer func() {
r.reconfiguring.Store(false)
}()

r.configRevisionMu.Lock()
r.configRevision = config.Revision{
Revision: newConfig.Revision,
Expand Down Expand Up @@ -1520,6 +1496,63 @@ func (r *localRobot) Version(ctx context.Context) (robot.VersionResponse, error)
return robot.Version()
}

// reconfigureAllowed reports whether the local robot may reconfigure to cfg.
//
// When cfg carries no MaintenanceConfig the answer is always true. Otherwise
// the configured maintenance sensor is looked up and read; reconfiguration is
// refused only when checkMaintenanceSensorReadings reports that it is not
// allowed. A sensor name that cannot be parsed, or a sensor that cannot be
// resolved, does not block reconfiguration.
//
// The log parameter selects where messages go: true uses the robot's logger,
// false swaps in a blank logger so the check produces no log output.
func (r *localRobot) reconfigureAllowed(ctx context.Context, cfg *config.Config, log bool) bool {
	// Hack: callers that only probe for permission (the `/restart_status`
	// endpoint) pass log=false and get a no-op logger; everyone else logs
	// through the robot's logger.
	logger := r.logger
	if !log {
		logger = logging.NewBlankLogger("")
	}

	// Without a MaintenanceConfig there is nothing that could block us.
	maintenance := cfg.MaintenanceConfig
	if maintenance == nil {
		return true
	}

	// A maintenance config can block reconfigure based on a sensor reading.
	// The sensor may live on the main robot or on a remote. When sensor names
	// conflict:
	//   - main robot and a remote share the name -> the main robot sensor is chosen
	//   - only a remote has the name             -> the remote sensor is read
	//   - several remotes share the name         -> a conflict error is returned
	//     and reconfigure happens
	// Use the name format remoteName:sensorName to target a specific remote sensor.
	sensorName, parseErr := resource.NewFromString(maintenance.SensorName)
	if parseErr != nil {
		logger.Warnf("sensor_name %s in maintenance config is not in a supported format", maintenance.SensorName)
		return true
	}

	maintenanceSensor, lookupErr := robot.ResourceFromRobot[sensor.Sensor](r, sensorName)
	if lookupErr != nil {
		logger.Warnf("%s, Starting reconfiguration", lookupErr.Error())
		return true
	}

	// The boolean from checkMaintenanceSensorReadings is meaningful even when
	// an error accompanies it, so branch on the boolean before the error.
	allowed, readErr := r.checkMaintenanceSensorReadings(ctx, maintenance.MaintenanceAllowedKey, maintenanceSensor)
	if allowed {
		logger.Info("maintenance_allowed_key found from readings on maintenance sensor. Starting reconfiguration")
		return true
	}

	if readErr == nil {
		logger.Info("maintenance_allowed_key found from readings on maintenance sensor. Skipping reconfiguration.")
	} else {
		logger.CErrorw(ctx, "error reading maintenance sensor", "error", readErr)
	}

	// Reconfiguration is being skipped; still surface whether the
	// Cloud/Auth/Network sections differ between the current and new config.
	diff, diffErr := config.DiffConfigs(*r.Config(), *cfg, false)
	if diffErr != nil {
		logger.CErrorw(ctx, "error diffing the configs", "error", diffErr)
	}
	// NetworkEqual checks if Cloud/Auth/Network are equal between configs
	if diff != nil && !diff.NetworkEqual {
		logger.Info("Machine reconfiguration skipped but Cloud/Auth/Network config section contain changes and will be applied.")
	}

	return false
}

// checkMaintenanceSensorReadings ensures that errors from reading a sensor are handled properly.
func (r *localRobot) checkMaintenanceSensorReadings(ctx context.Context,
maintenanceAllowedKey string, sensor resource.Sensor,
Expand All @@ -1544,3 +1577,15 @@ func (r *localRobot) checkMaintenanceSensorReadings(ctx context.Context,
}
return canReconfigure, nil
}

// RestartAllowed reports whether the robot can safely be restarted. A restart
// is safe only when no reconfigure is currently in progress and a reconfigure
// against the robot's current config would itself be allowed.
func (r *localRobot) RestartAllowed() bool {
	// A reconfigure in flight always blocks a restart; skip the sensor check.
	if r.reconfiguring.Load() {
		return false
	}

	// Probe with logging disabled so that this check stays silent.
	return r.reconfigureAllowed(context.Background(), r.Config(), false)
}
3 changes: 3 additions & 0 deletions robot/robot.go
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,9 @@ type LocalRobot interface {
// visualization.
// DOT reference: https://graphviz.org/doc/info/lang.html
ExportResourcesAsDot(index int) (resource.GetSnapshotInfo, error)

// RestartAllowed returns whether the robot can safely be restarted.
RestartAllowed() bool
}

// A RemoteRobot is a Robot that was created through a connection.
Expand Down
27 changes: 27 additions & 0 deletions robot/web/web.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package web

import (
"context"
"encoding/json"
"fmt"
"html/template"
"io"
Expand Down Expand Up @@ -858,6 +859,9 @@ func (svc *webService) initMux(options weboptions.Options) (*goji.Mux, error) {
// TODO: accept params to display different formats
mux.HandleFunc(pat.New("/debug/graph"), svc.handleVisualizeResourceGraph)

// serve restart status
mux.HandleFunc(pat.New("/restart_status"), svc.handleRestartStatus)

prefix := "/viam"
addPrefix := func(h http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
Expand Down Expand Up @@ -1067,3 +1071,26 @@ func (svc *webService) foreignServiceHandler(srv interface{}, stream googlegrpc.
return stream.SendMsg(invokeResp)
}
}

// RestartStatusResponse is the JSON body returned by the `restart_status`
// HTTP endpoint.
type RestartStatusResponse struct {
	// RestartAllowed reports whether this instance of the viam-server can be
	// safely restarted. It is serialized under the key "restart_allowed".
	RestartAllowed bool `json:"restart_allowed"`
}

// handleRestartStatus serves the `/restart_status` endpoint. It writes a
// JSON-encoded RestartStatusResponse describing whether the robot can be
// restarted. If the underlying robot is not a robot.LocalRobot, nothing is
// written to the response.
func (svc *webService) handleRestartStatus(w http.ResponseWriter, r *http.Request) {
	local, ok := svc.r.(robot.LocalRobot)
	if !ok {
		return
	}

	status := RestartStatusResponse{RestartAllowed: local.RestartAllowed()}

	w.Header().Set("Content-Type", "application/json")

	// Encoding this response should never fail, so any error is only handed
	// to utils.UncheckedError rather than reported to the client.
	encodeErr := json.NewEncoder(w).Encode(status)
	utils.UncheckedError(encodeErr)
}

0 comments on commit 069b244

Please sign in to comment.