-
Notifications
You must be signed in to change notification settings - Fork 4.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix AP upgrade version issue #27277
Fix AP upgrade version issue #27277
Changes from 8 commits
b6037d3
f75d137
a3f6166
e25fcd8
9613a50
a1dba62
3baffa8
2bd366d
7efc4d2
d25a59c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
```release-note:bug | ||
storage/raft (enterprise): Fix a regression introduced in 1.15.8 that causes | ||
autopilot to fail to discover new server versions and so not trigger an upgrade. | ||
``` |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -39,6 +39,7 @@ import ( | |
"github.com/hashicorp/vault/sdk/logical" | ||
"github.com/hashicorp/vault/sdk/physical" | ||
"github.com/hashicorp/vault/vault/cluster" | ||
"github.com/hashicorp/vault/version" | ||
etcdbolt "go.etcd.io/bbolt" | ||
) | ||
|
||
|
@@ -582,6 +583,7 @@ func NewRaftBackend(conf map[string]string, logger log.Logger) (physical.Backend | |
failGetInTxn: new(uint32), | ||
raftLogVerifierEnabled: backendConfig.RaftLogVerifierEnabled, | ||
raftLogVerificationInterval: backendConfig.RaftLogVerificationInterval, | ||
effectiveSDKVersion: version.GetVersion().Version, | ||
}, nil | ||
} | ||
|
||
|
@@ -660,12 +662,6 @@ func (b *RaftBackend) FailGetInTxn(fail bool) { | |
atomic.StoreUint32(b.failGetInTxn, val) | ||
} | ||
|
||
func (b *RaftBackend) SetEffectiveSDKVersion(sdkVersion string) { | ||
b.l.Lock() | ||
b.effectiveSDKVersion = sdkVersion | ||
b.l.Unlock() | ||
} | ||
|
||
func (b *RaftBackend) RedundancyZone() string { | ||
b.l.RLock() | ||
defer b.l.RUnlock() | ||
|
@@ -1089,6 +1085,11 @@ type SetupOpts struct { | |
// RecoveryModeConfig is the configuration for the raft cluster in recovery | ||
// mode. | ||
RecoveryModeConfig *raft.Configuration | ||
|
||
// EffectiveSDKVersion is typically the version string baked into the binary. | ||
// We pass it in though because it can be overridden in tests or via ENV in | ||
// core. | ||
EffectiveSDKVersion string | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I preferred making this an option when we setup the cluster since that is an existing mechanism for runtime config that isn't known at |
||
} | ||
|
||
func (b *RaftBackend) StartRecoveryCluster(ctx context.Context, peer Peer) error { | ||
|
@@ -1132,6 +1133,11 @@ func (b *RaftBackend) SetupCluster(ctx context.Context, opts SetupOpts) error { | |
return errors.New("no local node id configured") | ||
} | ||
|
||
if opts.EffectiveSDKVersion != "" { | ||
// Override the SDK version | ||
b.effectiveSDKVersion = opts.EffectiveSDKVersion | ||
} | ||
|
||
// Setup the raft config | ||
raftConfig := raft.DefaultConfig() | ||
if err := b.applyConfigSettings(raftConfig); err != nil { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -41,6 +41,39 @@ func TestRaft_Autopilot_Disable(t *testing.T) { | |
require.Nil(t, nil, state) | ||
} | ||
|
||
// TestRaft_Autopilot_BinaryVersionPlumbing is an apparently trivial test that | ||
// ensures that the default plumbing in Vault core to configure the binary | ||
// version of the raft library is working. Hopefully this will trivially pass | ||
// from now on, however it would have caught a regression in the past! | ||
func TestRaft_Autopilot_BinaryVersionPlumbing(t *testing.T) { | ||
t.Parallel() | ||
|
||
coreCfg, clusterOpts := raftClusterBuilder(t, &RaftClusterOpts{ | ||
EnableAutopilot: true, | ||
// We need 2 nodes because the code path that regressed was different on a | ||
// standby vs active node so we'd not detect the problem if we only test on | ||
// an active node. | ||
NumCores: 2, | ||
}) | ||
|
||
// Default options should not set EffectiveSDKVersion(Map) which would defeat | ||
// the point of this test by plumbing versions via config. | ||
require.Nil(t, clusterOpts.EffectiveSDKVersionMap) | ||
require.Empty(t, coreCfg.EffectiveSDKVersion) | ||
|
||
c := vault.NewTestCluster(t, coreCfg, &clusterOpts) | ||
defer c.Cleanup() | ||
|
||
// Wait for the active node and its standbys. In CE the follower never becomes a perf standby; the helper just waits for the active node. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since this is CE Vault, the follower will never become a perf standby, will it? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, in CE that helper just waits for the active node. So yeah it's possible this test would not catch the regression in CE before this fix or would fail for the wrong reason, but either way I think it passes now and actively catches the issue in Enterprise so it seems worthwhile keeping it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll update the comment 👍 |
||
testhelpers.WaitForActiveNodeAndStandbys(t, c) | ||
for _, core := range c.Cores { | ||
be := core.UnderlyingRawStorage.(*raft.RaftBackend) | ||
require.Equal(t, version.GetVersion().Version, be.UpgradeVersion(), | ||
"expected raft upgrade version to default to Vault version for core %q", | ||
core.NodeID) | ||
} | ||
} | ||
|
||
// TestRaft_Autopilot_Stabilization_And_State verifies that nodes get promoted | ||
// to be voters after the stabilization time has elapsed. Also checks that | ||
// the autopilot state is Healthy once all nodes are available. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -149,9 +149,10 @@ func (c *Core) startRaftBackend(ctx context.Context) (retErr error) { | |
raftBackend.SetRestoreCallback(c.raftSnapshotRestoreCallback(true, true)) | ||
|
||
if err := raftBackend.SetupCluster(ctx, raft.SetupOpts{ | ||
TLSKeyring: raftTLS, | ||
ClusterListener: c.getClusterListener(), | ||
StartAsLeader: creating, | ||
TLSKeyring: raftTLS, | ||
ClusterListener: c.getClusterListener(), | ||
StartAsLeader: creating, | ||
EffectiveSDKVersion: c.effectiveSDKVersion, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note that |
||
}); err != nil { | ||
return err | ||
} | ||
|
@@ -309,7 +310,6 @@ func (c *Core) setupRaftActiveNode(ctx context.Context) error { | |
} | ||
|
||
c.logger.Info("starting raft active node") | ||
raftBackend.SetEffectiveSDKVersion(c.effectiveSDKVersion) | ||
|
||
c.pendingRaftPeers = &sync.Map{} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Setting it by default to the binary version is a defensive move so we have something that's 99% of the time going to be correct anyway even if the plumbing of the effective version is still incorrect in some edge case (or becomes incorrect later).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good call