PKI: Track last time auto tidy was run across restarts (#28488)
* Track the last time PKI auto-tidy ran, for use across nodes

 - If the auto-tidy interval is longer than, say, the period between
   regularly scheduled restarts of Vault, auto-tidy never runs. This is
   because the time of the last tidy run is kept only in memory and is
   initialized to the current time on startup.
 - Store the time of the last run of any tidy (to maintain previous
   behavior) in a cluster-local storage entry, which is read in/initialized
   upon mount initialization.

* Add auto-tidy configuration fields for backing off at startup (a hedged usage example follows this list)

* Add new auto-tidy fields to UI

* Update api docs for auto-tidy

* Add changelog entry

* Update field description text

* Apply Claire's suggestions from code review

Co-authored-by: claire bontempo <[email protected]>

* Implementing PR feedback from the UI team

* Remove explicit defaults and types so we retrieve them from the backend, decouple enabling auto-tidy from the duration, and move params to the auto-tidy settings section
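
The new configuration fields themselves are not part of the backend.go diff shown below. As a hedged illustration only — the field names min_startup_backoff_duration and max_startup_backoff_duration are assumptions based on the bullets above, so check the auto-tidy API docs updated in this PR — configuring the startup backoff window via the Vault Go client might look like:

	package main

	import (
		"fmt"
		"log"

		"github.com/hashicorp/vault/api"
	)

	func main() {
		// Standard Vault API client; reads VAULT_ADDR and VAULT_TOKEN from
		// the environment.
		client, err := api.NewClient(api.DefaultConfig())
		if err != nil {
			log.Fatal(err)
		}

		// The two *_startup_backoff_duration field names are assumptions,
		// not confirmed by the hunks shown on this page.
		_, err = client.Logical().Write("pki/config/auto-tidy", map[string]interface{}{
			"enabled":                      true,
			"tidy_cert_store":              true,
			"interval_duration":            "12h",
			"min_startup_backoff_duration": "5m",
			"max_startup_backoff_duration": "15m",
		})
		if err != nil {
			log.Fatal(err)
		}
		fmt.Println("auto-tidy configuration updated")
	}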

---------

Co-authored-by: claire bontempo <[email protected]>
Co-authored-by: claire bontempo <[email protected]>
3 people authored Sep 26, 2024
1 parent 31d5814 commit 2db2a9f
Showing 16 changed files with 570 additions and 289 deletions.
71 changes: 62 additions & 9 deletions builtin/logical/pki/backend.go
@@ -131,6 +131,7 @@ func Backend(conf *logical.BackendConfig) *backend {
issuing.PathCerts,
issuing.PathCertMetadata,
acmePathPrefix,
autoTidyLastRunPath,
},

Root: []string{
@@ -294,8 +295,12 @@ func Backend(conf *logical.BackendConfig) *backend {
conf.System.ReplicationState().HasState(consts.ReplicationDRSecondary)
b.crlBuilder = newCRLBuilder(!cannotRebuildCRLs)

// Delay the first tidy until after we've started up.
b.lastTidy = time.Now()
// Delay the first tidy until after we've started up; this will be reset within the initialize function
now := time.Now()
b.lastAutoTidy = now

// Keep track of when this mount was started up.
b.mountStartup = now

b.unifiedTransferStatus = newUnifiedTransferStatus()

@@ -320,7 +325,14 @@ type backend struct {

tidyStatusLock sync.RWMutex
tidyStatus *tidyStatus
lastTidy time.Time
// lastAutoTidy should be accessed through the tidyStatusLock;
// use getLastAutoTidyTime and updateLastAutoTidyTime instead of direct access
lastAutoTidy time.Time

// autoTidyBackoff is a randomized time in the future before which auto-tidy
// will not start after the system starts up, to avoid a thundering herd of
// tidy operations at startup.
autoTidyBackoff time.Time

unifiedTransferStatus *UnifiedTransferStatus

@@ -335,6 +347,9 @@ type backend struct {
// Context around ACME operations
acmeState *acmeState
acmeAccountLock sync.RWMutex // (Write) Locked on Tidy, (Read) Locked on Account Creation

// Track when this mount was started.
mountStartup time.Time
}

// BackendOps a bridge/legacy interface until we can further
@@ -448,9 +463,38 @@ func (b *backend) initialize(ctx context.Context, ir *logical.InitializationRequ
b.GetCertificateCounter().SetError(err)
}

// Initialize lastAutoTidy from disk
b.initializeLastTidyFromStorage(sc)

return b.initializeEnt(sc, ir)
}

// initializeLastTidyFromStorage reads the time we last ran auto tidy from storage and initializes
// b.lastAutoTidy with the value. If no previous value existed, we persist time.Now() and initialize
// b.lastAutoTidy with that value.
func (b *backend) initializeLastTidyFromStorage(sc *storageContext) {
now := time.Now()

lastTidyTime, err := sc.getAutoTidyLastRun()
if err != nil {
lastTidyTime = now
b.Logger().Error("failed loading previous tidy last run time, using now", "error", err.Error())
}
if lastTidyTime.IsZero() {
// No previous time was set, persist now so we can track a starting point across Vault restarts
lastTidyTime = now
if err = b.updateLastAutoTidyTime(sc, now); err != nil {
b.Logger().Error("failed persisting tidy last run time", "error", err.Error())
}
}

// We bypass updateLastAutoTidyTime here to avoid a storage write on init
// that normally isn't required
b.tidyStatusLock.Lock()
defer b.tidyStatusLock.Unlock()
b.lastAutoTidy = lastTidyTime
}
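
The helpers used above — sc.getAutoTidyLastRun, b.updateLastAutoTidyTime, and b.getLastAutoTidyTime (used further down in periodicFunc) — are added elsewhere in this PR and don't appear in these hunks. A minimal sketch of how they could look, assuming storageContext exposes Context and Storage fields and that the timestamp is stored as JSON under autoTidyLastRunPath (the JSON field name is an assumption):

	// Sketch only; the PR's actual helpers live outside the hunks shown here.
	type autoTidyLastRun struct {
		LastRunTime time.Time `json:"last_run_time"` // assumed field name
	}

	// getAutoTidyLastRun reads the persisted last-run time. A zero time with
	// a nil error means nothing has been persisted yet.
	func (sc *storageContext) getAutoTidyLastRun() (time.Time, error) {
		entry, err := sc.Storage.Get(sc.Context, autoTidyLastRunPath)
		if err != nil || entry == nil {
			return time.Time{}, err
		}
		var lastRun autoTidyLastRun
		if err := entry.DecodeJSON(&lastRun); err != nil {
			return time.Time{}, err
		}
		return lastRun.LastRunTime, nil
	}

	// updateLastAutoTidyTime updates the in-memory copy under the write lock
	// and persists it so the value survives restarts.
	func (b *backend) updateLastAutoTidyTime(sc *storageContext, t time.Time) error {
		b.tidyStatusLock.Lock()
		defer b.tidyStatusLock.Unlock()
		b.lastAutoTidy = t
		entry, err := logical.StorageEntryJSON(autoTidyLastRunPath, autoTidyLastRun{LastRunTime: t})
		if err != nil {
			return err
		}
		return sc.Storage.Put(sc.Context, entry)
	}

	// getLastAutoTidyTime returns the in-memory copy under a read lock.
	func (b *backend) getLastAutoTidyTime() time.Time {
		b.tidyStatusLock.RLock()
		defer b.tidyStatusLock.RUnlock()
		return b.lastAutoTidy
	}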

func (b *backend) cleanup(ctx context.Context) {
sc := b.makeStorageContext(ctx, b.storage)

@@ -708,13 +752,20 @@ func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) er

// Check if we should run another tidy...
now := time.Now()
b.tidyStatusLock.RLock()
nextOp := b.lastTidy.Add(config.Interval)
b.tidyStatusLock.RUnlock()
nextOp := b.getLastAutoTidyTime().Add(config.Interval)
if now.Before(nextOp) {
return nil
}

if b.autoTidyBackoff.IsZero() {
b.autoTidyBackoff = config.CalculateStartupBackoff(b.mountStartup)
}

if b.autoTidyBackoff.After(now) {
b.Logger().Info("Auto tidy will not run as we are still within the random startup backoff window", "backoff_until", b.autoTidyBackoff)
return nil
}
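
config.CalculateStartupBackoff is also defined elsewhere in this PR; its implementation is not shown on this page. A sketch of the idea, assuming (hypothetical field names) that the tidy config gained min/max startup backoff durations via the new configuration fields, and using math/rand for the jitter: pick a random instant in [mountStartup+min, mountStartup+max) so that mounts restarting together do not all begin tidying at once.

	// Sketch only; field names MinStartupBackoff/MaxStartupBackoff are
	// assumptions, not the PR's actual implementation.
	func (c *tidyConfig) CalculateStartupBackoff(mountStartup time.Time) time.Time {
		window := c.MaxStartupBackoff - c.MinStartupBackoff
		if window <= 0 {
			// No randomization window configured; back off by the minimum.
			return mountStartup.Add(c.MinStartupBackoff)
		}
		// A uniformly random offset within the window spreads tidy starts
		// out across restarting mounts.
		jitter := time.Duration(rand.Int63n(int64(window)))
		return mountStartup.Add(c.MinStartupBackoff + jitter)
	}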

// Ensure a tidy isn't already running... If it is, we'll trigger
// again when the running one finishes.
if !atomic.CompareAndSwapUint32(b.tidyCASGuard, 0, 1) {
@@ -724,9 +775,11 @@ func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) er
// Prevent ourselves from starting another tidy operation while
// this one is still running. This operation runs in the background
// and has a separate error reporting mechanism.
b.tidyStatusLock.Lock()
b.lastTidy = now
b.tidyStatusLock.Unlock()
err = b.updateLastAutoTidyTime(sc, now)
if err != nil {
// We don't really mind if this write fails; we'll re-run in the future
b.Logger().Warn("failed to persist auto tidy last run time", "error", err.Error())
}

// Because the request from the parent storage will be cleared at
// some point (and potentially reused) -- due to tidy executing in
