Skip to content
This repository has been archived by the owner on Sep 30, 2024. It is now read-only.

Commit

Permalink
fix/msp/jobs: do not provision job absence alert for crons over 23h30m (
Browse files Browse the repository at this point in the history
#64502)

Closes
https://linear.app/sourcegraph/issue/CORE-237/error-creating-alertpolicy-with-an-invalid-duration-value-of-24h10m

GCP no longer allows alert policies on missing metrics for durations >
23h30m. Since the introduction of the MSP job runtime, jobs can depend
on Sentry execution check-ins instead to be notified on missing job
runs.

## Test plan

```
go build -o ./sg ./dev/sg && ./sg install -f -p=false
sg msp generate -all
```
  • Loading branch information
bobheadxi authored Aug 15, 2024
1 parent c93c377 commit 8f59ef4
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 25 deletions.
7 changes: 7 additions & 0 deletions dev/managedservicesplatform/spec/environment.go
Original file line number Diff line number Diff line change
Expand Up @@ -605,6 +605,13 @@ type EnvironmentJobScheduleSpec struct {
// than once a week.
//
// Protip: use https://crontab.guru
//
// Note that no GCP alert for missing job executions is provisioned if the
// cron has an interval of more than 23h30m, due to GCP alerts limitations.
// Instead, you should use the MSP job runtime and monitor Sentry
// alerts instead: https://develop.sentry.dev/sdk/telemetry/check-ins/
// The MSP job runtime automatically registers Sentry check-ins on each
// execution.
Cron string `yaml:"cron"`
}

Expand Down
56 changes: 31 additions & 25 deletions dev/managedservicesplatform/stacks/monitoring/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,35 +63,41 @@ func createJobAlerts(
// Use the duration calculated from the cron, with some leeway. Use
// math.Ceil just in case, as Minutes() may give us floats
absentMinutes := int(math.Ceil(interval.Minutes())) + 10
// GCP does not allow absence alerts above 23h30m - for jobs that run
// at a longer interval, depend on the implementor using the MSP job
// runtime to report runs to Sentry, and have Sentry handle alerting
// on missing executions instead.
if absentMinutes < int((23*time.Hour + 30*time.Minute).Minutes()) {
alert, err := alertpolicy.New(stack, id, &alertpolicy.Config{
Service: vars.Service,
EnvironmentID: vars.EnvironmentID,

alert, err := alertpolicy.New(stack, id, &alertpolicy.Config{
Service: vars.Service,
EnvironmentID: vars.EnvironmentID,

ID: "job_execution_absence",
Name: "Cloud Run Job Execution Absence",
Description: fmt.Sprintf("No Cloud Run Job executions were detected in expected window (%dm)", absentMinutes),
ProjectID: vars.ProjectID,
MetricAbsence: &alertpolicy.MetricAbsence{
ConditionBuilder: alertpolicy.ConditionBuilder{
ResourceName: vars.Service.ID,
ResourceKind: alertpolicy.CloudRunJob,
Filters: map[string]string{
"metric.type": "run.googleapis.com/job/completed_task_attempt_count",
ID: "job_execution_absence",
Name: "Cloud Run Job Execution Absence",
Description: fmt.Sprintf("No Cloud Run Job executions were detected in expected window (%dm)", absentMinutes),
ProjectID: vars.ProjectID,
MetricAbsence: &alertpolicy.MetricAbsence{
ConditionBuilder: alertpolicy.ConditionBuilder{
ResourceName: vars.Service.ID,
ResourceKind: alertpolicy.CloudRunJob,
Filters: map[string]string{
"metric.type": "run.googleapis.com/job/completed_task_attempt_count",
},
Aligner: alertpolicy.MonitoringAlignCount,
Reducer: alertpolicy.MonitoringReduceSum,
Period: "60s",
},
Aligner: alertpolicy.MonitoringAlignCount,
Reducer: alertpolicy.MonitoringReduceSum,
Period: "60s",
// Must be in seconds
Duration: fmt.Sprintf("%ds", absentMinutes*60),
},
// Must be in seconds
Duration: fmt.Sprintf("%ds", absentMinutes*60),
},
NotificationChannels: channels,
})
if err != nil {
return nil, errors.Wrap(err, "job_execution_absence")
NotificationChannels: channels,
})
if err != nil {
return nil, errors.Wrap(err, "job_execution_absence")
}
alerts = append(alerts, alert.AlertPolicy)
}
alerts = append(alerts, alert.AlertPolicy)

}

return alerts, nil
Expand Down

0 comments on commit 8f59ef4

Please sign in to comment.