Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(crons): mark misses as UNKNOWN when tick volume is ABNORMAL #80355

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions src/sentry/monitors/clock_tasks/check_missed.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,11 @@ def dispatch_check_missing(ts: datetime, volume_anomaly_result: TickVolumeAnomol
produce_task(payload)


def mark_environment_missing(monitor_environment_id: int, ts: datetime):
def mark_environment_missing(
monitor_environment_id: int,
ts: datetime,
volume_anomaly_result: TickVolumeAnomolyResult,
):
logger.info("mark_missing", extra={"monitor_environment_id": monitor_environment_id})

try:
Expand All @@ -118,6 +122,14 @@ def mark_environment_missing(monitor_environment_id: int, ts: datetime):
assert monitor_environment.next_checkin is not None
expected_time = monitor_environment.next_checkin

# If this tick had an abnormal volume of processed check-ins, mark it as
# UNKNOWN to avoid false-positive monitor incidents (and notifications).
match volume_anomaly_result:
case TickVolumeAnomolyResult.NORMAL:
status = CheckInStatus.MISSED
case TickVolumeAnomolyResult.ABNORMAL:
status = CheckInStatus.UNKNOWN

# add missed checkin.
#
# XXX(epurkhiser): The date_added is backdated so that this missed
Expand All @@ -127,7 +139,7 @@ def mark_environment_missing(monitor_environment_id: int, ts: datetime):
project_id=monitor_environment.monitor.project_id,
monitor=monitor_environment.monitor,
monitor_environment=monitor_environment,
status=CheckInStatus.MISSED,
status=status,
date_added=expected_time,
expected_time=expected_time,
monitor_config=monitor.get_validated_config(),
Expand Down
8 changes: 7 additions & 1 deletion src/sentry/monitors/consumers/clock_tasks_consumer.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from sentry.monitors.clock_tasks.check_missed import mark_environment_missing
from sentry.monitors.clock_tasks.check_timeout import mark_checkin_timeout
from sentry.monitors.clock_tasks.mark_unknown import mark_checkin_unknown
from sentry.monitors.types import TickVolumeAnomolyResult

MONITORS_CLOCK_TASKS_CODEC: Codec[MonitorsClockTasks] = get_topic_codec(Topic.MONITORS_CLOCK_TASKS)

Expand Down Expand Up @@ -57,7 +58,12 @@ def process_clock_task(message: Message[KafkaPayload | FilteredPayload]):
return

if is_mark_missing(wrapper):
mark_environment_missing(int(wrapper["monitor_environment_id"]), ts)
volume_anomaly_result = TickVolumeAnomolyResult.from_str(
wrapper.get("volume_anomaly_result", "normal")
)
mark_environment_missing(
int(wrapper["monitor_environment_id"]), ts, volume_anomaly_result
)
return

logger.error("Unsupported clock-tick task type: %s", wrapper["type"])
Expand Down
38 changes: 31 additions & 7 deletions tests/sentry/monitors/clock_tasks/test_check_missed.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def test_missing_checkin(self, mock_produce_task):
assert mock_produce_task.call_count == 1
assert mock_produce_task.mock_calls[0] == mock.call(payload)

mark_environment_missing(monitor_environment.id, ts)
mark_environment_missing(monitor_environment.id, ts, TickVolumeAnomolyResult.NORMAL)

# Monitor status is updated
monitor_environment = MonitorEnvironment.objects.get(
Expand Down Expand Up @@ -164,7 +164,11 @@ def test_missing_checkin_with_timezone(self, mock_produce_task):
assert mock_produce_task.call_count == 1

# Missed check-in correctly updates
mark_environment_missing(monitor_environment.id, ts + timedelta(minutes=1))
mark_environment_missing(
monitor_environment.id,
ts + timedelta(minutes=1),
TickVolumeAnomolyResult.NORMAL,
)
monitor_environment.refresh_from_db()

assert MonitorCheckIn.objects.filter(
Expand Down Expand Up @@ -245,6 +249,7 @@ def test_missing_checkin_with_margin(self, mock_produce_task):
mark_environment_missing(
monitor_environment.id,
ts + timedelta(minutes=4),
TickVolumeAnomolyResult.NORMAL,
)

monitor_environment = MonitorEnvironment.objects.get(
Expand Down Expand Up @@ -349,6 +354,7 @@ def test_missing_checkin_with_margin_schedule_overlap(self, mock_produce_task):
mark_environment_missing(
monitor_environment.id,
ts + timedelta(minutes=10),
TickVolumeAnomolyResult.NORMAL,
)

# The missed checkin is created when it was supposed to happen
Expand Down Expand Up @@ -410,6 +416,7 @@ def test_missing_checkin_with_skipped_clock_ticks(self, mock_produce_task):
mark_environment_missing(
monitor_environment.id,
ts + timedelta(minutes=1),
TickVolumeAnomolyResult.NORMAL,
)

# MonitorEnvironment is correctly updated with the next checkin time
Expand All @@ -425,6 +432,7 @@ def test_missing_checkin_with_skipped_clock_ticks(self, mock_produce_task):
mark_environment_missing(
monitor_environment.id,
ts + timedelta(minutes=3),
TickVolumeAnomolyResult.NORMAL,
)

# MonitorEnvironment is updated with the next_checkin correctly being
Expand Down Expand Up @@ -597,10 +605,18 @@ def test_missed_exception_handling(self, mock_produce_task):

# assert failing monitor raises an error
with pytest.raises(ValueError):
mark_environment_missing(failing_monitor_environment.id, ts)
mark_environment_missing(
failing_monitor_environment.id,
ts,
TickVolumeAnomolyResult.NORMAL,
)

# assert regular monitor works
mark_environment_missing(successful_monitor_environment.id, ts)
mark_environment_missing(
successful_monitor_environment.id,
ts,
TickVolumeAnomolyResult.NORMAL,
)

# We still put the monitor in an error state
assert MonitorEnvironment.objects.filter(
Expand Down Expand Up @@ -702,7 +718,11 @@ def test_missed_checkin_backlog_handled(self, mock_produce_task):
# Execute the queued mark_missing tasks. This will have moved the
# next_checkin_latest forward, meaning the next tick should NOT create
# a missed check-in.
mark_environment_missing(monitor_environment.id, ts + timedelta(minutes=1))
mark_environment_missing(
monitor_environment.id,
ts + timedelta(minutes=1),
TickVolumeAnomolyResult.NORMAL,
)

# We have created a missed check-in
missed_checkin = MonitorCheckIn.objects.get(
Expand All @@ -713,7 +733,11 @@ def test_missed_checkin_backlog_handled(self, mock_produce_task):

# Execute the second task. This should detect that we've already moved
# past the next_checkin_latest and NOT create a new missed
mark_environment_missing(monitor_environment.id, ts + timedelta(minutes=2))
mark_environment_missing(
monitor_environment.id,
ts + timedelta(minutes=2),
TickVolumeAnomolyResult.NORMAL,
)

missed_count = MonitorCheckIn.objects.filter(
monitor_environment=monitor_environment.id, status=CheckInStatus.MISSED
Expand Down Expand Up @@ -752,7 +776,7 @@ def test_status_updated_before_task_execution(self):
status=MonitorStatus.DISABLED,
)

mark_environment_missing(monitor_environment.id, ts)
mark_environment_missing(monitor_environment.id, ts, TickVolumeAnomolyResult.NORMAL)

# Do NOT generate a missed check-in
assert not MonitorCheckIn.objects.filter(
Expand Down
12 changes: 10 additions & 2 deletions tests/sentry/monitors/consumers/test_clock_tasks_consumer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
MONITORS_CLOCK_TASKS_CODEC,
MonitorClockTasksStrategyFactory,
)
from sentry.monitors.types import TickVolumeAnomolyResult

partition = Partition(Topic("test"), 0)

Expand Down Expand Up @@ -43,11 +44,18 @@ def test_dispatch_mark_missing(mock_mark_environment_missing):
send_task(
consumer,
ts,
{"type": "mark_missing", "ts": ts.timestamp(), "monitor_environment_id": 1},
{
"type": "mark_missing",
"ts": ts.timestamp(),
"monitor_environment_id": 1,
"volume_anomaly_result": TickVolumeAnomolyResult.NORMAL.value,
},
)

assert mock_mark_environment_missing.call_count == 1
assert mock_mark_environment_missing.mock_calls[0] == mock.call(1, ts)
assert mock_mark_environment_missing.mock_calls[0] == mock.call(
1, ts, TickVolumeAnomolyResult.NORMAL
)


@mock.patch("sentry.monitors.consumers.clock_tasks_consumer.mark_checkin_timeout")
Expand Down
Loading