From b65daa30b4792dde136bbbcc4a10ba8e168c3b93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20R=C3=ADos?= Date: Tue, 3 Dec 2024 07:26:59 +0100 Subject: [PATCH] add: `libin` diagram first iteration (#2541) * add: `libin` diagram first iteration * Disable user_retention_classifications for now * fix time to first response * contributor classifications working * Added lifecycle model * trying to improve query - some problems seem to be upstream * Consolidate change in developers * get change in contributors * more comments * Update warehouse/metrics_mesh/oso_metrics/lifecycle.sql * Update warehouse/metrics_mesh/oso_metrics/lifecycle.sql * Remove the libin.sql * Remove unused metrics models * add the additional ctes --------- Co-authored-by: Reuven V. Gonzales Co-authored-by: Reuven Gonzales --- .../events_daily_to_artifact_with_lag.sql | 28 +++ .../metrics_mesh/models/metrics_factories.py | 124 +++++++---- .../oso_metrics/change_in_developers.sql | 16 +- .../contributor_activity_classification.sql | 186 ++++++++++++++++- .../oso_metrics/first_commit_date.sql | 17 -- .../oso_metrics/last_commit_date.sql | 17 -- .../metrics_mesh/oso_metrics/lifecycle.sql | 195 ++++++++++++++++++ .../oso_metrics/time_to_first_response.sql | 4 +- .../user_retention_classification.sql | 67 +++--- .../test_change_in_developers_over_window.yml | 2 +- warehouse/metrics_tools/factory/factory.py | 10 +- warehouse/metrics_tools/joiner/__init__.py | 14 +- warehouse/metrics_tools/runner.py | 13 +- 13 files changed, 561 insertions(+), 132 deletions(-) create mode 100644 warehouse/metrics_mesh/models/events_daily_to_artifact_with_lag.sql delete mode 100644 warehouse/metrics_mesh/oso_metrics/first_commit_date.sql delete mode 100644 warehouse/metrics_mesh/oso_metrics/last_commit_date.sql create mode 100644 warehouse/metrics_mesh/oso_metrics/lifecycle.sql diff --git a/warehouse/metrics_mesh/models/events_daily_to_artifact_with_lag.sql b/warehouse/metrics_mesh/models/events_daily_to_artifact_with_lag.sql new file mode 100644 index 000000000..cb3f09bfe --- /dev/null +++ b/warehouse/metrics_mesh/models/events_daily_to_artifact_with_lag.sql @@ -0,0 +1,28 @@ +MODEL ( + name metrics.events_daily_to_artifact_with_lag, + kind FULL, + start '2015-01-01', + cron '@daily', + partitioned_by (day("bucket_day"), "event_type"), + grain ( + bucket_day, + event_type, + event_source, + from_artifact_id, + to_artifact_id + ), +); +SELECT bucket_day, + to_artifact_id, + from_artifact_id, + event_source, + event_type, + amount, + LAG(bucket_day) OVER ( + PARTITION BY to_artifact_id, + from_artifact_id, + event_source, + event_type + ORDER BY bucket_day + ) AS last_event +FROM metrics.events_daily_to_artifact \ No newline at end of file diff --git a/warehouse/metrics_mesh/models/metrics_factories.py b/warehouse/metrics_mesh/models/metrics_factories.py index 1b7818d10..fc7326cfd 100644 --- a/warehouse/metrics_mesh/models/metrics_factories.py +++ b/warehouse/metrics_mesh/models/metrics_factories.py @@ -10,7 +10,9 @@ model_prefix="timeseries", timeseries_sources=[ "events_daily_to_artifact", + "events_daily_to_artifact_with_lag", "issue_event_time_deltas", + "first_of_event_from_artifact", ], metric_queries={ # This will automatically generate star counts for the given roll up periods. @@ -76,68 +78,70 @@ entity_types=["artifact", "project", "collection"], is_intermediate=True, ), - "developer_classifications": MetricQueryDef( - ref="developer_activity_classification.sql", + "contributor_active_days": MetricQueryDef( + ref="active_days.sql", vars={ - "full_time_ratio": 10 / 30, + "activity_event_types": [ + "COMMIT_CODE", + "ISSUE_OPENED", + "PULL_REQUEST_OPENED", + "PULL_REQUEST_MERGED", + ], }, rolling=RollingConfig( windows=[30, 90, 180], unit="day", - cron="@daily", - ), - ), - "contributor_classifications": MetricQueryDef( - ref="contributor_activity_classification.sql", - vars={"full_time_ratio": 10 / 30}, - rolling=RollingConfig( - windows=[30, 90, 180], - unit="day", - cron="@daily", + cron="@daily", # This determines how often this is calculated ), + entity_types=["artifact", "project", "collection"], + is_intermediate=True, ), - "user_retention_classifications": MetricQueryDef( - ref="user_retention_classification.sql", + "developer_classifications": MetricQueryDef( + ref="developer_activity_classification.sql", vars={ - "activity_event_types": ["CONTRACT_INVOCATION_SUCCESS_DAILY_COUNT"], + "full_time_ratio": 10 / 30, }, rolling=RollingConfig( windows=[30, 90, 180], unit="day", cron="@daily", ), - entity_types=["artifact", "project", "collection"], - ), - "change_in_30_day_developer_activity": MetricQueryDef( - vars={ - "comparison_interval": 30, - }, - ref="change_in_developers.sql", - rolling=RollingConfig( - windows=[2], - unit="period", - cron="@daily", - ), ), - "change_in_90_day_developer_activity": MetricQueryDef( + "contributor_classifications": MetricQueryDef( + ref="contributor_activity_classification.sql", vars={ - "comparison_interval": 90, + "full_time_ratio": 10 / 30, + "activity_event_types": [ + "COMMIT_CODE", + "ISSUE_OPENED", + "PULL_REQUEST_OPENED", + "PULL_REQUEST_MERGED", + ], }, - ref="change_in_developers.sql", rolling=RollingConfig( - windows=[2], - unit="period", + windows=[30, 90, 180], + unit="day", cron="@daily", ), ), - "change_in_180_day_developer_activity": MetricQueryDef( - vars={ - "comparison_interval": 180, - }, + # Currently this query performs really poorly. We need to do some debugging on it + # "user_retention_classifications": MetricQueryDef( + # ref="user_retention_classification.sql", + # vars={ + # "activity_event_types": ["CONTRACT_INVOCATION_SUCCESS_DAILY_COUNT"], + # }, + # rolling=RollingConfig( + # windows=[30, 90, 180], + # unit="day", + # cron="@daily", + # ), + # entity_types=["artifact", "project", "collection"], + # ), + "change_in_developer_activity": MetricQueryDef( ref="change_in_developers.sql", rolling=RollingConfig( - windows=[2], - unit="period", + windows=[30, 90, 180], + unit="day", cron="@daily", ), ), @@ -195,8 +199,8 @@ ), entity_types=["artifact", "project", "collection"], ), - "avg_time_to_first_response": MetricQueryDef( - ref="prs_time_to_merge.sql", + "avg_time_to_first_response": MetricQueryDef( + ref="time_to_first_response.sql", rolling=RollingConfig( windows=[90, 180], unit="day", @@ -230,7 +234,7 @@ cron="@daily", ), entity_types=["artifact", "project", "collection"], - ), + ), "transactions": MetricQueryDef( ref="transactions.sql", rolling=RollingConfig( @@ -239,7 +243,41 @@ cron="@daily", ), entity_types=["artifact", "project", "collection"], - ), + ), + "contributors_lifecycle": MetricQueryDef( + ref="lifecycle.sql", + vars={ + "activity_event_types": [ + "COMMIT_CODE", + "ISSUE_OPENED", + "PULL_REQUEST_OPENED", + "PULL_REQUEST_MERGED", + ], + }, + rolling=RollingConfig( + windows=[30, 90, 180], + unit="day", + cron="@daily", + ), + entity_types=["artifact", "project", "collection"], + ), + # "libin": MetricQueryDef( + # ref="libin.sql", + # vars={ + # "activity_event_types": [ + # "COMMIT_CODE", + # "ISSUE_OPENED", + # "PULL_REQUEST_OPENED", + # "PULL_REQUEST_MERGED", + # ], + # }, + # rolling=RollingConfig( + # windows=[30, 90, 180], + # unit="day", + # cron="@daily", + # ), + # entity_types=["artifact"], + # ), }, default_dialect="clickhouse", ) diff --git a/warehouse/metrics_mesh/oso_metrics/change_in_developers.sql b/warehouse/metrics_mesh/oso_metrics/change_in_developers.sql index 4c4ea7275..d8c8934f7 100644 --- a/warehouse/metrics_mesh/oso_metrics/change_in_developers.sql +++ b/warehouse/metrics_mesh/oso_metrics/change_in_developers.sql @@ -9,13 +9,13 @@ WITH latest AS ( classification.amount FROM @metrics_peer_ref( developer_classifications, - window := @comparison_interval, - unit := 'day', + window := @rolling_window, + unit := @rolling_unit, ) as classification WHERE classification.metrics_sample_date = @relative_window_sample_date( @metrics_end('DATE'), - @comparison_interval, - 'day', + @rolling_window, + @rolling_unit, 0 ) ), @@ -31,13 +31,13 @@ previous AS ( classification.amount FROM @metrics_peer_ref( developer_classifications, - window := @comparison_interval, - unit := 'day' + window := @rolling_window, + unit := @rolling_unit ) as classification WHERE classification.metrics_sample_date = @relative_window_sample_date( @metrics_end('DATE'), - @comparison_interval, - 'day', + @rolling_window, + @rolling_unit, -1 ) ) diff --git a/warehouse/metrics_mesh/oso_metrics/contributor_activity_classification.sql b/warehouse/metrics_mesh/oso_metrics/contributor_activity_classification.sql index 5691b9315..c3ddd7902 100644 --- a/warehouse/metrics_mesh/oso_metrics/contributor_activity_classification.sql +++ b/warehouse/metrics_mesh/oso_metrics/contributor_activity_classification.sql @@ -1,3 +1,154 @@ +with first_of_activity_to_entity as ( + -- We use this CTE to get the first of a specific type of event to a specific + -- entity. + select MIN(time) as `time`, + event_source, + from_artifact_id, + to_artifact_id + from metrics.first_of_event_from_artifact + where event_type in @activity_event_types + group by event_source, + from_artifact_id, + to_artifact_id +), +filtered_first_of as ( + -- Filtered first of events to just the current period we are measuring. + select distinct event_source, + from_artifact_id, + @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := fo, + include_column_alias := true + ) + from first_of_activity_to_entity as fo + where `time` between @metrics_start('DATE') and @metrics_end('DATE') +), +new_contributors as ( + -- Only new contributors. we do this by joining on the filtered first of events + -- in this time range + select active.metrics_sample_date, + active.event_source, + @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := active, + include_column_alias := true, + ), + @metric_name('new_contributors') as metric, + COUNT(DISTINCT active.from_artifact_id) as amount + from @metrics_peer_ref( + contributor_active_days, + window := @rolling_window, + unit := @rolling_unit + ) as active + inner join filtered_first_of as ffo on active.from_artifact_id = ffo.from_artifact_id + and active.event_source = ffo.event_source + and @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := active + ) = @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := ffo + ) + where active.metrics_sample_date = @metrics_end('DATE') + group by metric, + @metrics_entity_type_col('to_{entity_type}_id', table_alias := active), + active.event_source, + active.metrics_sample_date +), +lag_events_filtered as ( + -- This filters for lagged events of the activity types we care about + select events.bucket_day, + events.event_source, + events.from_artifact_id, + events.to_artifact_id, + MAX(last_event) as last_event + from metrics.events_daily_to_artifact_with_lag as events + where event_type in @activity_event_types + group by bucket_day, + event_source, + from_artifact_id, + to_artifact_id +), +contributors_earliest_event_in_period as ( + -- This uses a window function to get the earliest event in a given period for + -- a specific contributor. We then use the "last_event" value of this to + -- determine the resurrection status. + select events.bucket_day, + events.event_source, + events.from_artifact_id, + @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := events, + include_column_alias := true, + ), + events.last_event, + ROW_NUMBER() OVER ( + PARTITION BY @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := events + ), + events.from_artifact_id, + events.event_source, + ORDER BY bucket_day ASC + ) as event_rank + from lag_events_filtered as events +), +contributors_last_event as ( + -- Gets the resurrected contributors based on the date of the last event. + select events.event_source, + events.from_artifact_id, + @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := events, + include_column_alias := true, + ), + CASE + WHEN COUNT(events.last_event) < COUNT(*) THEN NULL + ELSE MAX(events.last_event) + END as last_event + from contributors_earliest_event_in_period as events + where event_rank = 1 + group by events.from_artifact_id, + @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := events, + ), + events.event_source +), +resurrected_contributors as ( + -- resurrected users are users that had previously churned or went dormant for + -- at least one period but have returned + select active.metrics_sample_date, + active.event_source, + @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := active, + include_column_alias := true, + ), + @metric_name('resurrected_contributors') as metric, + COUNT(DISTINCT active.from_artifact_id) as amount + from @metrics_peer_ref( + contributor_active_days, + window := @rolling_window, + unit := @rolling_unit + ) as active + inner join contributors_last_event as last_event on active.from_artifact_id = last_event.from_artifact_id + and active.event_source = last_event.event_source + and @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := active + ) = @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := last_event + ) + where active.metrics_sample_date = @metrics_end('DATE') + and last_event.last_event is not null + and last_event.last_event <= @metrics_start('DATE') - INTERVAL @rolling_window DAY + group by metric, + @metrics_entity_type_col('to_{entity_type}_id', table_alias := active), + active.event_source, + active.metrics_sample_date +) select active.metrics_sample_date, active.event_source, @metrics_entity_type_col( @@ -9,7 +160,7 @@ select active.metrics_sample_date, @metric_name('full_time_contributors') as metric, COUNT(DISTINCT active.from_artifact_id) as amount from @metrics_peer_ref( - developer_active_days, + contributor_active_days, window := @rolling_window, unit := @rolling_unit ) as active @@ -35,7 +186,7 @@ select active.metrics_sample_date, @metric_name('part_time_contributors') as metric, COUNT(DISTINCT active.from_artifact_id) as amount from @metrics_peer_ref( - developer_active_days, + contributor_active_days, window := @rolling_window, unit := @rolling_unit ) as active @@ -44,9 +195,34 @@ where active.amount / @rolling_window < @full_time_ratio group by metric, from_artifact_id, @metrics_entity_type_col('to_{entity_type}_id', table_alias := active), - event_source, - metrics_sample_date + active.event_source, + active.metrics_sample_date +union all +select new.metrics_sample_date, + new.event_source, + @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := new, + include_column_alias := true, + ), + '' as from_artifact_id, + new.metric, + new.amount as amount +from new_contributors as new +union all +select resurrected.metrics_sample_date, + resurrected.event_source, + @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := resurrected, + include_column_alias := true, + ), + '' as from_artifact_id, + resurrected.metric, + resurrected.amount as amount +from resurrected_contributors as resurrected union all +-- All active contributors select active.metrics_sample_date, active.event_source, @metrics_entity_type_col( @@ -58,7 +234,7 @@ select active.metrics_sample_date, @metric_name('active_contributors') as metric, COUNT(DISTINCT active.from_artifact_id) as amount from @metrics_peer_ref( - developer_active_days, + contributor_active_days, window := @rolling_window, unit := @rolling_unit ) as active diff --git a/warehouse/metrics_mesh/oso_metrics/first_commit_date.sql b/warehouse/metrics_mesh/oso_metrics/first_commit_date.sql deleted file mode 100644 index a0518d888..000000000 --- a/warehouse/metrics_mesh/oso_metrics/first_commit_date.sql +++ /dev/null @@ -1,17 +0,0 @@ --- TODO (@ravenac95) keeping this for now, might prove useful, but we likely need --- a different kind of model for first commit data -select @metrics_sample_date(events.bucket_day) as metrics_sample_date, - events.event_source, - events.to_artifact_id as to_artifact_id, - '' as from_artifact_id, - @metric_name() as metric, - 1 as amount, - MIN(events.bucket_day) as first_commit_date -from metrics.events_daily_to_artifact as events -where events.event_type = 'COMMIT_CODE' - and events.bucket_day BETWEEN @metrics_start('DATE') AND @metrics_end('DATE') -group by 1, - metric, - from_artifact_id, - to_artifact_id, - event_source \ No newline at end of file diff --git a/warehouse/metrics_mesh/oso_metrics/last_commit_date.sql b/warehouse/metrics_mesh/oso_metrics/last_commit_date.sql deleted file mode 100644 index 355bde9a7..000000000 --- a/warehouse/metrics_mesh/oso_metrics/last_commit_date.sql +++ /dev/null @@ -1,17 +0,0 @@ --- TODO (@ravenac95) keeping this for now, might prove useful, but we likely need --- a different kind of model for last commit data -select @metrics_sample_date(events.bucket_day) as metrics_sample_date, - events.event_source, - events.to_artifact_id as to_artifact_id, - '' as from_artifact_id, - @metric_name() as metric, - 1 as amount, - MAX(events.bucket_day) as last_commit_date -from metrics.events_daily_to_artifact as events -where events.event_type = 'COMMIT_CODE' - and events.bucket_day BETWEEN @metrics_start('DATE') AND @metrics_end('DATE') -group by 1, - metric, - from_artifact_id, - to_artifact_id, - event_source \ No newline at end of file diff --git a/warehouse/metrics_mesh/oso_metrics/lifecycle.sql b/warehouse/metrics_mesh/oso_metrics/lifecycle.sql new file mode 100644 index 000000000..1ffe10dfc --- /dev/null +++ b/warehouse/metrics_mesh/oso_metrics/lifecycle.sql @@ -0,0 +1,195 @@ +with latest as ( + select classification.metrics_sample_date, + @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := classification + ), + classification.event_source, + COALESCE( + MAX( + CASE + WHEN classification.metric LIKE 'active_%' THEN amount + END + ), + 0 + ) as active, + COALESCE( + MAX( + CASE + WHEN classification.metric LIKE 'full_%' THEN amount + END + ), + 0 + ) as full, + COALESCE( + MAX( + CASE + WHEN classification.metric LIKE 'part_%' THEN amount + END + ), + 0 + ) as part, + COALESCE( + MAX( + CASE + WHEN classification.metric LIKE 'new_%' THEN amount + END + ), + 0 + ) as new, + COALESCE( + MAX( + CASE + WHEN classification.metric LIKE 'resurrected_%' THEN amount + END + ), + 0 + ) as resurrected + from @metrics_peer_ref( + contributor_classifications, + window := @rolling_window, + unit := @rolling_unit + ) as classification + where classification.metrics_sample_date = @relative_window_sample_date( + @metrics_end('DATE'), + @rolling_window, + @rolling_unit, + 0 + ) + group by classification.metrics_sample_date, + @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := classification + ), + classification.event_source +), +previous as ( + select classification.metrics_sample_date, + @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := classification + ), + classification.event_source, + COALESCE( + MAX( + CASE + WHEN classification.metric LIKE 'active_%' THEN amount + END + ), + 0 + ) as active, + COALESCE( + MAX( + CASE + WHEN classification.metric LIKE 'full_%' THEN amount + END + ), + 0 + ) as full, + COALESCE( + MAX( + CASE + WHEN classification.metric LIKE 'part_%' THEN amount + END + ), + 0 + ) as part, + COALESCE( + MAX( + CASE + WHEN classification.metric LIKE 'new_%' THEN amount + END + ), + 0 + ) as new, + COALESCE( + MAX( + CASE + WHEN classification.metric LIKE 'resurrected_%' THEN amount + END + ), + 0 + ) as resurrected + from @metrics_peer_ref( + contributor_classifications, + window := @rolling_window, + unit := @rolling_unit, + ) as classification + where classification.metrics_sample_date = @relative_window_sample_date( + @metrics_end('DATE'), + @rolling_window, + @rolling_unit, + -1 + ) + group by metrics_sample_date, + @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := classification + ), + event_source +), +churned_contributors as ( + -- Churn is prev.active - (latest.active - latest.new - latest.resurrected) + select @metrics_end('DATE') as metrics_sample_date, + COALESCE(latest.event_source, previous.event_source) as event_source, + @metrics_entity_type_alias( + COALESCE( + @metrics_entity_type_col('to_{entity_type}_id', table_alias := latest), + @metrics_entity_type_col('to_{entity_type}_id', table_alias := previous) + ), + 'to_{entity_type}_id', + ), + '' as from_artifact_id, + @metrics_name('churned_contributors') as metric, + CASE + WHEN previous.active is null THEN 0 + ELSE previous.active - ( + COALESCE(latest.active, 0) - COALESCE(latest.new, 0) - COALESCE(latest.resurrected, 0) + ) + END as amount + from previous + LEFT JOIN latest ON latest.event_source = previous.event_source + AND @metrics_entity_type_col('to_{entity_type}_id', table_alias := latest) = @metrics_entity_type_col('to_{entity_type}_id', table_alias := previous) +), +change_in_full_time_contributors as ( + select @metrics_end('DATE') as metrics_sample_date, + COALESCE(latest.event_source, previous.event_source) as event_source, + @metrics_entity_type_alias( + COALESCE( + @metrics_entity_type_col('to_{entity_type}_id', table_alias := latest), + @metrics_entity_type_col('to_{entity_type}_id', table_alias := previous) + ), + 'to_{entity_type}_id', + ), + '' as from_artifact_id, + @metrics_name('change_in_full_time_contributors') as metric, + COALESCE(latest.full, 0) - COALESCE(previous.full, 0) as amount + from previous + LEFT JOIN latest ON latest.event_source = previous.event_source + AND @metrics_entity_type_col('to_{entity_type}_id', table_alias := latest) = @metrics_entity_type_col('to_{entity_type}_id', table_alias := previous) +), +change_in_part_time_contributors as ( + select @metrics_end('DATE') as metrics_sample_date, + COALESCE(latest.event_source, previous.event_source) as event_source, + @metrics_entity_type_alias( + COALESCE( + @metrics_entity_type_col('to_{entity_type}_id', table_alias := latest), + @metrics_entity_type_col('to_{entity_type}_id', table_alias := previous) + ), + 'to_{entity_type}_id', + ), + '' as from_artifact_id, + @metrics_name('change_in_part_time_contributors') as metric, + COALESCE(latest.part, 0) - COALESCE(previous.part, 0) as amount + from previous + LEFT JOIN latest ON latest.event_source = previous.event_source + AND @metrics_entity_type_col('to_{entity_type}_id', table_alias := latest) = @metrics_entity_type_col('to_{entity_type}_id', table_alias := previous) +) +select * +from churned_contributors +union all +select * +from change_in_full_time_contributors +union all +select * +from change_in_part_time_contributors \ No newline at end of file diff --git a/warehouse/metrics_mesh/oso_metrics/time_to_first_response.sql b/warehouse/metrics_mesh/oso_metrics/time_to_first_response.sql index 5fab019a8..9f93d9907 100644 --- a/warehouse/metrics_mesh/oso_metrics/time_to_first_response.sql +++ b/warehouse/metrics_mesh/oso_metrics/time_to_first_response.sql @@ -7,11 +7,11 @@ select @metrics_sample_date(time) as metrics_sample_date, from metrics.issue_event_time_deltas where ( ( - event_type in ("PULL_REQUEST_MERGED", "ISSUE_CLOSED") + event_type in ('PULL_REQUEST_MERGED', 'ISSUE_CLOSED') and comments = 0 ) or ( - event_type in ("PULL_REQUEST_REVIEW_COMMENT", "ISSUE_COMMENT") + event_type in ('PULL_REQUEST_REVIEW_COMMENT', 'ISSUE_COMMENT') and comments = 1 ) ) diff --git a/warehouse/metrics_mesh/oso_metrics/user_retention_classification.sql b/warehouse/metrics_mesh/oso_metrics/user_retention_classification.sql index 131be3f9b..d0fd9bb25 100644 --- a/warehouse/metrics_mesh/oso_metrics/user_retention_classification.sql +++ b/warehouse/metrics_mesh/oso_metrics/user_retention_classification.sql @@ -1,28 +1,24 @@ with first_events as ( - select + select event_source, from_artifact_id, - min(time::date) as first_event_date - from @first_of_event_from_artifact - where event_type in (@activity_event_types) - group by from_artifact_id + to_artifact_id, + min(time) as first_event_date + from metrics.first_of_event_from_artifact + where event_type in @activity_event_types + group by event_source, + from_artifact_id, + to_artifact_id ), active_users as ( - select distinct - from_artifact_id, - event_date, + select distinct from_artifact_id, + bucket_day, event_source, - @metrics_entity_type_col( - 'to_{entity_type}_id', - include_column_alias := true - ) - from @metrics_source( - event_types := @activity_event_types - ) - where event_date >= @metrics_start('DATE') + to_artifact_id + from metrics.events_daily_to_artifact + where event_type in @activity_event_types + where bucket_day between @metrics_start('DATE') and @metrics_end('DATE') ) - -select - @metrics_end('DATE') as metrics_sample_date, +select @metrics_end('DATE') as metrics_sample_date, active.event_source, @metrics_entity_type_col( 'to_{entity_type}_id', @@ -33,10 +29,16 @@ select @metric_name('new_users') as metric, COUNT(DISTINCT active.from_artifact_id) as amount from active_users active -join first_events on first_events.from_artifact_id = active.from_artifact_id -where first_events.first_event_date >= @metrics_start('DATE') -group by - metrics_sample_date, + join first_events on first_events.from_artifact_id = active.from_artifact_id + and @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := active + ) = @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := first_events + ) +where first_events.first_event_date between @metrics_start('DATE') and @metrics_end('DATE') +group by metrics_sample_date, metric, from_artifact_id, @metrics_entity_type_col( @@ -44,11 +46,8 @@ group by table_alias := active ), event_source - union all - -select - @metrics_end('DATE') as metrics_sample_date, +select @metrics_end('DATE') as metrics_sample_date, active.event_source, @metrics_entity_type_col( 'to_{entity_type}_id', @@ -59,14 +58,20 @@ select @metric_name('returning_users') as metric, COUNT(DISTINCT active.from_artifact_id) as amount from active_users active -join first_events on first_events.from_artifact_id = active.from_artifact_id + join first_events on first_events.from_artifact_id = active.from_artifact_id + and @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := active + ) = @metrics_entity_type_col( + 'to_{entity_type}_id', + table_alias := first_events + ) where first_events.first_event_date < @metrics_start('DATE') -group by - metrics_sample_date, +group by metrics_sample_date, metric, from_artifact_id, @metrics_entity_type_col( 'to_{entity_type}_id', table_alias := active ), - event_source + event_source \ No newline at end of file diff --git a/warehouse/metrics_mesh/tests/test_change_in_developers_over_window.yml b/warehouse/metrics_mesh/tests/test_change_in_developers_over_window.yml index b151798ad..4b4a84497 100644 --- a/warehouse/metrics_mesh/tests/test_change_in_developers_over_window.yml +++ b/warehouse/metrics_mesh/tests/test_change_in_developers_over_window.yml @@ -2,7 +2,7 @@ test_change_in_30_day_developer_activity_to_artifact_over_2_period_window_full_t # Tests rolling count of active days when the user is active 4 of the 5 days # in the test interval gateway: local - model: metrics.change_in_30_day_developer_activity_to_artifact_over_2_period_window + model: metrics.change_in_developer_activity_to_artifact_over_30_day_window vars: start: 2024-01-31 end: 2024-01-31 diff --git a/warehouse/metrics_tools/factory/factory.py b/warehouse/metrics_tools/factory/factory.py index 00af9291f..4ecf23b4a 100644 --- a/warehouse/metrics_tools/factory/factory.py +++ b/warehouse/metrics_tools/factory/factory.py @@ -236,11 +236,14 @@ def _generate_metrics_queries( additional_macros, variables=evaluator_variables, ), - QualifyTransform(), + QualifyTransform( + validate_qualify_columns=False, allow_partial_qualification=True + ), JoinerTransform( ref["entity_type"], self._timeseries_sources, ), + QualifyTransform(), ], ) @@ -665,10 +668,15 @@ def generated_rolling_query( # If the rolling window is empty we need to yield from an empty tuple # otherwise sqlmesh fails. See: # https://sqlmesh.readthedocs.io/en/latest/concepts/models/python_models/#returning-empty-dataframes + total = 0 if df.empty: yield from () else: + count = len(df) + total += count + logger.debug(f"table={table_name} yielding rows {count}") yield df + logger.debug(f"table={table_name} yielded rows{total}") def generated_rolling_query_proxy( diff --git a/warehouse/metrics_tools/joiner/__init__.py b/warehouse/metrics_tools/joiner/__init__.py index 9b6ac5a8f..372d3ae35 100644 --- a/warehouse/metrics_tools/joiner/__init__.py +++ b/warehouse/metrics_tools/joiner/__init__.py @@ -94,11 +94,15 @@ def _transform(node: exp.Expression): ) ) - group = t.cast(exp.Group, updated_select.args.get("group")) - for group_idx in range(len(group.expressions)): - group_col = t.cast(exp.Column, group.expressions[group_idx]) - if group_col == current_to_artifact_id_col: - group_col.replace(new_to_entity_id_col) + group_exp = updated_select.args.get("group") + if group_exp: + group = t.cast(exp.Group, group_exp) + for group_idx in range(len(group.expressions)): + group_col = t.cast( + exp.Column, group.expressions[group_idx] + ) + if group_col == current_to_artifact_id_col: + group_col.replace(new_to_entity_id_col) return updated_select # If nothing happens in the for loop then we didn't find the kind of diff --git a/warehouse/metrics_tools/runner.py b/warehouse/metrics_tools/runner.py index 48167980e..acd76624a 100644 --- a/warehouse/metrics_tools/runner.py +++ b/warehouse/metrics_tools/runner.py @@ -149,16 +149,25 @@ def run_time_aggregation(self, start: datetime, end: datetime): def run_rolling(self, start: datetime, end: datetime): df: pd.DataFrame = pd.DataFrame() - logger.debug(f"run_rolling called with start={start} and end={end}") + logger.debug( + f"run_rolling[{self._ref['name']}]: called with start={start} and end={end}" + ) count = 0 + total_rows = 0 for rendered_query in self.render_rolling_queries(start, end): count += 1 logger.debug( - f"executing rolling window: {rendered_query}", + f"run_rolling[{self._ref['name']}]: executing rolling window: {rendered_query}", extra={"query": rendered_query}, ) day_result = self._context.engine_adapter.fetchdf(rendered_query) + day_rows = len(day_result) + total_rows += day_rows + logger.debug( + f"run_rolling[{self._ref['name']}]: rolling window period resulted in {day_rows} rows" + ) df = pd.concat([df, day_result]) + logger.debug(f"run_rolling[{self._ref['name']}]: total rows {total_rows}") return df