From bf53eb70ce7bd3c9ce6cb49cc037dae07555109a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20R=C3=ADos?= Date: Fri, 22 Nov 2024 13:40:22 +0700 Subject: [PATCH] add: `sqlmesh` metrics (#2483) * add: `active_developers` sqlmesh metric * add: `closed_issues` sqlmesh metric * add: `contributors` sqlmesh metric * add: `first_commit_date` sqlmesh metric * add: `last_commit` sqlmesh metric * add: `new_contributors` sqlmesh metric * add: `repositories` sqlmesh metric * add: developer_states sqlmesh metric models * fix: remove `duplicated` model * add: point in time and interval `models` * Adds dbt point in time metrics from source data * Disable first and last commits for now * Add point_in_time_v0 * fix * more fixes and tests --------- Co-authored-by: Reuven V. Gonzales --- .../int_point_in_time_from_sources.sql | 22 +++++ .../marts/point_in_time/point_in_time_v0.sql | 14 ++++ .../stg_ossd__repository_point_in_time.sql | 17 ++++ .../metrics_mesh/models/metrics_factories.py | 80 +++++++++++++++++-- .../oso_metrics/active_developers.sql | 14 ++++ .../metrics_mesh/oso_metrics/contributors.sql | 19 +++++ .../oso_metrics/first_commit_date.sql | 17 ++++ .../oso_metrics/last_commit_date.sql | 17 ++++ .../metrics_mesh/oso_metrics/repositories.sql | 25 ++++++ .../test_change_in_developers_over_window.yml | 33 ++++++++ ...test_developer_active_days_over_window.yml | 8 +- ...t_developer_classification_over_window.yml | 39 +++++++++ warehouse/metrics_tools/macros/macros.py | 7 +- 13 files changed, 297 insertions(+), 15 deletions(-) create mode 100644 warehouse/dbt/models/intermediate/point_in_time/int_point_in_time_from_sources.sql create mode 100644 warehouse/dbt/models/marts/point_in_time/point_in_time_v0.sql create mode 100644 warehouse/dbt/models/staging/oss-directory/stg_ossd__repository_point_in_time.sql create mode 100644 warehouse/metrics_mesh/oso_metrics/active_developers.sql create mode 100644 warehouse/metrics_mesh/oso_metrics/contributors.sql create mode 100644 
warehouse/metrics_mesh/oso_metrics/first_commit_date.sql create mode 100644 warehouse/metrics_mesh/oso_metrics/last_commit_date.sql create mode 100644 warehouse/metrics_mesh/oso_metrics/repositories.sql create mode 100644 warehouse/metrics_mesh/tests/test_change_in_developers_over_window.yml create mode 100644 warehouse/metrics_mesh/tests/test_developer_classification_over_window.yml diff --git a/warehouse/dbt/models/intermediate/point_in_time/int_point_in_time_from_sources.sql b/warehouse/dbt/models/intermediate/point_in_time/int_point_in_time_from_sources.sql new file mode 100644 index 000000000..ae3e7a057 --- /dev/null +++ b/warehouse/dbt/models/intermediate/point_in_time/int_point_in_time_from_sources.sql @@ -0,0 +1,22 @@ +{# +Point in time metrics that are taken from raw sources. This does not do any +point in time aggregations for running sums/averages. + +Not all collected data has historical state information but some things do. This +is to include those for greater accuracy when rendering metrics + +This is particularly useful for: +* Star Count (using STARRED events doesn't capture accurate star counts) +* Watcher Count +* Repository Count (This is an aggregated metric for a project/collection) + +Other things in the future will likely be useful here but for now this is just +for repository related metrics that aren't timeseries by nature. 
+#} +select + `time`, + artifact_source, + {{ oso_id("artifact_source", "artifact_source_id") }} as artifact_id, + metric, + amount +from {{ ref("stg_ossd__repository_point_in_time") }} diff --git a/warehouse/dbt/models/marts/point_in_time/point_in_time_v0.sql b/warehouse/dbt/models/marts/point_in_time/point_in_time_v0.sql new file mode 100644 index 000000000..02a8285c0 --- /dev/null +++ b/warehouse/dbt/models/marts/point_in_time/point_in_time_v0.sql @@ -0,0 +1,14 @@ +{{ + config(meta = { + 'sync_to_db': True, + 'order_by': [ 'artifact_source', 'metric', 'artifact_id', 'time' ] + }) +}} + +select + time, + artifact_source, + artifact_id, + metric, + amount +from {{ ref("int_point_in_time_from_sources") }} diff --git a/warehouse/dbt/models/staging/oss-directory/stg_ossd__repository_point_in_time.sql b/warehouse/dbt/models/staging/oss-directory/stg_ossd__repository_point_in_time.sql new file mode 100644 index 000000000..0b16379ae --- /dev/null +++ b/warehouse/dbt/models/staging/oss-directory/stg_ossd__repository_point_in_time.sql @@ -0,0 +1,17 @@ +{# +Point in time view for repository data. 
+#} +select + repo.ingestion_time as `time`, + "GITHUB" as artifact_source, + "REPOSITORY" as artifact_type, + repo.owner as artifact_namespace, + repo.name as artifact_name, + repo.id as artifact_source_id, + unpivoted.metric as metric, + unpivoted.amount as amount +from {{ oso_source('ossd', 'repositories') }} as repo, unnest([ + struct("fork_count" as metric, fork_count as amount), + struct("star_count" as metric, star_count as amount), + struct("watcher_count" as metric, watcher_count as amount) +]) as unpivoted diff --git a/warehouse/metrics_mesh/models/metrics_factories.py b/warehouse/metrics_mesh/models/metrics_factories.py index 016a8432b..2b6095837 100644 --- a/warehouse/metrics_mesh/models/metrics_factories.py +++ b/warehouse/metrics_mesh/models/metrics_factories.py @@ -45,6 +45,18 @@ ref="gas_fees.sql", time_aggregations=["daily", "weekly", "monthly"], ), + "repositories": MetricQueryDef( + ref="repositories.sql", + time_aggregations=["daily", "weekly", "monthly"], + ), + "contributors": MetricQueryDef( + ref="contributors.sql", + time_aggregations=["daily", "weekly", "monthly"], + ), + "active_developers": MetricQueryDef( + ref="active_developers.sql", + time_aggregations=["daily", "weekly", "monthly"], + ), # This defines something with a rolling option that allows you to look back # to some arbitrary window. So you specify the window and specify the unit. # The unit and the window are used to pass in variables to the query. 
So it's @@ -57,7 +69,7 @@ "activity_event_types": ["COMMIT_CODE"], }, rolling=RollingConfig( - windows=[30, 60, 90], + windows=[30, 90, 180], unit="day", cron="@daily", # This determines how often this is calculated ), @@ -70,7 +82,7 @@ "full_time_ratio": 10 / 30, }, rolling=RollingConfig( - windows=[30, 60, 90], + windows=[30, 90, 180], unit="day", cron="@daily", ), @@ -79,12 +91,12 @@ ref="contributor_activity_classification.sql", vars={"full_time_ratio": 10 / 30}, rolling=RollingConfig( - windows=[30, 60, 90], + windows=[30, 90, 180], unit="day", cron="@daily", ), ), - "change_in_30_developer_activity": MetricQueryDef( + "change_in_30_day_developer_activity": MetricQueryDef( vars={ "comparison_interval": 30, }, @@ -95,9 +107,9 @@ cron="@daily", ), ), - "change_in_60_developer_activity": MetricQueryDef( + "change_in_90_day_developer_activity": MetricQueryDef( vars={ - "comparison_interval": 60, + "comparison_interval": 90, }, ref="change_in_developers.sql", rolling=RollingConfig( @@ -106,6 +118,62 @@ cron="@daily", ), ), + "change_in_180_day_developer_activity": MetricQueryDef( + vars={ + "comparison_interval": 180, + }, + ref="change_in_developers.sql", + rolling=RollingConfig( + windows=[2], + unit="period", + cron="@daily", + ), + ), + "commits_rolling": MetricQueryDef( + ref="commits.sql", + rolling=RollingConfig( + windows=[180], + unit="day", + cron="@daily", + ), + entity_types=["artifact", "project", "collection"], + ), + "opened_pull_requests": MetricQueryDef( + ref="prs_opened.sql", + rolling=RollingConfig( + windows=[180], + unit="day", + cron="@daily", + ), + entity_types=["artifact", "project", "collection"], + ), + "merged_pull_requests": MetricQueryDef( + ref="prs_merged.sql", + rolling=RollingConfig( + windows=[180], + unit="day", + cron="@daily", + ), + entity_types=["artifact", "project", "collection"], + ), + "opened_issues": MetricQueryDef( + ref="issues_opened.sql", + rolling=RollingConfig( + windows=[180], + unit="day", + cron="@daily", + ), 
+ entity_types=["artifact", "project", "collection"], + ), + "closed_issues_6_months": MetricQueryDef( + ref="issues_closed.sql", + rolling=RollingConfig( + windows=[180], + unit="day", + cron="@daily", + ), + entity_types=["artifact", "project", "collection"], + ), }, default_dialect="clickhouse", ) diff --git a/warehouse/metrics_mesh/oso_metrics/active_developers.sql b/warehouse/metrics_mesh/oso_metrics/active_developers.sql new file mode 100644 index 000000000..79ef5148a --- /dev/null +++ b/warehouse/metrics_mesh/oso_metrics/active_developers.sql @@ -0,0 +1,14 @@ +select @metrics_sample_date(events.bucket_day) as metrics_sample_date, + events.event_source, + events.to_artifact_id as to_artifact_id, + '' as from_artifact_id, + @metric_name() as metric, + COUNT(distinct events.from_artifact_id) as amount +from metrics.events_daily_to_artifact as events +where events.event_type = 'COMMIT_CODE' + and events.bucket_day BETWEEN @metrics_start('DATE') AND @metrics_end('DATE') +group by 1, + metric, + from_artifact_id, + to_artifact_id, + event_source diff --git a/warehouse/metrics_mesh/oso_metrics/contributors.sql b/warehouse/metrics_mesh/oso_metrics/contributors.sql new file mode 100644 index 000000000..d9eec2325 --- /dev/null +++ b/warehouse/metrics_mesh/oso_metrics/contributors.sql @@ -0,0 +1,19 @@ +select @metrics_sample_date(events.bucket_day) as metrics_sample_date, + events.event_source, + events.to_artifact_id as to_artifact_id, + '' as from_artifact_id, + @metric_name() as metric, + COUNT(distinct events.from_artifact_id) as amount +from metrics.events_daily_to_artifact as events +where events.event_type in ( + 'COMMIT_CODE', + 'ISSUE_OPENED', + 'PULL_REQUEST_OPENED', + 'PULL_REQUEST_MERGED' + ) + and events.bucket_day BETWEEN @metrics_start('DATE') AND @metrics_end('DATE') +group by 1, + metric, + from_artifact_id, + to_artifact_id, + event_source diff --git a/warehouse/metrics_mesh/oso_metrics/first_commit_date.sql 
b/warehouse/metrics_mesh/oso_metrics/first_commit_date.sql new file mode 100644 index 000000000..a0518d888 --- /dev/null +++ b/warehouse/metrics_mesh/oso_metrics/first_commit_date.sql @@ -0,0 +1,17 @@ +-- TODO (@ravenac95) keeping this for now, might prove useful, but we likely need +-- a different kind of model for first commit data +select @metrics_sample_date(events.bucket_day) as metrics_sample_date, + events.event_source, + events.to_artifact_id as to_artifact_id, + '' as from_artifact_id, + @metric_name() as metric, + 1 as amount, + MIN(events.bucket_day) as first_commit_date +from metrics.events_daily_to_artifact as events +where events.event_type = 'COMMIT_CODE' + and events.bucket_day BETWEEN @metrics_start('DATE') AND @metrics_end('DATE') +group by 1, + metric, + from_artifact_id, + to_artifact_id, + event_source \ No newline at end of file diff --git a/warehouse/metrics_mesh/oso_metrics/last_commit_date.sql b/warehouse/metrics_mesh/oso_metrics/last_commit_date.sql new file mode 100644 index 000000000..355bde9a7 --- /dev/null +++ b/warehouse/metrics_mesh/oso_metrics/last_commit_date.sql @@ -0,0 +1,17 @@ +-- TODO (@ravenac95) keeping this for now, might prove useful, but we likely need +-- a different kind of model for last commit data +select @metrics_sample_date(events.bucket_day) as metrics_sample_date, + events.event_source, + events.to_artifact_id as to_artifact_id, + '' as from_artifact_id, + @metric_name() as metric, + 1 as amount, + MAX(events.bucket_day) as last_commit_date +from metrics.events_daily_to_artifact as events +where events.event_type = 'COMMIT_CODE' + and events.bucket_day BETWEEN @metrics_start('DATE') AND @metrics_end('DATE') +group by 1, + metric, + from_artifact_id, + to_artifact_id, + event_source \ No newline at end of file diff --git a/warehouse/metrics_mesh/oso_metrics/repositories.sql b/warehouse/metrics_mesh/oso_metrics/repositories.sql new file mode 100644 index 000000000..fa065fd97 --- /dev/null +++ 
b/warehouse/metrics_mesh/oso_metrics/repositories.sql @@ -0,0 +1,25 @@ +select @metrics_sample_date(events.bucket_day) as metrics_sample_date, + events.event_source, + events.to_artifact_id as to_artifact_id, + '' as from_artifact_id, + @metric_name() as metric, + COUNT(distinct events.to_artifact_id) as amount +from metrics.events_daily_to_artifact as events +where events.event_type in ( + 'ISSUE_OPENED', + 'STARRED', + 'PULL_REQUEST_OPENED', + 'FORKED', + 'PULL_REQUEST_REOPENED', + 'PULL_REQUEST_CLOSED', + 'COMMIT_CODE', + 'ISSUE_REOPENED', + 'PULL_REQUEST_MERGED', + 'ISSUE_CLOSED' + ) + and events.bucket_day BETWEEN @metrics_start('DATE') AND @metrics_end('DATE') +group by 1, + metric, + from_artifact_id, + to_artifact_id, + event_source diff --git a/warehouse/metrics_mesh/tests/test_change_in_developers_over_window.yml b/warehouse/metrics_mesh/tests/test_change_in_developers_over_window.yml new file mode 100644 index 000000000..b151798ad --- /dev/null +++ b/warehouse/metrics_mesh/tests/test_change_in_developers_over_window.yml @@ -0,0 +1,33 @@ +test_change_in_30_day_developer_activity_to_artifact_over_2_period_window_full_time_devs: + # Tests the change in the full-time developer count between two + # consecutive 30-day periods (30 devs -> 10 devs, so a change of -20) + gateway: local + model: metrics.change_in_30_day_developer_activity_to_artifact_over_2_period_window + vars: + start: 2024-01-31 + end: 2024-01-31 + inputs: + metrics.developer_classifications_to_artifact_over_30_day_window: + rows: + - to_artifact_id: repo_0 + from_artifact_id: null + event_source: SOURCE_PROVIDER + metrics_sample_date: 2024-01-01 + metric: full_time_developers_over_30_day_window + amount: 30 + - to_artifact_id: repo_0 + from_artifact_id: null + event_source: SOURCE_PROVIDER + metrics_sample_date: 2024-01-31 + metric: full_time_developers_over_30_day_window + amount: 10 + outputs: + partial: true + query: + partial: true + rows: + - metrics_sample_date: 2024-01-31 + to_artifact_id: repo_0 + from_artifact_id: 
"" + amount: -20 + diff --git a/warehouse/metrics_mesh/tests/test_developer_active_days_over_window.yml b/warehouse/metrics_mesh/tests/test_developer_active_days_over_window.yml index 6c2e6e65c..c96aa066b 100644 --- a/warehouse/metrics_mesh/tests/test_developer_active_days_over_window.yml +++ b/warehouse/metrics_mesh/tests/test_developer_active_days_over_window.yml @@ -1,8 +1,8 @@ -test_developer_active_days_to_artifact_over_60_day_window_with_cumulative_active_days: +test_developer_active_days_to_artifact_over_30_day_window_with_cumulative_active_days: # Tests rolling count of active days when the user is active 4 of the 5 days # in the test interval gateway: local - model: metrics.developer_active_days_to_artifact_over_60_day_window + model: metrics.developer_active_days_to_artifact_over_30_day_window vars: start: 2024-01-01 end: 2024-01-05 @@ -59,10 +59,10 @@ test_developer_active_days_to_artifact_over_60_day_window_with_cumulative_active metrics_sample_date: 2024-01-05 amount: 4 -test_developer_active_days_to_artifact_over_60_day_window_with_1_active_day: +test_developer_active_days_to_artifact_over_30_day_window_with_1_active_day: # Tests rolling count of active days when the user is active 1 in the test interval gateway: local - model: metrics.developer_active_days_to_artifact_over_60_day_window + model: metrics.developer_active_days_to_artifact_over_30_day_window vars: start: 2024-01-01 end: 2024-01-03 diff --git a/warehouse/metrics_mesh/tests/test_developer_classification_over_window.yml b/warehouse/metrics_mesh/tests/test_developer_classification_over_window.yml new file mode 100644 index 000000000..07eb3b604 --- /dev/null +++ b/warehouse/metrics_mesh/tests/test_developer_classification_over_window.yml @@ -0,0 +1,39 @@ +test_developer_classifications_to_artifact_over_30_day_window_full_time_devs: + # Tests rolling count of active days when the user is active 4 of the 5 days + # in the test interval + gateway: local + model: 
metrics.developer_classifications_to_artifact_over_30_day_window + vars: + start: 2024-01-01 + end: 2024-01-01 + inputs: + metrics.developer_active_days_to_artifact_over_30_day_window: + rows: + - to_artifact_id: repo_0 + from_artifact_id: dev_0 + event_source: SOURCE_PROVIDER + metrics_sample_date: 2024-01-01 + metric: developer_active_days + amount: 30 + - to_artifact_id: repo_0 + from_artifact_id: dev_1 + event_source: SOURCE_PROVIDER + metrics_sample_date: 2024-01-01 + metric: developer_active_days + amount: 30 + outputs: + partial: true + query: + partial: true + rows: + - metrics_sample_date: 2024-01-01 + to_artifact_id: repo_0 + from_artifact_id: "" + metric: full_time_developers_over_30_day_window + amount: 2 + - metrics_sample_date: 2024-01-01 + to_artifact_id: repo_0 + from_artifact_id: "" + metric: active_developers_over_30_day_window + amount: 2 + diff --git a/warehouse/metrics_tools/macros/macros.py b/warehouse/metrics_tools/macros/macros.py index 4a5d733a1..5387fdb7b 100644 --- a/warehouse/metrics_tools/macros/macros.py +++ b/warehouse/metrics_tools/macros/macros.py @@ -31,11 +31,6 @@ def relative_window_sample_date( must be a valid thing to subtract from. Also note, the base should generally be the `@metrics_end` date. """ - # if evaluator.runtime_stage in ["loading", "creating"]: - # return parse_one("STR_TO_DATE('1970-01-01', '%Y-%m-%d')") - if relative_index == 0: - return base - if isinstance(unit, exp.Literal): unit = t.cast(str, unit.this) elif isinstance(unit, exp.Expression): @@ -55,6 +50,8 @@ def relative_window_sample_date( converted_relative_index = int(t.cast(int, relative_index.this)) elif isinstance(relative_index, exp.Neg): converted_relative_index = int(relative_index.this.this) * -1 + if converted_relative_index == 0: + return base interval_unit = exp.Var(this=unit) interval_delta = exp.Interval( this=exp.Mul(