From 2d9c4f296b9edd91d51744714fdd597c0a8cf422 Mon Sep 17 00:00:00 2001
From: Carl Cervone <42869436+ccerv1@users.noreply.github.com>
Date: Thu, 21 Nov 2024 10:33:57 -0500
Subject: [PATCH] feat(dbt): add remaining PLN github-based metrics (#2484)

* feat(dbt): add comments and releases dbt models
* feat(dbt): create event model for numbered prs and issues
* feat(dbt): time to pr merge code metric
* fix: rename metric
* fix(dbt): add comment threads to event table
* feat(dbt): add time to first response
* chore(dbt): add new metrics to mart models
* fix(dbt): linting error
* fix: missing var
* feat(sql-mesh): add comment counts
* feat(sql-mesh): add releases
* chore(sql-mesh): add metrics to factory
* feat(sql-mesh): add model for unioning parent-child github events
* fix: remove child events sqlmesh model
---
 .../analyses/int_github_pr_issue_threads.sql  | 107 ++++++++++++++++++
 .../int_code_metric__commits_prs_issues.sql   |   5 +-
 ...c__time_to_first_response_days_average.sql |  89 +++++++++++++++
 ...ode_metric__time_to_merge_days_average.sql |  60 ++++++++++
 .../metrics/int_code_metrics_by_project.sql   |  49 +++++++-
 .../metrics/code_metrics_by_project_v1.sql    |   6 +-
 .../stg_github__pull_request_merge_events.sql |   3 +-
 .../metrics_mesh/models/metrics_factories.py  |   8 ++
 .../metrics_mesh/oso_metrics/comments.sql     |  14 +++
 .../metrics_mesh/oso_metrics/releases.sql     |  14 +++
 10 files changed, 349 insertions(+), 6 deletions(-)
 create mode 100644 warehouse/dbt/models/intermediate/analyses/int_github_pr_issue_threads.sql
 create mode 100644 warehouse/dbt/models/intermediate/metrics/code/int_code_metric__time_to_first_response_days_average.sql
 create mode 100644 warehouse/dbt/models/intermediate/metrics/code/int_code_metric__time_to_merge_days_average.sql
 create mode 100644 warehouse/metrics_mesh/oso_metrics/comments.sql
 create mode 100644 warehouse/metrics_mesh/oso_metrics/releases.sql

diff --git a/warehouse/dbt/models/intermediate/analyses/int_github_pr_issue_threads.sql b/warehouse/dbt/models/intermediate/analyses/int_github_pr_issue_threads.sql
new file mode 100644
index 000000000..8137fc00e
--- /dev/null
+++ b/warehouse/dbt/models/intermediate/analyses/int_github_pr_issue_threads.sql
@@ -0,0 +1,107 @@
+with pr_events as (
+  select
+    `number`,
+    `type`,
+    actor_id,
+    created_at,
+    LOWER(actor_login) as actor_login,
+    LOWER(repository_name) as repository_name,
+    CAST(repository_id as STRING) as to_artifact_source_id
+  from {{ ref('stg_github__pull_requests') }}
+  where `type` = 'PULL_REQUEST_OPENED'
+),
+
+merge_events as (
+  select
+    `number`,
+    `type`,
+    actor_id,
+    created_at,
+    LOWER(actor_login) as actor_login,
+    LOWER(repository_name) as repository_name,
+    CAST(repository_id as STRING) as to_artifact_source_id
+  from {{ ref('stg_github__pull_request_merge_events') }}
+),
+
+issue_events as (
+  select
+    `number`,
+    `type`,
+    actor_id,
+    created_at,
+    LOWER(actor_login) as actor_login,
+    LOWER(repository_name) as repository_name,
+    CAST(repository_id as STRING) as to_artifact_source_id
+  from {{ ref('stg_github__issues') }}
+),
+
+comment_events as (
+  select
+    `number`,
+    `type`,
+    actor_id,
+    created_at,
+    LOWER(actor_login) as actor_login,
+    LOWER(repository_name) as repository_name,
+    CAST(repository_id as STRING) as to_artifact_source_id
+  from {{ ref('stg_github__comments') }}
+),
+
+all_events as (
+  select
+    `number`,
+    `type`,
+    actor_login,
+    repository_name,
+    actor_id,
+    to_artifact_source_id,
+    created_at,
+    'GITHUB' as event_source
+  from pr_events
+  union all
+  select
+    `number`,
+    `type`,
+    actor_login,
+    repository_name,
+    actor_id,
+    to_artifact_source_id,
+    created_at,
+    'GITHUB' as event_source
+  from merge_events
+  union all
+  select
+    `number`,
+    `type`,
+    actor_login,
+    repository_name,
+    actor_id,
+    to_artifact_source_id,
+    created_at,
+    'GITHUB' as event_source
+  from issue_events
+  union all
+  select
+    `number`,
+    `type`,
+    actor_login,
+    repository_name,
+    actor_id,
+    to_artifact_source_id,
+    created_at,
+    'GITHUB' as event_source
+  from comment_events
+)
+
+select
+  'GITHUB' as event_source,
+  created_at as `time`,
+  `number`,
+  `type`,
+  actor_login,
+  repository_name,
+  actor_id,
+  to_artifact_source_id,
+  {{ oso_id("event_source", "to_artifact_source_id") }} as to_artifact_id
+from all_events
+where actor_login not like '%[bot]'
diff --git a/warehouse/dbt/models/intermediate/metrics/code/int_code_metric__commits_prs_issues.sql b/warehouse/dbt/models/intermediate/metrics/code/int_code_metric__commits_prs_issues.sql
index f0282850c..80d8bf178 100644
--- a/warehouse/dbt/models/intermediate/metrics/code/int_code_metric__commits_prs_issues.sql
+++ b/warehouse/dbt/models/intermediate/metrics/code/int_code_metric__commits_prs_issues.sql
@@ -12,8 +12,11 @@ where
     'COMMIT_CODE',
     'PULL_REQUEST_OPENED',
     'PULL_REQUEST_MERGED',
+    'PULL_REQUEST_REVIEW_COMMENT',
     'ISSUE_OPENED',
-    'ISSUE_CLOSED'
+    'ISSUE_CLOSED',
+    'ISSUE_COMMENT',
+    'RELEASE_PUBLISHED'
   )
 group by
   events.project_id,
diff --git a/warehouse/dbt/models/intermediate/metrics/code/int_code_metric__time_to_first_response_days_average.sql b/warehouse/dbt/models/intermediate/metrics/code/int_code_metric__time_to_first_response_days_average.sql
new file mode 100644
index 000000000..cde77e57c
--- /dev/null
+++ b/warehouse/dbt/models/intermediate/metrics/code/int_code_metric__time_to_first_response_days_average.sql
@@ -0,0 +1,89 @@
+with start_events as (
+  select
+    `number`,
+    actor_id as creator_id,
+    to_artifact_id,
+    `time` as created_at,
+    `type`
+  from {{ ref('int_github_pr_issue_threads') }}
+  where `type` in ('PULL_REQUEST_OPENED', 'ISSUE_OPENED')
+),
+
+response_events as (
+  select
+    `number`,
+    actor_id as responder_id,
+    to_artifact_id,
+    `time` as responded_at,
+    `type`
+  from {{ ref('int_github_pr_issue_threads') }}
+  where `type` in (
+    'PULL_REQUEST_MERGED',
+    'PULL_REQUEST_REVIEW_COMMENT',
+    'ISSUE_CLOSED',
+    'ISSUE_COMMENT'
+  )
+),
+
+time_to_first_response as (
+  select
+    start_events.number,
+    start_events.to_artifact_id,
+    start_events.created_at,
+    'GITHUB' as event_source,
+    min(resp.responded_at) as responded_at,
+    cast(
+      timestamp_diff(min(resp.responded_at), start_events.created_at, minute)
+      as float64
+    ) / 60.0 / 24.0 as time_to_first_response_days
+  from start_events
+  inner join response_events as resp
+    on
+      start_events.number = resp.number
+      and start_events.to_artifact_id = resp.to_artifact_id
+      and start_events.creator_id != resp.responder_id
+      and (
+        (
+          start_events.`type` = 'ISSUE_OPENED'
+          and resp.`type` in (
+            'ISSUE_COMMENT', 'ISSUE_CLOSED'
+          )
+        )
+        or
+        (
+          start_events.`type` = 'PULL_REQUEST_OPENED'
+          and resp.`type` in (
+            'PULL_REQUEST_REVIEW_COMMENT', 'PULL_REQUEST_MERGED'
+          )
+        )
+      )
+  group by
+    start_events.number,
+    start_events.to_artifact_id,
+    start_events.created_at
+),
+
+time_to_first_response_events as (
+  select
+    responded_at as `time`,
+    to_artifact_id,
+    event_source,
+    time_to_first_response_days as amount
+  from time_to_first_response
+)
+
+select
+  artifacts_by_project.project_id,
+  time_to_first_response_events.event_source,
+  time_intervals.time_interval,
+  'time_to_first_response_days_average' as metric,
+  avg(time_to_first_response_events.amount) as amount
+from time_to_first_response_events
+left join {{ ref('artifacts_by_project_v1') }} as artifacts_by_project
+  on time_to_first_response_events.to_artifact_id = artifacts_by_project.artifact_id
+cross join {{ ref('int_time_intervals') }} as time_intervals
+where time_to_first_response_events.time >= time_intervals.start_date
+group by
+  artifacts_by_project.project_id,
+  time_to_first_response_events.event_source,
+  time_intervals.time_interval
diff --git a/warehouse/dbt/models/intermediate/metrics/code/int_code_metric__time_to_merge_days_average.sql b/warehouse/dbt/models/intermediate/metrics/code/int_code_metric__time_to_merge_days_average.sql
new file mode 100644
index 000000000..08ee27262
--- /dev/null
+++ b/warehouse/dbt/models/intermediate/metrics/code/int_code_metric__time_to_merge_days_average.sql
@@ -0,0 +1,60 @@
+with pr_events as (
+  select
+    `number`,
+    to_artifact_id,
+    `time` as created_at
+  from {{ ref('int_github_pr_issue_threads') }}
+  where `type` = 'PULL_REQUEST_OPENED'
+),
+
+merge_events as (
+  select
+    `number`,
+    to_artifact_id,
+    `time` as merged_at
+  from {{ ref('int_github_pr_issue_threads') }}
+  where `type` = 'PULL_REQUEST_MERGED'
+),
+
+time_to_merge as (
+  select
+    pr.number,
+    pr.to_artifact_id,
+    pr.created_at,
+    m.merged_at,
+    'GITHUB' as event_source,
+    CAST(
+      TIMESTAMP_DIFF(m.merged_at, pr.created_at, minute)
+      as FLOAT64
+    ) / 60.0 / 24.0 as time_to_merge_days
+  from pr_events as pr
+  inner join merge_events as m
+    on
+      pr.number = m.number
+      and pr.to_artifact_id = m.to_artifact_id
+),
+
+time_to_merge_events as (
+  select
+    merged_at as `time`,
+    to_artifact_id,
+    event_source,
+    time_to_merge_days as amount
+  from time_to_merge
+)
+
+select
+  artifacts_by_project.project_id,
+  time_to_merge_events.event_source,
+  time_intervals.time_interval,
+  'time_to_merge_days_average' as metric,
+  AVG(time_to_merge_events.amount) as amount
+from time_to_merge_events
+left join {{ ref('artifacts_by_project_v1') }} as artifacts_by_project
+  on time_to_merge_events.to_artifact_id = artifacts_by_project.artifact_id
+cross join {{ ref('int_time_intervals') }} as time_intervals
+where time_to_merge_events.time >= time_intervals.start_date
+group by
+  artifacts_by_project.project_id,
+  time_to_merge_events.event_source,
+  time_intervals.time_interval
diff --git a/warehouse/dbt/models/intermediate/metrics/int_code_metrics_by_project.sql b/warehouse/dbt/models/intermediate/metrics/int_code_metrics_by_project.sql
index 91a38565e..14d1d53ca 100644
--- a/warehouse/dbt/models/intermediate/metrics/int_code_metrics_by_project.sql
+++ b/warehouse/dbt/models/intermediate/metrics/int_code_metrics_by_project.sql
@@ -15,6 +15,10 @@ with metrics as (
   from {{ ref('int_code_metric__fulltime_developers_average') }}
   union all
   select * from {{ ref('int_code_metric__new_contributors') }}
+  union all
+  select * from {{ ref('int_code_metric__time_to_first_response_days_average') }}
+  union all
+  select * from {{ ref('int_code_metric__time_to_merge_days_average') }}
 ),
 
 aggs as (
@@ -66,6 +70,24 @@ aggs as (
         else 0
       end
     ) as closed_issue_count_6_months,
+    SUM(
+      case
+        when
+          metric in ('pull_request_review_comment_count', 'issue_comment_count')
+          and time_interval = '6 MONTHS'
+        then amount
+        else 0
+      end
+    ) as comment_count_6_months,
+    SUM(
+      case
+        when
+          metric = 'release_published_count'
+          and time_interval = '6 MONTHS'
+        then amount
+        else 0
+      end
+    ) as release_count_6_months,
     SUM(
       case
         when
@@ -119,7 +141,25 @@ aggs as (
         then amount
         else 0
       end
-    ) as fulltime_developer_average_6_months
+    ) as fulltime_developer_average_6_months,
+    SUM(
+      case
+        when
+          metric = 'time_to_first_response_days_average'
+          and time_interval = '6 MONTHS'
+        then amount
+        else 0
+      end
+    ) as time_to_first_response_days_average_6_months,
+    SUM(
+      case
+        when
+          metric = 'time_to_merge_days_average'
+          and time_interval = '6 MONTHS'
+        then amount
+        else 0
+      end
+    ) as time_to_merge_days_average_6_months
   from metrics
   group by
     project_id,
@@ -138,7 +178,6 @@ repos as (
     SUM(star_count) as star_count,
     SUM(fork_count) as fork_count
   from {{ ref('int_repo_metrics_by_project') }}
-  --WHERE r.is_fork = false
   group by
     project_id,
     artifact_source
@@ -191,7 +230,11 @@ select
   code_metrics.opened_pull_request_count_6_months,
   code_metrics.merged_pull_request_count_6_months,
   code_metrics.opened_issue_count_6_months,
-  code_metrics.closed_issue_count_6_months
+  code_metrics.closed_issue_count_6_months,
+  code_metrics.comment_count_6_months,
+  code_metrics.release_count_6_months,
+  code_metrics.time_to_first_response_days_average_6_months,
+  code_metrics.time_to_merge_days_average_6_months
 from project_metadata
 left join code_metrics
   on
diff --git a/warehouse/dbt/models/marts/metrics/code_metrics_by_project_v1.sql b/warehouse/dbt/models/marts/metrics/code_metrics_by_project_v1.sql
index 034cbdd51..3b640ccc1 100644
--- a/warehouse/dbt/models/marts/metrics/code_metrics_by_project_v1.sql
+++ b/warehouse/dbt/models/marts/metrics/code_metrics_by_project_v1.sql
@@ -28,5 +28,9 @@ select
   opened_pull_request_count_6_months,
   merged_pull_request_count_6_months,
   opened_issue_count_6_months,
-  closed_issue_count_6_months
+  closed_issue_count_6_months,
+  comment_count_6_months,
+  release_count_6_months,
+  time_to_first_response_days_average_6_months,
+  time_to_merge_days_average_6_months
 from {{ ref('int_code_metrics_by_project') }}
diff --git a/warehouse/dbt/models/staging/github/stg_github__pull_request_merge_events.sql b/warehouse/dbt/models/staging/github/stg_github__pull_request_merge_events.sql
index 3c8a3da85..c7fa94114 100644
--- a/warehouse/dbt/models/staging/github/stg_github__pull_request_merge_events.sql
+++ b/warehouse/dbt/models/staging/github/stg_github__pull_request_merge_events.sql
@@ -38,7 +38,8 @@ select distinct
   ) as review_comments,
   JSON_VALUE(
     pre.payload, "$.pull_request.author_association"
-  ) as author_association
+  ) as author_association,
+  JSON_VALUE(pre.payload, "$.number") as `number`
 from pull_request_events as pre
 where
   JSON_VALUE(pre.payload, "$.pull_request.merged_at") is not null
diff --git a/warehouse/metrics_mesh/models/metrics_factories.py b/warehouse/metrics_mesh/models/metrics_factories.py
index 386215ea3..016a8432b 100644
--- a/warehouse/metrics_mesh/models/metrics_factories.py
+++ b/warehouse/metrics_mesh/models/metrics_factories.py
@@ -29,6 +29,14 @@
             ref="commits.sql",
             time_aggregations=["daily", "weekly", "monthly"],
         ),
+        "comments": MetricQueryDef(
+            ref="comments.sql",
+            time_aggregations=["daily", "weekly", "monthly"],
+        ),
+        "releases": MetricQueryDef(
+            ref="releases.sql",
+            time_aggregations=["daily", "weekly", "monthly"],
+        ),
         "forks": MetricQueryDef(
             ref="forks.sql",
             time_aggregations=["daily", "weekly", "monthly"],
diff --git a/warehouse/metrics_mesh/oso_metrics/comments.sql b/warehouse/metrics_mesh/oso_metrics/comments.sql
new file mode 100644
index 000000000..9a0f16f57
--- /dev/null
+++ b/warehouse/metrics_mesh/oso_metrics/comments.sql
@@ -0,0 +1,14 @@
+select @metrics_sample_date(events.bucket_day) as metrics_sample_date,
+    events.event_source,
+    events.to_artifact_id,
+    '' as from_artifact_id,
+    @metric_name() as metric,
+    SUM(events.amount) as amount
+from metrics.events_daily_to_artifact as events
+where event_type in ('PULL_REQUEST_REVIEW_COMMENT', 'ISSUE_COMMENT')
+    and events.bucket_day BETWEEN @metrics_start('DATE') AND @metrics_end('DATE')
+group by 1,
+    metric,
+    from_artifact_id,
+    to_artifact_id,
+    event_source
diff --git a/warehouse/metrics_mesh/oso_metrics/releases.sql b/warehouse/metrics_mesh/oso_metrics/releases.sql
new file mode 100644
index 000000000..cb832028c
--- /dev/null
+++ b/warehouse/metrics_mesh/oso_metrics/releases.sql
@@ -0,0 +1,14 @@
+select @metrics_sample_date(events.bucket_day) as metrics_sample_date,
+    events.event_source,
+    events.to_artifact_id,
+    '' as from_artifact_id,
+    @metric_name() as metric,
+    SUM(events.amount) as amount
+from metrics.events_daily_to_artifact as events
+where event_type in ('RELEASE_PUBLISHED')
+    and events.bucket_day BETWEEN @metrics_start('DATE') AND @metrics_end('DATE')
+group by 1,
+    metric,
+    from_artifact_id,
+    to_artifact_id,
+    event_source