From d99d6a1a4f27d261453b554d0215fad7dec37afe Mon Sep 17 00:00:00 2001 From: Carl Cervone <42869436+ccerv1@users.noreply.github.com> Date: Fri, 17 May 2024 20:14:28 -0400 Subject: [PATCH] refactor int_models that power summary metric marts (#1436) * refactor int_code_metrics_by_project to new models * fix: linting errors --- ...nt_developer_status_monthly_by_project.sql | 61 ++++++++++ .../int_first_last_event_by_artifact.sql | 21 ++++ .../metrics/int_code_metrics_by_project.sql | 108 ++++++++---------- .../metrics/pms/int_pm_dev_months.sql | 4 +- .../int_active_devs_monthly_to_project.sql | 34 ------ 5 files changed, 133 insertions(+), 95 deletions(-) create mode 100644 warehouse/dbt/models/intermediate/analyses/int_developer_status_monthly_by_project.sql create mode 100644 warehouse/dbt/models/intermediate/analyses/int_first_last_event_by_artifact.sql delete mode 100644 warehouse/dbt/models/intermediate/users/int_active_devs_monthly_to_project.sql diff --git a/warehouse/dbt/models/intermediate/analyses/int_developer_status_monthly_by_project.sql b/warehouse/dbt/models/intermediate/analyses/int_developer_status_monthly_by_project.sql new file mode 100644 index 000000000..c449a7ce3 --- /dev/null +++ b/warehouse/dbt/models/intermediate/analyses/int_developer_status_monthly_by_project.sql @@ -0,0 +1,61 @@ +{# + This model segments developers based on monthly activity + using the same taxonomy as in the Electric Capital + Developer Report. + + The taxonomy is as follows: + - Full-time developer: A developer who has made at least 10 commits + - Part-time developer: A developer who has made less than 10 commits + - Other contributor: A user who has not made any commits +#} + +with activity as ( + select + project_id, + from_artifact_id, + event_type, + bucket_day, + TIMESTAMP_TRUNC(bucket_day, month) as bucket_month + from {{ ref('int_events_daily_to_project') }} + where + event_type in ( + 'COMMIT_CODE', + 'PULL_REQUEST_OPENED', + 'PULL_REQUEST_REOPENED', + 'PULL_REQUEST_MERGED', + 'PULL_REQUEST_CLOSED', + 'ISSUE_OPENED', + 'ISSUE_REOPENED', + 'ISSUE_CLOSED' + ) +), + +user_activity as ( + select + project_id, + from_artifact_id, + bucket_month, + SUM( + case when event_type = 'COMMIT_CODE' then 1 else 0 end + ) as commit_days, + SUM( + case when event_type != 'COMMIT_CODE' then 1 else 0 end + ) as other_contrib_days + from activity + group by + project_id, + from_artifact_id, + bucket_month +) + +select + project_id, + from_artifact_id, + bucket_month, + case + when commit_days >= 10 then 'FULL_TIME_DEVELOPER' + when commit_days between 1 and 9 then 'PART_TIME_DEVELOPER' + when other_contrib_days >= 10 then 'FULL_TIME_CONTRIBUTOR' + else 'PART_TIME_CONTRIBUTOR' + end as user_segment_type +from user_activity diff --git a/warehouse/dbt/models/intermediate/analyses/int_first_last_event_by_artifact.sql b/warehouse/dbt/models/intermediate/analyses/int_first_last_event_by_artifact.sql new file mode 100644 index 000000000..856e98d9b --- /dev/null +++ b/warehouse/dbt/models/intermediate/analyses/int_first_last_event_by_artifact.sql @@ -0,0 +1,21 @@ +{# + Summary stats for the first and last event for each artifact +#} + +select + project_id, + from_artifact_id, + to_artifact_id, + event_source, + event_type, + MIN(bucket_day) as first_event_day, + MAX(bucket_day) as last_event_day, + COUNT(*) as event_count, + SUM(amount) as amount +from {{ ref('int_events_daily_to_project') }} +group by + project_id, + from_artifact_id, + to_artifact_id, + event_source, + event_type diff --git a/warehouse/dbt/models/intermediate/metrics/int_code_metrics_by_project.sql b/warehouse/dbt/models/intermediate/metrics/int_code_metrics_by_project.sql index 6730e7967..d6c18bb26 100644 --- a/warehouse/dbt/models/intermediate/metrics/int_code_metrics_by_project.sql +++ b/warehouse/dbt/models/intermediate/metrics/int_code_metrics_by_project.sql @@ -1,21 +1,4 @@ -{# - Summary GitHub metrics for a project: - - first_commit_date: The date of the first commit to the project - - last_commit_date: The date of the last commit to the project - - repos: The number of repositories in the project - - stars: The number of stars the project has - - forks: The number of forks the project has - - contributors: The number of contributors to the project - - contributors_6_months: The number of contributors to the project in the last 6 months - - new_contributors_6_months: The number of new contributors to the project in the last 6 months - - avg_fulltime_devs_6_months: The number of full-time developers in the last 6 months - - avg_active_devs_6_months: The average number of active developers in the last 6 months - - commits_6_months: The number of commits to the project in the last 6 months - - issues_opened_6_months: The number of issues opened in the project in the last 6 months - - issues_closed_6_months: The number of issues closed in the project in the last 6 months - - pull_requests_opened_6_months: The number of pull requests opened in the project in the last 6 months - - pull_requests_merged_6_months: The number of pull requests merged in the project in the last 6 months -#} +{% set date_6_months = "DATE_TRUNC(CURRENT_DATE(), month) - INTERVAL 6 MONTH" %} with repos as ( select @@ -47,62 +30,69 @@ project_repos_summary as ( on repos.project_id = int_projects.project_id ), -n_cte as ( +dev_activity as ( select project_id, - SUM(case when time_interval = 'ALL' then amount end) as contributors, - SUM(case when time_interval = '6M' then amount end) - as new_contributors_6_months - from {{ ref('int_pm_new_contribs') }} - group by project_id + from_artifact_id, + bucket_month, + user_segment_type, + case + when DATE(bucket_month) >= {{ date_6_months }} then 1 + else 0 + end as is_last_6_months, + case + when + DATE(bucket_month) >= {{ date_6_months }} + and LAG(bucket_month) over ( + partition by project_id, from_artifact_id + order by bucket_month + ) is null + then 1 + else 0 + end as is_new_last_6_months + from {{ ref('int_developer_status_monthly_by_project') }} ), -c_cte as ( - select - project_id, - SUM(amount) as contributors_6_months - from {{ ref('int_pm_contributors') }} - where time_interval = '6M' - group by project_id -), - -d_cte as ( +contribs_cte as ( select project_id, - SUM( + COUNT(distinct from_artifact_id) as contributors, + COUNT( + distinct case - when impact_metric = 'FULL_TIME_DEV_TOTAL' then amount / 6 + when is_last_6_months = 1 then from_artifact_id + end + ) as contributors_6_months, + COUNT( + distinct + case + when is_new_last_6_months = 1 then from_artifact_id + end + ) as new_contributors_6_months, + AVG( + case + when + is_last_6_months = 1 + and user_segment_type = 'FULL_TIME_DEVELOPER' + then 1 else 0 end - ) as avg_fts_6_months, - SUM( + ) as avg_fulltime_devs_6_months, + AVG( case - when impact_metric = 'PART_TIME_DEV_TOTAL' then amount / 6 + when + is_last_6_months = 1 + and user_segment_type in ( + 'FULL_TIME_DEVELOPER', + 'PART_TIME_DEVELOPER' + ) then 1 else 0 end - ) as avg_pts_6_months - from {{ ref('int_pm_dev_months') }} - where time_interval = '6M' + ) as avg_active_devs_6_months + from dev_activity group by project_id ), -contribs_cte as ( - select - n.project_id, - n.contributors, - n.new_contributors_6_months, - c.contributors_6_months, - d.avg_fts_6_months as avg_fulltime_devs_6_months, - d.avg_fts_6_months + d.avg_pts_6_months as avg_active_devs_6_months - from n_cte as n - left join c_cte as c - on - n.project_id = c.project_id - left join d_cte as d - on - n.project_id = d.project_id -), - activity_cte as ( select project_id, diff --git a/warehouse/dbt/models/intermediate/metrics/pms/int_pm_dev_months.sql b/warehouse/dbt/models/intermediate/metrics/pms/int_pm_dev_months.sql index 9a901920d..71e5a74ff 100644 --- a/warehouse/dbt/models/intermediate/metrics/pms/int_pm_dev_months.sql +++ b/warehouse/dbt/models/intermediate/metrics/pms/int_pm_dev_months.sql @@ -11,8 +11,8 @@ select e.project_id, t.time_interval, CONCAT(e.user_segment_type, '_TOTAL') as impact_metric, - SUM(e.amount) as amount -from {{ ref('int_active_devs_monthly_to_project') }} as e + COUNT(distinct e.from_artifact_id) as amount +from {{ ref('int_developer_status_monthly_by_project') }} as e cross join {{ ref('int_time_intervals') }} as t where DATE(e.bucket_month) >= DATE_TRUNC(t.start_date, month) diff --git a/warehouse/dbt/models/intermediate/users/int_active_devs_monthly_to_project.sql b/warehouse/dbt/models/intermediate/users/int_active_devs_monthly_to_project.sql deleted file mode 100644 index 83ffebe5b..000000000 --- a/warehouse/dbt/models/intermediate/users/int_active_devs_monthly_to_project.sql +++ /dev/null @@ -1,34 +0,0 @@ -{# - This model segments developers based on monthly activity - using the same taxonomy as in the Electric Capital - Developer Report. - - The taxonomy is as follows: - - Full-time developer: A developer who has made at least 10 commits - - Part-time developer: A developer who has made less than 10 commits - - Other contributor: A user who has not made any commits -#} - -select - project_id, - from_artifact_id, - bucket_month, - 1 as amount, - case - when - event_type = 'COMMIT_CODE' and count_days >= 10 - then 'FULL_TIME_DEV' - when - event_type = 'COMMIT_CODE' and count_days < 10 - then 'PART_TIME_DEV' - else 'OTHER_CONTRIBUTOR' - end as user_segment_type -from {{ ref('int_user_events_monthly_to_project') }} -where - event_type in ( - 'COMMIT_CODE', - 'PULL_REQUEST_OPENED', - 'PULL_REQUEST_MERGED', - 'ISSUE_OPENED', - 'ISSUE_CLOSED' - )