From 2f6821376ccc39f6fc55881f458e5206198d577f Mon Sep 17 00:00:00 2001 From: Carl Cervone <42869436+ccerv1@users.noreply.github.com> Date: Tue, 2 Apr 2024 20:44:27 -0400 Subject: [PATCH] dbt: new user models (v1) (#1162) * add: user int models * add: initial user marts * fixes to existing marts (wip) * refactor: simplify int user models * fix: linting 2 * add artifact and project mini-marts * refactor: code metrics to use new mart - part 1 * add: active devs impact metrics * rename dev months * add new contribs impact model * rename gh mart --- .../intermediate/contributors/int_devs.sql | 27 ---- .../intermediate/contributors/int_users.sql | 14 -- .../contributors/int_users_by_namespace.sql | 5 - .../intermediate/users/int_addresses.sql | 14 ++ .../models/intermediate/users/int_devs.sql | 24 +++ .../int_user_events_monthly_to_collection.sql | 21 +++ .../int_user_events_monthly_to_project.sql | 21 +++ .../code_metrics_by_collection.sql | 27 ++-- .../code_metrics/code_metrics_by_project.sql | 148 ++++++++---------- .../marts/directory/artifacts_by_project.sql | 8 +- .../marts/directory/deployers_by_project.sql | 8 + .../dbt/models/marts/directory/projects.sql | 6 +- .../marts/directory/repos_by_project.sql | 33 ++++ .../marts/events/users_monthly_to_project.sql | 17 +- .../project_metrics/live/pm_dev_months.sql | 56 +++++++ .../project_metrics/live/pm_new_contribs.sql | 50 ++++++ .../active_devs_monthly_to_collection.sql | 35 +++++ .../users/active_devs_monthly_to_project.sql | 35 +++++ .../users/address_rfm_segments_by_project.sql | 114 ++++++++++++++ .../users/new_users_monthly_to_project.sql | 35 +++++ .../stg_ossd__repositories_by_project.sql | 13 +- 21 files changed, 547 insertions(+), 164 deletions(-) delete mode 100644 warehouse/dbt/models/intermediate/contributors/int_devs.sql delete mode 100644 warehouse/dbt/models/intermediate/contributors/int_users.sql delete mode 100644 warehouse/dbt/models/intermediate/contributors/int_users_by_namespace.sql create mode 100644 warehouse/dbt/models/intermediate/users/int_addresses.sql create mode 100644 warehouse/dbt/models/intermediate/users/int_devs.sql create mode 100644 warehouse/dbt/models/intermediate/users/int_user_events_monthly_to_collection.sql create mode 100644 warehouse/dbt/models/intermediate/users/int_user_events_monthly_to_project.sql create mode 100644 warehouse/dbt/models/marts/directory/deployers_by_project.sql create mode 100644 warehouse/dbt/models/marts/directory/repos_by_project.sql create mode 100644 warehouse/dbt/models/marts/project_metrics/live/pm_dev_months.sql create mode 100644 warehouse/dbt/models/marts/project_metrics/live/pm_new_contribs.sql create mode 100644 warehouse/dbt/models/marts/users/active_devs_monthly_to_collection.sql create mode 100644 warehouse/dbt/models/marts/users/active_devs_monthly_to_project.sql create mode 100644 warehouse/dbt/models/marts/users/address_rfm_segments_by_project.sql create mode 100644 warehouse/dbt/models/marts/users/new_users_monthly_to_project.sql diff --git a/warehouse/dbt/models/intermediate/contributors/int_devs.sql b/warehouse/dbt/models/intermediate/contributors/int_devs.sql deleted file mode 100644 index 1f62a6c6d..000000000 --- a/warehouse/dbt/models/intermediate/contributors/int_devs.sql +++ /dev/null @@ -1,27 +0,0 @@ -SELECT - e.project_id, - e.to_namespace AS repository_source, - e.from_id, - 1 AS amount, - TIMESTAMP_TRUNC(e.time, MONTH) AS bucket_month, - CASE - WHEN - COUNT(DISTINCT CASE WHEN e.event_type = 'COMMIT_CODE' THEN e.time END) - >= 10 - THEN 'FULL_TIME_DEV' - WHEN - COUNT(DISTINCT CASE WHEN e.event_type = 'COMMIT_CODE' THEN e.time END) - >= 1 - THEN 'PART_TIME_DEV' - ELSE 'OTHER_CONTRIBUTOR' - END AS user_segment_type -FROM {{ ref('int_events_to_project') }} AS e -WHERE - e.event_type IN ( - 'PULL_REQUEST_CREATED', - 'PULL_REQUEST_MERGED', - 'COMMIT_CODE', - 'ISSUE_CLOSED', - 'ISSUE_CREATED' - ) -GROUP BY e.project_id, bucket_month, e.from_id, repository_source diff --git a/warehouse/dbt/models/intermediate/contributors/int_users.sql b/warehouse/dbt/models/intermediate/contributors/int_users.sql deleted file mode 100644 index a2b2e4dfa..000000000 --- a/warehouse/dbt/models/intermediate/contributors/int_users.sql +++ /dev/null @@ -1,14 +0,0 @@ -SELECT - e.project_id, - e.to_namespace AS onchain_network, - e.from_id, - 1 AS amount, - TIMESTAMP_TRUNC(e.time, MONTH) AS bucket_month, - CASE - WHEN SUM(e.amount) >= 1000 THEN 'HIGH_FREQUENCY_USER' - WHEN SUM(e.amount) >= 10 THEN 'HIGH_VALUE_USER' - ELSE 'LOW_VALUE_USER' - END AS user_segment_type -FROM {{ ref('int_events_to_project') }} AS e -WHERE e.event_type = 'CONTRACT_INVOCATION_DAILY_COUNT' -GROUP BY e.project_id, onchain_network, e.from_id, bucket_month diff --git a/warehouse/dbt/models/intermediate/contributors/int_users_by_namespace.sql b/warehouse/dbt/models/intermediate/contributors/int_users_by_namespace.sql deleted file mode 100644 index 9d02441ee..000000000 --- a/warehouse/dbt/models/intermediate/contributors/int_users_by_namespace.sql +++ /dev/null @@ -1,5 +0,0 @@ -SELECT DISTINCT - from_id, - from_name, - from_namespace -FROM {{ ref('int_events_to_project') }} diff --git a/warehouse/dbt/models/intermediate/users/int_addresses.sql b/warehouse/dbt/models/intermediate/users/int_addresses.sql new file mode 100644 index 000000000..022c01f97 --- /dev/null +++ b/warehouse/dbt/models/intermediate/users/int_addresses.sql @@ -0,0 +1,14 @@ +{# + Address stats by project and network +#} + +SELECT + from_id, + from_namespace AS network, + project_id, + MIN(time) AS date_first_txn, + MAX(time) AS date_last_txn, + SUM(amount) AS count_events +FROM {{ ref('int_events_to_project') }} +WHERE event_type = 'CONTRACT_INVOCATION_DAILY_COUNT' +GROUP BY 1, 2, 3 diff --git a/warehouse/dbt/models/intermediate/users/int_devs.sql b/warehouse/dbt/models/intermediate/users/int_devs.sql new file mode 100644 index 000000000..cc160761f --- /dev/null +++ b/warehouse/dbt/models/intermediate/users/int_devs.sql @@ -0,0 +1,24 @@ +{# + Developer stats by project and repo source +#} + +SELECT + from_id, + from_namespace AS repository_source, + project_id, + MIN(time) AS date_first_contribution, + MAX(time) AS date_last_contribution, + SUM(amount) AS count_events +FROM {{ ref('int_events_to_project') }} +WHERE + event_type IN ( + 'COMMIT_CODE', + 'PULL_REQUEST_OPENED', + 'PULL_REQUEST_REOPENED', + 'PULL_REQUEST_CLOSED', + 'PULL_REQUEST_MERGED', + 'ISSUE_CLOSED', + 'ISSUE_OPENED', + 'ISSUE_REOPENED' + ) +GROUP BY 1, 2, 3 diff --git a/warehouse/dbt/models/intermediate/users/int_user_events_monthly_to_collection.sql b/warehouse/dbt/models/intermediate/users/int_user_events_monthly_to_collection.sql new file mode 100644 index 000000000..6a7c92cc0 --- /dev/null +++ b/warehouse/dbt/models/intermediate/users/int_user_events_monthly_to_collection.sql @@ -0,0 +1,21 @@ +{# + This model aggregates user events to collections on + a monthly basis. It is used to calculate various + user engagement metrics. +#} + +SELECT + from_id, + from_namespace, + collection_id, + event_type, + DATE_TRUNC(DATE(time), MONTH) AS bucket_month, + COUNT(DISTINCT DATE_TRUNC(DATE(time), DAY)) AS count_days, + SUM(amount) AS total_amount +FROM {{ ref('int_events_to_collection') }} +GROUP BY + from_id, + from_namespace, + collection_id, + event_type, + DATE_TRUNC(DATE(time), MONTH) diff --git a/warehouse/dbt/models/intermediate/users/int_user_events_monthly_to_project.sql b/warehouse/dbt/models/intermediate/users/int_user_events_monthly_to_project.sql new file mode 100644 index 000000000..ee5ef1ddf --- /dev/null +++ b/warehouse/dbt/models/intermediate/users/int_user_events_monthly_to_project.sql @@ -0,0 +1,21 @@ +{# + This model aggregates user events to project level on + a monthly basis. It is used to calculate various + user engagement metrics by project. +#} + +SELECT + from_id, + from_namespace, + project_id, + event_type, + DATE_TRUNC(DATE(time), MONTH) AS bucket_month, + COUNT(DISTINCT DATE_TRUNC(DATE(time), DAY)) AS count_days, + SUM(amount) AS amount +FROM {{ ref('int_events_to_project') }} +GROUP BY + from_id, + from_namespace, + project_id, + event_type, + DATE_TRUNC(DATE(time), MONTH) diff --git a/warehouse/dbt/models/marts/code_metrics/code_metrics_by_collection.sql b/warehouse/dbt/models/marts/code_metrics/code_metrics_by_collection.sql index beffbf067..9fd9ba239 100644 --- a/warehouse/dbt/models/marts/code_metrics/code_metrics_by_collection.sql +++ b/warehouse/dbt/models/marts/code_metrics/code_metrics_by_collection.sql @@ -68,7 +68,7 @@ collection_contributors AS ( DISTINCT CASE WHEN d.bucket_month - >= CAST(DATE_ADD(CURRENT_DATE(), INTERVAL -6 MONTH) AS TIMESTAMP) + >= DATE_SUB(CURRENT_DATE(), INTERVAL 6 MONTH) THEN d.from_id END ) AS contributors_6_months, @@ -76,7 +76,7 @@ collection_contributors AS ( DISTINCT CASE WHEN d.bucket_month - >= CAST(DATE_ADD(CURRENT_DATE(), INTERVAL -6 MONTH) AS TIMESTAMP) + >= DATE_SUB(CURRENT_DATE(), INTERVAL 6 MONTH) AND d.user_segment_type = 'FULL_TIME_DEV' THEN CONCAT(d.from_id, '_', d.bucket_month) END @@ -86,7 +86,7 @@ collection_contributors AS ( DISTINCT CASE WHEN d.bucket_month - >= CAST(DATE_ADD(CURRENT_DATE(), INTERVAL -6 MONTH) AS TIMESTAMP) + >= DATE_SUB(CURRENT_DATE(), INTERVAL 6 MONTH) AND d.user_segment_type IN ('FULL_TIME_DEV', 'PART_TIME_DEV') THEN CONCAT(d.from_id, '_', d.bucket_month) END @@ -96,24 +96,21 @@ collection_contributors AS ( DISTINCT CASE WHEN d.first_contribution_date - >= CAST(DATE_ADD(CURRENT_DATE(), INTERVAL -6 MONTH) AS TIMESTAMP) + >= DATE_SUB(CURRENT_DATE(), INTERVAL 6 MONTH) THEN d.from_id END ) AS new_contributors_6_months FROM ( SELECT - d.from_id, - pbc.collection_id, - d.repository_source, - d.bucket_month, - d.user_segment_type, - MIN(d.bucket_month) - OVER (PARTITION BY d.from_id, pbc.collection_id) + from_id, + collection_id, + repository_source, + bucket_month, + user_segment_type, + MIN(bucket_month) + OVER (PARTITION BY from_id, collection_id) AS first_contribution_date - FROM {{ ref('int_devs') }} AS d - INNER JOIN - {{ ref('stg_ossd__projects_by_collection') }} AS pbc - ON d.project_id = pbc.project_id + FROM {{ ref('active_devs_monthly_to_collection') }} ) AS d GROUP BY d.collection_id, d.repository_source ), diff --git a/warehouse/dbt/models/marts/code_metrics/code_metrics_by_project.sql b/warehouse/dbt/models/marts/code_metrics/code_metrics_by_project.sql index 9f638db35..4829e55d9 100644 --- a/warehouse/dbt/models/marts/code_metrics/code_metrics_by_project.sql +++ b/warehouse/dbt/models/marts/code_metrics/code_metrics_by_project.sql @@ -17,93 +17,73 @@ - pull_requests_merged_6_months: The number of pull requests merged in the project in the last 6 months #} --- CTE for calculating the first and last commit date for each project, --- ignoring forked repos -WITH project_commit_dates AS ( +-- CTE for aggregating repo data for each project +WITH project_repos_summary AS ( SELECT - e.project_id, - r.repository_source, - MIN(e.time) AS first_commit_date, - MAX(e.time) AS last_commit_date - FROM {{ ref('int_events_to_project') }} AS e - INNER JOIN - {{ ref('stg_ossd__repositories_by_project') }} AS r - ON e.project_id = r.project_id - WHERE - e.event_type = 'COMMIT_CODE' - AND r.is_fork = false - GROUP BY e.project_id, r.repository_source -), - --- CTE for aggregating stars, forks, and repository counts by project -project_repos_summary AS ( - SELECT - p.project_id, - p.project_slug, - p.project_name, - r.repository_source, - COUNT(DISTINCT r.id) AS repositories, - SUM(r.star_count) AS stars, - SUM(r.fork_count) AS forks - FROM {{ ref('stg_ossd__repositories_by_project') }} AS r - INNER JOIN {{ ref('projects') }} AS p - ON r.project_id = p.project_id - WHERE r.is_fork = false - GROUP BY p.project_id, p.project_slug, p.project_name, r.repository_source + project_id, + project_slug, + project_name, + repository_source, + MIN(first_commit_date) AS first_commit_date, + MAX(last_commit_date) AS last_commit_date, + COUNT(DISTINCT artifact_id) AS repositories, + SUM(repo_star_count) AS stars, + SUM(repo_fork_count) AS forks + FROM {{ ref('repos_by_project') }} + --WHERE r.is_fork = false + GROUP BY + project_id, + project_slug, + project_name, + repository_source ), -- CTE for calculating contributor counts and new contributors in the last 6 -- months -contributors_cte AS ( +devs_cte AS ( SELECT project_id, - repository_source, - COUNT(DISTINCT from_id) AS contributors, - COUNT( - DISTINCT CASE - WHEN - DATE(bucket_month) >= DATE_ADD(CURRENT_DATE(), INTERVAL -6 MONTH) - THEN from_id + namespace AS repository_source, + SUM(amount) / 6 AS contributors_6_months, + SUM( + CASE + WHEN CONTAINS_SUBSTR(impact_metric, 'FULL_TIME_DEV') THEN amount / 6 + ELSE 0 END - ) AS contributors_6_months, - COUNT( - DISTINCT CASE - WHEN - DATE(bucket_month) >= DATE_ADD(CURRENT_DATE(), INTERVAL -6 MONTH) - AND user_segment_type = 'FULL_TIME_DEV' - THEN CONCAT(from_id, '_', bucket_month) + ) AS avg_fulltime_devs_6_months, + SUM( + CASE + WHEN CONTAINS_SUBSTR(impact_metric, 'DEV') THEN amount / 6 + ELSE 0 END - ) - / 6 AS avg_fulltime_devs_6_months, - COUNT( - DISTINCT CASE - WHEN - DATE(bucket_month) >= DATE_ADD(CURRENT_DATE(), INTERVAL -6 MONTH) - AND user_segment_type IN ('FULL_TIME_DEV', 'PART_TIME_DEV') - THEN CONCAT(from_id, '_', bucket_month) + ) AS avg_active_devs_6_months + FROM {{ ref('pm_dev_months') }} + WHERE CONTAINS_SUBSTR(impact_metric, '_6M') + GROUP BY + project_id, + namespace +), + +contribs_cte AS ( + SELECT + project_id, + namespace AS repository_source, + SUM( + CASE + WHEN CONTAINS_SUBSTR(impact_metric, '_ALL') THEN amount + ELSE 0 END - ) - / 6 AS avg_active_devs_6_months, - COUNT( - DISTINCT CASE - WHEN - first_contribution_date >= DATE_ADD(CURRENT_DATE(), INTERVAL -6 MONTH) - THEN from_id + ) AS contributors, + SUM( + CASE + WHEN CONTAINS_SUBSTR(impact_metric, '_6M') THEN amount + ELSE 0 END ) AS new_contributors_6_months - FROM ( - SELECT - from_id, - project_id, - repository_source, - user_segment_type, - DATE(bucket_month) AS bucket_month, - MIN(DATE(bucket_month)) - OVER (PARTITION BY from_id, project_id) - AS first_contribution_date - FROM {{ ref('int_devs') }} - ) - GROUP BY project_id, repository_source + FROM {{ ref('pm_new_contribs') }} + GROUP BY + project_id, + namespace ), -- CTE for summarizing project activity metrics over the past 6 months @@ -132,27 +112,27 @@ SELECT p.project_slug, p.project_name, p.repository_source AS `source`, - pcd.first_commit_date, - pcd.last_commit_date, + p.first_commit_date, + p.last_commit_date, p.repositories, p.stars, p.forks, c.contributors, - c.contributors_6_months, c.new_contributors_6_months, - c.avg_fulltime_devs_6_months, - c.avg_active_devs_6_months, + d.contributors_6_months, + d.avg_fulltime_devs_6_months, + d.avg_active_devs_6_months, act.commits_6_months, act.issues_opened_6_months, act.issues_closed_6_months, act.pull_requests_opened_6_months, act.pull_requests_merged_6_months FROM project_repos_summary AS p -LEFT JOIN project_commit_dates AS pcd +LEFT JOIN devs_cte AS d ON - p.project_id = pcd.project_id - AND p.repository_source = pcd.repository_source -LEFT JOIN contributors_cte AS c + p.project_id = d.project_id + AND p.repository_source = d.repository_source +LEFT JOIN contribs_cte AS c ON p.project_id = c.project_id AND p.repository_source = c.repository_source diff --git a/warehouse/dbt/models/marts/directory/artifacts_by_project.sql b/warehouse/dbt/models/marts/directory/artifacts_by_project.sql index 34ff52467..912f0495d 100644 --- a/warehouse/dbt/models/marts/directory/artifacts_by_project.sql +++ b/warehouse/dbt/models/marts/directory/artifacts_by_project.sql @@ -1,4 +1,10 @@ SELECT atp.project_id AS project_id, - atp.artifact_id AS artifact_id + p.project_slug AS project_slug, + atp.artifact_id AS artifact_id, + atp.artifact_namespace AS artifact_namespace, + atp.artifact_type AS artifact_type, + atp.artifact_name AS artifact_name FROM {{ ref('stg_ossd__artifacts_by_project') }} AS atp +LEFT JOIN {{ ref('projects') }} AS p + ON atp.project_id = p.project_id diff --git a/warehouse/dbt/models/marts/directory/deployers_by_project.sql b/warehouse/dbt/models/marts/directory/deployers_by_project.sql new file mode 100644 index 000000000..88e3bbc2c --- /dev/null +++ b/warehouse/dbt/models/marts/directory/deployers_by_project.sql @@ -0,0 +1,8 @@ +SELECT + atp.project_id AS project_id, + atp.project_slug AS project_slug, + atp.artifact_id AS artifact_id, + atp.artifact_namespace AS artifact_namespace, + atp.artifact_name AS artifact_name +FROM {{ ref('artifacts_by_project') }} AS atp +WHERE atp.artifact_type = 'DEPLOYER' diff --git a/warehouse/dbt/models/marts/directory/projects.sql b/warehouse/dbt/models/marts/directory/projects.sql index 423b70042..56fb20027 100644 --- a/warehouse/dbt/models/marts/directory/projects.sql +++ b/warehouse/dbt/models/marts/directory/projects.sql @@ -1,6 +1,10 @@ SELECT id AS project_id, slug AS project_slug, + -- description AS project_description, name AS project_name, - namespace AS user_namespace + namespace AS user_namespace, + ARRAY_LENGTH(JSON_EXTRACT_ARRAY(github)) AS count_github_artifacts, + ARRAY_LENGTH(JSON_EXTRACT_ARRAY(blockchain)) AS count_blockchain_artifacts, + ARRAY_LENGTH(JSON_EXTRACT_ARRAY(npm)) AS count_npm_artifacts FROM {{ ref('stg_ossd__current_projects') }} diff --git a/warehouse/dbt/models/marts/directory/repos_by_project.sql b/warehouse/dbt/models/marts/directory/repos_by_project.sql new file mode 100644 index 000000000..1bbce601e --- /dev/null +++ b/warehouse/dbt/models/marts/directory/repos_by_project.sql @@ -0,0 +1,33 @@ +{# + This model is a directory of all GitHub repositories that are associated with a project. + It includes metadata about the repository, as well as the first and last commit dates. +#} + +WITH github_stats AS ( + SELECT + artifact_id, + MIN(bucket_day) AS first_commit_date, + MAX(bucket_day) AS last_commit_date + FROM {{ ref('events_daily_to_artifact') }} + WHERE event_type = 'COMMIT_CODE' + GROUP BY artifact_id +) + +SELECT + p.project_id, + p.project_slug, + p.project_name, + r.repository_source, + r.artifact_id, + r.is_fork AS repo_is_fork, + r.fork_count AS repo_fork_count, + r.star_count AS repo_star_count, + s.first_commit_date, + s.last_commit_date, + LOWER(r.name_with_owner) AS repo_name_with_owner +FROM {{ ref('stg_ossd__repositories_by_project') }} AS r +LEFT JOIN {{ ref('projects') }} AS p + ON r.project_id = p.project_id +LEFT JOIN github_stats AS s + ON r.artifact_id = s.artifact_id +WHERE r.repository_source = 'GITHUB' diff --git a/warehouse/dbt/models/marts/events/users_monthly_to_project.sql b/warehouse/dbt/models/marts/events/users_monthly_to_project.sql index ed05e3587..f8a199687 100644 --- a/warehouse/dbt/models/marts/events/users_monthly_to_project.sql +++ b/warehouse/dbt/models/marts/events/users_monthly_to_project.sql @@ -1,15 +1,8 @@ SELECT project_id, - user_segment_type, + from_namespace, + event_type, bucket_month, - SUM(amount) AS amount -FROM - ( - SELECT * FROM {{ ref('int_devs') }} - UNION ALL - SELECT * FROM {{ ref('int_users') }} - ) AS combined_data -GROUP BY - project_id, - user_segment_type, - bucket_month + COUNT(DISTINCT from_id) AS amount +FROM {{ ref('int_user_events_monthly_to_project') }} +GROUP BY 1, 2, 3, 4 diff --git a/warehouse/dbt/models/marts/project_metrics/live/pm_dev_months.sql b/warehouse/dbt/models/marts/project_metrics/live/pm_dev_months.sql new file mode 100644 index 000000000..4c6e7b6ef --- /dev/null +++ b/warehouse/dbt/models/marts/project_metrics/live/pm_dev_months.sql @@ -0,0 +1,56 @@ +{# + This model calculates the total man-months of developer activity + for each project in various time ranges. + + The model uses the active_devs_monthly_to_project model to segment + developers based on monthly activity using the Electric Capital + Developer Report taxonomy. +#} +{{ + config(meta = { + 'sync_to_cloudsql': True + }) +}} + +WITH time_ranges AS ( + SELECT + time_interval, + DATE_TRUNC(start_date, MONTH) AS start_month + FROM ( + SELECT + '30D' AS time_interval, + DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY) AS start_date + UNION ALL + SELECT + '90D' AS time_interval, + DATE_SUB(CURRENT_DATE(), INTERVAL 90 DAY) AS start_date + UNION ALL + SELECT + '6M' AS time_interval, + DATE_SUB(CURRENT_DATE(), INTERVAL 6 MONTH) AS start_date + UNION ALL + SELECT + '1Y' AS time_interval, + DATE_SUB(CURRENT_DATE(), INTERVAL 1 YEAR) AS start_date + UNION ALL + SELECT + 'ALL' AS time_interval, + DATE('1970-01-01') AS start_date + ) +) + +SELECT + e.project_id, + e.repository_source AS namespace, + CONCAT(e.user_segment_type, '_TOTAL_', tr.time_interval) AS impact_metric, + SUM(e.amount) AS amount +FROM {{ ref('active_devs_monthly_to_project') }} AS e +CROSS JOIN time_ranges AS tr +WHERE + DATE(e.bucket_month) >= tr.start_month + AND DATE(e.bucket_month) < DATE_TRUNC(CURRENT_DATE(), MONTH) +GROUP BY + e.project_id, + e.repository_source, + e.user_segment_type, + tr.time_interval diff --git a/warehouse/dbt/models/marts/project_metrics/live/pm_new_contribs.sql b/warehouse/dbt/models/marts/project_metrics/live/pm_new_contribs.sql new file mode 100644 index 000000000..29479d4f8 --- /dev/null +++ b/warehouse/dbt/models/marts/project_metrics/live/pm_new_contribs.sql @@ -0,0 +1,50 @@ +{# + +#} +{{ + config(meta = { + 'sync_to_cloudsql': True + }) +}} + +WITH time_ranges AS ( + SELECT + time_interval, + DATE_TRUNC(start_date, MONTH) AS start_month + FROM ( + SELECT + '30D' AS time_interval, + DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY) AS start_date + UNION ALL + SELECT + '90D' AS time_interval, + DATE_SUB(CURRENT_DATE(), INTERVAL 90 DAY) AS start_date + UNION ALL + SELECT + '6M' AS time_interval, + DATE_SUB(CURRENT_DATE(), INTERVAL 6 MONTH) AS start_date + UNION ALL + SELECT + '1Y' AS time_interval, + DATE_SUB(CURRENT_DATE(), INTERVAL 1 YEAR) AS start_date + UNION ALL + SELECT + 'ALL' AS time_interval, + DATE('1970-01-01') AS start_date + ) +) + +SELECT + d.project_id, + d.repository_source AS namespace, + CONCAT('NEW_CONTRIBUTORS_TOTAL_', tr.time_interval) AS impact_metric, + COUNT(DISTINCT CASE + WHEN DATE(d.date_first_contribution) >= tr.start_month + THEN d.from_id + END) AS amount +FROM {{ ref('int_devs') }} AS d +CROSS JOIN time_ranges AS tr +GROUP BY + d.project_id, + d.repository_source, + tr.time_interval diff --git a/warehouse/dbt/models/marts/users/active_devs_monthly_to_collection.sql b/warehouse/dbt/models/marts/users/active_devs_monthly_to_collection.sql new file mode 100644 index 000000000..09596ce13 --- /dev/null +++ b/warehouse/dbt/models/marts/users/active_devs_monthly_to_collection.sql @@ -0,0 +1,35 @@ +{# + This model segments developers based on monthly activity + using the same taxonomy as in the Electric Capital + Developer Report by collection. + + The taxonomy is as follows: + - Full-time developer: A developer who has made at least 10 commits + - Part-time developer: A developer who has made less than 10 commits + - Other contributor: A user who has not made any commits +#} + +SELECT + collection_id, + from_id, + from_namespace AS repository_source, + bucket_month, + 1 AS amount, + CASE + WHEN + event_type = 'COMMIT_CODE' AND count_days >= 10 + THEN 'FULL_TIME_DEV' + WHEN + event_type = 'COMMIT_CODE' AND count_days < 10 + THEN 'PART_TIME_DEV' + ELSE 'OTHER_CONTRIBUTOR' + END AS user_segment_type +FROM {{ ref('int_user_events_monthly_to_collection') }} +WHERE + event_type IN ( + 'COMMIT_CODE', + 'PULL_REQUEST_OPENED', + 'PULL_REQUEST_MERGED', + 'ISSUE_OPENED', + 'ISSUE_CLOSED' + ) diff --git a/warehouse/dbt/models/marts/users/active_devs_monthly_to_project.sql b/warehouse/dbt/models/marts/users/active_devs_monthly_to_project.sql new file mode 100644 index 000000000..c45078fac --- /dev/null +++ b/warehouse/dbt/models/marts/users/active_devs_monthly_to_project.sql @@ -0,0 +1,35 @@ +{# + This model segments developers based on monthly activity + using the same taxonomy as in the Electric Capital + Developer Report. + + The taxonomy is as follows: + - Full-time developer: A developer who has made at least 10 commits + - Part-time developer: A developer who has made less than 10 commits + - Other contributor: A user who has not made any commits +#} + +SELECT + project_id, + from_id, + from_namespace AS repository_source, + bucket_month, + 1 AS amount, + CASE + WHEN + event_type = 'COMMIT_CODE' AND count_days >= 10 + THEN 'FULL_TIME_DEV' + WHEN + event_type = 'COMMIT_CODE' AND count_days < 10 + THEN 'PART_TIME_DEV' + ELSE 'OTHER_CONTRIBUTOR' + END AS user_segment_type +FROM {{ ref('int_user_events_monthly_to_project') }} +WHERE + event_type IN ( + 'COMMIT_CODE', + 'PULL_REQUEST_OPENED', + 'PULL_REQUEST_MERGED', + 'ISSUE_OPENED', + 'ISSUE_CLOSED' + ) diff --git a/warehouse/dbt/models/marts/users/address_rfm_segments_by_project.sql b/warehouse/dbt/models/marts/users/address_rfm_segments_by_project.sql new file mode 100644 index 000000000..86eff6c88 --- /dev/null +++ b/warehouse/dbt/models/marts/users/address_rfm_segments_by_project.sql @@ -0,0 +1,114 @@ +{# + Onchain address segementation using the RFM model. + + RFM stands for Recency, Frequency, and Monetary Value. + In this version, we adapt the M to represent the + number of projects a user has interacted with. + + We will consider the following metrics: + - Recency: Days since last activity + - Frequency: Total activity + - Monetary Value: Project count + + We will then assign a segment to each user based on these + metrics. +#} + +WITH user_data AS ( + SELECT + from_id, + network, + project_id, + count_events AS total_activity, + DATE_DIFF(CURRENT_TIMESTAMP(), date_last_txn, DAY) + AS days_since_last_activity + FROM {{ ref('int_addresses') }} +), + +user_project_count AS ( + SELECT + from_id, + network, + COUNT(DISTINCT project_id) AS project_count + FROM user_data + GROUP BY 1, 2 +), + +user_stats AS ( + SELECT + ud.*, + upc.project_count + FROM user_data AS ud + LEFT JOIN user_project_count AS upc + ON + ud.from_id = upc.from_id + AND ud.network = upc.network +), + +rfm_components AS ( + SELECT + from_id, + network, + project_id, + CASE + WHEN days_since_last_activity < 7 THEN 5 + WHEN days_since_last_activity < 30 THEN 4 + WHEN days_since_last_activity < 90 THEN 3 + WHEN days_since_last_activity < 180 THEN 2 + ELSE 1 + END AS rfm_recency, + CASE + WHEN total_activity = 1 THEN 1 + WHEN total_activity < 10 THEN 2 + WHEN total_activity < 100 THEN 3 + WHEN total_activity < 1000 THEN 4 + ELSE 5 + END AS rfm_frequency, + CASE + WHEN project_count = 1 THEN 1 + WHEN project_count <= 3 THEN 2 + WHEN project_count <= 10 THEN 3 + WHEN project_count <= 30 THEN 4 + ELSE 5 + END AS rfm_ecosystem + FROM user_stats +) + +SELECT + *, + CASE + WHEN rfm_frequency = 5 + THEN + CASE + WHEN rfm_recency = 5 THEN 'Power' + WHEN rfm_recency = 4 THEN 'Loyal' + WHEN rfm_recency = 3 THEN 'At risk' + ELSE 'Churned' + END + WHEN rfm_frequency = 4 + THEN + CASE + WHEN rfm_recency >= 4 THEN 'Loyal' + WHEN rfm_recency = 3 THEN 'At risk' + ELSE 'Churned' + END + WHEN rfm_frequency = 3 + THEN + CASE + WHEN rfm_recency >= 4 THEN 'Promising' + WHEN rfm_recency = 3 THEN 'Needs attention' + ELSE 'Tourist' + END + WHEN rfm_frequency = 2 + THEN + CASE + WHEN rfm_recency >= 4 THEN 'Noob' + ELSE 'Tourist' + END + WHEN rfm_frequency = 1 THEN + CASE + WHEN rfm_recency >= 3 THEN 'Noob' + ELSE 'One and done' + END + END AS user_segment +FROM rfm_components diff --git a/warehouse/dbt/models/marts/users/new_users_monthly_to_project.sql b/warehouse/dbt/models/marts/users/new_users_monthly_to_project.sql new file mode 100644 index 000000000..e0ea77a6d --- /dev/null +++ b/warehouse/dbt/models/marts/users/new_users_monthly_to_project.sql @@ -0,0 +1,35 @@ +{# + Monthly new vs returning users by project and namespace +#} + +WITH user_data AS ( + SELECT + from_id, + repository_source AS namespace, + project_id, + DATE_TRUNC(DATE(date_first_contribution), MONTH) AS month_first + FROM {{ ref('int_devs') }} + UNION ALL + SELECT + from_id, + network AS namespace, + project_id, + DATE_TRUNC(DATE(date_first_txn), MONTH) AS month_first + FROM {{ ref('int_addresses') }} +) + +SELECT + e.from_id, + e.from_id AS namespace, + e.project_id, + e.bucket_month, + CASE + WHEN e.bucket_month = u.month_first THEN 'new' + ELSE 'returning' + END AS user_status +FROM {{ ref('int_user_events_monthly_to_project') }} AS e +LEFT JOIN user_data AS u + ON + e.from_id = u.from_id + AND e.from_namespace = u.namespace + AND e.project_id = u.project_id diff --git a/warehouse/dbt/models/staging/oss-directory/stg_ossd__repositories_by_project.sql b/warehouse/dbt/models/staging/oss-directory/stg_ossd__repositories_by_project.sql index 3758573a6..6a871b2d6 100644 --- a/warehouse/dbt/models/staging/oss-directory/stg_ossd__repositories_by_project.sql +++ b/warehouse/dbt/models/staging/oss-directory/stg_ossd__repositories_by_project.sql @@ -1,11 +1,14 @@ +{# + Currently this is just Github. + oss-directory needs some refactoring to support multiple repository providers +#} + SELECT repos.*, - {# - Currently this is just Github. - oss-directory needs some refactoring to support multiple repository providers - #} projects.id AS project_id, - "GITHUB" AS repository_source + "GITHUB" AS repository_source, + {{ oso_id("'GITHUB'", "'GIT_REPOSITORY'", "CAST(repos.id AS STRING)") }} + AS artifact_id FROM {{ ref('stg_ossd__current_projects') }} AS projects CROSS JOIN