From fc7e6457e883b1374c496e0f526360e353131593 Mon Sep 17 00:00:00 2001 From: Carl Cervone <42869436+ccerv1@users.noreply.github.com> Date: Fri, 19 Jul 2024 17:37:59 -0400 Subject: [PATCH] fix: duping of repo stats (#1825) * fix: duping of repo stats * fix: dedupe repos in staging * chore: add all columns --- .../metrics/int_repo_metrics_by_project.sql | 40 ++++++++----- .../stg_ossd__current_repositories.sql | 56 ++++++++++++------- 2 files changed, 62 insertions(+), 34 deletions(-) diff --git a/warehouse/dbt/models/intermediate/metrics/int_repo_metrics_by_project.sql b/warehouse/dbt/models/intermediate/metrics/int_repo_metrics_by_project.sql index b3c592e30..875707e02 100644 --- a/warehouse/dbt/models/intermediate/metrics/int_repo_metrics_by_project.sql +++ b/warehouse/dbt/models/intermediate/metrics/int_repo_metrics_by_project.sql @@ -1,7 +1,7 @@ -{{ +{{ config( materialized='table' - ) + ) }} with repo_artifact as ( @@ -20,7 +20,7 @@ with repo_artifact as ( ), repo_snapshot as ( - select + select distinct {{ oso_id("a.artifact_source", "a.artifact_source_id") }} as `artifact_id`, artifact_namespace, artifact_name, @@ -47,15 +47,28 @@ repo_stats as ( group by project_id, to_artifact_id -) +), +artifacts_project as ( + select distinct + project_id, + artifact_id, + artifact_namespace, + artifact_name, + artifact_source, + artifact_type + from {{ ref('int_artifacts_in_ossd_by_project') }} + where + artifact_source = 'GITHUB' + and artifact_type = 'REPOSITORY' +) select - int_artifacts_in_ossd_by_project.project_id, - int_artifacts_in_ossd_by_project.artifact_id, - int_artifacts_in_ossd_by_project.artifact_namespace, - int_artifacts_in_ossd_by_project.artifact_name, - int_artifacts_in_ossd_by_project.artifact_source, + artifacts_project.project_id, + artifacts_project.artifact_id, + artifacts_project.artifact_namespace, + artifacts_project.artifact_name, + artifacts_project.artifact_source, repo_snapshot.is_fork, repo_snapshot.fork_count, repo_snapshot.star_count, @@ -67,11 +80,8 @@ select repo_stats.days_with_commits_count, repo_stats.contributors_to_repo_count, repo_stats.commit_count -from {{ ref('int_artifacts_in_ossd_by_project') }} +from artifacts_project left join repo_snapshot - on int_artifacts_in_ossd_by_project.artifact_id = repo_snapshot.artifact_id + on artifacts_project.artifact_id = repo_snapshot.artifact_id left join repo_stats - on int_artifacts_in_ossd_by_project.artifact_id = repo_stats.artifact_id -where - int_artifacts_in_ossd_by_project.artifact_source = 'GITHUB' - and int_artifacts_in_ossd_by_project.artifact_type = 'REPOSITORY' + on artifacts_project.artifact_id = repo_stats.artifact_id diff --git a/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_repositories.sql b/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_repositories.sql index a25b46006..7ac6e88c0 100644 --- a/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_repositories.sql +++ b/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_repositories.sql @@ -1,26 +1,44 @@ {# The most recent view of repositories from the github-resolve-repos cloudquery plugin. #} -with most_recent_sync as ( - select MAX(ingestion_time) as ingestion_time +with ranked_repositories as ( + select + node_id, + id, + url, + name, + name_with_owner, + owner, + branch, + star_count, + watcher_count, + fork_count, + is_fork, + license_name, + license_spdx_id, + language, + ingestion_time, + ROW_NUMBER() + over (partition by node_id order by ingestion_time desc, id asc) + as row_num from {{ oso_source('ossd', 'repositories') }} ) select - repositories.node_id, - repositories.id, - repositories.url, - repositories.name, - repositories.name_with_owner, - repositories.owner, - repositories.branch, - repositories.star_count, - repositories.watcher_count, - repositories.fork_count, - repositories.is_fork, - repositories.license_name, - repositories.license_spdx_id, - repositories.language, - repositories.ingestion_time -from {{ oso_source('ossd', 'repositories') }} as repositories -where repositories.ingestion_time = (select * from most_recent_sync) + node_id, + id, + url, + name, + name_with_owner, + owner, + branch, + star_count, + watcher_count, + fork_count, + is_fork, + license_name, + license_spdx_id, + language, + ingestion_time +from ranked_repositories +where row_num = 1