From 79823d5c16f9a0a0da8a3da44f2c65ab748a445c Mon Sep 17 00:00:00 2001 From: Reuven Gonzales Date: Fri, 19 Jul 2024 00:39:50 -0700 Subject: [PATCH] Wire the new ossd source from dagster (#1815) * Wire the new ossd source from dagster * Fix repositories * Remove more json references * more fixes * more fixes * more fixes for incremental * Clean up --- .../macros/models/first_time_addresses.sql | 11 +++--- .../base_playground__ossd_collections.sql | 4 +-- .../base_playground__ossd_projects.sql | 4 +-- .../base_playground__ossd_repositories.sql | 4 +-- .../int_artifacts_in_ossd_by_project.sql | 34 +++++++++---------- .../intermediate/directory/int_projects.sql | 6 ++-- .../rf4_oso_contract_discovery.sql | 8 ++--- warehouse/dbt/models/oss_directory_source.yml | 17 +++------- .../stg_ossd__current_collections.sql | 8 ++--- .../stg_ossd__current_projects.sql | 9 ++--- .../stg_ossd__current_repositories.sql | 6 ++-- .../oss-directory/stg_ossd__schema.yml | 15 +++----- .../dlt_sources/github_repos/__init__.py | 2 ++ 13 files changed, 53 insertions(+), 75 deletions(-) diff --git a/warehouse/dbt/macros/models/first_time_addresses.sql b/warehouse/dbt/macros/models/first_time_addresses.sql index 271a02161..3137c1052 100644 --- a/warehouse/dbt/macros/models/first_time_addresses.sql +++ b/warehouse/dbt/macros/models/first_time_addresses.sql @@ -22,14 +22,15 @@ from ( ,min_by(`hash`, block_number) as first_tx_hash ,min_by(substring(input, 1, 10), block_number) as first_method_id from {{ oso_source(network_name, "transactions") }} + where + gas_price > 0 + and receipt_status = 1 + and receipt_gas_used > 0 {% if is_incremental() %} - where {{ block_timestamp_column }} > TIMESTAMP_SUB(_dbt_max_partition, INTERVAL 1 DAY) + and {{ block_timestamp_column }} > TIMESTAMP_SUB(_dbt_max_partition, INTERVAL 1 DAY) {% else %} - {{ playground_filter(block_timestamp_column, is_start=True) }} + {{ playground_filter(block_timestamp_column, is_start=False) }} {% endif %} - and gas_price > 0 - and receipt_status = 1 - and receipt_gas_used > 0 group by 1, 2 ) {% endmacro %} diff --git a/warehouse/dbt/models/base_playground/base_playground__ossd_collections.sql b/warehouse/dbt/models/base_playground/base_playground__ossd_collections.sql index 7b5520ee9..4161a22ee 100644 --- a/warehouse/dbt/models/base_playground/base_playground__ossd_collections.sql +++ b/warehouse/dbt/models/base_playground/base_playground__ossd_collections.sql @@ -12,7 +12,7 @@ with filtered_collections as ( select distinct collections.collection_name as `name`, - collections.sync_time as `sync_time` + collections.committed_time as `committed_time` from {{ ref('stg_ossd__current_collections') }} as collections cross join UNNEST(collections.projects) as project_name inner join {{ ref('stg_ossd__current_projects') }} as projects @@ -24,4 +24,4 @@ select collections.* from {{ oso_source("ossd", "collections") }} as collections inner join filtered_collections as filtered on filtered.name = collections.name - and collections.sync_time = filtered.sync_time \ No newline at end of file + and collections.committed_time = filtered.committed_time \ No newline at end of file diff --git a/warehouse/dbt/models/base_playground/base_playground__ossd_projects.sql b/warehouse/dbt/models/base_playground/base_playground__ossd_projects.sql index 0eaffdce2..6bbd350da 100644 --- a/warehouse/dbt/models/base_playground/base_playground__ossd_projects.sql +++ b/warehouse/dbt/models/base_playground/base_playground__ossd_projects.sql @@ -12,7 +12,7 @@ with filtered_projects as ( select distinct projects.project_name as `name`, - projects.sync_time as `sync_time` + projects.committed_time as `committed_time` from {{ ref('stg_ossd__current_projects') }} as projects inner join {{ ref("base_playground__project_filter") }} as filtered on filtered.project_name = projects.project_name @@ -22,4 +22,4 @@ select projects.* from {{ oso_source("ossd", "projects") }} as projects inner join filtered_projects as filtered on filtered.name = projects.name - and projects.sync_time = filtered.sync_time \ No newline at end of file + and projects.committed_time = filtered.committed_time \ No newline at end of file diff --git a/warehouse/dbt/models/base_playground/base_playground__ossd_repositories.sql b/warehouse/dbt/models/base_playground/base_playground__ossd_repositories.sql index 4bdcb8c07..cf696fdf7 100644 --- a/warehouse/dbt/models/base_playground/base_playground__ossd_repositories.sql +++ b/warehouse/dbt/models/base_playground/base_playground__ossd_repositories.sql @@ -18,7 +18,7 @@ with filtered_project_ids as ( ), filtered_repositories as ( select distinct repos.id as `id`, - repos.sync_time as `sync_time` + repos.ingestion_time as `ingestion_time` from {{ ref("stg_ossd__current_repositories") }} as repos inner join {{ ref("int_artifacts_by_project") }} as artifacts_by_project on CAST(repos.id as string) = artifacts_by_project.artifact_source_id @@ -31,4 +31,4 @@ select from {{ oso_source("ossd", "repositories") }} as repos inner join filtered_repositories as filtered on filtered.id = repos.id - and repos._cq_sync_time = filtered.sync_time \ No newline at end of file + and repos.ingestion_time = filtered.ingestion_time \ No newline at end of file diff --git a/warehouse/dbt/models/intermediate/directory/int_artifacts_in_ossd_by_project.sql b/warehouse/dbt/models/intermediate/directory/int_artifacts_in_ossd_by_project.sql index 1fed73204..f79538ffe 100644 --- a/warehouse/dbt/models/intermediate/directory/int_artifacts_in_ossd_by_project.sql +++ b/warehouse/dbt/models/intermediate/directory/int_artifacts_in_ossd_by_project.sql @@ -18,7 +18,7 @@ github_repos as ( CAST(repos.id as STRING) as artifact_source_id from projects cross join - UNNEST(JSON_QUERY_ARRAY(projects.github)) as github + UNNEST(projects.github) as github inner join {{ ref('stg_ossd__current_repositories') }} as repos on @@ -27,8 +27,8 @@ github_repos as ( The RTRIMs are to ensure we match even if there are trailing slashes #} LOWER(CONCAT("https://github.com/", repos.owner)) - = LOWER(RTRIM(JSON_VALUE(github.url), "/")) - or LOWER(repos.url) = LOWER(RTRIM(JSON_VALUE(github.url), "/")) + = LOWER(RTRIM(github.url, "/")) + or LOWER(repos.url) = LOWER(RTRIM(github.url, "/")) ), all_npm_raw as ( @@ -36,19 +36,19 @@ all_npm_raw as ( "NPM" as artifact_source, "PACKAGE" as artifact_type, projects.project_id, - JSON_VALUE(npm.url) as artifact_source_id, + npm.url as artifact_source_id, + npm.url as artifact_url, case when - JSON_VALUE(npm.url) like "https://npmjs.com/package/%" - then SUBSTR(JSON_VALUE(npm.url), 28) + npm.url like "https://npmjs.com/package/%" + then SUBSTR(npm.url, 28) when - JSON_VALUE(npm.url) like "https://www.npmjs.com/package/%" - then SUBSTR(JSON_VALUE(npm.url), 31) - end as artifact_name, - JSON_VALUE(npm.url) as artifact_url + npm.url like "https://www.npmjs.com/package/%" + then SUBSTR(npm.url, 31) + end as artifact_name from projects cross join - UNNEST(JSON_QUERY_ARRAY(projects.npm)) as npm + UNNEST(projects.npm) as npm ), all_npm as ( @@ -70,16 +70,16 @@ ossd_blockchain as ( tag as artifact_type, network as artifact_namespace, network as artifact_source, - JSON_VALUE(blockchains.address) as artifact_source_id, - JSON_VALUE(blockchains.address) as artifact_name, - JSON_VALUE(blockchains.address) as artifact_url + blockchains.address as artifact_source_id, + blockchains.address as artifact_name, + blockchains.address as artifact_url from projects cross join - UNNEST(JSON_QUERY_ARRAY(projects.blockchain)) as blockchains + UNNEST(projects.blockchain) as blockchains cross join - UNNEST(JSON_VALUE_ARRAY(blockchains.networks)) as network + UNNEST(blockchains.networks) as network cross join - UNNEST(JSON_VALUE_ARRAY(blockchains.tags)) as tag + UNNEST(blockchains.tags) as tag ), all_artifacts as ( diff --git a/warehouse/dbt/models/intermediate/directory/int_projects.sql b/warehouse/dbt/models/intermediate/directory/int_projects.sql index 28af6ff61..e1f7c248e 100644 --- a/warehouse/dbt/models/intermediate/directory/int_projects.sql +++ b/warehouse/dbt/models/intermediate/directory/int_projects.sql @@ -5,10 +5,10 @@ select project_name, display_name, description, - ARRAY_LENGTH(JSON_EXTRACT_ARRAY(github)) + ARRAY_LENGTH(github) as github_artifact_count, - ARRAY_LENGTH(JSON_EXTRACT_ARRAY(blockchain)) + ARRAY_LENGTH(blockchain) as blockchain_artifact_count, - ARRAY_LENGTH(JSON_EXTRACT_ARRAY(npm)) + ARRAY_LENGTH(npm) as npm_artifact_count from {{ ref('stg_ossd__current_projects') }} diff --git a/warehouse/dbt/models/marts/superchain/verification/rf4_oso_contract_discovery.sql b/warehouse/dbt/models/marts/superchain/verification/rf4_oso_contract_discovery.sql index db71a987c..ed21d019d 100644 --- a/warehouse/dbt/models/marts/superchain/verification/rf4_oso_contract_discovery.sql +++ b/warehouse/dbt/models/marts/superchain/verification/rf4_oso_contract_discovery.sql @@ -16,14 +16,14 @@ oso_blockchain_artifacts as ( 'oso_verification' as discovery_method, UPPER(tag) as artifact_type, UPPER(network) as network, - LOWER(JSON_VALUE(blockchains.address)) as address + LOWER(blockchains.address) as address from projects cross join - UNNEST(JSON_QUERY_ARRAY(projects.blockchain)) as blockchains + UNNEST(projects.blockchain) as blockchains cross join - UNNEST(JSON_VALUE_ARRAY(blockchains.networks)) as network + UNNEST(blockchains.networks) as network cross join - UNNEST(JSON_VALUE_ARRAY(blockchains.tags)) as tag + UNNEST(blockchains.tags) as tag where tag in ('contract', 'deployer') ), diff --git a/warehouse/dbt/models/oss_directory_source.yml b/warehouse/dbt/models/oss_directory_source.yml index 29f27c591..781543df4 100644 --- a/warehouse/dbt/models/oss_directory_source.yml +++ b/warehouse/dbt/models/oss_directory_source.yml @@ -2,23 +2,14 @@ sources: - name: ossd database: opensource-observer - schema: oso + schema: ossd tables: - name: projects - identifier: | - {%- if target.name == 'production' -%} projects_ossd - {%- else -%} base_playground__ossd_projects - {%- endif -%} + identifier: projects - name: collections - identifier: | - {%- if target.name == 'production' -%} collections_ossd - {%- else -%} base_playground__ossd_collections - {%- endif -%} + identifier: collections - name: repositories - identifier: | - {%- if target.name == 'production' -%} repositories_ossd - {%- else -%} base_playground__ossd_repositories - {%- endif -%} + identifier: repositories diff --git a/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_collections.sql b/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_collections.sql index b9fd80001..bbd1f4843 100644 --- a/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_collections.sql +++ b/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_collections.sql @@ -1,10 +1,6 @@ {# The most recent view of collections from the ossd cloudquery plugin. #} -with most_recent_sync as ( - select MAX(_cq_sync_time) as sync_time - from {{ oso_source('ossd', 'collections') }} -) select {# @@ -19,6 +15,6 @@ select collections.display_name, collections.description, collections.projects, - collections.sync_time + collections.sha, + collections.committed_time from {{ oso_source('ossd', 'collections') }} as collections -where _cq_sync_time = (select * from most_recent_sync) diff --git a/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_projects.sql b/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_projects.sql index 508f28565..95a145bc9 100644 --- a/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_projects.sql +++ b/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_projects.sql @@ -1,11 +1,6 @@ {# The most recent view of projects from the ossd cloudquery plugin. #} -with most_recent_sync as ( - select MAX(_cq_sync_time) as sync_time - from {{ oso_source('ossd', 'projects') }} -) - select {# id is the SHA256 of namespace + slug. We hardcode our namespace @@ -21,6 +16,6 @@ select projects.github, projects.npm, projects.blockchain, - projects.sync_time + projects.sha, + projects.committed_time from {{ oso_source('ossd', 'projects') }} as projects -where _cq_sync_time = (select * from most_recent_sync) diff --git a/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_repositories.sql b/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_repositories.sql index 3509d5073..a25b46006 100644 --- a/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_repositories.sql +++ b/warehouse/dbt/models/staging/oss-directory/stg_ossd__current_repositories.sql @@ -2,7 +2,7 @@ The most recent view of repositories from the github-resolve-repos cloudquery plugin. #} with most_recent_sync as ( - select MAX(_cq_sync_time) as sync_time + select MAX(ingestion_time) as ingestion_time from {{ oso_source('ossd', 'repositories') }} ) @@ -21,6 +21,6 @@ select repositories.license_name, repositories.license_spdx_id, repositories.language, - repositories._cq_sync_time as `sync_time` + repositories.ingestion_time from {{ oso_source('ossd', 'repositories') }} as repositories -where _cq_sync_time = (select * from most_recent_sync) +where repositories.ingestion_time = (select * from most_recent_sync) diff --git a/warehouse/dbt/models/staging/oss-directory/stg_ossd__schema.yml b/warehouse/dbt/models/staging/oss-directory/stg_ossd__schema.yml index e69faa35e..e8ac9c4fe 100644 --- a/warehouse/dbt/models/staging/oss-directory/stg_ossd__schema.yml +++ b/warehouse/dbt/models/staging/oss-directory/stg_ossd__schema.yml @@ -38,12 +38,9 @@ models: - &namespace name: namespace description: "namespace" - - &_cq_sync_time - name: _cq_sync_time - description: "sync time" - - &_cq_source_name - name: _cq_source_name - description: "source name" + - &committed_sha + name: committed_sha + description: "the oss-directory sha" - &slug name: slug description: "slug" @@ -68,8 +65,7 @@ models: - name: id description: "project unique id - SHA256 hash derived from namespace and slug" - *namespace - - *_cq_sync_time - - *_cq_source_name + - *committed_sha - *slug - name: name description: "project name" @@ -101,9 +97,6 @@ models: tags: ['staging', 'oss-directory', 'repository'] description: "GitHub repositories" columns: - - *_cq_sync_time - - name: _cq_source_name - description: "source name" - &node_id name: node_id description: "node id" diff --git a/warehouse/oso_dagster/dlt_sources/github_repos/__init__.py b/warehouse/oso_dagster/dlt_sources/github_repos/__init__.py index 521c1c849..d442db439 100644 --- a/warehouse/oso_dagster/dlt_sources/github_repos/__init__.py +++ b/warehouse/oso_dagster/dlt_sources/github_repos/__init__.py @@ -48,6 +48,7 @@ class Repository(BaseModel): ingestion_time: Optional[datetime] id: int node_id: str + owner: str name_with_owner: str url: str name: str @@ -118,6 +119,7 @@ def gh_repository_to_repository( node_id=repo.node_id, name_with_owner=repo.full_name, name=repo.name, + owner=repo.owner.login, branch=repo.default_branch or "main", star_count=repo.stargazers_count or 0, watcher_count=repo.watchers_count or 0,