Skip to content

Commit

Permalink
Wire the new ossd source from dagster (#1815)
Browse files Browse the repository at this point in the history
* Wire the new ossd source from dagster

* Fix repositories

* Remove more json references

* more fixes

* more fixes

* more fixes for incremental

* Clean up
  • Loading branch information
ravenac95 authored Jul 19, 2024
1 parent 58c4c46 commit 79823d5
Show file tree
Hide file tree
Showing 13 changed files with 53 additions and 75 deletions.
11 changes: 6 additions & 5 deletions warehouse/dbt/macros/models/first_time_addresses.sql
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,15 @@ from (
,min_by(`hash`, block_number) as first_tx_hash
,min_by(substring(input, 1, 10), block_number) as first_method_id
from {{ oso_source(network_name, "transactions") }}
where
gas_price > 0
and receipt_status = 1
and receipt_gas_used > 0
{% if is_incremental() %}
where {{ block_timestamp_column }} > TIMESTAMP_SUB(_dbt_max_partition, INTERVAL 1 DAY)
and {{ block_timestamp_column }} > TIMESTAMP_SUB(_dbt_max_partition, INTERVAL 1 DAY)
{% else %}
{{ playground_filter(block_timestamp_column, is_start=True) }}
{{ playground_filter(block_timestamp_column, is_start=False) }}
{% endif %}
and gas_price > 0
and receipt_status = 1
and receipt_gas_used > 0
group by 1, 2
)
{% endmacro %}
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
with filtered_collections as (
select distinct
collections.collection_name as `name`,
collections.sync_time as `sync_time`
collections.committed_time as `committed_time`
from {{ ref('stg_ossd__current_collections') }} as collections
cross join UNNEST(collections.projects) as project_name
inner join {{ ref('stg_ossd__current_projects') }} as projects
Expand All @@ -24,4 +24,4 @@ select collections.*
from {{ oso_source("ossd", "collections") }} as collections
inner join filtered_collections as filtered
on filtered.name = collections.name
and collections.sync_time = filtered.sync_time
and collections.committed_time = filtered.committed_time
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
with filtered_projects as (
select distinct
projects.project_name as `name`,
projects.sync_time as `sync_time`
projects.committed_time as `committed_time`
from {{ ref('stg_ossd__current_projects') }} as projects
inner join {{ ref("base_playground__project_filter") }} as filtered
on filtered.project_name = projects.project_name
Expand All @@ -22,4 +22,4 @@ select projects.*
from {{ oso_source("ossd", "projects") }} as projects
inner join filtered_projects as filtered
on filtered.name = projects.name
and projects.sync_time = filtered.sync_time
and projects.committed_time = filtered.committed_time
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ with filtered_project_ids as (
), filtered_repositories as (
select distinct
repos.id as `id`,
repos.sync_time as `sync_time`
repos.ingestion_time as `ingestion_time`
from {{ ref("stg_ossd__current_repositories") }} as repos
inner join {{ ref("int_artifacts_by_project") }} as artifacts_by_project
on CAST(repos.id as string) = artifacts_by_project.artifact_source_id
Expand All @@ -31,4 +31,4 @@ select
from {{ oso_source("ossd", "repositories") }} as repos
inner join filtered_repositories as filtered
on filtered.id = repos.id
and repos._cq_sync_time = filtered.sync_time
and repos.ingestion_time = filtered.ingestion_time
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ github_repos as (
CAST(repos.id as STRING) as artifact_source_id
from projects
cross join
UNNEST(JSON_QUERY_ARRAY(projects.github)) as github
UNNEST(projects.github) as github
inner join
{{ ref('stg_ossd__current_repositories') }} as repos
on
Expand All @@ -27,28 +27,28 @@ github_repos as (
The RTRIMs are to ensure we match even if there are trailing slashes
#}
LOWER(CONCAT("https://github.com/", repos.owner))
= LOWER(RTRIM(JSON_VALUE(github.url), "/"))
or LOWER(repos.url) = LOWER(RTRIM(JSON_VALUE(github.url), "/"))
= LOWER(RTRIM(github.url, "/"))
or LOWER(repos.url) = LOWER(RTRIM(github.url, "/"))
),

all_npm_raw as (
select
"NPM" as artifact_source,
"PACKAGE" as artifact_type,
projects.project_id,
JSON_VALUE(npm.url) as artifact_source_id,
npm.url as artifact_source_id,
npm.url as artifact_url,
case
when
JSON_VALUE(npm.url) like "https://npmjs.com/package/%"
then SUBSTR(JSON_VALUE(npm.url), 28)
npm.url like "https://npmjs.com/package/%"
then SUBSTR(npm.url, 28)
when
JSON_VALUE(npm.url) like "https://www.npmjs.com/package/%"
then SUBSTR(JSON_VALUE(npm.url), 31)
end as artifact_name,
JSON_VALUE(npm.url) as artifact_url
npm.url like "https://www.npmjs.com/package/%"
then SUBSTR(npm.url, 31)
end as artifact_name
from projects
cross join
UNNEST(JSON_QUERY_ARRAY(projects.npm)) as npm
UNNEST(projects.npm) as npm
),

all_npm as (
Expand All @@ -70,16 +70,16 @@ ossd_blockchain as (
tag as artifact_type,
network as artifact_namespace,
network as artifact_source,
JSON_VALUE(blockchains.address) as artifact_source_id,
JSON_VALUE(blockchains.address) as artifact_name,
JSON_VALUE(blockchains.address) as artifact_url
blockchains.address as artifact_source_id,
blockchains.address as artifact_name,
blockchains.address as artifact_url
from projects
cross join
UNNEST(JSON_QUERY_ARRAY(projects.blockchain)) as blockchains
UNNEST(projects.blockchain) as blockchains
cross join
UNNEST(JSON_VALUE_ARRAY(blockchains.networks)) as network
UNNEST(blockchains.networks) as network
cross join
UNNEST(JSON_VALUE_ARRAY(blockchains.tags)) as tag
UNNEST(blockchains.tags) as tag
),

all_artifacts as (
Expand Down
6 changes: 3 additions & 3 deletions warehouse/dbt/models/intermediate/directory/int_projects.sql
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ select
project_name,
display_name,
description,
ARRAY_LENGTH(JSON_EXTRACT_ARRAY(github))
ARRAY_LENGTH(github)
as github_artifact_count,
ARRAY_LENGTH(JSON_EXTRACT_ARRAY(blockchain))
ARRAY_LENGTH(blockchain)
as blockchain_artifact_count,
ARRAY_LENGTH(JSON_EXTRACT_ARRAY(npm))
ARRAY_LENGTH(npm)
as npm_artifact_count
from {{ ref('stg_ossd__current_projects') }}
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@ oso_blockchain_artifacts as (
'oso_verification' as discovery_method,
UPPER(tag) as artifact_type,
UPPER(network) as network,
LOWER(JSON_VALUE(blockchains.address)) as address
LOWER(blockchains.address) as address
from projects
cross join
UNNEST(JSON_QUERY_ARRAY(projects.blockchain)) as blockchains
UNNEST(projects.blockchain) as blockchains
cross join
UNNEST(JSON_VALUE_ARRAY(blockchains.networks)) as network
UNNEST(blockchains.networks) as network
cross join
UNNEST(JSON_VALUE_ARRAY(blockchains.tags)) as tag
UNNEST(blockchains.tags) as tag
where tag in ('contract', 'deployer')
),

Expand Down
17 changes: 4 additions & 13 deletions warehouse/dbt/models/oss_directory_source.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,14 @@ sources:
- name: ossd
database: opensource-observer

schema: oso
schema: ossd
tables:
- name: projects
identifier: |
{%- if target.name == 'production' -%} projects_ossd
{%- else -%} base_playground__ossd_projects
{%- endif -%}
identifier: projects
- name: collections
identifier: |
{%- if target.name == 'production' -%} collections_ossd
{%- else -%} base_playground__ossd_collections
{%- endif -%}
identifier: collections
- name: repositories
identifier: |
{%- if target.name == 'production' -%} repositories_ossd
{%- else -%} base_playground__ossd_repositories
{%- endif -%}
identifier: repositories



Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
{#
The most recent view of collections from the ossd cloudquery plugin.
#}
with most_recent_sync as (
select MAX(_cq_sync_time) as sync_time
from {{ oso_source('ossd', 'collections') }}
)

select
{#
Expand All @@ -19,6 +15,6 @@ select
collections.display_name,
collections.description,
collections.projects,
collections.sync_time
collections.sha,
collections.committed_time
from {{ oso_source('ossd', 'collections') }} as collections
where _cq_sync_time = (select * from most_recent_sync)
Original file line number Diff line number Diff line change
@@ -1,11 +1,6 @@
{#
The most recent view of projects from the ossd cloudquery plugin.
#}
with most_recent_sync as (
select MAX(_cq_sync_time) as sync_time
from {{ oso_source('ossd', 'projects') }}
)

select
{#
id is the SHA256 of namespace + slug. We hardcode our namespace
Expand All @@ -21,6 +16,6 @@ select
projects.github,
projects.npm,
projects.blockchain,
projects.sync_time
projects.sha,
projects.committed_time
from {{ oso_source('ossd', 'projects') }} as projects
where _cq_sync_time = (select * from most_recent_sync)
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
The most recent view of repositories from the github-resolve-repos cloudquery plugin.
#}
with most_recent_sync as (
select MAX(_cq_sync_time) as sync_time
select MAX(ingestion_time) as ingestion_time
from {{ oso_source('ossd', 'repositories') }}
)

Expand All @@ -21,6 +21,6 @@ select
repositories.license_name,
repositories.license_spdx_id,
repositories.language,
repositories._cq_sync_time as `sync_time`
repositories.ingestion_time
from {{ oso_source('ossd', 'repositories') }} as repositories
where _cq_sync_time = (select * from most_recent_sync)
where repositories.ingestion_time = (select * from most_recent_sync)
15 changes: 4 additions & 11 deletions warehouse/dbt/models/staging/oss-directory/stg_ossd__schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,9 @@ models:
- &namespace
name: namespace
description: "namespace"
- &_cq_sync_time
name: _cq_sync_time
description: "sync time"
- &_cq_source_name
name: _cq_source_name
description: "source name"
- &committed_sha
name: committed_sha
description: "the oss-directory sha"
- &slug
name: slug
description: "slug"
Expand All @@ -68,8 +65,7 @@ models:
- name: id
description: "project unique id - SHA256 hash derived from namespace and slug"
- *namespace
- *_cq_sync_time
- *_cq_source_name
- *committed_sha
- *slug
- name: name
description: "project name"
Expand Down Expand Up @@ -101,9 +97,6 @@ models:
tags: ['staging', 'oss-directory', 'repository']
description: "GitHub repositories"
columns:
- *_cq_sync_time
- name: _cq_source_name
description: "source name"
- &node_id
name: node_id
description: "node id"
Expand Down
2 changes: 2 additions & 0 deletions warehouse/oso_dagster/dlt_sources/github_repos/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ class Repository(BaseModel):
ingestion_time: Optional[datetime]
id: int
node_id: str
owner: str
name_with_owner: str
url: str
name: str
Expand Down Expand Up @@ -118,6 +119,7 @@ def gh_repository_to_repository(
node_id=repo.node_id,
name_with_owner=repo.full_name,
name=repo.name,
owner=repo.owner.login,
branch=repo.default_branch or "main",
star_count=repo.stargazers_count or 0,
watcher_count=repo.watchers_count or 0,
Expand Down

0 comments on commit 79823d5

Please sign in to comment.