Skip to content

Commit

Permalink
Add all artifact fields to package ownership model (#2545)
Browse files Browse the repository at this point in the history
* fix(dbt): update case types and lookup remote artifact ids

* add: support for `git://` urls to `parse_npm_git_url` macro

---------

Co-authored-by: Javier <[email protected]>
  • Loading branch information
ccerv1 and Jabolol authored Nov 28, 2024
1 parent 55c1232 commit 24c4b32
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 54 deletions.
41 changes: 26 additions & 15 deletions warehouse/dbt/macros/models/parse_npm_git_url.sql
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ normalized_urls as (
regexp_replace(cleaned_url, r'^git@(.*?):', 'https://\\1/')
when regexp_contains(cleaned_url, r'^git\+https://') then
regexp_replace(cleaned_url, r'^git\+', '')
when regexp_contains(cleaned_url, r'^git://') then
regexp_replace(cleaned_url, r'^git://', 'https://')
when regexp_contains(cleaned_url, r'^[^:/]+\.[^:/]+/') then
concat('https://', cleaned_url)
when regexp_contains(cleaned_url, r'^https?://') then
Expand All @@ -35,29 +37,38 @@ normalized_urls as (

parsed_data as (
select
`name`,
artifact_url,
normalized_url,
regexp_extract(normalized_url, r'https?://([^/]+)/') as remote_host,
regexp_extract(normalized_url, r'https?://[^/]+/([^/]+)/') as remote_namespace,
regexp_extract(normalized_url, r'https?://[^/]+/[^/]+/([^/.]+)') as remote_name
lower(`name`) as artifact_name,
lower(regexp_extract(`name`, r'@?([^/]+)')) as artifact_namespace,
lower(artifact_url) as artifact_url,
lower(normalized_url) as normalized_url,
lower(regexp_extract(normalized_url, r'https?://([^/]+)/'))
as remote_artifact_host,
lower(regexp_extract(normalized_url, r'https?://[^/]+/([^/]+)/'))
as remote_artifact_namespace,
lower(regexp_extract(normalized_url, r'https?://[^/]+/[^/]+/([^/.]+)'))
as remote_artifact_name
from normalized_urls
),

final_data as (
select
`name`,
'NPM' as artifact_source,
artifact_name,
artifact_namespace,
artifact_url,
concat('https://', remote_host, '/', remote_namespace, '/', remote_name, '.git') as remote_url,
remote_host,
remote_namespace,
remote_name,
concat(
'https://', remote_artifact_host, '/', remote_artifact_namespace,
'/', remote_artifact_name, '.git'
) as remote_artifact_url,
remote_artifact_host,
remote_artifact_namespace,
remote_artifact_name,
case
when lower(remote_host) like 'github.com%' then 'GITHUB'
when lower(remote_host) like 'gitlab.com%' then 'GITLAB'
when lower(remote_host) like 'bitbucket.org%' then 'BITBUCKET'
when remote_artifact_host like 'github.com%' then 'GITHUB'
when remote_artifact_host like 'gitlab.com%' then 'GITLAB'
when remote_artifact_host like 'bitbucket.org%' then 'BITBUCKET'
else 'OTHER'
end as remote_source_id
end as remote_artifact_source
from parsed_data
)

Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
{#
  NPM package ownership.

  For every NPM package listed in artifacts_v1, read the repository URL
  declared in its manifest, parse it into host / namespace / name with the
  parse_npm_git_url macro, and look up the artifact_id of the repository that
  the package points at.

  remote_artifact_id is null when the declared repository has no matching
  row in int_all_artifacts.
#}
with npm_artifacts as (
  select artifact_name
  from {{ ref('artifacts_v1') }}
  where artifact_source = 'NPM'
),

{# Manifests of known NPM packages that declare a repository URL. #}
npm_manifests as (
  select
    `name`,
    json_value(repository, '$.url') as manifest_repository_url,
    json_value(repository, '$.type') as manifest_repository_type,
    concat('https://www.npmjs.com/package/', `name`) as artifact_url
  from {{ ref('stg_npm__manifests') }}
  where
    {# Explicit column instead of `select *` so the filter keeps working if npm_artifacts gains columns. #}
    `name` in (select npm_artifacts.artifact_name from npm_artifacts)
    and json_value(repository, '$.url') is not null
),

{# Normalizes the manifest URL and splits it into remote_artifact_* fields. #}
npm_repository_urls as (
  {{ parse_npm_git_url('manifest_repository_url', 'npm_manifests') }}
),

npm_artifact_ownership as (
  select
    {{ oso_id(
      "artifact_source",
      "artifact_url",
    ) }} as artifact_id,
    artifact_url as artifact_source_id,
    artifact_source,
    artifact_namespace,
    artifact_name,
    artifact_url,
    remote_artifact_source,
    remote_artifact_namespace,
    remote_artifact_name,
    remote_artifact_url
  from npm_repository_urls
),

{#
  Lookup keys for remote artifacts.
  NOTE(review): int_all_artifacts is not visible from this model; if it holds
  more than one row per artifact, joining it directly would multiply package
  rows. Projecting only the distinct keys needed for the lookup guards
  against that. Confirm against the upstream model.
#}
remote_artifacts as (
  select distinct
    artifact_id,
    artifact_source,
    artifact_namespace,
    artifact_name
  from {{ ref('int_all_artifacts') }}
)

select
  npm_artifact_ownership.artifact_id,
  npm_artifact_ownership.artifact_source_id,
  npm_artifact_ownership.artifact_source,
  npm_artifact_ownership.artifact_namespace,
  npm_artifact_ownership.artifact_name,
  npm_artifact_ownership.artifact_url,
  {#
    Because we use repo.id as the artifact_source_id for GitHub, the remote
    artifact_id cannot be derived from the URL and has to be looked up.
    If the artifact is not found, this will be null.
  #}
  remote_artifacts.artifact_id as remote_artifact_id,
  npm_artifact_ownership.remote_artifact_source,
  npm_artifact_ownership.remote_artifact_namespace,
  npm_artifact_ownership.remote_artifact_name,
  npm_artifact_ownership.remote_artifact_url
from npm_artifact_ownership
left join remote_artifacts
  on
    {#
      Match on source as well as namespace/name: the same namespace/name pair
      can exist on several hosts (or as a non-repository artifact), and
      without this condition the lookup could return the wrong artifact_id.
      NOTE(review): assumes int_all_artifacts.artifact_source uses the same
      upper-case labels ('GITHUB', 'GITLAB', ...) that the macro emits.
    #}
    npm_artifact_ownership.remote_artifact_source = remote_artifacts.artifact_source
    and npm_artifact_ownership.remote_artifact_namespace = remote_artifacts.artifact_namespace
    and npm_artifact_ownership.remote_artifact_name = remote_artifacts.artifact_name

0 comments on commit 24c4b32

Please sign in to comment.