add: github repo dependencies dlt asset (#2397)
* add: dagster `github_sbom_resource` asset

* add: `sbom` manifests for `ossd` repositories

* add: `missing_sbom` staging model

* add: `missing_sbom` dagster asset
Jabolol authored Oct 24, 2024
1 parent 1cbe2c4 commit ed1d399
Showing 5 changed files with 242 additions and 49 deletions.
8 changes: 2 additions & 6 deletions warehouse/dbt/models/oss_directory_source.yml
@@ -10,9 +10,5 @@ sources:
identifier: collections
- name: repositories
identifier: repositories
- name: missing_sbom
identifier: missing_sbom
22 changes: 22 additions & 0 deletions warehouse/dbt/models/staging/oss-directory/stg_ossd__missing_sbom.sql
@@ -0,0 +1,22 @@
{{ config(
materialized = 'view'
) }}

with source as (
select *
from {{ source('ossd', 'missing_sbom') }}
),

current_dlt_load_id as (
select max(_dlt_load_id) as max_dlt_load_id
from source
),

last_snapshot as (
select *
from source
where _dlt_load_id = (select max_dlt_load_id from current_dlt_load_id)
)

select *
from last_snapshot
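
The view keeps only the rows written by the most recent dlt load. For intuition, here is the same latest-load filter sketched over a local polars frame — toy data only, assuming the `_dlt_load_id` column that dlt stamps on every load:

import polars as pl

# Toy stand-in for the raw `missing_sbom` table that dlt appends to on each run.
raw = pl.DataFrame(
    {
        "_dlt_load_id": ["1729700000.01", "1729700000.01", "1729786400.02"],
        "artifact_name": ["repo-a", "repo-b", "repo-a"],
    }
)

# Keep only the rows from the most recent load, mirroring the staging view above.
latest = raw.filter(pl.col("_dlt_load_id") == pl.col("_dlt_load_id").max())
print(latest)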
37 changes: 28 additions & 9 deletions warehouse/dbt/models/staging/oss-directory/stg_ossd__schema.yml
@@ -6,19 +6,23 @@ models:
#...
contributors: oso-team, tomfutago
config:
tags: ['oss-directory', 'artifact', 'project']
tags: ["oss-directory", "artifact", "project"]
description: "Artifacts by associated list of projects"
columns:
- &project_id
name: project_id
description: "project id"
- name: artifact_namespace
- &artifact_namespace
name: artifact_namespace
description: "artifact namespace"
- name: artifact_type
- &artifact_type
name: artifact_type
description: "artifact type"
- name: artifact_name
- &artifact_name
name: artifact_name
description: "artifact name (e.g. GitHub repo, npm package url, blockchain address)"
- name: artifact_url
- &artifact_url
name: artifact_url
description: "artifact url"
- name: artifact_source_id
description: "artifact source id (e.g. internal GitHub repo ID, npm package url, blockchain address)"
@@ -30,7 +34,7 @@ models:
#...
contributors: oso-team, tomfutago
config:
tags: ['staging', 'oss-directory', 'collection']
tags: ["staging", "oss-directory", "collection"]
description: "Collections"
columns:
- name: id
@@ -59,7 +63,7 @@ models:
#...
contributors: oso-team, tomfutago
config:
tags: ['staging', 'oss-directory', 'project']
tags: ["staging", "oss-directory", "project"]
description: "Projects"
columns:
- name: id
@@ -82,7 +86,7 @@ models:
#...
contributors: oso-team, tomfutago
config:
tags: ['staging', 'oss-directory', 'project', 'collection']
tags: ["staging", "oss-directory", "project", "collection"]
description: "Projects by collection (referencial list of IDs)"
columns:
- name: collection_id
@@ -94,7 +98,7 @@ models:
#...
contributors: oso-team, tomfutago
config:
tags: ['staging', 'oss-directory', 'repository']
tags: ["staging", "oss-directory", "repository"]
description: "GitHub repositories"
columns:
- &node_id
@@ -130,3 +134,18 @@ models:
- &is_fork
name: is_fork
description: "is this repo a fork?"
- name: stg_ossd__missing_sbom
meta:
#...
contributors: oso-team
config:
tags: ["staging", "oss-directory", "sbom"]
description: "Projects that are missing SBOMs"
columns:
- *artifact_namespace
- *artifact_name
- name: artifact_source
description: "artifact source, currently only GITHUB"
- *artifact_url
- name: snapshot_at
description: "snapshot time"
26 changes: 26 additions & 0 deletions warehouse/oso_dagster/assets/ossd.py
@@ -16,6 +16,8 @@
)
from oso_dagster.dlt_sources.github_repos import (
oss_directory_github_repositories_resource,
oss_directory_github_sbom_resource,
oss_directory_missing_sbom_repositories_resource,
)
from oso_dagster.factories import dlt_factory
from oso_dagster.factories.common import AssetFactoryResponse
@@ -147,6 +149,30 @@ def repositories(
yield oss_directory_github_repositories_resource(projects_df, gh_token)


@dlt_factory(
key_prefix="ossd",
ins={"projects_df": AssetIn(project_key)},
tags=common_tags,
)
def sbom(
projects_df: pl.DataFrame,
gh_token: str = secret_ref_arg(group_name="ossd", key="github_token"),
):
yield oss_directory_github_sbom_resource(projects_df, gh_token)


@dlt_factory(
key_prefix="ossd",
ins={"projects_df": AssetIn(project_key)},
tags=common_tags,
)
def missing_sbom(
projects_df: pl.DataFrame,
gh_token: str = secret_ref_arg(group_name="ossd", key="github_token"),
):
yield oss_directory_missing_sbom_repositories_resource(projects_df, gh_token)


@discoverable_jobs(dependencies=[repositories])
def ossd_jobs(dependencies: t.List[AssetFactoryResponse]):
repositories = t.cast(AssetsDefinition, list(dependencies[0].assets)[0])
198 changes: 164 additions & 34 deletions warehouse/oso_dagster/dlt_sources/github_repos/__init__.py
@@ -3,14 +3,13 @@
from datetime import datetime, UTC
from dataclasses import dataclass
from enum import Enum
from typing import Optional, Iterable, cast, ParamSpec, Any
from pathlib import Path
from typing import List, Optional, Iterable, cast, Any
from urllib.parse import urlparse, ParseResult

import httpx
import hishel
import dlt
from dlt.common.libs.pydantic import pydantic_to_table_schema_columns
from oso_dagster.factories.dlt import pydantic_to_dlt_nullable_columns
import polars as pl
from pydantic import BaseModel
from githubkit import GitHub
@@ -64,6 +63,30 @@ class Repository(BaseModel):
updated_at: datetime


class GithubRepositorySBOMItem(BaseModel):
artifact_namespace: str
artifact_name: str
artifact_source: str
package: str
package_source: str
package_version: Optional[str]
snapshot_at: datetime


class GitHubRespositoryMissingSBOMItem(BaseModel):
artifact_namespace: str
artifact_name: str
artifact_source: str
artifact_url: str
snapshot_at: datetime


class GithubClientConfig(BaseModel):
gh_token: str
rate_limit_max_retry: int = 5
server_error_max_rety: int = 3


class InvalidGithubURL(Exception):
pass

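The resolver below splits each SBOM package name on its first `:` into `package_source` and `package`. For orientation, one row of the new `sbom` table as the resource would emit it — all values here are hypothetical:

from datetime import UTC, datetime

# Hypothetical example row for the new `sbom` table; values are illustrative only.
example = GithubRepositorySBOMItem(
    artifact_namespace="opensource-observer",
    artifact_name="oso",
    artifact_source="GITHUB",
    package="lodash",
    package_source="NPM",
    package_version="4.17.21",
    snapshot_at=datetime.now(UTC),
)
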
@@ -262,19 +285,82 @@ def github_urls_from_df(self, projects_df: pl.DataFrame):
logger.debug(f"unnested all github urls and got {len(all_github_urls)} rows")
return all_github_urls

def is_repo_missing_sbom(self, repo: Repository) -> bool:
try:
self._gh.rest.dependency_graph.export_sbom(
repo.owner,
repo.name,
)
return False
except RequestFailed as e:
if e.response.status_code == 404:
return True
raise e

def get_sbom_for_repo(self, repo: Repository) -> List[GithubRepositorySBOMItem]:
try:
sbom = self._gh.rest.dependency_graph.export_sbom(
repo.owner,
repo.name,
)
graph = sbom.parsed_data.sbom
sbom_list: List[GithubRepositorySBOMItem] = []

for package in graph.packages:
package_name = package.name or "unknown"

if package_name.find(":") == -1:
continue

                package_source = package_name[0 : package_name.index(":")]
                package_name = package_name[package_name.index(":") + 1 :]

                sbom_list.append(
                    GithubRepositorySBOMItem(
                        artifact_namespace=repo.owner,
                        artifact_name=repo.name,
                        artifact_source="GITHUB",
                        package=package_name,
                        package_source=package_source.upper(),
                        package_version=package.version_info or None,
                        snapshot_at=arrow.get(graph.creation_info.created).datetime,
                    )
                )

            return sbom_list
        except RequestFailed as exception:
            if exception.response.status_code == 404:
                logging.warning("Skipping %s, no SBOM found", repo.url)
                return []
            raise exception

    @staticmethod
    def get_github_client(config: GithubClientConfig) -> GitHub:
        if constants.http_cache:
            logger.debug("Using the cache at: %s", constants.http_cache)
            return CachedGithub(
                config.gh_token,
                sync_storage=get_sync_http_cache_storage(constants.http_cache),
                async_storage=get_async_http_cache_storage(constants.http_cache),
                auto_retry=RetryChainDecision(
                    RetryRateLimit(max_retry=config.rate_limit_max_retry),
                    RetryServerError(max_retry=config.server_error_max_rety),
                ),
            )
        logger.debug("Loading github client without a cache")
        return GitHub(
            config.gh_token,
            auto_retry=RetryChainDecision(
                RetryRateLimit(max_retry=config.rate_limit_max_retry),
                RetryServerError(max_retry=config.server_error_max_rety),
            ),
        )


def repository_columns():
    table_schema_columns = pydantic_to_table_schema_columns(Repository)
    for column in table_schema_columns.values():
        column["nullable"] = True
    print(table_schema_columns)
    return table_schema_columns

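In the `repositories` resource below, `columns=repository_columns()` is replaced with `pydantic_to_dlt_nullable_columns(Repository)`. That helper lives in `oso_dagster.factories.dlt` and is not shown in this diff; a hypothetical sketch of what such a helper might do, generalizing the removed function above:

from dlt.common.libs.pydantic import pydantic_to_table_schema_columns
from pydantic import BaseModel


def pydantic_to_dlt_nullable_columns_sketch(model: type[BaseModel]):
    # Hypothetical stand-in: derive a dlt column schema from any Pydantic model
    # and mark every column nullable, as the old Repository-only helper did.
    columns = pydantic_to_table_schema_columns(model)
    for column in columns.values():
        column["nullable"] = True
    return columns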

@dlt.resource(
name="repositories",
table_name="repositories",
columns=repository_columns(),
columns=pydantic_to_dlt_nullable_columns(Repository),
write_disposition="append",
primary_key="id",
merge_key="node_id",
@@ -287,32 +373,76 @@ def oss_directory_github_repositories_resource(
):
"""Based on the oss_directory data we resolve repositories"""

if constants.http_cache:
logger.debug(f"Using the cache at: {constants.http_cache}")
gh = CachedGithub(
gh_token,
sync_storage=get_sync_http_cache_storage(constants.http_cache),
async_storage=get_async_http_cache_storage(constants.http_cache),
auto_retry=RetryChainDecision(
RetryRateLimit(max_retry=rate_limit_max_retry),
RetryServerError(max_retry=server_error_max_rety),
),
)
else:
logger.debug(f"Loading github client without a cache")
gh = GitHub(
gh_token,
auto_retry=RetryChainDecision(
RetryRateLimit(max_retry=rate_limit_max_retry),
RetryServerError(max_retry=server_error_max_rety),
),
)
config = GithubClientConfig(
gh_token=gh_token,
rate_limit_max_retry=rate_limit_max_retry,
server_error_max_rety=server_error_max_rety,
)

gh = GithubRepositoryResolver.get_github_client(config)
resolver = GithubRepositoryResolver(gh)

yield from resolver.resolve_repos(projects_df)


@dlt.resource(
name="sbom",
table_name="sbom",
columns=pydantic_to_dlt_nullable_columns(GithubRepositorySBOMItem),
write_disposition="append",
)
def oss_directory_github_sbom_resource(
projects_df: pl.DataFrame,
gh_token: str = dlt.secrets.value,
rate_limit_max_retry: int = 5,
server_error_max_rety: int = 3,
):
"""Based on the oss_directory data we resolve sbom manifests for repositories"""

config = GithubClientConfig(
gh_token=gh_token,
rate_limit_max_retry=rate_limit_max_retry,
server_error_max_rety=server_error_max_rety,
)

gh = GithubRepositoryResolver.get_github_client(config)
resolver = GithubRepositoryResolver(gh)

for repo in resolver.resolve_repos(projects_df):
yield repo
yield from resolver.get_sbom_for_repo(repo)


@dlt.resource(
name="missing_sbom",
table_name="missing_sbom",
columns=pydantic_to_dlt_nullable_columns(GitHubRespositoryMissingSBOMItem),
write_disposition="append",
)
def oss_directory_missing_sbom_repositories_resource(
projects_df: pl.DataFrame,
gh_token: str = dlt.secrets.value,
rate_limit_max_retry: int = 5,
server_error_max_rety: int = 3,
):
"""Based on the oss_directory data we resolve repositories"""

config = GithubClientConfig(
gh_token=gh_token,
rate_limit_max_retry=rate_limit_max_retry,
server_error_max_rety=server_error_max_rety,
)

    gh = GithubRepositoryResolver.get_github_client(config)
    resolver = GithubRepositoryResolver(gh)

    yield (
        GitHubRespositoryMissingSBOMItem(
            artifact_namespace=repo.owner,
            artifact_name=repo.name,
            artifact_source="GITHUB",
            artifact_url=repo.url,
            snapshot_at=repo.ingestion_time or datetime.now(UTC),
        )
        for repo in resolver.resolve_repos(projects_df)
        if resolver.is_repo_missing_sbom(repo)
    )


@dlt.source
def oss_directory_github_repositories():
    return oss_directory_github_repositories_resource

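The new resources can also be exercised outside Dagster for debugging. A minimal sketch of an ad-hoc dlt run — pipeline and destination names are hypothetical, and `projects_df` is assumed to match the frame produced by the upstream `projects` asset:

import os

import dlt
import polars as pl

from oso_dagster.dlt_sources.github_repos import oss_directory_github_sbom_resource

# Hypothetical export of the upstream `projects` asset; the frame must have the
# shape that GithubRepositoryResolver.github_urls_from_df expects.
projects_df = pl.read_parquet("projects.parquet")

pipeline = dlt.pipeline(
    pipeline_name="ossd_sbom_adhoc",
    destination="duckdb",
    dataset_name="ossd",
)

load_info = pipeline.run(
    oss_directory_github_sbom_resource(projects_df, gh_token=os.environ["GITHUB_TOKEN"])
)
print(load_info)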