From d14dee4be15cb7fe2125875452604060b055a0f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20R=C3=ADos?= Date: Mon, 28 Oct 2024 10:08:29 -0700 Subject: [PATCH] fix: `sbom` performance improvements (#2423) * add: `sbom` source to `oss_directory` * fix: identify missing `sbom` repositories more efficiently * fix: avoid calling `resolve_repos` when resolving `sbom` * fix: use dbt `source` macro * chore: fix `sqlfluff` style for `missing_sbom` staging model --- warehouse/dbt/models/oss_directory_source.yml | 2 + .../oss-directory/stg_ossd__missing_sbom.sql | 34 +++++---- warehouse/oso_dagster/assets/ossd.py | 13 ---- .../dlt_sources/github_repos/__init__.py | 70 ++++--------------- 4 files changed, 38 insertions(+), 81 deletions(-) diff --git a/warehouse/dbt/models/oss_directory_source.yml b/warehouse/dbt/models/oss_directory_source.yml index dd89ab5e9..aac4f48d9 100644 --- a/warehouse/dbt/models/oss_directory_source.yml +++ b/warehouse/dbt/models/oss_directory_source.yml @@ -10,5 +10,7 @@ sources: identifier: collections - name: repositories identifier: repositories + - name: sbom + identifier: sbom - name: missing_sbom identifier: missing_sbom diff --git a/warehouse/dbt/models/staging/oss-directory/stg_ossd__missing_sbom.sql b/warehouse/dbt/models/staging/oss-directory/stg_ossd__missing_sbom.sql index 8248b3e08..c3e3d3ac2 100644 --- a/warehouse/dbt/models/staging/oss-directory/stg_ossd__missing_sbom.sql +++ b/warehouse/dbt/models/staging/oss-directory/stg_ossd__missing_sbom.sql @@ -2,21 +2,31 @@ materialized = 'view' ) }} -with source as ( +with all_repos as ( select * - from {{ source('ossd', 'missing_sbom') }} + from + {{ source('ossd', 'repositories') }} ), -current_dlt_load_id as ( - select max(_dlt_load_id) as max_dlt_load_id - from source -), - -last_snapshot as ( +all_ossd as ( select * - from source - where _dlt_load_id = (select max_dlt_load_id from current_dlt_load_id) + from + {{ source('ossd', 'sbom') }} + where + artifact_source = 'GITHUB' ) -select * -from last_snapshot +select + `owner` as artifact_namespace, + `name` as artifact_name, + 'GITHUB' as artifact_source, + `url` as artifact_url, + ingestion_time as snapshot_at +from + all_repos as ar +left join + all_ossd as ao + on + CONCAT(ao.artifact_namespace, '/', ao.artifact_name) = ar.name_with_owner +where + ao.artifact_namespace is null diff --git a/warehouse/oso_dagster/assets/ossd.py b/warehouse/oso_dagster/assets/ossd.py index 1e1faa40e..152a49c66 100644 --- a/warehouse/oso_dagster/assets/ossd.py +++ b/warehouse/oso_dagster/assets/ossd.py @@ -17,7 +17,6 @@ from oso_dagster.dlt_sources.github_repos import ( oss_directory_github_repositories_resource, oss_directory_github_sbom_resource, - oss_directory_missing_sbom_repositories_resource, ) from oso_dagster.factories import dlt_factory from oso_dagster.factories.common import AssetFactoryResponse @@ -162,18 +161,6 @@ def sbom( yield oss_directory_github_sbom_resource(projects_df, gh_token) -@dlt_factory( - key_prefix="ossd", - ins={"projects_df": AssetIn(project_key)}, - tags=common_tags, -) -def missing_sbom( - projects_df: pl.DataFrame, - gh_token: str = secret_ref_arg(group_name="ossd", key="github_token"), -): - yield oss_directory_missing_sbom_repositories_resource(projects_df, gh_token) - - @discoverable_jobs(dependencies=[repositories]) def ossd_jobs(dependencies: t.List[AssetFactoryResponse]): repositories = t.cast(AssetsDefinition, list(dependencies[0].assets)[0]) diff --git a/warehouse/oso_dagster/dlt_sources/github_repos/__init__.py b/warehouse/oso_dagster/dlt_sources/github_repos/__init__.py index 9ef631755..a7c21cb18 100644 --- a/warehouse/oso_dagster/dlt_sources/github_repos/__init__.py +++ b/warehouse/oso_dagster/dlt_sources/github_repos/__init__.py @@ -285,23 +285,13 @@ def github_urls_from_df(self, projects_df: pl.DataFrame): logger.debug(f"unnested all github urls and got {len(all_github_urls)} rows") return all_github_urls - def is_repo_missing_sbom(self, repo: Repository) -> bool: - try: - self._gh.rest.dependency_graph.export_sbom( - repo.owner, - repo.name, - ) - return False - except RequestFailed as e: - if e.response.status_code != 404: - logger.warning("Error checking for SBOM: %s", e) - return True - - def get_sbom_for_repo(self, repo: Repository) -> List[GithubRepositorySBOMItem]: + def get_sbom_for_repo( + self, owner: str, name: str + ) -> List[GithubRepositorySBOMItem]: try: sbom = self._gh.rest.dependency_graph.export_sbom( - repo.owner, - repo.name, + owner, + name, ) graph = sbom.parsed_data.sbom sbom_list: List[GithubRepositorySBOMItem] = [] @@ -317,8 +307,8 @@ def get_sbom_for_repo(self, repo: Repository) -> List[GithubRepositorySBOMItem]: sbom_list.append( GithubRepositorySBOMItem( - artifact_namespace=repo.owner, - artifact_name=repo.name, + artifact_namespace=owner, + artifact_name=name, artifact_source="GITHUB", package=package_name, package_source=package_source.upper(), @@ -330,7 +320,7 @@ def get_sbom_for_repo(self, repo: Repository) -> List[GithubRepositorySBOMItem]: return sbom_list except RequestFailed as exception: if exception.response.status_code == 404: - logger.warning("Skipping %s, no SBOM found", repo.url) + logger.warning("Skipping %s, no SBOM found", f"{owner}/{name}") else: logger.warning("Error getting SBOM: %s", exception) return [] @@ -398,7 +388,7 @@ def oss_directory_github_sbom_resource( rate_limit_max_retry: int = 5, server_error_max_rety: int = 3, ): - """Based on the oss_directory data we resolve sbom manifests for repositories""" + """Retrieve SBOM information for GitHub repositories""" config = GithubClientConfig( gh_token=gh_token, @@ -409,41 +399,9 @@ def oss_directory_github_sbom_resource( gh = GithubRepositoryResolver.get_github_client(config) resolver = GithubRepositoryResolver(gh) - for repo in resolver.resolve_repos(projects_df): - yield from resolver.get_sbom_for_repo(repo) - - -@dlt.resource( - name="missing_sbom", - table_name="missing_sbom", - columns=pydantic_to_dlt_nullable_columns(GitHubRespositoryMissingSBOMItem), - write_disposition="append", -) -def oss_directory_missing_sbom_repositories_resource( - projects_df: pl.DataFrame, - gh_token: str = dlt.secrets.value, - rate_limit_max_retry: int = 5, - server_error_max_rety: int = 3, -): - """Based on the oss_directory data we resolve repositories""" - - config = GithubClientConfig( - gh_token=gh_token, - rate_limit_max_retry=rate_limit_max_retry, - server_error_max_rety=server_error_max_rety, - ) - - gh = GithubRepositoryResolver.get_github_client(config) - resolver = GithubRepositoryResolver(gh) + all_github_urls = resolver.github_urls_from_df(projects_df) + valid_urls = [resolver.parse_url(url) for url in all_github_urls["url"] if url] - yield ( - GitHubRespositoryMissingSBOMItem( - artifact_namespace=repo.owner, - artifact_name=repo.name, - artifact_source="GITHUB", - artifact_url=repo.url, - snapshot_at=repo.ingestion_time or datetime.now(UTC), - ) - for repo in resolver.resolve_repos(projects_df) - if resolver.is_repo_missing_sbom(repo) - ) + for url in valid_urls: + if url.type == GithubURLType.REPOSITORY and url.repository: + yield from resolver.get_sbom_for_repo(url.owner, url.repository)