From 7118352f971e1dd61e930e61872f3e8898e518b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20R=C3=ADos?= Date: Tue, 26 Nov 2024 19:52:46 +0100 Subject: [PATCH] fix: flatten `npm` manifest structure (#2524) * fix: flatten `npm` manifest structure * fix: update `npm` manifests staging columns * fix: update repository field references in the `int_artifact_ownership` model --- .../directory/int_artifact_ownership.sql | 8 +-- .../models/staging/npm/stg_npm__manifests.sql | 17 +++--- warehouse/oso_dagster/assets/npm.py | 58 ++++++++++++++++--- 3 files changed, 61 insertions(+), 22 deletions(-) diff --git a/warehouse/dbt/models/intermediate/directory/int_artifact_ownership.sql b/warehouse/dbt/models/intermediate/directory/int_artifact_ownership.sql index 03cad3b87..52addb4ca 100644 --- a/warehouse/dbt/models/intermediate/directory/int_artifact_ownership.sql +++ b/warehouse/dbt/models/intermediate/directory/int_artifact_ownership.sql @@ -7,17 +7,17 @@ with npm_artifacts as ( npm_manifests as ( select `name`, - repository__url, - repository__type, + json_value(repository, '$.url') as manifest_repository_url, + json_value(repository, '$.type') as manifest_repository_type, concat('https://www.npmjs.com/package/', `name`) as artifact_url from {{ ref('stg_npm__manifests') }} where `name` in (select * from npm_artifacts) - and repository__url is not null + and json_value(repository, '$.url') is not null ), npm_repository_urls as ( - {{ parse_npm_git_url('repository__url', 'npm_manifests') }} + {{ parse_npm_git_url('manifest_repository_url', 'npm_manifests') }} ), npm_artifact_ownership as ( diff --git a/warehouse/dbt/models/staging/npm/stg_npm__manifests.sql b/warehouse/dbt/models/staging/npm/stg_npm__manifests.sql index d51613f1e..e5612f599 100644 --- a/warehouse/dbt/models/staging/npm/stg_npm__manifests.sql +++ b/warehouse/dbt/models/staging/npm/stg_npm__manifests.sql @@ -1,13 +1,12 @@ {% set columns = [ - "name", "version", "description", "keywords", "homepage", "bugs", - 
"license", "author", "contributors", "funding", "files", "exports", - "main", "browser", "bin", "man", "directories", "repository", - "scripts", "config", "dependencies", "dev_dependencies", - "peer_dependencies", "peer_dependencies_meta", "bundle_dependencies", - "optional_dependencies", "overrides", "engines", "os", "cpu", - "dev_engines", "private", "publish_config", "workspaces", "bugs__url", - "repository__url", "repository__type", "author__url", "author__name", - "author__email" + "name", "version", "description", "keywords", "homepage", "bugs", + "license", "author", "contributors", "funding", "files", "exports", + "main", "browser", "bin", "man", "directories", "repository", + "scripts", "config", "dependencies", "dev_dependencies", + "peer_dependencies", "peer_dependencies_meta", "bundle_dependencies", + "optional_dependencies", "overrides", "engines", "os", "cpu", + "dev_engines", "private", "publish_config", "workspaces", + "_dlt_load_id", "_dlt_id" ] %} with source as ( diff --git a/warehouse/oso_dagster/assets/npm.py b/warehouse/oso_dagster/assets/npm.py index 4a3e20bb3..41e18d1bf 100644 --- a/warehouse/oso_dagster/assets/npm.py +++ b/warehouse/oso_dagster/assets/npm.py @@ -1,5 +1,5 @@ from datetime import datetime, timedelta -from typing import Dict, Generator, List, Optional, Union +from typing import Dict, Generator, List, Optional, Any import dlt import requests @@ -31,18 +31,19 @@ class NPMPackageManifest(BaseModel): description: Optional[str] = None keywords: Optional[List] = None homepage: Optional[str] = None - bugs: Optional[Union[str, Dict]] = None - license: Optional[str] = None - author: Optional[Union[str, Dict]] = None + bugs: Optional[Dict] = None + license: Optional[Dict] = None + author: Optional[Dict] = None contributors: Optional[List] = None - funding: Optional[Union[str, Dict, List]] = None + funding: Optional[List] = None files: Optional[List] = None exports: Optional[Dict] = None main: Optional[str] = None - browser: 
Optional[bool] = None - man: Optional[Union[str, Dict, List]] = None + browser: Optional[Dict] = None + bin: Optional[Dict] = None + man: Optional[List] = None directories: Optional[Dict] = None - repository: Optional[Union[str, Dict]] = None + repository: Optional[Dict] = None scripts: Optional[Dict] = None config: Optional[Dict] = None dependencies: Optional[Dict] = None @@ -61,6 +62,45 @@ class NPMPackageManifest(BaseModel): workspaces: Optional[List] = None +# Some fields in the NPM manifest are not always in the same format +# This dictionary contains the transformations to apply to the data +# before creating the manifest object +TRANSFORMATIONS = { + "bugs": lambda value: {"url": value} if isinstance(value, str) else value, + "license": lambda value: {"type": value} if isinstance(value, str) else value, + "author": lambda value: {"author": value} if isinstance(value, str) else value, + "funding": lambda value: ( + [{"type": "url", "url": value}] + if isinstance(value, str) + else [value] if isinstance(value, dict) else value + ), + "exports": lambda value: {".": value} if isinstance(value, str) else value, + "bin": lambda value: {"path": value} if isinstance(value, str) else value, + "man": lambda value: [value] if isinstance(value, str) else value, + "browser": lambda value: ( + {"browser": value} if isinstance(value, (str, bool)) else value + ), + "repository": lambda value: {"url": value} if isinstance(value, str) else value, +} + + +def flatten_manifest(data: Dict[str, Any]) -> Dict[str, Any]: + """ + Applies transformations to the data before creating the manifest object. 
+ + Args: + data (Dict[str, Any]): The data to transform + + Returns: + Dict[str, Any]: The transformed data + """ + + for key, transform in TRANSFORMATIONS.items(): + if key in data: + data[key] = transform(data[key]) + return data + + def get_npm_package_downloads( package_name: str, date_from: datetime, date_to: datetime ) -> Generator[Optional[NPMPackageDownloadInfo], None, None]: @@ -169,7 +209,7 @@ def get_npm_package_manifest( if not response.ok: raise ValueError(f"Failed to fetch data for {package_name}: {response.text}") - yield NPMPackageManifest(**data) + yield NPMPackageManifest(**flatten_manifest(data)) @dlt.resource(