From dcceca0fe327a44b74292ae32ef792374de3ee2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20R=C3=ADos?= Date: Tue, 26 Nov 2024 17:34:36 +0100 Subject: [PATCH] add: `npm` ownership intermediate model (#2520) * add: npm `manifests` dlt connector * add: npm `manifests` and `downloads` staging sources * add: dbt macro to parse `npm` git URLs for remote information * add: `artifact_ownership` intermediate model --- .../dbt/macros/models/parse_npm_git_url.sql | 49 ++++++ .../directory/int_artifact_ownership.sql | 39 +++++ warehouse/dbt/models/npm_sources.yml | 9 ++ .../models/staging/npm/stg_npm__downloads.sql | 9 ++ .../models/staging/npm/stg_npm__manifests.sql | 25 +++ warehouse/oso_dagster/assets/npm.py | 150 ++++++++++++++++-- 6 files changed, 270 insertions(+), 11 deletions(-) create mode 100644 warehouse/dbt/macros/models/parse_npm_git_url.sql create mode 100644 warehouse/dbt/models/intermediate/directory/int_artifact_ownership.sql create mode 100644 warehouse/dbt/models/npm_sources.yml create mode 100644 warehouse/dbt/models/staging/npm/stg_npm__downloads.sql create mode 100644 warehouse/dbt/models/staging/npm/stg_npm__manifests.sql diff --git a/warehouse/dbt/macros/models/parse_npm_git_url.sql b/warehouse/dbt/macros/models/parse_npm_git_url.sql new file mode 100644 index 000000000..33444ca5f --- /dev/null +++ b/warehouse/dbt/macros/models/parse_npm_git_url.sql @@ -0,0 +1,49 @@ +{% macro parse_npm_git_url(key, source) %} + + select + *, + + case + when regexp_contains({{ key }}, r'^git\+ssh://') then + regexp_replace({{ key }}, r'^git\+ssh://([^@]+)@', 'https://') + when regexp_contains({{ key }}, r'^git@') then + regexp_replace({{ key }}, r'^git@(.*?):', 'https://\\1/') + when regexp_contains({{ key }}, r'^git\+https://') then + regexp_replace({{ key }}, r'^git\+', '') + when regexp_contains({{ key }}, r'^https?://') then + {{ key }} + when regexp_contains({{ key }}, r'^[^:/]+\.[^:/]+/') then + concat('https://', {{ key }}) + else null + end as remote_url, + + regexp_extract( + case + when regexp_contains({{ key }}, r'\.git$') then + regexp_replace({{ key }}, r'\.git$', '') + else {{ key }} + end, + r'/([^/]+)$' + ) as remote_name, + + regexp_extract( + case + when regexp_contains({{ key }}, r'^git@') then + regexp_replace({{ key }}, r'^git@(.*?):', 'https://\\1/') + when regexp_contains({{ key }}, r'^git\+ssh://') then + regexp_replace({{ key }}, r'^git\+ssh://', 'https://') + else {{ key }} + end, + r'https?:\/\/[^\/]+\/([^\/]+)\/[^\/]+$' + ) as remote_namespace, + + case + when regexp_contains({{ key }}, r'github\.com') then 'GITHUB' + when regexp_contains({{ key }}, r'gitlab\.com') then 'GITLAB' + when regexp_contains({{ key }}, r'bitbucket\.org') then 'BITBUCKET' + else 'OTHER' + end as remote_source_id + + from {{ source }} + +{% endmacro %} diff --git a/warehouse/dbt/models/intermediate/directory/int_artifact_ownership.sql b/warehouse/dbt/models/intermediate/directory/int_artifact_ownership.sql new file mode 100644 index 000000000..03cad3b87 --- /dev/null +++ b/warehouse/dbt/models/intermediate/directory/int_artifact_ownership.sql @@ -0,0 +1,39 @@ +with npm_artifacts as ( + select artifact_name + from {{ ref('artifacts_v1') }} + where artifact_source = 'NPM' +), + +npm_manifests as ( + select + `name`, + repository__url, + repository__type, + concat('https://www.npmjs.com/package/', `name`) as artifact_url + from {{ ref('stg_npm__manifests') }} + where + `name` in (select * from npm_artifacts) + and repository__url is not null +), + +npm_repository_urls as ( + {{ parse_npm_git_url('repository__url', 'npm_manifests') }} +), + +npm_artifact_ownership as ( + select + {{ oso_id( + "'NPM'", + "artifact_url", + ) }} as artifact_id, + artifact_url, + `name` as artifact_name, + 'NPM' as artifact_source_id, + remote_url, + remote_name, + remote_namespace, + remote_source_id + from npm_repository_urls +) + +select * from npm_artifact_ownership diff --git a/warehouse/dbt/models/npm_sources.yml b/warehouse/dbt/models/npm_sources.yml new file mode 100644 index 000000000..caac4c75c --- /dev/null +++ b/warehouse/dbt/models/npm_sources.yml @@ -0,0 +1,9 @@ +sources: + - name: npm + database: opensource-observer + schema: npm + tables: + - name: downloads + identifier: downloads + - name: manifests + identifier: manifests diff --git a/warehouse/dbt/models/staging/npm/stg_npm__downloads.sql b/warehouse/dbt/models/staging/npm/stg_npm__downloads.sql new file mode 100644 index 000000000..400975599 --- /dev/null +++ b/warehouse/dbt/models/staging/npm/stg_npm__downloads.sql @@ -0,0 +1,9 @@ +with source as ( + select + `date`, + artifact_name, + downloads + from {{ source('npm', 'downloads') }} +) + +select * from source diff --git a/warehouse/dbt/models/staging/npm/stg_npm__manifests.sql b/warehouse/dbt/models/staging/npm/stg_npm__manifests.sql new file mode 100644 index 000000000..d51613f1e --- /dev/null +++ b/warehouse/dbt/models/staging/npm/stg_npm__manifests.sql @@ -0,0 +1,25 @@ +{% set columns = [ + "name", "version", "description", "keywords", "homepage", "bugs", + "license", "author", "contributors", "funding", "files", "exports", + "main", "browser", "bin", "man", "directories", "repository", + "scripts", "config", "dependencies", "dev_dependencies", + "peer_dependencies", "peer_dependencies_meta", "bundle_dependencies", + "optional_dependencies", "overrides", "engines", "os", "cpu", + "dev_engines", "private", "publish_config", "workspaces", "bugs__url", + "repository__url", "repository__type", "author__url", "author__name", + "author__email" +] %} + +with source as ( + select * from {{ source('npm', 'manifests') }} +), + +renamed as ( + select + {% for column in columns %} + {{ adapter.quote(column) }}{% if not loop.last %},{% endif %} + {% endfor %} + from source +) + +select * from renamed diff --git a/warehouse/oso_dagster/assets/npm.py b/warehouse/oso_dagster/assets/npm.py index 71495e960..4a3e20bb3 100644 --- a/warehouse/oso_dagster/assets/npm.py +++ b/warehouse/oso_dagster/assets/npm.py @@ -1,5 +1,5 @@ from datetime import datetime, timedelta -from typing import Generator, List, Optional +from typing import Dict, Generator, List, Optional, Union import dlt import requests @@ -9,22 +9,61 @@ from ..factories.dlt import dlt_factory, pydantic_to_dlt_nullable_columns +# Host for the NPM API +NPM_API_HOST = "https://api.npmjs.org" + # Host for the NPM registry -NPM_HOST = "https://api.npmjs.org" +NPM_REGISTRY_HOST = "https://registry.npmjs.org" -# NPM was launched on January 12, 2010 -NPM_EPOCH = "2010-01-12T00:00:00Z" +# https://github.com/npm/registry/blob/main/docs/download-counts.md#limits +NPM_EPOCH = "2015-01-10T00:00:00Z" -class NPMPackageInfo(BaseModel): +class NPMPackageDownloadInfo(BaseModel): date: datetime artifact_name: str downloads: int +class NPMPackageManifest(BaseModel): + name: Optional[str] = None + version: Optional[str] = None + description: Optional[str] = None + keywords: Optional[List] = None + homepage: Optional[str] = None + bugs: Optional[Union[str, Dict]] = None + license: Optional[str] = None + author: Optional[Union[str, Dict]] = None + contributors: Optional[List] = None + funding: Optional[Union[str, Dict, List]] = None + files: Optional[List] = None + exports: Optional[Dict] = None + main: Optional[str] = None + browser: Optional[bool] = None + man: Optional[Union[str, Dict, List]] = None + directories: Optional[Dict] = None + repository: Optional[Union[str, Dict]] = None + scripts: Optional[Dict] = None + config: Optional[Dict] = None + dependencies: Optional[Dict] = None + devDependencies: Optional[Dict] = None + peerDependencies: Optional[Dict] = None + peerDependenciesMeta: Optional[Dict] = None + bundleDependencies: Optional[List] = None + optionalDependencies: Optional[Dict] = None + overrides: Optional[Dict] = None + engines: Optional[Dict] = None + os: Optional[List] = None + cpu: Optional[List] = None + devEngines: Optional[Dict] = None + private: Optional[bool] = None + publishConfig: Optional[Dict] = None + workspaces: Optional[List] = None + + def get_npm_package_downloads( package_name: str, date_from: datetime, date_to: datetime -) -> Generator[Optional[NPMPackageInfo], None, None]: +) -> Generator[Optional[NPMPackageDownloadInfo], None, None]: """ Fetches the download count for an NPM package between two dates. @@ -34,13 +73,13 @@ def get_npm_package_downloads( date_to (datetime): The end date Yields: - Optional[NPMPackageInfo]: The download count for the package + Optional[NPMPackageDownloadInfo]: The download count for the package """ str_from = date_from.strftime("%Y-%m-%d") str_to = date_to.strftime("%Y-%m-%d") - endpoint = f"{NPM_HOST}/downloads/range/{str_from}:{str_to}/{package_name}" + endpoint = f"{NPM_API_HOST}/downloads/range/{str_from}:{str_to}/{package_name}" response = requests.get( endpoint, timeout=10, @@ -89,7 +128,7 @@ def get_npm_package_downloads( total_downloads = sum(download["downloads"] for download in data["downloads"]) yield ( - NPMPackageInfo( + NPMPackageDownloadInfo( date=date_from, artifact_name=package_name, downloads=total_downloads, @@ -99,10 +138,44 @@ def get_npm_package_downloads( ) +def get_npm_package_manifest( + package_name: str, +) -> Generator[Optional[NPMPackageManifest], None, None]: + """ + Fetches the manifest for an NPM package. + + Args: + context (AssetExecutionContext): The asset execution context + package_name (str): The NPM package name + + Yields: + Optional[NPMPackageManifest]: The manifest for the package + """ + + endpoint = f"{NPM_REGISTRY_HOST}/{package_name}/latest" + response = requests.get( + endpoint, + timeout=10, + headers={ + "X-URL": "https://github.com/opensource-observer/oso", + "X-Contact": "ops@karibalabs.co", + "X-Purpose": "We are currently indexing NPM packages to provide dependency statistics. " + "If you have any questions or concerns, please contact us", + }, + ) + + data = response.json() + + if not response.ok: + raise ValueError(f"Failed to fetch data for {package_name}: {response.text}") + + yield NPMPackageManifest(**data) + + @dlt.resource( primary_key="artifact_name", name="downloads", - columns=pydantic_to_dlt_nullable_columns(NPMPackageInfo), + columns=pydantic_to_dlt_nullable_columns(NPMPackageDownloadInfo), ) def get_all_downloads( context: AssetExecutionContext, @@ -117,7 +190,7 @@ def get_all_downloads( package_names (List[str]): List of NPM package names to fetch Yields: - List[NPMPackageInfo]: The download count for each package + List[NPMPackageDownloadInfo]: The download count for each package """ start = datetime.strptime(context.partition_key, "%Y-%m-%d") @@ -134,6 +207,33 @@ def get_all_downloads( ) +@dlt.resource( + primary_key="name", + name="manifests", + columns=pydantic_to_dlt_nullable_columns(NPMPackageManifest), +) +def get_all_manifests( + context: AssetExecutionContext, + package_names: List, +): + """ + Fetches the manifest for a list of NPM packages. + + Args: + context (AssetExecutionContext): The asset execution + package_names (List): List of NPM package names to fetch + + Yields: + List[NPMPackageManifest]: The manifest for each package + """ + + context.log.info(f"Processing NPM manifests for {len(package_names)} packages") + + yield from ( + get_npm_package_manifest(package_name) for package_name in package_names + ) + + @dlt_factory( key_prefix="npm", partitions_def=WeeklyPartitionsDefinition( @@ -164,3 +264,31 @@ def downloads( for row in client.query_with_string(unique_artifacts_query) ], ) + + +@dlt_factory( + key_prefix="npm", + deps=[AssetKey(["dbt", "production", "artifacts_v1"])], +) +def manifests( + context: AssetExecutionContext, + cbt: CBTResource, +): + unique_artifacts_query = """ + SELECT + DISTINCT(artifact_name) + FROM + `oso.artifacts_v1` + WHERE + artifact_source = "NPM" + """ + + client = cbt.get(context.log) + + yield get_all_manifests( + context, + package_names=[ + row["artifact_name"] + for row in client.query_with_string(unique_artifacts_query) + ], + )