From 37f2297e6bdcc8fdc1eb5efcf166dfd4534bf261 Mon Sep 17 00:00:00 2001 From: Noel Georgi Date: Tue, 6 Aug 2024 21:10:19 +0530 Subject: [PATCH] feat: support lts and production nvidia modules Support LTS and production versions of NVIDIA kernel modules as per https://docs.nvidia.com/datacenter/tesla/drivers/index.html#lifecycle Part of: https://github.com/siderolabs/talos/issues/9086 Signed-off-by: Noel Georgi --- .github/workflows/ci.yaml | 6 +- .github/workflows/weekly.yaml | 4 +- .kres.yaml | 14 ++- Makefile | 19 ++-- hack/release.toml | 21 ++++ .../kmod-nvidia/{ => lts}/files/nvidia.conf | 0 .../kmod-nvidia/{ => lts}/manifest.yaml | 2 +- .../nonfree/kmod-nvidia/{ => lts}/pkg.yaml | 4 +- .../nonfree/kmod-nvidia/{ => lts}/vars.yaml | 2 +- .../kmod-nvidia/production}/files/nvidia.conf | 0 .../kmod-nvidia/production/manifest.yaml | 10 ++ .../nonfree/kmod-nvidia/production/pkg.yaml | 31 ++++++ .../nonfree/kmod-nvidia/production/vars.yaml | 2 + .../{ => lts}/manifest.yaml | 2 +- .../{ => lts}/nvidia-persistenced.yaml | 0 .../{ => lts}/pkg.yaml | 4 +- .../{ => lts}/vars.yaml | 2 +- .../nvidia-container-cli/{ => lts}/pkg.yaml | 10 +- .../nvidia-container-cli/production/pkg.yaml | 67 +++++++++++++ .../nvidia-container-runtime/pkg.yaml | 6 +- .../{ => lts}/files/15-nvidia-device.rules | 0 .../nvidia-pkgs/lts/pkg.yaml | 98 +++++++++++++++++++ .../production/files/15-nvidia-device.rules | 5 + .../nvidia-pkgs/{ => production}/pkg.yaml | 44 ++++++--- .../production/manifest.yaml | 10 ++ .../production/nvidia-persistenced.yaml | 55 +++++++++++ .../production/pkg.yaml | 34 +++++++ .../production/vars.yaml | 2 + .../{ => lts}/manifest.yaml | 2 +- .../{ => lts}/nvidia-fabricmanager.yaml | 0 .../nvidia-fabricmanager/{ => lts}/pkg.yaml | 14 +-- nvidia-gpu/nvidia-fabricmanager/lts/vars.yaml | 1 + .../production/lts/manifest.yaml | 10 ++ .../production/lts/nvidia-fabricmanager.yaml | 74 ++++++++++++++ .../production/lts/pkg.yaml | 59 +++++++++++ .../production/lts/vars.yaml | 1 + 
nvidia-gpu/nvidia-fabricmanager/vars.yaml | 1 - .../nvidia-modules/lts/files/nvidia.conf | 4 + .../nvidia-modules/{ => lts}/manifest.yaml | 2 +- nvidia-gpu/nvidia-modules/{ => lts}/pkg.yaml | 4 +- nvidia-gpu/nvidia-modules/{ => lts}/vars.yaml | 2 +- .../production/files/nvidia.conf | 4 + .../nvidia-modules/production/manifest.yaml | 10 ++ nvidia-gpu/nvidia-modules/production/pkg.yaml | 31 ++++++ .../nvidia-modules/production/vars.yaml | 2 + nvidia-gpu/vars.yaml | 18 ++-- power/nut-client/pkg.yaml | 2 - storage/iscsi-tools/open-isns/pkg.yaml | 1 - 48 files changed, 625 insertions(+), 71 deletions(-) rename nvidia-gpu/nonfree/kmod-nvidia/{ => lts}/files/nvidia.conf (100%) rename nvidia-gpu/nonfree/kmod-nvidia/{ => lts}/manifest.yaml (88%) rename nvidia-gpu/nonfree/kmod-nvidia/{ => lts}/pkg.yaml (88%) rename nvidia-gpu/nonfree/kmod-nvidia/{ => lts}/vars.yaml (62%) rename nvidia-gpu/{nvidia-modules => nonfree/kmod-nvidia/production}/files/nvidia.conf (100%) create mode 100644 nvidia-gpu/nonfree/kmod-nvidia/production/manifest.yaml create mode 100644 nvidia-gpu/nonfree/kmod-nvidia/production/pkg.yaml create mode 100644 nvidia-gpu/nonfree/kmod-nvidia/production/vars.yaml rename nvidia-gpu/nvidia-container-toolkit/{ => lts}/manifest.yaml (87%) rename nvidia-gpu/nvidia-container-toolkit/{ => lts}/nvidia-persistenced.yaml (100%) rename nvidia-gpu/nvidia-container-toolkit/{ => lts}/pkg.yaml (92%) rename nvidia-gpu/nvidia-container-toolkit/{ => lts}/vars.yaml (52%) rename nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/{ => lts}/pkg.yaml (80%) create mode 100644 nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/production/pkg.yaml rename nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/{ => lts}/files/15-nvidia-device.rules (100%) create mode 100644 nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/lts/pkg.yaml create mode 100644 nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/production/files/15-nvidia-device.rules rename 
nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/{ => production}/pkg.yaml (59%) create mode 100644 nvidia-gpu/nvidia-container-toolkit/production/manifest.yaml create mode 100644 nvidia-gpu/nvidia-container-toolkit/production/nvidia-persistenced.yaml create mode 100644 nvidia-gpu/nvidia-container-toolkit/production/pkg.yaml create mode 100644 nvidia-gpu/nvidia-container-toolkit/production/vars.yaml rename nvidia-gpu/nvidia-fabricmanager/{ => lts}/manifest.yaml (87%) rename nvidia-gpu/nvidia-fabricmanager/{ => lts}/nvidia-fabricmanager.yaml (100%) rename nvidia-gpu/nvidia-fabricmanager/{ => lts}/pkg.yaml (82%) create mode 100644 nvidia-gpu/nvidia-fabricmanager/lts/vars.yaml create mode 100644 nvidia-gpu/nvidia-fabricmanager/production/lts/manifest.yaml create mode 100644 nvidia-gpu/nvidia-fabricmanager/production/lts/nvidia-fabricmanager.yaml create mode 100644 nvidia-gpu/nvidia-fabricmanager/production/lts/pkg.yaml create mode 100644 nvidia-gpu/nvidia-fabricmanager/production/lts/vars.yaml delete mode 100644 nvidia-gpu/nvidia-fabricmanager/vars.yaml create mode 100644 nvidia-gpu/nvidia-modules/lts/files/nvidia.conf rename nvidia-gpu/nvidia-modules/{ => lts}/manifest.yaml (85%) rename nvidia-gpu/nvidia-modules/{ => lts}/pkg.yaml (92%) rename nvidia-gpu/nvidia-modules/{ => lts}/vars.yaml (62%) create mode 100644 nvidia-gpu/nvidia-modules/production/files/nvidia.conf create mode 100644 nvidia-gpu/nvidia-modules/production/manifest.yaml create mode 100644 nvidia-gpu/nvidia-modules/production/pkg.yaml create mode 100644 nvidia-gpu/nvidia-modules/production/vars.yaml diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 99a84027..837012c3 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -1,6 +1,6 @@ # THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT. # -# Generated on 2024-08-01T13:26:11Z by kres faf91e3. +# Generated on 2024-08-06T13:24:11Z by kres 133368e. 
name: default concurrency: @@ -33,7 +33,7 @@ jobs: labels: ${{ steps.retrieve-pr-labels.outputs.result }} services: buildkitd: - image: moby/buildkit:v0.15.0 + image: moby/buildkit:v0.15.1 options: --privileged ports: - 1234:1234 @@ -143,7 +143,7 @@ jobs: - default services: buildkitd: - image: moby/buildkit:v0.15.0 + image: moby/buildkit:v0.15.1 options: --privileged ports: - 1234:1234 diff --git a/.github/workflows/weekly.yaml b/.github/workflows/weekly.yaml index bf2b9e4e..8f8920c4 100644 --- a/.github/workflows/weekly.yaml +++ b/.github/workflows/weekly.yaml @@ -1,6 +1,6 @@ # THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT. # -# Generated on 2024-08-01T13:26:11Z by kres faf91e3. +# Generated on 2024-08-06T13:24:11Z by kres 133368e. name: weekly concurrency: @@ -16,7 +16,7 @@ jobs: - pkgs services: buildkitd: - image: moby/buildkit:v0.15.0 + image: moby/buildkit:v0.15.1 options: --privileged ports: - 1234:1234 diff --git a/.kres.yaml b/.kres.yaml index 8e8b13ff..105131b2 100644 --- a/.kres.yaml +++ b/.kres.yaml @@ -24,9 +24,12 @@ spec: - mdadm - mei - nut-client - - nvidia-container-toolkit - - nvidia-fabricmanager - - nvidia-open-gpu-kernel-modules + - nvidia-container-toolkit-lts + - nvidia-container-toolkit-production + - nvidia-fabricmanager-lts + - nvidia-fabricmanager-production + - nvidia-open-gpu-kernel-modules-lts + - nvidia-open-gpu-kernel-modules-production - qemu-guest-agent - qlogic-firmware - realtek-firmware @@ -43,7 +46,8 @@ spec: - zfs additionalTargets: nonfree: - - nonfree-kmod-nvidia + - nonfree-kmod-nvidia-lts + - nonfree-kmod-nvidia-production reproducibleTargetName: reproducibility extraBuildArgs: - TAG @@ -54,7 +58,7 @@ spec: - name: EXTENSIONS_IMAGE_REF defaultValue: $(REGISTRY_AND_USERNAME)/extensions:$(TAG) - name: PKGS - defaultValue: v1.8.0-alpha.0-41-ga97d58f + defaultValue: v1.8.0-alpha.0-45-gaf6b4e6 - name: PKGS_PREFIX defaultValue: ghcr.io/siderolabs useBldrPkgTagResolver: true diff --git a/Makefile b/Makefile index 
b9dff642..81940976 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT. # -# Generated on 2024-08-01T17:25:51Z by kres faf91e3. +# Generated on 2024-08-06T11:36:28Z by kres 2fded2b. # common variables @@ -25,7 +25,7 @@ SOURCE_DATE_EPOCH := $(shell git log $(INITIAL_COMMIT_SHA) --pretty=%ct) # sync bldr image with pkgfile -BLDR_RELEASE := v0.3.1 +BLDR_RELEASE := v0.3.2 BLDR_IMAGE := ghcr.io/siderolabs/bldr:$(BLDR_RELEASE) BLDR := docker run --rm --user $(shell id -u):$(shell id -g) --volume $(PWD):/src --entrypoint=/bldr $(BLDR_IMAGE) --root=/src @@ -48,7 +48,7 @@ COMMON_ARGS += --build-arg=PKGS_PREFIX="$(PKGS_PREFIX)" # extra variables EXTENSIONS_IMAGE_REF ?= $(REGISTRY_AND_USERNAME)/extensions:$(TAG) -PKGS ?= v1.8.0-alpha.0-41-ga97d58f +PKGS ?= v1.8.0-alpha.0-45-gaf6b4e6 PKGS_PREFIX ?= ghcr.io/siderolabs # targets defines all the available targets @@ -64,7 +64,6 @@ TARGETS += drbd TARGETS += ecr-credential-provider TARGETS += fuse3 TARGETS += gasket-driver -TARGETS += crun TARGETS += gvisor TARGETS += gvisor-debug TARGETS += hello-world-service @@ -76,9 +75,12 @@ TARGETS += kata-containers TARGETS += mdadm TARGETS += mei TARGETS += nut-client -TARGETS += nvidia-container-toolkit -TARGETS += nvidia-fabricmanager -TARGETS += nvidia-open-gpu-kernel-modules +TARGETS += nvidia-container-toolkit-lts +TARGETS += nvidia-container-toolkit-production +TARGETS += nvidia-fabricmanager-lts +TARGETS += nvidia-fabricmanager-production +TARGETS += nvidia-open-gpu-kernel-modules-lts +TARGETS += nvidia-open-gpu-kernel-modules-production TARGETS += qemu-guest-agent TARGETS += qlogic-firmware TARGETS += realtek-firmware @@ -93,7 +95,8 @@ TARGETS += vmtoolsd-guest-agent TARGETS += wasmedge TARGETS += xen-guest-agent TARGETS += zfs -NONFREE_TARGETS = nonfree-kmod-nvidia +NONFREE_TARGETS = nonfree-kmod-nvidia-lts +NONFREE_TARGETS += nonfree-kmod-nvidia-production # help menu diff --git a/hack/release.toml b/hack/release.toml 
index aff84482..79643f61 100644 --- a/hack/release.toml +++ b/hack/release.toml @@ -31,6 +31,27 @@ Gvisor now ships an additional runtime using `kvm` as the sandboxing mechanism. title = "Intel Management Engine" description = """ Intel Management Engine (IME) modules is now shipped as a Talos System Extension. +""" + + [notes.nvidia] + title = "NVIDIA Driver and Container Toolkit" + description = """ +The NVIDIA drivers and the container toolkits now ships an LTS and Production version as per https://docs.nvidia.com/datacenter/tesla/drivers/index.html#lifecycle. + +The new extensions are named below: + +* nvidia-container-toolkit-production +* nvidia-container-toolkit-lts +* nvidia-open-gpu-kernel-modules-production +* nvidia-open-gpu-kernel-modules-lts +* nonfree-kmod-nvidia-lts +* nonfree-kmod-nvidia-production + +The extensions would ship the latest version of LTS/Production drivers available at the time of Talos release. + +Image Factory using an existing schematic id would upgrade the NVIDIA driver and container toolkit to the LTS version. + +If production version is required, the schematic id should be updated to the production version. 
""" [notes.updates] diff --git a/nvidia-gpu/nonfree/kmod-nvidia/files/nvidia.conf b/nvidia-gpu/nonfree/kmod-nvidia/lts/files/nvidia.conf similarity index 100% rename from nvidia-gpu/nonfree/kmod-nvidia/files/nvidia.conf rename to nvidia-gpu/nonfree/kmod-nvidia/lts/files/nvidia.conf diff --git a/nvidia-gpu/nonfree/kmod-nvidia/manifest.yaml b/nvidia-gpu/nonfree/kmod-nvidia/lts/manifest.yaml similarity index 88% rename from nvidia-gpu/nonfree/kmod-nvidia/manifest.yaml rename to nvidia-gpu/nonfree/kmod-nvidia/lts/manifest.yaml index adcde8d7..0571c0a9 100644 --- a/nvidia-gpu/nonfree/kmod-nvidia/manifest.yaml +++ b/nvidia-gpu/nonfree/kmod-nvidia/lts/manifest.yaml @@ -1,6 +1,6 @@ version: v1alpha1 metadata: - name: nonfree-kmod-nvidia + name: nonfree-kmod-nvidia-lts version: "$VERSION" author: Sidero Labs description: | diff --git a/nvidia-gpu/nonfree/kmod-nvidia/pkg.yaml b/nvidia-gpu/nonfree/kmod-nvidia/lts/pkg.yaml similarity index 88% rename from nvidia-gpu/nonfree/kmod-nvidia/pkg.yaml rename to nvidia-gpu/nonfree/kmod-nvidia/lts/pkg.yaml index a23a186e..9b96f0ff 100644 --- a/nvidia-gpu/nonfree/kmod-nvidia/pkg.yaml +++ b/nvidia-gpu/nonfree/kmod-nvidia/lts/pkg.yaml @@ -1,11 +1,11 @@ -name: nonfree-kmod-nvidia +name: nonfree-kmod-nvidia-lts variant: scratch shell: /toolchain/bin/bash dependencies: - stage: base # The pkgs version for a particular release of Talos as defined in # https://github.com/siderolabs/talos/blob//pkg/machinery/gendata/data/pkgs - - image: "{{ .BUILD_ARG_PKGS_PREFIX }}/nonfree-kmod-nvidia-pkg:{{ .BUILD_ARG_PKGS }}" + - image: "{{ .BUILD_ARG_PKGS_PREFIX }}/nonfree-kmod-nvidia-lts-pkg:{{ .BUILD_ARG_PKGS }}" steps: - prepare: - | diff --git a/nvidia-gpu/nonfree/kmod-nvidia/vars.yaml b/nvidia-gpu/nonfree/kmod-nvidia/lts/vars.yaml similarity index 62% rename from nvidia-gpu/nonfree/kmod-nvidia/vars.yaml rename to nvidia-gpu/nonfree/kmod-nvidia/lts/vars.yaml index f9870de7..fa402e12 100644 --- a/nvidia-gpu/nonfree/kmod-nvidia/vars.yaml +++ 
b/nvidia-gpu/nonfree/kmod-nvidia/lts/vars.yaml @@ -1,2 +1,2 @@ # the first part is the driver version and the second the talos version for which the module is built against -VERSION: "{{ .NVIDIA_DRIVER_VERSION }}-{{ .BUILD_ARG_TAG }}" +VERSION: "{{ .NVIDIA_DRIVER_LTS_VERSION }}-{{ .BUILD_ARG_TAG }}" diff --git a/nvidia-gpu/nvidia-modules/files/nvidia.conf b/nvidia-gpu/nonfree/kmod-nvidia/production/files/nvidia.conf similarity index 100% rename from nvidia-gpu/nvidia-modules/files/nvidia.conf rename to nvidia-gpu/nonfree/kmod-nvidia/production/files/nvidia.conf diff --git a/nvidia-gpu/nonfree/kmod-nvidia/production/manifest.yaml b/nvidia-gpu/nonfree/kmod-nvidia/production/manifest.yaml new file mode 100644 index 00000000..f6ecfb81 --- /dev/null +++ b/nvidia-gpu/nonfree/kmod-nvidia/production/manifest.yaml @@ -0,0 +1,10 @@ +version: v1alpha1 +metadata: + name: nonfree-kmod-nvidia-production + version: "$VERSION" + author: Sidero Labs + description: | + This system extension provides nvidia proprietary kernel modules built against a specific Talos version. 
+ compatibility: + talos: + version: ">= v1.5.0" diff --git a/nvidia-gpu/nonfree/kmod-nvidia/production/pkg.yaml b/nvidia-gpu/nonfree/kmod-nvidia/production/pkg.yaml new file mode 100644 index 00000000..24f95bae --- /dev/null +++ b/nvidia-gpu/nonfree/kmod-nvidia/production/pkg.yaml @@ -0,0 +1,31 @@ +name: nonfree-kmod-nvidia-production +variant: scratch +shell: /toolchain/bin/bash +dependencies: + - stage: base +# The pkgs version for a particular release of Talos as defined in +# https://github.com/siderolabs/talos/blob//pkg/machinery/gendata/data/pkgs + - image: "{{ .BUILD_ARG_PKGS_PREFIX }}/nonfree-kmod-nvidia-production-pkg:{{ .BUILD_ARG_PKGS }}" +steps: + - prepare: + - | + sed -i 's#$VERSION#{{ .VERSION }}#' /pkg/manifest.yaml + - install: + - | + mkdir -p /rootfs/lib/modules \ + /rootfs/usr/local/lib/modprobe.d + + cp /pkg/files/nvidia.conf /rootfs/usr/local/lib/modprobe.d/nvidia.conf + + cp -R /lib/modules/* /rootfs/lib/modules + test: + - | + mkdir -p /extensions-validator-rootfs + cp -r /rootfs/ /extensions-validator-rootfs/rootfs + cp /pkg/manifest.yaml /extensions-validator-rootfs/manifest.yaml + /extensions-validator validate --rootfs=/extensions-validator-rootfs --pkg-name="${PKG_NAME}" +finalize: + - from: /rootfs + to: /rootfs + - from: /pkg/manifest.yaml + to: / diff --git a/nvidia-gpu/nonfree/kmod-nvidia/production/vars.yaml b/nvidia-gpu/nonfree/kmod-nvidia/production/vars.yaml new file mode 100644 index 00000000..b377850f --- /dev/null +++ b/nvidia-gpu/nonfree/kmod-nvidia/production/vars.yaml @@ -0,0 +1,2 @@ +# the first part is the driver version and the second the talos version for which the module is built against +VERSION: "{{ .NVIDIA_DRIVER_PRODUCTION_VERSION }}-{{ .BUILD_ARG_TAG }}" diff --git a/nvidia-gpu/nvidia-container-toolkit/manifest.yaml b/nvidia-gpu/nvidia-container-toolkit/lts/manifest.yaml similarity index 87% rename from nvidia-gpu/nvidia-container-toolkit/manifest.yaml rename to 
nvidia-gpu/nvidia-container-toolkit/lts/manifest.yaml index b4166ab8..a93c078a 100644 --- a/nvidia-gpu/nvidia-container-toolkit/manifest.yaml +++ b/nvidia-gpu/nvidia-container-toolkit/lts/manifest.yaml @@ -1,6 +1,6 @@ version: v1alpha1 metadata: - name: nvidia-container-toolkit + name: nvidia-container-toolkit-lts version: "$VERSION" author: Sidero Labs description: | diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-persistenced.yaml b/nvidia-gpu/nvidia-container-toolkit/lts/nvidia-persistenced.yaml similarity index 100% rename from nvidia-gpu/nvidia-container-toolkit/nvidia-persistenced.yaml rename to nvidia-gpu/nvidia-container-toolkit/lts/nvidia-persistenced.yaml diff --git a/nvidia-gpu/nvidia-container-toolkit/pkg.yaml b/nvidia-gpu/nvidia-container-toolkit/lts/pkg.yaml similarity index 92% rename from nvidia-gpu/nvidia-container-toolkit/pkg.yaml rename to nvidia-gpu/nvidia-container-toolkit/lts/pkg.yaml index 2b01677d..7734674e 100644 --- a/nvidia-gpu/nvidia-container-toolkit/pkg.yaml +++ b/nvidia-gpu/nvidia-container-toolkit/lts/pkg.yaml @@ -1,9 +1,9 @@ -name: nvidia-container-toolkit +name: nvidia-container-toolkit-lts variant: scratch shell: /toolchain/bin/bash dependencies: - stage: base - - stage: nvidia-container-cli + - stage: nvidia-container-cli-lts - stage: elfutils - stage: zlib - stage: libcap diff --git a/nvidia-gpu/nvidia-container-toolkit/vars.yaml b/nvidia-gpu/nvidia-container-toolkit/lts/vars.yaml similarity index 52% rename from nvidia-gpu/nvidia-container-toolkit/vars.yaml rename to nvidia-gpu/nvidia-container-toolkit/lts/vars.yaml index 93f875cb..38963327 100644 --- a/nvidia-gpu/nvidia-container-toolkit/vars.yaml +++ b/nvidia-gpu/nvidia-container-toolkit/lts/vars.yaml @@ -1,2 +1,2 @@ # the first part is the driver version and the second the container-toolkit version -VERSION: "{{ .NVIDIA_DRIVER_VERSION }}-{{ .CONTAINER_TOOLKIT_VERSION }}" +VERSION: "{{ .NVIDIA_DRIVER_LTS_VERSION }}-{{ .CONTAINER_TOOLKIT_VERSION }}" diff --git 
a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/pkg.yaml b/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/lts/pkg.yaml similarity index 80% rename from nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/pkg.yaml rename to nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/lts/pkg.yaml index 2b1a08d1..874f74f7 100644 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/pkg.yaml +++ b/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/lts/pkg.yaml @@ -1,4 +1,4 @@ -name: nvidia-container-cli +name: nvidia-container-cli-lts variant: scratch shell: /bin/bash install: @@ -15,7 +15,7 @@ dependencies: # nvidia-pkgs depends on glibc, # so any stage depending on nvidia-container-cli will have the updated ld.so.cache, # from both nvidia-pkgs and nvidia-container-cli - - stage: nvidia-pkgs + - stage: nvidia-pkgs-lts - stage: libseccomp from: /rootfs - stage: libcap @@ -28,10 +28,10 @@ dependencies: from: /rootfs steps: - sources: - - url: https://gitlab.com/nvidia/container-toolkit/libnvidia-container/-/archive/{{ .LIBNVIDIA_CONTAINER_VERSION }}/libnvidia-container-{{ .LIBNVIDIA_CONTAINER_VERSION }}.tar.gz + - url: https://github.com/NVIDIA/libnvidia-container/archive/refs/tags/{{ .LIBNVIDIA_CONTAINER_VERSION }}.tar.gz destination: libnvidia-container.tar.gz - sha256: d23984591004c59c33f6f13c8237f1fb84113b8eddb0f9943302df4c3b0cc549 - sha512: a5a75b0cd29cf7c0484dbd650456c93bb495a0fe5449d6b8c7680af7509be3b9e1f12ab437b56309bfb4b66cfe2868b4adbe882e29b169c7733c0247ecf2489b + sha256: cbc1dda7ee90b8b729c5f178292cd07b421863015d84b84c37e69c8d580ab3ff + sha512: b304c284c5ab0c3544362307dc16ffcca8d34497e4356a520dc6da81a86a62b2a262b528cba559bb0d7a3addf018c3b50b6cb78669c82c1b4acae159e5922548 env: SOURCE_DATE_EPOCH: {{ .BUILD_ARG_SOURCE_DATE_EPOCH }} REVISION: {{ .LIBNVIDIA_CONTAINER_REF }} diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/production/pkg.yaml 
b/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/production/pkg.yaml new file mode 100644 index 00000000..347fa269 --- /dev/null +++ b/nvidia-gpu/nvidia-container-toolkit/nvidia-container-cli/production/pkg.yaml @@ -0,0 +1,67 @@ +name: nvidia-container-cli-production +variant: scratch +shell: /bin/bash +install: + - build-base + - bash + - go + - coreutils + - sed + - curl + - rpcsvc-proto + - patch +dependencies: + - image: cgr.dev/chainguard/wolfi-base@{{ .WOLFI_BASE_REF }} + # nvidia-pkgs depends on glibc, + # so any stage depending on nvidia-container-cli will have the updated ld.so.cache, + # from both nvidia-pkgs and nvidia-container-cli + - stage: nvidia-pkgs-production + - stage: libseccomp + from: /rootfs + - stage: libcap + from: /rootfs + - stage: elfutils + from: /rootfs + - stage: zlib + from: /rootfs + - stage: libtirpc + from: /rootfs +steps: + - sources: + - url: https://github.com/NVIDIA/libnvidia-container/archive/refs/tags/{{ .LIBNVIDIA_CONTAINER_VERSION }}.tar.gz + destination: libnvidia-container.tar.gz + sha256: cbc1dda7ee90b8b729c5f178292cd07b421863015d84b84c37e69c8d580ab3ff + sha512: b304c284c5ab0c3544362307dc16ffcca8d34497e4356a520dc6da81a86a62b2a262b528cba559bb0d7a3addf018c3b50b6cb78669c82c1b4acae159e5922548 + env: + SOURCE_DATE_EPOCH: {{ .BUILD_ARG_SOURCE_DATE_EPOCH }} + REVISION: {{ .LIBNVIDIA_CONTAINER_REF }} + LIB_VERSION: {{ .LIBNVIDIA_CONTAINER_VERSION | replace "v" "" }} + WITH_NVCGO: yes + WITH_LIBELF: yes + WITH_TIRPC: no # setting no means we'll use the system libtirpc + WITH_SECCOMP: yes + PKG_CONFIG_PATH: /usr/local/glibc/lib/pkgconfig # to find runtime libraries compiled in extensions (libseccomp) + PATH: "/usr/bin:{{ .PATH }}" # bldr doesn't have /usr/bin in PATH + prepare: + - | + mkdir libnvidia-container + tar -xzf libnvidia-container.tar.gz --strip-components=1 -C libnvidia-container + build: + - | + cd libnvidia-container + + # LDLIBS=-L/usr/local/glibc/lib is set so that libnvidia-container-cli libs which are 
hardcoded as -llibname and not using pkg-config + CPPFLAGS="-I/usr/local/glibc/include/tirpc" LDLIBS="-L/usr/local/glibc/lib -ltirpc -lelf -lseccomp" LDFLAGS='-Wl,--rpath=\$$ORIGIN/../glibc/\$$LIB' make + install: + - | + mkdir -p /rootfs + + cd libnvidia-container + + make install DESTDIR=/rootfs + + # run ldconfig to update the cache + /rootfs/usr/local/glibc/sbin/ldconfig -r /rootfs +finalize: + - from: /rootfs + to: /rootfs diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime/pkg.yaml b/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime/pkg.yaml index 368b8dcb..3b238243 100644 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime/pkg.yaml +++ b/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime/pkg.yaml @@ -10,10 +10,10 @@ dependencies: - image: cgr.dev/chainguard/wolfi-base@{{ .WOLFI_BASE_REF }} steps: - sources: - - url: https://gitlab.com/nvidia/container-toolkit/container-toolkit/-/archive/{{ .CONTAINER_TOOLKIT_VERSION }}/container-toolkit-{{ .CONTAINER_TOOLKIT_VERSION }}.tar.gz + - url: https://github.com/NVIDIA/nvidia-container-toolkit/archive/refs/tags/{{ .CONTAINER_TOOLKIT_VERSION }}.tar.gz destination: container-toolkit.tar.gz - sha256: b006700e31ed1475ed25695770cab10d74fdac55cdb94e66d70468740482fb53 - sha512: 11ceffddb164194d0f10c60aeec2c1e20c699a6f3cb1887bca8f49496c9fda869c6c65f1f5f8e816467abee43da002fe2922b8e68ba8f6e61d30f635509da5e0 + sha256: 38a193444e0342c0a2c0d3664403e2c341eb77f1461b3f9172fd93c04de82165 + sha512: 691d4fc47ea60b730ec491b333aa8118bcfd62cdab20a42b84155c6a13484d920e758435b5029bbae4fbefce82352aa5764f1554992682f689c95615809fb83c env: GIT_COMMIT: {{ substr 0 7 .CONTAINER_TOOLKIT_REF }} # build is using short sha prepare: diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/files/15-nvidia-device.rules b/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/lts/files/15-nvidia-device.rules similarity index 100% rename from 
nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/files/15-nvidia-device.rules rename to nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/lts/files/15-nvidia-device.rules diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/lts/pkg.yaml b/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/lts/pkg.yaml new file mode 100644 index 00000000..7568662a --- /dev/null +++ b/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/lts/pkg.yaml @@ -0,0 +1,98 @@ +name: nvidia-pkgs-lts +variant: scratch +shell: /bin/bash +install: + - bash +dependencies: + - image: cgr.dev/chainguard/wolfi-base@{{ .WOLFI_BASE_REF }} + # depends on glibc to update ld.so.cache + # so any stage depending on nvidia-pkgs will have the updated cache + - stage: glibc +steps: + - sources: + # {{ if eq .ARCH "aarch64" }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr + - url: https://developer.download.nvidia.com/compute/nvidia-driver/redist/nvidia_driver/linux-sbsa/nvidia_driver-linux-sbsa-{{ .NVIDIA_DRIVER_LTS_VERSION }}-archive.tar.xz + destination: nvidia.tar.xz + sha256: 970be3ae71332ca008f3e6589ae44a70aeffb9e29382980114e47b8fce7790d1 + sha512: bd730a51a77d897509381ecb22eb21a9f4e0c2419288f1c1c26f8ef00e887b1cc09718d1d4c9d613912560e48185ff03ea221865be5c0e590a20868c45a8ea00 + # {{ else }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr + - url: https://developer.download.nvidia.com/compute/nvidia-driver/redist/nvidia_driver/linux-x86_64/nvidia_driver-linux-x86_64-{{ .NVIDIA_DRIVER_LTS_VERSION }}-archive.tar.xz + destination: nvidia.tar.xz + sha256: e66527c5c016d0bee9050a7a8573e38be86aad58adee2f40e808c88a4d0c6e90 + sha512: 71624903e9d57a3f8a5dc7ffb2435991fe787b0609096e0e146d03ffef54bdb145940e8717510aa87cd6407c860e22938c840c126db7d4469c265f202db35e18 + # {{ end }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr + prepare: + - | + # the nvidia installer validates these packages are installed + ln -s /bin/true 
/bin/modprobe + ln -s /bin/true /bin/rmmod + ln -s /bin/true /bin/lsmod + ln -s /bin/true /bin/depmod + + tar xf nvidia.tar.xz --strip-components=1 + install: + - | + mkdir -p assets/{html,libglvnd_install_checker} + + cp -r bin/* assets/ + cp CHANGELOG assets/NVIDIA_Changelog + cp -r docs/* assets/html/ + cp -r etc/* assets/ + cp -r firmware assets/ + cp -r lib/* assets/ + cp LICENSE assets/ + cp -r man/man1/* assets/ + cp MANIFEST assets/.manifest + cp README assets/README.txt + cp -r sbin/* assets/ + cp -r share/* assets/ + cp -r supported-gpus assets/ + cp -r systemd assets/ + cp -r tests/glvnd/* assets/libglvnd_install_checker + # {{ if eq .ARCH "x86_64" }}cp -r wine/* assets/{{ end }} + + cd assets + + ./nvidia-installer --silent \ + --opengl-prefix=/rootfs/usr/local \ + --utility-prefix=/rootfs/usr/local \ + --utility-libdir=glibc/lib \ + --documentation-prefix=/rootfs/usr/local \ + --no-rpms \ + --no-kernel-modules \ + --log-file-name=/tmp/nvidia-installer.log \ + --no-distro-scripts \ + --no-wine-files \ + --no-kernel-module-source \ + --no-check-for-alternate-installs \ + --override-file-type-destination=NVIDIA_MODPROBE:/rootfs/usr/local/bin \ + --override-file-type-destination=FIRMWARE:/rootfs/lib/firmware/nvidia/{{ .NVIDIA_DRIVER_LTS_VERSION }} \ + --no-systemd \ + # {{ if eq .ARCH "x86_64" }}--no-install-compat32-libs{{ end }} + + # copy vulkan/OpenGL json files + mkdir -p /rootfs/{etc/vulkan,usr/share/{glvnd,egl}} + + cp -r /usr/share/glvnd/* /rootfs/usr/share/glvnd + cp -r /usr/share/egl/* /rootfs/usr/share/egl + cp -r /etc/vulkan/* /rootfs/etc/vulkan + + # mv over files from /usr/local/lib -> /usr/local/glibc/lib + mv /rootfs/usr/local/lib/* /rootfs/usr/local/glibc/lib/ + + # copy xorg files + mkdir -p /rootfs/usr/local/glibc/lib/nvidia/xorg + find /usr/lib/xorg/modules -type f -exec cp {} /rootfs/usr/local/glibc/lib/nvidia/xorg \; + + # run ldconfig to update the cache + /rootfs/usr/local/glibc/sbin/ldconfig -r /rootfs + + mkdir -p 
/rootfs/usr/local/lib/containers/nvidia-persistenced \ + /rootfs/usr/local/etc/containers \ + /rootfs/usr/etc/udev/rules.d + + # copy udev rule + cp /pkg/files/15-nvidia-device.rules /rootfs/usr/etc/udev/rules.d +finalize: + - from: /rootfs + to: /rootfs diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/production/files/15-nvidia-device.rules b/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/production/files/15-nvidia-device.rules new file mode 100644 index 00000000..9277b3cb --- /dev/null +++ b/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/production/files/15-nvidia-device.rules @@ -0,0 +1,5 @@ +# This will create the device nvidia device nodes +ACTION=="add", DEVPATH=="/bus/pci/drivers/nvidia", RUN+="/usr/local/bin/nvidia-modprobe -c 0" + +# Create the device node for the nvidia-uvm module +ACTION=="add", DEVPATH=="/module/nvidia_uvm", SUBSYSTEM=="module", RUN+="/usr/local/bin/nvidia-modprobe -c 0 -u" diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/pkg.yaml b/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/production/pkg.yaml similarity index 59% rename from nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/pkg.yaml rename to nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/production/pkg.yaml index 6d3ebb55..b9b43927 100644 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/pkg.yaml +++ b/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/production/pkg.yaml @@ -1,4 +1,4 @@ -name: nvidia-pkgs +name: nvidia-pkgs-production variant: scratch shell: /bin/bash install: @@ -11,15 +11,15 @@ dependencies: steps: - sources: # {{ if eq .ARCH "aarch64" }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr - - url: https://download.nvidia.com/XFree86/Linux-aarch64/{{ .NVIDIA_DRIVER_VERSION }}/NVIDIA-Linux-aarch64-{{ .NVIDIA_DRIVER_VERSION }}.run - destination: nvidia.run - sha256: 8ba8d961457a241bcdf91b76d6fe2f36cb473c8bbdb02fb6650a622ce2e85b33 - sha512: 
706de7e53b81f909d8bc6a12a39c594754a164c49f5d23c7939dc3abcfc04f5d5b12b7d65762ae574582149a098f06ee5fe95be4f8ad1056a3307a6ce93f3c00 + - url: https://developer.download.nvidia.com/compute/nvidia-driver/redist/nvidia_driver/linux-sbsa/nvidia_driver-linux-sbsa-{{ .NVIDIA_DRIVER_PRODUCTION_VERSION }}-archive.tar.xz + destination: nvidia.tar.xz + sha256: dd2892ac0c97abe69dd9ccb5e09d2fd5b5ce010c64ce5eb0950a0f6fceb9b4dc + sha512: 9c1466d9ea09a01dda4de0a2b3270cc6a5093636554eadfb58c3e2957e053592f7d628c3d5b31dbb36702e187561cb7f955e9bf2ddb1adb28e7ca4568d39a0f0 # {{ else }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr - - url: https://download.nvidia.com/XFree86/Linux-x86_64/{{ .NVIDIA_DRIVER_VERSION }}/NVIDIA-Linux-x86_64-{{ .NVIDIA_DRIVER_VERSION }}-no-compat32.run - destination: nvidia.run - sha256: ffed07a30323fd6cf9caad3fb45e6259223135f6004d832511921a788f719ba6 - sha512: f75c288b27a17ea8c63dac68cda01b94184b41332778df6a702d30d814c407c1e45f30bd7c81511508ace6560a16e79c24e8698f457aaee3ee1d03c57725ab27 + - url: https://developer.download.nvidia.com/compute/nvidia-driver/redist/nvidia_driver/linux-x86_64/nvidia_driver-linux-x86_64-{{ .NVIDIA_DRIVER_PRODUCTION_VERSION }}-archive.tar.xz + destination: nvidia.tar.xz + sha256: 7959e9e0e15863c9242f8a0bda0b3b67b39701956890ff159961f59e89f92158 + sha512: 89a4249bce2c15af56911afa6998c355d6522e2e7493e80ed9241a9d5009ccf2522bf7bceffc03673600bbfd0d89f3a46a3c21fb0f4977e6dc674648b4c6caea # {{ end }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr prepare: - | @@ -29,10 +29,28 @@ steps: ln -s /bin/true /bin/lsmod ln -s /bin/true /bin/depmod - bash nvidia.run --extract-only + tar xf nvidia.tar.xz --strip-components=1 install: - | - cd NVIDIA-Linux-* + mkdir -p assets/html + + cp -r bin/* assets/ + cp CHANGELOG assets/NVIDIA_Changelog + cp -r docs/* assets/html/ + cp -r etc/* assets/ + cp -r firmware assets/ + cp -r lib/* assets/ + cp LICENSE assets/ + cp -r man/man1/* assets/ + 
cp MANIFEST assets/.manifest + cp README assets/README.txt + cp -r sbin/* assets/ + cp -r share/* assets/ + cp -r supported-gpus assets/ + cp -r systemd assets/ + # {{ if eq .ARCH "x86_64" }}cp -r wine/* assets/{{ end }} + + cd assets ./nvidia-installer --silent \ --opengl-prefix=/rootfs/usr/local \ @@ -47,11 +65,11 @@ steps: --no-kernel-module-source \ --no-check-for-alternate-installs \ --override-file-type-destination=NVIDIA_MODPROBE:/rootfs/usr/local/bin \ - --override-file-type-destination=FIRMWARE:/rootfs/lib/firmware/nvidia/{{ .NVIDIA_DRIVER_VERSION }} \ + --override-file-type-destination=FIRMWARE:/rootfs/lib/firmware/nvidia/{{ .NVIDIA_DRIVER_PRODUCTION_VERSION }} \ --no-systemd # copy vulkan/OpenGL json files - mkdir -p /rootfs/{etc/vulkan,usr/{lib/xorg,share/{glvnd,egl}}} + mkdir -p /rootfs/{etc/vulkan,usr/share/{glvnd,egl}} cp -r /usr/share/glvnd/* /rootfs/usr/share/glvnd cp -r /usr/share/egl/* /rootfs/usr/share/egl diff --git a/nvidia-gpu/nvidia-container-toolkit/production/manifest.yaml b/nvidia-gpu/nvidia-container-toolkit/production/manifest.yaml new file mode 100644 index 00000000..49fc70ab --- /dev/null +++ b/nvidia-gpu/nvidia-container-toolkit/production/manifest.yaml @@ -0,0 +1,10 @@ +version: v1alpha1 +metadata: + name: nvidia-container-toolkit-production + version: "$VERSION" + author: Sidero Labs + description: | + This system extension provides nvidia runtime and it's dependencies using NVIDIA's runtime handler. 
+ compatibility: + talos: + version: ">= v1.2.0" diff --git a/nvidia-gpu/nvidia-container-toolkit/production/nvidia-persistenced.yaml b/nvidia-gpu/nvidia-container-toolkit/production/nvidia-persistenced.yaml new file mode 100644 index 00000000..ffd99967 --- /dev/null +++ b/nvidia-gpu/nvidia-container-toolkit/production/nvidia-persistenced.yaml @@ -0,0 +1,55 @@ +# https://download.nvidia.com/XFree86/Linux-x86_64/515.65.01/README/nvidia-persistenced.html +name: nvidia-persistenced +container: + entrypoint: /usr/local/bin/nvidia-persistenced-wrapper + mounts: + # device files + - source: /dev + destination: /dev + type: bind + options: + - rshared + - rbind + - rw + # shared libraries + - source: /lib64 + destination: /lib64 + type: bind + options: + - bind + - ro + # shared libraries + - source: /usr/local/glibc + destination: /usr/local/glibc + type: bind + options: + - bind + - ro + # nvidia libraries + - source: /usr/local/lib + destination: /usr/local/lib + type: bind + options: + - bind + - ro + # service state file + - source: /var/run + destination: /var/run + type: bind + options: + - rshared + - rbind + - rw + # binaries + - source: /usr/local/bin + destination: /usr/local/bin + type: bind + options: + - bind + - ro +depends: + - service: cri + # we need to depend on udevd so that the nvidia device files are created + - service: udevd + - path: /sys/bus/pci/drivers/nvidia +restart: always diff --git a/nvidia-gpu/nvidia-container-toolkit/production/pkg.yaml b/nvidia-gpu/nvidia-container-toolkit/production/pkg.yaml new file mode 100644 index 00000000..d672eb60 --- /dev/null +++ b/nvidia-gpu/nvidia-container-toolkit/production/pkg.yaml @@ -0,0 +1,34 @@ +name: nvidia-container-toolkit-production +variant: scratch +shell: /toolchain/bin/bash +dependencies: + - stage: base + - stage: nvidia-container-cli-production + - stage: elfutils + - stage: zlib + - stage: libcap + - stage: libseccomp + - stage: libtirpc + - stage: nvidia-container-runtime + - stage: 
nvidia-container-runtime-wrapper + - stage: nvidia-persistenced-wrapper +steps: + - prepare: + - | + sed -i 's#$VERSION#{{ .VERSION }}#' /pkg/manifest.yaml + install: + - | + mkdir -p /rootfs + test: + - | + mkdir -p /extensions-validator-rootfs + cp -r /rootfs/ /extensions-validator-rootfs/rootfs + cp /pkg/manifest.yaml /extensions-validator-rootfs/manifest.yaml + /extensions-validator validate --rootfs=/extensions-validator-rootfs --pkg-name="${PKG_NAME}" +finalize: + - from: /rootfs + to: /rootfs + - from: /pkg/nvidia-persistenced.yaml + to: /rootfs/usr/local/etc/containers/nvidia-persistenced.yaml + - from: /pkg/manifest.yaml + to: / diff --git a/nvidia-gpu/nvidia-container-toolkit/production/vars.yaml b/nvidia-gpu/nvidia-container-toolkit/production/vars.yaml new file mode 100644 index 00000000..58e7f164 --- /dev/null +++ b/nvidia-gpu/nvidia-container-toolkit/production/vars.yaml @@ -0,0 +1,2 @@ +# the first part is the driver version and the second the container-toolkit version +VERSION: "{{ .NVIDIA_DRIVER_PRODUCTION_VERSION }}-{{ .CONTAINER_TOOLKIT_VERSION }}" diff --git a/nvidia-gpu/nvidia-fabricmanager/manifest.yaml b/nvidia-gpu/nvidia-fabricmanager/lts/manifest.yaml similarity index 87% rename from nvidia-gpu/nvidia-fabricmanager/manifest.yaml rename to nvidia-gpu/nvidia-fabricmanager/lts/manifest.yaml index 6bb3008c..b1ba3303 100644 --- a/nvidia-gpu/nvidia-fabricmanager/manifest.yaml +++ b/nvidia-gpu/nvidia-fabricmanager/lts/manifest.yaml @@ -1,6 +1,6 @@ version: v1alpha1 metadata: - name: nvidia-fabricmanager + name: nvidia-fabricmanager-lts version: "$VERSION" author: Sidero Labs description: | diff --git a/nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager.yaml b/nvidia-gpu/nvidia-fabricmanager/lts/nvidia-fabricmanager.yaml similarity index 100% rename from nvidia-gpu/nvidia-fabricmanager/nvidia-fabricmanager.yaml rename to nvidia-gpu/nvidia-fabricmanager/lts/nvidia-fabricmanager.yaml diff --git a/nvidia-gpu/nvidia-fabricmanager/pkg.yaml 
b/nvidia-gpu/nvidia-fabricmanager/lts/pkg.yaml similarity index 82% rename from nvidia-gpu/nvidia-fabricmanager/pkg.yaml rename to nvidia-gpu/nvidia-fabricmanager/lts/pkg.yaml index e8335091..81732f2d 100644 --- a/nvidia-gpu/nvidia-fabricmanager/pkg.yaml +++ b/nvidia-gpu/nvidia-fabricmanager/lts/pkg.yaml @@ -1,4 +1,4 @@ -name: nvidia-fabricmanager +name: nvidia-fabricmanager-lts variant: scratch shell: /toolchain/bin/bash dependencies: @@ -6,15 +6,15 @@ dependencies: steps: - sources: # {{ if eq .ARCH "aarch64" }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr - - url: https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-sbsa/fabricmanager-linux-sbsa-{{ .NVIDIA_DRIVER_VERSION }}-archive.tar.xz + - url: https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-sbsa/fabricmanager-linux-sbsa-{{ .NVIDIA_DRIVER_LTS_VERSION }}-archive.tar.xz destination: fabricmanager.tar.xz - sha256: d0fbe665669a3b68d138bec0edcc4920866935171bf12c24470328d10ca2403b - sha512: c705cc208225b8b83c91cca4a9e363c4862c0fb726e95fd68dd5e6a269620da0f1272138102c12b061c4b3ff20ceee4e35abb3bf8af4adbe1de9411ddec82f6a + sha256: 235ed7e0a55215ec4d0467fe73f71445622debca87bdb990bf582e022d38d699 + sha512: c1d4b8983e274be5c881664e44ba558e0d7c92560a9058adaa158f5a88df2e40b6b4b95c555accd672e9aa366b2e9c0b945d5d9f320150791aea844b07db5bf0 # {{ else }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr - - url: https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-x86_64/fabricmanager-linux-x86_64-{{ .NVIDIA_DRIVER_VERSION }}-archive.tar.xz + - url: https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-x86_64/fabricmanager-linux-x86_64-{{ .NVIDIA_DRIVER_LTS_VERSION }}-archive.tar.xz destination: fabricmanager.tar.xz - sha256: e4a4584be24b5408439019fc67b7b4b89bd42d0cba752a709d8b1b071c3b3318 - sha512: 
a9bb0bb7d52b576378f1d767c5b801c6421390aa5d2acc40e2bc2b264c1d1f41b2cd5b166a00e4893654975a20e68a8597c15e467ebaade137fa18a6015609ab + sha256: a4d1ead61c684d2b83edbedbb09869b9cfa7a83838ea8210985a519fa36c9834 + sha512: 1cd3a4662da46210695a759be6f962754d5168a72e1376b1a9d8464f80829519a0fc4498474009a426264ef17a8569a587e37560760a599068794705cf3fdcc7 # {{ end }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr prepare: - | diff --git a/nvidia-gpu/nvidia-fabricmanager/lts/vars.yaml b/nvidia-gpu/nvidia-fabricmanager/lts/vars.yaml new file mode 100644 index 00000000..a948993c --- /dev/null +++ b/nvidia-gpu/nvidia-fabricmanager/lts/vars.yaml @@ -0,0 +1 @@ +VERSION: "{{ .NVIDIA_DRIVER_LTS_VERSION }}" diff --git a/nvidia-gpu/nvidia-fabricmanager/production/lts/manifest.yaml b/nvidia-gpu/nvidia-fabricmanager/production/lts/manifest.yaml new file mode 100644 index 00000000..61ccc162 --- /dev/null +++ b/nvidia-gpu/nvidia-fabricmanager/production/lts/manifest.yaml @@ -0,0 +1,10 @@ +version: v1alpha1 +metadata: + name: nvidia-fabricmanager-production + version: "$VERSION" + author: Sidero Labs + description: | + This system extension provides the Nvidia fabricmanager for GPU's that need NVLink support. 
+ compatibility: + talos: + version: ">= v1.2.0" diff --git a/nvidia-gpu/nvidia-fabricmanager/production/lts/nvidia-fabricmanager.yaml b/nvidia-gpu/nvidia-fabricmanager/production/lts/nvidia-fabricmanager.yaml new file mode 100644 index 00000000..f252c5af --- /dev/null +++ b/nvidia-gpu/nvidia-fabricmanager/production/lts/nvidia-fabricmanager.yaml @@ -0,0 +1,74 @@ +# https://docs.nvidia.com/datacenter/tesla/pdf/fabric-manager-user-guide.pdf +name: nvidia-fabricmanager +container: + entrypoint: /usr/local/bin/nv-fabricmanager + args: + - --config + - /usr/local/share/nvidia/nvswitch/fabricmanager.cfg + mounts: + # device files + - source: /dev + destination: /dev + type: bind + options: + - rshared + - rbind + - rw + # shared libraries + - source: /lib64 + destination: /lib64 + type: bind + options: + - bind + - ro + # shared libraries + - source: /usr/local/glibc + destination: /usr/local/glibc + type: bind + options: + - bind + - ro + # nvidia libraries + - source: /usr/local/lib + destination: /usr/local/lib + type: bind + options: + - bind + - ro + # service state file + - source: /var/run/nvidia-fabricmanager + destination: /var/run/nvidia-fabricmanager + type: bind + options: + - rshared + - rbind + - rw + # log files + - source: /var/log + destination: /var/log + type: bind + options: + - rshared + - rbind + - rw + # fabric topology files + - source: /usr/local/share/nvidia/nvswitch + destination: /usr/local/share/nvidia/nvswitch + type: bind + options: + - rshared + - rbind + - ro + # binaries + - source: /usr/local/bin + destination: /usr/local/bin + type: bind + options: + - bind + - ro +depends: + - service: cri + # we need to depend on udevd so that the nvidia device files are created + - service: udevd + - path: /sys/bus/pci/drivers/nvidia +restart: always diff --git a/nvidia-gpu/nvidia-fabricmanager/production/lts/pkg.yaml b/nvidia-gpu/nvidia-fabricmanager/production/lts/pkg.yaml new file mode 100644 index 00000000..9a7ac4ba --- /dev/null +++ 
b/nvidia-gpu/nvidia-fabricmanager/production/lts/pkg.yaml @@ -0,0 +1,59 @@ +name: nvidia-fabricmanager-production +variant: scratch +shell: /toolchain/bin/bash +dependencies: + - stage: base +steps: + - sources: + # {{ if eq .ARCH "aarch64" }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr + - url: https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-sbsa/fabricmanager-linux-sbsa-{{ .NVIDIA_DRIVER_PRODUCTION_VERSION }}-archive.tar.xz + destination: fabricmanager.tar.xz + sha256: c9760f6f1c582fd43ea15b93ca66dd368459432a39338f648c585d32514ab6d6 + sha512: 9b0b6b7c8ce19f5a7408d338ebf123b2a5a8184d20590ee82744f44b182fbc555c43b2278cade063836493f2162cdfa2d984466a05956c95cbe4f0c172589422 + # {{ else }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr + - url: https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-x86_64/fabricmanager-linux-x86_64-{{ .NVIDIA_DRIVER_PRODUCTION_VERSION }}-archive.tar.xz + destination: fabricmanager.tar.xz + sha256: 034c04ca2a6ce6a5d49bf293b969618609c90470e620fee97ec76cac1f4471f7 + sha512: 6af90b415d82e448d81416daa36cb4588be6f796f53a3e04a1466a32c46212598ba3c60a96b4c066cde7af881a41f88f4f2015c499dedeed3c0d59611e0d6b21 + # {{ end }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr + prepare: + - | + tar -xf fabricmanager.tar.xz --strip-components=1 + + sed -i 's#$VERSION#{{ .VERSION }}#' /pkg/manifest.yaml + install: + - | + mkdir -p /rootfs/usr/local/bin \ + /rootfs/usr/local/lib \ + /rootfs/usr/local/share/nvidia/nvswitch \ + /rootfs/usr/local/lib/containers/nvidia-fabricmanager \ + /rootfs/usr/local/etc/containers + + cp lib/libnvfm.so.1 /rootfs/usr/local/lib/libnvfm.so.1 + ln -s libnvfm.so.1 /rootfs/usr/local/lib/libnvfm.so + + cp bin/nv-fabricmanager /rootfs/usr/local/bin/ + cp bin/nvswitch-audit /rootfs/usr/local/bin/ + + cp share/nvidia/nvswitch/dgx2_hgx2_topology 
/rootfs/usr/local/share/nvidia/nvswitch/ + cp share/nvidia/nvswitch/dgxa100_hgxa100_topology /rootfs/usr/local/share/nvidia/nvswitch/ + + cp etc/fabricmanager.cfg /rootfs/usr/local/share/nvidia/nvswitch/ + + sed -i 's/DAEMONIZE=.*/DAEMONIZE=0/g' /rootfs/usr/local/share/nvidia/nvswitch/fabricmanager.cfg + sed -i 's/STATE_FILE_NAME=.*/STATE_FILE_NAME=\/var\/run\/nvidia-fabricmanager\/fabricmanager.state/g' /rootfs/usr/local/share/nvidia/nvswitch/fabricmanager.cfg + sed -i 's/TOPOLOGY_FILE_PATH=.*/TOPOLOGY_FILE_PATH=\/usr\/local\/share\/nvidia\/nvswitch/g' /rootfs/usr/local/share/nvidia/nvswitch/fabricmanager.cfg + sed -i 's/DATABASE_PATH=.*/DATABASE_PATH=\/usr\/local\/share\/nvidia\/nvswitch/g' /rootfs/usr/local/share/nvidia/nvswitch/fabricmanager.cfg + test: + - | + mkdir -p /extensions-validator-rootfs + cp -r /rootfs/ /extensions-validator-rootfs/rootfs + cp /pkg/manifest.yaml /extensions-validator-rootfs/manifest.yaml + /extensions-validator validate --rootfs=/extensions-validator-rootfs --pkg-name="${PKG_NAME}" +finalize: + - from: /rootfs + to: /rootfs + - from: /pkg/nvidia-fabricmanager.yaml + to: /rootfs/usr/local/etc/containers/nvidia-fabricmanager.yaml + - from: /pkg/manifest.yaml + to: / diff --git a/nvidia-gpu/nvidia-fabricmanager/production/lts/vars.yaml b/nvidia-gpu/nvidia-fabricmanager/production/lts/vars.yaml new file mode 100644 index 00000000..2a8ccd39 --- /dev/null +++ b/nvidia-gpu/nvidia-fabricmanager/production/lts/vars.yaml @@ -0,0 +1 @@ +VERSION: "{{ .NVIDIA_DRIVER_PRODUCTION_VERSION }}" diff --git a/nvidia-gpu/nvidia-fabricmanager/vars.yaml b/nvidia-gpu/nvidia-fabricmanager/vars.yaml deleted file mode 100644 index 4613151f..00000000 --- a/nvidia-gpu/nvidia-fabricmanager/vars.yaml +++ /dev/null @@ -1 +0,0 @@ -VERSION: "{{ .NVIDIA_DRIVER_VERSION }}" diff --git a/nvidia-gpu/nvidia-modules/lts/files/nvidia.conf b/nvidia-gpu/nvidia-modules/lts/files/nvidia.conf new file mode 100644 index 00000000..62b5f931 --- /dev/null +++ 
b/nvidia-gpu/nvidia-modules/lts/files/nvidia.conf @@ -0,0 +1,4 @@ +blacklist nvidia +blacklist nvidia_uvm +blacklist nvidia_drm +blacklist nvidia_modeset diff --git a/nvidia-gpu/nvidia-modules/manifest.yaml b/nvidia-gpu/nvidia-modules/lts/manifest.yaml similarity index 85% rename from nvidia-gpu/nvidia-modules/manifest.yaml rename to nvidia-gpu/nvidia-modules/lts/manifest.yaml index aee804a1..77874823 100644 --- a/nvidia-gpu/nvidia-modules/manifest.yaml +++ b/nvidia-gpu/nvidia-modules/lts/manifest.yaml @@ -1,6 +1,6 @@ version: v1alpha1 metadata: - name: nvidia-open-gpu-kernel-modules + name: nvidia-open-gpu-kernel-modules-lts version: "$VERSION" author: Sidero Labs description: | diff --git a/nvidia-gpu/nvidia-modules/pkg.yaml b/nvidia-gpu/nvidia-modules/lts/pkg.yaml similarity index 92% rename from nvidia-gpu/nvidia-modules/pkg.yaml rename to nvidia-gpu/nvidia-modules/lts/pkg.yaml index 0af3f7d6..6c29c2bc 100644 --- a/nvidia-gpu/nvidia-modules/pkg.yaml +++ b/nvidia-gpu/nvidia-modules/lts/pkg.yaml @@ -1,11 +1,11 @@ -name: nvidia-open-gpu-kernel-modules +name: nvidia-open-gpu-kernel-modules-lts variant: scratch shell: /toolchain/bin/bash dependencies: - stage: base # The pkgs version for a particular release of Talos as defined in # https://github.com/siderolabs/talos/blob/<talos version>/pkg/machinery/gendata/data/pkgs - image: "{{ .BUILD_ARG_PKGS_PREFIX }}/nvidia-open-gpu-kernel-modules-pkg:{{ .BUILD_ARG_PKGS }}" + - image: "{{ .BUILD_ARG_PKGS_PREFIX }}/nvidia-open-gpu-kernel-modules-lts-pkg:{{ .BUILD_ARG_PKGS }}" steps: - prepare: - | diff --git a/nvidia-gpu/nvidia-modules/vars.yaml b/nvidia-gpu/nvidia-modules/lts/vars.yaml similarity index 62% rename from nvidia-gpu/nvidia-modules/vars.yaml rename to nvidia-gpu/nvidia-modules/lts/vars.yaml index f9870de7..fa402e12 100644 --- a/nvidia-gpu/nvidia-modules/vars.yaml +++ b/nvidia-gpu/nvidia-modules/lts/vars.yaml @@ -1,2 +1,2 @@ # the first part is the driver version and the second the talos version for which the module is built
against -VERSION: "{{ .NVIDIA_DRIVER_VERSION }}-{{ .BUILD_ARG_TAG }}" +VERSION: "{{ .NVIDIA_DRIVER_LTS_VERSION }}-{{ .BUILD_ARG_TAG }}" diff --git a/nvidia-gpu/nvidia-modules/production/files/nvidia.conf b/nvidia-gpu/nvidia-modules/production/files/nvidia.conf new file mode 100644 index 00000000..62b5f931 --- /dev/null +++ b/nvidia-gpu/nvidia-modules/production/files/nvidia.conf @@ -0,0 +1,4 @@ +blacklist nvidia +blacklist nvidia_uvm +blacklist nvidia_drm +blacklist nvidia_modeset diff --git a/nvidia-gpu/nvidia-modules/production/manifest.yaml b/nvidia-gpu/nvidia-modules/production/manifest.yaml new file mode 100644 index 00000000..7398546b --- /dev/null +++ b/nvidia-gpu/nvidia-modules/production/manifest.yaml @@ -0,0 +1,10 @@ +version: v1alpha1 +metadata: + name: nvidia-open-gpu-kernel-modules-production + version: "$VERSION" + author: Sidero Labs + description: | + This system extension provides nvidia open source driver kernel modules built against a specific Talos version. + compatibility: + talos: + version: ">= v1.2.0" diff --git a/nvidia-gpu/nvidia-modules/production/pkg.yaml b/nvidia-gpu/nvidia-modules/production/pkg.yaml new file mode 100644 index 00000000..64c374dc --- /dev/null +++ b/nvidia-gpu/nvidia-modules/production/pkg.yaml @@ -0,0 +1,31 @@ +name: nvidia-open-gpu-kernel-modules-production +variant: scratch +shell: /toolchain/bin/bash +dependencies: + - stage: base +# The pkgs version for a particular release of Talos as defined in +# https://github.com/siderolabs/talos/blob/<talos version>/pkg/machinery/gendata/data/pkgs + - image: "{{ .BUILD_ARG_PKGS_PREFIX }}/nvidia-open-gpu-kernel-modules-production-pkg:{{ .BUILD_ARG_PKGS }}" +steps: + - prepare: + - | + sed -i 's#$VERSION#{{ .VERSION }}#' /pkg/manifest.yaml + - install: + - | + mkdir -p /rootfs/lib/modules \ + /rootfs/usr/local/lib/modprobe.d + + cp /pkg/files/nvidia.conf /rootfs/usr/local/lib/modprobe.d/nvidia.conf + + cp -R /lib/modules/* /rootfs/lib/modules + test: + - | + mkdir -p
/extensions-validator-rootfs + cp -r /rootfs/ /extensions-validator-rootfs/rootfs + cp /pkg/manifest.yaml /extensions-validator-rootfs/manifest.yaml + /extensions-validator validate --rootfs=/extensions-validator-rootfs --pkg-name="${PKG_NAME}" +finalize: + - from: /rootfs + to: /rootfs + - from: /pkg/manifest.yaml + to: / diff --git a/nvidia-gpu/nvidia-modules/production/vars.yaml b/nvidia-gpu/nvidia-modules/production/vars.yaml new file mode 100644 index 00000000..b377850f --- /dev/null +++ b/nvidia-gpu/nvidia-modules/production/vars.yaml @@ -0,0 +1,2 @@ +# the first part is the driver version and the second the talos version for which the module is built against +VERSION: "{{ .NVIDIA_DRIVER_PRODUCTION_VERSION }}-{{ .BUILD_ARG_TAG }}" diff --git a/nvidia-gpu/vars.yaml b/nvidia-gpu/vars.yaml index ea93ec29..bed72d9b 100644 --- a/nvidia-gpu/vars.yaml +++ b/nvidia-gpu/vars.yaml @@ -1,12 +1,14 @@ # only update if there's a matching fabric manager version -# renovate: datasource=github-releases depName=nvidia/open-gpu-kernel-modules -NVIDIA_DRIVER_VERSION: 535.129.03 -# renovate: datasource=git-tags depName=https://gitlab.com/nvidia/container-toolkit/container-toolkit.git -CONTAINER_TOOLKIT_VERSION: v1.15.0 -CONTAINER_TOOLKIT_REF: ddeeca392c7bd8b33d0a66400b77af7a97e16cef -# renovate: datasource=git-tags depName=https://gitlab.com/nvidia/container-toolkit/libnvidia-container.git -LIBNVIDIA_CONTAINER_VERSION: v1.15.0 -LIBNVIDIA_CONTAINER_REF: 6c8f1df7fd32cea3280cf2a2c6e931c9b3132465 +# renovate: datasource=github-releases extractVersion=^\d+\.(?<version>\d+\.\d+)$ depName=nvidia/open-gpu-kernel-modules +NVIDIA_DRIVER_LTS_VERSION: 535.183.06 +# renovate: datasource=github-releases extractVersion=^\d+\.(?<version>\d+\.\d+)$ depName=nvidia/open-gpu-kernel-modules +NVIDIA_DRIVER_PRODUCTION_VERSION: 550.90.07 +# renovate: datasource=github-releases depName=nvidia/nvidia-container-toolkit +CONTAINER_TOOLKIT_VERSION: v1.16.1 +CONTAINER_TOOLKIT_REF: a470818ba7d9166be282cd0039dd2fc9b0a34d73 +#
renovate: datasource=git-tags depName=nvidia/libnvidia-container +LIBNVIDIA_CONTAINER_VERSION: v1.16.1 +LIBNVIDIA_CONTAINER_REF: 4c2494f16573b585788a42e9c7bee76ecd48c73d # renovate: datasource=docker versioning=docker depName=cgr.dev/chainguard/wolfi-base WOLFI_BASE_REF: sha256:c9339087a6de501ba6989756aeb1e1c89af82ac0e53c8b1ccd1feb44ec2246d9 # renovate: datasource=git-tags extractVersion=^glibc-(?<version>.*)$ depName=https://sourceware.org/git/glibc.git diff --git a/power/nut-client/pkg.yaml b/power/nut-client/pkg.yaml index cdb775c5..6123bcd9 100644 --- a/power/nut-client/pkg.yaml +++ b/power/nut-client/pkg.yaml @@ -27,8 +27,6 @@ steps: ln -s /toolchain/bin/pkg-config /usr/bin/pkg-config ln -s /toolchain/bin/file /usr/bin/file - cp /toolchain/share/automake-1.16/config.guess config.guess - cp /toolchain/lib/libstdc++* /lib autoreconf -if export PKG_CONFIG_PATH=/usr/lib/pkgconfig diff --git a/storage/iscsi-tools/open-isns/pkg.yaml b/storage/iscsi-tools/open-isns/pkg.yaml index e0484143..1add9c1c 100644 --- a/storage/iscsi-tools/open-isns/pkg.yaml +++ b/storage/iscsi-tools/open-isns/pkg.yaml @@ -15,7 +15,6 @@ steps: tar -xzf open-isns.tar.gz --strip-components=1 - | - cp /toolchain/share/automake-1.16/config.guess aclocal/ autoreconf -fi - |