Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: ensure containerd-related directories removed on failed bootstrap/join-cluster. #863

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/k8s/pkg/k8sd/app/hooks_remove.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,5 +145,9 @@ func (a *App) onPreRemove(ctx context.Context, s state.State, force bool) (rerr
}
}

if err := setup.TryCleanupContainerdPaths(snap); err != nil {
log.Error(err, "Failed to perform containerd cleanup")
}

return nil
}
67 changes: 62 additions & 5 deletions src/k8s/pkg/k8sd/setup/containerd.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (

"dario.cat/mergo"
"github.com/canonical/k8s/pkg/k8sd/images"
"github.com/canonical/k8s/pkg/log"
"github.com/canonical/k8s/pkg/snap"
snaputil "github.com/canonical/k8s/pkg/snap/util"
"github.com/canonical/k8s/pkg/utils"
Expand Down Expand Up @@ -166,23 +167,79 @@ func Containerd(snap snap.Snap, extraContainerdConfig map[string]any, extraArgs
return nil
}

func saveSnapContainerdPaths(s snap.Snap) error {
// Write the containerd-related paths to files to properly clean-up on removal.
// Returns a mapping between the absolute paths of the lockfiles within the
// k8s snap and the absolute paths of the containerd directory they lock.
func containerdLockPathsForSnap(s snap.Snap) map[string]string {
m := map[string]string{
"containerd-socket-path": s.ContainerdSocketDir(),
"containerd-config-dir": s.ContainerdConfigDir(),
"containerd-root-dir": s.ContainerdRootDir(),
"containerd-cni-bin-dir": s.CNIBinDir(),
}

for filename, content := range m {
if err := utils.WriteFile(filepath.Join(s.LockFilesDir(), filename), []byte(content), 0o600); err != nil {
return fmt.Errorf("failed to write %s: %w", filename, err)
prefixed := map[string]string{}
for k, v := range m {
prefixed[filepath.Join(s.LockFilesDir(), k)] = v
}

return prefixed
}

// Creates the lock files for the containerd directory paths to be used for later cleanup.
func saveSnapContainerdPaths(s snap.Snap) error {
	for lockfile, dir := range containerdLockPathsForSnap(s) {
		err := utils.WriteFile(lockfile, []byte(dir), 0o600)
		if err != nil {
			return fmt.Errorf("failed to write %s: %w", lockfile, err)
		}
	}

	return nil
}

// Attempts to clean up all containerd directories which were created
// by the k8s-snap based on the existence of their respective lockfiles
// located in the directory returned by `s.LockFilesDir()`.
func TryCleanupContainerdPaths(s snap.Snap) error {
log := log.L()

for lockpath, dirpath := range containerdLockPathsForSnap(s) {
// Ensure lockfile exists:
if _, err := os.Stat(lockpath); os.IsNotExist(err) {
log.Info("WARN: failed to find containerd lockfile, no cleanup will be perfomed", "lockfile", lockpath, "directory", dirpath)
continue
}

// Ensure lockfile's contents is the one we expect:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does this check ensure for us? Is it to check someone else did not create the locks?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I admit I have no concrete usecase where this check would ever be relevant, it just "felt right" to double-check in case someone changed their contents from what the k8s-snap expects.

Note that, as far as I can tell, the k8s-snap does NOT use the paths written within the lockfiles (considering they're hardcoded/derived within the k8s-snap itself, it has no need for them).

The only place I've seen the actual file contents read is in the remove hook of the snap.

I could gladly remove it if it's "too redundant" or switch the k8s-snap to always clean up the paths which are written within the file, instead of what the k8s-snap expects that file lock to be for...

lockfile_contents := ""
if contents, err := os.ReadFile(lockpath); err != nil {
log.Info("WARN: failed to read contents of lockfile", "lockfile", lockpath, "error", err)
continue
} else {
lockfile_contents = string(contents)
}

if lockfile_contents != dirpath {
log.Info("WARN: lockfile points to different path than expected", "lockfile", lockpath, "expected", dirpath, "actual", lockfile_contents)
continue
}

// Check directory exists before attempting to remove:
if _, err := os.Stat(dirpath); os.IsNotExist(err) {
log.Info("Containerd directory doesn't exist; skipping cleanup", "directory", dirpath)
} else {
if err := os.RemoveAll(dirpath); err != nil {
log.Info("WARN: failed to remove containerd data directory", "directory", dirpath, "error", err)
continue // Avoid removing the lockfile path.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't cleanup done on a best effort basis? Do we check the lockfile later on at bootstrap time?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is, but IMO it would have been "incorrect" for this code to remove the lockfile path if it had failed to remove the actual path it was pointing too. (IMO either both should be deleted; or none)

Do we check the lockfile later on at bootstrap time?

AFAIK the LockFilesDir() is used only by k8sd in this file when setting up the containerd dirs, and to mark a a node as worker (contents doesn't matter for marking workers, only the file existing does).

These contents of the lockfiles are only read by the remove hook of the k8s-snap (it also neglects to delete the lockfiles after it's done though, unsure whether that's intentional or not...).

}
}

if err := os.Remove(lockpath); err != nil {
log.Info("WARN: Failed to remove containerd lockfile", "lockfile", lockpath)
}
}

return nil
}

func init() {
	// Register the default pause image with the package-level image registry
	// so it is accounted for alongside the other k8s-snap images.
	images.Register(defaultPauseImage)
}
12 changes: 10 additions & 2 deletions src/k8s/pkg/k8sd/setup/containerd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ func TestContainerd(t *testing.T) {
ContainerdRegistryConfigDir: filepath.Join(dir, "containerd-hosts"),
ContainerdStateDir: filepath.Join(dir, "containerd-state"),
ContainerdExtraConfigDir: filepath.Join(dir, "containerd-confd"),
LockFilesDir: filepath.Join(dir, "lockfiles"),
ServiceArgumentsDir: filepath.Join(dir, "args"),
CNIBinDir: filepath.Join(dir, "opt-cni-bin"),
CNIConfDir: filepath.Join(dir, "cni-netd"),
Expand Down Expand Up @@ -129,9 +130,16 @@ func TestContainerd(t *testing.T) {
"containerd-root-dir": s.ContainerdRootDir(),
"containerd-cni-bin-dir": s.CNIBinDir(),
}
for filename, content := range m {

b, err := os.ReadFile(filepath.Join(s.LockFilesDir(), filename))
// Ensure locks directory exists:
files, err := os.ReadDir(s.LockFilesDir())
g.Expect(err).To(Not(HaveOccurred()))
t.Logf("Lockfiles currently in %q: %v", s.LockFilesDir(), files)

for filename, content := range m {
path := filepath.Join(s.LockFilesDir(), filename)
t.Logf("Checking path: %s", path)
b, err := os.ReadFile(path)
g.Expect(err).To(Not(HaveOccurred()))
g.Expect(string(b)).To(Equal(content))
}
Expand Down
70 changes: 64 additions & 6 deletions tests/integration/tests/test_cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@
from typing import List

import pytest
from test_util import harness, tags, util
from test_util import config, harness, tags, util

LOG = logging.getLogger(__name__)

KUBE_CONTROLLER_MANAGER_SNAP_PORT = 10257

CONTAINERD_PATHS = [
"/etc/containerd",
"/opt/cni/bin",
Expand All @@ -17,21 +19,77 @@
]


def _assert_paths_not_exist(instance: harness.Instance, paths: List[str]):
    """Raises an AssertionError if any of the given paths exist on the instance."""
    existence = util.check_file_paths_exist(instance, paths)
    paths_which_exist = [path for path, exists in existence.items() if exists]
    if paths_which_exist:
        raise AssertionError(
            f"Expected the following path(s) to not exist: {paths_which_exist}"
        )


@pytest.mark.node_count(1)
@pytest.mark.tags(tags.NIGHTLY)
def test_node_cleanup(instances: List[harness.Instance], tmp_path):
    """Verifies that a `snap remove k8s` will perform proper cleanup."""
    node = instances[0]
    util.wait_for_dns(node)
    util.wait_for_network(node)

    util.remove_k8s_snap(node)

    # Check that the containerd-related folders are removed on snap removal.
    _assert_paths_not_exist(node, CONTAINERD_PATHS)

    # Re-installing and bootstrapping should succeed on the cleaned-up node.
    util.setup_k8s_snap(node, tmp_path)
    node.exec(["k8s", "bootstrap"])


@pytest.mark.node_count(1)
@pytest.mark.no_setup()
@pytest.mark.tags(tags.NIGHTLY)
def test_containerd_path_cleanup_on_failed_init(
    instances: List[harness.Instance], tmp_path
):
    """Tests that a failed `bootstrap` properly cleans up any
    containerd-related paths it may have created as part of the
    failed `bootstrap`.

    It induces a bootstrap failure by pre-binding a required k8s service
    port (10257 for the kube-controller-manager) before running `k8s bootstrap`.

    NOTE: a failed `join-cluster` will trigger the exact same cleanup
    hook, so the test implicitly applies to it as well.
    """
    instance = instances[0]
    expected_code = 1
    # Derive the expected message from the constant so the test stays in
    # sync with the port actually being blocked below.
    expected_message = (
        "Encountered error(s) while verifying port availability for Kubernetes "
        f"services: Port {KUBE_CONTROLLER_MANAGER_SNAP_PORT} "
        "(needed by: kube-controller-manager) is already in use."
    )

    # Keep the port bound for the whole bootstrap attempt so the k8s-snap's
    # port-availability check is guaranteed to fail.
    with util.open_port(KUBE_CONTROLLER_MANAGER_SNAP_PORT):
        util.setup_k8s_snap(instance, tmp_path, config.SNAP, connect_interfaces=False)

        proc = instance.exec(
            ["k8s", "bootstrap"], capture_output=True, text=True, check=False
        )

        if proc.returncode != expected_code:
            raise AssertionError(
                f"Expected `k8s bootstrap` to exit with code {expected_code}, "
                f"but it exited with {proc.returncode}.\n"
                f"Stdout was: \n{proc.stdout}.\nStderr was: \n{proc.stderr}"
            )

        if expected_message not in proc.stderr:
            raise AssertionError(
                f"Expected to find port-related warning '{expected_message}' in "
                "stderr of the `k8s bootstrap` command.\n"
                f"Stdout was: \n{proc.stdout}.\nStderr was: \n{proc.stderr}"
            )

        _assert_paths_not_exist(instance, CONTAINERD_PATHS)
50 changes: 50 additions & 0 deletions tests/integration/tests/test_util/util.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
#
# Copyright 2024 Canonical, Ltd.
#
import contextlib
import ipaddress
import json
import logging
import re
import shlex
import socket
import subprocess
import urllib.request
from datetime import datetime
Expand Down Expand Up @@ -514,3 +516,51 @@ def find_suitable_cidr(parent_cidr: str, excluded_ips: List[str]):

return str(lb_net)
raise RuntimeError("Could not find a suitable CIDR for LoadBalancer services")


@contextlib.contextmanager
def open_port(
    port: int,
    host: str = "",
    address_family: socket.AddressFamily = socket.AF_INET,
    socket_kind: socket.SocketKind = socket.SOCK_STREAM,
    max_backlogged_connections: int = 0,
):
    """Context manager which opens a socket with the given properties
    and binds it to the given port.

    Yields the already setup and listening socket object for use.

    By default, it will only allow one single active connection
    and instantly refuse any new ones. Use the `max_backlogged_connections`
    argument if you'd like it to accept more connections as `pending`.

    :param port: port to bind; 0 lets the OS pick a free ephemeral port.
    :param host: host/interface to bind on; defaults to the machine's
        hostname as resolved by `socket.gethostname()`.
    """
    sock = socket.socket(family=address_family, type=socket_kind)
    if not host:
        host = socket.gethostname()

    # bind/listen run inside the try so the socket fd is released even when
    # either call raises (previously a failed bind() leaked the socket).
    try:
        sock.bind((host, port))
        LOG.info(f"Successfully bound new socket on '{host}:{port}'")

        sock.listen(max_backlogged_connections)
        LOG.info(f"Successfully started listening on '{host}:{port}'")
        yield sock
    finally:
        sock.close()
        LOG.info(f"Closed socket on '{host}:{port}'")


def check_file_paths_exist(
    instance: harness.Instance, paths: List[str]
) -> Mapping[str, bool]:
    """Returns whether the given path(s) exist within the given harness instance
    by checking the output of a single `ls` command containing all of them.

    It is recommended to always use absolute paths, as the cwd relative to
    which the `ls` will get executed depends on the harness instance.
    """
    result = instance.exec(["ls", *paths], capture_output=True, text=True, check=False)

    # `ls` reports each missing path on stderr; a path exists iff no such
    # report is present for it.
    existence = {}
    for path in paths:
        missing = f"cannot access '{path}': No such file or directory"
        existence[path] = missing not in result.stderr
    return existence
Loading