sandbox/cgroup: improve cgroup-based process termination algorithm

unify termination algorithm for v1/v2 - for each snap cgroup: - while cgroup.procs is not empty: - SIGKILL each pid in cgroup.procs - for v1 only, also kill pids found in freezer cgroup created by snap-confine - this is relevant for systemd v237 (used in ubuntu 18.04) for non-root users where the transient scope cgroups are not created This logic drops the freeze/kill/thaw approach with all the weird v1/v2/kernel backward compatibility. Signed-off-by: Zeyad Gouda <[email protected]>
canonical · Sep 17, 2024 · 371e0eb · 371e0eb
1 parent f3d0682
commit 371e0eb
Show file tree

Hide file tree

Showing 4 changed files with 292 additions and 184 deletions.
diff --git a/sandbox/cgroup/export_test.go b/sandbox/cgroup/export_test.go
@@ -42,6 +42,8 @@ var (
 	SecurityTagFromCgroupPath = securityTagFromCgroupPath
 
 	ApplyToSnap = applyToSnap
+
+	KillProcessesInCgroup = killProcessesInCgroup
 )
 
 func MockFsTypeForPath(mock func(string) (int64, error)) (restore func()) {
@@ -137,30 +139,14 @@ func (iw *inotifyWatcher) MonitorDelete(folders []string, name string, channel c
 
 var NewInotifyWatcher = newInotifyWatcher
 
-func MockFreezeSnapProcessesImplV1(fn func(ctx context.Context, snapName string) error) (restore func()) {
-	return testutil.Mock(&freezeSnapProcessesImplV1, fn)
-}
-
-func MockThawSnapProcessesImplV1(fn func(snapName string) error) (restore func()) {
-	return testutil.Mock(&thawSnapProcessesImplV1, fn)
+func MockKillProcessesInCgroup(fn func(dir string) error) (restore func()) {
+	return testutil.Mock(&killProcessesInCgroup, fn)
 }
 
 func MockSyscallKill(fn func(pid int, sig syscall.Signal) error) (restore func()) {
 	return testutil.Mock(&syscallKill, fn)
 }
 
-func MockFreezeOneV2(fn func(ctx context.Context, dir string) error) (restore func()) {
-	return testutil.Mock(&freezeOneV2, fn)
-}
-
-func MockThawOneV2(fn func(dir string) error) (restore func()) {
-	return testutil.Mock(&thawOneV2, fn)
-}
-
-func MockFreezePulseDelay(t time.Duration) (restore func()) {
-	return testutil.Mock(&freezePulseDelay, t)
-}
-
 func MockOsReadFile(fn func(name string) ([]byte, error)) (restore func()) {
 	return testutil.Mock(&osReadFile, fn)
 }
diff --git a/sandbox/cgroup/freezer.go b/sandbox/cgroup/freezer.go
@@ -37,9 +37,9 @@ import (
 
 const defaultFreezerCgroupV1Dir = "/sys/fs/cgroup/freezer"
 const maxFreezeTimeout = 3000 * time.Millisecond
+const freezePulseDelay = 100 * time.Millisecond
 
 var freezerCgroupV1Dir = defaultFreezerCgroupV1Dir
-var freezePulseDelay = 100 * time.Millisecond
 
 var osReadFile = os.ReadFile
 

diff --git a/sandbox/cgroup/kill.go b/sandbox/cgroup/kill.go
@@ -32,15 +32,12 @@ import (
 // a given snap.
 //
 // A correct implementation is picked depending on cgroup v1 or v2 use in the
-// system. When cgroup v1 is detected, the call will directly act on the freezer
-// group created when a snap process was started, while with v2 the call will
-// act on all tracking groups of a snap.
+// system. For both cgroup v1 and v2, the call will act on all tracking groups
+// of a snap.
 //
-// XXX: It is important to note that the implementation for v2 is racy because
-// processes can be killed externally even when the cgroup is frozen and have
-// their pid reused. Also, v2 freezing support is a no-op on kernels before
-// 5.2 which means new processes can keep getting spawned while killing
-// pids read earlier.
+// Note: When cgroup v1 is detected, the call will also act on the freezer
+// group created when a snap process was started to address a known bug on
+// systemd v327 for non-root users.
 var KillSnapProcesses = func(ctx context.Context, snapName string) error {
 	return errors.New("KillSnapProcesses not implemented")
 }
@@ -52,63 +49,86 @@ var syscallKill = syscall.Kill
 //
 // The caller is responsible for making sure that pids are not reused
 // after reading `cgroup.procs` to avoid TOCTOU.
-func killProcessesInCgroup(dir string, signal syscall.Signal) error {
-	pids, err := pidsInFile(filepath.Join(dir, "cgroup.procs"))
-	if err != nil {
-		return err
+var killProcessesInCgroup = func(dir string) error {
+	// Keep sending SIGKILL signals until no more pids are left in cgroup
+	// to cover the case where a process forks before we kill it.
+	for {
+		// XXX: Should this have maximum retries?
+		pids, err := pidsInFile(filepath.Join(dir, "cgroup.procs"))
+		if err != nil {
+			return err
+		}
+		if len(pids) == 0 {
+			// no more pids
+			return nil
+		}
+
+		var firstErr error
+		for _, pid := range pids {
+			pidNotFoundErr := syscall.ESRCH
+			// TODO: Use pidfs when possible to avoid killing reused pids.
+			if err := syscallKill(pid, syscall.SIGKILL); err != nil && !errors.Is(err, pidNotFoundErr) && firstErr == nil {
+				firstErr = err
+			}
+		}
+		if firstErr != nil {
+			return firstErr
+		}
 	}
+}
 
+func killSnapProcessesImplV1(ctx context.Context, snapName string) error {
 	var firstErr error
-	for _, pid := range pids {
-		pidNotFoundErr := syscall.ESRCH
-		if err := syscallKill(pid, signal); err != nil && !errors.As(err, &pidNotFoundErr) && firstErr == nil {
+	skipError := func(err error) bool {
+		// fs.ErrNotExist and ENODEV are ignored in case the cgroup went away while we were
+		// processing the cgorup. ENODEV is returned by the kernel if the cgroup went
+		// away while a kernfs operation is ongoing.
+		if !errors.Is(err, fs.ErrNotExist) && !errors.Is(err, syscall.ENODEV) && firstErr == nil {
 			firstErr = err
 		}
+		return true
 	}
 
-	return firstErr
-}
-
-func killSnapProcessesImplV1(ctx context.Context, snapName string) error {
-	if err := freezeSnapProcessesImplV1(ctx, snapName); err != nil {
+	if err := applyToSnap(snapName, killProcessesInCgroup, skipError); err != nil {
 		return err
 	}
-	// For V1, SIGKILL on a frozen cgroup will not take effect
-	// until the cgroup is thawed
-	defer thawSnapProcessesImplV1(snapName)
 
-	return killProcessesInCgroup(filepath.Join(freezerCgroupV1Dir, fmt.Sprintf("snap.%s", snapName)), syscall.SIGKILL)
+	// This is a workaround for systemd v237 (used by Ubuntu 18.04) for non-root users
+	// where a transient scope cgroup is not created for a snap hence it cannot be tracked
+	// by the usual snap.<security-tag>-<uuid>.scope pattern.
+	// Here, We rely on the fact that snap-confine moves the snap pids into the freezer cgroup
+	// created for the snap.
+	// There is still a tiny race window between "snap run" unlocking the run inhibition lock
+	// and snap-confine moving pids to the freezer cgroup where we would miss those pids.
+	err := killProcessesInCgroup(filepath.Join(freezerCgroupV1Dir, fmt.Sprintf("snap.%s", snapName)))
+	if err != nil && !errors.Is(err, fs.ErrNotExist) && !errors.Is(err, syscall.ENODEV) && firstErr == nil {
+		firstErr = err
+	}
+
+	return firstErr
 }
 
-// XXX: killSnapProcessesImplV2 is racy to varying degrees depending on the kernel
-// version.
-//
-//  1. Cgroup v2 freezer was only available since Linux 5.2 so freezing is a no-op before 5.2 which allows processes to keep forking.
-//  2. Freezing does not put processes in an uninterruptable sleep unlike v1, so they can be killed externally and have their pid reused.
-//  3. `cgroup.kill` was introduced in Linux 5.14 and solves the above issues as it kills the cgroup processes atomically.
 func killSnapProcessesImplV2(ctx context.Context, snapName string) error {
 	killCgroupProcs := func(dir string) error {
 		// Use cgroup.kill if it exists (requires linux 5.14+)
 		err := writeExistingFile(filepath.Join(dir, "cgroup.kill"), []byte("1"))
 		if err == nil || !errors.Is(err, fs.ErrNotExist) {
 			return err
 		}
-		// Fallback to classic freeze/kill/thaw if cgroup.kill doesn't exist
 
-		if err := freezeOneV2(ctx, dir); err != nil {
-			return err
-		}
-		if err := killProcessesInCgroup(dir, syscall.SIGKILL); err != nil {
-			// Thaw on error to avoid keeping cgroup stuck
-			thawOneV2(dir) // ignore the error, this is best-effort
+		// Fallback to killing each pid if cgroup.kill doesn't exist
+		if err := killProcessesInCgroup(dir); err != nil {
 			return err
 		}
 		return nil
 	}
 
 	var firstErr error
 	skipError := func(err error) bool {
-		if !errors.Is(err, fs.ErrNotExist) && firstErr == nil {
+		// fs.ErrNotExist and ENODEV are ignored in case the cgroup went away while we were
+		// processing the cgorup. ENODEV is returned by the kernel if the cgroup went
+		// away while a kernfs operation is ongoing.
+		if !errors.Is(err, fs.ErrNotExist) && !errors.Is(err, syscall.ENODEV) && firstErr == nil {
 			firstErr = err
 		}
 		return true