Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(vm): fix vm-router panics when we delete a virtual machine. #201

Merged
merged 5 commits into from
Jul 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 11 additions & 16 deletions images/vmi-router/controllers/vmirouter_reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ import (
"context"
"fmt"

virtv1alpha2 "github.com/deckhouse/virtualization/api/core/v1alpha2"
"github.com/go-logr/logr"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
Expand All @@ -34,6 +33,8 @@ import (
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"sigs.k8s.io/controller-runtime/pkg/source"

virtv2 "github.com/deckhouse/virtualization/api/core/v1alpha2"
"vmi-router/netlinkmanager"
)

Expand All @@ -47,7 +48,7 @@ type VMRouterReconciler struct {
}

func (r *VMRouterReconciler) SetupWatches(mgr manager.Manager, ctr controller.Controller) error {
if err := ctr.Watch(source.Kind(mgr.GetCache(), &virtv1alpha2.VirtualMachine{}), &handler.EnqueueRequestForObject{},
if err := ctr.Watch(source.Kind(mgr.GetCache(), &virtv2.VirtualMachine{}), &handler.EnqueueRequestForObject{},
predicate.Funcs{
CreateFunc: func(e event.CreateEvent) bool {
r.log.V(4).Info(fmt.Sprintf("Got CREATE event for VM %s/%s", e.Object.GetNamespace(), e.Object.GetName()))
Expand All @@ -73,29 +74,23 @@ func (r *VMRouterReconciler) Reconcile(ctx context.Context, req reconcile.Reques
r.log.V(4).Info(fmt.Sprintf("Got reconcile request for %s", req.String()))

// Start with retrieving affected VMI.
var vm virtv1alpha2.VirtualMachine
var isAbsent bool
err := r.client.Get(ctx, req.NamespacedName, &vm)
vm := &virtv2.VirtualMachine{}
err := r.client.Get(ctx, req.NamespacedName, vm)
if err != nil {
if k8serrors.IsNotFound(err) {
isAbsent = true
} else {
r.log.Error(err, fmt.Sprintf("fail to retrieve vm/%s", req.String()))
return reconcile.Result{}, err
r.netlinkMgr.DeleteRoute(req.NamespacedName, "")
return reconcile.Result{}, nil
}
r.log.Error(err, fmt.Sprintf("fail to retrieve vm/%s", req.String()))
return reconcile.Result{}, err
}

// Delete route on VM deletion.
if vm.GetDeletionTimestamp() != nil {
r.netlinkMgr.DeleteRoute(&vm)
return reconcile.Result{}, nil
}

if isAbsent {
r.netlinkMgr.DeleteRoute(nil)
r.netlinkMgr.DeleteRoute(req.NamespacedName, vm.Status.IPAddress)
return reconcile.Result{}, nil
}

r.netlinkMgr.UpdateRoute(ctx, &vm)
r.netlinkMgr.UpdateRoute(ctx, vm)
return reconcile.Result{}, nil
}
37 changes: 16 additions & 21 deletions images/vmi-router/netlinkmanager/manager.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
Copyright 2023,2024 Flant JSC
Copyright 2024 Flant JSC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand All @@ -18,18 +18,21 @@ package netlinkmanager

import (
"context"
"errors"
"fmt"
"net"
"os"
"sync"

ciliumv2 "github.com/cilium/cilium/pkg/k8s/apis/cilium.io/v2"
"github.com/cilium/cilium/pkg/node/addressing"
virtv1alpha2 "github.com/deckhouse/virtualization/api/core/v1alpha2"
"github.com/go-logr/logr"
"github.com/vishvananda/netlink"
"golang.org/x/sys/unix"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"

virtv2 "github.com/deckhouse/virtualization/api/core/v1alpha2"
"vmi-router/netlinkwrap"
"vmi-router/netutil"
)
Expand All @@ -47,7 +50,7 @@ type Manager struct {
tableId int
cidrs []*net.IPNet
nodeName string
vmIPs map[string]string
vmIPs map[types.NamespacedName]string
vmIPsLock sync.RWMutex
}

Expand All @@ -62,7 +65,7 @@ func New(client client.Client, log logr.Logger, tableId int, cidrs []*net.IPNet,
tableId: tableId,
cidrs: cidrs,
nlWrapper: nlWrapper,
vmIPs: make(map[string]string),
vmIPs: make(map[types.NamespacedName]string),
}
}

Expand Down Expand Up @@ -115,7 +118,7 @@ func (m *Manager) SyncRules() error {

func (m *Manager) SyncRoutes(ctx context.Context) error {
// List all Virtual Machines to collect all IPs on this Node.
vmList := &virtv1alpha2.VirtualMachineList{}
vmList := &virtv2.VirtualMachineList{}
err := m.client.List(ctx, vmList)
if err != nil {
return fmt.Errorf("list VirtualMachines: %w", err)
Expand Down Expand Up @@ -183,7 +186,7 @@ func (m *Manager) isManagedIP(ip string) (bool, error) {
}

// UpdateRoute updates route for a single VirtualMachine.
func (m *Manager) UpdateRoute(ctx context.Context, vm *virtv1alpha2.VirtualMachine) {
func (m *Manager) UpdateRoute(ctx context.Context, vm *virtv2.VirtualMachine) {
// TODO Add cleanup if node was lost?
// TODO What about migration? Is nodeName just changed to new node or we need some workarounds when 2 Pods are running?
if vm.Status.Node == "" {
Expand Down Expand Up @@ -215,9 +218,9 @@ func (m *Manager) UpdateRoute(ctx context.Context, vm *virtv1alpha2.VirtualMachi
}

// Save IP to the in-memory cache to restore IP later.
vmiKey := fmt.Sprintf("%s/%s", vm.GetNamespace(), vm.GetName())
vmKey := types.NamespacedName{Name: vm.GetName(), Namespace: vm.GetNamespace()}
m.vmIPsLock.Lock()
m.vmIPs[vmiKey] = vmIP
m.vmIPs[vmKey] = vmIP
m.vmIPsLock.Unlock()

// Retrieve a Cilium Node by VMs node name.
Expand Down Expand Up @@ -275,31 +278,23 @@ func getCiliumInternalIPAddress(node *ciliumv2.CiliumNode) string {
return ""
}

func (m *Manager) DeleteRoute(vm *virtv1alpha2.VirtualMachine) {
// Check if IP is in cache. Do not delete routes for unknown IPs.
vmKey := fmt.Sprintf("%s/%s", vm.GetNamespace(), vm.GetName())

// Get IP either from Status, or from cache.
var vmIP string
if vm != nil {
vmIP = vm.Status.IPAddress
}
func (m *Manager) DeleteRoute(vmKey types.NamespacedName, vmIP string) {
if vmIP == "" {
// Try to recover IP from the cache.
m.vmIPsLock.RLock()
vmIP = m.vmIPs[vmKey]
m.vmIPsLock.RUnlock()
}
if vmIP == "" {
m.log.Info(fmt.Sprintf("Can't retrieve IP for VM %s/%s, it may lead to stale routes.", vm.GetNamespace(), vm.GetName()))
m.log.Info(fmt.Sprintf("Can't retrieve IP for VM %q, it may lead to stale routes.", vmKey.String()))
return
}

// Prepare ip with the mask to use as the route destination.
vmIPWithNetmask := netutil.AppendHostNetmask(vmIP)
_, vmRouteDst, err := net.ParseCIDR(netutil.AppendHostNetmask(vmIP))
if err != nil {
m.log.Error(err, fmt.Sprintf("failed to parse IP with netmask %s for VM %s/%s", vmIPWithNetmask, vm.GetNamespace(), vm.GetName()))
m.log.Error(err, fmt.Sprintf("failed to parse IP with netmask %s for VM %q", vmIPWithNetmask, vmKey.String()))
return
}

Expand All @@ -308,10 +303,10 @@ func (m *Manager) DeleteRoute(vm *virtv1alpha2.VirtualMachine) {
Table: m.tableId,
}

if err := m.nlWrapper.RouteDel(&route); err != nil && !os.IsNotExist(err) {
if err := m.nlWrapper.RouteDel(&route); err != nil && !os.IsNotExist(err) && !errors.Is(err, unix.ESRCH) {
m.log.Error(err, "failed to delete route")
}
m.log.Info(fmt.Sprintf("route %s deleted for VM %s/%s", fmtRoute(route), vmKey))
m.log.Info(fmt.Sprintf("route %s deleted for VM %q", fmtRoute(route), vmKey))

// Delete IP from the cache.
m.vmIPsLock.Lock()
Expand Down