Skip to content

Commit

Permalink
fix(vm): fix vm-router panics when we delete a virtual machine. (#201)
Browse files Browse the repository at this point in the history
fix reconsile VM. refactor DeleteRoute.
---------
Signed-off-by: yaroslavborbat <[email protected]>
  • Loading branch information
yaroslavborbat authored Jul 11, 2024
1 parent df12d38 commit 8ebce8c
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 37 deletions.
27 changes: 11 additions & 16 deletions images/vmi-router/controllers/vmirouter_reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ import (
"context"
"fmt"

virtv1alpha2 "github.com/deckhouse/virtualization/api/core/v1alpha2"
"github.com/go-logr/logr"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
Expand All @@ -34,6 +33,8 @@ import (
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"sigs.k8s.io/controller-runtime/pkg/source"

virtv2 "github.com/deckhouse/virtualization/api/core/v1alpha2"
"vmi-router/netlinkmanager"
)

Expand All @@ -47,7 +48,7 @@ type VMRouterReconciler struct {
}

func (r *VMRouterReconciler) SetupWatches(mgr manager.Manager, ctr controller.Controller) error {
if err := ctr.Watch(source.Kind(mgr.GetCache(), &virtv1alpha2.VirtualMachine{}), &handler.EnqueueRequestForObject{},
if err := ctr.Watch(source.Kind(mgr.GetCache(), &virtv2.VirtualMachine{}), &handler.EnqueueRequestForObject{},
predicate.Funcs{
CreateFunc: func(e event.CreateEvent) bool {
r.log.V(4).Info(fmt.Sprintf("Got CREATE event for VM %s/%s", e.Object.GetNamespace(), e.Object.GetName()))
Expand All @@ -73,29 +74,23 @@ func (r *VMRouterReconciler) Reconcile(ctx context.Context, req reconcile.Reques
r.log.V(4).Info(fmt.Sprintf("Got reconcile request for %s", req.String()))

// Start with retrieving affected VMI.
var vm virtv1alpha2.VirtualMachine
var isAbsent bool
err := r.client.Get(ctx, req.NamespacedName, &vm)
vm := &virtv2.VirtualMachine{}
err := r.client.Get(ctx, req.NamespacedName, vm)
if err != nil {
if k8serrors.IsNotFound(err) {
isAbsent = true
} else {
r.log.Error(err, fmt.Sprintf("fail to retrieve vm/%s", req.String()))
return reconcile.Result{}, err
r.netlinkMgr.DeleteRoute(req.NamespacedName, "")
return reconcile.Result{}, nil
}
r.log.Error(err, fmt.Sprintf("fail to retrieve vm/%s", req.String()))
return reconcile.Result{}, err
}

// Delete route on VM deletion.
if vm.GetDeletionTimestamp() != nil {
r.netlinkMgr.DeleteRoute(&vm)
return reconcile.Result{}, nil
}

if isAbsent {
r.netlinkMgr.DeleteRoute(nil)
r.netlinkMgr.DeleteRoute(req.NamespacedName, vm.Status.IPAddress)
return reconcile.Result{}, nil
}

r.netlinkMgr.UpdateRoute(ctx, &vm)
r.netlinkMgr.UpdateRoute(ctx, vm)
return reconcile.Result{}, nil
}
37 changes: 16 additions & 21 deletions images/vmi-router/netlinkmanager/manager.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
Copyright 2023,2024 Flant JSC
Copyright 2024 Flant JSC
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand All @@ -18,18 +18,21 @@ package netlinkmanager

import (
"context"
"errors"
"fmt"
"net"
"os"
"sync"

ciliumv2 "github.com/cilium/cilium/pkg/k8s/apis/cilium.io/v2"
"github.com/cilium/cilium/pkg/node/addressing"
virtv1alpha2 "github.com/deckhouse/virtualization/api/core/v1alpha2"
"github.com/go-logr/logr"
"github.com/vishvananda/netlink"
"golang.org/x/sys/unix"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"

virtv2 "github.com/deckhouse/virtualization/api/core/v1alpha2"
"vmi-router/netlinkwrap"
"vmi-router/netutil"
)
Expand All @@ -47,7 +50,7 @@ type Manager struct {
tableId int
cidrs []*net.IPNet
nodeName string
vmIPs map[string]string
vmIPs map[types.NamespacedName]string
vmIPsLock sync.RWMutex
}

Expand All @@ -62,7 +65,7 @@ func New(client client.Client, log logr.Logger, tableId int, cidrs []*net.IPNet,
tableId: tableId,
cidrs: cidrs,
nlWrapper: nlWrapper,
vmIPs: make(map[string]string),
vmIPs: make(map[types.NamespacedName]string),
}
}

Expand Down Expand Up @@ -115,7 +118,7 @@ func (m *Manager) SyncRules() error {

func (m *Manager) SyncRoutes(ctx context.Context) error {
// List all Virtual Machines to collect all IPs on this Node.
vmList := &virtv1alpha2.VirtualMachineList{}
vmList := &virtv2.VirtualMachineList{}
err := m.client.List(ctx, vmList)
if err != nil {
return fmt.Errorf("list VirtualMachines: %w", err)
Expand Down Expand Up @@ -183,7 +186,7 @@ func (m *Manager) isManagedIP(ip string) (bool, error) {
}

// UpdateRoute updates route for a single VirtualMachine.
func (m *Manager) UpdateRoute(ctx context.Context, vm *virtv1alpha2.VirtualMachine) {
func (m *Manager) UpdateRoute(ctx context.Context, vm *virtv2.VirtualMachine) {
// TODO Add cleanup if node was lost?
// TODO What about migration? Is nodeName just changed to new node or we need some workarounds when 2 Pods are running?
if vm.Status.Node == "" {
Expand Down Expand Up @@ -215,9 +218,9 @@ func (m *Manager) UpdateRoute(ctx context.Context, vm *virtv1alpha2.VirtualMachi
}

// Save IP to the in-memory cache to restore IP later.
vmiKey := fmt.Sprintf("%s/%s", vm.GetNamespace(), vm.GetName())
vmKey := types.NamespacedName{Name: vm.GetName(), Namespace: vm.GetNamespace()}
m.vmIPsLock.Lock()
m.vmIPs[vmiKey] = vmIP
m.vmIPs[vmKey] = vmIP
m.vmIPsLock.Unlock()

// Retrieve a Cilium Node by VMs node name.
Expand Down Expand Up @@ -275,31 +278,23 @@ func getCiliumInternalIPAddress(node *ciliumv2.CiliumNode) string {
return ""
}

func (m *Manager) DeleteRoute(vm *virtv1alpha2.VirtualMachine) {
// Check if IP is in cache. Do not delete routes for unknown IPs.
vmKey := fmt.Sprintf("%s/%s", vm.GetNamespace(), vm.GetName())

// Get IP either from Status, or from cache.
var vmIP string
if vm != nil {
vmIP = vm.Status.IPAddress
}
func (m *Manager) DeleteRoute(vmKey types.NamespacedName, vmIP string) {
if vmIP == "" {
// Try to recover IP from the cache.
m.vmIPsLock.RLock()
vmIP = m.vmIPs[vmKey]
m.vmIPsLock.RUnlock()
}
if vmIP == "" {
m.log.Info(fmt.Sprintf("Can't retrieve IP for VM %s/%s, it may lead to stale routes.", vm.GetNamespace(), vm.GetName()))
m.log.Info(fmt.Sprintf("Can't retrieve IP for VM %q, it may lead to stale routes.", vmKey.String()))
return
}

// Prepare ip with the mask to use as the route destination.
vmIPWithNetmask := netutil.AppendHostNetmask(vmIP)
_, vmRouteDst, err := net.ParseCIDR(netutil.AppendHostNetmask(vmIP))
if err != nil {
m.log.Error(err, fmt.Sprintf("failed to parse IP with netmask %s for VM %s/%s", vmIPWithNetmask, vm.GetNamespace(), vm.GetName()))
m.log.Error(err, fmt.Sprintf("failed to parse IP with netmask %s for VM %q", vmIPWithNetmask, vmKey.String()))
return
}

Expand All @@ -308,10 +303,10 @@ func (m *Manager) DeleteRoute(vm *virtv1alpha2.VirtualMachine) {
Table: m.tableId,
}

if err := m.nlWrapper.RouteDel(&route); err != nil && !os.IsNotExist(err) {
if err := m.nlWrapper.RouteDel(&route); err != nil && !os.IsNotExist(err) && !errors.Is(err, unix.ESRCH) {
m.log.Error(err, "failed to delete route")
}
m.log.Info(fmt.Sprintf("route %s deleted for VM %s/%s", fmtRoute(route), vmKey))
m.log.Info(fmt.Sprintf("route %s deleted for VM %q", fmtRoute(route), vmKey))

// Delete IP from the cache.
m.vmIPsLock.Lock()
Expand Down

0 comments on commit 8ebce8c

Please sign in to comment.