Skip to content

Commit

Permalink
Add drive repair support (#880)
Browse files Browse the repository at this point in the history
Signed-off-by: Bala.FA <[email protected]>
  • Loading branch information
balamurugana authored Jul 26, 2024
1 parent a3471d6 commit 7d99392
Show file tree
Hide file tree
Showing 25 changed files with 1,025 additions and 83 deletions.
4 changes: 2 additions & 2 deletions .golangci.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
linters-settings:
gofumpt:
lang-version: "1.22"
run:
go: "1.22"

misspell:
locale: US
Expand Down
1 change: 1 addition & 0 deletions cmd/directpv/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ func init() {
mainCmd.AddCommand(legacyControllerCmd)
mainCmd.AddCommand(legacyNodeServerCmd)
mainCmd.AddCommand(nodeControllerCmd)
mainCmd.AddCommand(repairCmd)
}

func main() {
Expand Down
80 changes: 80 additions & 0 deletions cmd/directpv/repair.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// This file is part of MinIO DirectPV
// Copyright (c) 2024 MinIO, Inc.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package main

import (
"context"
"errors"

directpvtypes "github.com/minio/directpv/pkg/apis/directpv.min.io/types"
"github.com/minio/directpv/pkg/client"
drivepkg "github.com/minio/directpv/pkg/drive"
"github.com/minio/directpv/pkg/types"
"github.com/spf13/cobra"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Options of the repair command. All of them start out false and are bound
// to the --force, --disable-prefetch and --dry-run command line flags.
var (
	forceFlag           bool
	disablePrefetchFlag bool
	dryRunFlag          bool
)

// repairCmd is the "repair" sub-command. It requires exactly one DRIVE-ID
// argument and passes it to startRepair together with the command's context.
var repairCmd = &cobra.Command{
	Use:           "repair <DRIVE-ID>",
	Short:         "Start drive repair.",
	SilenceUsage:  true,
	SilenceErrors: true,
	RunE: func(c *cobra.Command, args []string) error {
		// Reject a missing or surplus DRIVE-ID before doing any work.
		if len(args) == 0 {
			return errors.New("DRIVE-ID must be provided")
		}
		if len(args) > 1 {
			return errors.New("only one DRIVE-ID must be provided")
		}

		return startRepair(c.Context(), args[0])
	},
}

// init registers the persistent flags of the repair command. Each flag is
// bound to its package-level variable and uses that variable's current value
// as the default.
func init() {
	flags := repairCmd.PersistentFlags()

	flags.BoolVar(&forceFlag, "force", forceFlag, "Force log zeroing")
	flags.BoolVar(&disablePrefetchFlag, "disable-prefetch", disablePrefetchFlag, "Disable prefetching of inode and directory blocks")
	flags.BoolVar(&dryRunFlag, "dry-run", dryRunFlag, "No modify mode")
}

// startRepair marks the drive identified by driveID as repairing and then
// runs the repair on it.
//
// The drive is fetched through the drive client, its status is set to
// DriveStatusRepairing and the object is written back unconditionally. The
// drive returned by that update is handed to drivepkg.Repair together with
// the values of the force, disable-prefetch and dry-run flags. An error from
// the get, the update or the repair is returned to the caller unchanged.
func startRepair(ctx context.Context, driveID string) error {
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	drive, err := client.DriveClient().Get(ctx, driveID, metav1.GetOptions{})
	if err != nil {
		return err
	}

	// Setting the status outright yields the same object as setting it only
	// when it differs; the update below is issued in either case.
	drive.Status.Status = directpvtypes.DriveStatusRepairing

	repairingDrive, err := client.DriveClient().Update(
		ctx,
		drive,
		metav1.UpdateOptions{TypeMeta: types.NewDriveTypeMeta()},
	)
	if err != nil {
		return err
	}

	return drivepkg.Repair(ctx, repairingDrive, forceFlag, disablePrefetchFlag, dryRunFlag)
}
1 change: 1 addition & 0 deletions cmd/kubectl-directpv/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ Use "{{.CommandPath}} [command] --help" for more information about this command.
mainCmd.AddCommand(cleanCmd)
mainCmd.AddCommand(suspendCmd)
mainCmd.AddCommand(resumeCmd)
mainCmd.AddCommand(repairCmd)
mainCmd.AddCommand(removeCmd)
mainCmd.AddCommand(uninstallCmd)
mainCmd.SetHelpCommand(&cobra.Command{
Expand Down
92 changes: 92 additions & 0 deletions cmd/kubectl-directpv/repair.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
// This file is part of MinIO DirectPV
// Copyright (c) 2024 MinIO, Inc.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package main

import (
"context"
"errors"
"os"
"strings"

"github.com/minio/directpv/pkg/admin"
"github.com/minio/directpv/pkg/consts"
"github.com/spf13/cobra"
)

// Options of the repair command. Both start out false and are bound to the
// --force and --disable-prefetch command line flags.
var (
	forceFlag           bool
	disablePrefetchFlag bool
)

// repairCmd is the "repair" sub-command of the kubectl plugin. Its arguments
// are stored in driveIDArgs and validated with validateRepairCmd; a validation
// error is printed and the process exits with status -1. Otherwise repairMain
// is run with the command's context.
var repairCmd = &cobra.Command{
	Use:   "repair DRIVE ...",
	Short: "Repair filesystem of drives",
	Example: strings.ReplaceAll(
		`1. Repair drives
$ kubectl {PLUGIN_NAME} repair 3b562992-f752-4a41-8be4-4e688ae8cd4c`,
		`{PLUGIN_NAME}`,
		consts.AppName,
	),
	SilenceUsage:  true,
	SilenceErrors: true,
	Run: func(c *cobra.Command, args []string) {
		driveIDArgs = args

		err := validateRepairCmd()
		if err != nil {
			eprintf(true, "%v\n", err)
			os.Exit(-1)
		}

		repairMain(c.Context())
	},
}

// init applies the shared flag options to the repair command and registers
// its dry-run, force and disable-prefetch flags.
func init() {
	setFlagOpts(repairCmd)
	addDryRunFlag(repairCmd, "Repair drives with no modify mode")

	flags := repairCmd.PersistentFlags()
	flags.BoolVar(&forceFlag, "force", forceFlag, "Force log zeroing")
	flags.BoolVar(&disablePrefetchFlag, "disable-prefetch", disablePrefetchFlag, "Disable prefetching of inode and directory blocks")
}

// validateRepairCmd checks the arguments of the repair command. It returns
// the error reported by validateDriveIDArgs, if any; otherwise it returns an
// error when no drive ID argument was given, and nil when at least one was.
func validateRepairCmd() error {
	err := validateDriveIDArgs()

	switch {
	case err != nil:
		return err
	case len(driveIDArgs) == 0:
		return errors.New("no drive provided to repair")
	}

	return nil
}

func repairMain(ctx context.Context) {
_, err := adminClient.Repair(
ctx,
admin.RepairArgs{
DriveIDs: driveIDSelectors,
DryRun: dryRunFlag,
ForceFlag: forceFlag,
DisablePrefetchFlag: disablePrefetchFlag,
},
logFunc,
)
if err != nil {
eprintf(!errors.Is(err, admin.ErrNoMatchingResourcesFound), "%v\n", err)
os.Exit(1)
}
}
22 changes: 22 additions & 0 deletions docs/command-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -722,6 +722,28 @@ EXAMPLES:
$ kubectl directpv resume volumes pvc-0700b8c7-85b2-4894-b83a-274484f220d0
```

## `repair` command
```
Repair filesystem of drives
USAGE:
directpv repair DRIVE ... [flags]
FLAGS:
--dry-run Repair drives with no modify mode
--force Force log zeroing
--disable-prefetch Disable prefetching of inode and directory blocks
-h, --help help for repair
GLOBAL FLAGS:
--kubeconfig string Path to the kubeconfig file to use for CLI requests
--quiet Suppress printing error messages
EXAMPLES:
1. Repair drives
$ kubectl directpv repair 3b562992-f752-4a41-8be4-4e688ae8cd4c
```

## `remove` command
```
Remove unused drives from DirectPV
Expand Down
20 changes: 18 additions & 2 deletions docs/drive-management.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,27 @@ Refer [remove command](./command-reference.md#remove-command) for more informati
By Kubernetes design, a [StatefulSet](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/) workload is active only if all of its pods are in the running state. Any faulty drive(s) will prevent the StatefulSet from starting up. DirectPV provides a workaround to suspend failed drives, which mounts the respective volumes on an empty `/var/lib/directpv/tmp` directory with read-only access. This can be done by executing the `suspend drives` command. Below is an example:

```sh
> kubectl directpv suspend drives af3b8b4c-73b4-4a74-84b7-1ec30492a6f0
$ kubectl directpv suspend drives af3b8b4c-73b4-4a74-84b7-1ec30492a6f0
```

Suspended drives can be resumed once they are fixed. Upon resuming, the corresponding volumes will resume using the respective allocated drives. This can be done by using the `resume drives` command. Below is an example:

```sh
> kubectl directpv resume drives af3b8b4c-73b4-4a74-84b7-1ec30492a6f0
$ kubectl directpv resume drives af3b8b4c-73b4-4a74-84b7-1ec30492a6f0
```

## Repair drives

***CAUTION: THIS IS A DANGEROUS OPERATION WHICH LEADS TO DATA LOSS***

In rare situations, the filesystem on faulty drives can be repaired to make them usable again. As a first step, the faulty drives must be suspended; then the `repair` command should be run on them. The `repair` command creates a one-time Kubernetes `Job` for each drive, with the pod named `repair-<DRIVE-ID>`, and these jobs are automatically removed five minutes after their completion. Progress and status of the drive repair can be viewed using the `kubectl logs` command. Below is an example:

```sh
# Suspend faulty drives
$ kubectl directpv suspend drives af3b8b4c-73b4-4a74-84b7-1ec30492a6f0

# Restart volume consumer pods and make sure associated volumes are unbound

# Run repair command on suspended drives
$ kubectl directpv repair af3b8b4c-73b4-4a74-84b7-1ec30492a6f0
```
Loading

0 comments on commit 7d99392

Please sign in to comment.