osd: reweight osd while resizing
The OSD gets resized by the cryptsetup/bluestore command,
but it should also be reweighted to balance the PGs properly.

closes: rook#14430

Signed-off-by: parth-gr <[email protected]>
(cherry picked from commit 17cfda5)
Signed-off-by: parth-gr <[email protected]>
parth-gr committed Aug 22, 2024
1 parent fc77d31 commit f8c3a69
Showing 12 changed files with 141 additions and 14 deletions.
3 changes: 3 additions & 0 deletions Documentation/CRDs/Cluster/ceph-cluster-crd.md
@@ -82,6 +82,9 @@ For more details on the mons and when to choose a number other than `3`, see the
* `config`: Config settings applied to all OSDs on the node unless overridden by `devices`. See the [config settings](#osd-configuration-settings) below.
* `allowDeviceClassUpdate`: Whether to allow changing the device class of an OSD after it is created. The default is false
to prevent unintentional data movement or CRUSH changes if the device class is changed accidentally.
* `allowOsdCrushWeightUpdate`: Whether Rook will resize the OSD CRUSH weight when the OSD PVC size is increased.
This allows cluster data to be rebalanced to make most effective use of new OSD space.
The default is false since data rebalancing can cause temporary cluster slowdown.
* [storage selection settings](#storage-selection-settings)
* [Storage Class Device Sets](#storage-class-device-sets)
* `onlyApplyOSDPlacement`: Whether the placement specific to OSDs is merged with the `all` placement. If `false`, the OSD placement will be merged with the `all` placement. If `true`, only the OSD placement will be applied and the `all` placement will be ignored. The placement for OSDs is computed from several different places depending on the type of OSD:
14 changes: 14 additions & 0 deletions Documentation/CRDs/specification.md
@@ -12158,6 +12158,20 @@ bool
<p>Whether to allow updating the device class after the OSD is initially provisioned</p>
</td>
</tr>
<tr>
<td>
<code>allowOsdCrushWeightUpdate</code><br/>
<em>
bool
</em>
</td>
<td>
<em>(Optional)</em>
<p>Whether Rook will resize the OSD CRUSH weight when the OSD PVC size is increased.
This allows cluster data to be rebalanced to make most effective use of new OSD space.
The default is false since data rebalancing can cause temporary cluster slowdown.</p>
</td>
</tr>
</tbody>
</table>
<h3 id="ceph.rook.io/v1.StoreType">StoreType
6 changes: 6 additions & 0 deletions deploy/charts/rook-ceph/templates/resources.yaml
@@ -3132,6 +3132,12 @@ spec:
allowDeviceClassUpdate:
description: Whether to allow updating the device class after the OSD is initially provisioned
type: boolean
allowOsdCrushWeightUpdate:
description: |-
Whether Rook will resize the OSD CRUSH weight when the OSD PVC size is increased.
This allows cluster data to be rebalanced to make most effective use of new OSD space.
The default is false since data rebalancing can cause temporary cluster slowdown.
type: boolean
config:
additionalProperties:
type: string
1 change: 1 addition & 0 deletions deploy/examples/cluster-on-pvc.yaml
@@ -54,6 +54,7 @@ spec:
maxLogSize: 500M # SUFFIX may be 'M' or 'G'. Must be at least 1M.
storage:
allowDeviceClassUpdate: false # whether to allow changing the device class of an OSD after it is created
allowOsdCrushWeightUpdate: true # whether to allow resizing the OSD CRUSH weight after the OSD PVC size is increased
storageClassDeviceSets:
- name: set1
# The number of OSDs to create from this device set
1 change: 1 addition & 0 deletions deploy/examples/cluster-test.yaml
@@ -34,6 +34,7 @@ spec:
useAllNodes: true
useAllDevices: true
allowDeviceClassUpdate: true
allowOsdCrushWeightUpdate: false
#deviceFilter:
#config:
# deviceClass: testclass
1 change: 1 addition & 0 deletions deploy/examples/cluster.yaml
@@ -257,6 +257,7 @@ spec:
# encryptedDevice: "true" # the default value for this option is "false"
# deviceClass: "myclass" # specify a device class for OSDs in the cluster
allowDeviceClassUpdate: false # whether to allow changing the device class of an OSD after it is created
allowOsdCrushWeightUpdate: false # whether to allow resizing the OSD CRUSH weight after the OSD PVC size is increased
# Individual nodes and their config can be specified as well, but 'useAllNodes' above must be set to false. Then, only the named
# nodes below will be used as storage resources. Each node's 'name' field should match their 'kubernetes.io/hostname' label.
# nodes:
6 changes: 6 additions & 0 deletions deploy/examples/crds.yaml
@@ -3130,6 +3130,12 @@ spec:
allowDeviceClassUpdate:
description: Whether to allow updating the device class after the OSD is initially provisioned
type: boolean
allowOsdCrushWeightUpdate:
description: |-
Whether Rook will resize the OSD CRUSH weight when the OSD PVC size is increased.
This allows cluster data to be rebalanced to make most effective use of new OSD space.
The default is false since data rebalancing can cause temporary cluster slowdown.
type: boolean
config:
additionalProperties:
type: string
5 changes: 5 additions & 0 deletions pkg/apis/ceph.rook.io/v1/types.go
@@ -2834,6 +2834,11 @@ type StorageScopeSpec struct {
// Whether to allow updating the device class after the OSD is initially provisioned
// +optional
AllowDeviceClassUpdate bool `json:"allowDeviceClassUpdate,omitempty"`
// Whether Rook will resize the OSD CRUSH weight when the OSD PVC size is increased.
// This allows cluster data to be rebalanced to make most effective use of new OSD space.
// The default is false since data rebalancing can cause temporary cluster slowdown.
// +optional
AllowOsdCrushWeightUpdate bool `json:"allowOsdCrushWeightUpdate,omitempty"`
}

// OSDStore is the backend storage type used for creating the OSDs
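For reference, a minimal Go sketch of opting into the new behavior through the API field added above. It is illustrative only and not part of this commit; the import path is assumed to be Rook's usual `ceph.rook.io/v1` package.

```go
package main

import (
	"fmt"

	// assumed import path for the CephCluster API types; adjust to your module layout
	cephv1 "github.com/rook/rook/pkg/apis/ceph.rook.io/v1"
)

func main() {
	// Mirrors the `allowOsdCrushWeightUpdate: true` YAML examples further down in this commit.
	storage := cephv1.StorageScopeSpec{
		AllowOsdCrushWeightUpdate: true,
	}
	fmt.Printf("allowOsdCrushWeightUpdate=%v\n", storage.AllowOsdCrushWeightUpdate)
}
```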
45 changes: 44 additions & 1 deletion pkg/daemon/ceph/client/osd.go
@@ -18,6 +18,7 @@ package client
import (
"encoding/json"
"fmt"
"math"
"strconv"
"strings"

@@ -42,7 +43,7 @@ type OSDNodeUsage struct {
CrushWeight json.Number `json:"crush_weight"`
Depth json.Number `json:"depth"`
Reweight json.Number `json:"reweight"`
KB json.Number `json:"kb"`
KB json.Number `json:"kb"` // KB is in KiB units
UsedKB json.Number `json:"kb_used"`
AvailKB json.Number `json:"kb_avail"`
Utilization json.Number `json:"utilization"`
@@ -217,6 +218,48 @@ func GetOSDUsage(context *clusterd.Context, clusterInfo *ClusterInfo) (*OSDUsage
return &osdUsage, nil
}

func convertKibibytesToTebibytes(kib string) (float64, error) {
kibFloat, err := strconv.ParseFloat(kib, 64)
if err != nil {
return float64(0), errors.Wrap(err, "failed to convert string to float")
}
return kibFloat / float64(1024*1024*1024), nil
}

func ResizeOsdCrushWeight(actualOSD OSDNodeUsage, ctx *clusterd.Context, clusterInfo *ClusterInfo) (bool, error) {
currentCrushWeight, err := strconv.ParseFloat(actualOSD.CrushWeight.String(), 64)
if err != nil {
return false, errors.Wrapf(err, "failed converting string to float for osd.%d crush weight %q", actualOSD.ID, actualOSD.CrushWeight.String())
}
// actualOSD.KB is in KiB units
calculatedCrushWeight, err := convertKibibytesToTebibytes(actualOSD.KB.String())
if err != nil {
return false, errors.Wrapf(err, "failed to convert KiB to TiB for osd.%d crush weight %q", actualOSD.ID, actualOSD.KB.String())
}

// do not reweight if the calculated crush weight is 0, is less than or equal to the current crush weight, or if the increase over the current crush weight is 1 percent or less
if calculatedCrushWeight == float64(0) {
logger.Debugf("osd size is 0 for osd.%d, not resizing the crush weights", actualOSD.ID)
return false, nil
} else if calculatedCrushWeight <= currentCrushWeight {
logger.Debugf("calculatedCrushWeight %f is less than or equal to the currentCrushWeight %f for osd.%d, not resizing the crush weights", calculatedCrushWeight, currentCrushWeight, actualOSD.ID)
return false, nil
} else if math.Abs(((calculatedCrushWeight - currentCrushWeight) / currentCrushWeight)) <= 0.01 {
logger.Debugf("calculatedCrushWeight %f is no more than 1 percent greater than the currentCrushWeight %f for osd.%d, not resizing the crush weights", calculatedCrushWeight, currentCrushWeight, actualOSD.ID)
return false, nil
}

calculatedCrushWeightString := fmt.Sprintf("%f", calculatedCrushWeight)
logger.Infof("updating osd.%d crush weight to %q for cluster in namespace %q", actualOSD.ID, calculatedCrushWeightString, clusterInfo.Namespace)
args := []string{"osd", "crush", "reweight", fmt.Sprintf("osd.%d", actualOSD.ID), calculatedCrushWeightString}
buf, err := NewCephCommand(ctx, clusterInfo, args).Run()
if err != nil {
return false, errors.Wrapf(err, "failed to reweight osd.%d for cluster in namespace %q from actual crush weight %f to calculated crush weight %f: %s", actualOSD.ID, clusterInfo.Namespace, currentCrushWeight, calculatedCrushWeight, string(buf))
}

return true, nil
}

func SetDeviceClass(context *clusterd.Context, clusterInfo *ClusterInfo, osdID int, deviceClass string) error {
// First remove the existing device class
args := []string{"osd", "crush", "rm-device-class", fmt.Sprintf("osd.%d", osdID)}
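As a rough illustration of the rule implemented by `ResizeOsdCrushWeight`, the standalone sketch below (not part of the commit; the helper names `kibToTiB` and `shouldReweight` are made up here) applies the same KiB-to-TiB conversion and 1% threshold to a few sizes taken from the test fixture further down.

```go
package main

import (
	"fmt"
	"math"
)

// kibToTiB mirrors convertKibibytesToTebibytes above: TiB = KiB / 1024^3.
func kibToTiB(kib float64) float64 {
	return kib / (1024 * 1024 * 1024)
}

// shouldReweight reproduces the decision rule from ResizeOsdCrushWeight:
// skip when the reported size is 0, when it does not exceed the current
// CRUSH weight, or when the increase is 1 percent or less.
func shouldReweight(kb, currentCrushWeight float64) (bool, float64) {
	calculated := kibToTiB(kb)
	if calculated == 0 || calculated <= currentCrushWeight {
		return false, calculated
	}
	if math.Abs((calculated-currentCrushWeight)/currentCrushWeight) <= 0.01 {
		return false, calculated
	}
	return true, calculated
}

func main() {
	current := 0.039093017578125 // crush weight in TiB, as reported by `ceph osd df`
	for _, kb := range []float64{41943040, 0, 42333872, 9841943040} {
		reweight, calculated := shouldReweight(kb, current)
		fmt.Printf("kb=%.0f -> %.6f TiB, reweight=%v\n", kb, calculated, reweight)
	}
}
```

The 1% guard keeps the operator from churning CRUSH weights (and triggering data rebalancing) for rounding-level differences between the reported device size and the existing weight.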
12 changes: 12 additions & 0 deletions pkg/daemon/ceph/client/osd_test.go
@@ -141,6 +141,18 @@ func TestOSDDeviceClasses(t *testing.T) {
})
}

func TestConvertKibibytesToTebibytes(t *testing.T) {
kib := "1024"
terabyte, err := convertKibibytesToTebibytes(kib)
assert.NoError(t, err)
assert.Equal(t, float64(9.5367431640625e-07), terabyte)

kib = "1073741824"
terabyte, err = convertKibibytesToTebibytes(kib)
assert.NoError(t, err)
assert.Equal(t, float64(1), terabyte)
}

func TestOSDOkToStop(t *testing.T) {
returnString := ""
returnOkResult := true
17 changes: 14 additions & 3 deletions pkg/operator/ceph/cluster/osd/osd.go
@@ -325,13 +325,24 @@ func (c *Cluster) postReconcileUpdateOSDProperties(desiredOSDs map[int]*OSDInfo)
}
logger.Debugf("post processing osd properties with %d actual osds from ceph osd df and %d existing osds found during reconcile", len(osdUsage.OSDNodes), len(desiredOSDs))
for _, actualOSD := range osdUsage.OSDNodes {
if desiredOSD, ok := desiredOSDs[actualOSD.ID]; ok {
if err := c.updateDeviceClassIfChanged(actualOSD.ID, desiredOSD.DeviceClass, actualOSD.DeviceClass); err != nil {
if c.spec.Storage.AllowOsdCrushWeightUpdate {
_, err := cephclient.ResizeOsdCrushWeight(actualOSD, c.context, c.clusterInfo)
if err != nil {
// Log the error and allow other updates to continue
logger.Error(err)
logger.Errorf("failed to resize osd crush weight on cluster in namespace %s: %v", c.clusterInfo.Namespace, err)
}
}

desiredOSD, ok := desiredOSDs[actualOSD.ID]
if !ok {
continue
}
if err := c.updateDeviceClassIfChanged(actualOSD.ID, desiredOSD.DeviceClass, actualOSD.DeviceClass); err != nil {
// Log the error and allow other updates to continue
logger.Errorf("failed to update device class on cluster in namespace %s: %v", c.clusterInfo.Namespace, err)
}
}

return nil
}

44 changes: 34 additions & 10 deletions pkg/operator/ceph/cluster/osd/osd_test.go
@@ -52,11 +52,20 @@ import (
const (
healthyCephStatus = `{"fsid":"877a47e0-7f6c-435e-891a-76983ab8c509","health":{"checks":{},"status":"HEALTH_OK"},"election_epoch":12,"quorum":[0,1,2],"quorum_names":["a","b","c"],"monmap":{"epoch":3,"fsid":"877a47e0-7f6c-435e-891a-76983ab8c509","modified":"2020-11-02 09:58:23.015313","created":"2020-11-02 09:57:37.719235","min_mon_release":14,"min_mon_release_name":"nautilus","features":{"persistent":["kraken","luminous","mimic","osdmap-prune","nautilus"],"optional":[]},"mons":[{"rank":0,"name":"a","public_addrs":{"addrvec":[{"type":"v2","addr":"172.30.74.42:3300","nonce":0},{"type":"v1","addr":"172.30.74.42:6789","nonce":0}]},"addr":"172.30.74.42:6789/0","public_addr":"172.30.74.42:6789/0"},{"rank":1,"name":"b","public_addrs":{"addrvec":[{"type":"v2","addr":"172.30.101.61:3300","nonce":0},{"type":"v1","addr":"172.30.101.61:6789","nonce":0}]},"addr":"172.30.101.61:6789/0","public_addr":"172.30.101.61:6789/0"},{"rank":2,"name":"c","public_addrs":{"addrvec":[{"type":"v2","addr":"172.30.250.55:3300","nonce":0},{"type":"v1","addr":"172.30.250.55:6789","nonce":0}]},"addr":"172.30.250.55:6789/0","public_addr":"172.30.250.55:6789/0"}]},"osdmap":{"osdmap":{"epoch":19,"num_osds":3,"num_up_osds":3,"num_in_osds":3,"num_remapped_pgs":0}},"pgmap":{"pgs_by_state":[{"state_name":"active+clean","count":96}],"num_pgs":96,"num_pools":3,"num_objects":79,"data_bytes":81553681,"bytes_used":3255447552,"bytes_avail":1646011994112,"bytes_total":1649267441664,"read_bytes_sec":853,"write_bytes_sec":5118,"read_op_per_sec":1,"write_op_per_sec":0},"fsmap":{"epoch":9,"id":1,"up":1,"in":1,"max":1,"by_rank":[{"filesystem_id":1,"rank":0,"name":"ocs-storagecluster-cephfilesystem-b","status":"up:active","gid":14161},{"filesystem_id":1,"rank":0,"name":"ocs-storagecluster-cephfilesystem-a","status":"up:standby-replay","gid":24146}],"up:standby":0},"mgrmap":{"epoch":10,"active_gid":14122,"active_name":"a","active_addrs":{"addrvec":[{"type":"v2","addr":"10.131.0.28:6800","nonce":1},{"type":"v1","addr":"10.131.0.28:6801","nonce":1}]}}}`
unHealthyCephStatus = `{"fsid":"613975f3-3025-4802-9de1-a2280b950e75","health":{"checks":{"OSD_DOWN":{"severity":"HEALTH_WARN","summary":{"message":"1 osds down"}},"OSD_HOST_DOWN":{"severity":"HEALTH_WARN","summary":{"message":"1 host (1 osds) down"}},"PG_AVAILABILITY":{"severity":"HEALTH_WARN","summary":{"message":"Reduced data availability: 101 pgs stale"}},"POOL_APP_NOT_ENABLED":{"severity":"HEALTH_WARN","summary":{"message":"application not enabled on 1 pool(s)"}}},"status":"HEALTH_WARN","overall_status":"HEALTH_WARN"},"election_epoch":12,"quorum":[0,1,2],"quorum_names":["rook-ceph-mon0","rook-ceph-mon2","rook-ceph-mon1"],"monmap":{"epoch":3,"fsid":"613975f3-3025-4802-9de1-a2280b950e75","modified":"2017-08-11 20:13:02.075679","created":"2017-08-11 20:12:35.314510","features":{"persistent":["kraken","luminous"],"optional":[]},"mons":[{"rank":0,"name":"rook-ceph-mon0","addr":"10.3.0.45:6789/0","public_addr":"10.3.0.45:6789/0"},{"rank":1,"name":"rook-ceph-mon2","addr":"10.3.0.249:6789/0","public_addr":"10.3.0.249:6789/0"},{"rank":2,"name":"rook-ceph-mon1","addr":"10.3.0.252:6789/0","public_addr":"10.3.0.252:6789/0"}]},"osdmap":{"osdmap":{"epoch":17,"num_osds":2,"num_up_osds":1,"num_in_osds":2,"full":false,"nearfull":true,"num_remapped_pgs":0}},"pgmap":{"pgs_by_state":[{"state_name":"stale+active+clean","count":101},{"state_name":"active+clean","count":99}],"num_pgs":200,"num_pools":2,"num_objects":243,"data_bytes":976793635,"bytes_used":13611479040,"bytes_avail":19825307648,"bytes_total":33436786688},"fsmap":{"epoch":1,"by_rank":[]},"mgrmap":{"epoch":3,"active_gid":14111,"active_name":"rook-ceph-mgr0","active_addr":"10.2.73.6:6800/9","available":true,"standbys":[],"modules":["restful","status"],"available_modules":["dashboard","prometheus","restful","status","zabbix"]},"servicemap":{"epoch":1,"modified":"0.000000","services":{}}}`
osdDFResults = `
// osdDFResults is a JSON representation of the output of the `ceph osd df` command.
// It contains 5 osds with different storage usage, exercising the crush weight resize logic:
// 1) kb size (in TiB) < crush_weight size -> no reweight
// 2) kb size (in TiB) = 0 -> no reweight
// 3) kb size (in TiB) exceeds crush_weight size by ~0.85%, below the 1% threshold -> no reweight
// 4) & 5) kb size (in TiB) exceeds crush_weight size by more than 1% -> reweight
osdDFResults = `
{"nodes":[
{"id":0,"device_class":"hdd","name":"osd.0","type":"osd","type_id":0,"crush_weight":0.039093017578125,"depth":2,"pool_weights":{},"reweight":1,"kb":41943040,"kb_used":27640,"kb_used_data":432,"kb_used_omap":1,"kb_used_meta":27198,"kb_avail":41915400,"utilization":0.065898895263671875,"var":0.99448308946989694,"pgs":9,"status":"up"},
{"id":1,"device_class":"hdd","name":"osd.1","type":"osd","type_id":0,"crush_weight":0.039093017578125,"depth":2,"pool_weights":{},"reweight":1,"kb":41943040,"kb_used":27960,"kb_used_data":752,"kb_used_omap":1,"kb_used_meta":27198,"kb_avail":41915080,"utilization":0.066661834716796875,"var":1.005996641880547,"pgs":15,"status":"up"},
{"id":2,"device_class":"hdd","name":"osd.2","type":"osd","type_id":0,"crush_weight":0.039093017578125,"depth":2,"pool_weights":{},"reweight":1,"kb":41943040,"kb_used":27780,"kb_used_data":564,"kb_used_omap":1,"kb_used_meta":27198,"kb_avail":41915260,"utilization":0.066232681274414062,"var":0.99952026864955634,"pgs":8,"status":"up"}],
{"id":1,"device_class":"hdd","name":"osd.1","type":"osd","type_id":0,"crush_weight":0.039093017578125,"depth":2,"pool_weights":{},"reweight":1,"kb":0,"kb_used":27960,"kb_used_data":752,"kb_used_omap":1,"kb_used_meta":27198,"kb_avail":41915080,"utilization":0.066661834716796875,"var":1.005996641880547,"pgs":15,"status":"up"},
{"id":2,"device_class":"hdd","name":"osd.1","type":"osd","type_id":0,"crush_weight":0.039093017578125,"depth":2,"pool_weights":{},"reweight":1,"kb":42333872,"kb_used":27960,"kb_used_data":752,"kb_used_omap":1,"kb_used_meta":27198,"kb_avail":41915080,"utilization":0.066661834716796875,"var":1.005996641880547,"pgs":15,"status":"up"},
{"id":3,"device_class":"hdd","name":"osd.1","type":"osd","type_id":0,"crush_weight":0.039093017578125,"depth":2,"pool_weights":{},"reweight":1,"kb":9841943040,"kb_used":27960,"kb_used_data":752,"kb_used_omap":1,"kb_used_meta":27198,"kb_avail":41915080,"utilization":0.066661834716796875,"var":1.005996641880547,"pgs":15,"status":"up"},
{"id":4,"device_class":"hdd","name":"osd.2","type":"osd","type_id":0,"crush_weight":0.039093017578125,"depth":2,"pool_weights":{},"reweight":1,"kb":9991943040,"kb_used":27780,"kb_used_data":564,"kb_used_omap":1,"kb_used_meta":27198,"kb_avail":41915260,"utilization":0.066232681274414062,"var":0.99952026864955634,"pgs":8,"status":"up"}],
"stray":[],"summary":{"total_kb":125829120,"total_kb_used":83380,"total_kb_used_data":1748,"total_kb_used_omap":3,"total_kb_used_meta":81596,"total_kb_avail":125745740,"average_utilization":0.066264470418294266,"min_var":0.99448308946989694,"max_var":1.005996641880547,"dev":0.00031227879054369131}}`
)
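As a quick sanity check (not part of the commit), the expected reweight values asserted later in this file come straight from dividing the fixture's `kb` values by 1024³ and formatting with `%f`:

```go
package main

import "fmt"

func main() {
	// osd.3 and osd.4 from the fixture above: kb / 1024^3 = crush weight in TiB
	fmt.Printf("%f\n", 9841943040.0/(1024*1024*1024)) // 9.166024
	fmt.Printf("%f\n", 9991943040.0/(1024*1024*1024)) // 9.305722
	// osd.2: 42333872 KiB -> ~0.039426 TiB, only ~0.85% above 0.039093, so it is skipped
}
```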

@@ -371,12 +380,14 @@ func TestAddRemoveNode(t *testing.T) {
assert.True(t, k8serrors.IsNotFound(err))
}

func TestUpdateDeviceClass(t *testing.T) {
func TestPostReconcileUpdateOSDProperties(t *testing.T) {
namespace := "ns"
clientset := fake.NewSimpleClientset()
removedDeviceClassOSD := ""
setDeviceClassOSD := ""
setDeviceClass := ""
var crushWeight []string
var osdID []string
executor := &exectest.MockExecutor{
MockExecuteCommandWithOutput: func(command string, args ...string) (string, error) {
logger.Infof("ExecuteCommandWithOutput: %s %v", command, args)
@@ -390,6 +401,9 @@ func TestUpdateDeviceClass(t *testing.T) {
} else if args[2] == "set-device-class" {
setDeviceClass = args[3]
setDeviceClassOSD = args[4]
} else if args[2] == "reweight" {
osdID = append(osdID, args[3])
crushWeight = append(crushWeight, args[4])
}
}
}
@@ -402,7 +416,6 @@ func TestUpdateDeviceClass(t *testing.T) {
Name: "testing",
Namespace: namespace,
},
Spec: cephv1.ClusterSpec{Storage: cephv1.StorageScopeSpec{AllowDeviceClassUpdate: true}},
}
// Objects to track in the fake client.
object := []runtime.Object{
@@ -426,11 +439,22 @@ func TestUpdateDeviceClass(t *testing.T) {
1: {ID: 1, DeviceClass: "hdd"},
2: {ID: 2, DeviceClass: "newclass"},
}
err := c.postReconcileUpdateOSDProperties(desiredOSDs)
assert.Nil(t, err)
assert.Equal(t, "newclass", setDeviceClass)
assert.Equal(t, "osd.2", setDeviceClassOSD)
assert.Equal(t, "osd.2", removedDeviceClassOSD)
t.Run("test device class change", func(t *testing.T) {
c.spec.Storage = cephv1.StorageScopeSpec{AllowDeviceClassUpdate: true}
err := c.postReconcileUpdateOSDProperties(desiredOSDs)
assert.Nil(t, err)
assert.Equal(t, "newclass", setDeviceClass)
assert.Equal(t, "osd.2", setDeviceClassOSD)
assert.Equal(t, "osd.2", removedDeviceClassOSD)
})
t.Run("test resize Osd Crush Weight", func(t *testing.T) {
c.spec.Storage = cephv1.StorageScopeSpec{AllowOsdCrushWeightUpdate: true}
err := c.postReconcileUpdateOSDProperties(desiredOSDs)
assert.Nil(t, err)
// only osds whose reported size exceeds the current crush weight by more than 1% should be reweighted
assert.Equal(t, []string([]string{"osd.3", "osd.4"}), osdID)
assert.Equal(t, []string([]string{"9.166024", "9.305722"}), crushWeight)
})
}

func TestAddNodeFailure(t *testing.T) {
