tfy-gpu-operator: Change daemonsets update strategy to RollingUpdate #896

Merged 4 commits on Jan 2, 2025
2 changes: 1 addition & 1 deletion charts/tfy-gpu-operator/Chart.yaml
@@ -1,6 +1,6 @@
apiVersion: v2
name: tfy-gpu-operator
-version: 0.1.25
+version: 0.1.26
description: "Truefoundry GPU Operator"
maintainers:
- name: truefoundry
6 changes: 3 additions & 3 deletions charts/tfy-gpu-operator/README.md
@@ -40,7 +40,7 @@ Tfy-gpu-operator is a Helm chart that facilitates the deployment and management
| `aws-eks-gpu-operator.node-feature-discovery.gc.interval` | Interval between two garbage collection runs. | `30m` |
| `aws-eks-gpu-operator.node-feature-discovery.gc.resources.requests.cpu` | CPU request for node feature discovery garbage collector. | `10m` |
| `aws-eks-gpu-operator.node-feature-discovery.gc.resources.requests.memory` | Memory request for node feature discovery garbage collector. | `100Mi` |
-| `aws-eks-gpu-operator.daemonsets.updateStrategy` | Update Strategy for Daemonsets - one of ["OnDelete", "RollingUpdate"] | `OnDelete` |
+| `aws-eks-gpu-operator.daemonsets.updateStrategy` | Update Strategy for Daemonsets - one of ["OnDelete", "RollingUpdate"] | `RollingUpdate` |
| `aws-eks-gpu-operator.dcgm.enabled` | Enabled/Disable standalone DCGM. | `false` |
| `aws-eks-gpu-operator.dcgm.version` | Image tag for DCGM container. Find all image tags at https://catalog.ngc.nvidia.com/orgs/nvidia/teams/cloud-native/containers/dcgm/tags | `3.3.8-1-ubuntu22.04` |
| `aws-eks-gpu-operator.dcgm.resources.requests.cpu` | CPU request for standalone DCGM container | `10m` |
@@ -119,7 +119,7 @@ Tfy-gpu-operator is a Helm chart that facilitates the deployment and management
| `azure-aks-gpu-operator.node-feature-discovery.gc.interval` | Interval between two garbage collection runs. | `30m` |
| `azure-aks-gpu-operator.node-feature-discovery.gc.resources.requests.cpu` | CPU request for node feature discovery garbage collector. | `10m` |
| `azure-aks-gpu-operator.node-feature-discovery.gc.resources.requests.memory` | Memory request for node feature discovery garbage collector. | `100Mi` |
-| `azure-aks-gpu-operator.daemonsets.updateStrategy` | Update Strategy for Daemonsets - one of ["OnDelete", "RollingUpdate"] | `OnDelete` |
+| `azure-aks-gpu-operator.daemonsets.updateStrategy` | Update Strategy for Daemonsets - one of ["OnDelete", "RollingUpdate"] | `RollingUpdate` |
| `azure-aks-gpu-operator.daemonsets.priorityClassName` | Priority class for Daemonsets | `system-node-critical` |
| `azure-aks-gpu-operator.driver.enabled` | Enable/Disable driver installation. | `false` |
| `azure-aks-gpu-operator.toolkit.enabled` | Enable/Disable nvidia container toolkit installation. | `true` |
@@ -206,7 +206,7 @@ Tfy-gpu-operator is a Helm chart that facilitates the deployment and management
| `generic-gpu-operator.node-feature-discovery.gc.interval` | Interval between two garbage collection runs. | `30m` |
| `generic-gpu-operator.node-feature-discovery.gc.resources.requests.cpu` | CPU request for node feature discovery garbage collector. | `10m` |
| `generic-gpu-operator.node-feature-discovery.gc.resources.requests.memory` | Memory request for node feature discovery garbage collector. | `100Mi` |
-| `generic-gpu-operator.daemonsets.updateStrategy` | Update Strategy for Daemonsets - one of ["OnDelete", "RollingUpdate"] | `OnDelete` |
+| `generic-gpu-operator.daemonsets.updateStrategy` | Update Strategy for Daemonsets - one of ["OnDelete", "RollingUpdate"] | `RollingUpdate` |
| `generic-gpu-operator.daemonsets.priorityClassName` | Priority class for Daemonsets | `system-node-critical` |
| `generic-gpu-operator.driver.enabled` | Enable/Disable driver installation. | `true` |
| `generic-gpu-operator.toolkit.enabled` | Enable/Disable nvidia container toolkit installation. | `true` |
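For clusters that still need the previous default, the documented keys can be overridden per sub-chart at install or upgrade time. A minimal sketch, assuming the value paths listed in the README table above:

```yaml
# values-override.yaml — sketch only; pins the daemonset update strategy back to
# OnDelete for clusters that cannot tolerate automatic pod restarts on upgrades.
aws-eks-gpu-operator:
  daemonsets:
    updateStrategy: "OnDelete"
azure-aks-gpu-operator:
  daemonsets:
    updateStrategy: "OnDelete"
generic-gpu-operator:
  daemonsets:
    updateStrategy: "OnDelete"
```

The file would be passed via `helm upgrade -f values-override.yaml ...`; the release name and namespace depend on the installation.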
21 changes: 3 additions & 18 deletions charts/tfy-gpu-operator/values.yaml
@@ -250,13 +250,8 @@ aws-eks-gpu-operator:
## Daemonsets configuration
daemonsets:
## @param aws-eks-gpu-operator.daemonsets.updateStrategy Update Strategy for Daemonsets - one of ["OnDelete", "RollingUpdate"]
-# This is set to OnDelete to protect against pod failures in case device plugin is unavailable during a kubelet restart (caused by toolkit container restart)
-# The downside being Daemonset will not be updated until it is manually deleted,
-# which is mostly okay for gpu case - effect of toolkit, device plugin daemonsets are limited to the node they run on.
-# If needed, older nodes can be drained to force a newer versions on a newer nodes
-# This will be changed to "RollingUpdate" when newer kubelet versions with allocation issue fix become the norm
-# We would also like to use `daemonsets.rollingUpdate.maxSurge: 1` which is not supported yet
-updateStrategy: "OnDelete"
+updateStrategy: "RollingUpdate"

## Validator configuration.
validator:
@@ -668,13 +663,8 @@ azure-aks-gpu-operator:
## Daemonsets configuration
daemonsets:
## @param azure-aks-gpu-operator.daemonsets.updateStrategy Update Strategy for Daemonsets - one of ["OnDelete", "RollingUpdate"]
-# This is set to OnDelete to protect against pod failures in case device plugin is unavailable during a kubelet restart (caused by toolkit container restart)
-# The downside being Daemonset will not be updated until it is manually deleted,
-# which is mostly okay for gpu case - effect of toolkit, device plugin daemonsets are limited to the node they run on.
-# If needed, older nodes can be drained to force a newer versions on a newer nodes
-# This will be changed to "RollingUpdate" when newer kubelet versions with allocation issue fix become the norm
-# We would also like to use `daemonsets.rollingUpdate.maxSurge: 1` which is not supported yet
-updateStrategy: OnDelete
+updateStrategy: RollingUpdate
## @param azure-aks-gpu-operator.daemonsets.priorityClassName Priority class for Daemonsets
priorityClassName: system-node-critical
## @skip azure-aks-gpu-operator.daemonsets.tolerations
@@ -1179,13 +1169,8 @@ generic-gpu-operator:
## Daemonsets configuration
daemonsets:
## @param generic-gpu-operator.daemonsets.updateStrategy Update Strategy for Daemonsets - one of ["OnDelete", "RollingUpdate"]
-# This is set to OnDelete to protect against pod failures in case device plugin is unavailable during a kubelet restart (caused by toolkit container restart)
-# The downside being Daemonset will not be updated until it is manually deleted,
-# which is mostly okay for gpu case - effect of toolkit, device plugin daemonsets are limited to the node they run on.
-# If needed, older nodes can be drained to force a newer versions on a newer nodes
-# This will be changed to "RollingUpdate" when newer kubelet versions with allocation issue fix become the norm
-# We would also like to use `daemonsets.rollingUpdate.maxSurge: 1` which is not supported yet
-updateStrategy: OnDelete
+updateStrategy: RollingUpdate
## @param generic-gpu-operator.daemonsets.priorityClassName Priority class for Daemonsets
priorityClassName: system-node-critical
## @skip generic-gpu-operator.daemonsets.tolerations
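With `RollingUpdate`, the operator-managed daemonsets (toolkit, device plugin, DCGM exporter, and similar) now roll their pods automatically on chart upgrades instead of waiting for manual pod deletion. A sketch of the resulting DaemonSet spec fragment, using Kubernetes defaults rather than actual chart output:

```yaml
# DaemonSet update strategy as it would appear when updateStrategy is "RollingUpdate".
# Field values below are Kubernetes defaults, not taken from the chart.
spec:
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1   # DaemonSet default; the maxSurge mentioned in the removed comment is not set
```

Rollout progress for an individual daemonset can be checked with `kubectl rollout status daemonset/<name> -n <namespace>` after an upgrade.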