Skip to content

Commit

Permalink
feat (k8s/GPU): Changed aliyun GPU memory management plugin to NVIDIA official device plugin.
Browse files Browse the repository at this point in the history
  • Loading branch information
antoniochavesgarcia committed Jan 18, 2024
1 parent d36fdb4 commit 1612e2b
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 39 deletions.
68 changes: 38 additions & 30 deletions backend/automl/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -548,10 +548,11 @@ def post(self, request, format=None):
{'name': 'NVIDIA_VISIBLE_DEVICES', 'value': "all"}, ## (Sharing GPU)
{'name': 'CASE', 'value': str(case)}
],
'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
}],
'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
'restartPolicy': 'OnFailure'
'imagePullPolicy': 'Always',
'restartPolicy': 'OnFailure',
'runtimeClassName': 'nvidia'
}
}
}
Expand Down Expand Up @@ -593,10 +594,11 @@ def post(self, request, format=None):
{'name': 'MODEL_LOGGER_TOPIC', 'value': str(settings.MODEL_LOGGER_TOPIC)},
{'name': 'FEDERATED_STRING_ID', 'value': str(federated_string_id)}
],
'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
}],
'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
'restartPolicy': 'OnFailure'
'imagePullPolicy': 'Always',
'restartPolicy': 'OnFailure',
'runtimeClassName': 'nvidia'
}
}
}
Expand Down Expand Up @@ -634,10 +636,11 @@ def post(self, request, format=None):
{'name': 'CHANGE', 'value': deployment.change},
{'name': 'IMPROVEMENT', 'value': str(deployment.improvement)}
],
'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
}],
'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
'restartPolicy': 'OnFailure'
'imagePullPolicy': 'Always',
'restartPolicy': 'OnFailure',
'runtimeClassName': 'nvidia'
}
}
}
Expand Down Expand Up @@ -684,10 +687,11 @@ def post(self, request, format=None):
{'name': 'MODEL_LOGGER_TOPIC', 'value': str(settings.MODEL_LOGGER_TOPIC)},
{'name': 'FEDERATED_STRING_ID', 'value': str(federated_string_id)}
],
'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
}],
'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
'restartPolicy': 'OnFailure'
'imagePullPolicy': 'Always',
'restartPolicy': 'OnFailure',
'runtimeClassName': 'nvidia'
}
}
}
Expand Down Expand Up @@ -746,10 +750,11 @@ def post(self, request, format=None):
{'name': 'NVIDIA_VISIBLE_DEVICES', 'value': "all"}, ## (Sharing GPU)
{'name': 'CASE', 'value': str(case)}
],
'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
}],
'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
'restartPolicy': 'OnFailure'
'imagePullPolicy': 'Always',
'restartPolicy': 'OnFailure',
'runtimeClassName': 'nvidia'
}
}
}
Expand Down Expand Up @@ -796,10 +801,11 @@ def post(self, request, format=None):
{'name': 'MODEL_LOGGER_TOPIC', 'value': str(settings.MODEL_LOGGER_TOPIC)},
{'name': 'FEDERATED_STRING_ID', 'value': str(federated_string_id)}
],
'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
}],
'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
'restartPolicy': 'OnFailure'
'imagePullPolicy': 'Always',
'restartPolicy': 'OnFailure',
'runtimeClassName': 'nvidia'
}
}
}
Expand Down Expand Up @@ -839,10 +845,11 @@ def post(self, request, format=None):
{'name': 'STREAM_TIMEOUT', 'value': str(deployment.stream_timeout) if not deployment.indefinite else str(-1)},
{'name': 'IMPROVEMENT', 'value': str(deployment.improvement)}
],
'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
}],
'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
'restartPolicy': 'OnFailure'
'imagePullPolicy': 'Always',
'restartPolicy': 'OnFailure',
'runtimeClassName': 'nvidia'
}
}
}
Expand Down Expand Up @@ -891,10 +898,11 @@ def post(self, request, format=None):
{'name': 'MODEL_LOGGER_TOPIC', 'value': str(settings.MODEL_LOGGER_TOPIC)},
{'name': 'FEDERATED_STRING_ID', 'value': str(federated_string_id)}
],
'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
}],
'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
'restartPolicy': 'OnFailure'
'imagePullPolicy': 'Always',
'restartPolicy': 'OnFailure',
'runtimeClassName': 'nvidia'
}
}
}
Expand Down Expand Up @@ -1540,10 +1548,10 @@ def post(self, request, pk, format=None):
{'name': 'GROUP_ID', 'value': 'inf'+str(result.id)},
{'name': 'NVIDIA_VISIBLE_DEVICES', 'value': "all"} ## (Sharing GPU)
],
'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
#'resources': {'limits':{'nvidia.com/gpu': 1}} ## (Greedy GPU)
'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
}],
'imagePullPolicy': 'IfNotPresent' # TODO: Remove this when the image is in DockerHub
'imagePullPolicy': 'Always',
'runtimeClassName': 'nvidia'
}
}
}
Expand Down Expand Up @@ -1593,10 +1601,10 @@ def post(self, request, pk, format=None):
{'name': 'LIMIT', 'value': str(inference.limit)},
{'name': 'NVIDIA_VISIBLE_DEVICES', 'value': "all"} ## (Sharing GPU)
],
'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
#'resources': {'limits':{'nvidia.com/gpu': 1}} ## (Greedy GPU)
'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
}],
'imagePullPolicy': 'IfNotPresent' # TODO: Remove this when the image is in DockerHub
'imagePullPolicy': 'Always',
'runtimeClassName': 'nvidia'
}
}
}
Expand Down
19 changes: 10 additions & 9 deletions kustomize/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,16 @@
This folder contains multiple Kustomize files to ease the deployment on
Kubernetes. Notably the following versions are available:

| Version | Resource URL |
| ------------ | --------------------------------------------------------- |
| `master` | `github.com/ertis-research/kafka-ml/kustomize/master` |
| `master-gpu` | `github.com/ertis-research/kafka-ml/kustomize/master-gpu` |
| `v1.0` | `github.com/ertis-research/kafka-ml/kustomize/v1.0` |
| `v1.0-gpu` | `github.com/ertis-research/kafka-ml/kustomize/v1.0-gpu` |
| `v1.1` | `github.com/ertis-research/kafka-ml/kustomize/v1.1` |
| `v1.1-gpu` | `github.com/ertis-research/kafka-ml/kustomize/v1.1-gpu` |
| `local` | `github.com/ertis-research/kafka-ml/kustomize/local` |
| Version | Resource URL |
| ------------------- | ---------------------------------------------------------------- |
| `master` | `github.com/ertis-research/kafka-ml/kustomize/master` |
| `master-gpu` | `github.com/ertis-research/kafka-ml/kustomize/master-gpu` |
| `v1.0` | `github.com/ertis-research/kafka-ml/kustomize/v1.0` |
| `v1.0-gpu` | `github.com/ertis-research/kafka-ml/kustomize/v1.0-gpu` |
| `v1.1` | `github.com/ertis-research/kafka-ml/kustomize/v1.1` |
| `v1.1-gpu` | `github.com/ertis-research/kafka-ml/kustomize/v1.1-gpu` |
| `v1.1-gpu-nvidia` | `github.com/ertis-research/kafka-ml/kustomize/v1.1-gpu-nvidia` |
| `local` | `github.com/ertis-research/kafka-ml/kustomize/local` |

These versions should work with any Kubernetes compatible cluster, such as K8s
and K3s.
Expand Down
17 changes: 17 additions & 0 deletions kustomize/v1.1-gpu-nvidia/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
resources:
- "../v1.1"

configMapGenerator:
- name: kafkaml-configmap
behavior: merge
literals:
- tensorflow.training.image=ertis/kafka-ml-tensorflow_model_training-gpu:v1.1
- tensorflow.inference.image=ertis/kafka-ml-tensorflow_model_inference-gpu:v1.1
- pytorch.training.image=ertis/kafka-ml-pytorch_model_training-gpu:v1.1
- pytorch.inference.image=ertis/kafka-ml-pytorch_model_inference-gpu:v1.1

images:
- name: ertis/kafka-ml-pthexecutor
newName: ertis/kafka-ml-pthexecutor-gpu
- name: ertis/kafka-ml-tfexecutor
newName: ertis/kafka-ml-tfexecutor-gpu

0 comments on commit 1612e2b

Please sign in to comment.