Skip to content

Commit

Permalink
feat (k8s/GPU): Changed aliyun GPU memory management plugin to NVIDIA official device plugin.
Browse files Browse the repository at this point in the history
  • Loading branch information
antoniochavesgarcia committed Jan 18, 2024
1 parent d36fdb4 commit 1612e2b
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 39 deletions.
68 changes: 38 additions & 30 deletions backend/automl/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -548,10 +548,11 @@ def post(self, request, format=None):
{'name': 'NVIDIA_VISIBLE_DEVICES', 'value': "all"}, ## (Sharing GPU)
{'name': 'CASE', 'value': str(case)}
],
'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
}],
'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
'restartPolicy': 'OnFailure'
'imagePullPolicy': 'Always',
'restartPolicy': 'OnFailure',
'runtimeClassName': 'nvidia'
}
}
}
Expand Down Expand Up @@ -593,10 +594,11 @@ def post(self, request, format=None):
{'name': 'MODEL_LOGGER_TOPIC', 'value': str(settings.MODEL_LOGGER_TOPIC)},
{'name': 'FEDERATED_STRING_ID', 'value': str(federated_string_id)}
],
'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
}],
'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
'restartPolicy': 'OnFailure'
'imagePullPolicy': 'Always',
'restartPolicy': 'OnFailure',
'runtimeClassName': 'nvidia'
}
}
}
Expand Down Expand Up @@ -634,10 +636,11 @@ def post(self, request, format=None):
{'name': 'CHANGE', 'value': deployment.change},
{'name': 'IMPROVEMENT', 'value': str(deployment.improvement)}
],
'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
}],
'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
'restartPolicy': 'OnFailure'
'imagePullPolicy': 'Always',
'restartPolicy': 'OnFailure',
'runtimeClassName': 'nvidia'
}
}
}
Expand Down Expand Up @@ -684,10 +687,11 @@ def post(self, request, format=None):
{'name': 'MODEL_LOGGER_TOPIC', 'value': str(settings.MODEL_LOGGER_TOPIC)},
{'name': 'FEDERATED_STRING_ID', 'value': str(federated_string_id)}
],
'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
}],
'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
'restartPolicy': 'OnFailure'
'imagePullPolicy': 'Always',
'restartPolicy': 'OnFailure',
'runtimeClassName': 'nvidia'
}
}
}
Expand Down Expand Up @@ -746,10 +750,11 @@ def post(self, request, format=None):
{'name': 'NVIDIA_VISIBLE_DEVICES', 'value': "all"}, ## (Sharing GPU)
{'name': 'CASE', 'value': str(case)}
],
'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
}],
'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
'restartPolicy': 'OnFailure'
'imagePullPolicy': 'Always',
'restartPolicy': 'OnFailure',
'runtimeClassName': 'nvidia'
}
}
}
Expand Down Expand Up @@ -796,10 +801,11 @@ def post(self, request, format=None):
{'name': 'MODEL_LOGGER_TOPIC', 'value': str(settings.MODEL_LOGGER_TOPIC)},
{'name': 'FEDERATED_STRING_ID', 'value': str(federated_string_id)}
],
'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
}],
'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
'restartPolicy': 'OnFailure'
'imagePullPolicy': 'Always',
'restartPolicy': 'OnFailure',
'runtimeClassName': 'nvidia'
}
}
}
Expand Down Expand Up @@ -839,10 +845,11 @@ def post(self, request, format=None):
{'name': 'STREAM_TIMEOUT', 'value': str(deployment.stream_timeout) if not deployment.indefinite else str(-1)},
{'name': 'IMPROVEMENT', 'value': str(deployment.improvement)}
],
'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
}],
'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
'restartPolicy': 'OnFailure'
'imagePullPolicy': 'Always',
'restartPolicy': 'OnFailure',
'runtimeClassName': 'nvidia'
}
}
}
Expand Down Expand Up @@ -891,10 +898,11 @@ def post(self, request, format=None):
{'name': 'MODEL_LOGGER_TOPIC', 'value': str(settings.MODEL_LOGGER_TOPIC)},
{'name': 'FEDERATED_STRING_ID', 'value': str(federated_string_id)}
],
'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
}],
'imagePullPolicy': 'IfNotPresent', # TODO: Remove this when the image is in DockerHub
'restartPolicy': 'OnFailure'
'imagePullPolicy': 'Always',
'restartPolicy': 'OnFailure',
'runtimeClassName': 'nvidia'
}
}
}
Expand Down Expand Up @@ -1540,10 +1548,10 @@ def post(self, request, pk, format=None):
{'name': 'GROUP_ID', 'value': 'inf'+str(result.id)},
{'name': 'NVIDIA_VISIBLE_DEVICES', 'value': "all"} ## (Sharing GPU)
],
'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
#'resources': {'limits':{'nvidia.com/gpu': 1}} ## (Greedy GPU)
'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
}],
'imagePullPolicy': 'IfNotPresent' # TODO: Remove this when the image is in DockerHub
'imagePullPolicy': 'Always',
'runtimeClassName': 'nvidia'
}
}
}
Expand Down Expand Up @@ -1593,10 +1601,10 @@ def post(self, request, pk, format=None):
{'name': 'LIMIT', 'value': str(inference.limit)},
{'name': 'NVIDIA_VISIBLE_DEVICES', 'value': "all"} ## (Sharing GPU)
],
'resources': {'limits':{'aliyun.com/gpu-mem': gpu_mem_to_allocate}} ## (Sharing GPU)
#'resources': {'limits':{'nvidia.com/gpu': 1}} ## (Greedy GPU)
'resources': {'limits':{'nvidia.com/gpu': gpu_mem_to_allocate}} ## (Sharing GPU)
}],
'imagePullPolicy': 'IfNotPresent' # TODO: Remove this when the image is in DockerHub
'imagePullPolicy': 'Always',
'runtimeClassName': 'nvidia'
}
}
}
Expand Down
19 changes: 10 additions & 9 deletions kustomize/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,16 @@
This folder contains multiple Kustomize files to ease the deployment on
Kubernetes. Notably the following versions are available:

| Version | Resource URL |
| ------------ | --------------------------------------------------------- |
| `master` | `github.com/ertis-research/kafka-ml/kustomize/master` |
| `master-gpu` | `github.com/ertis-research/kafka-ml/kustomize/master-gpu` |
| `v1.0` | `github.com/ertis-research/kafka-ml/kustomize/v1.0` |
| `v1.0-gpu` | `github.com/ertis-research/kafka-ml/kustomize/v1.0-gpu` |
| `v1.1` | `github.com/ertis-research/kafka-ml/kustomize/v1.1` |
| `v1.1-gpu` | `github.com/ertis-research/kafka-ml/kustomize/v1.1-gpu` |
| `local` | `github.com/ertis-research/kafka-ml/kustomize/local` |
| Version | Resource URL |
| ------------------- | ---------------------------------------------------------------- |
| `master` | `github.com/ertis-research/kafka-ml/kustomize/master` |
| `master-gpu` | `github.com/ertis-research/kafka-ml/kustomize/master-gpu` |
| `v1.0` | `github.com/ertis-research/kafka-ml/kustomize/v1.0` |
| `v1.0-gpu` | `github.com/ertis-research/kafka-ml/kustomize/v1.0-gpu` |
| `v1.1` | `github.com/ertis-research/kafka-ml/kustomize/v1.1` |
| `v1.1-gpu` | `github.com/ertis-research/kafka-ml/kustomize/v1.1-gpu` |
| `v1.1-gpu-nvidia` | `github.com/ertis-research/kafka-ml/kustomize/v1.1-gpu-nvidia` |
| `local` | `github.com/ertis-research/kafka-ml/kustomize/local` |

These versions should work with any Kubernetes compatible cluster, such as K8s
and K3s.
Expand Down
17 changes: 17 additions & 0 deletions kustomize/v1.1-gpu-nvidia/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
resources:
- "../v1.1"

configMapGenerator:
- name: kafkaml-configmap
behavior: merge
literals:
- tensorflow.training.image=ertis/kafka-ml-tensorflow_model_training-gpu:v1.1
- tensorflow.inference.image=ertis/kafka-ml-tensorflow_model_inference-gpu:v1.1
- pytorch.training.image=ertis/kafka-ml-pytorch_model_training-gpu:v1.1
- pytorch.inference.image=ertis/kafka-ml-pytorch_model_inference-gpu:v1.1

images:
- name: ertis/kafka-ml-pthexecutor
newName: ertis/kafka-ml-pthexecutor-gpu
- name: ertis/kafka-ml-tfexecutor
newName: ertis/kafka-ml-tfexecutor-gpu

0 comments on commit 1612e2b

Please sign in to comment.