From 6bd5fb63d1ebc0d0d564764a11c3eac5b934050c Mon Sep 17 00:00:00 2001 From: yuteng Date: Fri, 25 Oct 2024 07:24:58 +0800 Subject: [PATCH 01/20] draft of idea Signed-off-by: yuteng --- .../5575-supporting-yunikorn-and-kueue.md | 127 ++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 rfc/system/5575-supporting-yunikorn-and-kueue.md diff --git a/rfc/system/5575-supporting-yunikorn-and-kueue.md b/rfc/system/5575-supporting-yunikorn-and-kueue.md new file mode 100644 index 0000000000..dcde6b3942 --- /dev/null +++ b/rfc/system/5575-supporting-yunikorn-and-kueue.md @@ -0,0 +1,127 @@ +# [Newbie] Supporting Yunikorn and Kueue + +**Authors:** + +- @yuteng + +## 1 Executive Summary + +Providing k8s resource management, gang scheduling and preemption for flyte apllications by 3rd software including Apache Yunikorn and Kueue. + +## 2 Motivation + +Flyte support multi-tenant and many k8s plugins. + +Kueue and Yunikorn support gang scheduling and preemption. +Gang scheduling gurantees some avialable k8s crd services including spark, ray service with sufficient resource and preemption make sure high priority task execute immediately. + +Flyte doesn't maintain a resource management for multi-tenant which Yunikorn can solve it with hierarchy resource queues. + +## 3 Proposed Implementation + +```yaml +queueconfig: + general: + - org: org1 + users: "*" + priorityclass: priority-default + acl: + - jobs: "ray,dask" + users: "user1,user2" + priorityclass: priority-ray + gangscheduling: true + - jobs: "spark" + users: "user1,user3" + priorityclass: priority-spark + gangscheduling: true + namespace: + default: "ns1" + namespaces: "ns1|ns2" + - org: org2 + users: "user4, user5" + acl: + - jobs: "*" + users: "*" + namespace: + namespaces: "*" +``` + +// TODO how this yaml maps yunikorn configuration +// TODO how this yaml maps kueue CRD + +A SchedulerConfigManager maintains config from mentioned yaml. +It patch labels or annotations of k8s resources after they pass rules in config. + +```go +func (e *PluginManager) launchResource(ctx context.Context, tCtx pluginsCore.TaskExecutionContext) (pluginsCore.Transition, error) { + o, err := e.plugin.BuildResource(ctx, k8sTaskCtx) + if err != nil { + return pluginsCore.UnknownTransition, err + } + if err := e.SchedulerConfigManager.Label(0); err != nil { + return pluginsCore.UnknownTransition, err + } +} +``` +When batchscheduler in flyte is yunikorn, some examples are like following. +For exmaple, this appoarch submit a Rayjob owned by user1 in org1 to "root.org1.ray". +A spark application in ns1 submitted by user4 in org1 is in "root.org1.ns1". +In the other hand, results of these examples are "org1-ray" and "org1-ns1" when adopting Kueue. + +## 4 Metrics & Dashboards + +1. Yunikorn scheduler add applications to a specific queue based on thier user info, queue name for any application type. +2. Yunikorn and Kueue provides gang scheduling based annotations For Ray and spark. +3. Preemption behavior meets user-defined configuration in yunikorn. + +## 5 Drawbacks + +This appoarch doens't provide a way to keep consistent between the accuate resource quato of each each group and configuration in scheduler. + +## 6 Alternatives + +## 7 Potential Impact and Dependencies + +## 8 Unresolved questions + +## 9 Conclusion + +Yunikorn and Kueue support gang scheduling to run all neccesary pods at same time when required resource are available. +Yunikorn provide preemption which calculate the priority of application based on its priority class an priority score of the queue where it submitted in order to trigger high-prioirty or emergency application immediately. +Yunikorn hierachy queue includes grarateed resources setting and acls. + +## 10 RFC Process Guide, remove this section when done + +*By writing an RFC, you're giving insight to your team on the direction you're taking. There may not be a right or better decision in many cases, but we will likely learn from it. By authoring, you're making a decision on where you want us to go and are looking for feedback on this direction from your team members, but ultimately the decision is yours.* + +This document is a: + +- thinking exercise, prototype with words. +- historical record, its value may decrease over time. +- way to broadcast information. +- mechanism to build trust. +- tool to empower. +- communication channel. + +This document is not: + +- a request for permission. +- the most up to date representation of any process or system + +**Checklist:** + +- [ ] Copy template +- [ ] Draft RFC (think of it as a wireframe) +- [ ] Share as WIP with folks you trust to gut-check +- [ ] Send pull request when comfortable +- [ ] Label accordingly +- [ ] Assign reviewers +- [ ] Merge PR + +**Recommendations** + +- Tag RFC title with [WIP] if you're still ironing out details. +- Tag RFC title with [Newbie] if you're trying out something experimental or you're not entirely convinced of what you're proposing. +- Tag RFC title with [RR] if you'd like to schedule a review request to discuss the RFC. +- If there are areas that you're not convinced on, tag people who you consider may know about this and ask for their input. +- If you have doubts, ask on [#feature-discussions](https://slack.com/app_redirect?channel=CPQ3ZFQ84&team=TN89P6GGK) for help moving something forward. From 7641f324cdcd811fb356706b694ddcdabd738a08 Mon Sep 17 00:00:00 2001 From: yuteng Date: Mon, 28 Oct 2024 07:25:29 +0800 Subject: [PATCH 02/20] Mapping between kueue crd and heirachy queue Signed-off-by: yuteng --- rfc/system/5575-supporting-yunikorn-and-kueue.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/rfc/system/5575-supporting-yunikorn-and-kueue.md b/rfc/system/5575-supporting-yunikorn-and-kueue.md index dcde6b3942..d48bc83fed 100644 --- a/rfc/system/5575-supporting-yunikorn-and-kueue.md +++ b/rfc/system/5575-supporting-yunikorn-and-kueue.md @@ -47,7 +47,13 @@ queueconfig: ``` // TODO how this yaml maps yunikorn configuration -// TODO how this yaml maps kueue CRD +ResourceFlavor allocate resource based on labels which indicates thant category resource allocatiom by org label is available. +So a clusterQueue including multiple resources is a total acessaible resource for a org. +| clusterQueue | localQueue | +| --- | --- | +| Org | ray、spark、default | +A tenant can submit org task to queue such org.ray, org.spark and org.default to trace what job types are submitable. + A SchedulerConfigManager maintains config from mentioned yaml. It patch labels or annotations of k8s resources after they pass rules in config. From 02e35df54add12c756d4a3bad0cf0fc555b25275 Mon Sep 17 00:00:00 2001 From: yuteng Date: Tue, 29 Oct 2024 07:07:57 +0800 Subject: [PATCH 03/20] update potential impact Signed-off-by: yuteng --- rfc/system/5575-supporting-yunikorn-and-kueue.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/rfc/system/5575-supporting-yunikorn-and-kueue.md b/rfc/system/5575-supporting-yunikorn-and-kueue.md index d48bc83fed..2a2d0db066 100644 --- a/rfc/system/5575-supporting-yunikorn-and-kueue.md +++ b/rfc/system/5575-supporting-yunikorn-and-kueue.md @@ -88,6 +88,16 @@ This appoarch doens't provide a way to keep consistent between the accuate resou ## 7 Potential Impact and Dependencies +Flyte support spark, ray and kubeflow CRD including pytorch and tfjobs. +Spark and Ray operator have been support Yunikorn gang scheduling after taskgroup calculation is implemented in these operators. +Taskgroup caclucation implementation in pods aspect in flyte or kubeflow is required for supporting kubeflow CRDs. +In the other hand, Kueue currently doesn't support Spark CRD. +| Operator | Yunikorn | Kueue | +| --- | --- | --- | +| Spark | v | x | +| Ray | v | v | +| Kubeflow | x | v | + ## 8 Unresolved questions ## 9 Conclusion From 590d64a36fa9c3cfdf841bfa4de0efa59aff7acf Mon Sep 17 00:00:00 2001 From: yuteng Date: Tue, 29 Oct 2024 07:20:43 +0800 Subject: [PATCH 04/20] yunikorn Signed-off-by: yuteng --- rfc/system/5575-supporting-yunikorn-and-kueue.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/rfc/system/5575-supporting-yunikorn-and-kueue.md b/rfc/system/5575-supporting-yunikorn-and-kueue.md index 2a2d0db066..125fe191c3 100644 --- a/rfc/system/5575-supporting-yunikorn-and-kueue.md +++ b/rfc/system/5575-supporting-yunikorn-and-kueue.md @@ -46,7 +46,11 @@ queueconfig: namespaces: "*" ``` -// TODO how this yaml maps yunikorn configuration +Mentioned configuration indicates what queues exist for an org. +Hierachy queues will be like following. +root.org1.ray、root.org1.spark and root.org1.default allowing summission from ns1 and ns1 namespace. +root.org2."CRDs" and root.org2.default allowing summission from any namespaces. + ResourceFlavor allocate resource based on labels which indicates thant category resource allocatiom by org label is available. So a clusterQueue including multiple resources is a total acessaible resource for a org. | clusterQueue | localQueue | @@ -82,7 +86,7 @@ In the other hand, results of these examples are "org1-ray" and "org1-ns1" when ## 5 Drawbacks -This appoarch doens't provide a way to keep consistent between the accuate resource quato of each each group and configuration in scheduler. +This appoarch doens't provide a way to keep consistent between the accuate resource quato of groups and configuration in scheduler. ## 6 Alternatives @@ -100,6 +104,7 @@ In the other hand, Kueue currently doesn't support Spark CRD. ## 8 Unresolved questions + ## 9 Conclusion Yunikorn and Kueue support gang scheduling to run all neccesary pods at same time when required resource are available. From 20d5672546d375b700b66da45da160714580f2ab Mon Sep 17 00:00:00 2001 From: yuteng Date: Tue, 29 Oct 2024 23:02:30 +0800 Subject: [PATCH 05/20] update Signed-off-by: yuteng --- .../5575-supporting-yunikorn-and-kueue.md | 47 ++++++++++--------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/rfc/system/5575-supporting-yunikorn-and-kueue.md b/rfc/system/5575-supporting-yunikorn-and-kueue.md index 125fe191c3..00f410d43e 100644 --- a/rfc/system/5575-supporting-yunikorn-and-kueue.md +++ b/rfc/system/5575-supporting-yunikorn-and-kueue.md @@ -6,21 +6,22 @@ ## 1 Executive Summary -Providing k8s resource management, gang scheduling and preemption for flyte apllications by 3rd software including Apache Yunikorn and Kueue. +Providing kubernetes (k8s) resource management, gang scheduling and preemption for flyte applications by third-party software, including Apache Yunikorn and Kueue. ## 2 Motivation -Flyte support multi-tenant and many k8s plugins. +Flyte support multi-tenancy and various k8s plugins. Kueue and Yunikorn support gang scheduling and preemption. -Gang scheduling gurantees some avialable k8s crd services including spark, ray service with sufficient resource and preemption make sure high priority task execute immediately. +Gang scheduling guarantees the availability of certain K8s crd services, such as Spark, Ray, with sufficient resource and preemption make sure high priority task execute immediately. -Flyte doesn't maintain a resource management for multi-tenant which Yunikorn can solve it with hierarchy resource queues. +Flyte doesn't provide resource management for multi-tenancy, which hierarchical resource queues of Yunikorn can solve. ## 3 Proposed Implementation ```yaml queueconfig: + scheduler: yunikorn general: - org: org1 users: "*" @@ -47,20 +48,20 @@ queueconfig: ``` Mentioned configuration indicates what queues exist for an org. -Hierachy queues will be like following. -root.org1.ray、root.org1.spark and root.org1.default allowing summission from ns1 and ns1 namespace. -root.org2."CRDs" and root.org2.default allowing summission from any namespaces. +Hierarchucak queues will be structured as follows. +root.org1.ray、root.org1.spark and root.org1.default allowing submissions from ns1 and ns2 namespace. +root.org2."CRDs" and root.org2.default allowing submissions from any namespaces. -ResourceFlavor allocate resource based on labels which indicates thant category resource allocatiom by org label is available. -So a clusterQueue including multiple resources is a total acessaible resource for a org. +ResourceFlavor allocates resource based on labels which indicates that category-based resource allocation by organization label is available. +Thus, a clusterQueue including multiple resources represents the total acessaible resource for an organization. | clusterQueue | localQueue | | --- | --- | | Org | ray、spark、default | -A tenant can submit org task to queue such org.ray, org.spark and org.default to trace what job types are submitable. +A tenant can submit organization-specific tasks to queues such as org.ray, org.spark and org.default to track which job types are submittable. A SchedulerConfigManager maintains config from mentioned yaml. -It patch labels or annotations of k8s resources after they pass rules in config. +It patches labels or annotations on k8s resources after they pass rules specified in the configuration. ```go func (e *PluginManager) launchResource(ctx context.Context, tCtx pluginsCore.TaskExecutionContext) (pluginsCore.Transition, error) { @@ -68,33 +69,33 @@ func (e *PluginManager) launchResource(ctx context.Context, tCtx pluginsCore.Tas if err != nil { return pluginsCore.UnknownTransition, err } - if err := e.SchedulerConfigManager.Label(0); err != nil { + if err := e.SchedulerConfigManager.Patch(o); err != nil { return pluginsCore.UnknownTransition, err } } ``` When batchscheduler in flyte is yunikorn, some examples are like following. -For exmaple, this appoarch submit a Rayjob owned by user1 in org1 to "root.org1.ray". +For example, this appoarch submits a Ray job owned by user1 in org1 to "root.org1.ray". A spark application in ns1 submitted by user4 in org1 is in "root.org1.ns1". In the other hand, results of these examples are "org1-ray" and "org1-ns1" when adopting Kueue. ## 4 Metrics & Dashboards -1. Yunikorn scheduler add applications to a specific queue based on thier user info, queue name for any application type. -2. Yunikorn and Kueue provides gang scheduling based annotations For Ray and spark. -3. Preemption behavior meets user-defined configuration in yunikorn. +1. The Yunikorn scheduler add applications to a specific queue based on their user info, queue name for any application type. +2. Yunikorn and Kueue provide gang scheduling through annotations For Ray and spark. +3. Preemption behavior aligns with user-defined configuration in yunikorn. ## 5 Drawbacks -This appoarch doens't provide a way to keep consistent between the accuate resource quato of groups and configuration in scheduler. +This appoarch doesn't offer a way to maintain consistency between the accuate resource quotas of groups and the configuration in scheduler. ## 6 Alternatives ## 7 Potential Impact and Dependencies -Flyte support spark, ray and kubeflow CRD including pytorch and tfjobs. -Spark and Ray operator have been support Yunikorn gang scheduling after taskgroup calculation is implemented in these operators. -Taskgroup caclucation implementation in pods aspect in flyte or kubeflow is required for supporting kubeflow CRDs. +Flyte support Spark, Ray and Kubeflow CRDs including Pytorch and TFjobs. +The Spark and Ray operators have supported Yunikorn gang scheduling since task group calculation were implemented in these operators. +Taskgroup calculation implementation in pods aspect in flyte or kubeflow is required for supporting kubeflow CRDs. In the other hand, Kueue currently doesn't support Spark CRD. | Operator | Yunikorn | Kueue | | --- | --- | --- | @@ -107,9 +108,9 @@ In the other hand, Kueue currently doesn't support Spark CRD. ## 9 Conclusion -Yunikorn and Kueue support gang scheduling to run all neccesary pods at same time when required resource are available. -Yunikorn provide preemption which calculate the priority of application based on its priority class an priority score of the queue where it submitted in order to trigger high-prioirty or emergency application immediately. -Yunikorn hierachy queue includes grarateed resources setting and acls. +Yunikorn and Kueue support gang scheduling allowing all necassary pods to run sumultaneously when required resource are available. +Yunikorn provides preemption calculating the priority of applications based on thier priority class and priority score of the queue where they are submitted, in order to trigger high-prioirty or emergency application immediately. +Yunikorn's hierarchical queue includes grarateed resources settings and ACLs. ## 10 RFC Process Guide, remove this section when done From bb8a0ec63cbb857d26975bc3d8c5522921d139c9 Mon Sep 17 00:00:00 2001 From: yuteng Date: Wed, 30 Oct 2024 07:13:30 +0800 Subject: [PATCH 06/20] update logic flow Signed-off-by: yuteng --- .../5575-supporting-yunikorn-and-kueue.md | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/rfc/system/5575-supporting-yunikorn-and-kueue.md b/rfc/system/5575-supporting-yunikorn-and-kueue.md index 00f410d43e..b2a882c026 100644 --- a/rfc/system/5575-supporting-yunikorn-and-kueue.md +++ b/rfc/system/5575-supporting-yunikorn-and-kueue.md @@ -63,14 +63,36 @@ A tenant can submit organization-specific tasks to queues such as org.ray, org.s A SchedulerConfigManager maintains config from mentioned yaml. It patches labels or annotations on k8s resources after they pass rules specified in the configuration. +```go +type SchedulerHelper interface { + Patch(obj client.object) (error, string) + GangScheduling(string) error +} + +type PodLabels struct { + PodName string + labels map[string]string + annotations map[string]string +} +``` +Creat a scheduler helper according to the queueconfig.scheduler. +Its basic responsibility validate whether submitted application is accepted. +When a Yunikorn scheduler helper created, it will create applicationID、queue name and preemption labels . +in the other hand, a Kueue scheduler helper constructs labels including localQueueName, preemption. + ```go func (e *PluginManager) launchResource(ctx context.Context, tCtx pluginsCore.TaskExecutionContext) (pluginsCore.Transition, error) { o, err := e.plugin.BuildResource(ctx, k8sTaskCtx) if err != nil { return pluginsCore.UnknownTransition, err } - if err := e.SchedulerConfigManager.Patch(o); err != nil { - return pluginsCore.UnknownTransition, err + if + if err, jobtype := e.SchedulerHelper.Patch(o); err == nil { + if e.SchedulerHelper.GangScheduling(jobtype, o); err != nil { + return pluginsCore.UnknownTransition, err + } + } else { + return pluginsCore.UnknownTransition, err } } ``` From 2c1235789e53e64dd7ed645a3e8cfdfca09bf1b0 Mon Sep 17 00:00:00 2001 From: yuteng Date: Fri, 1 Nov 2024 23:33:01 +0800 Subject: [PATCH 07/20] update Signed-off-by: yuteng --- .../5575-supporting-yunikorn-and-kueue.md | 50 ++++++++++++------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/rfc/system/5575-supporting-yunikorn-and-kueue.md b/rfc/system/5575-supporting-yunikorn-and-kueue.md index b2a882c026..debd6aa8d1 100644 --- a/rfc/system/5575-supporting-yunikorn-and-kueue.md +++ b/rfc/system/5575-supporting-yunikorn-and-kueue.md @@ -25,19 +25,18 @@ queueconfig: general: - org: org1 users: "*" - priorityclass: priority-default acl: - jobs: "ray,dask" users: "user1,user2" priorityclass: priority-ray - gangscheduling: true + gangscheduling: "placeholderTimeoutInSeconds=60 gangSchedulingStyle=hard" - jobs: "spark" users: "user1,user3" priorityclass: priority-spark - gangscheduling: true + gangscheduling: "placeholderTimeoutInSeconds=30 gangSchedulingStyle=hard" namespace: - default: "ns1" namespaces: "ns1|ns2" + priorityclass: priority-default - org: org2 users: "user4, user5" acl: @@ -64,21 +63,38 @@ A SchedulerConfigManager maintains config from mentioned yaml. It patches labels or annotations on k8s resources after they pass rules specified in the configuration. ```go -type SchedulerHelper interface { - Patch(obj client.object) (error, string) - GangScheduling(string) error +type YunikornScheduablePlugin interface { + MutateResourceForYunikorn(ctx context.Context, object client.Object, taskTmpl *core.TaskTemplate) (client.Object, error) + GetLabels(id core.Identifier) map[string]string } -type PodLabels struct { - PodName string - labels map[string]string - annotations map[string]string +type KueueScheduablePlugin interface { + MutateResourceForKueue(ctx context.Context, object client.Object, taskTmpl *core.TaskTemplate) (client.Object, error) + GetLabels(id core.Identifier) map[string]string +} + +func (h *YunikornScheduablePlugin) MutateResourceForYunikorn(ctx context.Context, object client.Object, taskTmpl *core.TaskTemplate) (client.Object, error) error { + rayJob := object.(*rayv1.RayJob) + // TODO +} + +func (h *YunikornScheduablePlugin) GetLabels(id core.Identifier) map[string]string { + // 1.UserInfo + // 2.QueueName + // 3.ApplicationID +} + +func PatchPodSpec(target *v1.PodSpec, labels map[string]string) error { + // Get Metaobject from target + // Add label is the specific label doesn't exist } ``` -Creat a scheduler helper according to the queueconfig.scheduler. + + +Creat a scheduler plugin according to the queueconfig.scheduler. Its basic responsibility validate whether submitted application is accepted. -When a Yunikorn scheduler helper created, it will create applicationID、queue name and preemption labels . -in the other hand, a Kueue scheduler helper constructs labels including localQueueName, preemption. +When a Yunikorn scheduler plugin created, it will create applicationID、queue name and preemption labels . +in the other hand, a Kueue scheduler plugin constructs labels including localQueueName, preemption. ```go func (e *PluginManager) launchResource(ctx context.Context, tCtx pluginsCore.TaskExecutionContext) (pluginsCore.Transition, error) { @@ -86,10 +102,8 @@ func (e *PluginManager) launchResource(ctx context.Context, tCtx pluginsCore.Tas if err != nil { return pluginsCore.UnknownTransition, err } - if - if err, jobtype := e.SchedulerHelper.Patch(o); err == nil { - if e.SchedulerHelper.GangScheduling(jobtype, o); err != nil { - return pluginsCore.UnknownTransition, err + if o, err = e.SchedulerPlugin.MutateResourceForKueue(o); err == nil { + return pluginsCore.UnknownTransition, err } } else { return pluginsCore.UnknownTransition, err From 9a732c7c011cdabd0c0548ab2aed6e08a177ad67 Mon Sep 17 00:00:00 2001 From: yuteng Date: Fri, 1 Nov 2024 23:34:04 +0800 Subject: [PATCH 08/20] fin Signed-off-by: yuteng --- .../5575-supporting-yunikorn-and-kueue.md | 36 ------------------- 1 file changed, 36 deletions(-) diff --git a/rfc/system/5575-supporting-yunikorn-and-kueue.md b/rfc/system/5575-supporting-yunikorn-and-kueue.md index debd6aa8d1..e5a3376396 100644 --- a/rfc/system/5575-supporting-yunikorn-and-kueue.md +++ b/rfc/system/5575-supporting-yunikorn-and-kueue.md @@ -147,39 +147,3 @@ In the other hand, Kueue currently doesn't support Spark CRD. Yunikorn and Kueue support gang scheduling allowing all necassary pods to run sumultaneously when required resource are available. Yunikorn provides preemption calculating the priority of applications based on thier priority class and priority score of the queue where they are submitted, in order to trigger high-prioirty or emergency application immediately. Yunikorn's hierarchical queue includes grarateed resources settings and ACLs. - -## 10 RFC Process Guide, remove this section when done - -*By writing an RFC, you're giving insight to your team on the direction you're taking. There may not be a right or better decision in many cases, but we will likely learn from it. By authoring, you're making a decision on where you want us to go and are looking for feedback on this direction from your team members, but ultimately the decision is yours.* - -This document is a: - -- thinking exercise, prototype with words. -- historical record, its value may decrease over time. -- way to broadcast information. -- mechanism to build trust. -- tool to empower. -- communication channel. - -This document is not: - -- a request for permission. -- the most up to date representation of any process or system - -**Checklist:** - -- [ ] Copy template -- [ ] Draft RFC (think of it as a wireframe) -- [ ] Share as WIP with folks you trust to gut-check -- [ ] Send pull request when comfortable -- [ ] Label accordingly -- [ ] Assign reviewers -- [ ] Merge PR - -**Recommendations** - -- Tag RFC title with [WIP] if you're still ironing out details. -- Tag RFC title with [Newbie] if you're trying out something experimental or you're not entirely convinced of what you're proposing. -- Tag RFC title with [RR] if you'd like to schedule a review request to discuss the RFC. -- If there are areas that you're not convinced on, tag people who you consider may know about this and ask for their input. -- If you have doubts, ask on [#feature-discussions](https://slack.com/app_redirect?channel=CPQ3ZFQ84&team=TN89P6GGK) for help moving something forward. From ae9f6b9facf7832e30bea2d4e0e5f0dcbd2cd9ae Mon Sep 17 00:00:00 2001 From: yuteng Date: Fri, 1 Nov 2024 23:45:09 +0800 Subject: [PATCH 09/20] update Signed-off-by: yuteng --- rfc/system/5575-supporting-yunikorn-and-kueue.md | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/rfc/system/5575-supporting-yunikorn-and-kueue.md b/rfc/system/5575-supporting-yunikorn-and-kueue.md index e5a3376396..f884a0ce0f 100644 --- a/rfc/system/5575-supporting-yunikorn-and-kueue.md +++ b/rfc/system/5575-supporting-yunikorn-and-kueue.md @@ -27,29 +27,24 @@ queueconfig: users: "*" acl: - jobs: "ray,dask" - users: "user1,user2" priorityclass: priority-ray gangscheduling: "placeholderTimeoutInSeconds=60 gangSchedulingStyle=hard" - jobs: "spark" - users: "user1,user3" priorityclass: priority-spark gangscheduling: "placeholderTimeoutInSeconds=30 gangSchedulingStyle=hard" namespace: - namespaces: "ns1|ns2" priorityclass: priority-default - org: org2 users: "user4, user5" acl: - jobs: "*" - users: "*" - namespace: - namespaces: "*" ``` Mentioned configuration indicates what queues exist for an org. -Hierarchucak queues will be structured as follows. -root.org1.ray、root.org1.spark and root.org1.default allowing submissions from ns1 and ns2 namespace. -root.org2."CRDs" and root.org2.default allowing submissions from any namespaces. +Hierarchical queues will be structured as follows. +root.org1.ray、root.org1.spark, root.org1.default and root.org2."CRDs". + +ray and spark linked to priority class setting ` yunikorn.apache.org/allow-preemption` with `false` recommand Yunikorn do its best to prevent applications from preemption. ResourceFlavor allocates resource based on labels which indicates that category-based resource allocation by organization label is available. Thus, a clusterQueue including multiple resources represents the total acessaible resource for an organization. From 9ed6b055b71ce054dffd21970c22a51b48c28589 Mon Sep 17 00:00:00 2001 From: yuteng Date: Sat, 2 Nov 2024 15:10:10 +0800 Subject: [PATCH 10/20] update Signed-off-by: yuteng --- rfc/system/5575-supporting-yunikorn-and-kueue.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/rfc/system/5575-supporting-yunikorn-and-kueue.md b/rfc/system/5575-supporting-yunikorn-and-kueue.md index f884a0ce0f..55fae67b26 100644 --- a/rfc/system/5575-supporting-yunikorn-and-kueue.md +++ b/rfc/system/5575-supporting-yunikorn-and-kueue.md @@ -25,24 +25,24 @@ queueconfig: general: - org: org1 users: "*" - acl: - - jobs: "ray,dask" + jobs: + - type: "ray" priorityclass: priority-ray gangscheduling: "placeholderTimeoutInSeconds=60 gangSchedulingStyle=hard" - - jobs: "spark" + - type: "spark" priorityclass: priority-spark gangscheduling: "placeholderTimeoutInSeconds=30 gangSchedulingStyle=hard" - namespace: + default: priorityclass: priority-default - org: org2 users: "user4, user5" - acl: + jobs: - jobs: "*" ``` Mentioned configuration indicates what queues exist for an org. Hierarchical queues will be structured as follows. -root.org1.ray、root.org1.spark, root.org1.default and root.org2."CRDs". +root.org1.ray、root.org1.spark and root.org1.default". ray and spark linked to priority class setting ` yunikorn.apache.org/allow-preemption` with `false` recommand Yunikorn do its best to prevent applications from preemption. From 552a80040fe9ab118be0a05d1cf3b49471cbae52 Mon Sep 17 00:00:00 2001 From: yuteng Date: Mon, 4 Nov 2024 22:50:55 +0800 Subject: [PATCH 11/20] Discard priority class Signed-off-by: yuteng --- .../5575-supporting-yunikorn-and-kueue.md | 25 +++++-------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/rfc/system/5575-supporting-yunikorn-and-kueue.md b/rfc/system/5575-supporting-yunikorn-and-kueue.md index 55fae67b26..1ba3c21e8b 100644 --- a/rfc/system/5575-supporting-yunikorn-and-kueue.md +++ b/rfc/system/5575-supporting-yunikorn-and-kueue.md @@ -22,30 +22,17 @@ Flyte doesn't provide resource management for multi-tenancy, which hierarchical ```yaml queueconfig: scheduler: yunikorn - general: - - org: org1 - users: "*" - jobs: - - type: "ray" - priorityclass: priority-ray - gangscheduling: "placeholderTimeoutInSeconds=60 gangSchedulingStyle=hard" - - type: "spark" - priorityclass: priority-spark - gangscheduling: "placeholderTimeoutInSeconds=30 gangSchedulingStyle=hard" - default: - priorityclass: priority-default - - org: org2 - users: "user4, user5" - jobs: - - jobs: "*" + jobs: + - type: "ray" + gangscheduling: "placeholderTimeoutInSeconds=60 gangSchedulingStyle=hard" + - type: "spark" + gangscheduling: "placeholderTimeoutInSeconds=30 gangSchedulingStyle=hard" ``` Mentioned configuration indicates what queues exist for an org. Hierarchical queues will be structured as follows. root.org1.ray、root.org1.spark and root.org1.default". -ray and spark linked to priority class setting ` yunikorn.apache.org/allow-preemption` with `false` recommand Yunikorn do its best to prevent applications from preemption. - ResourceFlavor allocates resource based on labels which indicates that category-based resource allocation by organization label is available. Thus, a clusterQueue including multiple resources represents the total acessaible resource for an organization. | clusterQueue | localQueue | @@ -88,7 +75,7 @@ func PatchPodSpec(target *v1.PodSpec, labels map[string]string) error { Creat a scheduler plugin according to the queueconfig.scheduler. Its basic responsibility validate whether submitted application is accepted. -When a Yunikorn scheduler plugin created, it will create applicationID、queue name and preemption labels . +When a Yunikorn scheduler plugin created, it will create applicationID and queue name. in the other hand, a Kueue scheduler plugin constructs labels including localQueueName, preemption. ```go From b71b7490161226697c1d74fcfcb081d1271371a3 Mon Sep 17 00:00:00 2001 From: yuteng Date: Wed, 6 Nov 2024 07:03:22 +0800 Subject: [PATCH 12/20] org -> organization Signed-off-by: yuteng --- rfc/system/5575-supporting-yunikorn-and-kueue.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/rfc/system/5575-supporting-yunikorn-and-kueue.md b/rfc/system/5575-supporting-yunikorn-and-kueue.md index 1ba3c21e8b..0c3dd49496 100644 --- a/rfc/system/5575-supporting-yunikorn-and-kueue.md +++ b/rfc/system/5575-supporting-yunikorn-and-kueue.md @@ -29,16 +29,16 @@ queueconfig: gangscheduling: "placeholderTimeoutInSeconds=30 gangSchedulingStyle=hard" ``` -Mentioned configuration indicates what queues exist for an org. +Mentioned configuration indicates what queues exist for an organization. Hierarchical queues will be structured as follows. -root.org1.ray、root.org1.spark and root.org1.default". +root.organization1.ray、root.organization1.spark and root.organization1.default". ResourceFlavor allocates resource based on labels which indicates that category-based resource allocation by organization label is available. Thus, a clusterQueue including multiple resources represents the total acessaible resource for an organization. | clusterQueue | localQueue | | --- | --- | -| Org | ray、spark、default | -A tenant can submit organization-specific tasks to queues such as org.ray, org.spark and org.default to track which job types are submittable. +| | ray、spark、default | +A tenant can submit organization-specific tasks to queues such as organization.ray, organization.spark and organization.default to track which job types are submittable. A SchedulerConfigManager maintains config from mentioned yaml. @@ -93,9 +93,9 @@ func (e *PluginManager) launchResource(ctx context.Context, tCtx pluginsCore.Tas } ``` When batchscheduler in flyte is yunikorn, some examples are like following. -For example, this appoarch submits a Ray job owned by user1 in org1 to "root.org1.ray". -A spark application in ns1 submitted by user4 in org1 is in "root.org1.ns1". -In the other hand, results of these examples are "org1-ray" and "org1-ns1" when adopting Kueue. +For example, this appoarch submits a Ray job owned by user1 in organization1 to "root.organization1.ray". +A spark application in ns1 submitted by user4 in organization1 is in "root.organization1.ns1". +In the other hand, results of these examples are "organization1-ray" and "organization1-ns1" when adopting Kueue. ## 4 Metrics & Dashboards @@ -127,5 +127,5 @@ In the other hand, Kueue currently doesn't support Spark CRD. ## 9 Conclusion Yunikorn and Kueue support gang scheduling allowing all necassary pods to run sumultaneously when required resource are available. -Yunikorn provides preemption calculating the priority of applications based on thier priority class and priority score of the queue where they are submitted, in order to trigger high-prioirty or emergency application immediately. +Yunikorn provides preemption calculating the priority of applications based on their priority class and priority score of the queue where they are submitted, in order to trigger high-prioirty or emergency application immediately. Yunikorn's hierarchical queue includes grarateed resources settings and ACLs. From 3fc572fe4cabd926562bd7a3955e47ac10bea0e3 Mon Sep 17 00:00:00 2001 From: yuteng Date: Wed, 6 Nov 2024 23:49:42 +0800 Subject: [PATCH 13/20] Use same interface and move to the AddObjectMeta Signed-off-by: yuteng --- .../5575-supporting-yunikorn-and-kueue.md | 85 ++++++++++++------- 1 file changed, 56 insertions(+), 29 deletions(-) diff --git a/rfc/system/5575-supporting-yunikorn-and-kueue.md b/rfc/system/5575-supporting-yunikorn-and-kueue.md index 0c3dd49496..dba6442ea3 100644 --- a/rfc/system/5575-supporting-yunikorn-and-kueue.md +++ b/rfc/system/5575-supporting-yunikorn-and-kueue.md @@ -19,14 +19,18 @@ Flyte doesn't provide resource management for multi-tenancy, which hierarchical ## 3 Proposed Implementation +Kueue + ```yaml queueconfig: scheduler: yunikorn jobs: - type: "ray" gangscheduling: "placeholderTimeoutInSeconds=60 gangSchedulingStyle=hard" + allow-preemption: false - type: "spark" gangscheduling: "placeholderTimeoutInSeconds=30 gangSchedulingStyle=hard" + allow-preemption: true ``` Mentioned configuration indicates what queues exist for an organization. @@ -41,37 +45,63 @@ Thus, a clusterQueue including multiple resources represents the total acessaibl A tenant can submit organization-specific tasks to queues such as organization.ray, organization.spark and organization.default to track which job types are submittable. -A SchedulerConfigManager maintains config from mentioned yaml. It patches labels or annotations on k8s resources after they pass rules specified in the configuration. ```go -type YunikornScheduablePlugin interface { - MutateResourceForYunikorn(ctx context.Context, object client.Object, taskTmpl *core.TaskTemplate) (client.Object, error) - GetLabels(id core.Identifier) map[string]string +type SchedulePlugin interface { + CreateLabels(taskCtx pluginsCore.TaskExecutionMetadata, o client.Object, cfg *config.K8sPluginConfig) + CreateGroupLabels(ctx context.Context, object client.Object, taskTmpl *core.TaskTemplate) + GetGroupLabels() (labels, annotations map[string]string) } -type KueueScheduablePlugin interface { - MutateResourceForKueue(ctx context.Context, object client.Object, taskTmpl *core.TaskTemplate) (client.Object, error) - GetLabels(id core.Identifier) map[string]string +type YunikornScheduablePlugin struct { + jobs map[string]string + Labels map[string]string + Annotations map[string]string } -func (h *YunikornScheduablePlugin) MutateResourceForYunikorn(ctx context.Context, object client.Object, taskTmpl *core.TaskTemplate) (client.Object, error) error { - rayJob := object.(*rayv1.RayJob) - // TODO +func (yk *YunikornSchedulPlugin) GetGroupLabels() (labels, annotations map[string]string) { + return yk.Labels, yk.Annotations } -func (h *YunikornScheduablePlugin) GetLabels(id core.Identifier) map[string]string { - // 1.UserInfo - // 2.QueueName - // 3.ApplicationID +func (yk *YunikornSchedulePlugin) CreateLabels(taskCtx pluginsCore.TaskExecutionMetadata, o client.Object, cfg *config.K8sPluginConfig) (labels, annotations map[string]string) { + // Set queue name based on the job type and flyteidl.Identifier fields including "ResourceType", "Org" and "Name". + // 1.Clean yk.Labels and yk.Annotations + // 2.Add yunikorn.apache.org/user.info = . + // 3.Add yunikorn.apache.org/app-id = - + // 4.Add yunikorn.apache.org/queue = . } -func PatchPodSpec(target *v1.PodSpec, labels map[string]string) error { - // Get Metaobject from target - // Add label is the specific label doesn't exist +func (yk *YunikornSchedulePlugin) CreateGroupLabels(ctx context.Context, object client.Object, taskTmpl *core.TaskTemplate) { + // 1.Add yunikorn.apache.org/task-group-name = yk.CreateTaskgroupName(ResourceType) + // 2.Add yunikorn.apache.org/task-groups = yk.CreateTaskgroup(object) + // 3.Add yunikorn.apache.org/schedulingPolicyParameters = yk.jobs[ResourceType] + // 4.Add yunikorn.apache.org/allow-preemption = true/false +} + +type KueueScheduablePlugin struct { + jobs map[string]string + Labels map[string]string + Annotations map[string]string +} + +func (k *KueueScheduablePlugin) GetGroupLabels() (labels, annotations map[string]string) { + return k.Labels, k.Annotations +} + +func (k *KueueScheduablePlugin) CreateLabels(taskCtx pluginsCore.TaskExecutionMetadata, o client.Object, cfg *config.K8sPluginConfig) (labels, annotations map[string]string) { + // Set queue name based on the job type and flyteidl.Identifier field "Org". + // Clean k.Labels and k.Annotations + // 1.Add kueue.x-k8s.io/queue-name = . + // Update k.Labels and k.Annotations } -``` +func (k *KueueScheduablePlugin) CreateGroupLabels(ctx context.Context, object client.Object, taskTmpl *core.TaskTemplate) { + // Add Label "kueue.x-k8s.io/pod-group-name" and "kueue.x-k8s.io/pod-group-total-count" for spark、dask. + // If object type is ray CRD and kubeflow CRD which are supported by Kueue then skips. + // Update k.Labels and k.Annotations +} +``` Creat a scheduler plugin according to the queueconfig.scheduler. Its basic responsibility validate whether submitted application is accepted. @@ -79,17 +109,14 @@ When a Yunikorn scheduler plugin created, it will create applicationID and queue in the other hand, a Kueue scheduler plugin constructs labels including localQueueName, preemption. ```go -func (e *PluginManager) launchResource(ctx context.Context, tCtx pluginsCore.TaskExecutionContext) (pluginsCore.Transition, error) { - o, err := e.plugin.BuildResource(ctx, k8sTaskCtx) - if err != nil { - return pluginsCore.UnknownTransition, err - } - if o, err = e.SchedulerPlugin.MutateResourceForKueue(o); err == nil { - return pluginsCore.UnknownTransition, err - } - } else { - return pluginsCore.UnknownTransition, err - } +func (e *PluginManager) addObjectMetadata(taskCtx pluginsCore.TaskExecutionMetadata, o client.Object, cfg *config.K8sPluginConfig) { + e.SchedulerPlugin.CreateLabels(taskCtx, o) + e.SchedulerPlugin.CreateGroupLabels(taskCtx, o) + schedulerLabels, schedulerAnnotations := e.SchedulerPlugin.GetLabels() + o.SetNamespace(taskCtx.GetNamespace()) + o.SetAnnotations(pluginsUtils.UnionMaps(cfg.DefaultAnnotations, o.GetAnnotations(), pluginsUtils.CopyMap(taskCtx.GetAnnotations(), schedulerAnnotations))) + o.SetLabels(pluginsUtils.UnionMaps(cfg.DefaultLabels, o.GetLabels(), pluginsUtils.CopyMap(taskCtx.GetLabels(), schedulerLabels))) + o.SetName(taskCtx.GetTaskExecutionID().GetGeneratedName()) } ``` When batchscheduler in flyte is yunikorn, some examples are like following. From e64beb01f0c86d453665301ae37099a287fa873b Mon Sep 17 00:00:00 2001 From: yuteng Date: Thu, 7 Nov 2024 00:00:51 +0800 Subject: [PATCH 14/20] Set scheduler name when launchResource Signed-off-by: yuteng --- .../5575-supporting-yunikorn-and-kueue.md | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/rfc/system/5575-supporting-yunikorn-and-kueue.md b/rfc/system/5575-supporting-yunikorn-and-kueue.md index dba6442ea3..d0cc0b7819 100644 --- a/rfc/system/5575-supporting-yunikorn-and-kueue.md +++ b/rfc/system/5575-supporting-yunikorn-and-kueue.md @@ -42,8 +42,7 @@ Thus, a clusterQueue including multiple resources represents the total acessaibl | clusterQueue | localQueue | | --- | --- | | | ray、spark、default | -A tenant can submit organization-specific tasks to queues such as organization.ray, organization.spark and organization.default to track which job types are submittable. - +A tenant can submit organization-specific tasks to queues such as organization.ray, organization.spark and organization.default to track which job types are submittable. It patches labels or annotations on k8s resources after they pass rules specified in the configuration. @@ -52,6 +51,7 @@ type SchedulePlugin interface { CreateLabels(taskCtx pluginsCore.TaskExecutionMetadata, o client.Object, cfg *config.K8sPluginConfig) CreateGroupLabels(ctx context.Context, object client.Object, taskTmpl *core.TaskTemplate) GetGroupLabels() (labels, annotations map[string]string) + SetSchedulerName(object client.Objec) } type YunikornScheduablePlugin struct { @@ -104,11 +104,19 @@ func (k *KueueScheduablePlugin) CreateGroupLabels(ctx context.Context, object cl ``` Creat a scheduler plugin according to the queueconfig.scheduler. -Its basic responsibility validate whether submitted application is accepted. +Its basic responsibility validate whether submitted application is accepted. When a Yunikorn scheduler plugin created, it will create applicationID and queue name. in the other hand, a Kueue scheduler plugin constructs labels including localQueueName, preemption. ```go +func (e *PluginManager) launchResource(ctx context.Context, tCtx pluginsCore.TaskExecutionContext) (pluginsCore.Transition, error) { + o, err := e.plugin.BuildResource(ctx, k8sTaskCtx) + if err != nil { + return pluginsCore.UnknownTransition, err + } + e.SchedulerPlugin.SetSchedulerName(o) +} + func (e *PluginManager) addObjectMetadata(taskCtx pluginsCore.TaskExecutionMetadata, o client.Object, cfg *config.K8sPluginConfig) { e.SchedulerPlugin.CreateLabels(taskCtx, o) e.SchedulerPlugin.CreateGroupLabels(taskCtx, o) @@ -119,6 +127,7 @@ func (e *PluginManager) addObjectMetadata(taskCtx pluginsCore.TaskExecutionMetad o.SetName(taskCtx.GetTaskExecutionID().GetGeneratedName()) } ``` + When batchscheduler in flyte is yunikorn, some examples are like following. For example, this appoarch submits a Ray job owned by user1 in organization1 to "root.organization1.ray". A spark application in ns1 submitted by user4 in organization1 is in "root.organization1.ns1". @@ -150,9 +159,8 @@ In the other hand, Kueue currently doesn't support Spark CRD. ## 8 Unresolved questions - ## 9 Conclusion Yunikorn and Kueue support gang scheduling allowing all necassary pods to run sumultaneously when required resource are available. -Yunikorn provides preemption calculating the priority of applications based on their priority class and priority score of the queue where they are submitted, in order to trigger high-prioirty or emergency application immediately. +Yunikorn provides preemption calculating the priority of applications based on their priority class and priority score of the queue where they are submitted, in order to trigger high-prioirty or emergency application immediately. Yunikorn's hierarchical queue includes grarateed resources settings and ACLs. From 539eff60be91c9b1cf054ec7ef8f1b663c18bfb5 Mon Sep 17 00:00:00 2001 From: yuteng Date: Mon, 11 Nov 2024 06:51:53 +0800 Subject: [PATCH 15/20] update and pass spellcheck Signed-off-by: yuteng --- .../5575-supporting-yunikorn-and-kueue.md | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/rfc/system/5575-supporting-yunikorn-and-kueue.md b/rfc/system/5575-supporting-yunikorn-and-kueue.md index d0cc0b7819..75edb3803e 100644 --- a/rfc/system/5575-supporting-yunikorn-and-kueue.md +++ b/rfc/system/5575-supporting-yunikorn-and-kueue.md @@ -10,12 +10,12 @@ Providing kubernetes (k8s) resource management, gang scheduling and preemption f ## 2 Motivation -Flyte support multi-tenancy and various k8s plugins. - -Kueue and Yunikorn support gang scheduling and preemption. -Gang scheduling guarantees the availability of certain K8s crd services, such as Spark, Ray, with sufficient resource and preemption make sure high priority task execute immediately. - -Flyte doesn't provide resource management for multi-tenancy, which hierarchical resource queues of Yunikorn can solve. +Flyte supports multi-tenancy and various Kubernetes plugins. +Some Kubernetes plugins may encounter into resource wastage when jobs partially start without performing any meaningful work. +A solution to this issue is gang scheduling, which guarantees that all worker pods derived from a CRD are scheduled simultaneously. +Kueue or Apache Yunikorn support this mechanism. +Additionally, Yunikorn can map tenants and organizations to hierarchical queues to define resource quotas. +Based on this setting, access control lists can be configured to grant access to users and groups. ## 3 Proposed Implementation @@ -33,9 +33,8 @@ queueconfig: allow-preemption: true ``` -Mentioned configuration indicates what queues exist for an organization. -Hierarchical queues will be structured as follows. -root.organization1.ray、root.organization1.spark and root.organization1.default". +`root.organization1.ray` is the queue of the ray job submitted by user1 belonging organization1. +Other CRDs are not on the list will be submitted to `root..default`. ResourceFlavor allocates resource based on labels which indicates that category-based resource allocation by organization label is available. Thus, a clusterQueue including multiple resources represents the total acessaible resource for an organization. @@ -44,14 +43,17 @@ Thus, a clusterQueue including multiple resources represents the total acessaibl | | ray、spark、default | A tenant can submit organization-specific tasks to queues such as organization.ray, organization.spark and organization.default to track which job types are submittable. -It patches labels or annotations on k8s resources after they pass rules specified in the configuration. +A scheduling plugin implements functions `SetSchedulerName`, `PatchLabels` and `PatchGroupLabels` to update labels and `schedulerName`. +`PatchLabels` patches necassary labels, such as `queuename`, `user-info` and `applcationID`, to jobs. +`PatchGroupLabels` supports creating `group-pod` and `task-group` labels based on incoming CRD if need. +`SetSchedulerName` set `schedulerName` field in `podTemplate`. ```go type SchedulePlugin interface { - CreateLabels(taskCtx pluginsCore.TaskExecutionMetadata, o client.Object, cfg *config.K8sPluginConfig) - CreateGroupLabels(ctx context.Context, object client.Object, taskTmpl *core.TaskTemplate) + PatchLabels(taskCtx pluginsCore.TaskExecutionMetadata, o client.Object, cfg *config.K8sPluginConfig) + PatchGroupLabels(ctx context.Context, object client.Object, taskTmpl *core.TaskTemplate) GetGroupLabels() (labels, annotations map[string]string) - SetSchedulerName(object client.Objec) + SetSchedulerName(object client.Object) } type YunikornScheduablePlugin struct { @@ -64,7 +66,7 @@ func (yk *YunikornSchedulPlugin) GetGroupLabels() (labels, annotations map[strin return yk.Labels, yk.Annotations } -func (yk *YunikornSchedulePlugin) CreateLabels(taskCtx pluginsCore.TaskExecutionMetadata, o client.Object, cfg *config.K8sPluginConfig) (labels, annotations map[string]string) { +func (yk *YunikornSchedulePlugin) PatchLabels(taskCtx pluginsCore.TaskExecutionMetadata, o client.Object, cfg *config.K8sPluginConfig) (labels, annotations map[string]string) { // Set queue name based on the job type and flyteidl.Identifier fields including "ResourceType", "Org" and "Name". // 1.Clean yk.Labels and yk.Annotations // 2.Add yunikorn.apache.org/user.info = . @@ -72,7 +74,7 @@ func (yk *YunikornSchedulePlugin) CreateLabels(taskCtx pluginsCore.TaskExecution // 4.Add yunikorn.apache.org/queue = . } -func (yk *YunikornSchedulePlugin) CreateGroupLabels(ctx context.Context, object client.Object, taskTmpl *core.TaskTemplate) { +func (yk *YunikornSchedulePlugin) PatchGroupLabels(ctx context.Context, object client.Object, taskTmpl *core.TaskTemplate) { // 1.Add yunikorn.apache.org/task-group-name = yk.CreateTaskgroupName(ResourceType) // 2.Add yunikorn.apache.org/task-groups = yk.CreateTaskgroup(object) // 3.Add yunikorn.apache.org/schedulingPolicyParameters = yk.jobs[ResourceType] @@ -89,14 +91,14 @@ func (k *KueueScheduablePlugin) GetGroupLabels() (labels, annotations map[string return k.Labels, k.Annotations } -func (k *KueueScheduablePlugin) CreateLabels(taskCtx pluginsCore.TaskExecutionMetadata, o client.Object, cfg *config.K8sPluginConfig) (labels, annotations map[string]string) { +func (k *KueueScheduablePlugin) PatchLabels(taskCtx pluginsCore.TaskExecutionMetadata, o client.Object, cfg *config.K8sPluginConfig) (labels, annotations map[string]string) { // Set queue name based on the job type and flyteidl.Identifier field "Org". // Clean k.Labels and k.Annotations // 1.Add kueue.x-k8s.io/queue-name = . // Update k.Labels and k.Annotations } -func (k *KueueScheduablePlugin) CreateGroupLabels(ctx context.Context, object client.Object, taskTmpl *core.TaskTemplate) { +func (k *KueueScheduablePlugin) PatchGroupLabels(ctx context.Context, object client.Object, taskTmpl *core.TaskTemplate) { // Add Label "kueue.x-k8s.io/pod-group-name" and "kueue.x-k8s.io/pod-group-total-count" for spark、dask. // If object type is ray CRD and kubeflow CRD which are supported by Kueue then skips. // Update k.Labels and k.Annotations From 9d511e30f15e3d940a4420c2b50d533da2168ceb Mon Sep 17 00:00:00 2001 From: yuteng Date: Mon, 11 Nov 2024 07:02:22 +0800 Subject: [PATCH 16/20] describe the progress when a new job comes Signed-off-by: yuteng --- rfc/system/5575-supporting-yunikorn-and-kueue.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/rfc/system/5575-supporting-yunikorn-and-kueue.md b/rfc/system/5575-supporting-yunikorn-and-kueue.md index 75edb3803e..2185598aa8 100644 --- a/rfc/system/5575-supporting-yunikorn-and-kueue.md +++ b/rfc/system/5575-supporting-yunikorn-and-kueue.md @@ -31,10 +31,11 @@ queueconfig: - type: "spark" gangscheduling: "placeholderTimeoutInSeconds=30 gangSchedulingStyle=hard" allow-preemption: true + - type: ".*" ``` `root.organization1.ray` is the queue of the ray job submitted by user1 belonging organization1. -Other CRDs are not on the list will be submitted to `root..default`. +`.*` means that other jobs excluding ray and spark will be submitted to `root..default`. ResourceFlavor allocates resource based on labels which indicates that category-based resource allocation by organization label is available. Thus, a clusterQueue including multiple resources represents the total acessaible resource for an organization. @@ -105,10 +106,11 @@ func (k *KueueScheduablePlugin) PatchGroupLabels(ctx context.Context, object cli } ``` -Creat a scheduler plugin according to the queueconfig.scheduler. -Its basic responsibility validate whether submitted application is accepted. -When a Yunikorn scheduler plugin created, it will create applicationID and queue name. -in the other hand, a Kueue scheduler plugin constructs labels including localQueueName, preemption. +When a job comes, following things happens. +1. `SetSchedulerName` sets the `schedulerName` with the specific scheduler name +2. `CreateLabels` new basic labels based on the scheduler. +3. `CreateGroupLabels` creates `kueue.x-k8s.io/pod-group-name` or `yunikorn.apache.org/task-groups` according to the calculatied results from CRD. +4. Merging labels and annotations from `CreateLabels` and `CreateGroupLabels` to the CRD. ```go func (e *PluginManager) launchResource(ctx context.Context, tCtx pluginsCore.TaskExecutionContext) (pluginsCore.Transition, error) { From d6a420ad7fd85048ea90de9a1ac837712ee75dea Mon Sep 17 00:00:00 2001 From: yuteng Date: Mon, 11 Nov 2024 07:03:15 +0800 Subject: [PATCH 17/20] update Signed-off-by: yuteng --- rfc/system/5575-supporting-yunikorn-and-kueue.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/rfc/system/5575-supporting-yunikorn-and-kueue.md b/rfc/system/5575-supporting-yunikorn-and-kueue.md index 2185598aa8..02fe48b8b3 100644 --- a/rfc/system/5575-supporting-yunikorn-and-kueue.md +++ b/rfc/system/5575-supporting-yunikorn-and-kueue.md @@ -132,11 +132,6 @@ func (e *PluginManager) addObjectMetadata(taskCtx pluginsCore.TaskExecutionMetad } ``` -When batchscheduler in flyte is yunikorn, some examples are like following. -For example, this appoarch submits a Ray job owned by user1 in organization1 to "root.organization1.ray". -A spark application in ns1 submitted by user4 in organization1 is in "root.organization1.ns1". -In the other hand, results of these examples are "organization1-ray" and "organization1-ns1" when adopting Kueue. - ## 4 Metrics & Dashboards 1. The Yunikorn scheduler add applications to a specific queue based on their user info, queue name for any application type. From e9c401a826f4e131681dece9a449e7a128cb002c Mon Sep 17 00:00:00 2001 From: yuteng Date: Mon, 11 Nov 2024 23:16:01 +0800 Subject: [PATCH 18/20] update Signed-off-by: yuteng --- .../5575-supporting-yunikorn-and-kueue.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/rfc/system/5575-supporting-yunikorn-and-kueue.md b/rfc/system/5575-supporting-yunikorn-and-kueue.md index 02fe48b8b3..a11521cb1e 100644 --- a/rfc/system/5575-supporting-yunikorn-and-kueue.md +++ b/rfc/system/5575-supporting-yunikorn-and-kueue.md @@ -44,15 +44,15 @@ Thus, a clusterQueue including multiple resources represents the total acessaibl | | ray、spark、default | A tenant can submit organization-specific tasks to queues such as organization.ray, organization.spark and organization.default to track which job types are submittable. -A scheduling plugin implements functions `SetSchedulerName`, `PatchLabels` and `PatchGroupLabels` to update labels and `schedulerName`. -`PatchLabels` patches necassary labels, such as `queuename`, `user-info` and `applcationID`, to jobs. -`PatchGroupLabels` supports creating `group-pod` and `task-group` labels based on incoming CRD if need. +A scheduling plugin implements functions `SetSchedulerName`, `CreateLabels` and `CreateGroupLabels` to create labels and `schedulerName`. +`CreateLabels` patches necassary labels, such as `queuename`, `user-info` and `applcationID`, to jobs. +`CreateGroupLabels` supports creating `group-pod` and `task-group` labels based on incoming CRD if need. `SetSchedulerName` set `schedulerName` field in `podTemplate`. ```go type SchedulePlugin interface { - PatchLabels(taskCtx pluginsCore.TaskExecutionMetadata, o client.Object, cfg *config.K8sPluginConfig) - PatchGroupLabels(ctx context.Context, object client.Object, taskTmpl *core.TaskTemplate) + CreateLabels(taskCtx pluginsCore.TaskExecutionMetadata, o client.Object, cfg *config.K8sPluginConfig) + CreateGroupLabels(ctx context.Context, object client.Object, taskTmpl *core.TaskTemplate) GetGroupLabels() (labels, annotations map[string]string) SetSchedulerName(object client.Object) } @@ -67,7 +67,7 @@ func (yk *YunikornSchedulPlugin) GetGroupLabels() (labels, annotations map[strin return yk.Labels, yk.Annotations } -func (yk *YunikornSchedulePlugin) PatchLabels(taskCtx pluginsCore.TaskExecutionMetadata, o client.Object, cfg *config.K8sPluginConfig) (labels, annotations map[string]string) { +func (yk *YunikornSchedulePlugin) CreateLabels(taskCtx pluginsCore.TaskExecutionMetadata, o client.Object, cfg *config.K8sPluginConfig) (labels, annotations map[string]string) { // Set queue name based on the job type and flyteidl.Identifier fields including "ResourceType", "Org" and "Name". // 1.Clean yk.Labels and yk.Annotations // 2.Add yunikorn.apache.org/user.info = . @@ -75,7 +75,7 @@ func (yk *YunikornSchedulePlugin) PatchLabels(taskCtx pluginsCore.TaskExecutionM // 4.Add yunikorn.apache.org/queue = . } -func (yk *YunikornSchedulePlugin) PatchGroupLabels(ctx context.Context, object client.Object, taskTmpl *core.TaskTemplate) { +func (yk *YunikornSchedulePlugin) CreateGroupLabels(ctx context.Context, object client.Object, taskTmpl *core.TaskTemplate) { // 1.Add yunikorn.apache.org/task-group-name = yk.CreateTaskgroupName(ResourceType) // 2.Add yunikorn.apache.org/task-groups = yk.CreateTaskgroup(object) // 3.Add yunikorn.apache.org/schedulingPolicyParameters = yk.jobs[ResourceType] @@ -92,14 +92,14 @@ func (k *KueueScheduablePlugin) GetGroupLabels() (labels, annotations map[string return k.Labels, k.Annotations } -func (k *KueueScheduablePlugin) PatchLabels(taskCtx pluginsCore.TaskExecutionMetadata, o client.Object, cfg *config.K8sPluginConfig) (labels, annotations map[string]string) { +func (k *KueueScheduablePlugin) CreateLabels(taskCtx pluginsCore.TaskExecutionMetadata, o client.Object, cfg *config.K8sPluginConfig) (labels, annotations map[string]string) { // Set queue name based on the job type and flyteidl.Identifier field "Org". // Clean k.Labels and k.Annotations // 1.Add kueue.x-k8s.io/queue-name = . // Update k.Labels and k.Annotations } -func (k *KueueScheduablePlugin) PatchGroupLabels(ctx context.Context, object client.Object, taskTmpl *core.TaskTemplate) { +func (k *KueueScheduablePlugin) CreateGroupLabels(ctx context.Context, object client.Object, taskTmpl *core.TaskTemplate) { // Add Label "kueue.x-k8s.io/pod-group-name" and "kueue.x-k8s.io/pod-group-total-count" for spark、dask. // If object type is ray CRD and kubeflow CRD which are supported by Kueue then skips. // Update k.Labels and k.Annotations From 4387d589263d6aa505acfdc43cf683b060e2581b Mon Sep 17 00:00:00 2001 From: yuteng Date: Tue, 12 Nov 2024 21:58:00 +0800 Subject: [PATCH 19/20] The k8s plugin implements k8s.plugin and schedulePlugins Signed-off-by: yuteng --- .../5575-supporting-yunikorn-and-kueue.md | 32 ++++++++++++++++--- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/rfc/system/5575-supporting-yunikorn-and-kueue.md b/rfc/system/5575-supporting-yunikorn-and-kueue.md index a11521cb1e..50041203f6 100644 --- a/rfc/system/5575-supporting-yunikorn-and-kueue.md +++ b/rfc/system/5575-supporting-yunikorn-and-kueue.md @@ -113,18 +113,42 @@ When a job comes, following things happens. 4. Merging labels and annotations from `CreateLabels` and `CreateGroupLabels` to the CRD. ```go +type PluginManager struct { + id string + plugin k8s.Plugin + resourceToWatch runtime.Object + kubeClient pluginsCore.KubeClient + metrics PluginMetrics + // Per namespace-resource + backOffController *backoff.Controller + resourceLevelMonitor *ResourceLevelMonitor + eventWatcher EventWatcher +} + func (e *PluginManager) launchResource(ctx context.Context, tCtx pluginsCore.TaskExecutionContext) (pluginsCore.Transition, error) { o, err := e.plugin.BuildResource(ctx, k8sTaskCtx) if err != nil { return pluginsCore.UnknownTransition, err } - e.SchedulerPlugin.SetSchedulerName(o) + if p, ok := e.plugin.(k8s.ScheduablePlugin); ok { + o, err = p.SetSchedulerName(o) + if err != nil { + return pluginsCore.UnknownTransition, err + } + } } func (e *PluginManager) addObjectMetadata(taskCtx pluginsCore.TaskExecutionMetadata, o client.Object, cfg *config.K8sPluginConfig) { - e.SchedulerPlugin.CreateLabels(taskCtx, o) - e.SchedulerPlugin.CreateGroupLabels(taskCtx, o) - schedulerLabels, schedulerAnnotations := e.SchedulerPlugin.GetLabels() + var schedulerLabels, schedulerAnnotations map[string]string + if p, ok := e.plugin.(k8s.ScheduablePlugin); ok { + o, err = p.SetSchedulerName(o) + if err != nil { + return pluginsCore.UnknownTransition, err + } + p.CreateLabels(taskCtx, o) + p.CreateGroupLabels(taskCtx, o) + schedulerLabels, schedulerAnnotations = e.plugin.GetLabels() + } o.SetNamespace(taskCtx.GetNamespace()) o.SetAnnotations(pluginsUtils.UnionMaps(cfg.DefaultAnnotations, o.GetAnnotations(), pluginsUtils.CopyMap(taskCtx.GetAnnotations(), schedulerAnnotations))) o.SetLabels(pluginsUtils.UnionMaps(cfg.DefaultLabels, o.GetLabels(), pluginsUtils.CopyMap(taskCtx.GetLabels(), schedulerLabels))) From 87ee5328bd1dfc52c293a51df0d625748eb8d7ab Mon Sep 17 00:00:00 2001 From: yuteng Date: Wed, 13 Nov 2024 05:54:30 +0800 Subject: [PATCH 20/20] update Signed-off-by: yuteng --- rfc/system/5575-supporting-yunikorn-and-kueue.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/rfc/system/5575-supporting-yunikorn-and-kueue.md b/rfc/system/5575-supporting-yunikorn-and-kueue.md index 50041203f6..b0e5064847 100644 --- a/rfc/system/5575-supporting-yunikorn-and-kueue.md +++ b/rfc/system/5575-supporting-yunikorn-and-kueue.md @@ -31,11 +31,9 @@ queueconfig: - type: "spark" gangscheduling: "placeholderTimeoutInSeconds=30 gangSchedulingStyle=hard" allow-preemption: true - - type: ".*" ``` `root.organization1.ray` is the queue of the ray job submitted by user1 belonging organization1. -`.*` means that other jobs excluding ray and spark will be submitted to `root..default`. ResourceFlavor allocates resource based on labels which indicates that category-based resource allocation by organization label is available. Thus, a clusterQueue including multiple resources represents the total acessaible resource for an organization.