# slurm_filer.txt
################################
## Cluster Configuration File ##
################################
# Top-level cluster definition. Values written as $Name are template parameters;
# the ones referenced in this file are declared in the [parameters ...] sections below.
[cluster Slurm]
FormLayout = selectionpanel
Category = Schedulers
# Bound to the "Autoscale" checkbox parameter (default true).
Autoscale = $Autoscale
# Baseline node settings. Presumably inherited by every node/nodearray below
# (CycleCloud "defaults" convention) — confirm against the template reference.
[[node defaults]]
UsePublicNetwork = $UsePublicNetwork
Credentials = $Credentials
SubnetId = $SubnetId
Region = $Region
# NOTE(review): hard-coded key path rather than a parameter — confirm it is valid for the target CycleCloud install.
KeyPairLocation = ~/.ssh/cyclecloud.pem
# Slurm autoscaling supports both Terminate and Deallocate shutdown policies
ShutdownPolicy = $configuration_slurm_shutdown_policy
# Configuration keys set at the defaults level.
[[[configuration]]]
slurm.version = $configuration_slurm_version
# For fast spin-up after Deallocate, force an immediate re-converge on boot
cyclecloud.converge_on_boot = true
# Cluster-init specs listed at the defaults level. The second one (myproject,
# version 1.0.0) is flagged Optional = true.
[[[cluster-init cyclecloud/slurm:default]]]
[[[cluster-init myproject:default:1.0.0]]]
Optional = true
# Scheduler head node. In this template it also carries the "data" volume,
# mounts it at /data and declares an NFS export of that path.
[[node master]]
MachineType = $MasterMachineType
ImageName = $MasterImageName
IsReturnProxy = $ReturnProxy
AdditionalClusterInitSpecs = $MasterClusterInitSpecs
# Empty section: no master-specific configuration keys are set under it.
[[[configuration]]]
[[[cluster-init cyclecloud/slurm:master]]]
[[[network-interface eth0]]]
# Public IP on the master follows the same parameter as UsePublicNetwork in [[node defaults]].
AssociatePublicIpAddress = $UsePublicNetwork
### Begin changes: persistent data volume on the master, mounted at /data and exported over NFS
[[[volume data]]]
# Size is presumably in GB — confirm against the CycleCloud volume reference.
Size = 200
Persistent = true
StorageAccountType = StandardSSD_LRS
# "data" matches the name used by the cyclecloud.mounts.data section below.
Mount = data
[[[configuration cyclecloud.mounts.data]]]
mountpoint = /data
fs_type = ext4
# Export named "nfs_data"; the hpc and htc nodearrays declare a mount with the same name.
[[[configuration cyclecloud.exports.nfs_data]]]
type = nfs
export_path = /data
### End changes
# Endpoint labelled "ganglia": same port number (8652) on the private and public side.
[[[input-endpoint ganglia]]]
PrivatePort = 8652
PublicPort = 8652
# HPC execute nodearray. It is the only nodearray in this file that sets
# slurm.default_partition = true, and it is the base that [[nodearray gpu]] extends.
[[nodearray hpc]]
MachineType = $HPCMachineType
ImageName = $HPCImageName
MaxCoreCount = $MaxHPCExecuteCoreCount
# Upper bound on VMs per scale set, taken from the "Max VMs per Scaleset" parameter.
Azure.MaxScalesetSize = $HPCMaxScalesetSize
AdditionalClusterInitSpecs = $HPCClusterInitSpecs
[[[configuration]]]
slurm.autoscale = true
slurm.default_partition = true
slurm.hpc = true
[[[cluster-init cyclecloud/slurm:execute]]]
[[[network-interface eth0]]]
AssociatePublicIpAddress = $ExecuteNodesPublic
### Begin changes: NFS mount of /data on HPC execute nodes
# No "address" key is given. Presumably the mount is resolved by name against the
# nfs_data export declared on the master — TODO confirm.
[[[configuration cyclecloud.mounts.nfs_data]]]
type = nfs
mountpoint = /data
export_path = /data
### End changes
# HTC execute nodearray. Unlike hpc it sets slurm.hpc = false, sets no
# slurm.default_partition key and no Azure.MaxScalesetSize, and can be made
# interruptible through the "Low Priority" checkbox parameter.
[[nodearray htc]]
MachineType = $HTCMachineType
ImageName = $HTCImageName
MaxCoreCount = $MaxHTCExecuteCoreCount
Interruptible = $HTCUseLowPrio
AdditionalClusterInitSpecs = $HTCClusterInitSpecs
[[[configuration]]]
slurm.autoscale = true
slurm.hpc = false
[[[cluster-init cyclecloud/slurm:execute]]]
[[[network-interface eth0]]]
AssociatePublicIpAddress = $ExecuteNodesPublic
### Begin changes: NFS mount of /data on HTC execute nodes
# No "address" key is given. Presumably the mount is resolved by name against the
# nfs_data export declared on the master — TODO confirm.
[[[configuration cyclecloud.mounts.nfs_data]]]
type = nfs
mountpoint = /data
export_path = /data
### End changes
# GPU execute nodearray, declared as an extension of hpc. Only the machine type and
# slurm.default_partition are set here; everything else (image, core limit,
# cluster-init, /data mount) is presumably inherited through Extends — confirm.
[[nodearray gpu]]
Extends = hpc
# NOTE(review): spelled "Machinetype" here but "MachineType" in the other nodearrays;
# presumably attribute names are case-insensitive — confirm.
Machinetype = $GPUMachineType
[[[configuration]]]
slurm.default_partition = false
# "About" group (Order = 1): a single label-less parameter that shows an HTML blurb
# with the Slurm icon and links to the Slurm project site and the cyclecloud-slurm README.
[parameters About]
Order = 1
[[parameters About Slurm]]
[[[parameter slurm]]]
HideLabel = true
Config.Plugin = pico.widget.HtmlTemplateWidget
# Assigned with ":=" and a quoted string; inner double quotes are backslash-escaped.
Config.Template := "<table><tr><td><img src='static/cloud/cluster/ui/ClusterIcon/slurm.png' width='192' height='192'></td></tr><tr><td><p>Slurm is a highly configurable open source workload manager. See the <a href=\"https://www.schedmd.com/\" target=\"_blank\">Slurm project site</a> for an overview.</p><p>Follow the instructions in the <a href=\"https://github.com/azure/cyclecloud-slurm/\" target=\"_blank\">README</a> for details on instructions on extending and configuring the Project for your environment.</p></td></tr></table>"
# "Required Settings" group (Order = 10). Its first subsection declares the region
# and the VM size parameters referenced by the master, hpc, htc and gpu definitions.
[parameters Required Settings]
Order = 10
[[parameters Virtual Machines ]]
Description = "The cluster, in this case, has two roles: the scheduler master-node with shared filer and the execute hosts. Configure which VM types to use based on the requirements of your application."
Order = 20
# Referenced as $Region in [[node defaults]].
[[[parameter Region]]]
Label = Region
Description = Deployment Location
ParameterType = Cloud.Region
DefaultValue = westus2
# Referenced as $MasterMachineType in [[node master]].
[[[parameter MasterMachineType]]]
Label = Master VM Type
Description = The VM type for scheduler master and shared filer.
ParameterType = Cloud.MachineType
DefaultValue = Standard_D12_v2
# Referenced as $HPCMachineType in [[nodearray hpc]].
[[[parameter HPCMachineType]]]
Label = HPC VM Type
Description = The VM type for HPC execute nodes
ParameterType = Cloud.MachineType
DefaultValue = Standard_F2s_v2
# Referenced as $HTCMachineType in [[nodearray htc]].
[[[parameter HTCMachineType]]]
Label = HTC VM Type
Description = The VM type for HTC execute nodes
ParameterType = Cloud.MachineType
DefaultValue = Standard_F2s_v2
# Referenced as $GPUMachineType in [[nodearray gpu]].
# NOTE(review): the default is identical to the HPC/HTC default (Standard_F2s_v2);
# confirm whether a GPU-capable VM size was intended here.
[[[parameter GPUMachineType]]]
Label = GPU VM Type
Description = The VM type for GPU execute nodes
ParameterType = Cloud.MachineType
DefaultValue = Standard_F2s_v2
# Autoscale switch plus the per-nodearray size limits.
# NOTE(review): the Description mentions "initial and maximum core counts", but this
# section only declares maximum values — confirm whether the wording should change.
[[parameters Auto-Scaling]]
Description = "The cluster can autoscale to the workload, adding execute hosts as jobs are queued. To enable this check the box below and choose the initial and maximum core counts for the cluster"
Order = 30
# Referenced as $Autoscale in [cluster Slurm].
[[[parameter Autoscale]]]
Label = Autoscale
DefaultValue = true
Widget.Plugin = pico.form.BooleanCheckBox
Widget.Label = Start and stop execute instances automatically
# Referenced as MaxCoreCount of [[nodearray hpc]]; integer, minimum 1.
[[[parameter MaxHPCExecuteCoreCount]]]
Label = Max HPC Cores
Description = The total number of HPC execute cores to start
DefaultValue = 100
Config.Plugin = pico.form.NumberTextBox
Config.MinValue = 1
Config.IntegerOnly = true
# Referenced as MaxCoreCount of [[nodearray htc]]; integer, minimum 1.
[[[parameter MaxHTCExecuteCoreCount]]]
Label = Max HTC Cores
Description = The total number of HTC execute cores to start
DefaultValue = 100
Config.Plugin = pico.form.NumberTextBox
Config.MinValue = 1
Config.IntegerOnly = true
# Referenced as Azure.MaxScalesetSize of [[nodearray hpc]]; integer, minimum 1.
[[[parameter HPCMaxScalesetSize]]]
Label = Max VMs per Scaleset
Description = The maximum number of VMs created per VM Scaleset e.g. switch in Slurm.
DefaultValue = 100
Config.Plugin = pico.form.NumberTextBox
Config.MinValue = 1
Config.IntegerOnly = true
# Referenced as Interruptible of [[nodearray htc]].
[[[parameter HTCUseLowPrio]]]
Label = Low Priority
DefaultValue = false
Widget.Plugin = pico.form.BooleanCheckBox
Widget.Label = Use low priority instances for HTC execute hosts
# Subnet selection. SubnetId is marked Required and has no DefaultValue;
# it is referenced as $SubnetId in [[node defaults]].
[[parameters Networking]]
Order = 40
[[[parameter SubnetId]]]
Label = Subnet ID
Description = Subnet Resource Path (ResourceGroup/VirtualNetwork/Subnet)
ParameterType = Azure.Subnet
Required = True
# "Advanced Settings" group (Order = 20). Its first listed subsection holds the
# cloud credentials parameter referenced as $Credentials in [[node defaults]].
[parameters Advanced Settings]
Order = 20
# NOTE(review): Order = 10 is also used by [[parameters Software]] in this group;
# how ties are ordered is not visible here — confirm the intended sequence.
[[parameters Azure Settings]]
Order = 10
[[[parameter Credentials]]]
Description = The credentials for the cloud provider
ParameterType = Cloud.Credentials
# Slurm version and shutdown-policy parameters. Order = 5 is the lowest Order
# value among the subsections of "Advanced Settings".
[[parameters Slurm Settings ]]
Description = "Section for configuring Slurm"
Order = 5
# Referenced as slurm.version in [[node defaults]]. Dropdown with two listed
# versions; Config.FreeForm = true presumably also allows typing another value — confirm.
[[[parameter configuration_slurm_version]]]
Required = True
Label = Slurm Version
Description = Version of Slurm to install on the cluster
ParameterType = StringList
Config.Plugin = pico.form.Dropdown
Config.FreeForm = true
Config.Entries := {[Value="19.05.5-1"], [Value="18.08.9-1"]}
DefaultValue = 19.05.5-1
# Referenced as ShutdownPolicy in [[node defaults]]. Two choices are listed below:
# Terminate (the default) and Deallocate.
# NOTE(review): "description" and "config.plugin" are lowercase here, unlike the
# capitalised keys elsewhere; presumably keys are case-insensitive — confirm.
# NOTE(review): the description text says "Delete" while the option is named "Terminate".
[[[parameter configuration_slurm_shutdown_policy]]]
Label = ShutdownPolicy
description = By default, autostop will Delete stopped VMS for lowest cost. Optionally, Stop/Deallocate the VMs for faster restart instead.
DefaultValue = Terminate
config.plugin = pico.control.AutoCompleteDropdown
[[[[list Config.Entries]]]]
Name = Terminate
Label = Terminate
[[[[list Config.Entries]]]]
Name = Deallocate
Label = Deallocate
# OS image and additional cluster-init parameters for the master, hpc and htc roles.
# There are no GPU-specific image or cluster-init parameters; [[nodearray gpu]]
# extends hpc and sets neither ImageName nor AdditionalClusterInitSpecs itself.
[[parameters Software]]
Description = "Specify the scheduling software, and base OS installed on all nodes, and optionally the cluster-init and chef versions from your Locker."
Order = 10
# The three image parameters share the same shape: Linux only, default
# cycle.image.centos7, filter restricted to the centos7 and ubuntu18 packages.
[[[parameter MasterImageName]]]
Label = Master OS
ParameterType = Cloud.Image
Config.OS = linux
DefaultValue = cycle.image.centos7
Config.Filter := Package in {"cycle.image.centos7", "cycle.image.ubuntu18"}
[[[parameter HPCImageName]]]
Label = HPC OS
ParameterType = Cloud.Image
Config.OS = linux
DefaultValue = cycle.image.centos7
Config.Filter := Package in {"cycle.image.centos7", "cycle.image.ubuntu18"}
[[[parameter HTCImageName]]]
Label = HTC OS
ParameterType = Cloud.Image
Config.OS = linux
DefaultValue = cycle.image.centos7
Config.Filter := Package in {"cycle.image.centos7", "cycle.image.ubuntu18"}
# The three cluster-init parameters feed AdditionalClusterInitSpecs on the matching
# node/nodearray. "=undefined" presumably evaluates to an undefined value (no extra
# specs) rather than a literal string — confirm.
[[[parameter MasterClusterInitSpecs]]]
Label = Master Cluster-Init
DefaultValue = =undefined
Description = Cluster init specs to apply to the master node
ParameterType = Cloud.ClusterInitSpecs
[[[parameter HTCClusterInitSpecs]]]
Label = HTC Cluster-Init
DefaultValue = =undefined
Description = Cluster init specs to apply to HTC execute nodes
ParameterType = Cloud.ClusterInitSpecs
[[[parameter HPCClusterInitSpecs]]]
Label = HPC Cluster-Init
DefaultValue = =undefined
Description = Cluster init specs to apply to HPC execute nodes
ParameterType = Cloud.ClusterInitSpecs
# Boolean networking switches. This subsection sets no Order key, unlike the
# other subsections of "Advanced Settings".
[[parameters Advanced Networking]]
Description = Advanced networking settings
# Referenced as IsReturnProxy in [[node master]].
[[[parameter ReturnProxy]]]
Label = Return Proxy
DefaultValue = true
ParameterType = Boolean
Config.Label = Use SSH tunnel to connect to CycleCloud (required if direct access is blocked)
# Referenced as UsePublicNetwork in [[node defaults]] and as the master's
# AssociatePublicIpAddress.
[[[parameter UsePublicNetwork]]]
Label = Public Head Node
DefaultValue = true
ParameterType = Boolean
Config.Label = Access master node from the Internet
# Referenced as AssociatePublicIpAddress on the hpc and htc nodearrays.
# The Conditions.Excluded expression excludes this parameter whenever
# UsePublicNetwork is not true.
[[[parameter ExecuteNodesPublic]]]
Label = Public Execute
DefaultValue = false
ParameterType = Boolean
Config.Label = Access execute nodes from the Internet
Conditions.Excluded := UsePublicNetwork isnt true