Kubernetes Operator for Tangled Spindles

rbac fixes

evan.jarrett.net ad06efb7 86613913

verified
+177 -33
+6
config/rbac/role.yaml
··· 7 7 - apiGroups: 8 8 - "" 9 9 resources: 10 + - nodes 11 + verbs: 12 + - list 13 + - apiGroups: 14 + - "" 15 + resources: 10 16 - pods 11 17 - secrets 12 18 verbs:
+2
config/rbac/spindle_job_service_account.yaml
··· 7 7 name: spindle-job-runner 8 8 namespace: system 9 9 automountServiceAccountToken: false 10 + imagePullSecrets: 11 + - name: atcr-login 10 12 --- 11 13 # Note: No Role or RoleBinding created intentionally 12 14 # Job pods should have no permissions to read Secrets, list Pods, etc.
+8 -1
internal/controller/spindleset_controller.go
··· 55 55 // +kubebuilder:rbac:groups=loom.j5t.io,resources=spindlesets/finalizers,verbs=update 56 56 // +kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch;create;update;patch;delete 57 57 // +kubebuilder:rbac:groups=batch,resources=jobs/status,verbs=get 58 + // +kubebuilder:rbac:groups="",resources=nodes,verbs=list 58 59 // +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch 59 60 // +kubebuilder:rbac:groups="",resources=pods/log,verbs=get 60 61 // +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch ··· 345 346 } 346 347 } 347 348 349 + // List nodes for profile selection (to validate nodeSelector labels exist) 350 + var nodeList corev1.NodeList 351 + if err := r.Client.List(ctx, &nodeList); err != nil { 352 + return fmt.Errorf("failed to list nodes: %w", err) 353 + } 354 + 348 355 // Convert workflow steps to jobbuilder format and create Jobs for each workflow 349 356 for _, workflowSpec := range pipelineRun.Workflows { 350 357 // Check if Job already exists ··· 396 403 } 397 404 398 405 // Create the Job 399 - job, err := jobbuilder.BuildJob(jobConfig) 406 + job, err := jobbuilder.BuildJob(jobConfig, &nodeList) 400 407 if err != nil { 401 408 return fmt.Errorf("failed to build job for workflow %s: %w", workflowSpec.Name, err) 402 409 }
+39 -8
internal/jobbuilder/job_template.go
··· 61 61 Namespace string 62 62 } 63 63 64 - // selectResourceProfile selects the first resource profile matching the workflow architecture. 64 + // nodeMatchesSelector returns true if at least one node has all the labels in selector. 65 + func nodeMatchesSelector(nodes *corev1.NodeList, selector map[string]string) bool { 66 + if nodes == nil { 67 + return false 68 + } 69 + for _, node := range nodes.Items { 70 + if labelsMatch(node.Labels, selector) { 71 + return true 72 + } 73 + } 74 + return false 75 + } 76 + 77 + // labelsMatch returns true if nodeLabels contains all key-value pairs from selector. 78 + func labelsMatch(nodeLabels, selector map[string]string) bool { 79 + for key, value := range selector { 80 + if nodeLabels[key] != value { 81 + return false 82 + } 83 + } 84 + return true 85 + } 86 + 87 + // selectResourceProfile selects the first resource profile matching the workflow architecture 88 + // and whose nodeSelector labels all exist on at least one available node. 65 89 // Returns the profile's resources and nodeSelector, or default values if no match is found. 66 - func selectResourceProfile(profiles []loomv1alpha1.ResourceProfile, architecture string) (corev1.ResourceRequirements, map[string]string) { 90 + func selectResourceProfile(profiles []loomv1alpha1.ResourceProfile, architecture string, nodes *corev1.NodeList) (corev1.ResourceRequirements, map[string]string) { 67 91 // Iterate through profiles to find first match 68 92 for _, profile := range profiles { 69 93 // Check if profile's nodeSelector has the matching architecture 70 - if arch, ok := profile.NodeSelector["kubernetes.io/arch"]; ok && arch == architecture { 94 + arch, ok := profile.NodeSelector["kubernetes.io/arch"] 95 + if !ok || arch != architecture { 96 + continue 97 + } 98 + 99 + // Check if ALL nodeSelector labels exist on at least one node 100 + if nodeMatchesSelector(nodes, profile.NodeSelector) { 71 101 return profile.Resources, profile.NodeSelector 72 102 } 73 103 } 74 104 75 - // No profile matched - return defaults 105 + // No profile matched - return defaults with just architecture selector 76 106 return corev1.ResourceRequirements{ 77 107 Requests: corev1.ResourceList{ 78 108 corev1.ResourceCPU: resource.MustParse("500m"), ··· 82 112 corev1.ResourceCPU: resource.MustParse("2"), 83 113 corev1.ResourceMemory: resource.MustParse("4Gi"), 84 114 }, 85 - }, nil 115 + }, map[string]string{"kubernetes.io/arch": architecture} 86 116 } 87 117 88 118 // BuildJob creates a Kubernetes Job specification for running a spindle workflow. 89 - func BuildJob(config WorkflowConfig) (*batchv1.Job, error) { 119 + // The nodes parameter is used to validate that resource profile nodeSelectors can be satisfied. 120 + func BuildJob(config WorkflowConfig, nodes *corev1.NodeList) (*batchv1.Job, error) { 90 121 if config.WorkflowName == "" { 91 122 return nil, fmt.Errorf("workflow name is required") 92 123 } ··· 106 137 return nil, fmt.Errorf("failed to marshal workflow spec: %w", err) 107 138 } 108 139 109 - // Select resource profile based on workflow architecture 110 - resources, profileNodeSelector := selectResourceProfile(config.Template.ResourceProfiles, config.Architecture) 140 + // Select resource profile based on workflow architecture and available nodes 141 + resources, profileNodeSelector := selectResourceProfile(config.Template.ResourceProfiles, config.Architecture, nodes) 111 142 112 143 // Build architecture-based node affinity 113 144 archAffinity := BuildArchitectureAffinity(config.Architecture)
+122 -24
internal/jobbuilder/job_template_test.go
··· 1 1 package jobbuilder 2 2 3 3 import ( 4 + "fmt" 4 5 "testing" 5 6 6 7 corev1 "k8s.io/api/core/v1" 8 + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 7 9 "k8s.io/apimachinery/pkg/api/resource" 8 10 9 11 loomv1alpha1 "tangled.org/evan.jarrett.net/loom/api/v1alpha1" 10 12 ) 11 13 14 + // Helper function to create a mock node list for tests 15 + func makeNodeList(nodes ...map[string]string) *corev1.NodeList { 16 + list := &corev1.NodeList{} 17 + for i, labels := range nodes { 18 + list.Items = append(list.Items, corev1.Node{ 19 + ObjectMeta: metav1.ObjectMeta{ 20 + Name: fmt.Sprintf("node-%d", i), 21 + Labels: labels, 22 + }, 23 + }) 24 + } 25 + return list 26 + } 27 + 12 28 func TestSelectResourceProfile(t *testing.T) { 13 29 tests := []struct { 14 30 name string 15 31 profiles []loomv1alpha1.ResourceProfile 16 32 architecture string 33 + nodes *corev1.NodeList 17 34 wantCPU string 18 35 wantMemory string 19 36 wantLabels map[string]string 20 37 }{ 21 38 { 22 - name: "select arm64 profile", 39 + name: "select arm64 profile when matching node exists", 23 40 profiles: []loomv1alpha1.ResourceProfile{ 24 41 { 25 42 NodeSelector: map[string]string{ ··· 53 70 }, 54 71 }, 55 72 architecture: "arm64", 56 - wantCPU: "1", 57 - wantMemory: "2Gi", 73 + nodes: makeNodeList( 74 + map[string]string{"kubernetes.io/arch": "arm64"}, 75 + map[string]string{"kubernetes.io/arch": "amd64"}, 76 + ), 77 + wantCPU: "1", 78 + wantMemory: "2Gi", 58 79 wantLabels: map[string]string{ 59 80 "kubernetes.io/arch": "arm64", 60 81 }, 61 82 }, 62 83 { 63 - name: "select amd64 profile", 84 + name: "select amd64 profile when matching node exists", 64 85 profiles: []loomv1alpha1.ResourceProfile{ 65 86 { 66 87 NodeSelector: map[string]string{ ··· 86 107 }, 87 108 }, 88 109 architecture: "amd64", 89 - wantCPU: "4", 90 - wantMemory: "8Gi", 110 + nodes: makeNodeList( 111 + map[string]string{"kubernetes.io/arch": "amd64"}, 112 + ), 113 + wantCPU: "4", 114 + wantMemory: "8Gi", 91 115 wantLabels: map[string]string{ 92 116 "kubernetes.io/arch": "amd64", 93 117 }, 94 118 }, 95 119 { 96 - name: "select first matching profile with additional labels", 120 + name: "skip profile when additional labels dont exist on nodes", 97 121 profiles: []loomv1alpha1.ResourceProfile{ 98 122 { 99 123 NodeSelector: map[string]string{ ··· 120 144 }, 121 145 }, 122 146 architecture: "arm64", 123 - wantCPU: "4", 124 - wantMemory: "8Gi", 147 + // Node exists but does NOT have node-tier=large label 148 + nodes: makeNodeList( 149 + map[string]string{"kubernetes.io/arch": "arm64"}, 150 + ), 151 + // Should skip first profile and use second one 152 + wantCPU: "1", 153 + wantMemory: "2Gi", 154 + wantLabels: map[string]string{ 155 + "kubernetes.io/arch": "arm64", 156 + }, 157 + }, 158 + { 159 + name: "select profile with additional labels when matching node exists", 160 + profiles: []loomv1alpha1.ResourceProfile{ 161 + { 162 + NodeSelector: map[string]string{ 163 + "kubernetes.io/arch": "arm64", 164 + "node-tier": "large", 165 + }, 166 + Resources: corev1.ResourceRequirements{ 167 + Requests: corev1.ResourceList{ 168 + corev1.ResourceCPU: resource.MustParse("4"), 169 + corev1.ResourceMemory: resource.MustParse("8Gi"), 170 + }, 171 + }, 172 + }, 173 + { 174 + NodeSelector: map[string]string{ 175 + "kubernetes.io/arch": "arm64", 176 + }, 177 + Resources: corev1.ResourceRequirements{ 178 + Requests: corev1.ResourceList{ 179 + corev1.ResourceCPU: resource.MustParse("1"), 180 + corev1.ResourceMemory: resource.MustParse("2Gi"), 181 + }, 182 + }, 183 + }, 184 + }, 185 + architecture: "arm64", 186 + // Node has BOTH labels 187 + nodes: makeNodeList( 188 + map[string]string{"kubernetes.io/arch": "arm64", "node-tier": "large"}, 189 + ), 190 + wantCPU: "4", 191 + wantMemory: "8Gi", 125 192 wantLabels: map[string]string{ 126 193 "kubernetes.io/arch": "arm64", 127 194 "node-tier": "large", 128 195 }, 129 196 }, 130 197 { 131 - name: "fallback to defaults when no profile matches", 198 + name: "fallback to defaults when no architecture profile matches", 132 199 profiles: []loomv1alpha1.ResourceProfile{ 133 200 { 134 201 NodeSelector: map[string]string{ ··· 143 210 }, 144 211 }, 145 212 architecture: "arm64", 146 - wantCPU: "500m", 147 - wantMemory: "1Gi", 148 - wantLabels: nil, 213 + nodes: makeNodeList( 214 + map[string]string{"kubernetes.io/arch": "arm64"}, 215 + ), 216 + wantCPU: "500m", 217 + wantMemory: "1Gi", 218 + wantLabels: map[string]string{ 219 + "kubernetes.io/arch": "arm64", 220 + }, 149 221 }, 150 222 { 151 223 name: "fallback to defaults when no profiles configured", 152 224 profiles: []loomv1alpha1.ResourceProfile{}, 153 225 architecture: "amd64", 154 - wantCPU: "500m", 155 - wantMemory: "1Gi", 156 - wantLabels: nil, 226 + nodes: makeNodeList( 227 + map[string]string{"kubernetes.io/arch": "amd64"}, 228 + ), 229 + wantCPU: "500m", 230 + wantMemory: "1Gi", 231 + wantLabels: map[string]string{ 232 + "kubernetes.io/arch": "amd64", 233 + }, 157 234 }, 158 235 } 159 236 160 237 for _, tt := range tests { 161 238 t.Run(tt.name, func(t *testing.T) { 162 - gotResources, gotLabels := selectResourceProfile(tt.profiles, tt.architecture) 239 + gotResources, gotLabels := selectResourceProfile(tt.profiles, tt.architecture, tt.nodes) 163 240 164 241 // Check CPU request 165 242 gotCPU := gotResources.Requests[corev1.ResourceCPU] ··· 177 254 178 255 // Check labels 179 256 if len(gotLabels) != len(tt.wantLabels) { 180 - t.Errorf("selectResourceProfile() labels count = %d, want %d", len(gotLabels), len(tt.wantLabels)) 257 + t.Errorf("selectResourceProfile() labels count = %d, want %d (got: %v)", len(gotLabels), len(tt.wantLabels), gotLabels) 181 258 } 182 259 for k, wantV := range tt.wantLabels { 183 260 if gotV, ok := gotLabels[k]; !ok || gotV != wantV { ··· 192 269 tests := []struct { 193 270 name string 194 271 config WorkflowConfig 272 + nodes *corev1.NodeList 195 273 wantCPU string 196 274 wantMemory string 197 275 wantNodeSelector map[string]string 198 276 wantErr bool 199 277 }{ 200 278 { 201 - name: "use arm64 profile with additional labels", 279 + name: "use arm64 profile with additional labels when node matches", 202 280 config: WorkflowConfig{ 203 281 WorkflowName: "test-workflow", 204 282 PipelineID: "test-pipeline", ··· 224 302 }, 225 303 }, 226 304 }, 305 + nodes: makeNodeList( 306 + map[string]string{"kubernetes.io/arch": "arm64", "node-tier": "large"}, 307 + ), 227 308 wantCPU: "2", 228 309 wantMemory: "4Gi", 229 310 wantNodeSelector: map[string]string{ ··· 269 350 }, 270 351 }, 271 352 }, 353 + nodes: makeNodeList( 354 + map[string]string{"kubernetes.io/arch": "arm64"}, 355 + map[string]string{"kubernetes.io/arch": "amd64"}, 356 + ), 272 357 wantCPU: "4", 273 358 wantMemory: "8Gi", 274 359 wantNodeSelector: map[string]string{ ··· 277 362 wantErr: false, 278 363 }, 279 364 { 280 - name: "profile with multiple labels", 365 + name: "fallback to simpler profile when labels dont match", 281 366 config: WorkflowConfig{ 282 367 WorkflowName: "test-workflow", 283 368 PipelineID: "test-pipeline", ··· 296 381 }, 297 382 Resources: corev1.ResourceRequirements{ 298 383 Requests: corev1.ResourceList{ 384 + corev1.ResourceCPU: resource.MustParse("4"), 385 + corev1.ResourceMemory: resource.MustParse("8Gi"), 386 + }, 387 + }, 388 + }, 389 + { 390 + NodeSelector: map[string]string{ 391 + "kubernetes.io/arch": "arm64", 392 + }, 393 + Resources: corev1.ResourceRequirements{ 394 + Requests: corev1.ResourceList{ 299 395 corev1.ResourceCPU: resource.MustParse("1"), 300 396 corev1.ResourceMemory: resource.MustParse("2Gi"), 301 397 }, ··· 304 400 }, 305 401 }, 306 402 }, 403 + // Node has arm64 but NOT the other labels 404 + nodes: makeNodeList( 405 + map[string]string{"kubernetes.io/arch": "arm64"}, 406 + ), 307 407 wantCPU: "1", 308 408 wantMemory: "2Gi", 309 409 wantNodeSelector: map[string]string{ 310 410 "kubernetes.io/arch": "arm64", 311 - "node-tier": "large", 312 - "custom-label": "custom-value", 313 411 }, 314 412 wantErr: false, 315 413 }, ··· 317 415 318 416 for _, tt := range tests { 319 417 t.Run(tt.name, func(t *testing.T) { 320 - job, err := BuildJob(tt.config) 418 + job, err := BuildJob(tt.config, tt.nodes) 321 419 if (err != nil) != tt.wantErr { 322 420 t.Errorf("BuildJob() error = %v, wantErr %v", err, tt.wantErr) 323 421 return ··· 343 441 // Check nodeSelector 344 442 gotNodeSelector := job.Spec.Template.Spec.NodeSelector 345 443 if len(gotNodeSelector) != len(tt.wantNodeSelector) { 346 - t.Errorf("BuildJob() nodeSelector count = %d, want %d", len(gotNodeSelector), len(tt.wantNodeSelector)) 444 + t.Errorf("BuildJob() nodeSelector count = %d, want %d (got: %v)", len(gotNodeSelector), len(tt.wantNodeSelector), gotNodeSelector) 347 445 } 348 446 for k, wantV := range tt.wantNodeSelector { 349 447 if gotV, ok := gotNodeSelector[k]; !ok || gotV != wantV {