Kubernetes Operator for Tangled Spindles

fix error cleaning up spindle pods

evan.jarrett.net 07d0ebf1 790d6b80

verified
+37 -17
+3
Makefile
··· 297 297 deploy: manifests kustomize ## Deploy controller to the K8s cluster specified in ~/.kube/config. 298 298 cd config/manager && $(KUSTOMIZE) edit set image controller=${IMG} 299 299 $(KUSTOMIZE) build config/default | $(KUBECTL) apply -f - 300 + @echo "Waiting for deployment to be ready and forcing rollout to pull fresh image..." 301 + $(KUBECTL) rollout restart deployment/loom-controller-manager -n loom-system 302 + $(KUBECTL) rollout status deployment/loom-controller-manager -n loom-system --timeout=120s 300 303 301 304 .PHONY: undeploy 302 305 undeploy: kustomize ## Undeploy controller from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion.
+2 -2
config/manager/loom-config.yaml
··· 12 12 template: 13 13 resources: 14 14 requests: 15 - cpu: "500m" 15 + cpu: "1" 16 16 memory: "1Gi" 17 17 limits: 18 - cpu: "2" 18 + cpu: "3" 19 19 memory: "4Gi"
+1 -1
go.mod
··· 197 197 ) 198 198 199 199 // Use our custom version of tangled until it's upstreamed 200 - replace tangled.org/core => tangled.org/evan.jarrett.net/core v1.9.1-alpha.0.20251109170743-a3bd4cd90024 200 + replace tangled.org/core => tangled.org/evan.jarrett.net/core v1.9.1-alpha.0.20251109203816-d73c73e8728b
+2 -2
go.sum
··· 687 687 sigs.k8s.io/structured-merge-diff/v4 v4.6.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= 688 688 sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= 689 689 sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= 690 - tangled.org/evan.jarrett.net/core v1.9.1-alpha.0.20251109170743-a3bd4cd90024 h1:Jy8zRhQ9gLD+49+hAfmgCccLmnkhReiRTHS/5QL1/II= 691 - tangled.org/evan.jarrett.net/core v1.9.1-alpha.0.20251109170743-a3bd4cd90024/go.mod h1:CUO6beA36K/Cwt0u2yrO5CG+L7+LzAc6zi6WudwO7qs= 690 + tangled.org/evan.jarrett.net/core v1.9.1-alpha.0.20251109203816-d73c73e8728b h1:m30BNzXLTO/V7tyh9levCuavrjgxH0KD5Y6lvX3S69s= 691 + tangled.org/evan.jarrett.net/core v1.9.1-alpha.0.20251109203816-d73c73e8728b/go.mod h1:CUO6beA36K/Cwt0u2yrO5CG+L7+LzAc6zi6WudwO7qs=
+29 -12
internal/controller/spindleset_controller.go
··· 87 87 88 88 logger.Info("Reconciling SpindleSet", "name", spindleSet.Name, "pipelineID", spindleSet.Spec.PipelineRun.PipelineID) 89 89 90 - // Add finalizer if not present 90 + // Handle deletion 91 + if !spindleSet.DeletionTimestamp.IsZero() { 92 + return r.handleDeletion(ctx, spindleSet) 93 + } 94 + 95 + // Add finalizer if not present (only when not being deleted) 91 96 if !controllerutil.ContainsFinalizer(spindleSet, "loom.j5t.io/finalizer") { 92 97 controllerutil.AddFinalizer(spindleSet, "loom.j5t.io/finalizer") 93 98 if err := r.Update(ctx, spindleSet); err != nil { 94 99 return ctrl.Result{}, err 95 100 } 96 - } 97 - 98 - // Handle deletion 99 - if !spindleSet.DeletionTimestamp.IsZero() { 100 - return r.handleDeletion(ctx, spindleSet) 101 101 } 102 102 103 103 // Ensure Jobs are created for workflows ··· 133 133 logger := log.FromContext(ctx) 134 134 logger.Info("Handling SpindleSet deletion") 135 135 136 + // Check if finalizer is present 137 + if !controllerutil.ContainsFinalizer(spindleSet, "loom.j5t.io/finalizer") { 138 + // Finalizer already removed, nothing to do 139 + return ctrl.Result{}, nil 140 + } 141 + 136 142 // List all Jobs owned by this SpindleSet 137 143 jobList := &batchv1.JobList{} 138 144 if err := r.List(ctx, jobList, client.InNamespace(spindleSet.Namespace), client.MatchingLabels{ ··· 142 148 return ctrl.Result{}, err 143 149 } 144 150 145 - // Delete all Jobs 151 + // Delete all Jobs with proper propagation to ensure Pods are deleted 152 + propagationPolicy := metav1.DeletePropagationForeground 153 + deleteOptions := client.DeleteOptions{ 154 + PropagationPolicy: &propagationPolicy, 155 + } 156 + 146 157 for _, job := range jobList.Items { 147 - logger.Info("Deleting Job", "jobName", job.Name) 148 - if err := r.Delete(ctx, &job); client.IgnoreNotFound(err) != nil { 158 + logger.Info("Deleting Job and associated Pods", "jobName", job.Name) 159 + if err := r.Delete(ctx, &job, &deleteOptions); client.IgnoreNotFound(err) != nil { 149 
160 logger.Error(err, "Failed to delete Job", "jobName", job.Name) 150 161 return ctrl.Result{}, err 151 162 } ··· 367 378 func (r *SpindleSetReconciler) cleanupOrphanedJobs(ctx context.Context, namespace string) error { 368 379 logger := log.FromContext(ctx) 369 380 381 + // Delete options with propagation policy to ensure Pods are deleted 382 + propagationPolicy := metav1.DeletePropagationForeground 383 + deleteOptions := client.DeleteOptions{ 384 + PropagationPolicy: &propagationPolicy, 385 + } 386 + 370 387 // List all spindle Jobs in the namespace 371 388 jobList := &batchv1.JobList{} 372 389 if err := r.List(ctx, jobList, ··· 380 397 if spindleSetName == "" { 381 398 // Job missing spindleset label - this is an orphan from old code 382 399 logger.Info("Found Job without spindleset label, deleting", "job", job.Name) 383 - if err := r.Delete(ctx, &job); client.IgnoreNotFound(err) != nil { 400 + if err := r.Delete(ctx, &job, &deleteOptions); client.IgnoreNotFound(err) != nil { 384 401 logger.Error(err, "Failed to delete orphaned Job", "job", job.Name) 385 402 } 386 403 continue ··· 396 413 if apierrors.IsNotFound(err) { 397 414 // SpindleSet was deleted but Job remains - this is an orphan 398 415 logger.Info("Found orphaned Job, SpindleSet no longer exists", "job", job.Name, "spindleset", spindleSetName) 399 - if err := r.Delete(ctx, &job); client.IgnoreNotFound(err) != nil { 416 + if err := r.Delete(ctx, &job, &deleteOptions); client.IgnoreNotFound(err) != nil { 400 417 logger.Error(err, "Failed to delete orphaned Job", "job", job.Name) 401 418 } 402 419 continue ··· 416 433 // TTL is 3600 seconds (1 hour), add buffer and clean up after 2 hours 417 434 if age.Hours() > 2 { 418 435 logger.Info("Cleaning up old completed Job", "job", job.Name, "age", age) 419 - if err := r.Delete(ctx, &job); client.IgnoreNotFound(err) != nil { 436 + if err := r.Delete(ctx, &job, &deleteOptions); client.IgnoreNotFound(err) != nil { 420 437 logger.Error(err, "Failed to delete old 
Job", "job", job.Name) 421 438 } 422 439 }