Kubernetes Operator for Tangled Spindles

actually listen for command errors

evan.jarrett.net c307718d bdb1bbab

verified
+233 -99
+8
Makefile
··· 267 267 uninstall: manifests kustomize ## Uninstall CRDs from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion. 268 268 $(KUSTOMIZE) build config/crd | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f - 269 269 270 + # OVERLAY selects the kustomize overlay to use (default, prod, etc.) 271 + # Use 'make deploy OVERLAY=prod' for cluster-specific settings 272 + OVERLAY ?= default 273 + 270 274 .PHONY: deploy 271 275 deploy: manifests kustomize ## Deploy controller to the K8s cluster specified in ~/.kube/config. 272 276 cd config/manager && $(KUSTOMIZE) edit set image controller=${IMG} 277 + ifeq ($(OVERLAY),default) 273 278 $(KUSTOMIZE) build config/default | $(KUBECTL) apply -f - 279 + else 280 + $(KUSTOMIZE) build config/overlays/$(OVERLAY) | $(KUBECTL) apply -f - 281 + endif 274 282 @echo "Waiting for deployment to be ready and forcing rollout to pull fresh image..." 275 283 $(KUBECTL) rollout restart deployment/loom-controller-manager -n loom-system 276 284 $(KUBECTL) rollout status deployment/loom-controller-manager -n loom-system --timeout=120s
+23 -2
cmd/runner/main.go
··· 19 19 command string 20 20 } 21 21 22 + // extendedLogLine extends models.LogLine with exit code for error reporting 23 + type extendedLogLine struct { 24 + models.LogLine 25 + ExitCode int `json:"exit_code,omitempty"` 26 + } 27 + 22 28 func (s *simpleStep) Name() string { 23 29 return s.name 24 30 } ··· 177 183 } 178 184 } 179 185 180 - // Emit step end event 181 - emitControlEvent(stepID, simpleStep, models.StepStatusEnd) 186 + // Emit step end event with exit code for error reporting 187 + emitControlEventWithCode(stepID, simpleStep, models.StepStatusEnd, exitCode) 182 188 183 189 if exitCode != 0 { 184 190 return fmt.Errorf("command exited with code %d", exitCode) ··· 204 210 func emitControlEvent(stepID int, step models.Step, status models.StepStatus) { 205 211 logLine := models.NewControlLogLine(stepID, step, status) 206 212 emitJSON(logLine) 213 + } 214 + 215 + // emitControlEventWithCode emits a control event with an exit code for error reporting 216 + func emitControlEventWithCode(stepID int, step models.Step, status models.StepStatus, exitCode int) { 217 + logLine := models.NewControlLogLine(stepID, step, status) 218 + extended := extendedLogLine{ 219 + LogLine: logLine, 220 + ExitCode: exitCode, 221 + } 222 + data, err := json.Marshal(extended) 223 + if err != nil { 224 + fmt.Fprintf(os.Stderr, "ERROR: failed to marshal JSON: %v\n", err) 225 + return 226 + } 227 + fmt.Println(string(data)) 207 228 } 208 229 209 230 func emitDataEvent(stepID int, stream, content string) {
+27
config/overlays/prod/kustomization.yaml
··· 1 + apiVersion: kustomize.config.k8s.io/v1beta1 2 + kind: Kustomization 3 + 4 + resources: 5 + - ../../default 6 + 7 + patches: 8 + - target: 9 + kind: PersistentVolumeClaim 10 + name: loom-spindle-db 11 + patch: |- 12 + - op: add 13 + path: /spec/storageClassName 14 + value: longhorn 15 + - op: add 16 + path: /spec/volumeName 17 + value: pvc-loom-spindle-db 18 + - target: 19 + kind: PersistentVolumeClaim 20 + name: loom-spindle-logs 21 + patch: |- 22 + - op: add 23 + path: /spec/storageClassName 24 + value: longhorn 25 + - op: add 26 + path: /spec/volumeName 27 + value: pvc-loom-spindle-logs
+12 -3
internal/engine/kubernetes_engine.go
··· 36 36 podPhase corev1.PodPhase // Track pod phase at stream creation time 37 37 } 38 38 39 + // extendedLogLine extends models.LogLine with exit code for error reporting 40 + type extendedLogLine struct { 41 + models.LogLine 42 + ExitCode int `json:"exit_code,omitempty"` 43 + } 44 + 39 45 // KubernetesEngine implements the spindle Engine interface for Kubernetes Jobs. 40 46 type KubernetesEngine struct { 41 47 client client.Client ··· 547 553 for scanner.Scan() { 548 554 line := scanner.Text() 549 555 550 - // Try to parse as models.LogLine from the runner binary 551 - var logLine models.LogLine 556 + // Try to parse as extendedLogLine from the runner binary (includes exit_code) 557 + var logLine extendedLogLine 552 558 if err := json.Unmarshal([]byte(line), &logLine); err != nil { 553 559 // Not JSON or parse error - skip 554 560 continue ··· 571 577 // Use control events from runner for flow control only 572 578 // Don't write them - the core spindle engine writes control events 573 579 if logLine.StepStatus == models.StepStatusEnd { 574 - // Step is done, return 580 + // Check exit code before returning success 581 + if logLine.ExitCode != 0 { 582 + return fmt.Errorf("step %d failed with exit code %d", stepID, logLine.ExitCode) 583 + } 575 584 return nil 576 585 } 577 586 // For "start" events, just continue reading
+102 -94
internal/jobbuilder/job_template.go
··· 215 215 AutomountServiceAccountToken: &[]bool{false}[0], 216 216 217 217 // Init containers: setup user and buildah storage, install runner binary, then clone repository 218 - InitContainers: []corev1.Container{ 219 - // Setup user creates /etc/passwd and /etc/group entries for UID 1000 220 - // This is needed because many tools (like buildah) require a valid passwd entry 221 - // Also configures buildah storage directories and storage.conf 222 - { 223 - Name: "setup-user", 224 - Image: "busybox:latest", 225 - Command: []string{"/bin/sh", "-c"}, 226 - Args: []string{` 227 - cat > /etc-override/passwd <<'EOF' 228 - root:x:0:0:root:/root:/bin/bash 229 - nobody:x:65534:65534:nobody:/nonexistent:/usr/sbin/nologin 230 - runner:x:1000:1000:runner:/home/runner:/bin/sh 231 - EOF 232 - cat > /etc-override/group <<'EOF' 233 - root:x:0: 234 - nobody:x:65534: 235 - runner:x:1000: 236 - EOF 237 - # subuid/subgid mappings for rootless buildah user namespaces 238 - cat > /etc-override/subuid <<'EOF' 239 - runner:100000:65536 240 - EOF 241 - cat > /etc-override/subgid <<'EOF' 242 - runner:100000:65536 243 - EOF 244 - # Create home directory structure then fix ownership 245 - mkdir -p /home-override/runner/.config/containers 246 - chmod 700 /home-override/runner/.config 247 - chown -R 1000:1000 /home-override/runner 248 - # Configure buildah storage directories (for workflows using buildah images) 249 - mkdir -p /var/lib/containers/storage /var/lib/containers/runroot 250 - cat > /var/lib/containers/storage.conf <<'EOF' 251 - [storage] 252 - driver = "overlay" 253 - runroot = "/var/lib/containers/runroot" 254 - graphroot = "/var/lib/containers/storage" 255 - EOF 256 - chown -R 1000:1000 /var/lib/containers 257 - echo "User and buildah storage setup complete" 258 - `}, 259 - SecurityContext: &corev1.SecurityContext{ 260 - AllowPrivilegeEscalation: &[]bool{false}[0], 261 - // Note: This init container runs as root to create the passwd/group files 262 - // All subsequent containers run as UID 1000 (non-root) 263 - RunAsNonRoot: &[]bool{false}[0], 264 - RunAsUser: &[]int64{0}[0], 265 - Capabilities: &corev1.Capabilities{ 266 - Drop: []corev1.Capability{"ALL"}, 267 - // CAP_CHOWN is needed to set ownership of home directory for UID 1000 268 - Add: []corev1.Capability{"CHOWN"}, 269 - }, 270 - }, 271 - VolumeMounts: []corev1.VolumeMount{ 272 - { 273 - Name: "etc-override", 274 - MountPath: "/etc-override", 275 - }, 276 - { 277 - Name: "home-override", 278 - MountPath: "/home-override", 279 - }, 280 - { 281 - Name: "buildah-storage", 282 - MountPath: "/var/lib/containers", 283 - }, 284 - }, 285 - }, 286 - { 287 - Name: "install-runner", 288 - Image: config.LoomImage, 289 - Command: []string{"/loom-runner", "--install", "/runner-bin/loom-runner"}, 290 - SecurityContext: &corev1.SecurityContext{ 291 - AllowPrivilegeEscalation: &[]bool{false}[0], 292 - RunAsNonRoot: &[]bool{true}[0], 293 - RunAsUser: &[]int64{1000}[0], 294 - ReadOnlyRootFilesystem: &[]bool{true}[0], 295 - Capabilities: &corev1.Capabilities{ 296 - Drop: []corev1.Capability{"ALL"}, 297 - }, 298 - }, 299 - VolumeMounts: []corev1.VolumeMount{ 300 - { 301 - Name: "runner-binary", 302 - MountPath: "/runner-bin", 303 - }, 304 - }, 305 - }, 306 - buildCloneInitContainer(config), 307 - }, 218 + InitContainers: buildInitContainers(config), 308 219 309 220 // Main container: run loom-runner binary in user's image 310 221 // Use shell to prepend /runner-bin to PATH, preserving the image's PATH ··· 404 315 return envFrom 405 316 } 406 317 318 + // buildInitContainers creates the init containers for the pod. 319 + // This function properly handles the optional clone container. 320 + func buildInitContainers(config WorkflowConfig) []corev1.Container { 321 + initContainers := []corev1.Container{ 322 + // Setup user creates /etc/passwd and /etc/group entries for UID 1000 323 + // This is needed because many tools (like buildah) require a valid passwd entry 324 + // Also configures buildah storage directories and storage.conf 325 + { 326 + Name: "setup-user", 327 + Image: "busybox:latest", 328 + Command: []string{"/bin/sh", "-c"}, 329 + Args: []string{` 330 + cat > /etc-override/passwd <<'EOF' 331 + root:x:0:0:root:/root:/bin/bash 332 + nobody:x:65534:65534:nobody:/nonexistent:/usr/sbin/nologin 333 + runner:x:1000:1000:runner:/home/runner:/bin/sh 334 + EOF 335 + cat > /etc-override/group <<'EOF' 336 + root:x:0: 337 + nobody:x:65534: 338 + runner:x:1000: 339 + EOF 340 + # subuid/subgid mappings for rootless buildah user namespaces 341 + cat > /etc-override/subuid <<'EOF' 342 + runner:100000:65536 343 + EOF 344 + cat > /etc-override/subgid <<'EOF' 345 + runner:100000:65536 346 + EOF 347 + # Create home directory structure then fix ownership 348 + mkdir -p /home-override/runner/.config/containers 349 + chmod 700 /home-override/runner/.config 350 + chown -R 1000:1000 /home-override/runner 351 + # Configure buildah storage directories (for workflows using buildah images) 352 + mkdir -p /var/lib/containers/storage /var/lib/containers/runroot 353 + cat > /var/lib/containers/storage.conf <<'EOF' 354 + [storage] 355 + driver = "overlay" 356 + runroot = "/var/lib/containers/runroot" 357 + graphroot = "/var/lib/containers/storage" 358 + EOF 359 + chown -R 1000:1000 /var/lib/containers 360 + echo "User and buildah storage setup complete" 361 + `}, 362 + SecurityContext: &corev1.SecurityContext{ 363 + AllowPrivilegeEscalation: &[]bool{false}[0], 364 + // Note: This init container runs as root to create the passwd/group files 365 + // All subsequent containers run as UID 1000 (non-root) 366 + RunAsNonRoot: &[]bool{false}[0], 367 + RunAsUser: &[]int64{0}[0], 368 + Capabilities: &corev1.Capabilities{ 369 + Drop: []corev1.Capability{"ALL"}, 370 + // CAP_CHOWN is needed to set ownership of home directory for UID 1000 371 + Add: []corev1.Capability{"CHOWN"}, 372 + }, 373 + }, 374 + VolumeMounts: []corev1.VolumeMount{ 375 + { 376 + Name: "etc-override", 377 + MountPath: "/etc-override", 378 + }, 379 + { 380 + Name: "home-override", 381 + MountPath: "/home-override", 382 + }, 383 + { 384 + Name: "buildah-storage", 385 + MountPath: "/var/lib/containers", 386 + }, 387 + }, 388 + }, 389 + { 390 + Name: "install-runner", 391 + Image: config.LoomImage, 392 + Command: []string{"/loom-runner", "--install", "/runner-bin/loom-runner"}, 393 + SecurityContext: &corev1.SecurityContext{ 394 + AllowPrivilegeEscalation: &[]bool{false}[0], 395 + RunAsNonRoot: &[]bool{true}[0], 396 + RunAsUser: &[]int64{1000}[0], 397 + ReadOnlyRootFilesystem: &[]bool{true}[0], 398 + Capabilities: &corev1.Capabilities{ 399 + Drop: []corev1.Capability{"ALL"}, 400 + }, 401 + }, 402 + VolumeMounts: []corev1.VolumeMount{ 403 + { 404 + Name: "runner-binary", 405 + MountPath: "/runner-bin", 406 + }, 407 + }, 408 + }, 409 + } 410 + 411 + // Only add clone container if cloning is not skipped 412 + if !config.SkipClone { 413 + initContainers = append(initContainers, buildCloneInitContainer(config)) 414 + } 415 + 416 + return initContainers 417 + } 418 + 407 419 // buildCloneInitContainer creates the init container for cloning the git repository. 408 420 // Uses the shared clone command builder from tangled.org/core/spindle/steps. 409 421 func buildCloneInitContainer(config WorkflowConfig) corev1.Container { 410 - // If clone is skipped, return an empty container (will be filtered out) 411 - if config.SkipClone { 412 - return corev1.Container{} 413 - } 414 422 415 423 // Build the shell script from clone commands 416 424 // Add set -e for error handling, safe.directory config to handle ownership mismatch
+61
internal/jobbuilder/job_template_test.go
··· 265 265 } 266 266 } 267 267 268 + func TestBuildInitContainers(t *testing.T) { 269 + tests := []struct { 270 + name string 271 + config WorkflowConfig 272 + wantContainers int 273 + wantCloneExists bool 274 + }{ 275 + { 276 + name: "includes clone container when SkipClone is false", 277 + config: WorkflowConfig{ 278 + LoomImage: "loom:latest", 279 + SkipClone: false, 280 + CloneCommands: []string{"git clone https://example.com/repo"}, 281 + }, 282 + wantContainers: 3, // setup-user, install-runner, clone-repo 283 + wantCloneExists: true, 284 + }, 285 + { 286 + name: "excludes clone container when SkipClone is true", 287 + config: WorkflowConfig{ 288 + LoomImage: "loom:latest", 289 + SkipClone: true, 290 + }, 291 + wantContainers: 2, // setup-user, install-runner 292 + wantCloneExists: false, 293 + }, 294 + } 295 + 296 + for _, tt := range tests { 297 + t.Run(tt.name, func(t *testing.T) { 298 + containers := buildInitContainers(tt.config) 299 + 300 + if len(containers) != tt.wantContainers { 301 + t.Errorf("buildInitContainers() = %d containers, want %d", len(containers), tt.wantContainers) 302 + for i, c := range containers { 303 + t.Logf(" container[%d]: %s", i, c.Name) 304 + } 305 + } 306 + 307 + // Verify all containers have valid names (non-empty) 308 + for i, c := range containers { 309 + if c.Name == "" { 310 + t.Errorf("buildInitContainers() container[%d] has empty name", i) 311 + } 312 + } 313 + 314 + // Check for clone container 315 + hasClone := false 316 + for _, c := range containers { 317 + if c.Name == "clone-repo" { 318 + hasClone = true 319 + break 320 + } 321 + } 322 + if hasClone != tt.wantCloneExists { 323 + t.Errorf("buildInitContainers() clone-repo exists = %v, want %v", hasClone, tt.wantCloneExists) 324 + } 325 + }) 326 + } 327 + } 328 + 268 329 func TestBuildJob(t *testing.T) { 269 330 tests := []struct { 270 331 name string