Kubernetes Operator for Tangled Spindles

Unify Dockerfile, create a release flow

evan.jarrett.net ee82083d bf34eb23

verified
+703 -69
+32
.tangled/workflows/release.yaml
··· 1 + # ATCR Release Pipeline for Tangled.org 2 + # Triggers on version tags and builds cross-platform binaries using buildah 3 + 4 + when: 5 + - event: ["push"] 6 + tag: ["v*"] 7 + 8 + engine: kubernetes 9 + image: quay.io/buildah/stable:latest 10 + architecture: amd64 11 + 12 + environment: 13 + IMAGE_REGISTRY: atcr.io 14 + 15 + steps: 16 + - name: Login to registry 17 + command: | 18 + echo "${APP_PASSWORD}" | buildah login \ 19 + -u "${TANGLED_REPO_DID}" \ 20 + --password-stdin \ 21 + ${IMAGE_REGISTRY} 22 + 23 + - name: Build and push Loom image 24 + command: | 25 + buildah bud \ 26 + --tag ${IMAGE_REGISTRY}/${TANGLED_REPO_DID}/${TANGLED_REPO_NAME}:${TANGLED_REF_NAME} \ 27 + --tag ${IMAGE_REGISTRY}/${TANGLED_REPO_DID}/${TANGLED_REPO_NAME}:latest \ 28 + --file ./Dockerfile \ 29 + . 30 + 31 + buildah push ${IMAGE_REGISTRY}/${TANGLED_REPO_DID}/${TANGLED_REPO_NAME}:latest 32 + buildah push ${IMAGE_REGISTRY}/${TANGLED_REPO_DID}/${TANGLED_REPO_NAME}:${TANGLED_REF_NAME}
+11 -7
.tangled/workflows/workflow-amd64.yaml
··· 7 7 image: golang:1.25-trixie 8 8 architecture: amd64 9 9 10 + environment: 11 + IMAGE_REGISTRY: atcr.io 12 + 10 13 steps: 11 - - name: build manager binary 14 + - name: test environment vars 12 15 command: | 13 - make build 16 + printenv 14 17 15 - - name: verify build artifacts 18 + - name: Login to registry 16 19 command: | 17 - ls -lh bin/ 20 + echo "${APP_PASSWORD}" | buildah login \ 21 + -u "${TANGLED_REPO_DID}" \ 22 + --password-stdin \ 23 + ${IMAGE_REGISTRY} 24 + 18 25 19 - - name: hello 20 - command: | 21 - echo "hello"
+23 -20
Dockerfile
··· 1 - # Build the manager binary 2 - FROM golang:1.24 AS builder 1 + # Build both binaries 2 + FROM --platform=$BUILDPLATFORM golang:1.24 AS builder 3 + 3 4 ARG TARGETOS 4 5 ARG TARGETARCH 5 6 6 7 WORKDIR /workspace 7 - # Copy the core module (for local replace directive) 8 + 9 + # Copy core dependency (from replace directive in go.mod) 8 10 COPY core/ core/ 9 - # Copy the Go Modules manifests 10 - COPY loom/go.mod loom/go.mod 11 - COPY loom/go.sum loom/go.sum 12 - # cache deps before building and copying source so that we don't need to re-download as much 13 - # and so that source changes don't invalidate our downloaded layer 11 + 12 + # Copy loom go mod files and download deps 13 + COPY loom/go.mod loom/go.sum loom/ 14 14 WORKDIR /workspace/loom 15 15 RUN go mod download 16 16 17 - # Copy the go source 18 - COPY loom/cmd/controller/main.go cmd/controller/main.go 17 + # Copy loom source code 19 18 COPY loom/api/ api/ 19 + COPY loom/cmd/ cmd/ 20 20 COPY loom/internal/ internal/ 21 21 22 - # Build 23 - # CGO is required for go-sqlite3 24 - RUN CGO_ENABLED=1 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a -o manager cmd/controller/main.go 22 + # Build runner (static, no CGO) 23 + # Use -s -w to strip debug symbols and reduce binary size 24 + RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \ 25 + go build -a -ldflags='-s -w -extldflags "-static"' \ 26 + -o loom-runner ./cmd/runner 25 27 26 - # Use debian-slim for debugging (normally we'd use distroless) 27 - # Refer to https://github.com/GoogleContainerTools/distroless for more details 28 - FROM debian:12-slim 29 - RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/* 30 - WORKDIR / 31 - COPY --from=builder /workspace/loom/manager . 
32 - USER 65532:65532 28 + # Build controller (requires CGO for sqlite3) 29 + # Use -s -w to strip debug symbols and reduce binary size 30 + RUN CGO_ENABLED=1 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \ 31 + go build -a -ldflags='-s -w' -o manager ./cmd/controller 33 32 33 + # Unified image with both binaries 34 + FROM gcr.io/distroless/base-debian12:nonroot 35 + COPY --from=builder /workspace/loom/manager /manager 36 + COPY --from=builder /workspace/loom/loom-runner /loom-runner 34 37 ENTRYPOINT ["/manager"]
+2 -2
Makefile
··· 225 225 --tag $(RUNNER_IMG) \ 226 226 --tag $(RUNNER_IMG_LATEST) \ 227 227 --push \ 228 - --file loom/cmd/runner/Dockerfile \ 228 + --file Dockerfile.runner \ 229 229 . 230 230 231 231 .PHONY: docker-build-runner-local 232 232 docker-build-runner-local: ## Build local runner image (single arch) for testing. 233 233 cd .. && $(CONTAINER_TOOL) build \ 234 234 --tag $(RUNNER_IMG_LATEST) \ 235 - --file loom/cmd/runner/Dockerfile \ 235 + --file Dockerfile.runner \ 236 236 . 237 237 238 238 .PHONY: test-registry-auth
+11 -4
cmd/controller/main.go
··· 397 397 } 398 398 }() 399 399 400 + // Get loom image from environment (used for runner init container) 401 + loomImage := os.Getenv("LOOM_IMAGE") 402 + if loomImage == "" { 403 + loomImage = "atcr.io/evan.jarrett.net/loom:latest" // default fallback 404 + } 405 + 400 406 // Setup controller with spindle components 401 407 if err := (&controller.SpindleSetReconciler{ 402 - Client: mgr.GetClient(), 403 - Scheme: mgr.GetScheme(), 404 - Config: mgr.GetConfig(), 405 - Spindle: s, 408 + Client: mgr.GetClient(), 409 + Scheme: mgr.GetScheme(), 410 + Config: mgr.GetConfig(), 411 + Spindle: s, 412 + LoomImage: loomImage, 406 413 }).SetupWithManager(mgr); err != nil { 407 414 setupLog.Error(err, "unable to create controller", "controller", "SpindleSet") 408 415 os.Exit(1)
-35
cmd/runner/Dockerfile
··· 1 - FROM --platform=$BUILDPLATFORM golang:1.25 AS builder 2 - 3 - ARG TARGETOS 4 - ARG TARGETARCH 5 - 6 - WORKDIR /workspace 7 - 8 - # Copy core dependency (from replace directive in go.mod) 9 - COPY core/ core/ 10 - 11 - # Copy loom go mod files 12 - COPY loom/go.mod loom/go.sum loom/ 13 - WORKDIR /workspace/loom 14 - RUN go mod download 15 - 16 - # Copy loom source code 17 - COPY loom/api/ api/ 18 - COPY loom/cmd/runner/ cmd/runner/ 19 - 20 - # Build static binary 21 - RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \ 22 - go build -a -ldflags '-extldflags "-static"' \ 23 - -o loom-runner ./cmd/runner 24 - 25 - # Use minimal base image 26 - FROM alpine:latest 27 - 28 - # Install ca-certificates for HTTPS 29 - RUN apk --no-cache add ca-certificates 30 - 31 - # Copy the binary from builder (built in /workspace/loom/) 32 - COPY --from=builder /workspace/loom/loom-runner /loom-runner 33 - 34 - # Set entrypoint 35 - ENTRYPOINT ["/loom-runner"]
+2
config/manager/manager.yaml
··· 74 74 valueFrom: 75 75 fieldRef: 76 76 fieldPath: metadata.namespace 77 + - name: LOOM_IMAGE 78 + value: "atcr.io/evan.jarrett.net/loom:latest" 77 79 - name: SPINDLE_SERVER_HOSTNAME 78 80 value: "loom.jarrett.net" 79 81 - name: SPINDLE_SERVER_OWNER
+612
docs/proposals/runtime-interface-upstream.md
··· 1 + # Proposal: Runtime Interface for Spindle Engines 2 + 3 + **Status:** Draft 4 + **Author:** @evanjarrett 5 + **Date:** 2025-12-09 6 + 7 + ## Summary 8 + 9 + Extract a `Runtime` interface from the nixery engine to enable: 10 + 1. A new `engine:docker` that accepts user-specified images 11 + 2. Support for multiple container runtimes (Docker, Podman) 12 + 3. Downstream implementations (e.g., Kubernetes in Loom) 13 + 14 + ## Motivation 15 + 16 + Currently, the spindle engine architecture tightly couples workflow execution logic with Docker-specific container management in the nixery engine. This creates several limitations: 17 + 18 + 1. **No user-specified images**: Users must declare Nix dependencies; they can't use pre-built Docker images like `node:20` or `golang:1.22` 19 + 20 + 2. **Single runtime**: Only Docker daemon is supported; no path to Podman or other OCI runtimes 21 + 22 + 3. **Downstream friction**: Loom (Kubernetes-based spindle) must reimplement the entire Engine interface rather than reusing workflow parsing and step execution logic 23 + 24 + ## Proposal 25 + 26 + ### New Runtime Interface 27 + 28 + Create `spindle/models/runtime.go`: 29 + 30 + ```go 31 + package models 32 + 33 + import ( 34 + "context" 35 + "io" 36 + ) 37 + 38 + // Runtime abstracts container/job execution environments. 39 + // Implementations: Docker, Podman, (downstream: Kubernetes) 40 + type Runtime interface { 41 + // Setup creates the execution environment and returns a handle. 42 + // The environment should be ready for Exec calls after Setup returns. 43 + Setup(ctx context.Context, opts SetupOpts) (Handle, error) 44 + 45 + // Exec runs a command in the environment. 46 + // For container runtimes, this is typically `exec` into a running container. 47 + // For job-based runtimes (K8s), this may stream logs from a pre-defined job. 
48 + Exec(ctx context.Context, h Handle, opts ExecOpts) (*ExecResult, error) 49 + 50 + // Destroy tears down the environment and releases resources. 51 + Destroy(ctx context.Context, h Handle) error 52 + } 53 + 54 + // SetupOpts configures the execution environment. 55 + type SetupOpts struct { 56 + // Image is the container image to use (e.g., "node:20", "nixery.dev/shell/git") 57 + Image string 58 + 59 + // WorkflowID uniquely identifies this workflow run (used for labeling/naming) 60 + WorkflowID WorkflowId 61 + 62 + // WorkDir is the working directory inside the container (e.g., "/tangled/workspace") 63 + WorkDir string 64 + 65 + // Labels for the container/job (e.g., {"sh.tangled.pipeline/workflow_id": "..."}) 66 + Labels map[string]string 67 + 68 + // Security options 69 + DropAllCaps bool 70 + AddCaps []string // e.g., ["DAC_OVERRIDE", "CHOWN", "FOWNER", "SETUID", "SETGID"] 71 + 72 + // Architecture hint for multi-arch scheduling (used by K8s runtime) 73 + Architecture string 74 + } 75 + 76 + // ExecOpts configures a single command execution. 77 + type ExecOpts struct { 78 + // Command to run (e.g., ["bash", "-c", "npm install"]) 79 + Command []string 80 + 81 + // Environment variables 82 + Env []string 83 + 84 + // Output streams (nil = discard) 85 + Stdout io.Writer 86 + Stderr io.Writer 87 + } 88 + 89 + // ExecResult contains the outcome of an Exec call. 90 + type ExecResult struct { 91 + ExitCode int 92 + OOMKilled bool 93 + } 94 + 95 + // Handle is an opaque reference to an execution environment. 96 + type Handle interface { 97 + // ID returns a unique identifier for this environment (container ID, job name, etc.) 98 + ID() string 99 + } 100 + 101 + // RuntimeMode indicates how the runtime executes steps. 102 + type RuntimeMode int 103 + 104 + const ( 105 + // RuntimeModeExec means steps are executed one at a time via Exec calls. 106 + // Used by Docker/Podman where we exec into a running container. 
107 + RuntimeModeExec RuntimeMode = iota 108 + 109 + // RuntimeModeBatch means all steps run in a single invocation. 110 + // Used by Kubernetes where a Job runs all steps and the engine streams logs. 111 + // In this mode, Exec() streams logs rather than executing commands. 112 + RuntimeModeBatch 113 + ) 114 + 115 + // RuntimeInfo provides metadata about a runtime implementation. 116 + type RuntimeInfo interface { 117 + Mode() RuntimeMode 118 + } 119 + ``` 120 + 121 + ### Refactored Engine Structure 122 + 123 + Engines become thin wrappers focused on image resolution and step generation: 124 + 125 + ```go 126 + // engines/base/engine.go - shared logic 127 + package base 128 + 129 + type Engine struct { 130 + Runtime models.Runtime 131 + Logger *slog.Logger 132 + Handles map[string]models.Handle 133 + mu sync.Mutex 134 + } 135 + 136 + func (e *Engine) SetupWorkflow(ctx context.Context, wid models.WorkflowId, wf *models.Workflow) error { 137 + data := wf.Data.(WorkflowData) 138 + 139 + handle, err := e.Runtime.Setup(ctx, models.SetupOpts{ 140 + Image: data.Image, 141 + WorkflowID: wid, 142 + WorkDir: "/tangled/workspace", 143 + Labels: map[string]string{"sh.tangled.pipeline/workflow_id": wid.String()}, 144 + DropAllCaps: true, 145 + AddCaps: []string{"DAC_OVERRIDE", "CHOWN", "FOWNER", "SETUID", "SETGID"}, 146 + }) 147 + if err != nil { 148 + return err 149 + } 150 + 151 + e.mu.Lock() 152 + e.Handles[wid.String()] = handle 153 + e.mu.Unlock() 154 + 155 + // Create workspace directories 156 + _, err = e.Runtime.Exec(ctx, handle, models.ExecOpts{ 157 + Command: []string{"mkdir", "-p", "/tangled/workspace", "/tangled/home"}, 158 + }) 159 + return err 160 + } 161 + 162 + func (e *Engine) RunStep(ctx context.Context, wid models.WorkflowId, w *models.Workflow, idx int, secrets []secrets.UnlockedSecret, wfLogger *models.WorkflowLogger) error { 163 + e.mu.Lock() 164 + handle := e.Handles[wid.String()] 165 + e.mu.Unlock() 166 + 167 + step := w.Steps[idx] 168 + env := 
buildEnvs(w.Environment, step, secrets) 169 + 170 + var stdout, stderr io.Writer 171 + if wfLogger != nil { 172 + stdout = wfLogger.DataWriter(idx, "stdout") 173 + stderr = wfLogger.DataWriter(idx, "stderr") 174 + } 175 + 176 + result, err := e.Runtime.Exec(ctx, handle, models.ExecOpts{ 177 + Command: []string{"bash", "-c", step.Command()}, 178 + Env: env, 179 + Stdout: stdout, 180 + Stderr: stderr, 181 + }) 182 + if err != nil { 183 + return err 184 + } 185 + if result.OOMKilled { 186 + return ErrOOMKilled 187 + } 188 + if result.ExitCode != 0 { 189 + return engine.ErrWorkflowFailed 190 + } 191 + return nil 192 + } 193 + 194 + func (e *Engine) DestroyWorkflow(ctx context.Context, wid models.WorkflowId) error { 195 + e.mu.Lock() 196 + handle, exists := e.Handles[wid.String()] 197 + delete(e.Handles, wid.String()) 198 + e.mu.Unlock() 199 + 200 + if !exists { 201 + return nil 202 + } 203 + return e.Runtime.Destroy(ctx, handle) 204 + } 205 + ``` 206 + 207 + ### Engine Implementations 208 + 209 + **Nixery Engine** (image from dependencies): 210 + 211 + ```go 212 + // engines/nixery/engine.go 213 + package nixery 214 + 215 + type Engine struct { 216 + *base.Engine 217 + cfg *config.Config 218 + } 219 + 220 + func (e *Engine) InitWorkflow(twf tangled.Pipeline_Workflow, tpl tangled.Pipeline) (*models.Workflow, error) { 221 + var spec struct { 222 + Steps []StepSpec `yaml:"steps"` 223 + Dependencies map[string][]string `yaml:"dependencies"` 224 + Environment map[string]string `yaml:"environment"` 225 + } 226 + yaml.Unmarshal([]byte(twf.Raw), &spec) 227 + 228 + // NIXERY-SPECIFIC: Build image URL from dependencies 229 + image := workflowImage(spec.Dependencies, e.cfg.NixeryPipelines.Nixery) 230 + 231 + steps := []models.Step{} 232 + 233 + // Add nixery-specific setup steps 234 + steps = append(steps, nixConfStep()) 235 + steps = append(steps, models.BuildCloneStep(twf, *tpl.TriggerMetadata, e.cfg.Server.Dev)) 236 + if depStep := dependencyStep(spec.Dependencies); depStep != 
nil { 237 + steps = append(steps, *depStep) 238 + } 239 + 240 + // Add user steps 241 + for _, s := range spec.Steps { 242 + steps = append(steps, Step{name: s.Name, command: s.Command, ...}) 243 + } 244 + 245 + return &models.Workflow{ 246 + Name: twf.Name, 247 + Steps: steps, 248 + Environment: spec.Environment, 249 + Data: base.WorkflowData{Image: image}, 250 + }, nil 251 + } 252 + 253 + func (e *Engine) WorkflowTimeout() time.Duration { 254 + // ... existing config-based timeout 255 + } 256 + ``` 257 + 258 + **Docker Engine** (user-specified image): 259 + 260 + ```go 261 + // engines/docker/engine.go 262 + package docker 263 + 264 + type Engine struct { 265 + *base.Engine 266 + } 267 + 268 + func (e *Engine) InitWorkflow(twf tangled.Pipeline_Workflow, tpl tangled.Pipeline) (*models.Workflow, error) { 269 + var spec struct { 270 + Image string `yaml:"image"` 271 + Steps []StepSpec `yaml:"steps"` 272 + Environment map[string]string `yaml:"environment"` 273 + } 274 + yaml.Unmarshal([]byte(twf.Raw), &spec) 275 + 276 + // DOCKER-SPECIFIC: Require explicit image 277 + if spec.Image == "" { 278 + return nil, fmt.Errorf("docker engine requires 'image' field in workflow") 279 + } 280 + 281 + steps := []models.Step{} 282 + 283 + // Add clone step (shared with nixery) 284 + steps = append(steps, models.BuildCloneStep(twf, *tpl.TriggerMetadata, false)) 285 + 286 + // Add user steps 287 + for _, s := range spec.Steps { 288 + steps = append(steps, SimpleStep{Name: s.Name, Command: s.Command, ...}) 289 + } 290 + 291 + return &models.Workflow{ 292 + Name: twf.Name, 293 + Steps: steps, 294 + Environment: spec.Environment, 295 + Data: base.WorkflowData{Image: spec.Image}, 296 + }, nil 297 + } 298 + 299 + func (e *Engine) WorkflowTimeout() time.Duration { 300 + return 1 * time.Hour // default 301 + } 302 + ``` 303 + 304 + ### Runtime Implementations 305 + 306 + **Docker Runtime**: 307 + 308 + ```go 309 + // runtime/docker/runtime.go 310 + package docker 311 + 312 + type Runtime 
struct { 313 + client client.APIClient 314 + logger *slog.Logger 315 + } 316 + 317 + type handle struct { 318 + containerID string 319 + networkID string 320 + } 321 + 322 + func (h *handle) ID() string { return h.containerID } 323 + 324 + func (r *Runtime) Setup(ctx context.Context, opts models.SetupOpts) (models.Handle, error) { 325 + // Create network 326 + netResp, _ := r.client.NetworkCreate(ctx, networkName(opts.WorkflowID), ...) 327 + 328 + // Pull image 329 + reader, _ := r.client.ImagePull(ctx, opts.Image, image.PullOptions{}) 330 + io.Copy(io.Discard, reader) 331 + reader.Close() 332 + 333 + // Create container 334 + resp, _ := r.client.ContainerCreate(ctx, &container.Config{ 335 + Image: opts.Image, 336 + Cmd: []string{"cat"}, 337 + OpenStdin: true, 338 + WorkingDir: opts.WorkDir, 339 + Labels: opts.Labels, 340 + }, &container.HostConfig{ 341 + CapDrop: capDrop(opts.DropAllCaps), 342 + CapAdd: opts.AddCaps, 343 + // ... mounts, security opts 344 + }, nil, nil, "") 345 + 346 + r.client.ContainerStart(ctx, resp.ID, container.StartOptions{}) 347 + 348 + return &handle{containerID: resp.ID, networkID: netResp.ID}, nil 349 + } 350 + 351 + func (r *Runtime) Exec(ctx context.Context, h models.Handle, opts models.ExecOpts) (*models.ExecResult, error) { 352 + dh := h.(*handle) 353 + 354 + execResp, _ := r.client.ContainerExecCreate(ctx, dh.containerID, container.ExecOptions{ 355 + Cmd: opts.Command, 356 + Env: opts.Env, 357 + AttachStdout: true, 358 + AttachStderr: true, 359 + }) 360 + 361 + attach, _ := r.client.ContainerExecAttach(ctx, execResp.ID, container.ExecAttachOptions{}) 362 + defer attach.Close() 363 + 364 + stdcopy.StdCopy(opts.Stdout, opts.Stderr, attach.Reader) 365 + 366 + inspect, _ := r.client.ContainerExecInspect(ctx, execResp.ID) 367 + 368 + // Check OOMKilled 369 + containerInspect, _ := r.client.ContainerInspect(ctx, dh.containerID) 370 + 371 + return &models.ExecResult{ 372 + ExitCode: inspect.ExitCode, 373 + OOMKilled: 
containerInspect.State.OOMKilled, 374 + }, nil 375 + } 376 + 377 + func (r *Runtime) Destroy(ctx context.Context, h models.Handle) error { 378 + dh := h.(*handle) 379 + r.client.ContainerStop(ctx, dh.containerID, container.StopOptions{}) 380 + r.client.ContainerRemove(ctx, dh.containerID, container.RemoveOptions{RemoveVolumes: true}) 381 + r.client.NetworkRemove(ctx, dh.networkID) 382 + return nil 383 + } 384 + ``` 385 + 386 + **Podman Runtime**: 387 + 388 + ```go 389 + // runtime/podman/runtime.go 390 + package podman 391 + 392 + // Podman is API-compatible with Docker for most operations. 393 + // This runtime uses the Podman socket instead of Docker socket. 394 + 395 + type Runtime struct { 396 + client *podman.APIClient // or use Docker client with Podman socket 397 + logger *slog.Logger 398 + } 399 + 400 + // Implementation nearly identical to Docker runtime. 401 + // Key differences: 402 + // - Socket path: /run/user/1000/podman/podman.sock (rootless) or /run/podman/podman.sock 403 + // - Some API differences in network handling 404 + // - Native support for rootless containers 405 + ``` 406 + 407 + ### Kubernetes Runtime (Downstream - Loom) 408 + 409 + ```go 410 + // In loom repo: runtime/kubernetes/runtime.go 411 + package kubernetes 412 + 413 + type Runtime struct { 414 + client client.Client 415 + config *rest.Config 416 + namespace string 417 + template SpindleTemplate 418 + } 419 + 420 + func (r *Runtime) Mode() models.RuntimeMode { 421 + return models.RuntimeModeBatch // All steps run in single Job 422 + } 423 + 424 + func (r *Runtime) Setup(ctx context.Context, opts models.SetupOpts) (models.Handle, error) { 425 + // Build Job spec with: 426 + // - Init containers for setup (user namespace, clone, etc.) 
427 + // - Main container running loom-runner with all steps 428 + // - Node affinity based on opts.Architecture 429 + 430 + job := jobbuilder.BuildJob(jobbuilder.WorkflowConfig{ 431 + Image: opts.Image, 432 + Architecture: opts.Architecture, 433 + // ... steps passed via ConfigMap 434 + }) 435 + 436 + r.client.Create(ctx, job) 437 + 438 + // Wait for pod to be running 439 + pod := waitForPod(ctx, job) 440 + 441 + return &k8sHandle{ 442 + jobName: job.Name, 443 + podName: pod.Name, 444 + }, nil 445 + } 446 + 447 + func (r *Runtime) Exec(ctx context.Context, h models.Handle, opts models.ExecOpts) (*models.ExecResult, error) { 448 + // In batch mode, Exec streams logs rather than executing commands. 449 + // The loom-runner binary inside the Job executes all steps. 450 + // This method reads log output and returns when the step completes. 451 + 452 + kh := h.(*k8sHandle) 453 + 454 + // Stream logs from pod, parse JSON log lines from loom-runner 455 + // Return when step end marker is seen 456 + 457 + return &models.ExecResult{ExitCode: 0}, nil 458 + } 459 + 460 + func (r *Runtime) Destroy(ctx context.Context, h models.Handle) error { 461 + kh := h.(*k8sHandle) 462 + // Delete Job (GC handles pod cleanup) 463 + return r.client.Delete(ctx, &batchv1.Job{ObjectMeta: metav1.ObjectMeta{Name: kh.jobName}}) 464 + } 465 + ``` 466 + 467 + ### Workflow YAML Examples 468 + 469 + **engine:nixery** (current behavior): 470 + ```yaml 471 + engine: nixery 472 + dependencies: 473 + nixpkgs: 474 + - nodejs 475 + - python3 476 + steps: 477 + - name: build 478 + command: npm run build 479 + ``` 480 + 481 + **engine:docker** (new): 482 + ```yaml 483 + engine: docker 484 + image: node:20-alpine 485 + steps: 486 + - name: install 487 + command: npm ci 488 + - name: build 489 + command: npm run build 490 + - name: test 491 + command: npm test 492 + ``` 493 + 494 + **engine:kubernetes** (downstream, in Loom): 495 + ```yaml 496 + engine: kubernetes 497 + image: golang:1.22 498 + architecture: 
arm64 499 + steps: 500 + - name: build 501 + command: go build ./... 502 + - name: test 503 + command: go test ./... 504 + ``` 505 + 506 + ### Server Wiring 507 + 508 + ```go 509 + // server.go 510 + func Run(ctx context.Context) error { 511 + cfg, err := config.Load(ctx) 512 + 513 + // Create runtime based on config 514 + var rt models.Runtime 515 + switch cfg.Runtime.Type { 516 + case "docker": 517 + dockerClient, _ := client.NewClientWithOpts(client.FromEnv) 518 + rt = docker.NewRuntime(dockerClient, logger) 519 + case "podman": 520 + podmanClient, _ := podman.NewClient(cfg.Runtime.Podman.Socket) 521 + rt = podman.NewRuntime(podmanClient, logger) 522 + default: 523 + rt = docker.NewRuntime(...) // default 524 + } 525 + 526 + // Create engines with shared runtime 527 + nixeryEng := nixery.New(rt, cfg, logger) 528 + dockerEng := dockerengine.New(rt, logger) 529 + 530 + s, _ := New(ctx, cfg, map[string]models.Engine{ 531 + "nixery": nixeryEng, 532 + "docker": dockerEng, 533 + }) 534 + 535 + return s.Start(ctx) 536 + } 537 + ``` 538 + 539 + ### Configuration 540 + 541 + ```toml 542 + # spindle.toml 543 + 544 + [runtime] 545 + type = "docker" # or "podman" 546 + 547 + [runtime.docker] 548 + # Uses DOCKER_HOST env var by default 549 + 550 + [runtime.podman] 551 + socket = "/run/user/1000/podman/podman.sock" 552 + ``` 553 + 554 + ## Migration Path 555 + 556 + ### Phase 1: Extract Runtime Interface 557 + - Add `models/runtime.go` with interface definition 558 + - Add `runtime/docker/` implementation 559 + - Refactor nixery to use docker runtime internally 560 + - No breaking changes to existing workflows 561 + 562 + ### Phase 2: Add Docker Engine 563 + - Add `engines/docker/` that uses same runtime 564 + - Register as `"docker"` in engine map 565 + - Users can now use `engine: docker` with explicit images 566 + 567 + ### Phase 3: Add Podman Runtime 568 + - Add `runtime/podman/` implementation 569 + - Add config option to select runtime 570 + - Podman users can run 
nixery/docker engines without Docker daemon 571 + 572 + ### Phase 4: Downstream Kubernetes (Loom) 573 + - Loom implements `runtime/kubernetes/` 574 + - Can register `"kubernetes"` engine or reuse `"docker"`/`"nixery"` engines with K8s runtime 575 + - Maintains current Job + loom-runner architecture 576 + 577 + ## Alternatives Considered 578 + 579 + ### 1. Keep engines monolithic 580 + - Pro: Simpler, less abstraction 581 + - Con: Code duplication, can't swap runtimes, harder for downstream 582 + 583 + ### 2. Docker-in-Docker for Kubernetes 584 + - Pro: Identical behavior to local execution 585 + - Con: Security concerns, complexity, resource overhead 586 + 587 + ### 3. Runtime as engine parameter 588 + - Pro: More flexible per-workflow 589 + - Con: Overcomplicates workflow YAML, runtime is deployment choice not user choice 590 + 591 + ## Open Questions 592 + 593 + 1. **Should runtime selection be per-engine or global?** 594 + - Proposal: Global (deployment config), not per-workflow 595 + 596 + 2. **How to handle runtime-specific features?** 597 + - E.g., K8s node affinity, Docker network modes 598 + - Proposal: `SetupOpts` has optional fields; runtimes ignore unsupported options 599 + 600 + 3. **Should we upstream the Kubernetes runtime?** 601 + - Proposal: No, keep in Loom. Upstream provides interface, downstream implements. 602 + 603 + 4. **Podman rootless considerations?** 604 + - User namespaces, different capability handling 605 + - Need testing matrix 606 + 607 + ## References 608 + 609 + - [Current nixery engine](/home/data/core/spindle/engines/nixery/engine.go) 610 + - [Loom KubernetesEngine](/home/data/loom/internal/engine/kubernetes_engine.go) 611 + - [Docker Engine API](https://docs.docker.com/engine/api/) 612 + - [Podman API](https://docs.podman.io/en/latest/_static/api.html)
+5
internal/controller/spindleset_controller.go
··· 49 49 Config *rest.Config 50 50 Spindle *spindle.Spindle 51 51 52 + // LoomImage is the loom image containing the runner binary 53 + // Set from LOOM_IMAGE environment variable 54 + LoomImage string 55 + 52 56 // Track watched Jobs for status reporting 53 57 watchedJobs sync.Map // map[string]models.WorkflowId 54 58 } ··· 439 443 PipelineID: pipelineRun.PipelineID, 440 444 SpindleSetName: spindleSet.Name, 441 445 Image: workflowSpec.Image, 446 + LoomImage: r.LoomImage, 442 447 Architecture: workflowSpec.Architecture, 443 448 Steps: jobSteps, 444 449 WorkflowSpec: workflowSpec, // Pass full workflow spec to runner
+5 -1
internal/jobbuilder/job_template.go
··· 34 34 // Image is the container image to use for execution 35 35 Image string 36 36 37 + // LoomImage is the loom image containing the runner binary 38 + // Used by the install-runner init container 39 + LoomImage string 40 + 37 41 // Architecture is the target architecture (amd64, arm64) 38 42 Architecture string 39 43 ··· 263 267 }, 264 268 { 265 269 Name: "install-runner", 266 - Image: "atcr.io/evan.jarrett.net/loom-runner:latest", 270 + Image: config.LoomImage, 267 271 Command: []string{"cp", "/loom-runner", "/runner-bin/loom-runner"}, 268 272 SecurityContext: &corev1.SecurityContext{ 269 273 AllowPrivilegeEscalation: &[]bool{false}[0],