Kubernetes Operator for Tangled Spindles

add jobbuilder, code cleanup, try to use upstream spindle models

evan.jarrett.net 6ea91538 7f3173ca

verified
+190 -1322
-1066
Kubernetes Operator for Github Actions Runner.md
··· 1 - # Kubernetes Operator for Github Actions Runner 2 - Created at 2025-11-03 05:24:53 UTC 3 - 4 - > @ 2025-11-03 05:24:53 UTC 5 - > 6 - > I want to implement a kubernetes operator similar to github actions runner controller. I want to primarily focus on the logic for spawning a “runner” container via jobs and polling logs as it’s processing. I also want to make sure i can build docker containers within these runners. And the jobs should have logic to target nodes based on os architecture. This is supposed to be a rough poc, i already have go logic that works for reading a yaml file and figuring out what workflow to run. It just needs to send that job/step to a job in kubes 7 - 8 - <details><summary>Thinking</summary> 9 - 10 - The user is asking for help implementing a Kubernetes operator similar to GitHub Actions runner controller. They want to: 11 - 1. Spawn runner containers via Jobs 12 - 2. Poll logs as it's processing 13 - 3. Build docker containers within runners 14 - 4. Target nodes based on OS architecture 15 - 5. This is a rough POC and they already have Go logic to read YAML and figure out what workflow to run 16 - 17 - I'll provide a comprehensive answer covering the implementation of these requirements. 18 - 19 - </details> 20 - 21 - I'll help you implement a Kubernetes operator for spawning runner containers similar to GitHub Actions Runner Controller. 
Here's a comprehensive approach for your POC: 22 - 23 - ## **Kubernetes Job Spawning Logic** 24 - 25 - Here's the core Go code structure for creating and managing Jobs: 26 - 27 - ```go 28 - package operator 29 - 30 - import ( 31 - "context" 32 - "fmt" 33 - batchv1 "k8s.io/api/batch/v1" 34 - corev1 "k8s.io/api/core/v1" 35 - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 36 - "k8s.io/client-go/kubernetes" 37 - ) 38 - 39 - type RunnerController struct { 40 - clientset *kubernetes.Clientset 41 - namespace string 42 - } 43 - 44 - func (rc *RunnerController) CreateRunnerJob(workflowID, stepName, stepCommand string, arch string) (*batchv1.Job, error) { 45 - job := &batchv1.Job{ 46 - ObjectMeta: metav1.ObjectMeta{ 47 - Name: fmt.Sprintf("runner-%s-%s", workflowID, stepName), 48 - Namespace: rc.namespace, 49 - Labels: map[string]string{ 50 - "workflow-id": workflowID, 51 - "step-name": stepName, 52 - "type": "runner", 53 - }, 54 - }, 55 - Spec: batchv1.JobSpec{ 56 - Template: corev1.PodTemplateSpec{ 57 - ObjectMeta: metav1.ObjectMeta{ 58 - Labels: map[string]string{ 59 - "workflow-id": workflowID, 60 - "step-name": stepName, 61 - }, 62 - }, 63 - Spec: corev1.PodSpec{ 64 - RestartPolicy: corev1.RestartPolicyNever, 65 - NodeSelector: map[string]string{ 66 - "kubernetes.io/arch": arch, // amd64, arm64, etc. 
67 - }, 68 - Containers: []corev1.Container{ 69 - { 70 - Name: "runner", 71 - Image: "docker:dind", // Docker in Docker image 72 - Command: []string{"/bin/sh", "-c"}, 73 - Args: []string{stepCommand}, 74 - Env: []corev1.EnvVar{ 75 - { 76 - Name: "DOCKER_HOST", 77 - Value: "tcp://localhost:2375", 78 - }, 79 - }, 80 - VolumeMounts: []corev1.VolumeMount{ 81 - { 82 - Name: "docker-graph-storage", 83 - MountPath: "/var/lib/docker", 84 - }, 85 - }, 86 - }, 87 - { 88 - Name: "dind-daemon", 89 - Image: "docker:dind", 90 - Command: []string{"dockerd"}, 91 - Args: []string{ 92 - "--host=tcp://0.0.0.0:2375", 93 - "--storage-driver=overlay2", 94 - }, 95 - SecurityContext: &corev1.SecurityContext{ 96 - Privileged: &[]bool{true}[0], 97 - }, 98 - VolumeMounts: []corev1.VolumeMount{ 99 - { 100 - Name: "docker-graph-storage", 101 - MountPath: "/var/lib/docker", 102 - }, 103 - }, 104 - }, 105 - }, 106 - Volumes: []corev1.Volume{ 107 - { 108 - Name: "docker-graph-storage", 109 - VolumeSource: corev1.VolumeSource{ 110 - EmptyDir: &corev1.EmptyDirVolumeSource{}, 111 - }, 112 - }, 113 - }, 114 - }, 115 - }, 116 - }, 117 - } 118 - 119 - return rc.clientset.BatchV1().Jobs(rc.namespace).Create(context.TODO(), job, metav1.CreateOptions{}) 120 - } 121 - ``` 122 - 123 - ## **Docker-in-Docker Configuration** 124 - 125 - For building Docker containers within your runners, you have two main approaches [^6][^2]: 126 - 127 - ### **Option 1: Docker-in-Docker (DinD)** 128 - The above example uses a sidecar container approach with the `docker:dind` image. This provides full Docker functionality but requires **privileged mode**. 
129 - 130 - ### **Option 2: Kaniko (Rootless Alternative)** 131 - For better security, consider using Kaniko for building images without Docker daemon: 132 - 133 - ```go 134 - // Alternative container spec for Kaniko 135 - { 136 - Name: "runner", 137 - Image: "gcr.io/kaniko-project/executor:latest", 138 - Args: []string{ 139 - "--dockerfile=Dockerfile", 140 - "--context=git://github.com/your-repo.git", 141 - "--destination=your-registry/image:tag", 142 - }, 143 - } 144 - ``` 145 - 146 - ## **Log Polling Implementation** 147 - 148 - Here's how to poll and stream logs from your runner Jobs [^1][^3]: 149 - 150 - ```go 151 - import ( 152 - "bufio" 153 - "io" 154 - "time" 155 - "k8s.io/client-go/kubernetes" 156 - ) 157 - 158 - func (rc *RunnerController) PollJobLogs(jobName string) error { 159 - // Wait for pod to be created 160 - time.Sleep(2 * time.Second) 161 - 162 - // Get pods for the job 163 - labelSelector := fmt.Sprintf("job-name=%s", jobName) 164 - pods, err := rc.clientset.CoreV1().Pods(rc.namespace).List( 165 - context.TODO(), 166 - metav1.ListOptions{LabelSelector: labelSelector}, 167 - ) 168 - if err != nil || len(pods.Items) == 0 { 169 - return fmt.Errorf("no pods found for job %s", jobName) 170 - } 171 - 172 - podName := pods.Items[0].Name 173 - 174 - // Stream logs 175 - req := rc.clientset.CoreV1().Pods(rc.namespace).GetLogs(podName, &corev1.PodLogOptions{ 176 - Follow: true, 177 - Container: "runner", 178 - Timestamps: true, 179 - }) 180 - 181 - stream, err := req.Stream(context.TODO()) 182 - if err != nil { 183 - return err 184 - } 185 - defer stream.Close() 186 - 187 - // Read and process logs 188 - reader := bufio.NewReader(stream) 189 - for { 190 - line, err := reader.ReadString('\n') 191 - if err != nil { 192 - if err == io.EOF { 193 - break 194 - } 195 - return err 196 - } 197 - // Process log line (send to channel, write to file, etc.) 
198 - fmt.Print(line) 199 - } 200 - 201 - return nil 202 - } 203 - ``` 204 - 205 - ## **Architecture-Based Node Targeting** 206 - 207 - To target specific node architectures, use **nodeSelector** or **nodeAffinity** [^1]: 208 - 209 - ```go 210 - // Enhanced node selection with affinity 211 - Spec: corev1.PodSpec{ 212 - Affinity: &corev1.Affinity{ 213 - NodeAffinity: &corev1.NodeAffinity{ 214 - RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ 215 - NodeSelectorTerms: []corev1.NodeSelectorTerm{ 216 - { 217 - MatchExpressions: []corev1.NodeSelectorRequirement{ 218 - { 219 - Key: "kubernetes.io/arch", 220 - Operator: corev1.NodeSelectorOpIn, 221 - Values: []string{arch}, // "amd64", "arm64" 222 - }, 223 - { 224 - Key: "kubernetes.io/os", 225 - Operator: corev1.NodeSelectorOpIn, 226 - Values: []string{"linux"}, 227 - }, 228 - }, 229 - }, 230 - }, 231 - }, 232 - }, 233 - }, 234 - // ... rest of pod spec 235 - } 236 - ``` 237 - 238 - ## **Job Monitoring and Cleanup** 239 - 240 - Add logic to monitor job completion and cleanup: 241 - 242 - ```go 243 - func (rc *RunnerController) WaitForJobCompletion(jobName string, timeout time.Duration) (*batchv1.Job, error) { 244 - ctx, cancel := context.WithTimeout(context.Background(), timeout) 245 - defer cancel() 246 - 247 - ticker := time.NewTicker(5 * time.Second) 248 - defer ticker.Stop() 249 - 250 - for { 251 - select { 252 - case <-ctx.Done(): 253 - return nil, fmt.Errorf("job %s timed out", jobName) 254 - case <-ticker.C: 255 - job, err := rc.clientset.BatchV1().Jobs(rc.namespace).Get( 256 - context.TODO(), jobName, metav1.GetOptions{}, 257 - ) 258 - if err != nil { 259 - return nil, err 260 - } 261 - 262 - if job.Status.Succeeded > 0 { 263 - return job, nil 264 - } 265 - if job.Status.Failed > 0 { 266 - return job, fmt.Errorf("job %s failed", jobName) 267 - } 268 - } 269 - } 270 - } 271 - 272 - func (rc *RunnerController) CleanupJob(jobName string) error { 273 - propagationPolicy := 
metav1.DeletePropagationBackground 274 - return rc.clientset.BatchV1().Jobs(rc.namespace).Delete( 275 - context.TODO(), 276 - jobName, 277 - metav1.DeleteOptions{ 278 - PropagationPolicy: &propagationPolicy, 279 - }, 280 - ) 281 - } 282 - ``` 283 - 284 - ## **Integration with Your Workflow Parser** 285 - 286 - Here's how to tie it all together with your existing YAML parser: 287 - 288 - ```go 289 - type WorkflowExecutor struct { 290 - controller *RunnerController 291 - } 292 - 293 - func (we *WorkflowExecutor) ExecuteWorkflow(workflow *WorkflowSpec) error { 294 - for _, step := range workflow.Steps { 295 - // Create job for step 296 - job, err := we.controller.CreateRunnerJob( 297 - workflow.ID, 298 - step.Name, 299 - step.Command, 300 - step.Architecture, // from your YAML 301 - ) 302 - if err != nil { 303 - return err 304 - } 305 - 306 - // Start log polling in goroutine 307 - go we.controller.PollJobLogs(job.Name) 308 - 309 - // Wait for completion 310 - completedJob, err := we.controller.WaitForJobCompletion(job.Name, 30*time.Minute) 311 - if err != nil { 312 - return err 313 - } 314 - 315 - // Optional: cleanup 316 - if workflow.CleanupOnComplete { 317 - we.controller.CleanupJob(job.Name) 318 - } 319 - } 320 - return nil 321 - } 322 - ``` 323 - 324 - ## **Key Considerations** 325 - 326 - - **Security**: Running Docker-in-Docker requires privileged containers. Consider using Kaniko or Buildah for rootless builds 327 - - **Resource Management**: Set appropriate resource limits on Jobs to prevent resource exhaustion 328 - - **Persistent Storage**: For caching Docker layers between builds, consider using PersistentVolumes instead of EmptyDir 329 - - **RBAC**: Ensure your operator has proper ServiceAccount permissions to create/delete Jobs and read logs 330 - 331 - This POC gives you a solid foundation similar to the GitHub Actions Runner Controller [^4][^5], with the core functionality you need for spawning runners, building containers, and monitoring execution. 
332 - 333 - 334 - _References_: 335 - [^1]: [Jobs | Kubernetes](https://kubernetes.io/docs/concepts/workloads/controllers/job/) 336 - [^2]: [Jenkins: running workers in Kubernetes and Docker images build](https://dev.to/setevoy/jenkins-running-workers-in-kubernetes-and-docker-images-build-1lef) 337 - [^3]: [kubectl logs - Kubernetes](https://kubernetes.io/docs/reference/kubectl/generated/kubectl_logs/) 338 - [^4]: [Kubernetes controller for GitHub Actions self-hosted runners](https://github.com/actions/actions-runner-controller) 339 - [^5]: [GitHub - evryfs/github-actions-runner-operator: K8S operator for...](https://github.com/evryfs/github-actions-runner-operator) 340 - [^6]: [Automating Docker Builds in Kubernetes | by Sebastien Goasguen](https://sebgoa.medium.com/automating-docker-builds-in-kubernetes-15ef047795ab) 341 - 342 - > @ 2025-11-03 05:37:24 UTC 343 - > 344 - > Yeah i think i want to support buildah primarily. Not to say we can’t have ways to run alternative images later. Why would i want to choose node affinity over nodeselector? 345 - 346 - <details><summary>Thinking</summary> 347 - 348 - The user is asking about why they would choose node affinity over nodeSelector, and they want to use Buildah primarily for container builds. Let me provide a comprehensive answer about the differences between nodeSelector and nodeAffinity, and then update the code examples to use Buildah instead of Docker-in-Docker. 349 - 350 - Based on the research provided: 351 - 1. NodeSelector is simpler but less flexible 352 - 2. NodeAffinity provides more advanced scheduling options 353 - 3. Buildah can run rootless, which is better for security 354 - 355 - Let me structure a response that addresses both points clearly. 356 - 357 - </details> 358 - 359 - Great choice on **Buildah**! It's much more secure since it can run rootless without requiring privileged containers. Let me explain the nodeAffinity vs nodeSelector decision and provide updated code for Buildah. 
360 - 361 - ## **NodeSelector vs NodeAffinity: When to Choose Which** 362 - 363 - ### **NodeSelector** 364 - - **Simple equality-based selection** [^4][^2] 365 - - Only supports "AND" logic between labels 366 - - More concise for basic requirements 367 - - Example: `kubernetes.io/arch: amd64` 368 - 369 - ### **NodeAffinity** 370 - NodeAffinity offers **significantly more flexibility** that you'll likely want for a runner controller [^2][^1]: 371 - 372 - 1. **Complex expressions**: Use operators like `In`, `NotIn`, `Exists`, `DoesNotExist`, `Gt`, `Lt` 373 - 2. **Soft preferences**: Use `preferredDuringSchedulingIgnoredDuringExecution` for optional requirements 374 - 3. **Multiple criteria**: Combine multiple conditions with OR logic 375 - 4. **Weighted preferences**: Assign weights to preferred nodes 376 - 377 - For your runner controller, **nodeAffinity is better** because: 378 - - You can handle fallback scenarios (prefer arm64 but accept amd64) 379 - - You can exclude certain nodes (avoid nodes with specific taints) 380 - - You can combine architecture with other requirements (GPU nodes, specific OS versions) 381 - 382 - ## **Buildah-Based Runner Implementation** 383 - 384 - Here's the updated implementation using Buildah: 385 - 386 - ```go 387 - package operator 388 - 389 - import ( 390 - "context" 391 - "fmt" 392 - batchv1 "k8s.io/api/batch/v1" 393 - corev1 "k8s.io/api/core/v1" 394 - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 395 - "k8s.io/client-go/kubernetes" 396 - ) 397 - 398 - type BuildahRunnerConfig struct { 399 - Registry string 400 - RegistryAuth string // base64 encoded auth 401 - StorageDriver string // overlay, vfs 402 - Architecture string // amd64, arm64 403 - } 404 - 405 - func (rc *RunnerController) CreateBuildahRunnerJob( 406 - workflowID, stepName string, 407 - buildContext string, 408 - config BuildahRunnerConfig, 409 - ) (*batchv1.Job, error) { 410 - 411 - job := &batchv1.Job{ 412 - ObjectMeta: metav1.ObjectMeta{ 413 - Name: 
fmt.Sprintf("runner-%s-%s", workflowID, stepName), 414 - Namespace: rc.namespace, 415 - Labels: map[string]string{ 416 - "workflow-id": workflowID, 417 - "step-name": stepName, 418 - "builder": "buildah", 419 - }, 420 - }, 421 - Spec: batchv1.JobSpec{ 422 - BackoffLimit: &[]int32{2}[0], 423 - Template: corev1.PodTemplateSpec{ 424 - ObjectMeta: metav1.ObjectMeta{ 425 - Labels: map[string]string{ 426 - "workflow-id": workflowID, 427 - "step-name": stepName, 428 - }, 429 - Annotations: map[string]string{ 430 - "container.apparmor.security.beta.kubernetes.io/buildah": "unconfined", 431 - }, 432 - }, 433 - Spec: corev1.PodSpec{ 434 - RestartPolicy: corev1.RestartPolicyNever, 435 - 436 - // Using NodeAffinity for flexible architecture targeting 437 - Affinity: &corev1.Affinity{ 438 - NodeAffinity: &corev1.NodeAffinity{ 439 - RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ 440 - NodeSelectorTerms: []corev1.NodeSelectorTerm{ 441 - { 442 - MatchExpressions: []corev1.NodeSelectorRequirement{ 443 - { 444 - Key: "kubernetes.io/arch", 445 - Operator: corev1.NodeSelectorOpIn, 446 - Values: []string{config.Architecture}, 447 - }, 448 - }, 449 - }, 450 - }, 451 - }, 452 - // Optional: Prefer nodes with more resources 453 - PreferredDuringSchedulingIgnoredDuringExecution: []corev1.PreferredSchedulingTerm{ 454 - { 455 - Weight: 100, 456 - Preference: corev1.NodeSelectorTerm{ 457 - MatchExpressions: []corev1.NodeSelectorRequirement{ 458 - { 459 - Key: "node-role.kubernetes.io/runner", 460 - Operator: corev1.NodeSelectorOpExists, 461 - }, 462 - }, 463 - }, 464 - }, 465 - }, 466 - }, 467 - }, 468 - 469 - Containers: []corev1.Container{ 470 - { 471 - Name: "buildah", 472 - Image: "quay.io/buildah/stable:latest", 473 - Command: []string{"/bin/bash", "-c"}, 474 - Args: []string{fmt.Sprintf(` 475 - set -e 476 - echo "Starting build process..." 
477 - 478 - # Clone or prepare build context 479 - %s 480 - 481 - # Build the image 482 - buildah bud \ 483 - --storage-driver=%s \ 484 - --format=docker \ 485 - --tls-verify=false \ 486 - -t %s/%s:%s \ 487 - -f Dockerfile . 488 - 489 - # Push to registry (if configured) 490 - if [ -n "$REGISTRY_AUTH" ]; then 491 - echo "$REGISTRY_AUTH" | base64 -d > /tmp/auth.json 492 - buildah push \ 493 - --authfile /tmp/auth.json \ 494 - --tls-verify=false \ 495 - %s/%s:%s 496 - rm /tmp/auth.json 497 - fi 498 - 499 - echo "Build completed successfully" 500 - `, 501 - buildContext, 502 - config.StorageDriver, 503 - config.Registry, workflowID, stepName, 504 - config.Registry, workflowID, stepName, 505 - )}, 506 - 507 - SecurityContext: &corev1.SecurityContext{ 508 - // Buildah can run rootless! 509 - RunAsUser: &[]int64{1000}[0], 510 - RunAsGroup: &[]int64{1000}[0], 511 - Capabilities: &corev1.Capabilities{ 512 - Add: []corev1.Capability{ 513 - "SETUID", 514 - "SETGID", 515 - }, 516 - }, 517 - }, 518 - 519 - Env: []corev1.EnvVar{ 520 - { 521 - Name: "STORAGE_DRIVER", 522 - Value: config.StorageDriver, 523 - }, 524 - { 525 - Name: "REGISTRY_AUTH", 526 - Value: config.RegistryAuth, 527 - }, 528 - { 529 - Name: "BUILDAH_ISOLATION", 530 - Value: "chroot", 531 - }, 532 - }, 533 - 534 - Resources: corev1.ResourceRequirements{ 535 - Requests: corev1.ResourceList{ 536 - corev1.ResourceCPU: resource.MustParse("500m"), 537 - corev1.ResourceMemory: resource.MustParse("1Gi"), 538 - }, 539 - Limits: corev1.ResourceList{ 540 - corev1.ResourceCPU: resource.MustParse("2"), 541 - corev1.ResourceMemory: resource.MustParse("4Gi"), 542 - }, 543 - }, 544 - 545 - VolumeMounts: []corev1.VolumeMount{ 546 - { 547 - Name: "buildah-storage", 548 - MountPath: "/var/lib/containers", 549 - }, 550 - }, 551 - }, 552 - }, 553 - 554 - Volumes: []corev1.Volume{ 555 - { 556 - Name: "buildah-storage", 557 - VolumeSource: corev1.VolumeSource{ 558 - EmptyDir: &corev1.EmptyDirVolumeSource{ 559 - Medium: 
corev1.StorageMediumMemory, // Use RAM for speed 560 - }, 561 - }, 562 - }, 563 - }, 564 - }, 565 - }, 566 - }, 567 - } 568 - 569 - return rc.clientset.BatchV1().Jobs(rc.namespace).Create(context.TODO(), job, metav1.CreateOptions{}) 570 - } 571 - ``` 572 - 573 - ## **Supporting Multiple Build Tools with Strategy Pattern** 574 - 575 - Since you mentioned supporting alternative images later, here's an extensible approach: 576 - 577 - ```go 578 - type BuilderStrategy interface { 579 - CreateJob(workflowID, stepName string, config BuildConfig) *batchv1.Job 580 - GetImage() string 581 - GetSecurityContext() *corev1.SecurityContext 582 - } 583 - 584 - type BuildahStrategy struct{} 585 - type KanikoStrategy struct{} 586 - type DockerStrategy struct{} 587 - 588 - func (b *BuildahStrategy) GetSecurityContext() *corev1.SecurityContext { 589 - return &corev1.SecurityContext{ 590 - RunAsUser: &[]int64{1000}[0], 591 - RunAsGroup: &[]int64{1000}[0], 592 - Capabilities: &corev1.Capabilities{ 593 - Add: []corev1.Capability{"SETUID", "SETGID"}, 594 - }, 595 - } 596 - } 597 - 598 - func (k *KanikoStrategy) GetSecurityContext() *corev1.SecurityContext { 599 - return &corev1.SecurityContext{ 600 - RunAsUser: &[]int64{0}[0], // Kaniko needs root but doesn't need privileged 601 - } 602 - } 603 - 604 - // Factory pattern for builder selection 605 - func GetBuilderStrategy(builderType string) BuilderStrategy { 606 - switch builderType { 607 - case "kaniko": 608 - return &KanikoStrategy{} 609 - case "docker": 610 - return &DockerStrategy{} 611 - default: 612 - return &BuildahStrategy{} // Default to Buildah 613 - } 614 - } 615 - ``` 616 - 617 - ## **Advanced NodeAffinity Example** 618 - 619 - Here's a more sophisticated nodeAffinity setup that showcases why it's superior for complex requirements: 620 - 621 - ```go 622 - Affinity: &corev1.Affinity{ 623 - NodeAffinity: &corev1.NodeAffinity{ 624 - RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ 625 - NodeSelectorTerms: 
[]corev1.NodeSelectorTerm{ 626 - { 627 - MatchExpressions: []corev1.NodeSelectorRequirement{ 628 - { 629 - Key: "kubernetes.io/arch", 630 - Operator: corev1.NodeSelectorOpIn, 631 - Values: []string{"amd64", "arm64"}, // Either architecture 632 - }, 633 - { 634 - Key: "node.kubernetes.io/instance-type", 635 - Operator: corev1.NodeSelectorOpNotIn, 636 - Values: []string{"t2.micro", "t2.small"}, // Exclude small instances 637 - }, 638 - }, 639 - }, 640 - }, 641 - }, 642 - PreferredDuringSchedulingIgnoredDuringExecution: []corev1.PreferredSchedulingTerm{ 643 - { 644 - Weight: 100, 645 - Preference: corev1.NodeSelectorTerm{ 646 - MatchExpressions: []corev1.NodeSelectorRequirement{ 647 - { 648 - Key: "kubernetes.io/arch", 649 - Operator: corev1.NodeSelectorOpIn, 650 - Values: []string{config.PreferredArch}, // Prefer specific arch 651 - }, 652 - }, 653 - }, 654 - }, 655 - { 656 - Weight: 50, 657 - Preference: corev1.NodeSelectorTerm{ 658 - MatchExpressions: []corev1.NodeSelectorRequirement{ 659 - { 660 - Key: "workload-type", 661 - Operator: corev1.NodeSelectorOpIn, 662 - Values: []string{"build-optimized"}, 663 - }, 664 - }, 665 - }, 666 - }, 667 - }, 668 - }, 669 - } 670 - ``` 671 - 672 - **Buildah** is an excellent choice for your runner controller because it: 673 - - Runs without root privileges (better security) [^5][^3] 674 - - Doesn't require a daemon 675 - - Has lower resource overhead 676 - - Supports multiple storage drivers 677 - - Works well in Kubernetes environments 678 - 679 - The combination of Buildah with nodeAffinity gives you a secure, flexible runner system that can intelligently place workloads across your cluster. 680 - 681 - 682 - _References_: 683 - [^1]: [Mastering Node Placement in Kubernetes: Node Selectors vs. 
Node Affinity](https://medium.com/@rahulbansod519/mastering-node-placement-in-kubernetes-node-selectors-vs-node-affinity-f6a24cda0ddd) 684 - [^2]: [Node Selector vs Node Affinity in Kubernetes - Medium](https://medium.com/@reach2shristi.81/node-selector-vs-node-affinity-in-kubernetes-3292e9552aa0) 685 - [^3]: [Buildah and Podman Relationship | buildah.io](https://buildah.io/blogs/2018/10/31/podman-buildah-relationship.html) 686 - [^4]: [Kubernetes: Node Selector and Node Affinity | by Claire Lee | Medium](https://yuminlee2.medium.com/kubernetes-node-selector-and-node-affinity-ecb3a4d69165) 687 - [^5]: [Building a Buildah container image for Kubernetes](https://buildah.io/blogs/2018/03/01/building-buildah-container-image-for-kubernetes.html) 688 - 689 - > @ 2025-11-03 05:43:40 UTC 690 - > 691 - > How does GitHub handle isolation? I believe each step in a job is the same runner, but different jobs are isolated? This code appears to make a single step into a job? Is there shared context between steps? Ie environment variables, cached files etc? 692 - 693 - <details><summary>Thinking</summary> 694 - 695 - The user is asking about GitHub's isolation model and noticing that my code creates a separate Kubernetes Job for each step, which is different from how GitHub Actions works. They're right - GitHub Actions runs all steps in a job on the same runner with shared context, while different jobs are isolated. Let me explain this architecture and provide updated code that better mimics GitHub's model. 696 - 697 - </details> 698 - 699 - Excellent observation! You're absolutely right - **GitHub Actions runs all steps within a job on the same runner**, sharing context between steps. Your current code creates separate Kubernetes Jobs for each step, which breaks this model. Let me explain GitHub's architecture and provide a better implementation. 
700 - 701 - ## **GitHub Actions Isolation Model** 702 - 703 - ### **How GitHub Actually Works:** 704 - - **Job Level**: Each job gets its own runner (isolated environment) [^1] 705 - - **Step Level**: All steps within a job execute sequentially on the same runner [^1] 706 - - **Shared Context**: Steps share: 707 - - Filesystem/workspace 708 - - Environment variables 709 - - Docker volumes [^4] 710 - - Network context 711 - - Installed tools/dependencies 712 - 713 - ### **Isolation Boundaries:** 714 - - **Between Jobs**: Complete isolation (different runners/pods) 715 - - **Between Steps**: No isolation (same container/environment) 716 - - **Between Workflows**: Complete isolation 717 - 718 - ## **Corrected Implementation: Job = Pod, Steps = Sequential Commands** 719 - 720 - Here's a redesigned approach that properly mimics GitHub's model: 721 - 722 - ```go 723 - package operator 724 - 725 - import ( 726 - "context" 727 - "encoding/json" 728 - "fmt" 729 - "strings" 730 - batchv1 "k8s.io/api/batch/v1" 731 - corev1 "k8s.io/api/core/v1" 732 - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 733 - ) 734 - 735 - type WorkflowJob struct { 736 - ID string 737 - Name string 738 - Steps []Step 739 - Architecture string 740 - Environment map[string]string 741 - } 742 - 743 - type Step struct { 744 - Name string 745 - Command string 746 - Env map[string]string 747 - Uses string // For action references 748 - } 749 - 750 - func (rc *RunnerController) CreateJobRunner(job WorkflowJob) (*batchv1.Job, error) { 751 - // Build a single script that executes all steps sequentially 752 - runnerScript := rc.buildRunnerScript(job.Steps, job.Environment) 753 - 754 - k8sJob := &batchv1.Job{ 755 - ObjectMeta: metav1.ObjectMeta{ 756 - Name: fmt.Sprintf("runner-job-%s", job.ID), 757 - Namespace: rc.namespace, 758 - Labels: map[string]string{ 759 - "workflow-id": job.ID, 760 - "job-name": job.Name, 761 - "type": "github-runner", 762 - }, 763 - }, 764 - Spec: batchv1.JobSpec{ 765 - BackoffLimit: 
&[]int32{0}[0], // No retries for workflow jobs 766 - Template: corev1.PodTemplateSpec{ 767 - Spec: corev1.PodSpec{ 768 - RestartPolicy: corev1.RestartPolicyNever, 769 - 770 - // Single init container to set up workspace 771 - InitContainers: []corev1.Container{ 772 - { 773 - Name: "setup-workspace", 774 - Image: "alpine/git:latest", 775 - Command: []string{"/bin/sh", "-c"}, 776 - Args: []string{` 777 - # Clone repository or set up workspace 778 - git clone ${REPO_URL} /workspace || true 779 - cd /workspace 780 - git checkout ${GITHUB_SHA:-main} 781 - `}, 782 - VolumeMounts: []corev1.VolumeMount{ 783 - { 784 - Name: "workspace", 785 - MountPath: "/workspace", 786 - }, 787 - }, 788 - }, 789 - }, 790 - 791 - // Main runner container executes ALL steps 792 - Containers: []corev1.Container{ 793 - { 794 - Name: "runner", 795 - Image: "quay.io/buildah/stable:latest", 796 - Command: []string{"/bin/bash", "-c"}, 797 - Args: []string{runnerScript}, 798 - 799 - WorkingDir: "/workspace", 800 - 801 - Env: rc.buildEnvironment(job), 802 - 803 - VolumeMounts: []corev1.VolumeMount{ 804 - { 805 - Name: "workspace", 806 - MountPath: "/workspace", 807 - }, 808 - { 809 - Name: "step-outputs", 810 - MountPath: "/tmp/step-outputs", 811 - }, 812 - { 813 - Name: "github-env", 814 - MountPath: "/tmp/github", 815 - }, 816 - { 817 - Name: "buildah-storage", 818 - MountPath: "/var/lib/containers", 819 - }, 820 - }, 821 - 822 - SecurityContext: &corev1.SecurityContext{ 823 - RunAsUser: &[]int64{1000}[0], 824 - RunAsGroup: &[]int64{1000}[0], 825 - }, 826 - }, 827 - }, 828 - 829 - Volumes: []corev1.Volume{ 830 - { 831 - Name: "workspace", 832 - VolumeSource: corev1.VolumeSource{ 833 - EmptyDir: &corev1.EmptyDirVolumeSource{}, 834 - }, 835 - }, 836 - { 837 - Name: "step-outputs", 838 - VolumeSource: corev1.VolumeSource{ 839 - EmptyDir: &corev1.EmptyDirVolumeSource{}, 840 - }, 841 - }, 842 - { 843 - Name: "github-env", 844 - VolumeSource: corev1.VolumeSource{ 845 - EmptyDir: 
&corev1.EmptyDirVolumeSource{}, 846 - }, 847 - }, 848 - { 849 - Name: "buildah-storage", 850 - VolumeSource: corev1.VolumeSource{ 851 - EmptyDir: &corev1.EmptyDirVolumeSource{ 852 - Medium: corev1.StorageMediumMemory, 853 - }, 854 - }, 855 - }, 856 - }, 857 - 858 - NodeSelector: map[string]string{ 859 - "kubernetes.io/arch": job.Architecture, 860 - }, 861 - }, 862 - }, 863 - }, 864 - } 865 - 866 - return rc.clientset.BatchV1().Jobs(rc.namespace).Create(context.TODO(), k8sJob, metav1.CreateOptions{}) 867 - } 868 - ``` 869 - 870 - ## **Step Execution Script Builder** 871 - 872 - This critical function creates a bash script that maintains context between steps: 873 - 874 - ```go 875 - func (rc *RunnerController) buildRunnerScript(steps []Step, jobEnv map[string]string) string { 876 - var script strings.Builder 877 - 878 - script.WriteString(`#!/bin/bash 879 - set -e 880 - 881 - # GitHub Actions compatible environment setup 882 - export GITHUB_ENV=/tmp/github/env 883 - export GITHUB_PATH=/tmp/github/path 884 - export GITHUB_OUTPUT=/tmp/github/output 885 - export GITHUB_STEP_SUMMARY=/tmp/github/step-summary.md 886 - 887 - # Initialize files 888 - touch $GITHUB_ENV $GITHUB_PATH $GITHUB_OUTPUT $GITHUB_STEP_SUMMARY 889 - 890 - # Helper functions matching GitHub Actions 891 - add_to_env() { 892 - echo "$1=$2" >> $GITHUB_ENV 893 - export $1="$2" 894 - } 895 - 896 - add_to_path() { 897 - echo "$1" >> $GITHUB_PATH 898 - export PATH="$1:$PATH" 899 - } 900 - 901 - set_output() { 902 - echo "$1=$2" >> $GITHUB_OUTPUT 903 - } 904 - 905 - # Function to load environment between steps 906 - load_github_env() { 907 - if [ -f "$GITHUB_ENV" ]; then 908 - set -a 909 - source $GITHUB_ENV 910 - set +a 911 - fi 912 - if [ -f "$GITHUB_PATH" ]; then 913 - while IFS= read -r line; do 914 - export PATH="$line:$PATH" 915 - done < $GITHUB_PATH 916 - fi 917 - } 918 - 919 - `) 920 - 921 - // Add each step 922 - for i, step := range steps { 923 - script.WriteString(fmt.Sprintf(` 924 - echo "==== 
Starting Step: %s ====" 925 - load_github_env 926 - 927 - # Step %d environment variables 928 - `, step.Name, i+1)) 929 - 930 - // Add step-specific environment variables 931 - for k, v := range step.Env { 932 - script.WriteString(fmt.Sprintf("export %s='%s'\n", k, v)) 933 - } 934 - 935 - // Execute step command 936 - script.WriteString(fmt.Sprintf(` 937 - # Execute step 938 - ( 939 - %s 940 - ) || { 941 - echo "Step '%s' failed with exit code $?" 942 - exit 1 943 - } 944 - 945 - echo "==== Completed Step: %s ====" 946 - `, step.Command, step.Name, step.Name)) 947 - } 948 - 949 - return script.String() 950 - } 951 - ``` 952 - 953 - ## **Shared Context Implementation** 954 - 955 - Here's how to properly share context between steps like GitHub Actions does [^2][^3]: 956 - 957 - ```go 958 - type StepOutput struct { 959 - StepName string 960 - Outputs map[string]string 961 - } 962 - 963 - // Enhanced step structure with outputs 964 - type EnhancedStep struct { 965 - Step 966 - ID string // For referencing outputs 967 - Outputs []string // Expected output keys 968 - } 969 - 970 - func (rc *RunnerController) buildEnhancedRunnerScript(steps []EnhancedStep) string { 971 - var script strings.Builder 972 - 973 - // ... (previous setup code) ... 
974 - 975 - for i, step := range steps { 976 - script.WriteString(fmt.Sprintf(` 977 - # Step: %s (id: %s) 978 - STEP_OUTPUT_FILE="/tmp/step-outputs/%s.json" 979 - 980 - # Run step and capture outputs 981 - ( 982 - %s 983 - 984 - # Save step outputs if any were set 985 - if [ -f "$GITHUB_OUTPUT" ]; then 986 - # Parse and save outputs for this step 987 - python3 -c " 988 - import json 989 - outputs = {} 990 - with open('$GITHUB_OUTPUT', 'r') as f: 991 - for line in f: 992 - if '=' in line: 993 - key, value = line.strip().split('=', 1) 994 - outputs[key] = value 995 - with open('$STEP_OUTPUT_FILE', 'w') as f: 996 - json.dump(outputs, f) 997 - " 998 - fi 999 - ) 1000 - 1001 - # Make outputs available as environment variables for next steps 1002 - if [ -f "$STEP_OUTPUT_FILE" ]; then 1003 - export STEP_%s_OUTPUTS=$(cat $STEP_OUTPUT_FILE) 1004 - fi 1005 - `, step.Name, step.ID, step.ID, step.Command, strings.ToUpper(step.ID))) 1006 - } 1007 - 1008 - return script.String() 1009 - } 1010 - ``` 1011 - 1012 - ## **Multi-Container Pattern for Complex Workflows** 1013 - 1014 - For workflows that need services (like databases), use a multi-container pod: 1015 - 1016 - ```go 1017 - // For jobs that need service containers (like GitHub Actions services) 1018 - func (rc *RunnerController) CreateJobWithServices(job WorkflowJob, services []ServiceContainer) (*batchv1.Job, error) { 1019 - containers := []corev1.Container{ 1020 - // Main runner container 1021 - { 1022 - Name: "runner", 1023 - // ... (as before) 1024 - }, 1025 - } 1026 - 1027 - // Add service containers that run alongside the main runner 1028 - for _, svc := range services { 1029 - containers = append(containers, corev1.Container{ 1030 - Name: svc.Name, 1031 - Image: svc.Image, 1032 - Env: convertToEnvVar(svc.Environment), 1033 - Ports: svc.Ports, 1034 - // Services run for the lifetime of the job 1035 - Command: svc.Command, 1036 - }) 1037 - } 1038 - 1039 - // Update pod spec with all containers 1040 - // ... 
1041 - } 1042 - ``` 1043 - 1044 - ## **Key Differences from Original Implementation** 1045 - 1046 - | Aspect | GitHub Actions Model | Your Original Code | Corrected Approach | 1047 - |--------|---------------------|-------------------|-------------------| 1048 - | **Job Mapping** | Job = Runner Instance | Step = K8s Job | Job = K8s Job/Pod | 1049 - | **Step Execution** | Sequential in same environment | Separate pods | Sequential in same container | 1050 - | **Filesystem** | Shared across steps | Isolated per step | Shared via volumes | 1051 - | **Environment Variables** | Persistent across steps | Isolated | Persistent via GITHUB_ENV | 1052 - | **Docker/Buildah Cache** | Shared in job | Lost between steps | Shared via volume | 1053 - 1054 - This corrected approach properly mimics GitHub Actions' execution model while maintaining the isolation boundaries that make sense in Kubernetes. 1055 - 1056 - 1057 - _References_: 1058 - [^1]: [Understanding GitHub Actions](https://docs.github.com/articles/getting-started-with-github-actions) 1059 - [^2]: [Store information in variables - GitHub Docs](https://docs.github.com/en/actions/how-tos/write-workflows/choose-what-workflows-do/use-variables) 1060 - [^3]: [Contexts - GitHub Docs](https://docs.github.com/en/actions/concepts/workflows-and-actions/contexts) 1061 - [^4]: [Workflow syntax for GitHub Actions](https://docs.github.com/actions/using-workflows/workflow-syntax-for-github-actions) 1062 - 1063 - 1064 - --- 1065 - 1066 - _Generated by [Kagi Assistant](https://kagi.com/assistant)_
+9
api/v1alpha1/spindleset_types.go
··· 57 57 // +kubebuilder:validation:Required 58 58 CommitSHA string `json:"commitSHA"` 59 59 60 + // CloneCommands are the git commands to run in the clone init container. 61 + // Generated by tangled.org/core/spindle/steps.BuildCloneCommands(). 62 + // +optional 63 + CloneCommands []string `json:"cloneCommands,omitempty"` 64 + 65 + // SkipClone indicates whether to skip the clone init container entirely. 66 + // +optional 67 + SkipClone bool `json:"skipClone,omitempty"` 68 + 60 69 // Workflows is the list of workflows to execute in this pipeline. 61 70 // +kubebuilder:validation:MinItems=1 62 71 Workflows []WorkflowSpec `json:"workflows"`
+5
api/v1alpha1/zz_generated.deepcopy.go
··· 29 29 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 30 30 func (in *PipelineRunSpec) DeepCopyInto(out *PipelineRunSpec) { 31 31 *out = *in 32 + if in.CloneCommands != nil { 33 + in, out := &in.CloneCommands, &out.CloneCommands 34 + *out = make([]string, len(*in)) 35 + copy(*out, *in) 36 + } 32 37 if in.Workflows != nil { 33 38 in, out := &in.Workflows, &out.Workflows 34 39 *out = make([]WorkflowSpec, len(*in))
+40 -45
cmd/runner/main.go
··· 8 8 "io" 9 9 "os" 10 10 "os/exec" 11 - "time" 12 11 12 + "tangled.org/core/spindle/models" 13 13 loomv1alpha1 "tangled.org/evan.jarrett.net/loom/api/v1alpha1" 14 14 ) 15 15 16 - // LogEvent represents a structured log event emitted by the runner. 17 - type LogEvent struct { 18 - Kind string `json:"kind"` // "control" or "data" 19 - Event string `json:"event,omitempty"` // "start", "end" (for control events) 20 - StepID int `json:"step_id"` // 0-based step index 21 - StepName string `json:"step_name,omitempty"` // Step name 22 - WorkflowName string `json:"workflow_name"` // Workflow name for log separation 23 - Stream string `json:"stream,omitempty"` // "stdout" or "stderr" (for data events) 24 - Content string `json:"content,omitempty"` // Log line content (for data events) 25 - ExitCode *int `json:"exit_code,omitempty"` // Exit code (for control/end events) 26 - Timestamp string `json:"timestamp"` // ISO 8601 timestamp 16 + // simpleStep implements the models.Step interface 17 + type simpleStep struct { 18 + name string 19 + command string 20 + } 21 + 22 + func (s *simpleStep) Name() string { 23 + return s.name 24 + } 25 + 26 + func (s *simpleStep) Command() string { 27 + return s.command 28 + } 29 + 30 + func (s *simpleStep) Kind() models.StepKind { 31 + return models.StepKindUser 27 32 } 28 33 29 34 func main() { ··· 57 62 // Execute each step 58 63 ctx := context.Background() 59 64 for i, step := range workflow.Steps { 60 - if err := executeStep(ctx, i, step, workflow.Name); err != nil { 65 + if err := executeStep(ctx, i, step); err != nil { 61 66 return fmt.Errorf("step %d (%s) failed: %w", i, step.Name, err) 62 67 } 63 68 } ··· 65 70 return nil 66 71 } 67 72 68 - func executeStep(ctx context.Context, stepID int, step loomv1alpha1.WorkflowStep, workflowName string) error { 73 + func executeStep(ctx context.Context, stepID int, step loomv1alpha1.WorkflowStep) error { 74 + // Create a simple step for logging 75 + simpleStep := &simpleStep{ 76 + name: 
step.Name, 77 + command: step.Command, 78 + } 79 + 69 80 // Emit step start event 70 - emitControlEvent(stepID, step.Name, workflowName, "start", nil) 81 + emitControlEvent(stepID, simpleStep, models.StepStatusStart) 71 82 72 83 // Set step-specific environment variables 73 84 if step.Environment != nil { ··· 95 106 96 107 // Start the command 97 108 if err := cmd.Start(); err != nil { 98 - exitCode := 1 99 - emitControlEvent(stepID, step.Name, workflowName, "end", &exitCode) 109 + emitControlEvent(stepID, simpleStep, models.StepStatusEnd) 100 110 return fmt.Errorf("failed to start command: %w", err) 101 111 } 102 112 103 113 // Stream stdout and stderr concurrently 104 114 done := make(chan error, 2) 105 - go streamOutput(stdout, stepID, workflowName, "stdout", done) 106 - go streamOutput(stderr, stepID, workflowName, "stderr", done) 115 + go streamOutput(stdout, stepID, "stdout", done) 116 + go streamOutput(stderr, stepID, "stderr", done) 107 117 108 118 // Wait for both streams to complete 109 119 for i := 0; i < 2; i++ { ··· 125 135 } 126 136 127 137 // Emit step end event 128 - emitControlEvent(stepID, step.Name, workflowName, "end", &exitCode) 138 + emitControlEvent(stepID, simpleStep, models.StepStatusEnd) 129 139 130 140 if exitCode != 0 { 131 141 return fmt.Errorf("command exited with code %d", exitCode) ··· 134 144 return nil 135 145 } 136 146 137 - func streamOutput(reader io.Reader, stepID int, workflowName, stream string, done chan<- error) { 147 + func streamOutput(reader io.Reader, stepID int, stream string, done chan<- error) { 138 148 scanner := bufio.NewScanner(reader) 139 149 // Increase buffer size for long lines 140 150 buf := make([]byte, 0, 64*1024) ··· 142 152 143 153 for scanner.Scan() { 144 154 line := scanner.Text() 145 - emitDataEvent(stepID, workflowName, stream, line) 155 + emitDataEvent(stepID, stream, line) 146 156 } 147 157 148 158 done <- scanner.Err() 149 159 } 150 160 151 - func emitControlEvent(stepID int, stepName, 
workflowName, event string, exitCode *int) { 152 - ev := LogEvent{ 153 - Kind: "control", 154 - Event: event, 155 - StepID: stepID, 156 - StepName: stepName, 157 - WorkflowName: workflowName, 158 - ExitCode: exitCode, 159 - Timestamp: time.Now().UTC().Format(time.RFC3339Nano), 160 - } 161 - emitJSON(ev) 161 + func emitControlEvent(stepID int, step models.Step, status models.StepStatus) { 162 + logLine := models.NewControlLogLine(stepID, step, status) 163 + emitJSON(logLine) 162 164 } 163 165 164 - func emitDataEvent(stepID int, workflowName, stream, content string) { 165 - ev := LogEvent{ 166 - Kind: "data", 167 - StepID: stepID, 168 - WorkflowName: workflowName, 169 - Stream: stream, 170 - Content: content, 171 - Timestamp: time.Now().UTC().Format(time.RFC3339Nano), 172 - } 173 - emitJSON(ev) 166 + func emitDataEvent(stepID int, stream, content string) { 167 + logLine := models.NewDataLogLine(stepID, content, stream) 168 + emitJSON(logLine) 174 169 } 175 170 176 - func emitJSON(event LogEvent) { 177 - data, err := json.Marshal(event) 171 + func emitJSON(logLine models.LogLine) { 172 + data, err := json.Marshal(logLine) 178 173 if err != nil { 179 174 fmt.Fprintf(os.Stderr, "ERROR: failed to marshal JSON: %v\n", err) 180 175 return
+11
config/crd/bases/loom.j5t.io_spindlesets.yaml
··· 69 69 PipelineRun contains pipeline-specific information for this pipeline execution. 70 70 This SpindleSet is ephemeral and represents a single pipeline run. 71 71 properties: 72 + cloneCommands: 73 + description: |- 74 + CloneCommands are the git commands to run in the clone init container. 75 + Generated by tangled.org/core/spindle/steps.BuildCloneCommands(). 76 + items: 77 + type: string 78 + type: array 72 79 commitSHA: 73 80 description: CommitSHA is the Git commit to checkout. 74 81 type: string ··· 83 90 repoURL: 84 91 description: RepoURL is the Git repository URL to clone. 85 92 type: string 93 + skipClone: 94 + description: SkipClone indicates whether to skip the clone init 95 + container entirely. 96 + type: boolean 86 97 workflows: 87 98 description: Workflows is the list of workflows to execute in 88 99 this pipeline.
+1 -1
go.mod
··· 197 197 ) 198 198 199 199 // Use our custom version of tangled until its upstreamed 200 - replace tangled.org/core => tangled.org/evan.jarrett.net/core v1.9.1-alpha.0.20251109203816-d73c73e8728b 200 + replace tangled.org/core => tangled.org/evan.jarrett.net/core v1.9.1-alpha.0.20251111023958-287f7ea82ba6
+2 -2
go.sum
··· 687 687 sigs.k8s.io/structured-merge-diff/v4 v4.6.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= 688 688 sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= 689 689 sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= 690 - tangled.org/evan.jarrett.net/core v1.9.1-alpha.0.20251109203816-d73c73e8728b h1:m30BNzXLTO/V7tyh9levCuavrjgxH0KD5Y6lvX3S69s= 691 - tangled.org/evan.jarrett.net/core v1.9.1-alpha.0.20251109203816-d73c73e8728b/go.mod h1:CUO6beA36K/Cwt0u2yrO5CG+L7+LzAc6zi6WudwO7qs= 690 + tangled.org/evan.jarrett.net/core v1.9.1-alpha.0.20251111023958-287f7ea82ba6 h1:0vac5SqnzXzZi2HBZskjdUFdJflXTMMS9TM7mwttXeA= 691 + tangled.org/evan.jarrett.net/core v1.9.1-alpha.0.20251111023958-287f7ea82ba6/go.mod h1:CUO6beA36K/Cwt0u2yrO5CG+L7+LzAc6zi6WudwO7qs=
+3 -1
internal/controller/spindleset_controller.go
··· 35 35 "sigs.k8s.io/controller-runtime/pkg/log" 36 36 37 37 loomv1alpha1 "tangled.org/evan.jarrett.net/loom/api/v1alpha1" 38 - "tangled.org/evan.jarrett.net/loom/pkg/jobbuilder" 38 + "tangled.org/evan.jarrett.net/loom/internal/jobbuilder" 39 39 ) 40 40 41 41 // SpindleSetReconciler reconciles a SpindleSet object ··· 341 341 WorkflowSpec: workflowSpec, // Pass full workflow spec to runner 342 342 RepoURL: pipelineRun.RepoURL, 343 343 CommitSHA: pipelineRun.CommitSHA, 344 + CloneCommands: pipelineRun.CloneCommands, 345 + SkipClone: pipelineRun.SkipClone, 344 346 Secrets: nil, // TODO: Handle secrets 345 347 Template: spindleSet.Spec.Template, 346 348 Namespace: spindleSet.Namespace,
+22 -1
internal/controller/spindleset_controller_test.go
··· 51 51 Name: resourceName, 52 52 Namespace: "default", 53 53 }, 54 - // TODO(user): Specify other spec details if needed. 54 + Spec: loomv1alpha1.SpindleSetSpec{ 55 + Template: loomv1alpha1.SpindleTemplate{}, 56 + PipelineRun: &loomv1alpha1.PipelineRunSpec{ 57 + PipelineID: "test-pipeline-123", 58 + Knot: "knot.example.com", 59 + RepoURL: "https://knot.example.com/did:plc:test/repo", 60 + CommitSHA: "abc123", 61 + Workflows: []loomv1alpha1.WorkflowSpec{ 62 + { 63 + Name: "test-workflow", 64 + Image: "alpine:latest", 65 + Architecture: "amd64", 66 + Steps: []loomv1alpha1.WorkflowStep{ 67 + { 68 + Name: "test-step", 69 + Command: "echo 'hello'", 70 + }, 71 + }, 72 + }, 73 + }, 74 + }, 75 + }, 55 76 } 56 77 Expect(k8sClient.Create(ctx, resource)).To(Succeed()) 57 78 }
+50 -43
internal/engine/kubernetes_engine.go
··· 22 22 "tangled.org/core/api/tangled" 23 23 "tangled.org/core/spindle/models" 24 24 "tangled.org/core/spindle/secrets" 25 + "tangled.org/core/spindle/steps" 25 26 26 27 loomv1alpha1 "tangled.org/evan.jarrett.net/loom/api/v1alpha1" 27 28 ) ··· 115 116 116 117 // Store the parsed spec in Data field for later use 117 118 workflowData := map[string]interface{}{ 118 - "spec": spec, 119 - "triggerRepo": tpl.TriggerMetadata.Repo, 120 - "cloneOpts": twf.Clone, 119 + "spec": spec, 120 + "triggerMetadata": tpl.TriggerMetadata, 121 + "cloneOpts": twf.Clone, 121 122 } 122 123 123 124 workflow := &models.Workflow{ ··· 144 145 return fmt.Errorf("workflow spec not found in data") 145 146 } 146 147 147 - triggerRepo, ok := workflowData["triggerRepo"].(*tangled.Pipeline_TriggerRepo) 148 + triggerMetadata, ok := workflowData["triggerMetadata"].(*tangled.Pipeline_TriggerMetadata) 149 + if !ok { 150 + return fmt.Errorf("trigger metadata not found in data") 151 + } 152 + 153 + cloneOpts, ok := workflowData["cloneOpts"].(*tangled.Pipeline_CloneOpts) 148 154 if !ok { 149 - return fmt.Errorf("trigger repo not found in data") 155 + return fmt.Errorf("clone options not found in data") 156 + } 157 + 158 + // Use shared clone command builder to extract commit SHA and build repo URL 159 + // Note: We're not using the actual commands here, just extracting the metadata 160 + // The Job builder will regenerate the commands for the init container 161 + cloneConfig := steps.CloneConfig{ 162 + Workflow: tangled.Pipeline_Workflow{ 163 + Clone: cloneOpts, 164 + }, 165 + TriggerMetadata: *triggerMetadata, 166 + DevMode: false, // TODO: Make this configurable 167 + WorkspaceDir: "/tangled/workspace", 150 168 } 151 169 152 - // Determine commit SHA based on trigger type 153 - // For now, assume we're handling push events with NewSha 154 - // TODO: Handle other trigger types (PR, manual) 155 - commitSHA := "" // Need to extract from trigger metadata 170 + cloneCommands, err := 
steps.BuildCloneCommands(cloneConfig) 171 + if err != nil { 172 + return fmt.Errorf("failed to build clone commands: %w", err) 173 + } 156 174 157 - // Build repository clone URL 158 - repoURL := fmt.Sprintf("https://%s/%s/%s", triggerRepo.Knot, triggerRepo.Did, triggerRepo.Repo) 175 + commitSHA := cloneCommands.CommitSHA 176 + repoURL := cloneCommands.RepoURL 159 177 160 178 // Store knot for status reporting 161 - e.currentKnot = triggerRepo.Knot 179 + e.currentKnot = triggerMetadata.Repo.Knot 162 180 163 181 // Generate SpindleSet name: spindle-{workflowName}-{pipelineID} 164 182 // Sanitize workflow name (remove .yaml/.yml, replace dots) ··· 186 204 Spec: loomv1alpha1.SpindleSetSpec{ 187 205 Template: e.template, 188 206 PipelineRun: &loomv1alpha1.PipelineRunSpec{ 189 - PipelineID: wid.PipelineId.Rkey, 190 - Knot: e.currentKnot, 191 - RepoURL: repoURL, 192 - CommitSHA: commitSHA, 193 - Workflows: []loomv1alpha1.WorkflowSpec{spec}, // Single workflow per SpindleSet 207 + PipelineID: wid.PipelineId.Rkey, 208 + Knot: e.currentKnot, 209 + RepoURL: repoURL, 210 + CommitSHA: commitSHA, 211 + CloneCommands: cloneCommands.All, 212 + SkipClone: cloneCommands.Skip, 213 + Workflows: []loomv1alpha1.WorkflowSpec{spec}, // Single workflow per SpindleSet 194 214 }, 195 215 }, 196 216 } ··· 321 341 return nil 322 342 } 323 343 324 - // LogEvent represents a structured log event from the runner binary 325 - type LogEvent struct { 326 - Kind string `json:"kind"` // "control" or "data" 327 - Event string `json:"event,omitempty"` // "start", "end" (for control events) 328 - StepID int `json:"step_id"` // 0-based step index 329 - StepName string `json:"step_name,omitempty"` // Step name 330 - WorkflowName string `json:"workflow_name"` // Workflow name for log separation 331 - Stream string `json:"stream,omitempty"` // "stdout" or "stderr" (for data events) 332 - Content string `json:"content,omitempty"` // Log line content (for data events) 333 - ExitCode *int 
`json:"exit_code,omitempty"` // Exit code (for control/end events) 334 - Timestamp string `json:"timestamp"` // ISO 8601 timestamp 335 - } 336 - 337 344 // getOrCreateLogStream gets an existing log stream or creates a new one for step 0 338 345 func (e *KubernetesEngine) getOrCreateLogStream(ctx context.Context, wid models.WorkflowId, job *batchv1.Job) (*workflowLogStream, error) { 339 346 widKey := wid.String() ··· 456 463 for scanner.Scan() { 457 464 line := scanner.Text() 458 465 459 - // Try to parse as JSON event 460 - var event LogEvent 461 - if err := json.Unmarshal([]byte(line), &event); err != nil { 466 + // Try to parse as models.LogLine from the runner binary 467 + var logLine models.LogLine 468 + if err := json.Unmarshal([]byte(line), &logLine); err != nil { 462 469 // Not JSON or parse error - skip 463 470 continue 464 471 } 465 472 466 473 // Validate step index 467 - if event.StepID < 0 || event.StepID >= len(workflow.Steps) { 474 + if logLine.StepId < 0 || logLine.StepId >= len(workflow.Steps) { 468 475 continue 469 476 } 470 477 471 478 // Only process events for the current step 472 - if event.StepID != stepID { 479 + if logLine.StepId != stepID { 473 480 // Got event for a different step - this shouldn't happen in sequential execution 474 481 // but log it and continue 475 482 continue 476 483 } 477 484 478 - switch event.Kind { 479 - case "control": 485 + switch logLine.Kind { 486 + case models.LogKindControl: 480 487 // Use control events from runner for flow control only 481 488 // Don't write them - the core spindle engine writes control events 482 - if event.Event == "end" { 489 + if logLine.StepStatus == models.StepStatusEnd { 483 490 // Step is done, return 484 491 return nil 485 492 } 486 493 // For "start" events, just continue reading 487 494 488 - case "data": 495 + case models.LogKindData: 489 496 // Log output from step 490 - if event.Stream == "" { 491 - event.Stream = "stdout" // Default to stdout 497 + if logLine.Stream == "" { 
498 + logLine.Stream = "stdout" // Default to stdout 492 499 } 493 - dataWriter := wfLogger.DataWriter(event.StepID, event.Stream) 494 - dataWriter.Write([]byte(event.Content + "\n")) 500 + dataWriter := wfLogger.DataWriter(logLine.StepId, logLine.Stream) 501 + dataWriter.Write([]byte(logLine.Content + "\n")) 495 502 } 496 503 } 497 504
pkg/jobbuilder/affinity.go internal/jobbuilder/affinity.go
+47 -34
pkg/jobbuilder/job_template.go internal/jobbuilder/job_template.go
··· 13 13 loomv1alpha1 "tangled.org/evan.jarrett.net/loom/api/v1alpha1" 14 14 ) 15 15 16 + // WorkflowStep represents a single step in a workflow. 17 + type WorkflowStep struct { 18 + Name string 19 + Command string 20 + Env map[string]string 21 + } 22 + 16 23 // WorkflowConfig contains all the information needed to create a spindle Job. 17 24 type WorkflowConfig struct { 18 25 // WorkflowName is the name of the workflow (used in Job name and labels) ··· 41 48 42 49 // CommitSHA is the git commit to checkout 43 50 CommitSHA string 51 + 52 + // CloneCommands are the git commands to run in the init container 53 + // Generated by steps.BuildCloneCommands() 54 + CloneCommands []string 55 + 56 + // SkipClone indicates whether to skip the clone init container entirely 57 + SkipClone bool 44 58 45 59 // Secrets contains auth tokens or other secrets needed 46 60 Secrets []string ··· 169 183 }, 170 184 }, 171 185 }, 172 - { 173 - Name: "clone-repo", 174 - Image: "alpine/git:latest", 175 - Command: []string{"/bin/sh", "-c"}, 176 - SecurityContext: &corev1.SecurityContext{ 177 - AllowPrivilegeEscalation: &[]bool{false}[0], 178 - Capabilities: &corev1.Capabilities{ 179 - Drop: []corev1.Capability{"ALL"}, 180 - }, 181 - }, 182 - Args: []string{ 183 - fmt.Sprintf(` 184 - set -e 185 - echo "Cloning repository..." 186 - git clone %s /tangled/workspace || { 187 - echo "Failed to clone repository" 188 - exit 1 189 - } 190 - cd /tangled/workspace 191 - echo "Checking out commit %s..." 
192 - git checkout %s || { 193 - echo "Failed to checkout commit" 194 - exit 1 195 - } 196 - echo "Repository ready" 197 - `, config.RepoURL, config.CommitSHA, config.CommitSHA), 198 - }, 199 - VolumeMounts: []corev1.VolumeMount{ 200 - { 201 - Name: "workspace", 202 - MountPath: "/tangled/workspace", 203 - }, 204 - }, 205 - }, 186 + buildCloneInitContainer(config), 206 187 }, 207 188 208 189 // Main container: run loom-runner binary in user's image ··· 317 298 318 299 return env 319 300 } 301 + 302 + // buildCloneInitContainer creates the init container for cloning the git repository. 303 + // Uses the shared clone command builder from tangled.org/core/spindle/steps. 304 + func buildCloneInitContainer(config WorkflowConfig) corev1.Container { 305 + // If clone is skipped, return an empty container (will be filtered out) 306 + if config.SkipClone { 307 + return corev1.Container{} 308 + } 309 + 310 + // Build the shell script from clone commands 311 + // Add set -e for error handling and echo for visibility 312 + script := "set -e\n" + strings.Join(config.CloneCommands, "\n") + "\necho \"Repository ready\"" 313 + 314 + return corev1.Container{ 315 + Name: "clone-repo", 316 + Image: "alpine/git:latest", 317 + Command: []string{"/bin/sh", "-c"}, 318 + SecurityContext: &corev1.SecurityContext{ 319 + AllowPrivilegeEscalation: &[]bool{false}[0], 320 + Capabilities: &corev1.Capabilities{ 321 + Drop: []corev1.Capability{"ALL"}, 322 + }, 323 + }, 324 + Args: []string{script}, 325 + VolumeMounts: []corev1.VolumeMount{ 326 + { 327 + Name: "workspace", 328 + MountPath: "/tangled/workspace", 329 + }, 330 + }, 331 + } 332 + }
-129
pkg/jobbuilder/script_builder.go
··· 1 - package jobbuilder 2 - 3 - import ( 4 - "fmt" 5 - "strings" 6 - ) 7 - 8 - // WorkflowStep represents a single step in a workflow. 9 - type WorkflowStep struct { 10 - Name string 11 - Command string 12 - Env map[string]string 13 - } 14 - 15 - // BuildStepExecutionScript generates a bash script that executes all workflow steps sequentially. 16 - // The script implements GitHub Actions-compatible environment passing between steps. 17 - func BuildStepExecutionScript(steps []WorkflowStep) string { 18 - var script strings.Builder 19 - 20 - // Script header with error handling 21 - script.WriteString(`#!/bin/bash 22 - set -e 23 - set -o pipefail 24 - 25 - `) 26 - 27 - // GitHub Actions-compatible environment setup 28 - script.WriteString(`# GitHub Actions-compatible environment setup 29 - export GITHUB_ENV=/tmp/github/env 30 - export GITHUB_PATH=/tmp/github/path 31 - export GITHUB_OUTPUT=/tmp/github/output 32 - export GITHUB_STEP_SUMMARY=/tmp/github/step-summary.md 33 - 34 - # Initialize files 35 - mkdir -p /tmp/github 36 - touch $GITHUB_ENV $GITHUB_PATH $GITHUB_OUTPUT $GITHUB_STEP_SUMMARY 37 - 38 - # Helper functions for environment passing 39 - add_to_env() { 40 - echo "$1=$2" >> $GITHUB_ENV 41 - export "$1"="$2" 42 - } 43 - 44 - add_to_path() { 45 - echo "$1" >> $GITHUB_PATH 46 - export PATH="$1:$PATH" 47 - } 48 - 49 - set_output() { 50 - echo "$1=$2" >> $GITHUB_OUTPUT 51 - } 52 - 53 - # Load environment variables set by previous steps 54 - load_github_env() { 55 - if [ -f "$GITHUB_ENV" ]; then 56 - set -a 57 - source "$GITHUB_ENV" 2>/dev/null || true 58 - set +a 59 - fi 60 - if [ -f "$GITHUB_PATH" ]; then 61 - while IFS= read -r line; do 62 - if [ -n "$line" ]; then 63 - export PATH="$line:$PATH" 64 - fi 65 - done < "$GITHUB_PATH" 66 - fi 67 - } 68 - 69 - # Timestamp helper 70 - timestamp() { 71 - date -u +"%Y-%m-%dT%H:%M:%S.%3NZ" 72 - } 73 - 74 - `) 75 - 76 - // Generate code for each step 77 - for i, step := range steps { 78 - 
script.WriteString(fmt.Sprintf(` 79 - ############################################################################## 80 - # Step %d: %s 81 - ############################################################################## 82 - echo "" 83 - echo "$(timestamp) ===== Starting Step %d: %s =====" 84 - echo "" 85 - 86 - # Load environment from previous steps 87 - load_github_env 88 - 89 - `, i+1, step.Name, i+1, step.Name)) 90 - 91 - // Set step-specific environment variables 92 - if len(step.Env) > 0 { 93 - script.WriteString("# Step-specific environment variables\n") 94 - for k, v := range step.Env { 95 - // Escape single quotes in the value 96 - escapedValue := strings.ReplaceAll(v, "'", "'\\''") 97 - script.WriteString(fmt.Sprintf("export %s='%s'\n", k, escapedValue)) 98 - } 99 - script.WriteString("\n") 100 - } 101 - 102 - // Execute the step command 103 - script.WriteString("# Execute step command\n") 104 - script.WriteString("(\n") 105 - script.WriteString(" cd /tangled/workspace\n") 106 - script.WriteString(" " + step.Command + "\n") 107 - script.WriteString(") || {\n") 108 - script.WriteString(fmt.Sprintf(" echo \"$(timestamp) Step '%s' failed with exit code $?\"\n", step.Name)) 109 - script.WriteString(" exit 1\n") 110 - script.WriteString("}\n\n") 111 - 112 - script.WriteString(fmt.Sprintf("echo \"$(timestamp) ===== Completed Step %d: %s =====\"\n", i+1, step.Name)) 113 - } 114 - 115 - // Script footer 116 - script.WriteString(` 117 - echo "" 118 - echo "$(timestamp) ===== All steps completed successfully =====" 119 - exit 0 120 - `) 121 - 122 - return script.String() 123 - } 124 - 125 - // EscapeShellString escapes a string for safe use in shell scripts. 126 - func EscapeShellString(s string) string { 127 - // Replace single quotes with '\'' 128 - return strings.ReplaceAll(s, "'", "'\\''") 129 - }