tangled
alpha
login
or
join now
evan.jarrett.net
/
loom
10
fork
atom
Kubernetes Operator for Tangled Spindles
10
fork
atom
overview
issues
pulls
pipelines
fixes for buildah, logs missing, etc
evan.jarrett.net
3 months ago
953da5d2
26f81703
verified
This commit was signed with the committer's known signature.
evan.jarrett.net
SSH Key Fingerprint:
SHA256:bznk0uVPp7XFOl67P0uTM1pCjf2A4ojeP/lsUE7uauQ=
0/2
workflow-amd64.yaml
failed
5m 2s
workflow-arm64.yaml
failed
5m 4s
+205
-30
7 changed files
expand all
collapse all
unified
split
BUILDAH_IMPLEMENTATION.md
config
manager
manager.yaml
pvc.yaml
internal
controller
spindleset_controller.go
engine
kubernetes_engine.go
jobbuilder
job_template.go
job_template_test.go
+19
-9
BUILDAH_IMPLEMENTATION.md
reviewed
···
26
26
27
27
Loom Jobs can now build container images using **buildah**, a rootless, daemonless container building tool. This implementation provides:
28
28
29
29
-
- ✅ **Rootless operation** - All containers run as UID 10000 (non-root)
29
29
+
- ✅ **Rootless operation** - All containers run as UID 1000 (non-root), except the `setup-user` init container, which runs as root
30
30
- ✅ **No privileged mode** - Maintains strict security posture
31
31
- ✅ **No caching** - Ephemeral builds like GitHub Actions runners
32
32
- ✅ **Registry authentication** - Optional Docker config secret mounting
···
63
63
```yaml
64
64
securityContext:
65
65
runAsNonRoot: true
66
66
-
runAsUser: 10000
67
67
-
fsGroup: 10000
66
66
+
runAsUser: 1000
67
67
+
fsGroup: 1000
68
68
seccompProfile:
69
69
type: RuntimeDefault
70
70
```
···
73
73
74
74
#### 2. Init Containers
75
75
76
76
-
**a. install-runner** (existing)
76
76
+
**a. setup-user** (new)
77
77
+
- Image: `busybox:latest`
78
78
+
- Creates `/etc/passwd` and `/etc/group` entries for UID/GID 1000
79
79
+
- Creates `/home/runner` directory
80
80
+
- Runs as root (only init container that does)
81
81
+
- Required because many tools (like buildah) need a valid passwd entry
82
82
+
83
83
+
**b. install-runner** (existing)
77
84
- Copies `/loom-runner` binary to shared volume
78
85
79
79
-
**b. configure-buildah** (new)
86
86
+
**c. configure-buildah** (new)
80
87
- Image: `quay.io/buildah/stable:latest`
81
88
- Configures buildah storage driver (`/var/lib/containers/storage.conf`)
82
89
- Copies `buildah` binary to `/runner-bin/buildah`
83
83
-
- Runs as UID 10000 (non-root)
90
90
+
- Runs as UID 1000 (non-root)
84
91
85
85
-
**c. clone-repo** (existing)
92
92
+
**d. clone-repo** (existing)
86
93
- Clones git repository
87
94
88
95
#### 3. Main Container (runner)
···
91
98
- `/tangled/workspace` - Git repository workspace
92
99
- `/runner-bin/` - Loom runner and buildah binaries
93
100
- `/var/lib/containers` - Buildah storage (emptyDir)
94
94
-
- `/home/user/.docker/config.json` - Registry credentials (optional)
101
101
+
- `/etc/passwd` - User entry for UID 1000 (from setup-user)
102
102
+
- `/etc/group` - Group entry for GID 1000 (from setup-user)
103
103
+
- `/home/runner` - Home directory for runner user
104
104
+
- `/home/runner/.docker/config.json` - Registry credentials (optional)
95
105
96
106
### Buildah Configuration
97
107
···
222
232
### Current Security Posture
223
233
224
234
✅ **Implemented:**
225
225
-
- All containers run as UID 10000 (non-root)
235
235
+
- All containers run as UID 1000 (non-root), except setup-user init container
226
236
- `allowPrivilegeEscalation: false` on all containers
227
237
- All Linux capabilities dropped
228
238
- `seccompProfile: RuntimeDefault`
+2
-1
config/manager/manager.yaml
reviewed
···
111
111
readOnly: true
112
112
volumes:
113
113
- name: spindle-logs
114
114
-
emptyDir: {}
114
114
+
persistentVolumeClaim:
115
115
+
claimName: spindle-logs
115
116
- name: spindle-db
116
117
persistentVolumeClaim:
117
118
claimName: spindle-db
+12
config/manager/pvc.yaml
reviewed
···
9
9
resources:
10
10
requests:
11
11
storage: 1Gi
12
12
+
---
13
13
+
apiVersion: v1
14
14
+
kind: PersistentVolumeClaim
15
15
+
metadata:
16
16
+
name: spindle-logs
17
17
+
namespace: system
18
18
+
spec:
19
19
+
accessModes:
20
20
+
- ReadWriteOnce
21
21
+
resources:
22
22
+
requests:
23
23
+
storage: 5Gi
+51
-4
internal/controller/spindleset_controller.go
reviewed
···
19
19
import (
20
20
"context"
21
21
"fmt"
22
22
+
"strings"
22
23
"sync"
24
24
+
"time"
23
25
26
26
+
"github.com/cenkalti/backoff/v4"
24
27
"tangled.org/core/spindle"
25
28
"tangled.org/core/spindle/models"
26
29
···
103
106
}
104
107
105
108
// Ensure Jobs are created for workflows
109
109
+
var jobsErr error
106
110
if err := r.ensurePipelineJobs(ctx, spindleSet); err != nil {
107
111
logger.Error(err, "Failed to ensure pipeline Jobs")
108
108
-
return ctrl.Result{}, err
112
112
+
jobsErr = err
113
113
+
// Continue to update status even on error for better observability
109
114
}
110
115
111
116
// Monitor Job statuses
···
126
131
return ctrl.Result{}, err
127
132
}
128
133
134
134
+
// Return error from job creation if any (after status update)
135
135
+
if jobsErr != nil {
136
136
+
return ctrl.Result{}, jobsErr
137
137
+
}
138
138
+
129
139
// Requeue after 30 seconds to update status
130
140
return ctrl.Result{RequeueAfter: 30 * ctrl.Result{}.RequeueAfter}, nil
131
141
}
···
174
184
return ctrl.Result{}, nil
175
185
}
176
186
187
187
+
// isRetryableError returns true if the error is transient and should be retried
188
188
+
func isRetryableError(err error) bool {
189
189
+
if err == nil {
190
190
+
return false
191
191
+
}
192
192
+
// Check for common transient Kubernetes API errors
193
193
+
if apierrors.IsServerTimeout(err) ||
194
194
+
apierrors.IsServiceUnavailable(err) ||
195
195
+
apierrors.IsTooManyRequests(err) ||
196
196
+
apierrors.IsTimeout(err) {
197
197
+
return true
198
198
+
}
199
199
+
// Check error message for etcd-specific errors
200
200
+
errMsg := err.Error()
201
201
+
return strings.Contains(errMsg, "etcdserver:") ||
202
202
+
strings.Contains(errMsg, "context deadline exceeded")
203
203
+
}
204
204
+
205
205
+
// retryCreate wraps a create operation with exponential backoff for transient errors
206
206
+
func (r *SpindleSetReconciler) retryCreate(ctx context.Context, obj client.Object) error {
207
207
+
bo := backoff.NewExponentialBackOff()
208
208
+
bo.InitialInterval = 100 * time.Millisecond
209
209
+
bo.MaxInterval = 5 * time.Second
210
210
+
bo.MaxElapsedTime = 30 * time.Second
211
211
+
212
212
+
operation := func() error {
213
213
+
err := r.Create(ctx, obj)
214
214
+
if err != nil && !isRetryableError(err) {
215
215
+
// Non-retryable error, stop retrying immediately
216
216
+
return backoff.Permanent(err)
217
217
+
}
218
218
+
return err
219
219
+
}
220
220
+
221
221
+
return backoff.Retry(operation, backoff.WithContext(bo, ctx))
222
222
+
}
223
223
+
177
224
// updateStatus updates the SpindleSet status based on current Jobs
178
225
func (r *SpindleSetReconciler) updateStatus(ctx context.Context, spindleSet *loomv1alpha1.SpindleSet) error {
179
226
logger := log.FromContext(ctx)
···
333
380
334
381
if err != nil {
335
382
if apierrors.IsNotFound(err) {
336
336
-
// Create the secret
383
383
+
// Create the secret with retry for transient errors
337
384
logger.Info("Creating Kubernetes Secret for repository secrets", "secret", secretName, "count", len(pipelineRun.Secrets))
338
338
-
if err := r.Create(ctx, secret); err != nil {
385
385
+
if err := r.retryCreate(ctx, secret); err != nil {
339
386
return fmt.Errorf("failed to create secret: %w", err)
340
387
}
341
388
} else {
···
414
461
}
415
462
416
463
logger.Info("Creating Job for workflow", "workflow", workflowSpec.Name, "job", job.Name)
417
417
-
if err := r.Create(ctx, job); err != nil {
464
464
+
if err := r.retryCreate(ctx, job); err != nil {
418
465
if apierrors.IsAlreadyExists(err) {
419
466
// Job already exists (possibly from previous deployment), skip
420
467
logger.Info("Job already exists, skipping creation", "workflow", workflowSpec.Name, "job", job.Name)
+42
-6
internal/engine/kubernetes_engine.go
reviewed
···
1
1
package engine
2
2
3
3
import (
4
4
-
"maps"
5
4
"bufio"
6
5
"context"
7
6
"encoding/json"
8
7
"fmt"
9
8
"io"
9
9
+
"maps"
10
10
"strings"
11
11
"sync"
12
12
"time"
···
259
259
return 1 * time.Hour
260
260
}
261
261
262
262
+
// getSpindleSet returns the SpindleSet for the given WorkflowId, checking cache first then Kubernetes.
263
263
+
// This allows the engine to recover state after a restart by querying existing resources.
264
264
+
func (e *KubernetesEngine) getSpindleSet(ctx context.Context, wid models.WorkflowId) (*loomv1alpha1.SpindleSet, error) {
265
265
+
// Check cache first
266
266
+
if ss, exists := e.spindleSets[wid.String()]; exists {
267
267
+
return ss, nil
268
268
+
}
269
269
+
270
270
+
// Cache miss - query Kubernetes (handles restart recovery)
271
271
+
spindleSetList := &loomv1alpha1.SpindleSetList{}
272
272
+
if err := e.client.List(ctx, spindleSetList,
273
273
+
client.InNamespace(e.namespace),
274
274
+
client.MatchingLabels{
275
275
+
"loom.j5t.io/pipeline-id": wid.PipelineId.Rkey,
276
276
+
"loom.j5t.io/workflow": wid.Name,
277
277
+
}); err != nil {
278
278
+
return nil, fmt.Errorf("failed to query SpindleSet: %w", err)
279
279
+
}
280
280
+
281
281
+
if len(spindleSetList.Items) == 0 {
282
282
+
return nil, nil // Not found
283
283
+
}
284
284
+
285
285
+
// Cache for future lookups
286
286
+
ss := &spindleSetList.Items[0]
287
287
+
e.spindleSets[wid.String()] = ss
288
288
+
return ss, nil
289
289
+
}
290
290
+
262
291
// DestroyWorkflow cleans up the SpindleSet after completion.
263
292
func (e *KubernetesEngine) DestroyWorkflow(ctx context.Context, wid models.WorkflowId) error {
264
293
logger := log.FromContext(ctx).WithValues("workflow", wid.Name, "pipeline", wid.PipelineId.Rkey)
265
294
266
266
-
spindleSet, exists := e.spindleSets[wid.String()]
267
267
-
if !exists {
295
295
+
spindleSet, err := e.getSpindleSet(ctx, wid)
296
296
+
if err != nil {
297
297
+
logger.Error(err, "Failed to lookup SpindleSet")
298
298
+
return nil // Don't fail cleanup on lookup error
299
299
+
}
300
300
+
if spindleSet == nil {
268
301
logger.Info("No SpindleSet found to destroy")
269
302
return nil
270
303
}
···
311
344
// Query for the Job created by SpindleSetReconciler (only on first step)
312
345
var job *batchv1.Job
313
346
if idx == 0 {
314
314
-
spindleSet, exists := e.spindleSets[wid.String()]
315
315
-
if !exists {
316
316
-
return fmt.Errorf("no SpindleSet found for workflow")
347
347
+
spindleSet, err := e.getSpindleSet(ctx, wid)
348
348
+
if err != nil {
349
349
+
return err
350
350
+
}
351
351
+
if spindleSet == nil {
352
352
+
return fmt.Errorf("no SpindleSet found for workflow %s", wid.String())
317
353
}
318
354
319
355
// Wait for Job to be created by controller
+78
-9
internal/jobbuilder/job_template.go
reviewed
···
192
192
RestartPolicy: corev1.RestartPolicyNever,
193
193
SecurityContext: &corev1.PodSecurityContext{
194
194
RunAsNonRoot: &[]bool{true}[0],
195
195
-
RunAsUser: &[]int64{10000}[0],
196
196
-
FSGroup: &[]int64{10000}[0],
195
195
+
RunAsUser: &[]int64{1000}[0],
196
196
+
FSGroup: &[]int64{1000}[0],
197
197
// Note: User namespaces (hostUsers: false) for enhanced buildah rootless
198
198
// operation requires Kubernetes 1.33+ and is not yet available in the
199
199
// current API version. Buildah will still work in rootless mode without it.
···
204
204
// Disable ServiceAccount token mounting for security
205
205
AutomountServiceAccountToken: &[]bool{false}[0],
206
206
207
207
-
// Init containers: install runner binary, configure buildah, then clone repository
207
207
+
// Init containers: setup user, install runner binary, configure buildah, then clone repository
208
208
InitContainers: []corev1.Container{
209
209
+
// Setup user creates /etc/passwd and /etc/group entries for UID 1000
210
210
+
// This is needed because many tools (like buildah) require a valid passwd entry
211
211
+
{
212
212
+
Name: "setup-user",
213
213
+
Image: "busybox:latest",
214
214
+
Command: []string{"/bin/sh", "-c"},
215
215
+
Args: []string{`
216
216
+
cat > /etc-override/passwd <<'EOF'
217
217
+
root:x:0:0:root:/root:/bin/bash
218
218
+
nobody:x:65534:65534:nobody:/nonexistent:/usr/sbin/nologin
219
219
+
runner:x:1000:1000:runner:/home/runner:/bin/sh
220
220
+
EOF
221
221
+
cat > /etc-override/group <<'EOF'
222
222
+
root:x:0:
223
223
+
nobody:x:65534:
224
224
+
runner:x:1000:
225
225
+
EOF
226
226
+
mkdir -p /home-override/runner
227
227
+
echo "User setup complete"
228
228
+
`},
229
229
+
SecurityContext: &corev1.SecurityContext{
230
230
+
AllowPrivilegeEscalation: &[]bool{false}[0],
231
231
+
// Note: This init container runs as root to create the passwd/group files
232
232
+
// All subsequent containers run as UID 1000 (non-root)
233
233
+
RunAsUser: &[]int64{0}[0],
234
234
+
Capabilities: &corev1.Capabilities{
235
235
+
Drop: []corev1.Capability{"ALL"},
236
236
+
},
237
237
+
},
238
238
+
VolumeMounts: []corev1.VolumeMount{
239
239
+
{
240
240
+
Name: "etc-override",
241
241
+
MountPath: "/etc-override",
242
242
+
},
243
243
+
{
244
244
+
Name: "home-override",
245
245
+
MountPath: "/home-override",
246
246
+
},
247
247
+
},
248
248
+
},
209
249
{
210
250
Name: "install-runner",
211
251
Image: "atcr.io/evan.jarrett.net/loom-runner:latest",
···
213
253
SecurityContext: &corev1.SecurityContext{
214
254
AllowPrivilegeEscalation: &[]bool{false}[0],
215
255
RunAsNonRoot: &[]bool{true}[0],
216
216
-
RunAsUser: &[]int64{10000}[0],
256
256
+
RunAsUser: &[]int64{1000}[0],
217
257
ReadOnlyRootFilesystem: &[]bool{true}[0],
218
258
Capabilities: &corev1.Capabilities{
219
259
Drop: []corev1.Capability{"ALL"},
···
255
295
SecurityContext: &corev1.SecurityContext{
256
296
AllowPrivilegeEscalation: &[]bool{false}[0],
257
297
RunAsNonRoot: &[]bool{true}[0],
258
258
-
RunAsUser: &[]int64{10000}[0],
298
298
+
RunAsUser: &[]int64{1000}[0],
259
299
Capabilities: &corev1.Capabilities{
260
300
Drop: []corev1.Capability{"ALL"},
261
301
},
···
290
330
SecurityContext: &corev1.SecurityContext{
291
331
AllowPrivilegeEscalation: &[]bool{false}[0],
292
332
RunAsNonRoot: &[]bool{true}[0],
293
293
-
RunAsUser: &[]int64{10000}[0],
333
333
+
RunAsUser: &[]int64{1000}[0],
294
334
// Note: ReadOnlyRootFilesystem is NOT set for the runner container
295
335
// because user-defined images may need to write to various locations
296
336
// (e.g., /go/pkg, ~/.cache, /var/tmp) that we can't predict or mount
···
375
415
376
416
// Build the shell script from clone commands
377
417
// Add set -e for error handling, safe.directory config to handle ownership mismatch
378
378
-
// (emptyDir volumes are root-owned but we run as user 10000)
418
418
+
// (emptyDir volumes are root-owned but we run as user 1000)
379
419
script := "set -e\n" +
380
420
"git config --global init.defaultBranch main\n" +
381
421
"git config --global advice.detachedHead false\n" +
···
397
437
SecurityContext: &corev1.SecurityContext{
398
438
AllowPrivilegeEscalation: &[]bool{false}[0],
399
439
RunAsNonRoot: &[]bool{true}[0],
400
400
-
RunAsUser: &[]int64{10000}[0],
440
440
+
RunAsUser: &[]int64{1000}[0],
401
441
ReadOnlyRootFilesystem: &[]bool{true}[0],
402
442
Capabilities: &corev1.Capabilities{
403
443
Drop: []corev1.Capability{"ALL"},
···
436
476
Name: "buildah-storage",
437
477
MountPath: "/var/lib/containers",
438
478
},
479
479
+
// Mount passwd/group files created by setup-user init container
480
480
+
// This ensures UID 1000 is recognized by tools like buildah
481
481
+
{
482
482
+
Name: "etc-override",
483
483
+
MountPath: "/etc/passwd",
484
484
+
SubPath: "passwd",
485
485
+
},
486
486
+
{
487
487
+
Name: "etc-override",
488
488
+
MountPath: "/etc/group",
489
489
+
SubPath: "group",
490
490
+
},
491
491
+
{
492
492
+
Name: "home-override",
493
493
+
MountPath: "/home/runner",
494
494
+
SubPath: "runner",
495
495
+
},
439
496
}
440
497
441
498
// Mount registry credentials if specified
442
499
if config.Template.RegistryCredentialsSecret != "" {
443
500
mounts = append(mounts, corev1.VolumeMount{
444
501
Name: "registry-credentials",
445
445
-
MountPath: "/home/user/.docker",
502
502
+
MountPath: "/home/runner/.docker",
446
503
ReadOnly: true,
447
504
})
448
505
}
···
473
530
},
474
531
{
475
532
Name: "buildah-storage",
533
533
+
VolumeSource: corev1.VolumeSource{
534
534
+
EmptyDir: &corev1.EmptyDirVolumeSource{},
535
535
+
},
536
536
+
},
537
537
+
{
538
538
+
Name: "etc-override",
539
539
+
VolumeSource: corev1.VolumeSource{
540
540
+
EmptyDir: &corev1.EmptyDirVolumeSource{},
541
541
+
},
542
542
+
},
543
543
+
{
544
544
+
Name: "home-override",
476
545
VolumeSource: corev1.VolumeSource{
477
546
EmptyDir: &corev1.EmptyDirVolumeSource{},
478
547
},
+1
-1
internal/jobbuilder/job_template_test.go
reviewed
···
5
5
"testing"
6
6
7
7
corev1 "k8s.io/api/core/v1"
8
8
-
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
9
8
"k8s.io/apimachinery/pkg/api/resource"
9
9
+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
10
10
11
11
loomv1alpha1 "tangled.org/evan.jarrett.net/loom/api/v1alpha1"
12
12
)