From 80fb911e227fd3673b43c5bb013fc46d21bc74de Mon Sep 17 00:00:00 2001 From: "Billy D." Date: Thu, 5 Feb 2026 16:13:53 -0500 Subject: [PATCH] updating to match everything in my homelab. --- decisions/0030-mfa-yubikey-strategy.md | 4 +- decisions/0031-gitea-cicd-strategy.md | 301 ++++++++++++++++++ decisions/0032-velero-backup-strategy.md | 180 +++++++++++ decisions/0033-data-analytics-platform.md | 267 ++++++++++++++++ decisions/0034-volcano-batch-scheduling.md | 206 ++++++++++++ decisions/0035-arm64-worker-strategy.md | 195 ++++++++++++ decisions/0036-renovate-dependency-updates.md | 256 +++++++++++++++ decisions/0037-node-naming-conventions.md | 187 +++++++++++ diagrams/README.md | 31 +- diagrams/analytics-lakehouse.mmd | 85 +++++ diagrams/authentik-sso.mmd | 84 +++++ diagrams/cluster-topology.mmd | 66 ++++ diagrams/database-strategy.mmd | 96 ++++++ diagrams/dual-workflow-engines.mmd | 73 +++++ diagrams/gitops-flux.mmd | 57 ++++ diagrams/handler-deployment.mmd | 67 ++++ diagrams/internal-registry.mmd | 53 +++ diagrams/kuberay-unified-backend.mmd | 77 +++++ diagrams/node-naming.mmd | 64 ++++ diagrams/notification-architecture.mmd | 63 ++++ diagrams/ntfy-discord-bridge.mmd | 45 +++ diagrams/observability-stack.mmd | 72 +++++ diagrams/ray-repository-structure.mmd | 66 ++++ diagrams/renovate-workflow.mmd | 86 +++++ diagrams/secrets-management.mmd | 51 +++ diagrams/security-policy-enforcement.mmd | 81 +++++ diagrams/storage-strategy.mmd | 67 ++++ diagrams/user-registration-workflow.mmd | 93 ++++++ diagrams/velero-backup.mmd | 60 ++++ diagrams/volcano-scheduling.mmd | 81 +++++ 30 files changed, 3107 insertions(+), 7 deletions(-) create mode 100644 decisions/0031-gitea-cicd-strategy.md create mode 100644 decisions/0032-velero-backup-strategy.md create mode 100644 decisions/0033-data-analytics-platform.md create mode 100644 decisions/0034-volcano-batch-scheduling.md create mode 100644 decisions/0035-arm64-worker-strategy.md create mode 100644 
decisions/0036-renovate-dependency-updates.md create mode 100644 decisions/0037-node-naming-conventions.md create mode 100644 diagrams/analytics-lakehouse.mmd create mode 100644 diagrams/authentik-sso.mmd create mode 100644 diagrams/cluster-topology.mmd create mode 100644 diagrams/database-strategy.mmd create mode 100644 diagrams/dual-workflow-engines.mmd create mode 100644 diagrams/gitops-flux.mmd create mode 100644 diagrams/handler-deployment.mmd create mode 100644 diagrams/internal-registry.mmd create mode 100644 diagrams/kuberay-unified-backend.mmd create mode 100644 diagrams/node-naming.mmd create mode 100644 diagrams/notification-architecture.mmd create mode 100644 diagrams/ntfy-discord-bridge.mmd create mode 100644 diagrams/observability-stack.mmd create mode 100644 diagrams/ray-repository-structure.mmd create mode 100644 diagrams/renovate-workflow.mmd create mode 100644 diagrams/secrets-management.mmd create mode 100644 diagrams/security-policy-enforcement.mmd create mode 100644 diagrams/storage-strategy.mmd create mode 100644 diagrams/user-registration-workflow.mmd create mode 100644 diagrams/velero-backup.mmd create mode 100644 diagrams/volcano-scheduling.mmd diff --git a/decisions/0030-mfa-yubikey-strategy.md b/decisions/0030-mfa-yubikey-strategy.md index d5dd2a1..f14fd08 100644 --- a/decisions/0030-mfa-yubikey-strategy.md +++ b/decisions/0030-mfa-yubikey-strategy.md @@ -52,8 +52,8 @@ WebAuthn provides the best security (phishing-resistant) and user experience (to | Application | WebAuthn Support | Current Status | Action Required | |-------------|------------------|----------------|-----------------| -| Authentik | ✅ Native | ✅ Working | Configure enforcement policies | -| Vaultwarden | ✅ Native | ⚠️ Partial | Enable in admin settings | +| Authentik | ✅ Native | ⚠️ In Progress | Configure enforcement policies | +| Vaultwarden | ✅ Native | ✅ Implemented | None - WebAuthn enrolled | ## Authentik Configuration diff --git 
a/decisions/0031-gitea-cicd-strategy.md b/decisions/0031-gitea-cicd-strategy.md new file mode 100644 index 0000000..4d5dec3 --- /dev/null +++ b/decisions/0031-gitea-cicd-strategy.md @@ -0,0 +1,301 @@ +# Gitea CI/CD Pipeline Strategy + +* Status: accepted +* Date: 2026-02-04 +* Deciders: Billy +* Technical Story: Establish CI/CD patterns for building and publishing container images via Gitea Actions + +## Context and Problem Statement + +The homelab uses Gitea as the Git hosting platform. Applications need automated CI/CD pipelines to build container images, run tests, and publish artifacts. Gitea Actions provides GitHub Actions-compatible workflow execution. + +How do we configure CI/CD pipelines that work reliably with the homelab's self-hosted infrastructure including private container registry, rootless Docker-in-Docker runners, and internal services? + +## Decision Drivers + +* Self-hosted - no external CI/CD dependencies +* Container registry integration - push to Gitea's built-in registry +* Rootless security - runners don't require privileged containers +* Internal networking - leverage cluster service discovery +* Semantic versioning - automated version bumps based on commit messages + +## Considered Options + +1. **Gitea Actions with rootless DinD runners** +2. **External CI/CD (GitHub Actions, GitLab CI)** +3. **Self-hosted Jenkins/Drone** +4. **Tekton Pipelines** + +## Decision Outcome + +Chosen option: **Option 1 - Gitea Actions with rootless DinD runners** + +Gitea Actions provides GitHub Actions compatibility, runs inside the cluster with access to internal services, and supports rootless Docker-in-Docker for secure container builds. 
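For orientation before the detailed sections below, a minimal Gitea Actions workflow wiring checkout, buildx, and a registry push together might look like this sketch (repository path, image name, and branch filter are illustrative; the production template is linked later in this ADR):

```yaml
# .gitea/workflows/build.yaml — hedged sketch, not the production template
name: build-image
on:
  push:
    branches: [main]

jobs:
  build:
    runs-on: ubuntu-latest
    env:
      REGISTRY: gitea-http.gitea.svc.cluster.local:3000/daviestechlabs
    steps:
      - uses: actions/checkout@v4
      - uses: docker/setup-buildx-action@v3
        with:
          buildkitd-config-inline: |
            [registry."gitea-http.gitea.svc.cluster.local:3000"]
              http = true
              insecure = true
      - name: Build and push
        uses: docker/build-push-action@v5
        with:
          context: .
          push: true
          tags: ${{ env.REGISTRY }}/example-app:latest  # image name illustrative
```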
+ +### Positive Consequences + +* GitHub Actions syntax familiarity +* In-cluster access to internal services +* Built-in container registry integration +* No external dependencies +* Rootless execution for security + +### Negative Consequences + +* Some GitHub Actions may not work (org-specific actions) +* Rootless DinD has some limitations +* Self-hosted maintenance burden + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ Developer Push │ +└──────────────────────────────────┬──────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ Gitea Server │ +│ (git.daviestechlabs.io) │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ Actions Trigger │ │ +│ │ • Push to main branch │ │ +│ │ • Pull request │ │ +│ │ • Tag creation │ │ +│ │ • workflow_dispatch │ │ +│ └─────────────────────────────────────────────────────────────────────┘ │ +└──────────────────────────────────┬──────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ Gitea Actions Runner │ +│ (rootless Docker-in-Docker) │ +│ │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ +│ │ Checkout │───▶│ Buildx │───▶│ Push │ │ +│ │ │ │ Build │ │ Registry │ │ +│ └─────────────┘ └─────────────┘ └──────┬──────┘ │ +│ │ │ +└───────────────────────────────────────────────┼─────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ Gitea Container Registry │ +│ (gitea-http.gitea.svc.cluster.local:3000) │ +│ │ +│ Images: │ +│ • daviestechlabs/ray-worker-nvidia:v1.0.1 │ +│ • daviestechlabs/ray-worker-rdna2:v1.0.1 │ +│ • daviestechlabs/ray-worker-strixhalo:v1.0.1 │ +│ • daviestechlabs/ray-worker-intel:v1.0.1 │ +│ • daviestechlabs/ntfy-discord:latest │ 
+└─────────────────────────────────────────────────────────────────────────────┘ +``` + +## Runner Configuration + +### Rootless Docker-in-Docker + +The runner uses rootless Docker for security: + +```yaml +# Runner deployment uses rootless DinD +# No privileged containers required +# No sudo access in workflows +``` + +### Runner Registration + +Runners must be registered with **project-scoped tokens**, not instance tokens: + +1. Go to **Repository → Settings → Actions → Runners** +2. Create new runner with project token +3. Use token for runner registration + +**Common mistake:** Using instance-level token causes jobs not to be picked up. + +## Registry Authentication + +### Internal HTTP Endpoint + +Use internal cluster DNS for registry access. This avoids: +- Cloudflare tunnel 100MB upload limit +- TLS certificate issues +- External network latency + +```yaml +env: + REGISTRY: gitea-http.gitea.svc.cluster.local:3000/daviestechlabs + REGISTRY_HOST: gitea-http.gitea.svc.cluster.local:3000 +``` + +### Buildx Configuration + +Configure buildx to use HTTP for internal registry: + +```yaml +- name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + buildkitd-config-inline: | + [registry."gitea-http.gitea.svc.cluster.local:3000"] + http = true + insecure = true +``` + +### Credential Configuration + +For rootless DinD, create docker config directly (no `docker login` - it defaults to HTTPS): + +```yaml +- name: Configure Gitea Registry Auth + if: github.event_name != 'pull_request' + run: | + AUTH=$(echo -n "${{ secrets.REGISTRY_USER }}:${{ secrets.REGISTRY_TOKEN }}" | base64 -w0) + mkdir -p ~/.docker + cat > ~/.docker/config.json << EOF + { + "auths": { + "${{ env.REGISTRY_HOST }}": { + "auth": "$AUTH" + } + } + } + EOF +``` + +**Important:** Buildx reads `~/.docker/config.json` for authentication during push. Do NOT use `docker login` for HTTP registries as it defaults to HTTPS. 
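Because a malformed `auth` value fails only at push time with an opaque 401, it is worth sanity-checking the encoding locally before wiring it into CI. A sketch using placeholder credentials (nothing below is a real secret):

```bash
# Placeholders for illustration only — substitute real secrets in CI.
REGISTRY_USER="ci-bot"
REGISTRY_TOKEN="example-token"
REGISTRY_HOST="gitea-http.gitea.svc.cluster.local:3000"

# Build the auth entry the same way the workflow step does.
# Note: -w0 (no line wrapping) is GNU coreutils; macOS base64 differs.
AUTH=$(printf '%s:%s' "$REGISTRY_USER" "$REGISTRY_TOKEN" | base64 -w0)

# Write a throwaway config and confirm the value round-trips to user:token.
mkdir -p /tmp/docker-auth-check
cat > /tmp/docker-auth-check/config.json <<EOF
{
  "auths": {
    "$REGISTRY_HOST": { "auth": "$AUTH" }
  }
}
EOF
printf '%s' "$AUTH" | base64 -d   # prints ci-bot:example-token
```

Using `printf` instead of `echo -n` avoids shells where `echo -n` emits a literal `-n`, which silently corrupts the encoded credential.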
+ +### Required Secrets + +Configure in **Repository → Settings → Actions → Secrets**: + +| Secret | Purpose | +|--------|---------| +| `REGISTRY_USER` | Gitea username with package write access | +| `REGISTRY_TOKEN` | Gitea access token with `write:package` scope | +| `DOCKERHUB_TOKEN` | (Optional) Docker Hub token for rate limit bypass | + +## Semantic Versioning + +### Commit Message Conventions + +Version bumps are determined from commit message prefixes: + +| Prefix | Bump Type | Example | +|--------|-----------|---------| +| `major:` or `BREAKING CHANGE` | Major (x.0.0) | `major: Remove deprecated API` | +| `minor:`, `feat:`, `feature:` | Minor (0.x.0) | `feat: Add new endpoint` | +| (anything else) | Patch (0.0.x) | `fix: Correct typo` | + +### Version Calculation + +```yaml +- name: Calculate semantic version + id: version + run: | + LATEST=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0") + VERSION=${LATEST#v} + IFS='.' read -r MAJOR MINOR PATCH <<< "$VERSION" + + MSG="${{ github.event.head_commit.message }}" + if echo "$MSG" | grep -qiE "^major:|BREAKING CHANGE"; then + MAJOR=$((MAJOR + 1)); MINOR=0; PATCH=0 + elif echo "$MSG" | grep -qiE "^(minor:|feat:|feature:)"; then + MINOR=$((MINOR + 1)); PATCH=0 + else + PATCH=$((PATCH + 1)) + fi + + echo "version=v${MAJOR}.${MINOR}.${PATCH}" >> $GITHUB_OUTPUT +``` + +### Automatic Tagging + +After successful builds, create and push a git tag: + +```yaml +- name: Create and push tag + run: | + git config user.name "gitea-actions[bot]" + git config user.email "actions@git.daviestechlabs.io" + git tag -a "$VERSION" -m "Release $VERSION ($BUMP)" + git push origin "$VERSION" +``` + +## Notifications + +### ntfy Integration + +Send build status to ntfy for notifications: + +```yaml +- name: Notify on success + run: | + curl -s \ + -H "Title: ✅ Images Built: ${{ gitea.repository }}" \ + -H "Priority: default" \ + -H "Tags: white_check_mark,docker" \ + -d "Version: ${{ needs.determine-version.outputs.version 
}}" \ + http://ntfy.observability.svc.cluster.local:80/gitea-ci +``` + +## Skip Patterns + +### Commit Message Skip Flags + +| Flag | Effect | +|------|--------| +| `[skip images]` | Skip all image builds | +| `[ray-serve only]` | Skip worker images | +| `[skip ci]` | Skip entire workflow | + +### Path-based Triggers + +Only run on relevant file changes: + +```yaml +on: + push: + paths: + - 'dockerfiles/**' + - '.gitea/workflows/build-push.yaml' +``` + +## Troubleshooting + +### Common Issues + +| Issue | Cause | Solution | +|-------|-------|----------| +| Jobs not picked up | Instance token instead of project token | Re-register with project-scoped token | +| 401 Unauthorized | Missing or wrong registry credentials | Check REGISTRY_USER and REGISTRY_TOKEN secrets | +| "http: server gave HTTP response to HTTPS client" | Using `docker login` with HTTP registry | Create config.json directly, don't use docker login | +| Cloudflare 100MB upload limit | Using external endpoint for large images | Use internal HTTP endpoint | +| TLS certificate error | Using HTTPS with self-signed cert | Use internal HTTP endpoint with buildkitd http=true | +| sudo not found | Rootless DinD has no sudo | Use user-space configuration methods | +| "must contain at least one job without dependencies" | All jobs have `needs` | Ensure at least one job has no `needs` clause | + +### Debugging + +1. Check runner logs in Gitea Actions UI +2. Add debug output: `echo "::debug::Variable=$VAR"` +3. Use `actions/debug-output` step for verbose logging + +## Workflow Template + +See [kuberay-images/.gitea/workflows/build-push.yaml](https://git.daviestechlabs.io/daviestechlabs/kuberay-images/src/branch/main/.gitea/workflows/build-push.yaml) for complete example. + +## Future Enhancements + +1. **Caching improvements** - Persistent layer cache across builds +2. **Multi-arch builds** - ARM64 support for Raspberry Pi +3. **Security scanning** - Trivy integration in CI +4. 
**Signed images** - Cosign for image signatures +5. **SLSA provenance** - Supply chain attestations + +## References + +* [Gitea Actions Documentation](https://docs.gitea.com/usage/actions/overview) +* [Docker Buildx Documentation](https://docs.docker.com/build/buildx/) +* [Semantic Versioning](https://semver.org/) diff --git a/decisions/0032-velero-backup-strategy.md b/decisions/0032-velero-backup-strategy.md new file mode 100644 index 0000000..338dfae --- /dev/null +++ b/decisions/0032-velero-backup-strategy.md @@ -0,0 +1,180 @@ +# Velero Backup and Disaster Recovery Strategy + +* Status: accepted +* Date: 2026-02-05 +* Deciders: Billy +* Technical Story: Establish cluster backup and disaster recovery capabilities + +## Context and Problem Statement + +A homelab running critical workloads (AI/ML pipelines, databases, productivity apps) needs protection against data loss from hardware failures, misconfigurations, or disasters. Kubernetes resources and persistent data must be recoverable. + +How do we implement backup and disaster recovery for the homelab cluster? + +## Decision Drivers + +* Full cluster state backup - resources, secrets, PVCs +* Application-consistent backups for databases +* S3-compatible storage for off-cluster backups +* Scheduled automated backups +* Selective restore capability +* GitOps compatibility + +## Considered Options + +1. **Velero with Node Agent (Kopia)** +2. **Kasten K10** +3. **Longhorn snapshots only** +4. **etcd snapshots + manual PVC backups** + +## Decision Outcome + +Chosen option: **Option 1 - Velero with Node Agent (Kopia)** + +Velero provides comprehensive Kubernetes backup/restore with file-level PVC backups via the Node Agent (formerly Restic, now Kopia). Backups are stored on the external NAS via S3-compatible storage. 
+ +### Positive Consequences + +* Full cluster state captured (deployments, secrets, configmaps) +* PVC data backed up via file-level snapshots +* S3 backend on NAS for off-cluster storage +* Scheduled daily backups with retention +* Selective namespace/label restore +* Active CNCF project with strong community + +### Negative Consequences + +* Node Agent runs as DaemonSet (14 pods on current cluster) +* File-level backup slower than volume snapshots +* Full cluster restore requires careful ordering +* Some CRDs may need special handling + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Velero Server │ +│ (velero namespace) │ +└────────────────────────────┬────────────────────────────────────┘ + │ + ┌──────────────┼──────────────┐ + │ │ │ + ▼ ▼ ▼ + ┌───────────┐ ┌───────────┐ ┌───────────┐ + │ Node │ │ Node │ │ Node │ + │ Agent │ │ Agent │ │ Agent │ + │ (per node)│ │ (per node)│ │ (per node)│ + └─────┬─────┘ └─────┬─────┘ └─────┬─────┘ + │ │ │ + └──────────────┼──────────────┘ + │ + ▼ + ┌───────────────────────────┐ + │ BackupStorageLocation │ + │ (S3 on NAS - candlekeep) │ + │ /backups/velero │ + └───────────────────────────┘ +``` + +## Configuration + +### Schedule + +```yaml +apiVersion: velero.io/v1 +kind: Schedule +metadata: + name: nightly-cluster-backup + namespace: velero +spec: + schedule: "0 2 * * *" # 2 AM daily + template: + includedNamespaces: + - "*" + excludedNamespaces: + - kube-system + - kube-node-lease + - kube-public + includedResources: + - "*" + excludeNodeAgent: false + defaultVolumesToFsBackup: true + ttl: 720h # 30 days retention +``` + +### Backup Storage Location + +```yaml +apiVersion: velero.io/v1 +kind: BackupStorageLocation +metadata: + name: default + namespace: velero +spec: + provider: aws + objectStorage: + bucket: velero + config: + region: us-east-1 + s3ForcePathStyle: "true" + s3Url: http://candlekeep.lab.daviestechlabs.io:9000 +``` + +## Backup Scope + +### Included + +| Category 
| Examples | Backup Method | +|----------|----------|---------------| +| Kubernetes resources | Deployments, Services, ConfigMaps | Velero native | +| Secrets | Vault-synced, SOPS-decrypted | Velero native | +| Persistent Volumes | Database data, user files | Node Agent (Kopia) | +| CRDs | CNPG Clusters, RayServices, HelmReleases | Velero native | + +### Excluded + +| Category | Reason | +|----------|--------| +| kube-system | Rebuilt from Talos config | +| flux-system | Rebuilt from Git (GitOps) | +| Node-local data | Ephemeral, not critical | + +## Recovery Procedures + +### Full Cluster Recovery + +1. Bootstrap new Talos cluster +2. Install Velero with same BSL configuration +3. `velero restore create --from-backup nightly-cluster-backup-YYYYMMDD` +4. Re-bootstrap Flux for GitOps reconciliation + +### Selective Namespace Recovery + +```bash +velero restore create \ + --from-backup nightly-cluster-backup-20260205020000 \ + --include-namespaces ai-ml \ + --restore-pvs +``` + +### Database Recovery (CNPG) + +For CNPG clusters, prefer CNPG's native PITR: +```bash +# CNPG handles its own WAL archiving to S3 +# Velero provides secondary backup layer +``` + +## Monitoring + +| Metric | Alert Threshold | +|--------|-----------------| +| `velero_backup_success_total` | No increase in 25h | +| `velero_backup_failure_total` | Any increase | +| Backup duration | > 4 hours | + +## Links + +* [Velero Documentation](https://velero.io/docs/) +* [Node Agent (Kopia) Integration](https://velero.io/docs/main/file-system-backup/) +* Related: [ADR-0026](0026-storage-strategy.md) - Storage Strategy diff --git a/decisions/0033-data-analytics-platform.md b/decisions/0033-data-analytics-platform.md new file mode 100644 index 0000000..a4418fe --- /dev/null +++ b/decisions/0033-data-analytics-platform.md @@ -0,0 +1,267 @@ +# Data Analytics Platform Architecture + +* Status: accepted +* Date: 2026-02-05 +* Deciders: Billy +* Technical Story: Build a modern lakehouse architecture for HTTP 
analytics and ML feature engineering + +## Context and Problem Statement + +The homelab generates significant telemetry data from HTTP traffic (via Envoy Gateway), application logs, and ML inference metrics. This data is valuable for: +- Traffic pattern analysis +- Security anomaly detection +- ML feature engineering +- Cost optimization insights + +How do we build a scalable analytics platform that supports both batch and real-time processing? + +## Decision Drivers + +* Modern lakehouse architecture (SQL + streaming) +* Real-time and batch processing capabilities +* Cost-effective on homelab hardware +* Integration with existing observability stack +* Support for ML feature pipelines +* Open table formats for interoperability + +## Considered Options + +1. **Lakehouse: Nessie + Spark + Flink + Trino + RisingWave** +2. **Traditional DWH: ClickHouse only** +3. **Cloud-native: Databricks/Snowflake (SaaS)** +4. **Minimal: PostgreSQL with TimescaleDB** + +## Decision Outcome + +Chosen option: **Option 1 - Modern Lakehouse Architecture** + +A full lakehouse stack with Apache Iceberg tables (via Nessie catalog), Spark for batch ETL, Flink for streaming, Trino for interactive queries, and RisingWave for streaming SQL. 
+ +### Positive Consequences + +* Unified batch and streaming on same data +* Git-like versioning of tables via Nessie +* Standard SQL across all engines +* Decoupled compute and storage +* Open formats prevent vendor lock-in +* ML feature engineering support + +### Negative Consequences + +* Complex multi-component architecture +* Higher resource requirements +* Steeper learning curve +* Multiple operators to maintain + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ DATA SOURCES │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ Envoy Logs │ │ Application │ │ Inference │ │ Prometheus │ │ +│ │ (HTTPRoute) │ │ Telemetry │ │ Metrics │ │ Metrics │ │ +│ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │ +└─────────┼─────────────────┼─────────────────┼─────────────────┼─────────────┘ + │ │ │ │ + └─────────────────┼─────────────────┼─────────────────┘ + ▼ │ + ┌───────────────────────┐ │ + │ NATS JetStream │◄──────┘ + │ (Event Streaming) │ + └───────────┬───────────┘ + │ + ┌───────────────┼───────────────┐ + │ │ │ + ▼ ▼ ▼ +┌─────────────────┐ ┌───────────┐ ┌───────────────────┐ +│ Apache Flink │ │ RisingWave│ │ Apache Spark │ +│ (Streaming ETL)│ │ (Stream │ │ (Batch ETL) │ +│ │ │ SQL) │ │ │ +└────────┬────────┘ └─────┬─────┘ └─────────┬─────────┘ + │ │ │ + └────────────────┼─────────────────┘ + │ Write Iceberg Tables + ▼ + ┌───────────────────────┐ + │ Nessie │ + │ (Iceberg Catalog) │ + │ Git-like versioning │ + └───────────┬───────────┘ + │ + ▼ + ┌───────────────────────┐ + │ NFS Storage │ + │ (candlekeep:/lakehouse)│ + └───────────────────────┘ + │ + ▼ + ┌───────────────────────┐ + │ Trino │ + │ (Interactive Query) │ + │ + Grafana Dashboards │ + └───────────────────────┘ +``` + +## Component Details + +### Apache Nessie (Iceberg Catalog) + +**Purpose:** Git-like version control for data tables + +```yaml +# HelmRelease: nessie +# Version: 0.107.1 +spec: + 
versionStoreType: ROCKSDB # Embedded storage + catalog: + iceberg: + configDefaults: + warehouse: s3://lakehouse/ +``` + +**Features:** +- Branch/tag data versions +- Time travel queries +- Multi-table transactions +- Cross-engine compatibility + +### Apache Spark (Batch Processing) + +**Purpose:** Large-scale batch ETL and ML feature engineering + +```yaml +# SparkApplication for HTTPRoute analytics +apiVersion: sparkoperator.k8s.io/v1beta2 +kind: SparkApplication +spec: + type: Python + mode: cluster + sparkConf: + spark.sql.catalog.nessie: org.apache.iceberg.spark.SparkCatalog + spark.sql.catalog.nessie.catalog-impl: org.apache.iceberg.nessie.NessieCatalog + spark.sql.catalog.nessie.uri: http://nessie:19120/api/v1 +``` + +**Use Cases:** +- Daily HTTPRoute log aggregation +- Feature engineering for ML +- Historical data compaction + +### Apache Flink (Stream Processing) + +**Purpose:** Real-time event processing + +```yaml +# Flink Kubernetes Operator +# Version: 1.13.0 +spec: + job: + jarURI: local:///opt/flink/jobs/httproute-analytics.jar + parallelism: 2 +``` + +**Use Cases:** +- Real-time traffic anomaly detection +- Streaming ETL to Iceberg +- Session windowing for user analytics + +### RisingWave (Streaming SQL) + +**Purpose:** Simplified streaming SQL for real-time dashboards + +```sql +-- Materialized view for real-time traffic +CREATE MATERIALIZED VIEW traffic_5min AS +SELECT + window_start, + route_name, + COUNT(*) as request_count, + AVG(response_time_ms) as avg_latency +FROM httproute_events +GROUP BY + TUMBLE(event_time, INTERVAL '5 MINUTES'), + route_name; +``` + +**Use Cases:** +- Real-time Grafana dashboards +- Streaming aggregations +- Alerting triggers + +### Trino (Interactive Query) + +**Purpose:** Fast SQL queries across Iceberg tables + +```yaml +# Trino coordinator + 2 workers +catalogs: + iceberg: | + connector.name=iceberg + iceberg.catalog.type=nessie + iceberg.nessie.uri=http://nessie:19120/api/v1 +``` + +**Use Cases:** +- Ad-hoc 
analytics queries +- Grafana data source for dashboards +- Cross-table JOINs + +## Data Flow: HTTPRoute Analytics + +``` +Envoy Gateway + │ + ▼ (access logs via OTEL) +NATS JetStream + │ + ├─► Flink Job (streaming) + │ │ + │ ▼ + │ Iceberg Table: httproute_raw + │ + └─► Spark Job (nightly batch) + │ + ▼ + Iceberg Table: httproute_daily_agg + │ + ▼ + Trino ─► Grafana Dashboard +``` + +## Storage Layout + +``` +candlekeep:/kubernetes/lakehouse/ +├── warehouse/ +│ └── analytics/ +│ ├── httproute_raw/ # Raw events (partitioned by date) +│ ├── httproute_daily_agg/ # Daily aggregates +│ ├── inference_metrics/ # ML inference stats +│ └── feature_store/ # ML features +└── checkpoints/ + ├── flink/ # Flink savepoints + └── spark/ # Spark checkpoints +``` + +## Resource Allocation + +| Component | Replicas | CPU | Memory | +|-----------|----------|-----|--------| +| Nessie | 1 | 0.5 | 512Mi | +| Spark Operator | 1 | 0.2 | 256Mi | +| Flink Operator | 1 | 0.2 | 256Mi | +| Flink JobManager | 1 | 1 | 2Gi | +| Flink TaskManager | 2 | 2 | 4Gi | +| RisingWave | 1 | 2 | 4Gi | +| Trino Coordinator | 1 | 1 | 2Gi | +| Trino Worker | 2 | 2 | 4Gi | + +## Links + +* [Apache Iceberg](https://iceberg.apache.org/) +* [Project Nessie](https://projectnessie.org/) +* [Apache Flink](https://flink.apache.org/) +* [RisingWave](https://risingwave.com/) +* [Trino](https://trino.io/) +* Related: [ADR-0025](0025-observability-stack.md) - Observability Stack diff --git a/decisions/0034-volcano-batch-scheduling.md b/decisions/0034-volcano-batch-scheduling.md new file mode 100644 index 0000000..ab3dbd1 --- /dev/null +++ b/decisions/0034-volcano-batch-scheduling.md @@ -0,0 +1,206 @@ +# Volcano Batch Scheduling Strategy + +* Status: accepted +* Date: 2026-02-05 +* Deciders: Billy +* Technical Story: Optimize scheduling for batch ML and analytics workloads + +## Context and Problem Statement + +The homelab runs diverse workloads including: +- AI/ML training jobs (batch, GPU-intensive) +- Spark/Flink 
analytics jobs (batch, CPU/memory-intensive) +- KubeRay cluster with multiple GPU workers +- Long-running inference services + +The default Kubernetes scheduler (kube-scheduler) is optimized for microservices, not batch workloads. It lacks: +- Gang scheduling (all-or-nothing pod placement) +- Fair-share queuing across teams/projects +- Preemption policies for priority workloads +- Resource reservation for batch jobs + +How do we optimize scheduling for batch and ML workloads? + +## Decision Drivers + +* Gang scheduling for distributed ML training +* Fair-share resource allocation +* Priority-based preemption +* Integration with Kubeflow and Spark +* GPU-aware scheduling +* Queue management for multi-tenant scenarios + +## Considered Options + +1. **Volcano Scheduler** +2. **Apache YuniKorn** +3. **Kubernetes default scheduler with Priority Classes** +4. **Kueue (Kubernetes Batch Workload Queueing)** + +## Decision Outcome + +Chosen option: **Option 1 - Volcano Scheduler** + +Volcano is a CNCF project designed for batch, HPC, and ML workloads. It provides gang scheduling, queue management, and integrates natively with Spark, Flink, and ML frameworks. 
+ +### Positive Consequences + +* Gang scheduling prevents partial deployments +* Queue-based fair-share resource management +* Native Spark and Flink integration +* Preemption for high-priority jobs +* CNCF project with active community +* Coexists with default scheduler + +### Negative Consequences + +* Additional scheduler components (admission, controller, scheduler) +* Learning curve for queue configuration +* Workloads must opt-in via scheduler name + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Volcano System │ +│ (volcano-system namespace) │ +│ │ +│ ┌─────────────────┐ ┌───────────────────┐ ┌───────────────┐ │ +│ │ Admission │ │ Controllers │ │ Scheduler │ │ +│ │ Webhook │ │ (Job lifecycle) │ │ (Placement) │ │ +│ └─────────────────┘ └───────────────────┘ └───────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Queues │ +│ ┌───────────────────────────────────────────────────────────┐ │ +│ │ ml-training │ analytics │ inference │ default │ │ +│ │ weight: 40 │ weight: 30 │ weight: 20 │ weight: 10│ │ +│ └───────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Workloads │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────────────┐ │ +│ │ Spark Jobs │ │ Flink Jobs │ │ ML Training (KFP) │ │ +│ │ (analytics) │ │ (analytics) │ │ (ml-training) │ │ +│ └──────────────┘ └──────────────┘ └──────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Configuration + +### Queue Definition + +```yaml +apiVersion: scheduling.volcano.sh/v1beta1 +kind: Queue +metadata: + name: ml-training +spec: + weight: 40 + reclaimable: true + guarantee: + resource: + cpu: "4" + memory: "16Gi" + capability: + resource: + cpu: "32" + 
memory: "128Gi" + nvidia.com/gpu: "2" +``` + +### Spark Integration + +```yaml +apiVersion: sparkoperator.k8s.io/v1beta2 +kind: SparkApplication +metadata: + name: analytics-job +spec: + batchScheduler: volcano + batchSchedulerOptions: + queue: analytics + priorityClassName: normal + driver: + schedulerName: volcano + executor: + schedulerName: volcano + instances: 4 +``` + +### Gang Scheduling for ML Training + +```yaml +apiVersion: batch.volcano.sh/v1alpha1 +kind: Job +metadata: + name: distributed-training +spec: + schedulerName: volcano + minAvailable: 4 # Gang: all 4 pods or none + queue: ml-training + tasks: + - name: worker + replicas: 4 + template: + spec: + containers: + - name: trainer + resources: + limits: + nvidia.com/gpu: 1 +``` + +## Queue Structure + +| Queue | Weight | Use Case | Guarantee | Preemptable | +|-------|--------|----------|-----------|-------------| +| `ml-training` | 40 | Kubeflow jobs, RayJobs | 4 CPU, 16Gi | No | +| `analytics` | 30 | Spark/Flink batch jobs | 2 CPU, 8Gi | Yes | +| `inference` | 20 | Batch inference jobs | 2 CPU, 8Gi | No | +| `default` | 10 | Miscellaneous batch | None | Yes | + +## Scheduler Selection + +Workloads use Volcano by setting: + +```yaml +spec: + schedulerName: volcano +``` + +Long-running services (inference endpoints, databases) continue using the default scheduler for stability. 
+ +## Preemption Policy + +```yaml +apiVersion: scheduling.volcano.sh/v1beta1 +kind: PriorityClass +metadata: + name: high-priority +spec: + value: 1000 + preemptionPolicy: PreemptLowerPriority + description: "High priority ML training jobs" +``` + +## Monitoring + +| Metric | Description | +|--------|-------------| +| `volcano_queue_allocated_*` | Resources currently allocated per queue | +| `volcano_queue_pending_*` | Pending resource requests per queue | +| `volcano_job_status` | Job lifecycle states | +| `volcano_scheduler_throughput` | Scheduling decisions per second | + +## Links + +* [Volcano Documentation](https://volcano.sh/docs/) +* [Gang Scheduling](https://volcano.sh/docs/gang_scheduling/) +* [Spark on Volcano](https://volcano.sh/docs/spark/) +* Related: [ADR-0009](0009-dual-workflow-engines.md) - Dual Workflow Engines +* Related: [ADR-0033](0033-data-analytics-platform.md) - Data Analytics Platform diff --git a/decisions/0035-arm64-worker-strategy.md b/decisions/0035-arm64-worker-strategy.md new file mode 100644 index 0000000..a2b75d3 --- /dev/null +++ b/decisions/0035-arm64-worker-strategy.md @@ -0,0 +1,195 @@ +# ARM64 Raspberry Pi Worker Node Strategy + +* Status: accepted +* Date: 2026-02-05 +* Deciders: Billy +* Technical Story: Integrate Raspberry Pi nodes into the Kubernetes cluster + +## Context and Problem Statement + +The homelab cluster includes 5 Raspberry Pi 4/5 nodes (ARM64 architecture) alongside x86_64 servers. These low-power nodes provide: +- Additional compute capacity for lightweight workloads +- Geographic distribution within the home network +- Learning platform for multi-architecture Kubernetes + +However, ARM64 nodes have constraints: +- No GPU acceleration +- Lower CPU/memory than x86_64 servers +- Some container images lack ARM64 support +- Limited local storage + +How do we effectively integrate ARM64 nodes while avoiding scheduling failures? 
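The concrete failure this question targets: an amd64-only image scheduled onto a Pi crash-loops with `exec format error`. A common guard (sketch only; the workload name and image are hypothetical, the `kubernetes.io/arch` label is the standard well-known node label) pins such workloads to amd64 via node affinity:

```yaml
# Sketch: keep an amd64-only workload off the ARM64 nodes.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: example-amd64-only   # illustrative name
spec:
  replicas: 1
  selector:
    matchLabels:
      app: example-amd64-only
  template:
    metadata:
      labels:
        app: example-amd64-only
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: kubernetes.io/arch
                    operator: In
                    values: ["amd64"]
      containers:
        - name: app
          image: registry.example/app:latest  # illustrative image
```

Multi-arch images need no such pin: the container runtime pulls the matching manifest for the node's architecture automatically.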
+ +## Decision Drivers + +* Maximize utilization of ARM64 compute +* Prevent ARM-incompatible workloads from scheduling +* Maintain cluster stability +* Support multi-arch container images +* Minimize operational overhead + +## Considered Options + +1. **Node labels + affinity for workload placement** +2. **Separate ARM64-only namespace** +3. **Taints to exclude from general scheduling** +4. **ARM64 nodes for specific workload types only** + +## Decision Outcome + +Chosen option: **Option 1 + Option 4 hybrid** - Use node labels with affinity rules, and designate ARM64 nodes for specific workload categories. + +ARM64 nodes handle: +- Lightweight control plane components (where multi-arch images exist) +- Velero node-agent (backup DaemonSet) +- Node-level monitoring (Prometheus node-exporter) +- Future: Edge/IoT workloads + +### Positive Consequences + +* Clear workload segmentation +* No scheduling failures from arch mismatch +* Efficient use of low-power nodes +* Room for future ARM-specific workloads +* Cost-effective cluster expansion + +### Negative Consequences + +* Some nodes may be underutilized +* Must maintain multi-arch image awareness +* Additional scheduling complexity + +## Cluster Composition + +| Node | Architecture | Role | Instance Type | +|------|--------------|------|---------------| +| bruenor | amd64 | control-plane | - | +| catti | amd64 | control-plane | - | +| storm | amd64 | control-plane | - | +| khelben | amd64 | GPU worker (Strix Halo) | - | +| elminster | amd64 | GPU worker (NVIDIA) | - | +| drizzt | amd64 | GPU worker (RDNA2) | - | +| danilo | amd64 | GPU worker (Intel Arc) | - | +| regis | amd64 | worker | - | +| wulfgar | amd64 | worker | - | +| **durnan** | **arm64** | worker | raspberry-pi | +| **elaith** | **arm64** | worker | raspberry-pi | +| **jarlaxle** | **arm64** | worker | raspberry-pi | +| **mirt** | **arm64** | worker | raspberry-pi | +| **volo** | **arm64** | worker | raspberry-pi | + +## Node Labels + +```yaml +# Applied 
via Talos machine config or kubectl +labels: + kubernetes.io/arch: arm64 + kubernetes.io/os: linux + node.kubernetes.io/instance-type: raspberry-pi + kubernetes.io/storage: none # No Longhorn on Pis +``` + +## Workload Placement + +### DaemonSets (Run Everywhere) + +These run on all nodes including ARM64: + +| DaemonSet | Namespace | Multi-arch | +|-----------|-----------|------------| +| velero-node-agent | velero | ✅ | +| cilium-agent | kube-system | ✅ | +| node-exporter | observability | ✅ | + +### ARM64-Excluded Workloads + +These explicitly exclude ARM64 via node affinity: + +```yaml +spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/arch + operator: In + values: + - amd64 +``` + +| Workload Type | Reason for Exclusion | +|---------------|----------------------| +| GPU workloads | No GPU on Pis | +| Longhorn | Pis have no storage label | +| Heavy databases | Insufficient resources | +| Most HelmReleases | Image compatibility | + +### ARM64-Compatible Light Workloads + +Potential future workloads for ARM64 nodes: + +| Workload | Use Case | +|----------|----------| +| MQTT broker | IoT message routing | +| Pi-hole | DNS ad blocking | +| Home Assistant | Home automation | +| Lightweight proxies | Traffic routing | + +## Storage Exclusion + +ARM64 nodes are excluded from Longhorn: + +```yaml +# Longhorn Helm values +defaultSettings: + systemManagedComponentsNodeSelector: "kubernetes.io/arch:amd64" +``` + +Node label: +```yaml +kubernetes.io/storage: none +``` + +## Resource Constraints + +| Node Type | CPU | Memory | Typical Available | +|-----------|-----|--------|-------------------| +| Raspberry Pi 4 | 4 cores | 4-8GB | 3 cores, 3GB | +| Raspberry Pi 5 | 4 cores | 8GB | 3.5 cores, 6GB | + +## Multi-Architecture Image Strategy + +For workloads that should run on ARM64: + +1. **Use multi-arch base images** (e.g., `alpine`, `debian`) +2. 
**Build with Docker buildx**: + ```bash + docker buildx build --platform linux/amd64,linux/arm64 -t myimage:latest . + ``` +3. **Verify arch support** before deployment + +## Monitoring ARM64 Nodes + +```promql +# Node resource usage by architecture +sum by (node, arch) ( + node_memory_MemAvailable_bytes{} + * on(node) group_left(arch) + kube_node_labels{label_kubernetes_io_arch!=""} +) +``` + +## Future Considerations + +- **Edge workloads**: ARM64 nodes ideal for edge compute patterns +- **IoT integration**: MQTT, sensor data collection +- **Scale-out**: Add more Pis for lightweight workload capacity +- **ARM64 ML inference**: Some models support ARM (TensorFlow Lite) + +## Links + +* [Kubernetes Multi-Architecture](https://kubernetes.io/docs/concepts/containers/images/#multi-architecture-images) +* [Talos on Raspberry Pi](https://talos.dev/v1.12/talos-guides/install/single-board-computers/rpi_generic/) +* Related: [ADR-0002](0002-use-talos-linux.md) - Use Talos Linux +* Related: [ADR-0026](0026-storage-strategy.md) - Storage Strategy diff --git a/decisions/0036-renovate-dependency-updates.md b/decisions/0036-renovate-dependency-updates.md new file mode 100644 index 0000000..730983b --- /dev/null +++ b/decisions/0036-renovate-dependency-updates.md @@ -0,0 +1,256 @@ +# Automated Dependency Updates with Renovate + +* Status: accepted +* Date: 2026-02-05 +* Deciders: Billy +* Technical Story: Automate dependency updates across all homelab repositories + +## Context and Problem Statement + +The homelab consists of 20+ repositories containing: +- Kubernetes manifests with container image references +- Helm chart versions +- Python/Go dependencies +- GitHub Actions / Gitea Actions workflow versions + +Manually tracking and updating dependencies is: +- Time-consuming +- Error-prone +- Often neglected until security issues arise + +How do we automate dependency updates while maintaining control over what gets updated? 
+ +## Decision Drivers + +* Automated detection of outdated dependencies +* PR-based update workflow for review +* Support for Kubernetes manifests, Helm, Python, Go, Docker +* Self-hosted on existing infrastructure +* Configurable grouping and scheduling +* Security update prioritization + +## Considered Options + +1. **Renovate (self-hosted)** +2. **Dependabot (GitHub-native)** +3. **Manual updates with version scripts** +4. **Flux image automation** + +## Decision Outcome + +Chosen option: **Option 1 - Renovate (self-hosted)** + +Renovate runs as a CronJob in the cluster, scanning all repositories in the Gitea organization and creating PRs for outdated dependencies. It supports more package managers than Dependabot and works with Gitea. + +### Positive Consequences + +* Comprehensive manager support (40+ package managers) +* Works with self-hosted Gitea +* Configurable grouping (batch minor updates) +* Auto-merge for patch/minor updates +* Dashboard for update overview +* Reusable preset configurations + +### Negative Consequences + +* Additional CronJob to maintain +* Configuration complexity +* API token management for Gitea access + +## Architecture + +``` +┌───────────────────────────────────────────────────────────────────┐ +│ Renovate CronJob │ +│ (ci-cd namespace) │ +│ │ +│ Schedule: Every 8 hours (0 */8 * * *) │ +│ │ +│ ┌────────────────────────────────────────────────────────────┐ │ +│ │ Renovate Container │ │ +│ │ │ │ +│ │ 1. Fetch repositories from Gitea org │ │ +│ │ 2. Scan each repo for dependencies │ │ +│ │ 3. Compare versions with upstream registries │ │ +│ │ 4. Create/update PRs for outdated deps │ │ +│ │ 5. 
Auto-merge approved patches │ │ +│ └────────────────────────────────────────────────────────────┘ │ +└───────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌───────────────────────────────────────────────────────────────────┐ +│ Gitea │ +│ │ +│ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │ +│ │ homelab-k8s2 │ │ chat-handler │ │ kuberay-images│ │ +│ │ │ │ │ │ │ │ +│ │ PR: Update │ │ PR: Update │ │ PR: Update │ │ +│ │ flux to 2.5.0 │ │ httpx to 0.28 │ │ ROCm to 6.4 │ │ +│ └───────────────┘ └───────────────┘ └───────────────┘ │ +└───────────────────────────────────────────────────────────────────┘ +``` + +## Configuration + +### CronJob + +```yaml +apiVersion: batch/v1 +kind: CronJob +metadata: + name: renovate + namespace: ci-cd +spec: + schedule: "0 */8 * * *" # Every 8 hours + jobTemplate: + spec: + template: + spec: + containers: + - name: renovate + image: renovate/renovate:39 + env: + - name: RENOVATE_PLATFORM + value: "gitea" + - name: RENOVATE_ENDPOINT + value: "https://git.daviestechlabs.io/api/v1" + - name: RENOVATE_TOKEN + valueFrom: + secretKeyRef: + name: renovate-github-token + key: token + - name: RENOVATE_AUTODISCOVER + value: "true" + - name: RENOVATE_AUTODISCOVER_FILTER + value: "daviestechlabs/*" + restartPolicy: OnFailure +``` + +### Repository Config (renovate.json) + +```json +{ + "$schema": "https://docs.renovatebot.com/renovate-schema.json", + "extends": [ + "config:recommended", + "group:allNonMajor", + ":automergeMinor", + ":automergePatch" + ], + "kubernetes": { + "fileMatch": ["\\.ya?ml$"] + }, + "packageRules": [ + { + "matchManagers": ["helm-values", "helmv3"], + "groupName": "helm charts" + }, + { + "matchPackagePatterns": ["^ghcr.io/"], + "groupName": "GHCR images" + }, + { + "matchUpdateTypes": ["major"], + "automerge": false, + "labels": ["major-update"] + } + ], + "schedule": ["before 6am on monday"] +} +``` + +## Supported Package Managers + +| Manager | File Patterns | Examples | 
+|---------|---------------|----------| +| kubernetes | `*.yaml`, `*.yml` | Container images in Deployments | +| helm | `Chart.yaml`, `values.yaml` | Helm chart dependencies | +| helmv3 | HelmRelease CRDs | Flux HelmReleases | +| flux | Flux CRDs | GitRepository, OCIRepository | +| pip | `requirements.txt`, `pyproject.toml` | Python packages | +| gomod | `go.mod` | Go modules | +| dockerfile | `Dockerfile*` | Base images | +| github-actions | `.github/workflows/*.yml` | Action versions | +| gitea-actions | `.gitea/workflows/*.yml` | Action versions | + +## Update Strategy + +### Auto-merge Enabled + +| Update Type | Auto-merge | Delay | +|-------------|------------|-------| +| Patch (x.x.1 → x.x.2) | ✅ Yes | Immediate | +| Minor (x.1.x → x.2.x) | ✅ Yes | 3 days stabilization | +| Major (1.x.x → 2.x.x) | ❌ No | Manual review | + +### Grouping Strategy + +| Group | Contents | Frequency | +|-------|----------|-----------| +| `all-non-major` | All patch + minor updates | Weekly (Monday) | +| `helm-charts` | All Helm chart updates | Weekly | +| `container-images` | Docker image updates | Weekly | +| `security` | CVE fixes | Immediate | + +## Security Updates + +Renovate prioritizes security updates: + +```json +{ + "vulnerabilityAlerts": { + "enabled": true, + "labels": ["security"] + }, + "packageRules": [ + { + "matchCategories": ["security"], + "automerge": true, + "schedule": ["at any time"], + "prPriority": 10 + } + ] +} +``` + +## Dashboard + +Renovate creates a "Dependency Dashboard" issue in each repository: + +```markdown +## Dependency Dashboard + +### Open PRs +- [ ] Update httpx to 0.28.1 (#42) +- [x] Update pillow to 11.0.0 (#41) - merged + +### Pending Approval +- [ ] Major: Update pydantic to v2 (#40) + +### Rate Limited +- fastapi (waiting for next schedule window) +``` + +## Secrets + +| Secret | Source | Purpose | +|--------|--------|---------| +| `renovate-github-token` | Vault | Gitea API access | +| `renovate-dockerhub` | Vault | Docker Hub rate 
limits | + +## Monitoring + +```promql +# Renovate job success rate +sum(kube_job_status_succeeded{job_name=~"renovate-.*"}) +/ +sum(kube_job_status_succeeded{job_name=~"renovate-.*"} + kube_job_status_failed{job_name=~"renovate-.*"}) +``` + +## Links + +* [Renovate Documentation](https://docs.renovatebot.com/) +* [Renovate Presets](https://docs.renovatebot.com/presets-default/) +* [Gitea Platform Support](https://docs.renovatebot.com/modules/platform/gitea/) +* Related: [ADR-0013](0013-gitea-actions-for-ci.md) - Gitea Actions for CI +* Related: [ADR-0031](0031-gitea-cicd-strategy.md) - Gitea CI/CD Strategy diff --git a/decisions/0037-node-naming-conventions.md b/decisions/0037-node-naming-conventions.md new file mode 100644 index 0000000..c6db68f --- /dev/null +++ b/decisions/0037-node-naming-conventions.md @@ -0,0 +1,187 @@ +# Node Naming Conventions + +* Status: accepted +* Date: 2026-02-05 +* Deciders: Billy +* Technical Story: Establish memorable, role-based naming for cluster nodes + +## Context and Problem Statement + +The homelab cluster has grown to include: +- 14 Kubernetes nodes (control plane + workers) +- Multiple storage servers +- Development workstations + +Generic names like `node-01`, `worker-gpu-1` are: +- Hard to remember +- Don't convey node purpose +- Boring + +How do we name nodes in a way that's memorable, fun, and indicates their role? + +## Decision Drivers + +* Names should indicate node role/capability +* Easy to remember and reference in conversation +* Consistent theme across the homelab +* Scalable as more nodes are added + +## Decision Outcome + +Chosen option: **Dungeons & Dragons character naming scheme** + +All nodes are named after famous D&D characters from Forgotten Realms, with character class mapping to node role. + +## Naming Scheme + +### Control Plane → Companions of the Hall + +The control plane nodes are named after the legendary Companions of the Hall, Drizzt's closest allies. 
+ +| Node | Character | Hardware | Notes | +|------|-----------|----------|-------| +| `bruenor` | Bruenor Battlehammer | Intel N100 | Dwarf King of Mithral Hall | +| `catti` | Catti-brie | Intel N100 | Human ranger, Bruenor's adopted daughter | +| `storm` | Storm Silverhand | Intel N100 | Chosen of Mystra, Harper leader | + +### Wizards → GPU Nodes (Spellcasters) + +Wizards cast powerful spells, just as GPU nodes power AI/ML workloads. + +| Node | Character | GPU | Notes | +|------|-----------|-----|-------| +| `khelben` | Khelben "Blackstaff" Arunsun | AMD Radeon 8060S 64GB | Primary AI inference, Strix Halo APU | +| `elminster` | Elminster Aumar | NVIDIA RTX 2070 8GB | CUDA workloads, Sage of Shadowdale | +| `drizzt` | Drizzt Do'Urden* | AMD Radeon 680M | ROCm backup node | +| `danilo` | Danilo Thann | Intel Arc A770 | Intel inference, bard/wizard multiclass | +| `regis` | Regis | NVIDIA GPU | Halfling with magical ruby, spellthief vibes | + +*Drizzt is technically a ranger, but his magical scimitars and time in Menzoberranzan qualify him for the GPU tier. + +### Rogues → ARM64 Edge Nodes + +Rogues are nimble and work in the shadows—perfect for lightweight edge compute on Raspberry Pi nodes. + +| Node | Character | Hardware | Notes | +|------|-----------|----------|-------| +| `durnan` | Durnan | Raspberry Pi 4 8GB | Yawning Portal innkeeper, retired adventurer | +| `elaith` | Elaith Craulnober | Raspberry Pi 4 8GB | The Serpent, moon elf rogue | +| `jarlaxle` | Jarlaxle Baenre | Raspberry Pi 4 8GB | Drow mercenary leader | +| `mirt` | Mirt the Moneylender | Raspberry Pi 4 8GB | Harper agent, "Old Wolf" | +| `volo` | Volothamp Geddarm | Raspberry Pi 4 8GB | Famous author and traveler | + +### Fighters → x86 CPU Workers + +Fighters are the workhorses, handling general compute without magical (GPU) abilities. 
+ +| Node | Character | Hardware | Notes | +|------|-----------|----------|-------| +| `wulfgar` | Wulfgar | Intel x86_64 | Barbarian of Icewind Dale, Aegis-fang wielder | + +### Infrastructure Nodes (Locations) + +| Node | Character/Location | Role | Notes | +|------|-------------------|------|-------| +| `candlekeep` | Candlekeep | Primary NAS (Synology) | Library fortress, knowledge storage | +| `neverwinter` | Neverwinter | Fast NAS (TrueNAS Scale) | Jewel of the North, all-SSD, nfs-fast | +| `waterdeep` | Waterdeep | Mac Mini dev workstation | City of Splendors, primary city | + +### Future Expansion + +| Class | Role | Candidate Names | +|-------|------|-----------------| +| Clerics | Database/backup nodes | Cadderly, Dawnbringer | +| Fighters | High-CPU compute | Artemis Entreri, Obould | +| Druids | Monitoring/observability | Jaheira, Cernd | +| Bards | API gateways | Other Thann family members | +| Paladins | Security nodes | Ajantis, Keldorn | + +## Architecture + +``` +┌───────────────────────────────────────────────────────────────────────────────┐ +│ Homelab Cluster (14 Kubernetes Nodes) │ +│ │ +│ ┌──────────────────────────────────────────────────────────────────────┐ │ +│ │ 👑 Control Plane (Companions of the Hall) │ │ +│ │ │ │ +│ │ bruenor catti storm │ │ +│ │ Intel N100 Intel N100 Intel N100 │ │ +│ │ "Dwarf King" "Catti-brie" "Silverhand" │ │ +│ └──────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌──────────────────────────────────────────────────────────────────────┐ │ +│ │ 🧙 Wizards (GPU Spellcasters) │ │ +│ │ │ │ +│ │ khelben elminster drizzt danilo regis │ │ +│ │ Radeon 8060S RTX 2070 Radeon 680M Arc A770 NVIDIA │ │ +│ │ 64GB unified 8GB VRAM iGPU 16GB GPU │ │ +│ │ "Blackstaff" "Sage" "Ranger" "Bard" "Ruby" │ │ +│ └──────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌──────────────────────────────────────────────────────────────────────┐ │ +│ │ 🗡️ Rogues (ARM64 Edge Nodes) │ │ +│ │ │ │ 
+│ │ durnan elaith jarlaxle mirt volo │ │ +│ │ Pi 4 8GB Pi 4 8GB Pi 4 8GB Pi 4 8GB Pi 4 8GB │ │ +│ │ "Innkeeper" "Serpent" "Mercenary" "Old Wolf" "Author" │ │ +│ └──────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌──────────────────────────────────────────────────────────────────────┐ │ +│ │ ⚔️ Fighters (x86 CPU Workers) │ │ +│ │ │ │ +│ │ wulfgar │ │ +│ │ Intel x86_64 │ │ +│ │ "Barbarian of Icewind Dale" │ │ +│ └──────────────────────────────────────────────────────────────────────┘ │ +└───────────────────────────────────────────────────────────────────────────────┘ + +┌───────────────────────────────────────────────────────────────────────────────┐ +│ 🏰 Locations (Off-Cluster Infrastructure) │ +│ │ +│ 📚 candlekeep ❄️ neverwinter 🏙️ waterdeep │ +│ Synology NAS TrueNAS Scale (SSD) Mac Mini │ +│ nfs-default nfs-fast Dev workstation │ +│ High capacity High speed Primary dev box │ +│ "Library Fortress" "Jewel of the North" "City of Splendors" │ +└───────────────────────────────────────────────────────────────────────────────┘ +``` + +## Storage Mapping + +| Location | Storage Class | Speed | Capacity | Use Case | +|----------|--------------|-------|----------|----------| +| Candlekeep | `nfs-default` | HDD | High | Backups, archives, media | +| Neverwinter | `nfs-fast` | SSD | Medium | Database WAL, hot data | +| Longhorn | `longhorn` | Local SSD | Distributed | Replicated app data | + +## Node Labels + +```yaml +# GPU Wizard nodes +node.kubernetes.io/instance-type: gpu-wizard +homelab.daviestechlabs.io/character-class: wizard +homelab.daviestechlabs.io/character-name: khelben + +# ARM64 Rogue nodes +node.kubernetes.io/instance-type: raspberry-pi +homelab.daviestechlabs.io/character-class: rogue +homelab.daviestechlabs.io/character-name: jarlaxle +``` + +## DNS/Hostname Resolution + +All nodes are resolvable via: +- Kubernetes DNS: `.node.kubernetes.io` +- Local DNS: `.lab.daviestechlabs.io` +- mDNS: `.local` + +## References + +* 
[Forgotten Realms Wiki](https://forgottenrealms.fandom.com/) +* [Khelben Arunsun](https://forgottenrealms.fandom.com/wiki/Khelben_Arunsun) +* [Elminster](https://forgottenrealms.fandom.com/wiki/Elminster_Aumar) +* [Candlekeep](https://forgottenrealms.fandom.com/wiki/Candlekeep) +* [Neverwinter](https://forgottenrealms.fandom.com/wiki/Neverwinter) +* Related: [ADR-0035](0035-arm64-worker-strategy.md) - ARM64 Worker Strategy +* Related: [ADR-0011](0011-kuberay-unified-serving.md) - KubeRay Unified Serving diff --git a/diagrams/README.md b/diagrams/README.md index e5e219f..1db8434 100644 --- a/diagrams/README.md +++ b/diagrams/README.md @@ -4,11 +4,32 @@ This directory contains additional architecture diagrams beyond the main C4 diag ## Available Diagrams -| File | Description | -|------|-------------| -| [gpu-allocation.mmd](gpu-allocation.mmd) | GPU workload distribution | -| [data-flow-chat.mmd](data-flow-chat.mmd) | Chat request data flow | -| [data-flow-voice.mmd](data-flow-voice.mmd) | Voice request data flow | +| File | Description | Related ADR | +|------|-------------|-------------| +| [gpu-allocation.mmd](gpu-allocation.mmd) | GPU workload distribution | ADR-0005 | +| [data-flow-chat.mmd](data-flow-chat.mmd) | Chat request data flow | ADR-0003 | +| [data-flow-voice.mmd](data-flow-voice.mmd) | Voice request data flow | ADR-0003 | +| [gitops-flux.mmd](gitops-flux.mmd) | GitOps reconciliation loop | ADR-0006 | +| [dual-workflow-engines.mmd](dual-workflow-engines.mmd) | Argo vs Kubeflow decision flow | ADR-0009 | +| [kuberay-unified-backend.mmd](kuberay-unified-backend.mmd) | RayService endpoints and GPU allocation | ADR-0011 | +| [secrets-management.mmd](secrets-management.mmd) | SOPS bootstrap vs Vault runtime | ADR-0017 | +| [security-policy-enforcement.mmd](security-policy-enforcement.mmd) | Gatekeeper admission + Trivy scanning | ADR-0018 | +| [handler-deployment.mmd](handler-deployment.mmd) | Ray cluster platform layers | ADR-0019 | +| 
[internal-registry.mmd](internal-registry.mmd) | Internal vs external registry paths | ADR-0020 | +| [notification-architecture.mmd](notification-architecture.mmd) | ntfy hub with sources and consumers | ADR-0021 | +| [ntfy-discord-bridge.mmd](ntfy-discord-bridge.mmd) | ntfy to Discord message flow | ADR-0022 | +| [ray-repository-structure.mmd](ray-repository-structure.mmd) | Ray package build and loading | ADR-0024 | +| [observability-stack.mmd](observability-stack.mmd) | Prometheus + ClickStack telemetry flow | ADR-0025 | +| [storage-strategy.mmd](storage-strategy.mmd) | Longhorn + NFS dual-tier storage | ADR-0026 | +| [database-strategy.mmd](database-strategy.mmd) | CloudNativePG cluster management | ADR-0027 | +| [authentik-sso.mmd](authentik-sso.mmd) | Authentik authentication flow | ADR-0028 | +| [user-registration-workflow.mmd](user-registration-workflow.mmd) | User registration and approval | ADR-0029 | +| [velero-backup.mmd](velero-backup.mmd) | Velero backup and restore flow | ADR-0032 | +| [analytics-lakehouse.mmd](analytics-lakehouse.mmd) | Data analytics lakehouse architecture | ADR-0033 | +| [volcano-scheduling.mmd](volcano-scheduling.mmd) | Volcano batch scheduler and queues | ADR-0034 | +| [cluster-topology.mmd](cluster-topology.mmd) | Node topology (x86/ARM64/GPU) | ADR-0035 | +| [renovate-workflow.mmd](renovate-workflow.mmd) | Renovate dependency update cycle | ADR-0036 | +| [node-naming.mmd](node-naming.mmd) | D&D-themed node naming conventions | ADR-0037 | ## Rendering Diagrams diff --git a/diagrams/analytics-lakehouse.mmd b/diagrams/analytics-lakehouse.mmd new file mode 100644 index 0000000..69ca934 --- /dev/null +++ b/diagrams/analytics-lakehouse.mmd @@ -0,0 +1,85 @@ +%% Data Analytics Lakehouse Architecture +%% Related: ADR-0033 + +flowchart TB + subgraph Ingestion["Data Ingestion"] + Kafka["Kafka
Event Streams"] + APIs["REST APIs
Batch Loads"] + Files["File Drops
S3/NFS"] + end + + subgraph Processing["Processing Layer"] + subgraph Batch["Batch Processing"] + Spark["Apache Spark
spark-operator"] + end + subgraph Stream["Stream Processing"] + Flink["Apache Flink
flink-operator"] + end + subgraph Realtime["Real-time"] + RisingWave["RisingWave
Streaming SQL"] + end + end + + subgraph Catalog["Lakehouse Catalog"] + Nessie["Nessie
Git-like Versioning"] + Iceberg["Apache Iceberg
Table Format"] + end + + subgraph Storage["Storage Layer"] + S3["S3 (MinIO)
Object Storage"] + Parquet["Parquet Files
Columnar Format"] + end + + subgraph Query["Query Layer"] + Trino["Trino
Distributed SQL"] + end + + subgraph Serve["Serving Layer"] + Grafana["Grafana
Dashboards"] + Jupyter["JupyterHub
Notebooks"] + Apps["Applications
REST APIs"] + end + + subgraph Metadata["Metadata Store"] + PostgreSQL["CloudNativePG
analytics-db"] + end + + Kafka --> Flink + Kafka --> RisingWave + APIs --> Spark + Files --> Spark + + Spark --> Nessie + Flink --> Nessie + RisingWave --> Nessie + + Nessie --> Iceberg + Iceberg --> S3 + S3 --> Parquet + + Nessie --> PostgreSQL + + Trino --> Nessie + Trino --> Iceberg + + Trino --> Grafana + Trino --> Jupyter + Trino --> Apps + + classDef ingest fill:#4a5568,stroke:#718096,color:#fff + classDef batch fill:#3182ce,stroke:#2b6cb0,color:#fff + classDef stream fill:#38a169,stroke:#2f855a,color:#fff + classDef catalog fill:#d69e2e,stroke:#b7791f,color:#fff + classDef storage fill:#718096,stroke:#4a5568,color:#fff + classDef query fill:#805ad5,stroke:#6b46c1,color:#fff + classDef serve fill:#e53e3e,stroke:#c53030,color:#fff + classDef meta fill:#319795,stroke:#2c7a7b,color:#fff + + class Kafka,APIs,Files ingest + class Spark batch + class Flink,RisingWave stream + class Nessie,Iceberg catalog + class S3,Parquet storage + class Trino query + class Grafana,Jupyter,Apps serve + class PostgreSQL meta diff --git a/diagrams/authentik-sso.mmd b/diagrams/authentik-sso.mmd new file mode 100644 index 0000000..33a44d2 --- /dev/null +++ b/diagrams/authentik-sso.mmd @@ -0,0 +1,84 @@ +```plaintext +%% Authentik SSO Strategy (ADR-0028) +%% Flowchart showing authentication flow stages + +flowchart TB + subgraph user["👤 User"] + browser["Browser"] + end + + subgraph ingress["🌐 Ingress"] + traefik["Envoy Gateway"] + end + + subgraph apps["📦 Applications"] + direction LR + oidc_app["OIDC Apps
Gitea, Grafana,
ArgoCD, Affine"] + proxy_app["Proxy Apps
MLflow, Kubeflow"] + end + + subgraph authentik["🔐 Authentik"] + direction TB + + subgraph components["Components"] + server["Server
(API)"] + worker["Worker
(Tasks)"] + outpost["Outpost
(Proxy Auth)"] + end + + subgraph flow["Authentication Flow"] + direction LR + f1["1️⃣ Login
Stage"] + f2["2️⃣ Username
Identification"] + f3["3️⃣ Password
Validation"] + f4["4️⃣ MFA
Challenge"] + f5["5️⃣ Session
Created"] + end + + subgraph providers["Providers"] + oidc_prov["OIDC Provider"] + proxy_prov["Proxy Provider"] + end + end + + subgraph storage["💾 Storage"] + redis["Redis
(Cache)"] + postgres["PostgreSQL
(CNPG)"] + end + + %% User flow + browser --> traefik + traefik --> apps + + %% OIDC flow + oidc_app -->|"Redirect to auth"| server + server --> flow + f1 --> f2 --> f3 --> f4 --> f5 + flow --> oidc_prov + oidc_prov -->|"JWT token"| oidc_app + + %% Proxy flow + proxy_app -->|"Forward auth"| outpost + outpost --> server + server --> flow + proxy_prov --> outpost + + %% Storage + server --> redis + server --> postgres + + classDef user fill:#3498db,color:white + classDef ingress fill:#f39c12,color:black + classDef app fill:#27ae60,color:white + classDef authentik fill:#9b59b6,color:white + classDef storage fill:#e74c3c,color:white + classDef flow fill:#1abc9c,color:white + + class browser user + class traefik ingress + class oidc_app,proxy_app app + class server,worker,outpost,oidc_prov,proxy_prov authentik + class redis,postgres storage + class f1,f2,f3,f4,f5 flow + +``` diff --git a/diagrams/cluster-topology.mmd b/diagrams/cluster-topology.mmd new file mode 100644 index 0000000..52f8fc2 --- /dev/null +++ b/diagrams/cluster-topology.mmd @@ -0,0 +1,66 @@ +%% Cluster Node Topology +%% Related: ADR-0035, ADR-0011, ADR-0037 + +flowchart TB + subgraph Cluster["Homelab Kubernetes Cluster (14 nodes)"] + subgraph ControlPlane["👑 Control Plane (Companions of the Hall)"] + Bruenor["bruenor
Intel N100"] + Catti["catti
Intel N100"] + Storm["storm
Intel N100"] + end + + subgraph GPUNodes["🧙 Wizards (GPU Workers)"] + Khelben["khelben
Radeon 8060S 64GB
🎮 Primary AI"] + Elminster["elminster
RTX 2070 8GB
🎮 CUDA"] + Drizzt["drizzt
Radeon 680M
🎮 ROCm"] + Danilo["danilo
Intel Arc A770
🎮 Intel"] + Regis["regis
NVIDIA GPU
🎮 CUDA"] + end + + subgraph CPUNodes["⚔️ Fighters (CPU Workers)"] + Wulfgar["wulfgar
Intel x86_64"] + end + + subgraph ARMWorkers["🗡️ Rogues (ARM64 Raspberry Pi)"] + Durnan["durnan
Pi 4 8GB"] + Elaith["elaith
Pi 4 8GB"] + Jarlaxle["jarlaxle
Pi 4 8GB"] + Mirt["mirt
Pi 4 8GB"] + Volo["volo
Pi 4 8GB"] + end + end + + subgraph Workloads["Workload Placement"] + AIInference["AI Inference
→ Khelben"] + MLTraining["ML Training
→ GPU Nodes"] + EdgeServices["Lightweight Services
→ ARM64"] + General["General Workloads
→ CPU + ARM64"] + end + + subgraph Storage["Storage Affinity"] + Longhorn["Longhorn
x86_64 only"] + NFS["NFS
All nodes"] + end + + AIInference -.-> Khelben + MLTraining -.-> GPUNodes + EdgeServices -.-> ARMWorkers + General -.-> CPUNodes + General -.-> ARMWorkers + + Longhorn -.->|Excluded| ARMWorkers + NFS --> Cluster + + classDef control fill:#2563eb,stroke:#1d4ed8,color:#fff + classDef gpu fill:#7c3aed,stroke:#5b21b6,color:#fff + classDef cpu fill:#dc2626,stroke:#b91c1c,color:#fff + classDef arm fill:#059669,stroke:#047857,color:#fff + classDef workload fill:#9f7aea,stroke:#805ad5,color:#fff + classDef storage fill:#ed8936,stroke:#dd6b20,color:#fff + + class Bruenor,Catti,Storm control + class Khelben,Elminster,Drizzt,Danilo,Regis gpu + class Wulfgar cpu + class Durnan,Elaith,Jarlaxle,Mirt,Volo arm + class AIInference,MLTraining,EdgeServices,General workload + class Longhorn,NFS storage diff --git a/diagrams/database-strategy.mmd b/diagrams/database-strategy.mmd new file mode 100644 index 0000000..15a7df8 --- /dev/null +++ b/diagrams/database-strategy.mmd @@ -0,0 +1,96 @@ +```plaintext +%% Database Strategy with CloudNativePG (ADR-0027) +%% C4 Component diagram showing CNPG operator and clusters + +flowchart TB + subgraph operator["🎛️ CNPG Operator"] + cnpg["CloudNativePG
Controller
(cnpg-system)"] + end + + subgraph clusters["📊 PostgreSQL Clusters"] + direction LR + + subgraph gitea_pg["gitea-pg"] + direction TB + g_primary["🔵 Primary"] + g_replica1["⚪ Replica"] + g_replica2["⚪ Replica"] + g_bouncer["🔗 PgBouncer"] + end + + subgraph authentik_db["authentik-db"] + direction TB + a_primary["🔵 Primary"] + a_replica1["⚪ Replica"] + a_replica2["⚪ Replica"] + a_bouncer["🔗 PgBouncer"] + end + + subgraph companions_db["companions-db"] + direction TB + c_primary["🔵 Primary"] + c_replica1["⚪ Replica"] + c_replica2["⚪ Replica"] + c_bouncer["🔗 PgBouncer"] + end + + subgraph mlflow_db["mlflow-db"] + direction TB + m_primary["🔵 Primary"] + end + end + + subgraph storage["💾 Storage"] + longhorn["Longhorn PVCs<br/>(NVMe/SSD)"] + s3["S3 Backups<br/>(barman)"] + end + + subgraph services["🔌 Service Discovery"] + direction TB + rw["-rw (read-write)"] + ro["-ro (read-only)"] + pooler["-pooler-rw<br/>(PgBouncer)"] + end + + subgraph apps["📦 Applications"] + gitea["Gitea"] + authentik["Authentik"] + companions["Companions"] + mlflow["MLflow"] + end + + %% Operator manages clusters + cnpg -->|"Manages"| clusters + + %% Storage connections + clusters --> longhorn + clusters -->|"WAL archiving"| s3 + + %% Service routing + g_bouncer --> rw + a_bouncer --> rw + c_bouncer --> rw + g_replica1 --> ro + g_replica2 --> ro + + %% App connections + gitea -->|"gitea-pg-pooler-rw"| g_bouncer + authentik -->|"authentik-db-pooler-rw"| a_bouncer + companions -->|"companions-db-pooler-rw"| c_bouncer + mlflow -->|"mlflow-db-rw"| m_primary + + classDef operator fill:#e74c3c,color:white + classDef primary fill:#3498db,color:white + classDef replica fill:#95a5a6,color:white + classDef bouncer fill:#9b59b6,color:white + classDef storage fill:#27ae60,color:white + classDef app fill:#f39c12,color:black + + class cnpg operator + class g_primary,a_primary,c_primary,m_primary primary + class g_replica1,g_replica2,a_replica1,a_replica2,c_replica1,c_replica2 replica + class g_bouncer,a_bouncer,c_bouncer bouncer + class longhorn,s3 storage + class gitea,authentik,companions,mlflow app + +``` diff --git a/diagrams/dual-workflow-engines.mmd b/diagrams/dual-workflow-engines.mmd new file mode 100644 index 0000000..8ec9047 --- /dev/null +++ b/diagrams/dual-workflow-engines.mmd @@ -0,0 +1,73 @@ +```mermaid +%% Dual Workflow Engine Strategy (ADR-0009) +%% Flowchart showing Argo vs Kubeflow decision and integration + +flowchart TB + subgraph trigger["🎯 Workflow Triggers"] + nats["NATS Event"] + api["API Call"] + schedule["Cron Schedule"] + end + + subgraph decision["❓ Which Engine?"] + question{{"Workflow Type?"}} + end + + subgraph kubeflow["🔬 Kubeflow Pipelines"] + direction TB + kfp_train["ML Training<br/>✅ Component caching"] + kfp_eval["Model Evaluation<br/>✅ Metric tracking"] + kfp_exp["Experiment Comparison<br/>✅ MLflow integration"] + end + + subgraph argo["⚡ Argo Workflows"] + direction TB + argo_dag["Complex DAG<br/>✅ Advanced control flow"] + argo_batch["Batch Processing<br/>✅ Parallelization"] + argo_ingest["Document Ingestion<br/>✅ Simple steps"] + end + + subgraph hybrid["🔗 Hybrid Pattern"] + direction TB + argo_orch["Argo Orchestrates"] + kfp_step["KFP via API"] + argo_orch --> kfp_step + end + + subgraph integration["📡 Integration Layer"] + direction TB + events["Argo Events<br/>EventSource + Sensor"] + end + + %% Flow from triggers + nats --> events + api --> decision + schedule --> events + events --> decision + + %% Decision branches + question -->|"ML training<br/>with caching"| kubeflow + question -->|"Complex DAG<br/>batch jobs"| argo + question -->|"ML + complex<br/>orchestration"| hybrid + + %% Kubeflow use cases + kfp_train --> kfp_eval + kfp_eval --> kfp_exp + + %% Argo use cases + argo_dag --> argo_batch + argo_batch --> argo_ingest + + classDef trigger fill:#f39c12,color:black + classDef kubeflow fill:#4a90d9,color:white + classDef argo fill:#ef6c00,color:white + classDef hybrid fill:#8e44ad,color:white + classDef integration fill:#27ae60,color:white + + class nats,api,schedule trigger + class kfp_train,kfp_eval,kfp_exp kubeflow + class argo_dag,argo_batch,argo_ingest argo + class argo_orch,kfp_step hybrid + class events integration + +``` diff --git a/diagrams/gitops-flux.mmd b/diagrams/gitops-flux.mmd new file mode 100644 index 0000000..404b79a --- /dev/null +++ b/diagrams/gitops-flux.mmd @@ -0,0 +1,57 @@ +```mermaid +%% GitOps Reconciliation Loop (ADR-0006) +%% Flowchart showing Flux CD GitOps workflow + +flowchart TB + subgraph git["📂 Git Repositories"] + direction TB + homelab["homelab-k8s2<br/>(cluster config)"] + apps["Application Repos<br/>(argo, kubeflow, etc.)"] + end + + subgraph flux["⚙️ Flux Controllers"] + direction TB + source["Source Controller<br/>📥 Fetches repos"] + kustomize["Kustomize Controller<br/>🔧 Applies manifests"] + helm["Helm Controller<br/>📦 Manages charts"] + notification["Notification Controller<br/>📢 Alerts"] + end + + subgraph k8s["☸️ Kubernetes Cluster"] + direction TB + secrets["🔐 SOPS Secrets<br/>(Age decrypted)"] + resources["📋 Deployed Resources<br/>(Pods, Services, etc.)"] + drift["🔄 Drift Detection"] + end + + subgraph notify["📱 Notifications"] + ntfy["ntfy<br/>(push alerts)"] + end + + %% GitOps flow + homelab -->|"GitRepository CR"| source + apps -->|"GitRepository CR"| source + source -->|"Fetches every 5m"| kustomize + source -->|"Fetches charts"| helm + + kustomize -->|"Decrypts with Age"| secrets + kustomize -->|"kubectl apply"| resources + helm -->|"helm upgrade"| resources + + resources -->|"Actual state"| drift + drift -->|"Compares to Git"| kustomize + drift -->|"Auto-corrects"| resources + + notification -->|"Success/failure"| ntfy + + classDef repo fill:#f5a623,color:black + classDef controller fill:#4a90d9,color:white + classDef cluster fill:#50c878,color:white + classDef alert fill:#9b59b6,color:white + + class homelab,apps repo + class source,kustomize,helm,notification controller + class secrets,resources,drift cluster + class ntfy alert + +``` diff --git a/diagrams/handler-deployment.mmd b/diagrams/handler-deployment.mmd new file mode 100644 index 0000000..e4333fc --- /dev/null +++ b/diagrams/handler-deployment.mmd @@ -0,0 +1,67 @@ +```mermaid +%% Handler Deployment Strategy (ADR-0019) +%% C4 Component diagram showing platform layers with Ray cluster + +flowchart TB + subgraph platform["🏗️ Platform Layer"] + direction LR + kubeflow["📊 Kubeflow<br/>Pipelines"] + kserve["🎯 KServe<br/>(visibility)"] + mlflow["📈 MLflow<br/>(registry)"] + end + + subgraph ray["⚡ Ray Cluster"] + direction TB + + subgraph gpu_apps["🎮 GPU Inference (Workers)"] + direction LR + llm["/llm<br/>vLLM<br/>🟢 khelben 0.95 GPU"] + whisper["/whisper<br/>Whisper<br/>🟡 elminster 0.5 GPU"] + tts["/tts<br/>XTTS<br/>🟡 elminster 0.5 GPU"] + embeddings["/embeddings<br/>BGE<br/>🔴 drizzt 0.8 GPU"] + reranker["/reranker<br/>BGE<br/>🔵 danilo 0.8 GPU"] + end + + subgraph cpu_apps["🖥️ CPU Handlers (Head Node)"] + direction LR + chat["/chat<br/>ChatHandler<br/>0 GPU"] + voice["/voice<br/>VoiceHandler<br/>0 GPU"] + end + end + + subgraph support["🔧 Supporting Services"] + direction LR + nats["📨 NATS<br/>(events)"] + milvus["🔍 Milvus<br/>(vectors)"] + valkey["💾 Valkey<br/>(cache)"] + end + + subgraph pypi["📦 Package Registry"] + gitea_pypi["Gitea PyPI<br/>• handler-base<br/>• chat-handler<br/>• voice-assistant"] + end + + %% Connections + kubeflow --> ray + kserve --> ray + mlflow --> ray + + cpu_apps -->|"Ray internal calls"| gpu_apps + cpu_apps --> nats + cpu_apps --> milvus + cpu_apps --> valkey + + gitea_pypi -->|"pip install<br/>runtime_env"| cpu_apps + + classDef platform fill:#9b59b6,color:white + classDef gpu fill:#e74c3c,color:white + classDef cpu fill:#3498db,color:white + classDef support fill:#27ae60,color:white + classDef registry fill:#f39c12,color:black + + class kubeflow,kserve,mlflow platform + class llm,whisper,tts,embeddings,reranker gpu + class chat,voice cpu + class nats,milvus,valkey support + class gitea_pypi registry + +``` diff --git a/diagrams/internal-registry.mmd b/diagrams/internal-registry.mmd new file mode 100644 index 0000000..4a768ad --- /dev/null +++ b/diagrams/internal-registry.mmd @@ -0,0 +1,53 @@ +```mermaid +%% Internal Registry for CI/CD (ADR-0020) +%% Flowchart showing dual-path for external vs internal access + +flowchart TB + subgraph external["🌐 External Access"] + internet["Internet"] + cloudflare["☁️ Cloudflare<br/>⚠️ 100MB upload limit"] + external_url["git.daviestechlabs.io"] + end + + subgraph internal["🏠 Internal Access"] + internal_url["registry.lab.daviestechlabs.io<br/>✅ No upload limits"] + end + + subgraph gitea["📦 Gitea Instance"] + direction TB + git_server["Git Server"] + docker_registry["Docker Registry"] + pypi_registry["PyPI Registry"] + end + + subgraph runners["🏃 CI/CD Runners"] + gitea_runner["Gitea Actions Runner<br/>(in-cluster)"] + end + + subgraph operations["📋 Operations"] + small_ops["Small Operations<br/>• git clone/push<br/>• pip install<br/>• docker pull"] + large_ops["Large Uploads<br/>• docker push (20GB+)<br/>• pypi upload"] + end + + %% External path (limited) + internet --> cloudflare + cloudflare -->|"100MB limit"| external_url + external_url --> gitea + small_ops --> cloudflare + + %% Internal path (unlimited) + gitea_runner -->|"Direct"| internal_url + internal_url --> gitea + large_ops --> internal_url + + classDef external fill:#e74c3c,color:white + classDef internal fill:#27ae60,color:white + classDef gitea fill:#f39c12,color:black + classDef runner fill:#3498db,color:white + + class internet,cloudflare,external_url external + class internal_url internal + class git_server,docker_registry,pypi_registry gitea + class gitea_runner runner + +``` diff --git a/diagrams/kuberay-unified-backend.mmd b/diagrams/kuberay-unified-backend.mmd new file mode 100644 index 0000000..107a5c0 --- /dev/null +++ b/diagrams/kuberay-unified-backend.mmd @@ -0,0 +1,77 @@ +```mermaid +%% KubeRay Unified GPU Backend (ADR-0011) +%% C4 Component diagram showing RayService endpoints and GPU allocation + +flowchart TB + subgraph clients["🔌 Clients"] + chat["Chat Handler"] + voice["Voice Handler"] + end + + subgraph rayservice["⚡ KubeRay RayService"] + endpoint["ai-inference-serve-svc:8000"] + + subgraph deployments["Ray Serve Deployments"] + direction TB + + subgraph strixhalo["🟢 khelben (Strix Halo 64GB)"] + llm["/llm<br/>vLLM 70B<br/>0.95 GPU"] + end + + subgraph rtx2070["🟡 elminster (RTX 2070 8GB)"] + whisper["/whisper<br/>Whisper v3<br/>0.5 GPU"] + tts["/tts<br/>XTTS<br/>0.5 GPU"] + end + + subgraph radeon680m["🔴 drizzt (Radeon 680M 12GB)"] + embeddings["/embeddings<br/>BGE-Large<br/>0.8 GPU"] + end + + subgraph intelarc["🔵 danilo (Intel Arc)"] + reranker["/reranker<br/>BGE-Reranker<br/>0.8 GPU"] + end + end + end + + subgraph kserve["🎯 KServe Compatibility Layer"] + direction TB + svc1["whisper-predictor.ai-ml"] + svc2["tts-predictor.ai-ml"] + svc3["llm-predictor.ai-ml"] + svc4["embeddings-predictor.ai-ml"] + svc5["reranker-predictor.ai-ml"] + end + + %% Client connections + chat --> endpoint + voice --> endpoint + + %% Path routing + endpoint --> llm + endpoint --> whisper + endpoint --> tts + endpoint --> embeddings + endpoint --> reranker + + %% KServe aliases + svc1 -->|"ExternalName"| endpoint + svc2 -->|"ExternalName"| endpoint + svc3 -->|"ExternalName"| endpoint + svc4 -->|"ExternalName"| endpoint + svc5 -->|"ExternalName"| endpoint + + classDef client fill:#3498db,color:white + classDef endpoint fill:#9b59b6,color:white + classDef amd fill:#ED1C24,color:white + classDef nvidia fill:#76B900,color:white + classDef intel fill:#0071C5,color:white + classDef kserve fill:#f39c12,color:black + + class chat,voice client + class endpoint endpoint + class llm,embeddings amd + class whisper,tts nvidia + class reranker intel + class svc1,svc2,svc3,svc4,svc5 kserve + +``` diff --git a/diagrams/node-naming.mmd b/diagrams/node-naming.mmd new file mode 100644 index 0000000..9c04eb7 --- /dev/null +++ b/diagrams/node-naming.mmd @@ -0,0 +1,64 @@ +%% Node Naming Conventions - D&D Theme +%% Related: ADR-0037 + +flowchart TB + subgraph Cluster["Homelab Kubernetes Cluster (14 nodes)"] + subgraph ControlPlane["👑 Control Plane (Companions of the Hall)"] + Bruenor["bruenor<br/>Intel N100<br/>Dwarf King"] + Catti["catti<br/>Intel N100<br/>Catti-brie"] + Storm["storm<br/>Intel N100<br/>Storm Silverhand"] + end + + subgraph Wizards["🧙 Wizards (GPU Spellcasters)"] + Khelben["khelben<br/>Radeon 8060S 64GB<br/>The Blackstaff"] + Elminster["elminster<br/>RTX 2070 8GB<br/>Sage of Shadowdale"] + Drizzt["drizzt<br/>Radeon 680M<br/>Ranger-Mage"] + Danilo["danilo<br/>Intel Arc A770<br/>Bard-Wizard"] + Regis["regis<br/>NVIDIA GPU<br/>Halfling Spellthief"] + end + + subgraph Rogues["🗡️ Rogues (ARM64 Edge Nodes)"] + Durnan["durnan<br/>Pi 4 8GB<br/>Yawning Portal"] + Elaith["elaith<br/>Pi 4 8GB<br/>The Serpent"] + Jarlaxle["jarlaxle<br/>Pi 4 8GB<br/>Bregan D'aerthe"] + Mirt["mirt<br/>Pi 4 8GB<br/>Old Wolf"] + Volo["volo<br/>Pi 4 8GB<br/>Famous Author"] + end + + subgraph Fighters["⚔️ Fighters (x86 CPU Workers)"] + Wulfgar["wulfgar<br/>Intel x86_64<br/>Barbarian of Icewind Dale"] + end + end + + subgraph Infrastructure["🏰 Locations (Off-Cluster Infrastructure)"] + Candlekeep["📚 candlekeep<br/>Synology NAS<br/>nfs-default<br/>Library Fortress"] + Neverwinter["❄️ neverwinter<br/>TrueNAS Scale (SSD)<br/>nfs-fast<br/>Jewel of the North"] + Waterdeep["🏙️ waterdeep<br/>Mac Mini<br/>Dev Workstation<br/>City of Splendors"] + end + + subgraph Workloads["Workload Routing"] + AI["AI/ML Inference"] --> Wizards + Edge["Edge Services"] --> Rogues + Compute["General Compute"] --> Fighters + Storage["Storage I/O"] --> Infrastructure + end + + ControlPlane -.->|"etcd"| ControlPlane + Wizards -.->|"Fast Storage"| Neverwinter + Wizards -.->|"Backups"| Candlekeep + Rogues -.->|"NFS Mounts"| Candlekeep + Fighters -.->|"NFS Mounts"| Candlekeep + + classDef control fill:#2563eb,stroke:#1d4ed8,color:#fff + classDef wizard fill:#7c3aed,stroke:#5b21b6,color:#fff + classDef rogue fill:#059669,stroke:#047857,color:#fff + classDef fighter fill:#dc2626,stroke:#b91c1c,color:#fff + classDef location fill:#d97706,stroke:#b45309,color:#fff + classDef workload fill:#4b5563,stroke:#374151,color:#fff + + class Bruenor,Catti,Storm control + class Khelben,Elminster,Drizzt,Danilo,Regis wizard + class Durnan,Elaith,Jarlaxle,Mirt,Volo rogue + class Wulfgar fighter + class Candlekeep,Neverwinter,Waterdeep location + class AI,Edge,Compute,Storage workload diff --git a/diagrams/notification-architecture.mmd b/diagrams/notification-architecture.mmd new file mode 100644 index 0000000..eb83c89 --- /dev/null +++ b/diagrams/notification-architecture.mmd @@ -0,0 +1,63 @@ +```mermaid +%% Notification Architecture (ADR-0021) +%% C4 Component diagram showing notification sources and hub + +flowchart LR + subgraph sources["📤 Notification Sources"] + direction TB + ci["🔧 Gitea Actions<br/>CI/CD builds"] + alertmanager["🔔 Alertmanager<br/>Prometheus alerts"] + gatus["❤️ Gatus<br/>Health monitoring"] + flux["🔄 Flux<br/>GitOps events"] + end + + subgraph hub["📡 Central Hub"] + ntfy["📢 ntfy<br/>Notification Server"] + end + + subgraph topics["🏷️ Topics"] + direction TB + t_ci["gitea-ci"] + t_alerts["alertmanager-alerts"] + t_gatus["gatus"] + t_flux["flux"] + t_deploy["deployments"] + end + + subgraph consumers["📱 Consumers"] + direction TB + mobile["📱 ntfy App<br/>(iOS/Android)"] + bridge["🌉 ntfy-discord<br/>Bridge"] + discord["💬 Discord<br/>Webhooks"] + end + + %% Source to hub + ci -->|"POST"| ntfy + alertmanager -->|"webhook"| ntfy + gatus -->|"webhook"| ntfy + flux -->|"notification-controller"| ntfy + + %% Hub to topics + ntfy --> topics + + %% Topics to consumers + t_ci --> mobile + t_alerts --> mobile + t_gatus --> mobile + t_flux --> mobile + t_deploy --> mobile + + topics --> bridge + bridge --> discord + + classDef source fill:#3498db,color:white + classDef hub fill:#e74c3c,color:white + classDef topic fill:#9b59b6,color:white + classDef consumer fill:#27ae60,color:white + + class ci,alertmanager,gatus,flux source + class ntfy hub + class t_ci,t_alerts,t_gatus,t_flux,t_deploy topic + class mobile,bridge,discord consumer + +``` diff --git a/diagrams/ntfy-discord-bridge.mmd b/diagrams/ntfy-discord-bridge.mmd new file mode 100644 index 0000000..f50f3a2 --- /dev/null +++ b/diagrams/ntfy-discord-bridge.mmd @@ -0,0 +1,45 @@ +```mermaid +%% ntfy-Discord Bridge (ADR-0022) +%% Sequence diagram showing message flow and transformation + +sequenceDiagram + autonumber + participant S as Notification Source<br/>(CI/Alertmanager) + participant N as ntfy<br/>Notification Hub + participant B as ntfy-discord<br/>Go Bridge + participant D as Discord<br/>Webhook + + Note over S,N: Events published to ntfy topics + + S->>N: POST /gitea-ci<br/>{title, message, priority} + + Note over N,B: SSE subscription for real-time + + N-->>B: SSE JSON stream<br/>{topic, message, priority, tags} + + Note over B: Message transformation + + rect rgb(240, 240, 240) + B->>B: Map priority to embed color<br/>urgent=red, high=orange<br/>default=blue, low=gray + B->>B: Format as Discord embed<br/>{embeds: [{title, description, color}]} + end + + B->>D: POST webhook URL<br/>Discord embed format + + Note over B: Hot-reload support + + rect rgb(230, 245, 230) + B->>B: fsnotify watches secrets + B->>B: Reload config without restart + end + + Note over B,D: Retry with exponential backoff + + alt Webhook fails + B-->>B: Retry (2s, 4s, 8s...) + B->>D: Retry POST + end + + D-->>D: Display in channel + +``` diff --git a/diagrams/observability-stack.mmd b/diagrams/observability-stack.mmd new file mode 100644 index 0000000..83a508f --- /dev/null +++ b/diagrams/observability-stack.mmd @@ -0,0 +1,72 @@ +```mermaid +%% Observability Stack Architecture (ADR-0025) +%% C4 Component diagram showing telemetry flow + +flowchart TB + subgraph apps["📦 Applications"] + direction LR + go["Go Apps<br/>(OTEL SDK)"] + python["Python Apps<br/>(OTEL SDK)"] + node["Node.js Apps<br/>(OTEL SDK)"] + java["Java Apps<br/>(OTEL SDK)"] + end + + subgraph collection["📡 Telemetry Collection"] + otel["OpenTelemetry<br/>Collector<br/>━━━━━━━━<br/>OTLP gRPC :4317<br/>OTLP HTTP :4318"] + end + + subgraph storage["💾 Storage Layer"] + direction LR + + subgraph metrics_store["Metrics"] + prometheus["📊 Prometheus<br/>14d retention<br/>50GB"] + end + + subgraph logs_traces["Logs & Traces"] + clickstack["📋 ClickStack<br/>(ClickHouse)"] + end + end + + subgraph visualization["📈 Visualization"] + grafana["🎨 Grafana<br/>Dashboards<br/>& Exploration"] + end + + subgraph alerting["🔔 Alerting Pipeline"] + alertmanager["⚠️ Alertmanager"] + ntfy["📱 ntfy<br/>(Push)"] + discord["💬 Discord"] + end + + %% App to collector + go -->|"OTLP"| otel + python -->|"OTLP"| otel + node -->|"OTLP"| otel + java -->|"OTLP"| otel + + %% Collector to storage + otel -->|"Metrics"| prometheus + otel -->|"Logs"| clickstack + otel -->|"Traces"| clickstack + + %% Storage to visualization + prometheus --> grafana + clickstack --> grafana + + %% Alerting flow + prometheus -->|"PrometheusRules"| alertmanager + alertmanager --> ntfy + ntfy --> discord + + classDef app fill:#3498db,color:white + classDef otel fill:#e74c3c,color:white + classDef storage fill:#27ae60,color:white + classDef viz fill:#9b59b6,color:white + classDef alert fill:#f39c12,color:black + + class go,python,node,java app + class otel otel + class prometheus,clickstack storage + class grafana viz + class alertmanager,ntfy,discord alert + +``` diff --git a/diagrams/ray-repository-structure.mmd b/diagrams/ray-repository-structure.mmd new file mode 100644 index 0000000..0e5f73e --- /dev/null +++ b/diagrams/ray-repository-structure.mmd @@ -0,0 +1,66 @@ +```mermaid +%% Ray Repository Structure (ADR-0024) +%% Flowchart showing build and dynamic loading flow + +flowchart TB + subgraph repos["📁 Repositories"] + direction LR + kuberay["kuberay-images<br/>🐳 Docker images<br/>(infrequent updates)"] + rayserve["ray-serve<br/>📦 PyPI package<br/>(frequent updates)"] + end + + subgraph ci["🔧 CI/CD Pipelines"] + direction LR + build_images["Build Docker<br/>nvidia, rdna2,<br/>strixhalo, intel"] + build_pypi["Build wheel<br/>uv build"] + end + + subgraph registries["📦 Registries"] + direction LR + container_reg["🐳 Container Registry<br/>registry.lab.daviestechlabs.io"] + pypi_reg["📦 PyPI Registry<br/>git.daviestechlabs.io/pypi"] + end + + subgraph ray["⚡ Ray Cluster"] + direction TB + head["🧠 Head Node"] + workers["🖥️ Worker Nodes<br/>(GPU-specific)"] + + subgraph runtime["🔄 Runtime Loading"] + pull_image["docker pull<br/>ray-worker-*"] + pip_install["pip install ray-serve<br/>runtime_env"] + end + + serve_apps["Ray Serve Apps<br/>/llm, /whisper, etc."] + end + + subgraph k8s["☸️ Kubernetes"] + manifests["RayService CR<br/>(homelab-k8s2)"] + end + + %% Build flows + kuberay --> build_images + rayserve --> build_pypi + build_images --> container_reg + build_pypi --> pypi_reg + + %% Deployment flow + manifests --> ray + container_reg --> pull_image + pull_image --> workers + pypi_reg --> pip_install + pip_install --> serve_apps + + classDef repo fill:#3498db,color:white + classDef ci fill:#f39c12,color:black + classDef registry fill:#9b59b6,color:white + classDef ray fill:#27ae60,color:white + classDef k8s fill:#e74c3c,color:white + + class kuberay,rayserve repo + class build_images,build_pypi ci + class container_reg,pypi_reg registry + class head,workers,pull_image,pip_install,serve_apps ray + class manifests k8s + +``` diff --git a/diagrams/renovate-workflow.mmd b/diagrams/renovate-workflow.mmd new file mode 100644 index 0000000..0902c41 --- /dev/null +++ b/diagrams/renovate-workflow.mmd @@ -0,0 +1,86 @@ +%% Renovate Dependency Update Workflow +%% Related: ADR-0036 + +flowchart TB + subgraph Schedule["Schedule"] + Cron["CronJob<br/>Every 8 hours"] + end + + subgraph Renovate["Renovate (ci-cd namespace)"] + Job["Renovate Job"] + + subgraph Scan["Repository Scan"] + Discover["Autodiscover<br/>Gitea Repos"] + Parse["Parse Dependencies<br/>40+ managers"] + Compare["Compare Versions<br/>Check registries"] + end + end + + subgraph Registries["Version Sources"] + DockerHub["Docker Hub"] + GHCR["GHCR"] + PyPI["PyPI"] + GoProxy["Go Proxy"] + Helm["Helm Repos"] + end + + subgraph Gitea["Gitea Repositories"] + subgraph Repos["Scanned Repos"] + K8s["homelab-k8s2"] + Handler["chat-handler"] + KubeRay["kuberay-images"] + More["...20+ repos"] + end + + subgraph PRs["Generated PRs"] + Grouped["Grouped PR<br/>all-non-major"] + Security["Security PR<br/>CVE fixes"] + Major["Major PR<br/>breaking changes"] + end + + Dashboard["Dependency Dashboard<br/>Issue #1"] + end + + subgraph Merge["Merge Strategy"] + AutoMerge["Auto-merge<br/>patch + minor"] + Review["Manual Review<br/>major updates"] + end + + Cron --> Job + Job --> Discover + Discover --> Parse + Parse --> Compare + + Compare --> DockerHub + Compare --> GHCR + Compare --> PyPI + Compare --> GoProxy + Compare --> Helm + + Discover --> K8s + Discover --> Handler + Discover --> KubeRay + Discover --> More + + Compare --> Grouped + Compare --> Security + Compare --> Major + Job --> Dashboard + + Grouped --> AutoMerge + Security --> AutoMerge + Major --> Review + + classDef schedule fill:#4a5568,stroke:#718096,color:#fff + classDef renovate fill:#667eea,stroke:#5a67d8,color:#fff + classDef registry fill:#ed8936,stroke:#dd6b20,color:#fff + classDef repo fill:#38a169,stroke:#2f855a,color:#fff + classDef pr fill:#9f7aea,stroke:#805ad5,color:#fff + classDef merge fill:#e53e3e,stroke:#c53030,color:#fff + + class Cron schedule + class Job,Discover,Parse,Compare renovate + class DockerHub,GHCR,PyPI,GoProxy,Helm registry + class K8s,Handler,KubeRay,More repo + class Grouped,Security,Major,Dashboard pr + class AutoMerge,Review merge diff --git a/diagrams/secrets-management.mmd b/diagrams/secrets-management.mmd new file mode 100644 index 0000000..d636bf1 --- /dev/null +++ b/diagrams/secrets-management.mmd @@ -0,0 +1,51 @@ +```mermaid +%% Secrets Management Strategy (ADR-0017) +%% Flowchart showing dual secret paths: SOPS bootstrap vs Vault runtime + +flowchart TB + subgraph bootstrap["🚀 Bootstrap Secrets (Git-encrypted)"] + direction TB + sops_files["*.sops.yaml<br/>📄 Encrypted in Git"] + age_key["🔑 Age Key<br/>(backed up externally)"] + sops_dec["SOPS Decryption"] + flux_dec["Flux Controller"] + bs_secrets["🔐 Bootstrap Secrets<br/>• Talos machine secrets<br/>• GitHub deploy key<br/>• Initial Vault unseal"] + end + + subgraph runtime["⚙️ Runtime Secrets (Vault-managed)"] + direction TB + vault["🏦 HashiCorp Vault<br/>HA (3 replicas) + Raft"] + eso["External Secrets<br/>Operator"] + app_secrets["🔑 Application Secrets<br/>• Database credentials<br/>• API keys<br/>• OAuth secrets"] + end + + subgraph apps["📦 Applications"] + direction TB + pods["Workload Pods"] + end + + %% Bootstrap flow + sops_files -->|"Commit to Git"| flux_dec + age_key -->|"Decrypts"| sops_dec + flux_dec --> sops_dec + sops_dec -->|"Creates"| bs_secrets + + %% Runtime flow + vault -->|"ExternalSecret CR"| eso + eso -->|"Syncs to"| app_secrets + + %% Consumption + bs_secrets -->|"Mounted"| pods + app_secrets -->|"Mounted"| pods + + classDef bootstrap fill:#3498db,color:white + classDef vault fill:#27ae60,color:white + classDef secrets fill:#e74c3c,color:white + classDef app fill:#9b59b6,color:white + + class sops_files,age_key,sops_dec,flux_dec bootstrap + class vault,eso vault + class bs_secrets,app_secrets secrets + class pods app + +``` diff --git a/diagrams/security-policy-enforcement.mmd b/diagrams/security-policy-enforcement.mmd new file mode 100644 index 0000000..3351c68 --- /dev/null +++ b/diagrams/security-policy-enforcement.mmd @@ -0,0 +1,81 @@ +```mermaid +%% Security Policy Enforcement (ADR-0018) +%% Flowchart showing admission control and vulnerability scanning + +flowchart TB + subgraph deploy["🚀 Deployment Sources"] + kubectl["kubectl"] + flux["Flux CD"] + end + + subgraph admission["🛡️ Admission Control"] + api["Kubernetes<br/>API Server"] + gatekeeper["Gatekeeper (OPA)<br/>⚖️ Policy Validation"] + end + + subgraph policies["📋 Policies"] + direction TB + p1["No privileged containers"] + p2["Required labels"] + p3["Resource limits"] + p4["Image registry whitelist"] + end + + subgraph enforcement["🎯 Enforcement Modes"] + warn["⚠️ warn<br/>(log only)"] + dryrun["📊 dryrun<br/>(audit)"] + deny["🚫 deny<br/>(block)"] + end + + subgraph workloads["☸️ Running Workloads"] + pods["Pods<br/>Deployments<br/>StatefulSets"] + end + + subgraph scanning["🔍 Continuous Scanning"] + trivy["Trivy Operator"] + reports["VulnerabilityReports<br/>(CRDs)"] + end + + subgraph observability["📈 Observability"] + prometheus["Prometheus<br/>📊 Metrics"] + grafana["Grafana<br/>📉 Dashboards"] + alertmanager["Alertmanager<br/>🔔 Alerts"] + ntfy["ntfy<br/>📱 Notifications"] + end + + %% Admission flow + kubectl --> api + flux --> api + api -->|"Intercepts"| gatekeeper + gatekeeper -->|"Evaluates"| policies + policies --> enforcement + warn -->|"Allows"| workloads + dryrun -->|"Allows"| workloads + deny -->|"Blocks"| api + enforcement -->|"Violations"| prometheus + + %% Scanning flow + workloads -->|"Scans images"| trivy + trivy -->|"Creates"| reports + reports -->|"Exports"| prometheus + + %% Observability flow + prometheus --> grafana + prometheus --> alertmanager + alertmanager --> ntfy + + classDef source fill:#f39c12,color:black + classDef admission fill:#3498db,color:white + classDef policy fill:#9b59b6,color:white + classDef workload fill:#27ae60,color:white + classDef scan fill:#e74c3c,color:white + classDef observe fill:#1abc9c,color:white + + class kubectl,flux source + class api,gatekeeper admission + class p1,p2,p3,p4,warn,dryrun,deny policy + class pods workload + class trivy,reports scan + class prometheus,grafana,alertmanager,ntfy observe + +``` diff --git a/diagrams/storage-strategy.mmd b/diagrams/storage-strategy.mmd new file mode 100644 index 0000000..2ab1ad0 --- /dev/null +++ b/diagrams/storage-strategy.mmd @@ -0,0 +1,67 @@ +```mermaid +%% Tiered Storage Strategy (ADR-0026) +%% C4 Component diagram showing Longhorn + NFS dual-tier + +flowchart TB + subgraph tier1["🚀 TIER 1: LONGHORN (Fast Distributed Block)"] + direction TB + + subgraph nodes["Cluster Nodes"] + direction LR + khelben["🖥️ khelben<br/>/var/mnt/longhorn<br/>NVMe"] + mystra["🖥️ mystra<br/>/var/mnt/longhorn<br/>SSD"] + selune["🖥️ selune<br/>/var/mnt/longhorn<br/>SSD"] + end + + longhorn_mgr["⚙️ Longhorn Manager<br/>(Schedules 2-3 replicas)"] + + subgraph longhorn_pvcs["Performance Workloads"] + direction LR + pg["🐘 PostgreSQL"] + vault["🔐 Vault"] + prom["📊 Prometheus"] + click["📋 ClickHouse"] + end + end + + subgraph tier2["💾 TIER 2: NFS-SLOW (High-Capacity Bulk)"] + direction TB + + nas["🗄️ candlekeep.lab.daviestechlabs.io<br/>External NAS<br/>/kubernetes"] + + nfs_csi["📂 NFS CSI Driver"] + + subgraph nfs_pvcs["Bulk Storage Workloads"] + direction LR + jellyfin["🎬 Jellyfin<br/>(1TB+ media)"] + nextcloud["☁️ Nextcloud"] + immich["📷 Immich"] + kavita["📚 Kavita"] + mlflow["📈 MLflow<br/>Artifacts"] + ray_models["🤖 Ray<br/>Model Weights"] + end + end + + %% Tier 1 connections + nodes --> longhorn_mgr + longhorn_mgr --> longhorn_pvcs + + %% Tier 2 connections + nas --> nfs_csi + nfs_csi --> nfs_pvcs + + classDef tier1_node fill:#3498db,color:white + classDef tier1_mgr fill:#2980b9,color:white + classDef tier1_pvc fill:#1abc9c,color:white + classDef tier2_nas fill:#e74c3c,color:white + classDef tier2_csi fill:#c0392b,color:white + classDef tier2_pvc fill:#f39c12,color:black + + class khelben,mystra,selune tier1_node + class longhorn_mgr tier1_mgr + class pg,vault,prom,click tier1_pvc + class nas tier2_nas + class nfs_csi tier2_csi + class jellyfin,nextcloud,immich,kavita,mlflow,ray_models tier2_pvc + +``` diff --git a/diagrams/user-registration-workflow.mmd b/diagrams/user-registration-workflow.mmd new file mode 100644 index 0000000..c3031cb --- /dev/null +++ b/diagrams/user-registration-workflow.mmd @@ -0,0 +1,93 @@ +```mermaid +%% User Registration and Approval Workflow (ADR-0029) +%% Flowchart showing registration, approval, and access control + +flowchart TB + subgraph registration["📝 Registration Flow"] + direction TB + request["👤 User Requests<br/>Account"] + form["📋 Enrollment<br/>Form"] + created["✅ Account<br/>Created"] + pending["⏳ pending-approval<br/>Group"] + end + + subgraph approval["✋ Admin Approval"] + direction TB + notify["📧 Admin<br/>Notification"] + review["👁️ Admin<br/>Reviews"] + decision{{"Decision"}} + end + + subgraph groups["👥 Group Assignment"] + direction LR + reject["❌ Rejected"] + guests["🎫 homelab-guests<br/>Limited access"] + users["👥 homelab-users<br/>Full access"] + admins["👑 homelab-admins<br/>Admin access"] + end + + subgraph access["🔓 Application Access"] + direction TB + + subgraph admin_apps["Admin Apps"] + authentik_admin["Authentik Admin"] + gitea["Gitea"] + flux_ui["Flux UI"] + end + + subgraph user_apps["User Apps"] + affine["Affine"] + immich["Immich"] + nextcloud["Nextcloud"] + vaultwarden["Vaultwarden"] + end + + subgraph guest_apps["Guest Apps"] + kavita["Kavita"] + end + + subgraph no_access["No Access"] + profile["Authentik Profile<br/>(only)"] + end + end + + %% Registration flow + request --> form + form --> created + created --> pending + pending --> notify + + %% Approval flow + notify --> review + review --> decision + decision -->|"Reject"| reject + decision -->|"Basic"| guests + decision -->|"Full"| users + decision -->|"Admin"| admins + + %% Access mapping + reject --> profile + guests --> guest_apps + users --> user_apps + users --> guest_apps + admins --> admin_apps + admins --> user_apps + admins --> guest_apps + + classDef registration fill:#3498db,color:white + classDef approval fill:#f39c12,color:black + classDef group fill:#9b59b6,color:white + classDef admin fill:#e74c3c,color:white + classDef user fill:#27ae60,color:white + classDef guest fill:#1abc9c,color:white + classDef none fill:#95a5a6,color:white + + class request,form,created,pending registration + class notify,review approval + class reject,guests,users,admins group + class authentik_admin,gitea,flux_ui admin + class affine,immich,nextcloud,vaultwarden user + class kavita guest + class profile none + +``` diff --git a/diagrams/velero-backup.mmd b/diagrams/velero-backup.mmd new file mode 100644 index 0000000..17abe8d --- /dev/null +++ b/diagrams/velero-backup.mmd @@ -0,0 +1,60 @@ +%% Velero Backup Architecture +%% Related: ADR-0032 + +flowchart TB + subgraph Schedule["Backup Schedule"] + Nightly["Nightly Backup<br/>2:00 AM"] + Hourly["Hourly Snapshots<br/>Critical Namespaces"] + end + + subgraph Velero["Velero (velero namespace)"] + Server["Velero Server"] + NodeAgent["Node Agent<br/>(DaemonSet)"] + end + + subgraph Sources["Backup Sources"] + PVs["Persistent Volumes<br/>(Longhorn)"] + Resources["Kubernetes Resources<br/>(Secrets, ConfigMaps)"] + DBs["Database Dumps<br/>(Pre-backup hooks)"] + end + + subgraph Targets["Backup Destinations"] + subgraph Primary["Primary: S3"] + MinIO["MinIO<br/>On-premises S3"] + end + subgraph Secondary["Secondary: NFS"] + NAS["Synology NAS<br/>Long-term retention"] + end + end + + subgraph Restore["Restore Options"] + Full["Full Cluster Restore"] + Namespace["Namespace Restore"] + Selective["Selective Resource Restore"] + end + + Nightly --> Server + Hourly --> Server + Server --> NodeAgent + NodeAgent --> PVs + Server --> Resources + Server --> DBs + + Server --> MinIO + MinIO -.->|Replicated| NAS + + Server --> Full + Server --> Namespace + Server --> Selective + + classDef schedule fill:#4a5568,stroke:#718096,color:#fff + classDef velero fill:#667eea,stroke:#5a67d8,color:#fff + classDef source fill:#48bb78,stroke:#38a169,color:#fff + classDef target fill:#ed8936,stroke:#dd6b20,color:#fff + classDef restore fill:#9f7aea,stroke:#805ad5,color:#fff + + class Nightly,Hourly schedule + class Server,NodeAgent velero + class PVs,Resources,DBs source + class MinIO,NAS target + class Full,Namespace,Selective restore diff --git a/diagrams/volcano-scheduling.mmd b/diagrams/volcano-scheduling.mmd new file mode 100644 index 0000000..f92870e --- /dev/null +++ b/diagrams/volcano-scheduling.mmd @@ -0,0 +1,81 @@ +%% Volcano Batch Scheduling Architecture +%% Related: ADR-0034 + +flowchart TB + subgraph Submissions["Workload Submissions"] + KFP["Kubeflow Pipelines"] + Argo["Argo Workflows"] + Spark["Spark Jobs"] + Ray["Ray Jobs"] + end + + subgraph Volcano["Volcano Scheduler"] + Admission["Admission Controller"] + Scheduler["Volcano Scheduler"] + Controller["Job Controller"] + + subgraph Plugins["Scheduling Plugins"] + Gang["Gang Scheduling"] + Priority["Priority"] + DRF["Dominant Resource Fairness"] + Binpack["Bin Packing"] + end + end + + subgraph Queues["Resource Queues"] + MLQueue["ml-training<br/>weight: 4"] + InferQueue["inference<br/>weight: 3"] + BatchQueue["batch-jobs<br/>weight: 2"] + DefaultQueue["default<br/>weight: 1"] + end + + subgraph Resources["Cluster Resources"] + subgraph GPUs["GPU Nodes"] + Khelben["khelben<br/>Strix Halo 64GB"] + Elminster["elminster<br/>RTX 2070"] + Drizzt["drizzt<br/>RDNA2 680M"] + Danilo["danilo<br/>Intel Arc"] + end + subgraph CPU["CPU Nodes"] + Workers["9 x86_64 Workers"] + ARM["5 ARM64 Workers"] + end + end + + KFP --> Admission + Argo --> Admission + Spark --> Admission + Ray --> Admission + + Admission --> Scheduler + Scheduler --> Controller + + Scheduler --> Gang + Scheduler --> Priority + Scheduler --> DRF + Scheduler --> Binpack + + Controller --> MLQueue + Controller --> InferQueue + Controller --> BatchQueue + Controller --> DefaultQueue + + MLQueue --> GPUs + InferQueue --> GPUs + BatchQueue --> GPUs + BatchQueue --> CPU + DefaultQueue --> CPU + + classDef submit fill:#4a5568,stroke:#718096,color:#fff + classDef volcano fill:#667eea,stroke:#5a67d8,color:#fff + classDef plugin fill:#9f7aea,stroke:#805ad5,color:#fff + classDef queue fill:#ed8936,stroke:#dd6b20,color:#fff + classDef gpu fill:#e53e3e,stroke:#c53030,color:#fff + classDef cpu fill:#38a169,stroke:#2f855a,color:#fff + + class KFP,Argo,Spark,Ray submit + class Admission,Scheduler,Controller volcano + class Gang,Priority,DRF,Binpack plugin + class MLQueue,InferQueue,BatchQueue,DefaultQueue queue + class Khelben,Elminster,Drizzt,Danilo gpu
class Workers,ARM cpu
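The queue weights in the volcano-scheduling diagram (ml-training: 4, inference: 3, batch-jobs: 2, default: 1) divide contended cluster capacity proportionally. A minimal sketch of that proportional-share arithmetic — an illustration only, not Volcano's actual scheduler code, and the 100-unit capacity figure is a made-up example:

```python
def weighted_shares(queues: dict[str, int], capacity: float) -> dict[str, float]:
    """Split capacity across queues in proportion to their weights."""
    total = sum(queues.values())
    return {name: capacity * weight / total for name, weight in queues.items()}

# Weights taken from the volcano-scheduling diagram (ADR-0034);
# 100.0 capacity units is a hypothetical figure for illustration.
queues = {"ml-training": 4, "inference": 3, "batch-jobs": 2, "default": 1}
shares = weighted_shares(queues, capacity=100.0)
# ml-training -> 40.0, inference -> 30.0, batch-jobs -> 20.0, default -> 10.0
```

Under contention each queue is entitled to roughly this fraction of the cluster; when a queue is idle, Volcano's fairness plugins let other queues borrow its share, which a static calculation like this does not capture.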