tracebloc · LukasWodka · Jun 5, 2026
diff --git a/.github/workflows/installer-tests.yaml b/.github/workflows/installer-tests.yaml
@@ -55,12 +55,11 @@ jobs:
           # below for visibility but don't fail the gate.
           shellcheck --severity=error --shell=bash \
             scripts/install.sh scripts/install-k8s.sh scripts/lib/*.sh \
-            scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-proxy.sh scripts/tests/check-drift.sh
+            scripts/tests/check-drift.sh scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-journey.sh scripts/tests/e2e-proxy.sh scripts/tests/path-persist.sh
           echo "── shellcheck warnings (advisory, non-blocking) ──"
           shellcheck --severity=warning --shell=bash \
             scripts/install.sh scripts/install-k8s.sh scripts/lib/*.sh \
-            scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-proxy.sh scripts/tests/check-drift.sh || true
-
+            scripts/tests/check-drift.sh scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-journey.sh scripts/tests/e2e-proxy.sh scripts/tests/path-persist.sh
       - name: PSScriptAnalyzer (PowerShell installer)
         shell: pwsh
         run: |
@@ -172,3 +171,64 @@ jobs:
       - uses: actions/checkout@v4
       - name: Cluster up through an authenticated proxy
         run: bash scripts/tests/e2e-proxy.sh
+
+  path-persist:
+    # Fresh-shell PATH-persistence guard for the tracebloc CLI — the cheap, wide
+    # leg. Installs cli/install.sh in a plain container per distro, then opens a
+    # BRAND-NEW login AND non-login shell for each of bash/zsh/fish and asserts
+    # `tracebloc` resolves + `tracebloc version` runs. This is the cell that goes
+    # RED on the pre-fix installer and GREEN on the fixed one (a fresh non-login
+    # bash reads ~/.bashrc, not ~/.profile) — the regression guard for the whole
+    # PATH class that distro-prereqs / e2e-cluster can't see (they assert in the
+    # same shell that ran the installer). No cluster, no Docker-in-Docker, no
+    # creds. The script iterates shell × mode INSIDE the container; distro is the
+    # matrix axis (it installs zsh/fish in-container where a package exists).
+    name: PATH persist — ${{ matrix.distro }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        distro:
+          - 'ubuntu:22.04'        # most common server
+          - 'ubuntu:24.04'        # newest LTS
+          - 'debian:12'           # apt
+          - 'fedora:latest'       # dnf
+          - 'almalinux:9'         # RHEL rebuild
+          - 'opensuse/leap:15.6'  # zypper
+          - 'alpine:3'            # busybox sh + apk (optional, minimal)
+    steps:
+      - uses: actions/checkout@v4
+      - name: Fresh-shell PATH check in ${{ matrix.distro }}
+        env:
+          DISTRO: ${{ matrix.distro }}
+          # Override TRACEBLOC_CLI_REF here to point at a specific cli install.sh
+          # (URL or, for the cross-repo cli-side caller, a local path). Default
+          # lives in the script while cli#61's PATH fix is unreleased.
+          TRACEBLOC_CLI_REF: ${{ vars.TRACEBLOC_CLI_REF }}
+        run: |
+          docker run --rm \
+            -e TRACEBLOC_CLI_REF \
+            -v "$PWD:/src:ro" -w /src "$DISTRO" \
+            bash scripts/tests/path-persist.sh
+
+  e2e-journey:
+    # Leg 2 — the full last-mile journey on a real cluster: create_cluster() →
+    # install the CLI via cli/install.sh → apply a CREDENTIAL-FREE stub the CLI
+    # discovers (a *-jobs-manager Deployment with the chart's labels + an
+    # `ingestor` SA) → assert `tracebloc cluster info` succeeds AND resolves from
+    # a fresh shell → `dataset push --dry-run` smoke. No private images, no
+    # secrets (the stub stands in for the parent release). Heavier than Leg 1
+    # (boots a real k3d cluster), so — mirroring cli's e2e.yml — it runs on the
+    # nightly schedule + workflow_dispatch, and on a PR ONLY when it carries the
+    # `e2e` label. amd64, like e2e-cluster.
+    name: E2E last-mile journey (amd64)
+    runs-on: ubuntu-latest
+    if: >-
+      github.event_name != 'pull_request' ||
+      contains(github.event.pull_request.labels.*.name, 'e2e')
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install → CLI → cluster info (fresh shell) → dataset push --dry-run
+        env:
+          TRACEBLOC_CLI_REF: ${{ vars.TRACEBLOC_CLI_REF }}
+        run: bash scripts/tests/e2e-journey.sh
diff --git a/scripts/tests/e2e-journey.sh b/scripts/tests/e2e-journey.sh
@@ -0,0 +1,303 @@
+#!/usr/bin/env bash
+# =============================================================================
+#  e2e-journey.sh — last-mile customer journey on a real cluster
+# -----------------------------------------------------------------------------
+#  Continues exactly where e2e-cluster.sh stops. That job proves the installer's
+#  create_cluster() brings up a real k3d cluster and can run a workload, then
+#   stops BEFORE the CLI. This job picks up from a live cluster and walks the
+#  documented next steps a customer takes:
+#
+#    1. create_cluster()                      (the installer's real path)
+#    2. install the tracebloc CLI via cli/install.sh
+#    3. apply a CREDENTIAL-FREE stub that looks like the parent client release
+#       to the CLI's discovery (a *-jobs-manager Deployment with the chart's
+#       hallmark labels + an `ingestor` ServiceAccount), point the kubeconfig
+#       context's namespace at it, and assert `tracebloc cluster info`:
+#         (a) succeeds (exit 0), AND
+#         (b) succeeds from a FRESH shell (the cli#61 PATH class, on the journey)
+#    4. `tracebloc dataset push --dry-run` smoke on a tiny sample CSV
+#       (offline-validatable; no creds, no real ingestion)
+#    5. teardown (EXIT trap, same as e2e-cluster.sh)
+#
+#  What it deliberately does NOT do: the private-image tracebloc helm install +
+#  backend registration (needs real credentials + a reachable platform). The
+#  whole point of the stub is to exercise the CLI's discovery + token + dry-run
+#  paths end-to-end WITHOUT any of that — so this runs on stock GitHub runners
+#  with no secrets, like e2e-cluster.sh.
+#
+#  Every long-running step is wrapped in a watchdog timeout so a hang FAILS the
+#  job instead of spinning until the 6h GitHub ceiling (ties to the conntrack
+#  "looks hung" class — a hang must surface as a red failure, not a timeout).
+#
+#  Configuration (env):
+#    TRACEBLOC_CLI_REF       URL or local path to cli/install.sh (see path-persist.sh).
+#    TRACEBLOC_CLI_VERSION   Optional --version tag for install.sh.
+#    CLUSTER_NAME            Isolated cluster name (default tbe2e-journey).
+#    TB_NAMESPACE            Namespace the stub release lives in (default tracebloc).
+#
+#  Usage:  bash scripts/tests/e2e-journey.sh
+# =============================================================================
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+LIB="$HERE/../lib"
+
+# Isolated cluster name so we never touch a real 'tracebloc' cluster; opt out of
+# autostart so we don't reconfigure docker.service / restart policies on the host
+# (identical isolation posture to e2e-cluster.sh).
+export USER="${USER:-$(id -un)}"
+export CLUSTER_NAME="${CLUSTER_NAME:-tbe2e-journey}"
+export TRACEBLOC_NO_AUTOSTART=1
+
+TB_NAMESPACE="${TB_NAMESPACE:-tracebloc}"
+# Cosmetic stand-ins for the chart's real values — discovery keys off the LABELS
+# below, not these, so any plausible values work. A release name + a pinned chart
+# version make `cluster info`'s output realistic.
+STUB_RELEASE="tbe2e"
+STUB_CHART_VERSION="0.0.0-e2e"
+
+CLI_REF="${TRACEBLOC_CLI_REF:-}"
+CLI_VERSION="${TRACEBLOC_CLI_VERSION:-}"
+
+# shellcheck source=/dev/null
+source "$LIB/common.sh"
+# shellcheck source=/dev/null
+source "$LIB/setup-linux.sh"
+# shellcheck source=/dev/null
+source "$LIB/cluster.sh"
+
+WORKDIR="$(mktemp -d "${TMPDIR:-/tmp}/tb-e2e-journey-XXXXXX")"
+cleanup() {
+  k3d cluster delete "$CLUSTER_NAME" >/dev/null 2>&1 || true
+  rm -rf "$WORKDIR" 2>/dev/null || true
+}
+trap cleanup EXIT
+
+# ── Watchdog: run a step under a hard time limit ─────────────────────────────
+# A hang (e.g. a stuck image pull, a wedged API server, the conntrack "looks
+# hung" class) must surface as a red FAILURE, not an infinite spinner. `timeout`
+# (GNU coreutils) is preinstalled on the Ubuntu runners this job targets; if it's
+# somehow absent we degrade to running the step unguarded rather than dying on a
+# missing binary (the step can still fail on its own non-zero exit).
+guard() { # guard <seconds> <label> -- <command...>
+  local secs="$1" label="$2"; shift 2
+  [[ "${1:-}" == "--" ]] && shift
+  if has timeout; then
+    if ! timeout --kill-after=15s "$secs" "$@"; then
+      local rc=$?
+      if [[ $rc -eq 124 ]]; then
+        error "step '${label}' exceeded ${secs}s — treating the hang as a failure."
+      fi
+      return $rc
+    fi
+  else
+    warn "'timeout' not found — running '${label}' without a watchdog."
+    "$@"
+  fi
+}
+
+echo "═══════════════════════════════════════════════════════════════════════"
+echo "  E2E last-mile journey   arch: $(uname -m)   kernel: $(uname -r)"
+echo "  install → CLI → cluster info (fresh shell) → dataset push --dry-run"
+echo "═══════════════════════════════════════════════════════════════════════"
+
+# ── Step 1: bring the cluster up via the installer's real path ───────────────
+has docker || error "Docker is not available on this host."
+umask 022
+install_kubectl
+install_k3d
+install_helm
+
+echo ""
+echo "── Step 1: create_cluster() — the installer's real cluster-bring-up ─────"
+guard 600 "create_cluster" -- create_cluster
+
+echo "── assert: all nodes reach Ready ──"
+guard 200 "wait nodes Ready" -- kubectl wait --for=condition=Ready nodes --all --timeout=180s
+kubectl get nodes -o wide
+
+# kubectl-created pods bind to default/default; on fast runners the SA controller
+# can race ("serviceaccount default not found"). Wait for it before we apply.
+echo "── wait for the default ServiceAccount ──"
+for _ in $(seq 1 30); do
+  kubectl get serviceaccount default -n default >/dev/null 2>&1 && break
+  sleep 2
+done
+
+# ── Step 2: install the CLI via cli/install.sh ───────────────────────────────
+echo ""
+echo "── Step 2: install the tracebloc CLI via cli/install.sh ────────────────"
+# The fresh-shell PATH assertion itself is covered exhaustively (distro × shell ×
+# mode) by path-persist.sh. Here we install once and re-assert the single cell
+# that matters on the journey, so a CLI that installs but isn't reachable from a
+# new terminal fails the END-TO-END path too — not just the cheap matrix.
+if [[ -z "$CLI_REF" ]]; then
+  # Default to the same branch installer path-persist.sh uses while cli#61 is
+  # unreleased. TODO(cli#61): switch to releases/latest/download/install.sh once
+  # the PATH-persist fix ships in a public release.
+  CLI_REF="https://raw.githubusercontent.com/tracebloc/cli/fix/install-path-persist/scripts/install.sh"
+fi
+echo "  cli ref: ${CLI_REF}"
+
+cli_install_args=()
+[[ -n "$CLI_VERSION" ]] && cli_install_args+=(--version "$CLI_VERSION")
+
+case "$CLI_REF" in
+  http://*|https://*)
+    installer="$WORKDIR/install.sh"
+    guard 120 "download install.sh" -- curl -fsSL "$CLI_REF" -o "$installer"
+    guard 300 "run install.sh" -- sh "$installer" "${cli_install_args[@]}"
+    ;;
+  *)
+    [[ -f "$CLI_REF" ]] || error "TRACEBLOC_CLI_REF is neither a URL nor an existing file: $CLI_REF"
+    guard 300 "run install.sh" -- sh "$CLI_REF" "${cli_install_args[@]}"
+    ;;
+esac
+success "CLI installed."
+
+# ── Step 3: apply a credential-free stub the CLI will discover ───────────────
+#
+# What the CLI's discovery actually keys off (internal/cluster/discover.go):
+#   • label selector: app.kubernetes.io/name=client,app.kubernetes.io/managed-by=Helm
+#   • Deployment name == "jobs-manager" OR ends in "-jobs-manager"
+#   • release name from   app.kubernetes.io/instance
+#   • chart version from  helm.sh/chart="client-<ver>"
+#   • app version  from   app.kubernetes.io/version
+# and `tracebloc cluster info` then mints a token for the "ingestor" SA via
+# TokenRequest (exit 5 if that SA is missing), so we create that SA too. NONE of
+# this needs a private image — pause/nginx is plenty; we only need the labels +
+# the SA to exist. (`app: manager` is included as an extra cosmetic label to
+# match the issue's shorthand, but it is NOT what discovery selects on.)
+#
+# client#208 (installer points the kube context at the workspace namespace) is
+# already MERGED, so the realistic, supported state is: context's namespace ==
+# the workspace namespace. We reproduce that by pinning the context's namespace
+# to $TB_NAMESPACE below, then assert the core path works. (The OPPOSITE case —
+# context left on `default` and the CLI auto-discovering the release across
+# namespaces — depends on a CLI namespace auto-discover change that is NOT yet
+# merged; that sub-assertion is gated as pending at the end of this script.)
+echo ""
+echo "── Step 3: stub parent release + cluster info (incl. fresh shell) ──────"
+guard 60 "create namespace" -- kubectl create namespace "$TB_NAMESPACE"
+
+MANIFEST="$WORKDIR/stub-release.yaml"
+cat > "$MANIFEST" <<YAML
+# Credential-free stand-in for the tracebloc parent client release. Carries the
+# exact labels the CLI's DiscoverParentRelease() selects on; the container image
+# is irrelevant to discovery (pause never needs to pull from a private registry).
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: ingestor
+  namespace: ${TB_NAMESPACE}
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: ${STUB_RELEASE}-jobs-manager
+  namespace: ${TB_NAMESPACE}
+  labels:
+    app.kubernetes.io/name: client
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/instance: ${STUB_RELEASE}
+    app.kubernetes.io/version: ${STUB_CHART_VERSION}
+    helm.sh/chart: client-${STUB_CHART_VERSION}
+    app: manager
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: client
+      app.kubernetes.io/instance: ${STUB_RELEASE}
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: client
+        app.kubernetes.io/instance: ${STUB_RELEASE}
+        app: manager
+    spec:
+      containers:
+        - name: pause
+          image: registry.k8s.io/pause:3.9
+YAML
+
+guard 60 "apply stub release" -- kubectl apply -f "$MANIFEST"
+# We don't need the Deployment to roll out (discovery reads labels off the
+# Deployment object, not a running Pod), but waiting a moment makes `cluster info`
+# output realistic and catches an image-pull-stuck cluster. Non-fatal if it
+# doesn't become Available within the window — discovery still works.
+guard 120 "stub rollout (best-effort)" -- \
+  kubectl -n "$TB_NAMESPACE" rollout status "deployment/${STUB_RELEASE}-jobs-manager" --timeout=90s \
+  || warn "stub deployment didn't report Available in time — discovery is label-based, continuing."
+
+# Point the CURRENT kubeconfig context's default namespace at the workspace ns.
+# This mirrors the post-client#208 state (installer sets the context's namespace)
+# and is what `tracebloc cluster info` reads when no --namespace is passed.
+CTX="$(kubectl config current-context)"
+guard 30 "set context namespace" -- kubectl config set-context "$CTX" --namespace "$TB_NAMESPACE"
+info "kubeconfig context '$CTX' namespace → ${TB_NAMESPACE}"
+
+# (a) cluster info succeeds in THIS shell.
+echo "── assert (a): tracebloc cluster info succeeds ──"
+guard 120 "cluster info" -- tracebloc cluster info
+
+# (b) cluster info succeeds from a FRESH shell — the journey-level PATH guard.
+# A new login shell inherits NONE of this process's PATH edits; it must find the
+# binary via what install.sh persisted, then reach the same cluster via the
+# kubeconfig on disk. This is the cli#61 class asserted on the real journey.
+echo "── assert (b): tracebloc cluster info succeeds from a FRESH shell ──"
+guard 120 "cluster info (fresh shell)" -- bash -lc 'tracebloc cluster info'
+guard 120 "cluster info (fresh non-login shell)" -- bash -c 'tracebloc cluster info'
+success "cluster info works in the current shell AND a fresh login + non-login shell."
+
+# ── Step 4: dataset push --dry-run smoke ─────────────────────────────────────
+# A tiny CSV validated entirely offline: --dry-run stops before any stage Pod /
+# tar stream / network, so it needs no credentials and no reachable platform. We
+# assert exit 0 (a clean validation), which is what a customer sees as the first
+# half of `dataset push` before the real upload.
+echo ""
+echo "── Step 4: dataset push --dry-run smoke ────────────────────────────────"
+SAMPLE_DIR="$WORKDIR/sample-dataset"
+mkdir -p "$SAMPLE_DIR"
+cat > "$SAMPLE_DIR/data.csv" <<'CSV'
+id,feature_a,feature_b,label
+1,0.10,0.20,0
+2,0.30,0.40,1
+3,0.50,0.60,0
+CSV
+
+# `dataset push <dir> --dry-run` is the documented offline-validatable form. Pin
+# the namespace explicitly so the smoke is independent of context state.
+echo "── assert: dataset push --dry-run exits 0 ──"
+guard 120 "dataset push --dry-run" -- \
+  tracebloc dataset push "$SAMPLE_DIR" --dry-run --namespace "$TB_NAMESPACE"
+success "dataset push --dry-run validated the sample dataset (exit 0)."
+
+# ── Pending sub-assertion: context-on-default auto-discover (cli, not merged) ─
+# Reproduces incident #2's harder half: context left on `default`, CLI expected
+# to AUTO-DISCOVER the release in another namespace without an explicit
+# --namespace. That cross-namespace auto-discover is NOT merged in the CLI yet
+# (today's `cluster info` resolves to the context's namespace, then "default",
+# and would correctly NOT find the release). So we run it as a NON-FATAL,
+# informational probe and gate flipping it to a hard assertion behind the CLI
+# change landing. Enable with TB_EXPECT_NS_AUTODISCOVER=1 once that ships.
+echo ""
+echo "── (pending) context-on-default auto-discover probe ────────────────────"
+guard 30 "reset context to default ns" -- kubectl config set-context "$CTX" --namespace default
+if bash -lc 'tracebloc cluster info' >/dev/null 2>&1; then
+  if [[ "${TB_EXPECT_NS_AUTODISCOVER:-0}" == "1" ]]; then
+    success "auto-discover from a default-namespace context works (CLI change has landed)."
+  else
+    info "auto-discover from a default-namespace context already works — flip TB_EXPECT_NS_AUTODISCOVER=1 to enforce it."
+  fi
+else
+  if [[ "${TB_EXPECT_NS_AUTODISCOVER:-0}" == "1" ]]; then
+    error "expected CLI namespace auto-discover to find the release from a default-namespace context, but it did not."
+  fi
+  info "auto-discover from a default-namespace context not available yet (expected — CLI change unmerged). Skipping as pending."
+fi
+# Restore the working context namespace for cleanliness (teardown follows).
+kubectl config set-context "$CTX" --namespace "$TB_NAMESPACE" >/dev/null 2>&1 || true
+
+echo ""
+echo "E2E JOURNEY PASS: installer cluster → CLI install → cluster info (fresh shell) → dataset push --dry-run."