From 3fa1a8fdf12aa2933e8958a8cd0ea180b857a4a6 Mon Sep 17 00:00:00 2001 From: Lukas Wuttke Date: Fri, 5 Jun 2026 18:50:24 +0200 Subject: [PATCH] test: add cross-repo fresh-shell last-mile E2E harness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the two-leg install-journey harness that closes the gap which let a PATH-persistence regression ship green: no existing test opens a fresh shell after install and asserts the documented next command works. Leg 1 — scripts/tests/path-persist.sh: runs in a plain distro container, installs the tracebloc CLI via cli/install.sh (configurable ref via TRACEBLOC_CLI_REF), then for each shell among bash/zsh/fish spawns a fresh login AND non-login shell and asserts `command -v tracebloc` resolves and `tracebloc version` runs. A fresh non-login bash reads ~/.bashrc (not ~/.profile), so this catches the whole PATH-persistence class — red on the pre-fix installer, green on the fixed one. Leg 2 — scripts/tests/e2e-journey.sh: extends the e2e-cluster pattern. Brings the cluster up via create_cluster(), installs the CLI, applies a credential-free stub matching the CLI's real discovery contract (a *-jobs-manager Deployment with the chart's hallmark labels + an `ingestor` ServiceAccount), points the kubeconfig context's namespace at it, and asserts `tracebloc cluster info` succeeds AND resolves from a fresh shell, then `dataset push --dry-run` on a tiny sample CSV. Long steps run under a watchdog timeout so a hang fails instead of spinning. The context-on-default namespace auto-discover sub-assertion is gated pending the CLI change. CI: new path-persist job (distro matrix, like distro-prereqs, fail-fast false) and e2e-journey job (amd64, nightly + `e2e` label only, mirroring cli's e2e.yml gating). Both new scripts added to the static shellcheck list. Part of #737. 
Co-Authored-By: Claude Opus 4.8 --- .github/workflows/installer-tests.yaml | 66 +++++- scripts/tests/e2e-journey.sh | 303 +++++++++++++++++++++++++ scripts/tests/path-persist.sh | 220 ++++++++++++++++++ 3 files changed, 586 insertions(+), 3 deletions(-) create mode 100755 scripts/tests/e2e-journey.sh create mode 100755 scripts/tests/path-persist.sh diff --git a/.github/workflows/installer-tests.yaml b/.github/workflows/installer-tests.yaml index 95518fe..1857203 100644 --- a/.github/workflows/installer-tests.yaml +++ b/.github/workflows/installer-tests.yaml @@ -55,12 +55,11 @@ jobs: # below for visibility but don't fail the gate. shellcheck --severity=error --shell=bash \ scripts/install.sh scripts/install-k8s.sh scripts/lib/*.sh \ - scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-proxy.sh scripts/tests/check-drift.sh + scripts/tests/check-drift.sh scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-journey.sh scripts/tests/e2e-proxy.sh scripts/tests/path-persist.sh echo "── shellcheck warnings (advisory, non-blocking) ──" shellcheck --severity=warning --shell=bash \ scripts/install.sh scripts/install-k8s.sh scripts/lib/*.sh \ - scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-proxy.sh scripts/tests/check-drift.sh || true - + scripts/tests/check-drift.sh scripts/tests/distro-prereqs.sh scripts/tests/e2e-cluster.sh scripts/tests/e2e-journey.sh scripts/tests/e2e-proxy.sh scripts/tests/path-persist.sh || true - name: PSScriptAnalyzer (PowerShell installer) shell: pwsh run: | @@ -172,3 +171,64 @@ jobs: - uses: actions/checkout@v4 - name: Cluster up through an authenticated proxy run: bash scripts/tests/e2e-proxy.sh + + path-persist: + # Fresh-shell PATH-persistence guard for the tracebloc CLI — the cheap, wide + # leg. 
Installs cli/install.sh in a plain container per distro, then opens a + # BRAND-NEW login AND non-login shell for each of bash/zsh/fish and asserts + # `tracebloc` resolves + `tracebloc version` runs. This is the cell that goes + # RED on the pre-fix installer and GREEN on the fixed one (a fresh non-login + # bash reads ~/.bashrc, not ~/.profile) — the regression guard for the whole + # PATH class that distro-prereqs / e2e-cluster can't see (they assert in the + # same shell that ran the installer). No cluster, no Docker-in-Docker, no + # creds. The script iterates shell × mode INSIDE the container; distro is the + # matrix axis (it installs zsh/fish in-container where a package exists). + name: PATH persist — ${{ matrix.distro }} + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + distro: + - 'ubuntu:22.04' # most common server + - 'ubuntu:24.04' # newest LTS + - 'debian:12' # apt + - 'fedora:latest' # dnf + - 'almalinux:9' # RHEL rebuild + - 'opensuse/leap:15.6' # zypper + - 'alpine:3' # busybox sh + apk (optional, minimal; bash is bootstrapped in the run step) + steps: + - uses: actions/checkout@v4 + - name: Fresh-shell PATH check in ${{ matrix.distro }} + env: + DISTRO: ${{ matrix.distro }} + # Override TRACEBLOC_CLI_REF here to point at a specific cli install.sh + # (URL or, for the cross-repo cli-side caller, a local path). Default + # lives in the script while cli#61's PATH fix is unreleased. + TRACEBLOC_CLI_REF: ${{ vars.TRACEBLOC_CLI_REF }} + run: | + docker run --rm \ + -e TRACEBLOC_CLI_REF \ + -v "$PWD:/src:ro" -w /src "$DISTRO" \ + sh -c 'command -v bash >/dev/null 2>&1 || apk add --no-cache bash >/dev/null; exec bash scripts/tests/path-persist.sh' + + e2e-journey: + # Leg 2 — the full last-mile journey on a real cluster: create_cluster() → + # install the CLI via cli/install.sh → apply a CREDENTIAL-FREE stub the CLI + # discovers (a *-jobs-manager Deployment with the chart's labels + an + # `ingestor` SA) → assert `tracebloc cluster info` succeeds AND resolves from + # a fresh shell → `dataset push --dry-run` smoke. 
No private images, no + # secrets (the stub stands in for the parent release). Heavier than Leg 1 + # (boots a real k3d cluster), so — mirroring cli's e2e.yml — it runs on the + # nightly schedule + workflow_dispatch, and on a PR ONLY when it carries the + # `e2e` label. amd64, like e2e-cluster. + name: E2E last-mile journey (amd64) + runs-on: ubuntu-latest + if: >- + github.event_name != 'pull_request' || + contains(github.event.pull_request.labels.*.name, 'e2e') + steps: + - uses: actions/checkout@v4 + - name: Install → CLI → cluster info (fresh shell) → dataset push --dry-run + env: + TRACEBLOC_CLI_REF: ${{ vars.TRACEBLOC_CLI_REF }} + run: bash scripts/tests/e2e-journey.sh diff --git a/scripts/tests/e2e-journey.sh b/scripts/tests/e2e-journey.sh new file mode 100755 index 0000000..3e2371f --- /dev/null +++ b/scripts/tests/e2e-journey.sh @@ -0,0 +1,303 @@ +#!/usr/bin/env bash +# ============================================================================= +# e2e-journey.sh — last-mile customer journey on a real cluster +# ----------------------------------------------------------------------------- +# Continues exactly where e2e-cluster.sh stops. That job proves the installer's +# create_cluster() brings up a real k3d cluster and can run a workload, then +# stops BEFORE the CLI. This job picks up from a live cluster and walks the +# documented next steps a customer takes: +# +# 1. create_cluster() (the installer's real path) +# 2. install the tracebloc CLI via cli/install.sh +# 3. apply a CREDENTIAL-FREE stub that looks like the parent client release +# to the CLI's discovery (a *-jobs-manager Deployment with the chart's +# hallmark labels + an `ingestor` ServiceAccount), point the kubeconfig +# context's namespace at it, and assert `tracebloc cluster info`: +# (a) succeeds (exit 0), AND +# (b) succeeds from a FRESH shell (the cli#61 PATH class, on the journey) +# 4. 
`tracebloc dataset push --dry-run` smoke on a tiny sample CSV +# (offline-validatable; no creds, no real ingestion) +# 5. teardown (EXIT trap, same as e2e-cluster.sh) +# +# What it deliberately does NOT do: the private-image tracebloc helm install + +# backend registration (needs real credentials + a reachable platform). The +# whole point of the stub is to exercise the CLI's discovery + token + dry-run +# paths end-to-end WITHOUT any of that — so this runs on stock GitHub runners +# with no secrets, like e2e-cluster.sh. +# +# Every long-running step is wrapped in a watchdog timeout so a hang FAILS the +# job instead of spinning until the 6h GitHub ceiling (ties to the conntrack +# "looks hung" class — a hang must surface as a red failure, not a timeout). +# +# Configuration (env): +# TRACEBLOC_CLI_REF URL or local path to cli/install.sh (see path-persist.sh). +# TRACEBLOC_CLI_VERSION Optional --version tag for install.sh. +# CLUSTER_NAME Isolated cluster name (default tbe2e-journey). +# TB_NAMESPACE Namespace the stub release lives in (default tracebloc). +# +# Usage: bash scripts/tests/e2e-journey.sh +# ============================================================================= +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +LIB="$HERE/../lib" + +# Isolated cluster name so we never touch a real 'tracebloc' cluster; opt out of +# autostart so we don't reconfigure docker.service / restart policies on the host +# (identical isolation posture to e2e-cluster.sh). +export USER="${USER:-$(id -un)}" +export CLUSTER_NAME="${CLUSTER_NAME:-tbe2e-journey}" +export TRACEBLOC_NO_AUTOSTART=1 + +TB_NAMESPACE="${TB_NAMESPACE:-tracebloc}" +# Cosmetic stand-ins for the chart's real values — discovery keys off the LABELS +# below, not these, so any plausible values work. A release name + a pinned chart +# version make `cluster info`'s output realistic. 
+STUB_RELEASE="tbe2e" +STUB_CHART_VERSION="0.0.0-e2e" + +CLI_REF="${TRACEBLOC_CLI_REF:-}" +CLI_VERSION="${TRACEBLOC_CLI_VERSION:-}" + +# shellcheck source=/dev/null +source "$LIB/common.sh" +# shellcheck source=/dev/null +source "$LIB/setup-linux.sh" +# shellcheck source=/dev/null +source "$LIB/cluster.sh" + +WORKDIR="$(mktemp -d "${TMPDIR:-/tmp}/tb-e2e-journey-XXXXXX")" +cleanup() { + k3d cluster delete "$CLUSTER_NAME" >/dev/null 2>&1 || true + rm -rf "$WORKDIR" 2>/dev/null || true +} +trap cleanup EXIT + +# ── Watchdog: run a step under a hard time limit ───────────────────────────── +# A hang (e.g. a stuck image pull, a wedged API server, the conntrack "looks +# hung" class) must surface as a red FAILURE, not an infinite spinner. `timeout` +# (GNU coreutils) is preinstalled on the Ubuntu runners this job targets; if it's +# somehow absent we degrade to running the step unguarded rather than dying on a +# missing binary (the step can still fail on its own non-zero exit). +guard() { # guard