# openova/.github/workflows/catalyst-build.yaml

name: "Build & Deploy Catalyst"

# Triggers are strictly event-driven; scheduled (cron) runs are not
# allowed because the OpenOva architecture is event-driven throughout
# (Flux dependsOn, NATS JetStream, SSE, Helm post-install hooks).
# A `push` to main that touches any path listed below is the canonical
# trigger; `workflow_dispatch` permits an ad-hoc re-run when no code
# has changed.
on:
  push:
    branches:
      - main
    paths:
      - "core/console/**"
      - "core/admin/**"
      - "core/marketplace/**"
      - "core/marketplace-api/**"
      - "products/catalyst/bootstrap/**"
      - "products/catalyst/chart/**"
      - "infra/hetzner/**"
      - ".github/workflows/catalyst-build.yaml"
  workflow_dispatch:

# Workflow-level environment shared by the jobs below.
env:
  # Registry host handed to docker/login-action in both build jobs.
  REGISTRY: ghcr.io
  # Image repositories without a tag; the build jobs append `:test`,
  # `:<short-sha>` and `:latest`.
  UI_IMAGE: ghcr.io/openova-io/openova/catalyst-ui
  API_IMAGE: ghcr.io/openova-io/openova/catalyst-api

jobs:
  # catalyst-ui image job: check out sources, derive the short commit
  # SHA (exported as the `sha_short` job output) and log in to GHCR
  # before the build / smoke-test / push steps.
  build-ui:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write
    outputs:
      sha_short: ${{ steps.vars.outputs.sha_short }}
    steps:
      - name: Checkout openova-private
        uses: actions/checkout@v4

      # Second checkout: openova-io/openova is placed under openova-src/,
      # which the build steps use as their Docker build context. No `ref`
      # is given here.
      - name: Checkout openova (public source)
        uses: actions/checkout@v4
        with:
          repository: openova-io/openova
          path: openova-src

      # Publish the first seven characters of GITHUB_SHA as the
      # `sha_short` step output.
      - name: Set short SHA
        id: vars
        run: |
          short_sha="${GITHUB_SHA:0:7}"
          echo "sha_short=${short_sha}" >> "$GITHUB_OUTPUT"

      - name: Login to GHCR
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Local-only build: `push: false` together with `load: true` yields
      # the `:test` tag that the smoke-test step runs via `docker run`.
      # Nothing is published by this step.
      - name: Build UI image (test)
        uses: docker/build-push-action@v6
        with:
          # Build context is the repo root so the Vite prebuild script can
          # walk platform/, products/, clusters/_template/bootstrap-kit/ to
          # populate the catalog + BOOTSTRAP_KIT. The Containerfile fails
          # the build if any of those dirs is missing.
          context: openova-src
          file: openova-src/products/catalyst/bootstrap/ui/Containerfile
          push: false
          load: true
          tags: ${{ env.UI_IMAGE }}:test
          build-args: VITE_APP_MODE=selfhosted

      # Runs the freshly built `:test` image and asserts three things:
      #   1. GET / returns 200,
      #   2. every component logo listed below is served with 200,
      #   3. the JS bundle inside the image mentions every bootstrap-kit
      #      blueprint id.
      # Any failed assertion fails the job before the push step.
      - name: Smoke test UI
        run: |
          # Single cleanup path for every exit (success, explicit failure,
          # or an abort from the runner's default `bash -e`): remove the
          # smoke container and the extracted bundle.
          BUNDLE_TMP=""
          cleanup() {
            docker rm -f smoke-ui >/dev/null 2>&1 || true
            if [ -n "$BUNDLE_TMP" ]; then
              rm -f "$BUNDLE_TMP"
            fi
          }
          trap cleanup EXIT

          docker run -d --name smoke-ui -p 8080:8080 ${{ env.UI_IMAGE }}:test

          # Readiness poll instead of a fixed sleep: try the root URL once
          # per second for up to 30 attempts. `|| true` keeps a refused
          # connection (curl prints 000) from aborting the script before
          # the explicit failure message below can be printed.
          STATUS="000"
          for _ in $(seq 1 30); do
            STATUS=$(curl -s -o /dev/null -w '%{http_code}' http://localhost:8080/ || true)
            if [ "$STATUS" = "200" ]; then
              break
            fi
            sleep 1
          done
          if [ "$STATUS" != "200" ]; then
            echo "Smoke test failed: expected 200 from /, got $STATUS"
            exit 1
          fi
          echo "Smoke test (root) passed: HTTP $STATUS"

          # Logo path regression guard (#173): the wizard's StepComponents
          # references `${BASE}component-logos/<id>.<ext>` where BASE is the
          # Vite base and the extension is whatever the upstream brand mark
          # is published as (some are SVG, some are PNG — we use the canonical
          # upstream asset rather than auto-converting). Inside the catalyst-
          # ui pod nginx serves the file at /component-logos/<id>.<ext>
          # (Traefik strips /sovereign before proxying — see nginx.conf
          # comment). We list every logo path that componentGroups.ts
          # references, so a missing or mis-cased asset fails the build,
          # not the user.
          for path in \
              component-logos/cilium.svg \
              component-logos/flux.svg \
              component-logos/harbor.svg \
              component-logos/grafana.svg \
              component-logos/keycloak.svg \
              component-logos/openbao.svg \
              component-logos/langfuse.png \
              component-logos/vllm.png \
              component-logos/temporal.svg \
              component-logos/stalwart.svg \
              component-logos/cnpg.svg \
              component-logos/loki.png \
              component-logos/mimir.png \
              component-logos/tempo.svg \
              component-logos/ntfy.svg \
              component-logos/ferretdb.png \
              component-logos/openmeter.png \
              component-logos/coraza.png \
              component-logos/external-dns.png \
              component-logos/netbird.png \
              component-logos/strongswan.png \
              component-logos/trivy.png \
              component-logos/syft-grype.png ; do
            CODE=$(curl -s -o /dev/null -w '%{http_code}' \
              "http://localhost:8080/${path}" || true)
            if [ "$CODE" != "200" ]; then
              echo "Logo smoke FAILED: /${path} returned $CODE"
              exit 1
            fi
            echo "Logo smoke OK: /${path} HTTP $CODE"
          done

          # Bootstrap-kit regression guard: the Provision page reads
          # BOOTSTRAP_KIT from the bundled catalog.generated.ts to render
          # the per-Blueprint bubbles. An earlier revision shipped with a
          # docker context that didn't include clusters/_template/bootstrap-kit/
          # so the prebuild script silently produced an empty array — the
          # page rendered only the 2 supernodes. Asserting the bundle
          # contains every bp-* id makes that regression impossible.
          #
          # Implementation note: the first index-*.js bundle is copied out
          # of the image once (`cat` inside a throwaway container, stdout
          # redirected to a temp file) and grepped locally. Earlier we ran
          # `grep` inside docker run -c "..." and the nested quote escaping
          # produced false negatives (bp-cilium was in the bundle but the
          # grep argument matched a literal `"bp-cilium"` whose surrounding
          # quotes were eaten by shell expansion). Local grep on the
          # extracted file removes that whole class of escaping bugs.
          BUNDLE_TMP=$(mktemp)
          docker run --rm --entrypoint sh ${{ env.UI_IMAGE }}:test \
            -c 'cat $(find /usr/share/nginx/html/assets -name "index-*.js" | head -1)' \
            > "$BUNDLE_TMP"
          BUNDLE_BYTES=$(wc -c < "$BUNDLE_TMP")
          echo "Bundle size: $BUNDLE_BYTES bytes"
          if [ "$BUNDLE_BYTES" -lt 100000 ]; then
            echo "Bootstrap-kit smoke FAILED: bundle suspiciously small ($BUNDLE_BYTES bytes)"
            exit 1
          fi
          for bp in bp-cilium bp-cert-manager bp-flux bp-crossplane bp-sealed-secrets \
                    bp-spire bp-nats-jetstream bp-openbao bp-keycloak bp-gitea ; do
            if ! grep -q -F "$bp" "$BUNDLE_TMP" ; then
              echo "Bootstrap-kit smoke FAILED: ${bp} missing from bundle"
              exit 1
            fi
            echo "Bootstrap-kit smoke OK: ${bp}"
          done

          echo "All smoke tests passed."

      # Same context, Containerfile and build-arg as the `:test` build
      # above, but with `push: true`. The image is tagged with the short
      # commit SHA and with `latest`.
      - name: Push UI image
        uses: docker/build-push-action@v6
        with:
          # Build context is the repo root so the Vite prebuild script can
          # walk platform/, products/, clusters/_template/bootstrap-kit/ to
          # populate the catalog + BOOTSTRAP_KIT. The Containerfile fails
          # the build if any of those dirs is missing.
          context: openova-src
          file: openova-src/products/catalyst/bootstrap/ui/Containerfile
          push: true
          tags: |
            ${{ env.UI_IMAGE }}:${{ steps.vars.outputs.sha_short }}
            ${{ env.UI_IMAGE }}:latest
          build-args: VITE_APP_MODE=selfhosted

  # catalyst-api image job: check out sources, derive the short commit
  # SHA (exported as the `sha_short` job output) and log in to GHCR
  # before the build / smoke-test / push steps.
  build-api:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write
    outputs:
      sha_short: ${{ steps.vars.outputs.sha_short }}
    steps:
      - name: Checkout openova-private
        uses: actions/checkout@v4

      # Second checkout: openova-io/openova is placed under openova-src/,
      # which the build steps use as their Docker build context. No `ref`
      # is given here.
      - name: Checkout openova (public source)
        uses: actions/checkout@v4
        with:
          repository: openova-io/openova
          path: openova-src

      # Publish the first seven characters of GITHUB_SHA as the
      # `sha_short` step output.
      - name: Set short SHA
        id: vars
        run: |
          short_sha="${GITHUB_SHA:0:7}"
          echo "sha_short=${short_sha}" >> "$GITHUB_OUTPUT"

      - name: Login to GHCR
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Build context is the public openova repo root (openova-src/), not just
      # products/catalyst/bootstrap/api/, because the runtime image bundles the
      # canonical OpenTofu module from infra/hetzner/. The Containerfile's
      # COPY paths are written relative to the repo root accordingly. Without
      # this, /infra/hetzner/ is missing inside the image and every Launch
      # fails with `stage tofu module: open /infra/hetzner: no such file or
      # directory`.
      #
      # `push: false` + `load: true` make this a local-only build: the
      # resulting `:test` tag is what the smoke-test steps run with
      # `docker run`; nothing is published here.
      - name: Build API image (test)
        uses: docker/build-push-action@v6
        with:
          context: openova-src
          file: openova-src/products/catalyst/bootstrap/api/Containerfile
          push: false
          load: true
          tags: ${{ env.API_IMAGE }}:test

      # Smoke test — the catalyst-api Pod is the OpenTofu runner, so the .tf
      # sources MUST be present at /infra/hetzner/ inside the image. Anything
      # less ships a broken image that fails on every Launch with `stage tofu
      # module: open /infra/hetzner: no such file or directory`. Failure of
      # this step fails the build.
      - name: Smoke test API — verify infra/hetzner/ is bundled
        run: |
          set -euo pipefail
          LISTING=$(docker run --rm --entrypoint sh ${{ env.API_IMAGE }}:test \
            -c 'ls -la /infra/hetzner/')
          echo "$LISTING"
          for f in main.tf variables.tf outputs.tf versions.tf \
                   cloudinit-control-plane.tftpl cloudinit-worker.tftpl ; do
            # Compare the last column of each `ls -la` line with the file
            # name as a literal string. A regex built from the name would
            # treat `.` as "any character" (so e.g. `mainXtf` would satisfy
            # `main.tf`), and feeding the listing through a here-string
            # avoids an `echo | grep -q` pipe under `pipefail`.
            if ! awk -v want="$f" '$NF == want { found = 1 } END { exit !found }' <<< "$LISTING"; then
              echo "Smoke test FAILED: /infra/hetzner/${f} missing from image"
              exit 1
            fi
            echo "Smoke test OK: /infra/hetzner/${f} present"
          done
          echo "All API smoke tests passed."

      # tofu CLI smoke test — the runtime image bundles the OpenTofu CLI
      # because internal/provisioner execs `tofu init / plan / apply` (see
      # internal/provisioner/provisioner.go runTofu()). Without the binary
      # every Launch SSE stream returns:
      #   tofu init: exec: "tofu": executable file not found in $PATH
      # We assert (a) `tofu version` succeeds inside the image and (b) the
      # output matches the EXPECTED_TOFU_VERSION pinned here, which must
      # stay in lockstep with the TOFU_VERSION ARG in the Containerfile.
      # When you bump the version in the Containerfile, bump it here too.
      - name: Smoke test API — verify OpenTofu CLI is installed
        env:
          # Quoted so the value is always read as a string, whatever the
          # version number looks like after a future bump.
          EXPECTED_TOFU_VERSION: "1.11.6"
        run: |
          set -euo pipefail
          OUT=$(docker run --rm --entrypoint sh ${{ env.API_IMAGE }}:test \
            -c 'tofu version')
          echo "$OUT"
          # -x: the whole line must match; -F: the version is a literal
          # string, so its dots are not regex wildcards.
          if ! grep -qxF "OpenTofu v${EXPECTED_TOFU_VERSION}" <<< "$OUT"; then
            echo "Smoke test FAILED: expected 'OpenTofu v${EXPECTED_TOFU_VERSION}', got:"
            echo "$OUT"
            exit 1
          fi
          echo "Smoke test OK: OpenTofu v${EXPECTED_TOFU_VERSION} present on PATH."

          # Re-assert the binary is executable for the actual runtime UID
          # (65534, set in api-deployment.yaml securityContext.runAsUser).
          # `--user` overrides the image USER directive, simulating the K8s
          # securityContext: a missing exec bit or wrong owner here would
          # surface as a Launch failure in production, never in CI, so we
          # gate it at build time.
          #
          # `tofu version` runs on its own inside the container so that ITS
          # exit status decides the step. Piping it to `head` inside `sh`
          # would report head's status instead, and the check would pass
          # even when tofu cannot be executed. The first line is trimmed
          # on the runner afterwards.
          UID_OUT=$(docker run --rm --user 65534:65534 --entrypoint sh \
            ${{ env.API_IMAGE }}:test -c 'tofu version')
          head -n 1 <<< "$UID_OUT"
          echo "Smoke test OK: tofu executable as UID 65534."

      # Same context and Containerfile as the `:test` build above, but
      # with `push: true`. The image is tagged with the short commit SHA
      # and with `latest`.
      - name: Push API image
        uses: docker/build-push-action@v6
        with:
          context: openova-src
          file: openova-src/products/catalyst/bootstrap/api/Containerfile
          push: true
          tags: |
            ${{ env.API_IMAGE }}:${{ steps.vars.outputs.sha_short }}
            ${{ env.API_IMAGE }}:latest

  # Runs only after both image jobs finish (`needs`). Bumps the image
  # SHA in the chart files, commits and pushes that change, then
  # dispatches blueprint-release.yaml.
  deploy:
    needs: [build-ui, build-api]
    runs-on: ubuntu-latest
    permissions:
      # contents: write — push the values.yaml SHA bump back to main
      contents: write
      # actions: write — required for `gh workflow run` to dispatch
      # blueprint-release.yaml after the deploy commit lands. Without
      # this, the dispatch step (added in PR #720 to close the
      # bot-deploy-doesn't-trigger-workflows gap from #712) returns
      # HTTP 403 "Resource not accessible by integration", the
      # blueprint-release fires NEVER, and the bp-catalyst-platform
      # OCI artifact stays stuck on the PREVIOUS deploy's image SHA.
      # Caught live 2026-05-04 — PR #722–727 all built green but
      # blueprint-release was never dispatched, leaving Sovereigns
      # provisioned afterwards on the pre-fix chart.
      actions: write
    steps:
      # Working tree that the later steps edit, commit and push.
      - name: Checkout
        uses: actions/checkout@v4

      # In-flight provisioning guard — t13/t17/t21 incident, 2026-05-17.
      #
      # The mothership catalyst-api Pod is single-replica and is rolled
      # by Flux whenever this workflow bumps the image SHA. The OpenTofu
      # workdir lives on a /tmp emptyDir that dies with the Pod, so any
      # in-flight `tofu apply` is killed mid-resource. The on-disk
      # deployment record is rewritten to status=failed on the new Pod's
      # restoreFromStore (deployments.go:413), but the Hetzner resources
      # tagged with the abandoned deployment-id remain orphans that
      # require manual `hcloud` cleanup. Three consecutive provs
      # (t13/t17/t21) died this way during 2026-05-17, each costing
      # ~15 minutes of provisioning time plus cleanup overhead.
      #
      # This step polls the public, read-only in-flight-count endpoint
      # on the mothership catalyst-api (added in this PR, served at
      # console.openova.io/api/v1/deployments/in-flight-count). The
      # endpoint counts ONLY Phase-0 in-flight statuses (pending /
      # provisioning / tofu-applying / flux-bootstrapping) — Phase-1 is
      # observational and resumes across Pod restarts, so it does not
      # block. When count==0 we proceed with the values.yaml bump.
      #
      # Timeout policy: cap at MAX_WAIT_SECONDS (default 30 minutes —
      # the upper bound on a healthy multi-region prov). If a prov is
      # still in flight after the cap, we proceed anyway and log a
      # WARNING. Blocking deploys indefinitely on a stuck prov would
      # mean an operator can never ship a fix for whatever is causing
      # the stuck prov (the worst possible failure mode for a CI gate).
      #
      # Endpoint outage policy: if the curl fails for any reason
      # (network blip, mothership down, endpoint not yet deployed on
      # the live SHA), we proceed with the bump after logging. Same
      # rationale — a broken gate must not block all future deploys.
      # First-time-rollout consideration: the endpoint does not exist
      # on the LIVE mothership until THIS PR's image lands, so the
      # first run after merge will fall through the "endpoint not
      # found" branch and proceed normally. Subsequent runs benefit
      # from the gate.
      - name: Wait for in-flight provisioning to drain
        env:
          # Override-able via repo variables/secrets if a different
          # mothership URL is in play (Sovereign chroot self-deploy,
          # staging, etc.). Default targets the production mothership.
          CATALYST_API_URL: ${{ vars.CATALYST_API_URL || 'https://console.openova.io' }}
          MAX_WAIT_SECONDS: '1800'   # 30 min hard cap
          POLL_INTERVAL_SECONDS: '20'
        run: |
          set -u
          ENDPOINT="${CATALYST_API_URL%/}/api/v1/deployments/in-flight-count"
          echo "Polling ${ENDPOINT} every ${POLL_INTERVAL_SECONDS}s (cap ${MAX_WAIT_SECONDS}s)"

          START=$(date +%s)
          ATTEMPT=0
          while : ; do
            ATTEMPT=$((ATTEMPT + 1))

            # Drop the previous response so a failed request can never be
            # judged on a stale body.
            rm -f /tmp/inflight.json

            # Deliberately no `-f`: with `-f` curl exits non-zero on HTTP
            # errors while STILL printing the status via `-w`, so an
            # `|| echo "000"` fallback would append to it ("404000",
            # "000000") and none of the branches below could match. `-f`
            # also discards the error body that the non-200 branch prints.
            # The fallback is an assignment, so it replaces the captured
            # value instead of appending to it, and it only fires on a
            # transport-level failure (DNS, connect, timeout).
            HTTP_CODE=$(curl -sSL --max-time 10 -o /tmp/inflight.json -w '%{http_code}' \
              "${ENDPOINT}" 2>/dev/null) || HTTP_CODE="000"

            if [ "$HTTP_CODE" = "000" ]; then
              # Network failure (DNS, connect refused, timeout). Do NOT
              # block the deploy — fail-open per "broken gate must not
              # halt all deploys" rule above. Log + proceed.
              echo "WARN: ${ENDPOINT} unreachable on attempt ${ATTEMPT} (curl failed). Proceeding without gate."
              break
            fi

            if [ "$HTTP_CODE" = "404" ]; then
              # First-rollout case — the endpoint is not yet present on
              # the LIVE catalyst-api. Once this PR merges, subsequent
              # runs will see the endpoint and start gating properly.
              echo "INFO: ${ENDPOINT} returned 404 — endpoint not yet deployed on live mothership. Proceeding (first-rollout fall-through)."
              break
            fi

            if [ "$HTTP_CODE" != "200" ]; then
              # Any other non-2xx: log + proceed (fail-open).
              echo "WARN: ${ENDPOINT} returned HTTP ${HTTP_CODE} on attempt ${ATTEMPT}. Body:"
              cat /tmp/inflight.json 2>/dev/null || true
              echo
              echo "Proceeding without gate (fail-open)."
              break
            fi

            COUNT=$(jq -r '.count // 0' /tmp/inflight.json 2>/dev/null || echo "0")
            IDS=$(jq -r '.ids // [] | join(",")' /tmp/inflight.json 2>/dev/null || echo "")

            # A count that is not a plain non-negative integer means the
            # gate itself is broken; fail open instead of polling until
            # the cap.
            case "$COUNT" in
              ''|*[!0-9]*)
                echo "WARN: ${ENDPOINT} returned a non-numeric count '${COUNT}' on attempt ${ATTEMPT}. Proceeding without gate (fail-open)."
                break
                ;;
            esac

            if [ "$COUNT" -eq 0 ]; then
              echo "OK: 0 deployments in-flight. Safe to bump catalyst-api image."
              break
            fi

            ELAPSED=$(($(date +%s) - START))
            if [ "$ELAPSED" -ge "$MAX_WAIT_SECONDS" ]; then
              echo "WARN: ${COUNT} deployment(s) still in-flight after ${ELAPSED}s (cap ${MAX_WAIT_SECONDS}s)."
              echo "WARN: in-flight ids: ${IDS}"
              echo "WARN: proceeding with image bump anyway — stuck provs must not block all future deploys."
              break
            fi

            echo "WAIT: attempt ${ATTEMPT} — ${COUNT} deployment(s) in-flight (ids: ${IDS}); elapsed=${ELAPSED}s. Sleeping ${POLL_INTERVAL_SECONDS}s."
            sleep "${POLL_INTERVAL_SECONDS}"
          done

      - name: Update SHA tags in values.yaml and deployment manifests
        # The catalyst-ui and catalyst-api images are referenced in two places:
        #
        # 1. products/catalyst/chart/values.yaml — used by the Helm chart path
        #    (bp-catalyst-platform OCI chart on Sovereign clusters). Helm template
        #    expressions ({{ .Values.images.catalystUi.tag }}) are rendered at
        #    `helm install` time by Flux's helm-controller. We use awk to replace
        #    the `tag:` line that immediately follows the catalystUi/catalystApi key.
        #
        # 2. products/catalyst/chart/templates/{api,ui}-deployment.yaml — used by
        #    the Kustomize path (catalyst-platform Kustomization on contabo-mkt).
        #    These files are applied as raw manifests by Flux kustomize-controller;
        #    Helm template syntax is NOT rendered. A literal image ref is required.
        #    Bug history: feat/global-imageRegistry (#580) converted the literal
        #    image ref to a Helm template without updating this deploy step, causing
        #    InvalidImageName on the contabo-mkt Kustomize path. Fixed here by also
        #    sed-patching the literal image refs in those two deployment files.
        env:
          SHA_SHORT: ${{ needs.build-ui.outputs.sha_short }}
        run: |
          # Refuse to touch any manifest when the SHA handed over by the
          # build job is empty or not a hex commit id: writing it would
          # commit an empty / invalid image tag to main.
          if [[ ! "${SHA_SHORT:-}" =~ ^[0-9a-f]{7,40}$ ]]; then
            echo "ERROR: SHA_SHORT='${SHA_SHORT:-}' is not a hex commit SHA; refusing to edit manifests."
            exit 1
          fi

          VALUES="products/catalyst/chart/values.yaml"
          # awk and mv are separate statements on purpose. Chained with
          # `&&`, an awk failure is exempt from `bash -e`: the script
          # would carry on, bump the templates below, and leave
          # values.yaml on the old SHA.
          awk -v sha="${SHA_SHORT}" '
            /^  catalystApi:/ { print; in_api=1; next }
            /^  catalystUi:/  { print; in_ui=1; next }
            in_api && /^ *tag:/ { sub(/"[^"]*"/, "\"" sha "\""); in_api=0 }
            in_ui  && /^ *tag:/ { sub(/"[^"]*"/, "\"" sha "\""); in_ui=0 }
            { print }
          ' "${VALUES}" > "${VALUES}.tmp"
          mv "${VALUES}.tmp" "${VALUES}"
          echo "values.yaml after update:"
          grep -A2 "catalystUi\|catalystApi" "${VALUES}" | head -10

          # ALSO bump the literal image refs in the chart templates.
          # Sovereigns Helm-install this chart and contabo applies it
          # via Kustomize — both consume the literal directly because
          # kustomize-controller can't render Helm templates. Without
          # this auto-bump, every Sovereign provisioned after 2026-05-06
          # was installing :2122fb8 (frozen at PR #1040's chart-touch),
          # so PRs #1051..#1059 never reached anyone except via manual
          # `kubectl set image` patches on omantel.
          API_TPL="products/catalyst/chart/templates/api-deployment.yaml"
          UI_TPL="products/catalyst/chart/templates/ui-deployment.yaml"
          sed -i -E "s|(image: \"ghcr\.io/openova-io/openova/catalyst-api:)[^\"]*\"|\1${SHA_SHORT}\"|" "${API_TPL}"
          sed -i -E "s|(image: \"ghcr\.io/openova-io/openova/catalyst-ui:)[^\"]*\"|\1${SHA_SHORT}\"|"  "${UI_TPL}"
          # qa-loop iter-3 Fix #18 — also bump the CATALYST_BUILD_SHA env
          # literal in the api-deployment so /api/v1/version returns the
          # SHA the Pod is actually running. Without this, the env stays
          # frozen at whatever value was committed manually and the live
          # version probe lies. The env block uses literal values (not
          # Helm directives) per the dual-mode contract — this sed
          # targets the literal directly. Pattern: a run of hex chars in
          # double-quotes on the line immediately after
          # `name: CATALYST_BUILD_SHA`, following `value: `.
          sed -i -E "/name: CATALYST_BUILD_SHA/{n;s|(value: )\"[a-f0-9]+\"|\1\"${SHA_SHORT}\"|;}" "${API_TPL}"
          echo "templates after update:"
          grep -E "image: \".*catalyst-(api|ui):" "${API_TPL}" "${UI_TPL}"
          grep -A1 "CATALYST_BUILD_SHA" "${API_TPL}" | head -2

          # contabo's catalyst-platform Kustomization at
          # ./products/catalyst/chart/templates reconciles every 10 min
          # — it will pick up the bumped literal on the next interval.
          # If the new image breaks contabo, an operator can revert the
          # template SHA via a follow-up PR; the previous "freeze"
          # behaviour was masking real bugs (contabo silently ran an
          # old image while the Sovereign provisioning churned through
          # the same SHA being fixed downstream).

      # Commits the three bumped chart files and pushes them. Sets the
      # `pushed` step output to 'true' when a commit was pushed and to
      # 'false' when there was nothing to commit.
      - name: Commit and push manifest updates
        id: deploy_commit
        env:
          SHA_SHORT: ${{ needs.build-ui.outputs.sha_short }}
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          # values.yaml + the two literal-image templates (api-deployment,
          # ui-deployment) are bumped together so:
          #   - Sovereigns get the new SHA via the next OCI chart publish
          #     (blueprint-release fires below).
          #   - contabo's Kustomize-path Flux reconciles the bumped literal
          #     within 10 min.
          # Both surfaces converge on the same SHA on every push.
          git add products/catalyst/chart/values.yaml \
                  products/catalyst/chart/templates/api-deployment.yaml \
                  products/catalyst/chart/templates/ui-deployment.yaml
          if git diff --staged --quiet; then
            echo "No changes to commit"
            echo "pushed=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          git commit -m "deploy: update catalyst images to ${SHA_SHORT}"

          # The branch may have advanced since the checkout step — the
          # in-flight gate earlier in this job can hold it for up to 30
          # minutes — and a plain push is then rejected as
          # non-fast-forward. Rebase the single deploy commit onto the
          # remote tip and retry a bounded number of times. A rebase
          # conflict (for example another deploy commit that changed the
          # same tag lines) makes `git pull --rebase` fail, which fails
          # the step under the runner's `bash -e` rather than forcing
          # anything through.
          BRANCH=$(git rev-parse --abbrev-ref HEAD)
          MAX_PUSH_ATTEMPTS=3
          PUSH_ATTEMPT=1
          until git push; do
            if [ "$PUSH_ATTEMPT" -ge "$MAX_PUSH_ATTEMPTS" ]; then
              echo "ERROR: git push still failing after ${PUSH_ATTEMPT} attempt(s); giving up."
              exit 1
            fi
            PUSH_ATTEMPT=$((PUSH_ATTEMPT + 1))
            echo "Push rejected; rebasing onto origin/${BRANCH} before attempt ${PUSH_ATTEMPT}."
            git pull --rebase origin "${BRANCH}"
          done
          echo "pushed=true" >> "$GITHUB_OUTPUT"

      # Closes #712. Commits pushed with GITHUB_TOKEN do not re-trigger
      # workflows (GitHub Actions design), so the deploy commit made by
      # the previous step would never start blueprint-release.yaml on
      # its own. The bp-catalyst-platform OCI artifact would then stay
      # on whatever catalyst-api SHA was current at the last manual
      # chart-touching PR (e.g. otech62-66, 2026-05-03, were stuck
      # installing catalyst-api:74d08eb six PRs after that SHA was
      # superseded). An explicit workflow_dispatch re-runs
      # blueprint-release for every deploy commit so it picks up the
      # new values.yaml SHA tags. Skipped when nothing was pushed.
      - name: Trigger blueprint-release for the chart bump
        if: steps.deploy_commit.outputs.pushed == 'true'
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          # GITHUB_REPOSITORY is the runner-provided owner/repo of this
          # workflow run.
          gh workflow run blueprint-release.yaml \
            --repo "${GITHUB_REPOSITORY}" \
            --ref main \
            --raw-field blueprint=catalyst \
            --raw-field tree=products
          echo "blueprint-release dispatched for products/catalyst @ main"