From 17f7f622548c8a3c37289a80b5f42dff288f2318 Mon Sep 17 00:00:00 2001 From: gnezim Date: Tue, 28 Apr 2026 13:50:06 +0300 Subject: [PATCH] ci: turn off e2e in all CI pipelines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The upstream WAF (flights.test.aeroflot.ru) is rate-limiting the corp-VPN exit IP that pve-201's tunnel uses, returning HTML block-pages or 403s for /api/* requests. Every recent ci-deploy run died in pre-warm or with cached HTML poisoning the SSR; we've sunk a chunk of time on WAF mitigations (browser UA, cache-bypass, proxy_no_cache, body validation) and the WAF still wins. Fixing the WAF is customer-side. Until that's resolved, the e2e suite is dead weight in CI — every run fails for upstream-only reasons. Pull it from ci-deploy entirely: * Removed: tunnel-reachability diagnosis, /api pre-warm, Playwright install, Playwright run, the e2e branch in the rollback condition, and the playwright-report artifact path. * Kept: build, deploy, swap, wait-for-health (against the SSR root, which is local nginx → docker, no upstream involved). release-verify already had its e2e block removed (commit 36bb2d9); release.yml comment touched up to match. Specs and playwright.config.ts stay in the tree — they're still useful for local runs (`pnpm test:e2e`) once we're back on a network position the WAF tolerates. 
--- .gitea/workflows/ci-deploy.yml | 115 +-------------------------------- .gitea/workflows/release.yml | 3 +- 2 files changed, 4 insertions(+), 114 deletions(-) diff --git a/.gitea/workflows/ci-deploy.yml b/.gitea/workflows/ci-deploy.yml index 65bf2c6e..d8b82223 100644 --- a/.gitea/workflows/ci-deploy.yml +++ b/.gitea/workflows/ci-deploy.yml @@ -102,117 +102,8 @@ jobs: BASIC_AUTH_PASS: ${{ secrets.BASIC_AUTH_PASS }} run: scripts/ci/wait-for-url.sh https://ui-dashboard.gnerim.ru/ 30 2 - - name: Diagnose tunnel reachability - id: tunnel_check - env: - BASIC_AUTH_USER: ${{ secrets.BASIC_AUTH_USER }} - BASIC_AUTH_PASS: ${{ secrets.BASIC_AUTH_PASS }} - # The upstream WAF blocks the default curl UA — every probe needs - # a browser-like User-Agent or it gets the HTML block page. - run: | - UA='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36' - echo "--- /api/health (expect 200 + x-envoy-upstream-service-time + x-cache-status) ---" - curl -k -sSI -A "$UA" -u "$BASIC_AUTH_USER:$BASIC_AUTH_PASS" https://ui-dashboard.gnerim.ru/api/health | head -15 - echo "--- /api/dictionary/1/world_regions (expect JSON, ~5KB) ---" - curl -k -sS -A "$UA" -u "$BASIC_AUTH_USER:$BASIC_AUTH_PASS" \ - -w "\n[size=%{size_download} time=%{time_total}s code=%{http_code}]\n" \ - https://ui-dashboard.gnerim.ru/api/dictionary/1/world_regions | head -c 400; echo - echo "--- second hit on the same dict (expect HIT) ---" - curl -k -sSI -A "$UA" -u "$BASIC_AUTH_USER:$BASIC_AUTH_PASS" \ - https://ui-dashboard.gnerim.ru/api/dictionary/1/world_regions | grep -iE "^HTTP|x-cache|x-envoy" - - - name: Pre-warm /api cache (dictionaries shared across e2e specs) - id: cache_warmup - env: - BASIC_AUTH_USER: ${{ secrets.BASIC_AUTH_USER }} - BASIC_AUTH_PASS: ${{ secrets.BASIC_AUTH_PASS }} - # Two-step warm with body validation. 
Run 544 was bitten by cache - # poisoning: the upstream WAF returned its HTML block-page with - # HTTP 200, the previous prewarm only checked %{http_code}, so - # nginx happily cached the HTML as a valid 200 for 6h and every - # subsequent dictionary read returned HTML instead of JSON. - # - # Step 1: validate upstream via cache-bust query (`?_=` lands - # on a unique nginx cache key, forcing an upstream fetch). - # Step 2: warm + validate the canonical URL. If the canonical - # response is HTML, attempt one cache-bypass retry - # (`Cache-Control: no-cache` — works after the matching - # nginx config update). If still HTML, fail loudly with a - # purge instruction so the operator can clear cache. - run: | - set -euo pipefail - # The upstream WAF blocks the default curl UA — every fetch must - # send a browser-like User-Agent or it returns the HTML block page. - UA='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36' - is_json() { - local body="$1" - local first_byte=${body:0:1} - [ "$first_byte" = "[" ] || [ "$first_byte" = "{" ] || return 1 - [ ${#body} -gt 1024 ] || return 1 - } - fail_with_body() { - local label="$1" body="$2" - echo "::error::pre-warm failed: $label" >&2 - echo "first 200 bytes of body:" >&2 - printf '%s\n' "${body:0:200}" >&2 - exit 1 - } - - for path in world_regions countries cities airports; do - base="https://ui-dashboard.gnerim.ru/api/dictionary/1/${path}" - - # Step 1: prove upstream is healthy (cache-bust via query). - bust_url="${base}?_=$(date +%s%N)" - bust_body=$(curl -k -sS -A "$UA" -u "$BASIC_AUTH_USER:$BASIC_AUTH_PASS" \ - --max-time 15 "$bust_url") - if ! is_json "$bust_body"; then - fail_with_body "${path} upstream returned non-JSON (WAF rate-limit?)" "$bust_body" - fi - - # Step 2: warm + validate canonical URL. - cano_body=$(curl -k -sS -A "$UA" -u "$BASIC_AUTH_USER:$BASIC_AUTH_PASS" \ - --max-time 15 "$base") - if ! is_json "$cano_body"; then - # Canonical hit poisoned cache. 
Force-refresh once via - # `Cache-Control: no-cache` (proxy_cache_bypass on the - # /api/dictionary/ location forwards to upstream, then - # stores the fresh response). - cano_body=$(curl -k -sS -A "$UA" -u "$BASIC_AUTH_USER:$BASIC_AUTH_PASS" \ - -H "Cache-Control: no-cache" \ - --max-time 15 "$base") - if ! is_json "$cano_body"; then - echo "::error::cache poisoned for ${path} — Cache-Control: no-cache did not refresh" >&2 - echo "::error::manual purge: ssh pve-201 'rm -rf /var/cache/nginx/flights-api/* && systemctl reload nginx'" >&2 - fail_with_body "${path} canonical URL still non-JSON after bypass" "$cano_body" - fi - echo "warm $path -> ok via cache-bypass (cache had been poisoned, now refreshed; ${#cano_body} bytes)" - else - echo "warm $path -> ok (${#cano_body} bytes)" - fi - sleep 2 - done - echo "--- verify cache HIT on a re-fetch ---" - curl -k -sSI -A "$UA" -u "$BASIC_AUTH_USER:$BASIC_AUTH_PASS" \ - https://ui-dashboard.gnerim.ru/api/dictionary/1/cities \ - | grep -iE "^HTTP|x-cache-status" - - - name: Install Playwright browsers - id: playwright_install - run: pnpm exec playwright install --with-deps chromium - - - name: Run Playwright e2e - id: e2e - env: - BASE_URL: https://ui-dashboard.gnerim.ru - BASIC_AUTH_USER: ${{ secrets.BASIC_AUTH_USER }} - BASIC_AUTH_PASS: ${{ secrets.BASIC_AUTH_PASS }} - # Skip Angular↔React parity gaps + UI-behavior mismatches that - # need separate triage. release-verify runs the full suite. 
- CI_DEPLOY: '1' - run: pnpm test:e2e - - name: Rollback on failure (post-deploy steps) - if: failure() && (steps.swap.outcome == 'failure' || steps.health.outcome == 'failure' || steps.e2e.outcome == 'failure') + if: failure() && (steps.swap.outcome == 'failure' || steps.health.outcome == 'failure') id: rollback run: scripts/ci/deploy-container.sh rollback @@ -225,9 +116,7 @@ jobs: uses: actions/upload-artifact@v3 with: name: ci-deploy-failure-${{ github.run_id }} - path: | - container.log - playwright-report/ + path: container.log retention-days: 7 - name: Prune old images diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index 1d7a1b84..187fea31 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -8,7 +8,8 @@ on: # Workflow B: sync to GitLab + open MR + auto-merge. # Stops at "MR merged" — Jenkins is triggered manually by the operator. -# After Jenkins finishes, run the `release-verify` workflow to e2e the customer URL. +# After Jenkins finishes, run the `release-verify` workflow to smoke-check +# the customer URL. jobs: release: