diff --git a/.gitea/workflows/ci-deploy.yml b/.gitea/workflows/ci-deploy.yml
index 6af31c5a..34225e8d 100644
--- a/.gitea/workflows/ci-deploy.yml
+++ b/.gitea/workflows/ci-deploy.yml
@@ -123,17 +123,83 @@ jobs:
         env:
           BASIC_AUTH_USER: ${{ secrets.BASIC_AUTH_USER }}
           BASIC_AUTH_PASS: ${{ secrets.BASIC_AUTH_PASS }}
+        # Two-step warm with body validation. Run 544 was bitten by cache
+        # poisoning: the upstream WAF returned its HTML block-page with
+        # HTTP 200, the previous prewarm only checked %{http_code}, so
+        # nginx happily cached the HTML as a valid 200 for 6h and every
+        # subsequent dictionary read returned HTML instead of JSON.
+        #
+        # Step 1: validate upstream via cache-bust query (`?_=` lands
+        #         on a unique nginx cache key, forcing an upstream fetch).
+        # Step 2: warm + validate the canonical URL. If the canonical
+        #         response is HTML, attempt one cache-bypass retry
+        #         (`Cache-Control: no-cache` — works after the matching
+        #         nginx config update). If still HTML, fail loudly with a
+        #         purge instruction so the operator can clear cache.
+        #
+        # A curl transport failure (timeout, TLS, DNS) aborts the step via
+        # `set -e`; fetch() prints an ::error annotation first.
         run: |
-          # The four dictionary endpoints (see src/shared/dictionaries/api.ts)
-          # are read by every page load — fetch them once before e2e to warm
-          # nginx's proxy_cache. Subsequent e2e fetches hit the cache instead
-          # of the upstream WAF, which has a low per-source-IP rate limit.
-          # Brief sleep between requests to avoid tripping the WAF on the
-          # cold-cache pass.
+          set -euo pipefail
+
+          # is_json BODY — succeed only when BODY starts with `[` or `{`
+          # and is longer than 1024 characters; anything else is non-JSON.
+          is_json() {
+            local body="$1"
+            local first_byte=${body:0:1}
+            [ "$first_byte" = "[" ] || [ "$first_byte" = "{" ] || return 1
+            [ ${#body} -gt 1024 ] || return 1
+          }
+
+          # fail_with_body LABEL BODY — print an ::error annotation and the
+          # start of BODY to stderr, then fail the step.
+          fail_with_body() {
+            local label="$1" body="$2"
+            echo "::error::pre-warm failed: $label" >&2
+            echo "first 200 bytes of body:" >&2
+            printf '%s\n' "${body:0:200}" >&2
+            exit 1
+          }
+
+          # fetch [CURL_ARGS...] URL — authenticated GET, body on stdout.
+          # On a curl error it annotates the failure and returns non-zero,
+          # so the caller's `var=$(fetch ...)` assignment trips `set -e`
+          # with a visible ::error instead of only curl's stderr line.
+          fetch() {
+            local url="${@: -1}"
+            curl -k -sS -u "$BASIC_AUTH_USER:$BASIC_AUTH_PASS" --max-time 15 "$@" || {
+              echo "::error::pre-warm failed: curl exit $? fetching ${url}" >&2
+              return 1
+            }
+          }
+
           for path in world_regions countries cities airports; do
-            url="https://ui-dashboard.gnerim.ru/api/dictionary/1/${path}"
-            rc=$(curl -k -sS -u "$BASIC_AUTH_USER:$BASIC_AUTH_PASS" -o /dev/null -w "%{http_code}" "$url")
-            echo "warm $path -> HTTP $rc"
+            base="https://ui-dashboard.gnerim.ru/api/dictionary/1/${path}"
+
+            # Step 1: prove upstream is healthy (cache-bust via query).
+            bust_url="${base}?_=$(date +%s%N)"
+            bust_body=$(fetch "$bust_url")
+            if ! is_json "$bust_body"; then
+              fail_with_body "${path} upstream returned non-JSON (WAF rate-limit?)" "$bust_body"
+            fi
+
+            # Step 2: warm + validate canonical URL.
+            cano_body=$(fetch "$base")
+            if ! is_json "$cano_body"; then
+              # Canonical hit poisoned cache. Force-refresh once via
+              # `Cache-Control: no-cache` (proxy_cache_bypass on the
+              # /api/dictionary/ location forwards to upstream, then
+              # stores the fresh response).
+              cano_body=$(fetch -H "Cache-Control: no-cache" "$base")
+              if ! is_json "$cano_body"; then
+                echo "::error::cache poisoned for ${path} — Cache-Control: no-cache did not refresh" >&2
+                echo "::error::manual purge: ssh pve-201 'rm -rf /var/cache/nginx/flights-api/* && systemctl reload nginx'" >&2
+                fail_with_body "${path} canonical URL still non-JSON after bypass" "$cano_body"
+              fi
+              echo "warm $path -> ok via cache-bypass (cache had been poisoned, now refreshed; ${#cano_body} bytes)"
+            else
+              echo "warm $path -> ok (${#cano_body} bytes)"
+            fi
             sleep 2
           done
           echo "--- verify cache HIT on a re-fetch ---"
diff --git a/deployment/nginx/conf.d/flights-api-cache.conf b/deployment/nginx/conf.d/flights-api-cache.conf
index 6e9aa195..ca460b07 100644
--- a/deployment/nginx/conf.d/flights-api-cache.conf
+++ b/deployment/nginx/conf.d/flights-api-cache.conf
@@ -13,3 +13,25 @@ proxy_cache_path /var/cache/nginx/flights-api
     max_size=200m
     inactive=30m
     use_temp_path=off;
+
+# Don't cache upstream responses whose Content-Type is HTML — the upstream
+# WAF returns its block page ("Доступ к сайту временно ограничен") with
+# HTTP 200 + text/html, and prior to this filter nginx happily cached that
+# as a valid 200 for the next 6h, poisoning every subsequent dictionary
+# read for the SSR app. Pair with `proxy_no_cache $no_cache_html;` in the
+# server block.
+map $upstream_http_content_type $no_cache_html {
+    ~*text/html 1;
+    default "";
+}
+
+# Bypass the cache only when the client explicitly asks for it with
+# `Cache-Control: no-cache`. Feeding $http_cache_control straight into
+# proxy_cache_bypass would bypass on ANY non-empty, non-"0" value
+# (e.g. a browser reload's `max-age=0`), pushing ordinary traffic onto
+# the rate-limited upstream. Pair with
+# `proxy_cache_bypass $bypass_on_no_cache;` in the server block.
+map $http_cache_control $bypass_on_no_cache {
+    ~*no-cache 1;
+    default "";
+}
diff --git a/deployment/nginx/ui-dashboard.gnerim.ru.conf b/deployment/nginx/ui-dashboard.gnerim.ru.conf
index 413be9f9..accd0d44 100644
--- a/deployment/nginx/ui-dashboard.gnerim.ru.conf
+++ b/deployment/nginx/ui-dashboard.gnerim.ru.conf
@@ -61,6 +61,20 @@ server {
         proxy_cache_valid 404 5m;
         proxy_cache_lock on;
         proxy_cache_use_stale error timeout updating http_403 http_500 http_502 http_503 http_504;
+        # Allow CI to force a re-fetch when the cache is suspected stale
+        # (Cache-Control: no-cache from client → bypass on this hit, then
+        # store the fresh response). $bypass_on_no_cache comes from
+        # conf.d/flights-api-cache.conf and is non-empty only for no-cache,
+        # so any other Cache-Control value is still served from cache.
+        # NOTE(review): the original note said bypass is "already enabled
+        # below for /api/" — that location is outside this hunk; confirm it
+        # uses the same variable.
+        proxy_cache_bypass $bypass_on_no_cache;
+        # Refuse to cache the WAF block page (text/html). $no_cache_html
+        # is set in conf.d/flights-api-cache.conf based on upstream's
+        # Content-Type. Without this, a single rate-limited fetch
+        # poisoned the cache for 6h.
+        proxy_no_cache $no_cache_html;
         add_header X-Cache-Status $upstream_cache_status always;
     }