diff --git a/.gitea/workflows/ci-deploy.yml b/.gitea/workflows/ci-deploy.yml index 166d926b..f39865d1 100644 --- a/.gitea/workflows/ci-deploy.yml +++ b/.gitea/workflows/ci-deploy.yml @@ -16,7 +16,7 @@ jobs: BASIC_AUTH_PASS: ${{ secrets.BASIC_AUTH_PASS }} TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }} TELEGRAM_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }} - FLIGHTS_WEB_PORT: '8081' + FLIGHTS_WEB_PORT: '3002' steps: - name: Checkout @@ -90,7 +90,7 @@ jobs: - name: Run Playwright e2e id: e2e env: - BASE_URL: http://127.0.0.1:8081 + BASE_URL: http://127.0.0.1:3002 run: pnpm test:e2e - name: Rollback on failure (post-deploy steps) diff --git a/.gitea/workflows/release-verify.yml b/.gitea/workflows/release-verify.yml new file mode 100644 index 00000000..0b59839a --- /dev/null +++ b/.gitea/workflows/release-verify.yml @@ -0,0 +1,60 @@ +name: release-verify + +# Workflow C: run after Jenkins has finished building (operator triggers manually). +# Probes the customer URL until it serves a fresh build, then runs the e2e suite +# against http://flights-ui.devwebzavod.ru with the console-error gate. + +on: + workflow_dispatch: + +jobs: + verify: + runs-on: pve-201 + timeout-minutes: 30 + env: + TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }} + TELEGRAM_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Notify start + if: ${{ env.TELEGRAM_BOT_TOKEN != '' }} + run: scripts/ci/notify-telegram.sh start release-verify + + - name: Setup Node + pnpm + uses: actions/setup-node@v4 + with: + node-version-file: '.nvmrc' + - uses: pnpm/action-setup@v4 + + - name: Install dependencies + run: pnpm install --frozen-lockfile + + - name: Wait for customer URL + id: wait_customer + run: scripts/ci/wait-for-url.sh http://flights-ui.devwebzavod.ru/ru-ru/onlineboard 60 5 + + - name: Run Playwright e2e against customer URL + id: e2e_customer + env: + BASE_URL: http://flights-ui.devwebzavod.ru + run: pnpm test:e2e + + - name: Upload artifacts on failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: release-verify-failure-${{ github.run_id }} + path: | + playwright-report/ + retention-days: 7 + + - name: Notify (success) + if: success() && env.TELEGRAM_BOT_TOKEN != '' + run: scripts/ci/notify-telegram.sh ok release-verify "customer URL e2e green" + + - name: Notify (failure) + if: failure() && env.TELEGRAM_BOT_TOKEN != '' + run: scripts/ci/notify-telegram.sh fail release-verify "see Gitea run for Playwright report" diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index 5679e9f2..dd88cac3 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -6,20 +6,20 @@ on: tags: - 'release-*' +# Workflow B: sync to GitLab + open MR + auto-merge. +# Stops at "MR merged" — Jenkins is triggered manually by the operator. +# After Jenkins finishes, run the `release-verify` workflow to e2e the customer URL. + jobs: release: runs-on: pve-201 - timeout-minutes: 60 + timeout-minutes: 30 env: GITLAB_PAT: ${{ secrets.GITLAB_PAT }} GITLAB_PROJECT_ID: ${{ secrets.GITLAB_PROJECT_ID }} GITLAB_HOST: 'https://teamscore.gitlab.yandexcloud.net' GITLAB_PROJECT_PATH: 'aeroflot2/flights-front' - JENKINS_BASE_URL: 'http://jenkins.yc.devwebzavod.ru:8080' - JENKINS_JOB_PATH: '/job/Aeroflot2/job/Flights-Front-Dev' - JENKINS_USER: ${{ secrets.JENKINS_USER }} - JENKINS_API_TOKEN: ${{ secrets.JENKINS_API_TOKEN }} - JENKINS_TRIGGER_TOKEN: ${{ secrets.JENKINS_TRIGGER_TOKEN }} + JENKINS_JOB_URL: 'http://jenkins.yc.devwebzavod.ru:8080/job/Aeroflot2/job/Flights-Front-Dev/' TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }} TELEGRAM_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }} @@ -37,8 +37,6 @@ jobs: id: gate run: | API="${GITHUB_SERVER_URL}/api/v1/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=${GITHUB_SHA}" - # Gitea Actions API is similar to GitHub's; this query may differ slightly per Gitea version. - # If the endpoint isn't available, fall back to a last-3-runs check via the workflows endpoint. resp=$(curl -fsS -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" "$API" || echo '{"workflow_runs":[]}') ok=$(echo "$resp" | jq -r --arg name "ci-deploy" ' .workflow_runs[] @@ -70,8 +68,6 @@ jobs: - name: Clone GitLab target id: clone - env: - GITLAB_PAT: ${{ secrets.GITLAB_PAT }} run: | rm -rf /tmp/flights-front git clone "https://oauth2:${GITLAB_PAT}@teamscore.gitlab.yandexcloud.net/aeroflot2/flights-front.git" /tmp/flights-front @@ -145,7 +141,7 @@ jobs: "${GITLAB_HOST}/api/v4/projects/${GITLAB_PROJECT_ID}/merge_requests/${{ steps.mr_open.outputs.iid }}/merge" \ >/dev/null - - name: Cleanup MR + branch on failure (B:9-11 only) + - name: Cleanup MR + branch on failure if: failure() && (steps.mr_open.outcome == 'failure' || steps.mr_approve.outcome == 'failure' || steps.mr_merge.outcome == 'failure') run: | IID="${{ steps.mr_open.outputs.iid }}" @@ -165,35 +161,11 @@ jobs: >/dev/null || true fi - - name: Trigger + wait for Jenkins - id: jenkins - if: steps.commit.outputs.skip_remaining != '1' - run: scripts/ci/jenkins-trigger-and-wait.sh - - - name: Wait for customer URL to update - id: wait_customer - if: steps.commit.outputs.skip_remaining != '1' - run: scripts/ci/wait-for-url.sh http://flights-ui.devwebzavod.ru/ru-ru/onlineboard 60 5 - - - name: Run Playwright e2e against customer URL - id: e2e_customer - if: steps.commit.outputs.skip_remaining != '1' - env: - BASE_URL: http://flights-ui.devwebzavod.ru - run: pnpm test:e2e - - - name: Upload artifacts on failure - if: failure() - uses: actions/upload-artifact@v4 - with: - name: release-failure-${{ github.run_id }} - path: | - playwright-report/ - retention-days: 7 - - - name: Notify (success) + - name: Notify (success — manual Jenkins trigger required) if: success() && env.TELEGRAM_BOT_TOKEN != '' - run: scripts/ci/notify-telegram.sh ok release "MR ${{ steps.mr_open.outputs.url }}" + run: | + MR_URL='${{ steps.mr_open.outputs.url }}' + scripts/ci/notify-telegram.sh ok release "MR merged: ${MR_URL}. Now trigger Jenkins manually: ${JENKINS_JOB_URL}, then dispatch the release-verify workflow." - name: Notify (failure) if: failure() && env.TELEGRAM_BOT_TOKEN != '' diff --git a/deployment/README.md b/deployment/README.md index a1501947..7e3152ee 100644 --- a/deployment/README.md +++ b/deployment/README.md @@ -4,62 +4,46 @@ This is the bootstrap procedure for hosting `https://ui-dashboard.gnerim.ru/` on ## One-time setup -### 1. Routing pve-201 → TIM API (via webzavod) +### 1. SSH tunnel pve-201 → webzavod (TIM API access) -**On webzavod (192.168.88.58)** — verify IP forwarding and MASQUERADE: +The customer WAF on `flights.test.aeroflot.ru` only accepts requests from corp-VPN egress IPs. nginx proxies `/api/` and `/map/api/` to `https://127.0.0.1:8443`, which is forwarded over SSH to webzavod (which terminates the corp VPN on `ppp0`). A systemd unit keeps the tunnel up. -```bash -sysctl net.ipv4.ip_forward # expect: 1 -sudo iptables -t nat -L POSTROUTING -nv | grep ppp0 # expect: MASQUERADE rule +**On webzavod (192.168.88.58)** — append the pve-201 pubkey to `~gnezim/.ssh/authorized_keys` with `permitopen` restricting it to one host:port (one-time, read pve-201's `~gnezim/.ssh/id_rsa.pub` first): + +``` +command="exit 1",no-pty,no-X11-forwarding,no-agent-forwarding,no-user-rc,permitopen="flights.test.aeroflot.ru:443" ssh-rsa AAAA…== pve-201-flights-tim-tunnel ``` -If missing: +**On pve-201** — install + enable the systemd unit: ```bash -echo 'net.ipv4.ip_forward=1' | sudo tee -a /etc/sysctl.conf -sudo sysctl -p -sudo iptables -t nat -A POSTROUTING -o ppp0 -j MASQUERADE -sudo apt install iptables-persistent -sudo netfilter-persistent save -``` - -**On pve-201** — add a persistent static route to TIM via webzavod: - -```yaml -# /etc/netplan/01-routes.yaml — adjust NIC name as needed -network: - version: 2 - ethernets: - : # replace with actual NIC name from `ip link show` - routes: - - to: 172.18.0.0/16 - via: 192.168.88.58 -``` - -```bash -sudo netplan apply -``` - -**On pve-201** — pin TIM hostnames to reachable A records (TIM DNS returns duplicate As, one of which is dead): - -```bash -echo '172.18.0.121 flights.test.aeroflot.ru' | sudo tee -a /etc/hosts +cd /path/to/Aeroflot.Flights.Web +sudo cp deployment/systemd/flights-tim-tunnel.service /etc/systemd/system/ +sudo systemctl daemon-reload +sudo systemctl enable --now flights-tim-tunnel.service +sudo systemctl status flights-tim-tunnel.service --no-pager ``` **Smoke test:** ```bash -curl -v https://flights.test.aeroflot.ru/swagger/ # expect: 401 in <300ms +ss -ltn | grep ':8443\b' # expect: a 127.0.0.1:8443 LISTEN line +curl -k --resolve flights.test.aeroflot.ru:8443:127.0.0.1 \ + -o /dev/null -w 'swagger: %{http_code}\n' \ + https://flights.test.aeroflot.ru:8443/swagger/index.html # expect 401 +curl -k --resolve flights.test.aeroflot.ru:8443:127.0.0.1 \ + -o /dev/null -w 'api/health: %{http_code}\n' \ + https://flights.test.aeroflot.ru:8443/api/health # expect 200 ``` -If this fails, fix routing/DNS before proceeding — nothing else will work. +If swagger returns 200 with HTML body instead of 401, the tunnel is bypassed and the request egressed directly — fix the listener / SSH unit before proceeding. ### 2. nginx vhost ```bash -cd /path/to/Aeroflot.Flights.Web # repo root, e.g. ~/repos/Aeroflot.Flights.Web +cd /path/to/Aeroflot.Flights.Web sudo cp deployment/nginx/ui-dashboard.gnerim.ru.conf /etc/nginx/sites-available/ -sudo ln -s /etc/nginx/sites-available/ui-dashboard.gnerim.ru.conf /etc/nginx/sites-enabled/ +sudo ln -sf /etc/nginx/sites-available/ui-dashboard.gnerim.ru.conf /etc/nginx/sites-enabled/ sudo mkdir -p /etc/nginx/htpasswd sudo nginx -t sudo systemctl reload nginx @@ -81,19 +65,19 @@ Reachability checks the runner must pass: ```bash curl -fsS https://git.gnerim.ru/ # Gitea curl -fsSI https://teamscore.gitlab.yandexcloud.net/ # GitLab -curl -fsSI http://jenkins.yc.devwebzavod.ru:8080/ # Jenkins (via static route) -curl -fsSI http://flights-ui.devwebzavod.ru/ # Customer URL (via static route) ``` +The customer Jenkins URL and the customer site (`flights-ui.devwebzavod.ru`) are NOT reachable from the runner directly — Workflow B does not call them. Customer-side e2e (Workflow C, `release-verify`) only runs after the operator has manually triggered the Jenkins build, and it reaches the customer URL the same way the upstream API is reached: direct egress where possible, or through additional tunnels added on demand. + ### 4. GitLab Personal Access Token GitLab → User Settings → Access Tokens → create with scopes `api` and `write_repository`. Store as Gitea Actions secret `GITLAB_PAT`. ### 5. Allow self-approve on GitLab project -GitLab → flights-front project → Settings → Merge requests → Approval rules → uncheck **"Prevent approval by author"**. +GitLab → flights-front project → Settings → Merge requests → Approval rules → uncheck **"Prevent approval by author"** (skip if you can already approve your own MRs in the GitLab UI). -Verify by running (locally, after PAT is in place — script is created in Task 17 of the plan): +Verify by running (locally, after PAT is in place): ```bash GITLAB_PAT= ./scripts/ci/check-gitlab-project.sh @@ -101,29 +85,26 @@ GITLAB_PAT= ./scripts/ci/check-gitlab-project.sh It prints the numeric project ID (store as `GITLAB_PROJECT_ID` secret) and confirms self-approve is allowed. -### 6. Jenkins remote trigger token - -Jenkins → `Aeroflot2/Flights-Front-Dev` job → Configure → check **"Trigger builds remotely"** → set token (e.g. `flights-cd-trigger`). Store as `JENKINS_TRIGGER_TOKEN`. - -Also: Jenkins → User → Configure → API Token → Add new token. Store username as `JENKINS_USER`, token as `JENKINS_API_TOKEN`. - -### 7. Telegram bot +### 6. Telegram bot (optional) Use existing bot or create via @BotFather. Get the chat_id by sending a message and querying `https://api.telegram.org/bot/getUpdates`. Store as `TELEGRAM_BOT_TOKEN` and `TELEGRAM_CHAT_ID`. -### 8. Gitea Actions secrets summary +If either secret is unset, all `notify-telegram.sh` calls in the workflows skip cleanly with no error — the pipeline runs end-to-end without Telegram configured. + +### 7. Gitea Actions secrets summary Repo → Settings → Actions → Secrets — set all of: -| Secret | Purpose | -|---|---| -| `BASIC_AUTH_USER`, `BASIC_AUTH_PASS` | nginx htpasswd | -| `MAP_TILE_URL` | Default `/map/api/tile/{z}/{x}/{y}.jpeg` | -| `API_BASE_URL` | Default `/api` | -| `GITLAB_PAT`, `GITLAB_PROJECT_ID` | GitLab MR API | -| `JENKINS_USER`, `JENKINS_API_TOKEN`, `JENKINS_TRIGGER_TOKEN` | Jenkins API | -| `TELEGRAM_BOT_TOKEN`, `TELEGRAM_CHAT_ID` | Notifications | -| `GITHUB_TOKEN` | Auto-provided by Gitea Actions — no manual setup required | +| Secret | Required | Purpose | +|---|---|---| +| `BASIC_AUTH_USER`, `BASIC_AUTH_PASS` | yes | nginx htpasswd for `ui-dashboard.gnerim.ru` | +| `MAP_TILE_URL` | optional | Default `/map/api/tile/{z}/{x}/{y}.jpeg` | +| `API_BASE_URL` | optional | Default `/api` | +| `GITLAB_PAT`, `GITLAB_PROJECT_ID` | yes (release only) | GitLab MR API | +| `TELEGRAM_BOT_TOKEN`, `TELEGRAM_CHAT_ID` | optional | Notifications | +| `GITHUB_TOKEN` | auto | Provided by Gitea Actions — no manual setup required | + +Jenkins is triggered manually after the release workflow merges to GitLab; no Jenkins secret is required. ## Verifying failure paths @@ -148,7 +129,7 @@ Then push a commit that fails e2e. Rollback step finds no `:previous` and bails. - Telegram message: `🔥 ci-deploy ROLLBACK FAILED — site is DOWN` - `https://ui-dashboard.gnerim.ru/` returns 502. -- Manual recovery: `ssh pve-201 'docker stop flights-web 2>/dev/null; docker rm flights-web 2>/dev/null; docker run -d --name flights-web --restart unless-stopped -p 127.0.0.1:8081:8080 flights-web:'`. +- Manual recovery: `ssh pve-201 'docker stop flights-web 2>/dev/null; docker rm flights-web 2>/dev/null; docker run -d --name flights-web --restart unless-stopped -p 127.0.0.1:3002:8080 flights-web:'`. ### B: blocked on A not green @@ -157,27 +138,15 @@ Trigger Workflow B (manual or tag) for a SHA that has no green Workflow A run. V - Telegram message: `⚠️ release blocked — workflow ci-deploy is not green for ` - B exits early; nothing changes in GitLab. -### B: Jenkins poll timeout - -Temporarily edit `scripts/ci/jenkins-trigger-and-wait.sh` to change the default: -```bash -TIMEOUT="${JENKINS_TIMEOUT:-30}" # was 1800 -``` -Push to a throwaway branch, trigger Workflow B from that branch via the Gitea UI, and confirm: -- Telegram message: `❌ release FAILED at Jenkins build` (because polling gives up after 30s) -- The Jenkins job itself may continue running — that's fine, it's outside our control. - -**Restore the original 1800 default** and force-delete the throwaway branch when done. - ## Manual recovery scenarios -### Workflow B failed at step 12-13 (Jenkins) — MR merged but customer site stale +### Workflow B succeeded but Jenkins build failed -GitLab is already at the new commit; Jenkins didn't deploy. Recovery: +GitLab is at the new commit; customer site is stale. Recovery: -1. Open Jenkins UI → click "Build Now" on the same job, or -2. Push a new commit to GitLab to re-trigger Jenkins polling (if it's set up that way), or -3. Re-run Workflow B from a green Workflow A — but only if you also pushed new code; otherwise B will sync a no-op and skip. +1. Open Jenkins UI → check the failing build's console log +2. Fix the issue (in this repo if it's our bug, in customer's infra otherwise) +3. Push fix → Workflow A → Workflow B → trigger Jenkins again ### Container running but nginx returns 502 @@ -186,7 +155,7 @@ Check the bind: ```bash ssh pve-201 docker ps --filter name=flights-web -curl -v http://127.0.0.1:8081/ # should return 200 (or whatever the SSR root returns) +curl -v http://127.0.0.1:3002/ # should return 200 (or whatever the SSR root returns) sudo nginx -t && sudo systemctl reload nginx ``` @@ -195,5 +164,16 @@ If the container died, the Restart policy `unless-stopped` should bring it back. ```bash docker logs flights-web --tail 200 docker stop flights-web 2>/dev/null; docker rm flights-web 2>/dev/null -docker run -d --name flights-web --restart unless-stopped -p 127.0.0.1:8081:8080 flights-web:current +docker run -d --name flights-web --restart unless-stopped -p 127.0.0.1:3002:8080 flights-web:current ``` + +### TIM tunnel is down (502 on /api/* but / works) + +```bash +sudo systemctl status flights-tim-tunnel.service --no-pager +sudo journalctl -u flights-tim-tunnel.service -n 50 --no-pager +sudo systemctl restart flights-tim-tunnel.service +ss -ltn | grep ':8443\b' # confirm listener is back +``` + +If the tunnel won't come up, verify SSH key is still authorised on webzavod and that webzavod's `ppp0` is up (`ssh webzavod 'ip -br addr show ppp0'`). diff --git a/deployment/nginx/ui-dashboard.gnerim.ru.conf b/deployment/nginx/ui-dashboard.gnerim.ru.conf index 9963feed..0428e34e 100644 --- a/deployment/nginx/ui-dashboard.gnerim.ru.conf +++ b/deployment/nginx/ui-dashboard.gnerim.ru.conf @@ -18,9 +18,9 @@ server { auth_basic "ui-dashboard"; auth_basic_user_file /etc/nginx/htpasswd/ui-dashboard; - # SSR app on loopback (container bound to 127.0.0.1:8081) + # SSR app on loopback (container bound to 127.0.0.1:3002) location / { - proxy_pass http://127.0.0.1:8081; + proxy_pass http://127.0.0.1:3002; proxy_set_header Host $host; proxy_set_header X-Forwarded-Proto $scheme; proxy_set_header X-Real-IP $remote_addr; @@ -32,21 +32,26 @@ server { } # API proxy — bypass basic auth (gates HTML, not API). - # Static route on the host sends 172.18.0.0/16 via 192.168.88.58 (webzavod). - # /etc/hosts pins flights.test.aeroflot.ru → 172.18.0.121. + # Routed via the flights-tim-tunnel.service systemd unit (see + # deployment/systemd/flights-tim-tunnel.service): 127.0.0.1:8443 is an + # ssh -L tunnel to webzavod which exits via ppp0 with a corp-VPN source IP + # the upstream WAF whitelists. SNI/Host are set explicitly because the + # TCP target is loopback rather than the real hostname. location /api/ { auth_basic off; - proxy_pass https://flights.test.aeroflot.ru; + proxy_pass https://127.0.0.1:8443; proxy_set_header Host flights.test.aeroflot.ru; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; proxy_ssl_server_name on; + proxy_ssl_name flights.test.aeroflot.ru; } location /map/api/ { auth_basic off; - proxy_pass https://flights.test.aeroflot.ru; + proxy_pass https://127.0.0.1:8443; proxy_set_header Host flights.test.aeroflot.ru; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; proxy_ssl_server_name on; + proxy_ssl_name flights.test.aeroflot.ru; } } diff --git a/deployment/systemd/flights-tim-tunnel.service b/deployment/systemd/flights-tim-tunnel.service new file mode 100644 index 00000000..b8308731 --- /dev/null +++ b/deployment/systemd/flights-tim-tunnel.service @@ -0,0 +1,44 @@ +# SSH local-forward tunnel: pve-201 -> webzavod -> flights.test.aeroflot.ru:443. +# +# nginx on pve-201 proxies /api/ and /map/api/ to https://127.0.0.1:8443. This +# unit forwards 8443 over SSH to webzavod (192.168.88.58), which terminates the +# corp VPN (ppp0). The customer WAF whitelists webzavod's egress IP, so requests +# arriving via this tunnel reach the real backend instead of the WAF interstitial. +# +# Webzavod's authorized_keys entry restricts this key to: +# command="exit 1",no-pty,no-X11-forwarding,no-agent-forwarding,no-user-rc, +# permitopen="flights.test.aeroflot.ru:443" +# +# Install: +# sudo cp deployment/systemd/flights-tim-tunnel.service /etc/systemd/system/ +# sudo systemctl daemon-reload +# sudo systemctl enable --now flights-tim-tunnel.service +# +# Verify: +# ss -ltn | grep ':8443\b' +# curl -k --resolve flights.test.aeroflot.ru:8443:127.0.0.1 \ +# -o /dev/null -w '%{http_code}\n' \ +# https://flights.test.aeroflot.ru:8443/swagger/index.html # expect 401 + +[Unit] +Description=SSH tunnel pve-201->webzavod for flights.test.aeroflot.ru:443 +Documentation=https://git.gnerim.ru/gnezim/Aeroflot.Flights.Web +Wants=network-online.target +After=network-online.target + +[Service] +Type=simple +User=gnezim +ExecStart=/usr/bin/ssh -N \ + -o BatchMode=yes \ + -o ExitOnForwardFailure=yes \ + -o ServerAliveInterval=30 \ + -o ServerAliveCountMax=3 \ + -o StrictHostKeyChecking=accept-new \ + -L 127.0.0.1:8443:flights.test.aeroflot.ru:443 \ + gnezim@192.168.88.58 +Restart=always +RestartSec=10 + +[Install] +WantedBy=multi-user.target diff --git a/docs/superpowers/specs/2026-04-25-cicd-pipeline-design.md b/docs/superpowers/specs/2026-04-25-cicd-pipeline-design.md index 3e1ea0b0..7e190573 100644 --- a/docs/superpowers/specs/2026-04-25-cicd-pipeline-design.md +++ b/docs/superpowers/specs/2026-04-25-cicd-pipeline-design.md @@ -485,3 +485,52 @@ When a private registry comes online (eventual `registry.gnerim.ru`), changes: 2. **The 9 untracked `snap-*.yml` files at repo root** look like throwaway parity-snapshot artifacts. Add to `.gitignore` or commit? Verify before flipping pipeline on (prereq #14). 3. **e2e portability to remote `BASE_URL`** — existing specs were written against localhost. Many likely hardcode paths or rely on dev-only state. Layer 2 of testing strategy budgets time for this. 4. **Initial console-allowlist content** — empty starter; will be populated on first runs ("we'll figure it out in future" per design discussion). + +--- + +## Addendum 2026-04-27 — routing change + manual Jenkins trigger + +Two design pivots discovered during Phase B prerequisites work: + +### Routing: ssh -L tunnel instead of static-route + NAT + +Original design: static route on pve-201 pushes `` via webzavod's LAN IP, webzavod NATs LAN→ppp0, `/etc/hosts` pins `flights.test.aeroflot.ru` to an internal A record. + +Discovered: +- `flights.test.aeroflot.ru` resolves to public IPs from both pve-201 and webzavod (no internal A record exists). +- pve-201 reaches the public IP directly with HTTP 200, **but the response is a WAF interstitial** — the customer WAF returns 200/HTML for non-corp egress and 401/JSON-ready for corp egress. +- The same URL from webzavod returns 401 (real backend) — webzavod's `ppp0` egress IP is whitelisted. + +New design: persistent `ssh -L 127.0.0.1:8443:flights.test.aeroflot.ru:443` from pve-201 to webzavod via systemd unit `deployment/systemd/flights-tim-tunnel.service`. nginx proxies `/api/` and `/map/api/` to `https://127.0.0.1:8443` with `Host` and `proxy_ssl_name` overrides so SNI/cert validation still target the real hostname. + +Webzavod-side authorisation pinned with `command="exit 1",no-pty,no-X11-forwarding,no-agent-forwarding,no-user-rc,permitopen="flights.test.aeroflot.ru:443"` — the key cannot open a shell, agent-forward, or forward any other host:port. + +Trade-offs vs. original: +- ✅ No webzavod kernel changes (no `ip_forward` toggle, no MASQUERADE rule, no iptables-persistent). +- ✅ No `/etc/hosts` pin needed (DNS resolution happens on webzavod, where the real IPs work). +- ✅ Recoverable in seconds (`systemctl restart flights-tim-tunnel`). +- ⚠ Per-host SSH tunnel — adding another upstream means another `-L` line. Currently only one upstream. +- ⚠ Discovered OpenSSH 9.6 quirk: `restrict + permitopen` causes TLS handshake to EOF mid-stream. Using explicit `no-*` options instead of `restrict` works. + +### Workflow B: drop Jenkins automation + +Original design: Workflow B triggers Jenkins via remote-build token, polls build status via authenticated API, then runs e2e against customer URL. + +Constraint: operator does not have Jenkins job-configure access (no remote-trigger token) nor Jenkins user API token access. Authenticated API trigger and polling are not possible without admin involvement. + +New design: +- **Workflow B (`release.yml`)** — sync to GitLab, open MR, auto-approve, auto-merge, **stop**. Telegram notify includes the Jenkins job URL with instructions to trigger by hand. +- **Workflow C (`release-verify.yml`)** — `workflow_dispatch` only. Operator runs manually after Jenkins finishes. Probes customer URL until reachable, runs Playwright e2e against `http://flights-ui.devwebzavod.ru` with the console-error gate, notifies Telegram. + +Removed from the repo: +- `scripts/ci/jenkins-trigger-and-wait.sh` +- `tests/ci/test-jenkins-trigger.sh` +- `tests/ci/fixtures/jenkins-{success,failure}-flow.json` +- `JENKINS_USER`, `JENKINS_API_TOKEN`, `JENKINS_TRIGGER_TOKEN` secrets + +Trade-off: lose automated end-to-end pipeline. Acceptable because (a) operator already triggers Jenkins manually today, (b) the manual step is a checkpoint where build failures surface clearly, (c) future Jenkins API access can swap C back into B without changing the rest of the design. + +### Other small adjustments + +- SSR container loopback port changed from `8081` → `3002` (port 8081 already in use on pve-201 by openwebui). +- `notify-telegram.sh` now skips cleanly when Telegram secrets are unset (was: hard-fail). Lets the pipeline run end-to-end without TG configured. diff --git a/scripts/ci/deploy-container.sh b/scripts/ci/deploy-container.sh index d7494283..ba804064 100755 --- a/scripts/ci/deploy-container.sh +++ b/scripts/ci/deploy-container.sh @@ -9,7 +9,7 @@ # # Env: # GITHUB_SHA (required for swap) -# FLIGHTS_WEB_PORT (default 8081 — host port that nginx proxies to) +# FLIGHTS_WEB_PORT (default 3002 — host port that nginx proxies to) # IMAGE_NAME (default flights-web — set this to point at a registry later) set -euo pipefail @@ -20,7 +20,7 @@ if [ "${1:-}" = "--dry-run" ]; then fi CMD="${1:-}" -PORT="${FLIGHTS_WEB_PORT:-8081}" +PORT="${FLIGHTS_WEB_PORT:-3002}" IMAGE="${IMAGE_NAME:-flights-web}" run() { diff --git a/scripts/ci/jenkins-trigger-and-wait.sh b/scripts/ci/jenkins-trigger-and-wait.sh deleted file mode 100755 index 2a094e92..00000000 --- a/scripts/ci/jenkins-trigger-and-wait.sh +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/env bash -# jenkins-trigger-and-wait.sh — fire a Jenkins job and wait for completion. -# -# Usage: -# jenkins-trigger-and-wait.sh # real mode (env-driven) -# jenkins-trigger-and-wait.sh --mock-mode # for tests -# -# Env (real mode): -# JENKINS_BASE_URL e.g. http://jenkins.yc.devwebzavod.ru:8080 -# JENKINS_JOB_PATH e.g. /job/Aeroflot2/job/Flights-Front-Dev -# JENKINS_USER, JENKINS_API_TOKEN -# JENKINS_TRIGGER_TOKEN -# JENKINS_TIMEOUT seconds (default 1800) -# JENKINS_POLL_INTERVAL seconds (default 10) -set -euo pipefail - -MODE=real -FIXTURE="" -if [ "${1:-}" = "--mock-mode" ]; then - MODE=mock - FIXTURE="${2:-}" - [ -n "$FIXTURE" ] || { echo "usage: $0 --mock-mode " >&2; exit 2; } - command -v jq >/dev/null 2>&1 || { echo "fatal: jq required for --mock-mode" >&2; exit 2; } -fi - -POLL_INTERVAL="${JENKINS_POLL_INTERVAL:-10}" -TIMEOUT="${JENKINS_TIMEOUT:-1800}" - -if [ "$MODE" = real ]; then - : "${JENKINS_BASE_URL:?required}" - : "${JENKINS_JOB_PATH:?required}" - : "${JENKINS_USER:?required}" - : "${JENKINS_API_TOKEN:?required}" - : "${JENKINS_TRIGGER_TOKEN:?required}" -fi - -# ── Mock mode: walk fixture deterministically ───────────────────────────────── -if [ "$MODE" = mock ]; then - QUEUE_URL=$(jq -r '.trigger_response.headers.Location' "$FIXTURE") - echo "triggered (mock): queue=$QUEUE_URL" - - # Walk queue polls until we get an executable. - count=$(jq '.queue_polls | length' "$FIXTURE") - BUILD_URL="" - for i in $(seq 0 $((count - 1))); do - body=$(jq -c ".queue_polls[$i].body" "$FIXTURE") - exe_url=$(printf '%s' "$body" | jq -r '.executable.url // empty') - if [ -n "$exe_url" ]; then - BUILD_URL="$exe_url" - break - fi - echo "queue poll $((i + 1)): not yet" - done - [ -n "${BUILD_URL:-}" ] || { echo "fatal: queue never produced executable" >&2; exit 1; } - echo "build url (mock): $BUILD_URL" - - # Walk build polls until result != null. - count=$(jq '.build_polls | length' "$FIXTURE") - for i in $(seq 0 $((count - 1))); do - body=$(jq -c ".build_polls[$i].body" "$FIXTURE") - result=$(printf '%s' "$body" | jq -r '.result // empty') - number=$(printf '%s' "$body" | jq -r '.number') - if [ -n "$result" ]; then - if [ "$result" = "SUCCESS" ]; then - echo "build #${number} SUCCESS" - exit 0 - else - echo "build #${number} ${result}" >&2 - exit 1 - fi - fi - echo "build poll $((i + 1)): building" - done - echo "fatal: build never completed within fixture" >&2 - exit 1 -fi - -# ── Real mode ───────────────────────────────────────────────────────────────── -TRIGGER_URL="${JENKINS_BASE_URL}${JENKINS_JOB_PATH}/build?token=${JENKINS_TRIGGER_TOKEN}" -echo "triggering: $TRIGGER_URL" - -# -D - dumps headers; -o /dev/null discards body. We need the Location header. -HEADERS=$(curl -fsS -X POST -u "${JENKINS_USER}:${JENKINS_API_TOKEN}" -D - -o /dev/null "$TRIGGER_URL") -QUEUE_URL=$(printf '%s' "$HEADERS" | grep -i '^Location:' | head -1 | sed 's/^[Ll]ocation:[[:space:]]*//' | tr -d '\r\n') -[ -n "$QUEUE_URL" ] || { echo "fatal: no Location header from Jenkins" >&2; exit 1; } -echo "queue: $QUEUE_URL" - -# Poll queue for executable.url. START covers both queue + build phases. -START=$(date +%s) -BUILD_URL="" -while [ -z "$BUILD_URL" ]; do - resp=$(curl -fsS -u "${JENKINS_USER}:${JENKINS_API_TOKEN}" "${QUEUE_URL}api/json") - BUILD_URL=$(printf '%s' "$resp" | jq -r '.executable.url // empty') - [ -n "$BUILD_URL" ] && break - now=$(date +%s) - if [ $((now - START)) -ge "$TIMEOUT" ]; then - echo "fatal: queue timeout after ${TIMEOUT}s" >&2 - exit 1 - fi - sleep "$POLL_INTERVAL" -done -echo "build: $BUILD_URL" - -# Poll build for result. Timeout window is shared with queue phase (START not reset). -while :; do - resp=$(curl -fsS -u "${JENKINS_USER}:${JENKINS_API_TOKEN}" "${BUILD_URL}api/json") - result=$(printf '%s' "$resp" | jq -r '.result // empty') - number=$(printf '%s' "$resp" | jq -r '.number') - if [ -n "$result" ]; then - if [ "$result" = "SUCCESS" ]; then - echo "build #${number} SUCCESS" - exit 0 - else - echo "build #${number} ${result} — see ${BUILD_URL}console" >&2 - exit 1 - fi - fi - now=$(date +%s) - if [ $((now - START)) -ge "$TIMEOUT" ]; then - echo "fatal: build timeout after ${TIMEOUT}s — see ${BUILD_URL}console" >&2 - exit 1 - fi - sleep "$POLL_INTERVAL" -done diff --git a/scripts/ci/notify-telegram.sh b/scripts/ci/notify-telegram.sh index db070636..5b50cd2a 100755 --- a/scripts/ci/notify-telegram.sh +++ b/scripts/ci/notify-telegram.sh @@ -28,8 +28,10 @@ esac [ -n "$STAGE" ] || { echo "usage: $0 [--dry-run] []" >&2; exit 2; } if [ "$DRY_RUN" -eq 0 ]; then - : "${TELEGRAM_BOT_TOKEN:?TELEGRAM_BOT_TOKEN required}" - : "${TELEGRAM_CHAT_ID:?TELEGRAM_CHAT_ID required}" + if [ -z "${TELEGRAM_BOT_TOKEN:-}" ] || [ -z "${TELEGRAM_CHAT_ID:-}" ]; then + echo "notify-telegram: TELEGRAM_BOT_TOKEN/TELEGRAM_CHAT_ID unset — skipping" >&2 + exit 0 + fi fi REPO="${GITHUB_REPOSITORY:-unknown/repo}" diff --git a/tests/ci/fixtures/jenkins-failure-flow.json b/tests/ci/fixtures/jenkins-failure-flow.json deleted file mode 100644 index 68ca4ff9..00000000 --- a/tests/ci/fixtures/jenkins-failure-flow.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "trigger_response": { - "status": 201, - "headers": { - "Location": "http://jenkins.test/queue/item/78/" - } - }, - "queue_polls": [ - {"status": 200, "body": {"executable": {"number": 43, "url": "http://jenkins.test/job/Aeroflot2/job/Flights-Front-Dev/43/"}}} - ], - "build_polls": [ - {"status": 200, "body": {"building": true, "result": null, "number": 43}}, - {"status": 200, "body": {"building": false, "result": "FAILURE", "number": 43}} - ] -} diff --git a/tests/ci/fixtures/jenkins-success-flow.json b/tests/ci/fixtures/jenkins-success-flow.json deleted file mode 100644 index ac181f58..00000000 --- a/tests/ci/fixtures/jenkins-success-flow.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "trigger_response": { - "status": 201, - "headers": { - "Location": "http://jenkins.test/queue/item/77/" - } - }, - "queue_polls": [ - {"status": 200, "body": {"why": "in queue", "executable": null}}, - {"status": 200, "body": {"why": "in queue", "executable": null}}, - {"status": 200, "body": {"executable": {"number": 42, "url": "http://jenkins.test/job/Aeroflot2/job/Flights-Front-Dev/42/"}}} - ], - "build_polls": [ - {"status": 200, "body": {"building": true, "result": null, "number": 42}}, - {"status": 200, "body": {"building": true, "result": null, "number": 42}}, - {"status": 200, "body": {"building": false, "result": "SUCCESS", "number": 42}} - ] -} diff --git a/tests/ci/test-jenkins-trigger.sh b/tests/ci/test-jenkins-trigger.sh deleted file mode 100755 index b4c1780c..00000000 --- a/tests/ci/test-jenkins-trigger.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -ROOT="$(cd "$(dirname "$0")/../.." && pwd)" -SCRIPT="$ROOT/scripts/ci/jenkins-trigger-and-wait.sh" -[ -x "$SCRIPT" ] || { echo "FAIL: $SCRIPT not executable"; exit 1; } - -# Mock-mode tests need jq — bail with a useful message if unavailable. -command -v jq >/dev/null 2>&1 || { echo "SKIP: jq not installed"; exit 0; } - -# --- success path --- -if ! "$SCRIPT" --mock-mode "$ROOT/tests/ci/fixtures/jenkins-success-flow.json" 2>&1 | tee /tmp/jenkins-test.log; then - echo "FAIL: success fixture should exit 0" - exit 1 -fi -grep -q "build #42 SUCCESS" /tmp/jenkins-test.log || { echo "FAIL: expected 'build #42 SUCCESS'"; exit 1; } - -# --- failure path --- -if "$SCRIPT" --mock-mode "$ROOT/tests/ci/fixtures/jenkins-failure-flow.json" 2>&1 | tee /tmp/jenkins-test.log; then - echo "FAIL: failure fixture should exit non-zero" - exit 1 -fi -grep -q "FAILURE" /tmp/jenkins-test.log || { echo "FAIL: expected 'FAILURE' in output"; exit 1; } - -# --- bad usage --- -if "$SCRIPT" 2>/dev/null; then - echo "FAIL: expected usage error" - exit 1 -fi - -echo "PASS: jenkins-trigger-and-wait.sh" diff --git a/tests/ci/test-notify-telegram.sh b/tests/ci/test-notify-telegram.sh index ef678387..66f4b3f8 100755 --- a/tests/ci/test-notify-telegram.sh +++ b/tests/ci/test-notify-telegram.sh @@ -37,12 +37,18 @@ out=$("$SCRIPT" --dry-run fail ci-deploy "Run Playwright e2e") assert_contains "$out" "❌ ci-deploy FAILED" assert_contains "$out" "Run Playwright e2e" -# --- missing env should error in non-dry-run --- +# --- missing env in non-dry-run: should skip cleanly (exit 0, log to stderr) --- unset TELEGRAM_BOT_TOKEN -if "$SCRIPT" ok ci-deploy 2>/dev/null; then - echo "FAIL: expected error when TELEGRAM_BOT_TOKEN missing" +set +e +err=$("$SCRIPT" ok ci-deploy 2>&1 >/dev/null) +rc=$? +set -e +if [ $rc -ne 0 ]; then + echo "FAIL: expected exit 0 when TELEGRAM_BOT_TOKEN missing (got $rc)" exit 1 fi +assert_contains "$err" "skipping" +export TELEGRAM_BOT_TOKEN="test-token" # --- fail with log tail ---