diff --git a/.github/workflows/archive-traffic.yml b/.github/workflows/archive-traffic.yml
new file mode 100644
index 0000000..bf5c11b
--- /dev/null
+++ b/.github/workflows/archive-traffic.yml
@@ -0,0 +1,84 @@
+# Archives the repository's GitHub traffic metrics on the branch named by
+# TRAFFIC_ARCHIVE_BRANCH, on the daily cron schedule below or on manual dispatch.
+name: Archive GitHub Traffic
+
+on:
+  schedule:
+    - cron: '17 3 * * *'
+  workflow_dispatch:
+
+permissions:
+  contents: write
+
+concurrency:
+  group: traffic-archive
+  cancel-in-progress: false
+
+env:
+  TRAFFIC_ARCHIVE_BRANCH: traffic-archive
+  TRAFFIC_ARCHIVE_DIR: ../traffic-archive/traffic
+
+jobs:
+  archive:
+    name: Capture traffic snapshot
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout source
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Setup Node
+        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
+        with:
+          node-version: '20'
+          cache: 'npm'
+
+      - name: Prepare archive branch
+        env:
+          ARCHIVE_PUSH_TOKEN: ${{ github.token }}
+        run: |
+          set -euo pipefail
+
+          git config --global user.name "github-actions[bot]"
+          git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com"
+
+          server="${GITHUB_SERVER_URL#https://}"
+          archive_remote="https://x-access-token:${ARCHIVE_PUSH_TOKEN}@${server}/${GITHUB_REPOSITORY}.git"
+
+          # ls-remote --exit-code exits 2 only when the branch is absent. Any other
+          # failure (auth, network) must abort rather than start a fresh history.
+          ls_remote_status=0
+          git ls-remote --exit-code --heads "${archive_remote}" "${TRAFFIC_ARCHIVE_BRANCH}" >/dev/null 2>&1 || ls_remote_status=$?
+
+          if [ "${ls_remote_status}" -eq 0 ]; then
+            git clone --branch "${TRAFFIC_ARCHIVE_BRANCH}" --depth 1 "${archive_remote}" ../traffic-archive
+          elif [ "${ls_remote_status}" -eq 2 ]; then
+            git init -b "${TRAFFIC_ARCHIVE_BRANCH}" ../traffic-archive
+            git -C ../traffic-archive remote add origin "${archive_remote}"
+          else
+            echo "::error::git ls-remote failed with exit code ${ls_remote_status}"
+            exit 1
+          fi
+
+          mkdir -p "${TRAFFIC_ARCHIVE_DIR}"
+
+      - name: Collect traffic
+        env:
+          GH_TRAFFIC_TOKEN: ${{ secrets.TRAFFIC_ARCHIVE_TOKEN || github.token }}
+          GITHUB_REPOSITORY: ${{ github.repository }}
+        run: node scripts/archive-github-traffic.mjs --archive-dir "${TRAFFIC_ARCHIVE_DIR}"
+
+      - name: Commit archive
+        run: |
+          set -euo pipefail
+
+          cd ../traffic-archive
+          git add traffic/archive.json traffic/summary.json
+          git rm --ignore-unmatch traffic/README.md
+
+          if git diff --cached --quiet; then
+            echo "No traffic archive changes."
+            exit 0
+          fi
+
+          git commit -m "chore(traffic): archive repository traffic $(date -u +%F)"
+          git push origin "HEAD:refs/heads/${TRAFFIC_ARCHIVE_BRANCH}"
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3877e5e..5aa426e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -115,6 +115,8 @@ jobs:
         run: node scripts/test-skill-release-workflow.mjs
       - name: Deploy Pages Advisory Checksums Tests
         run: node scripts/test-deploy-pages-checksums.mjs
+      - name: GitHub Traffic Archive Tests
+        run: node scripts/test-github-traffic-archive.mjs
 
   clawsec-suite-tests:
     name: ClawSec Suite Verification Tests
diff --git a/scripts/archive-github-traffic.mjs b/scripts/archive-github-traffic.mjs
new file mode 100644
index 0000000..d224d0a
--- /dev/null
+++ b/scripts/archive-github-traffic.mjs
@@ -0,0 +1,550 @@
+#!/usr/bin/env node
+
+import { promises as fs } from 'node:fs';
+import path from 'node:path';
+import { fileURLToPath, pathToFileURL } from 'node:url';
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+const REPO_ROOT = path.resolve(__dirname, '..');
+const API_ROOT = 'https://api.github.com';
+const GITHUB_API_VERSION = '2022-11-28';
+const ARCHIVE_VERSION = 1;
+const DAY_MS = 24 * 60 * 60 * 1000;
+
+// Rolling windows reported in the summary, as [metric key, length in days].
+const SUMMARY_WINDOWS = [
+  ['last_14_days', 14],
+  ['last_30_days', 30],
+  ['last_90_days', 90],
+  ['last_365_days', 365],
+];
+
+/** Parses `value` as a date and returns its ISO-8601 string; throws when it is not a valid date. */
+const toIsoString = (value, label) => {
+  const date = new Date(value);
+  if (Number.isNaN(date.getTime())) {
+    throw new Error(`Invalid ${label}: ${value}`);
+  }
+  return date.toISOString();
+};
+
+/** Truncates a timestamp to midnight UTC of its day (`YYYY-MM-DDT00:00:00Z`). */
+const toDailyTimestamp = (value) => `${toIsoString(value, 'traffic timestamp').slice(0, 10)}T00:00:00Z`;
+/** Returns the UTC calendar date (`YYYY-MM-DD`) of a timestamp. */
+const toDateKey = (value) => toIsoString(value, 'capture timestamp').slice(0, 10);
+
+/**
+ * Converts `value` to a non-negative integer, truncating any fractional part.
+ * Only numbers and non-blank numeric strings are accepted.
+ * @throws {Error} When `value` is missing, non-numeric, non-finite or negative.
+ */
+const toNonNegativeInteger = (value, label) => {
+  // Number() coerces null, '', [] and false to 0; reject them up front so a
+  // missing field is never recorded as zero traffic.
+  const isNumeric = typeof value === 'number' || (typeof value === 'string' && value.trim() !== '');
+  const number = isNumeric ? Number(value) : Number.NaN;
+  if (!Number.isFinite(number) || number < 0) {
+    throw new Error(`Invalid ${label}: ${value}`);
+  }
+  return Math.trunc(number);
+};
+
+/** Returns `value` trimmed; throws when it is not a string or is blank. */
+const toRequiredString = (value, label) => {
+  if (typeof value !== 'string') {
+    throw new Error(`${label} must be a non-empty string`);
+  }
+
+  const trimmed = value.trim();
+  if (!trimmed) {
+    throw new Error(`${label} must be a non-empty string`);
+  }
+
+  return trimmed;
+};
+
+/** Validates an `owner/name` repository slug and returns it trimmed. */
+const normalizeRepository = (repo) => {
+  const normalized = String(repo || '').trim();
+  if (!/^[A-Za-z0-9_.-]+\/[A-Za-z0-9_.-]+$/.test(normalized)) {
+    throw new Error(`Repository must be in owner/name form, received: ${repo || '(empty)'}`);
+  }
+  return normalized;
+};
+
+/** Normalizes per-day traffic rows to `{ timestamp, count, uniques }`, sorted by timestamp. */
+const normalizeDailyEntries = (entries, label) => {
+  if (!Array.isArray(entries)) {
+    throw new Error(`${label} must be an array`);
+  }
+
+  return entries
+    .map((entry) => ({
+      timestamp: toDailyTimestamp(entry.timestamp),
+      count: toNonNegativeInteger(entry.count, `${label}.count`),
+      uniques: toNonNegativeInteger(entry.uniques, `${label}.uniques`),
+    }))
+    .sort((a, b) => a.timestamp.localeCompare(b.timestamp));
+};
+
+/** Normalizes referrer rows to `{ referrer, count, uniques }`, keeping their order. */
+const normalizeReferrers = (entries) => {
+  if (!Array.isArray(entries)) {
+    throw new Error('referrers must be an array');
+  }
+
+  return entries.map((entry) => ({
+    referrer: toRequiredString(entry.referrer, 'referrers.referrer'),
+    count: toNonNegativeInteger(entry.count, 'referrers.count'),
+    uniques: toNonNegativeInteger(entry.uniques, 'referrers.uniques'),
+  }));
+};
+
+/** Normalizes popular-path rows to `{ path, title, count, uniques }`, keeping their order. */
+const normalizePaths = (entries) => {
+  if (!Array.isArray(entries)) {
+    throw new Error('paths must be an array');
+  }
+
+  return entries.map((entry) => ({
+    path: toRequiredString(entry.path, 'paths.path'),
+    title: toRequiredString(entry.title, 'paths.title'),
+    count: toNonNegativeInteger(entry.count, 'paths.count'),
+    uniques: toNonNegativeInteger(entry.uniques, 'paths.uniques'),
+  }));
+};
+
+/** Merges two lists keyed by `entry[key]`; incoming entries replace existing ones, result sorted by key. */
+const upsertByKey = (existing, incoming, key) => {
+  const entriesByKey = new Map();
+
+  for (const entry of existing || []) {
+    entriesByKey.set(entry[key], entry);
+  }
+  for (const entry of incoming || []) {
+    entriesByKey.set(entry[key], entry);
+  }
+
+  return [...entriesByKey.values()].sort((a, b) => String(a[key]).localeCompare(String(b[key])));
+};
+
+/** Returns the last element of `entries`, or null when the list is empty or missing. */
+const latestEntry = (entries) => {
+  if (!entries?.length) {
+    return null;
+  }
+  return entries[entries.length - 1];
+};
+
+/** Sums `count` and `uniques` over the entries into `{ count, sum_daily_uniques }`. */
+const sumSeries = (entries) => entries.reduce(
+  (totals, entry) => ({
+    count: totals.count + entry.count,
+    sum_daily_uniques: totals.sum_daily_uniques + entry.uniques,
+  }),
+  { count: 0, sum_daily_uniques: 0 },
+);
+
+/** Returns the epoch milliseconds of midnight UTC on `date`'s UTC calendar day. */
+const startOfUtcDay = (date) => Date.UTC(date.getUTCFullYear(), date.getUTCMonth(), date.getUTCDate());
+
+/** Totals the entries dated on or after the first day of the `days`-day window ending on `now`'s UTC date. */
+const summarizeWindow = (entries, days, now) => {
+  const cutoff = new Date(startOfUtcDay(now) - ((days - 1) * DAY_MS));
+  const filtered = entries.filter((entry) => new Date(entry.timestamp) >= cutoff);
+  const totals = sumSeries(filtered);
+
+  return {
+    days,
+    count: totals.count,
+    sum_daily_uniques: totals.sum_daily_uniques,
+    unique_semantics: 'sum_of_daily_uniques',
+    first_date: filtered[0]?.timestamp.slice(0, 10) ?? null,
+    last_date: filtered.at(-1)?.timestamp.slice(0, 10) ?? null,
+  };
+};
+
+/** Totals every entry and reports the first and last dates present (null when empty). */
+const summarizeAllTime = (entries) => {
+  const totals = sumSeries(entries);
+
+  return {
+    count: totals.count,
+    sum_daily_uniques: totals.sum_daily_uniques,
+    unique_semantics: 'sum_of_daily_uniques',
+    first_date: entries[0]?.timestamp.slice(0, 10) ?? null,
+    last_date: entries.at(-1)?.timestamp.slice(0, 10) ?? null,
+  };
+};
+
+/** Returns a re-validated copy of a stored archive, or an empty archive when none exists; throws on repository mismatch. */
+const normalizeExistingArchive = (archive, repository, capturedAt) => {
+  if (!archive) {
+    return {
+      version: ARCHIVE_VERSION,
+      repository,
+      archive_started_at: capturedAt,
+      updated_at: capturedAt,
+      daily: {
+        views: [],
+        clones: [],
+      },
+      snapshots: {
+        referrers: [],
+        paths: [],
+      },
+      captures: [],
+    };
+  }
+
+  if (archive.repository && archive.repository !== repository) {
+    throw new Error(`Archive repository mismatch: ${archive.repository} != ${repository}`);
+  }
+
+  return {
+    version: ARCHIVE_VERSION,
+    repository,
+    archive_started_at: archive.archive_started_at || capturedAt,
+    updated_at: archive.updated_at || capturedAt,
+    daily: {
+      views: normalizeDailyEntries(archive.daily?.views || [], 'daily.views'),
+      clones: normalizeDailyEntries(archive.daily?.clones || [], 'daily.clones'),
+    },
+    snapshots: {
+      referrers: (archive.snapshots?.referrers || []).map((snapshot) => ({
+        captured_at: toIsoString(snapshot.captured_at, 'referrer snapshot timestamp'),
+        date: snapshot.date || toDateKey(snapshot.captured_at),
+        entries: normalizeReferrers(snapshot.entries || []),
+      })),
+      paths: (archive.snapshots?.paths || []).map((snapshot) => ({
+        captured_at: toIsoString(snapshot.captured_at, 'path snapshot timestamp'),
+        date: snapshot.date || toDateKey(snapshot.captured_at),
+        entries: normalizePaths(snapshot.entries || []),
+      })),
+    },
+    captures: (archive.captures || []).map((capture) => ({
+      captured_at: toIsoString(capture.captured_at, 'capture timestamp'),
+      date: capture.date || toDateKey(capture.captured_at),
+      views_window: {
+        count: toNonNegativeInteger(capture.views_window?.count || 0, 'captures.views_window.count'),
+        uniques: toNonNegativeInteger(capture.views_window?.uniques || 0, 'captures.views_window.uniques'),
+      },
+      clones_window: {
+        count: toNonNegativeInteger(capture.clones_window?.count || 0, 'captures.clones_window.count'),
+        uniques: toNonNegativeInteger(capture.clones_window?.uniques || 0, 'captures.clones_window.uniques'),
+      },
+    })),
+  };
+};
+
+/**
+ * Folds one traffic snapshot into the archive and returns the merged archive.
+ * Daily rows are upserted by timestamp; referrer/path snapshots and captures are
+ * upserted by capture date, so a later run on the same UTC date replaces the earlier one.
+ * @param {object|null|undefined} existingArchive Previously stored archive, if any.
+ * @param {object} snapshot Snapshot shaped like the result of fetchGitHubTraffic.
+ * @returns {object} Merged archive.
+ */
+export const mergeTrafficArchive = (existingArchive, snapshot) => {
+  const repository = normalizeRepository(snapshot.repository);
+  const capturedAt = toIsoString(snapshot.captured_at, 'capture timestamp');
+  const captureDate = toDateKey(capturedAt);
+  const archive = normalizeExistingArchive(existingArchive, repository, capturedAt);
+
+  const views = normalizeDailyEntries(snapshot.views?.views || [], 'views');
+  const clones = normalizeDailyEntries(snapshot.clones?.clones || [], 'clones');
+  const referrerSnapshot = {
+    captured_at: capturedAt,
+    date: captureDate,
+    entries: normalizeReferrers(snapshot.referrers || []),
+  };
+  const pathSnapshot = {
+    captured_at: capturedAt,
+    date: captureDate,
+    entries: normalizePaths(snapshot.paths || []),
+  };
+  const capture = {
+    captured_at: capturedAt,
+    date: captureDate,
+    views_window: {
+      count: toNonNegativeInteger(snapshot.views?.count ?? sumSeries(views).count, 'views.count'),
+      uniques: toNonNegativeInteger(snapshot.views?.uniques ?? sumSeries(views).sum_daily_uniques, 'views.uniques'),
+    },
+    clones_window: {
+      count: toNonNegativeInteger(snapshot.clones?.count ?? sumSeries(clones).count, 'clones.count'),
+      uniques: toNonNegativeInteger(snapshot.clones?.uniques ?? sumSeries(clones).sum_daily_uniques, 'clones.uniques'),
+    },
+  };
+
+  return {
+    ...archive,
+    updated_at: capturedAt,
+    daily: {
+      views: upsertByKey(archive.daily.views, views, 'timestamp'),
+      clones: upsertByKey(archive.daily.clones, clones, 'timestamp'),
+    },
+    snapshots: {
+      referrers: upsertByKey(archive.snapshots.referrers, [referrerSnapshot], 'date'),
+      paths: upsertByKey(archive.snapshots.paths, [pathSnapshot], 'date'),
+    },
+    captures: upsertByKey(archive.captures, [capture], 'date'),
+  };
+};
+
+/**
+ * Builds the summary document: windowed and all-time totals plus the latest snapshots.
+ * @param {object} archive Archive as returned by mergeTrafficArchive.
+ * @param {object} [options] `options.now` overrides the reference time for the windows.
+ * @returns {object} Summary object.
+ * @throws {Error} When `options.now` is not a valid date.
+ */
+export const buildTrafficSummary = (archive, options = {}) => {
+  const now = new Date(options.now || new Date().toISOString());
+  if (Number.isNaN(now.getTime())) {
+    throw new Error(`Invalid summary date: ${options.now}`);
+  }
+
+  const views = archive.daily?.views || [];
+  const clones = archive.daily?.clones || [];
+  const buildMetrics = (entries) => {
+    const metrics = Object.fromEntries(SUMMARY_WINDOWS.map(([key, days]) => [
+      key,
+      summarizeWindow(entries, days, now),
+    ]));
+    metrics.all_time = summarizeAllTime(entries);
+    return metrics;
+  };
+
+  return {
+    version: ARCHIVE_VERSION,
+    repository: archive.repository,
+    generated_at: now.toISOString(),
+    archive_started_at: archive.archive_started_at || null,
+    updated_at: archive.updated_at || null,
+    source: {
+      api: 'GitHub REST repository traffic endpoints',
+      retention_limit: 'GitHub exposes roughly the last 14 days; this archive keeps daily snapshots long term.',
+      unique_semantics: 'GitHub daily unique values are retained as sum_daily_uniques for longer windows, not deduplicated visitors.',
+    },
+    metrics: {
+      views: buildMetrics(views),
+      clones: buildMetrics(clones),
+    },
+    daily: {
+      views,
+      clones,
+    },
+    latest_snapshots: {
+      referrers: latestEntry(archive.snapshots?.referrers || []),
+      paths: latestEntry(archive.snapshots?.paths || []),
+    },
+    snapshot_counts: {
+      referrers: archive.snapshots?.referrers?.length || 0,
+      paths: archive.snapshots?.paths?.length || 0,
+      captures: archive.captures?.length || 0,
+    },
+  };
+};
+
+/** Requests `pathname` from the GitHub REST API and returns the parsed JSON; throws when the response is not ok. */
+const fetchJson = async ({ repo, token, pathname, fetchImpl }) => {
+  const url = new URL(pathname, API_ROOT);
+  const response = await fetchImpl(url, {
+    headers: {
+      Accept: 'application/vnd.github+json',
+      Authorization: `Bearer ${token}`,
+      'User-Agent': 'clawsec-traffic-archive',
+      'X-GitHub-Api-Version': GITHUB_API_VERSION,
+    },
+  });
+
+  if (!response.ok) {
+    const body = await response.text().catch(() => '');
+    const suffix = body ? ` ${body.slice(0, 500)}` : '';
+    throw new Error(`GitHub traffic API request failed for ${repo}: ${url.pathname}${url.search} returned ${response.status}.${suffix}`);
+  }
+
+  return response.json();
+};
+
+/**
+ * Fetches views, clones, popular referrers and popular paths for `repo` in parallel.
+ * @param {object} params
+ * @param {string} params.repo Repository in `owner/name` form.
+ * @param {string} params.token GitHub token sent as a Bearer credential.
+ * @param {string} [params.capturedAt] Capture time; defaults to the current time.
+ * @param {Function} [params.fetchImpl] fetch implementation; defaults to `globalThis.fetch`.
+ * @returns {Promise} Resolves to `{ repository, captured_at, views, clones, referrers, paths }`.
+ */
+export const fetchGitHubTraffic = async ({
+  repo,
+  token,
+  capturedAt = new Date().toISOString(),
+  fetchImpl = globalThis.fetch,
+}) => {
+  const repository = normalizeRepository(repo);
+  if (!token) {
+    throw new Error('A GitHub token is required to read repository traffic.');
+  }
+  if (typeof fetchImpl !== 'function') {
+    throw new Error('fetch is not available in this Node runtime.');
+  }
+
+  const encodedRepo = repository.split('/').map(encodeURIComponent).join('/');
+  const request = (pathname) => fetchJson({
+    repo: repository,
+    token,
+    pathname: `/repos/${encodedRepo}${pathname}`,
+    fetchImpl,
+  });
+
+  const [views, clones, referrers, paths] = await Promise.all([
+    request('/traffic/views?per=day'),
+    request('/traffic/clones?per=day'),
+    request('/traffic/popular/referrers'),
+    request('/traffic/popular/paths'),
+  ]);
+
+  return {
+    repository,
+    captured_at: toIsoString(capturedAt, 'capture timestamp'),
+    views,
+    clones,
+    referrers,
+    paths,
+  };
+};
+
+/** Reads and parses a JSON file; returns undefined when the file does not exist and rethrows any other error. */
+const readJsonIfPresent = async (file) => {
+  try {
+    return JSON.parse(await fs.readFile(file, 'utf8'));
+  } catch (error) {
+    if (error?.code === 'ENOENT') {
+      return undefined;
+    }
+    throw error;
+  }
+};
+
+/** Writes `content` to a temp file beside `file`, syncs it, then renames it over `file`; removes the temp file on failure. */
+const writeTextAtomic = async (file, content) => {
+  const dir = path.dirname(file);
+  const tempFile = path.join(dir, `.${path.basename(file)}.${process.pid}.${Date.now()}.tmp`);
+  let handle;
+
+  await fs.mkdir(dir, { recursive: true });
+
+  try {
+    handle = await fs.open(tempFile, 'w');
+    await handle.writeFile(content, 'utf8');
+    await handle.sync();
+    await handle.close();
+    handle = undefined;
+    await fs.rename(tempFile, file);
+  } catch (error) {
+    if (handle) {
+      await handle.close().catch(() => {});
+    }
+    await fs.unlink(tempFile).catch(() => {});
+    throw error;
+  }
+};
+
+/** Serializes `value` as 2-space-indented JSON with a trailing newline and writes it via writeTextAtomic. */
+export const writeJson = async (file, value) => {
+  await writeTextAtomic(file, `${JSON.stringify(value, null, 2)}\n`);
+};
+
+/**
+ * Parses CLI arguments into `{ archiveDir, repo, capturedAt, help }` (only the flags given are set).
+ * @throws {Error} On an unknown argument or a value flag that has no value.
+ */
+const parseArgs = (args) => {
+  const options = {};
+  // Fail loudly rather than storing undefined or consuming the next flag as a value.
+  const readValue = (index) => {
+    const value = args[index + 1];
+    if (value === undefined || String(value).startsWith('--')) {
+      throw new Error(`Missing value for ${args[index]}`);
+    }
+    return value;
+  };
+  for (let index = 0; index < args.length; index += 1) {
+    const arg = args[index];
+    if (arg === '--archive-dir') {
+      options.archiveDir = readValue(index);
+      index += 1;
+    } else if (arg === '--repo') {
+      options.repo = readValue(index);
+      index += 1;
+    } else if (arg === '--captured-at') {
+      options.capturedAt = readValue(index);
+      index += 1;
+    } else if (arg === '--help' || arg === '-h') {
+      options.help = true;
+    } else {
+      throw new Error(`Unknown argument: ${arg}`);
+    }
+  }
+  return options;
+};
+
+const printHelp = () => {
+  console.log(`Usage: node scripts/archive-github-traffic.mjs [options]
+
+Options:
+  --archive-dir