Files
clawsec/scripts/i18n/qa_check.py
T
David Abutbul b37162a33d feat(i18n): add multilingual wiki scaffolding, language switcher, and… (#212)
* feat(i18n): add multilingual wiki scaffolding, language switcher, and translation QA pipeline

* docs(readme): adopt picoclaw-style multilingual link bar

* fix(i18n): repair localized index links and tighten partial-pair QA

* ci(i18n): fail on broken markdown links in README/wiki

* ci(i18n): add changed-files mode for markdown link checks

* i18n(de): use local Argos MT to fill untranslated German sections

* i18n(es,fr): fill untranslated sections via local Argos workflow

* i18n(ja): fill untranslated sections with scoped local Argos pass

* i18n(ko): fill untranslated sections with scoped local Argos pass

* fix(i18n): address review feedback

---------

Co-authored-by: David Abutbul <David.a@prompt.security>
2026-04-29 09:00:31 +03:00

209 lines
6.6 KiB
Python

#!/usr/bin/env python3
"""Translation QA checks for ClawSec docs.
Validates markdown translation pairs with a focus on technical integrity:
- fenced code blocks are preserved exactly
- key inline technical tokens are preserved
- absolute URLs from source are preserved
- non-translatable product/skill terms are preserved
This script checks only pairs that already exist (partial translation is allowed).
"""
from __future__ import annotations
import re
import sys
from dataclasses import dataclass
from pathlib import Path
@dataclass(frozen=True)
class Pair:
    """An immutable (source document, translated document) pairing.

    Instances are created by the pair-collection step and consumed by the
    per-pair checks; both fields are filesystem paths.
    """

    # Document the translation is validated against.
    source: Path
    # Existing translated counterpart of ``source``.
    target: Path
# Product/skill names that must survive translation verbatim.
# _check_pair reports an error for each term that occurs in the source text
# but does not occur (case-sensitive substring match) in the target text.
NON_TRANSLATABLE_TERMS = (
    "ClawSec",
    "OpenClaw",
    "NanoClaw",
    "Hermes",
    "Picoclaw",
    "clawsec-suite",
)
def _extract_fenced_blocks(text: str) -> list[str]:
return re.findall(r"```[^\n]*\n.*?```", text, flags=re.DOTALL)
def _extract_inline_code(text: str) -> list[str]:
return re.findall(r"`([^`\n]+)`", text)
def _extract_absolute_urls(text: str) -> set[str]:
return set(re.findall(r"https?://[^\s)>'\"]+", text))
def _is_technical_inline_token(token: str) -> bool:
checks = (
"/" in token,
token.startswith("./"),
token.startswith("../"),
token.endswith(".md"),
token.endswith(".yml"),
token.endswith(".json"),
token.startswith("npx "),
token.startswith("npm "),
token.startswith("python "),
token.startswith("node "),
"--" in token,
bool(re.search(r"\$[A-Z_][A-Z0-9_]*", token)),
)
return any(checks)
def _collect_pairs(repo_root: Path) -> list[Pair]:
pairs: list[Pair] = []
readme_en = repo_root / "README.md"
for translated_readme in sorted(repo_root.glob("README.*.md")):
if translated_readme.name == "README.md":
continue
if readme_en.exists():
pairs.append(Pair(readme_en, translated_readme))
wiki_root = repo_root / "wiki"
language_dirs = {
p.name
for p in wiki_root.iterdir()
if p.is_dir() and (p / "INDEX.md").exists() and p.name not in {"modules", "i18n", "assets"}
}
for source in wiki_root.rglob("*.md"):
rel = source.relative_to(wiki_root)
rel_parts = rel.parts
if not rel_parts:
continue
# Skip language roots and i18n metadata as source files.
if rel_parts[0] in language_dirs or rel_parts[0] == "i18n":
continue
for lang in sorted(language_dirs):
target = wiki_root / lang / rel
if target.exists():
pairs.append(Pair(source, target))
return sorted(pairs, key=lambda p: str(p.source))
def _extract_command_lines_from_fence(block: str) -> list[str]:
lines = block.splitlines()[1:-1]
cleaned: list[str] = []
for line in lines:
candidate = line.strip()
if not candidate or candidate.startswith("#"):
continue
cleaned.append(candidate)
return cleaned
def _check_pair(pair: Pair) -> tuple[list[str], list[str]]:
    """Run all QA checks on one translation pair.

    Both files are read as UTF-8. The checks, in order:

    1. Code fences. When source and target contain the same number of fenced
       blocks, the command lines of each corresponding block must match
       (error otherwise). When the counts differ the pair is treated as a
       partial translation: every command line in a translated fence must
       occur somewhere in the source text (error otherwise) and a
       "partial translation" warning is recorded.
    2. Technical inline code tokens from the source that do not occur in the
       target text produce a warning.
    3. Absolute URLs from the source that do not occur in the target text
       produce a warning.
    4. Each non-translatable term present in the source but absent from the
       target produces an error.

    Returns:
        ``(errors, warnings)`` as two lists of human-readable messages.
    """
    original = pair.source.read_text(encoding="utf-8")
    translated = pair.target.read_text(encoding="utf-8")

    source_blocks = _extract_fenced_blocks(original)
    target_blocks = _extract_fenced_blocks(translated)
    partial_pair = len(source_blocks) != len(target_blocks)

    errors: list[str] = []
    warnings: list[str] = []

    if not partial_pair:
        # Same number of fences: compare them position by position.
        for idx, (src_block, tgt_block) in enumerate(zip(source_blocks, target_blocks), start=1):
            expected = _extract_command_lines_from_fence(src_block)
            actual = _extract_command_lines_from_fence(tgt_block)
            if expected != actual:
                errors.append(f"code fence #{idx} command lines differ from source")
    else:
        # Allow partial translations, but preserve command lines in translated fences.
        for idx, target_block in enumerate(target_blocks, start=1):
            errors.extend(
                f"translated code fence #{idx} contains command line not found in source: {command_line}"
                for command_line in _extract_command_lines_from_fence(target_block)
                if command_line not in original
            )
        warnings.append(
            f"partial translation detected (code fences source={len(source_blocks)} target={len(target_blocks)})"
        )

    missing_inline = sorted(
        {
            tok
            for tok in _extract_inline_code(original)
            if _is_technical_inline_token(tok) and tok not in translated
        }
    )
    if missing_inline:
        preview = ", ".join(missing_inline[:8])
        extra = f" (+{len(missing_inline) - 8} more)" if len(missing_inline) > 8 else ""
        msg = f"missing inline technical tokens: {preview}{extra}"
        warnings.append(f"{msg} (partial pair)" if partial_pair else msg)

    missing_urls = sorted(
        url for url in _extract_absolute_urls(original) if url not in translated
    )
    if missing_urls:
        preview = ", ".join(missing_urls[:5])
        extra = f" (+{len(missing_urls) - 5} more)" if len(missing_urls) > 5 else ""
        msg = f"missing absolute URLs: {preview}{extra}"
        warnings.append(f"{msg} (partial pair)" if partial_pair else msg)

    errors.extend(
        f"non-translatable term missing: {term}"
        for term in NON_TRANSLATABLE_TERMS
        if term in original and term not in translated
    )

    return errors, warnings
def main() -> int:
    """Check every existing translation pair and print a report.

    The repository root is taken as the third parent of this file's resolved
    path. For each pair, warnings are printed first, followed by either a
    PASS line or a FAIL line with the individual errors.

    Returns:
        ``0`` when there are no pairs or no errors, ``1`` when at least one
        error was reported (suitable as a process exit status).
    """
    repo_root = Path(__file__).resolve().parents[2]
    pairs = _collect_pairs(repo_root)
    if not pairs:
        print("[i18n-qa] No translation pairs found. Nothing to check.")
        return 0

    print(f"[i18n-qa] Checking {len(pairs)} translation pairs...")

    total_errors = 0
    total_warnings = 0
    for pair in pairs:
        rel_source = pair.source.relative_to(repo_root)
        rel_target = pair.target.relative_to(repo_root)
        errors, warnings = _check_pair(pair)

        total_warnings += len(warnings)
        total_errors += len(errors)

        for warn in warnings:
            print(f"WARN {rel_source} -> {rel_target} :: {warn}")

        if not errors:
            print(f"PASS {rel_source} -> {rel_target}")
            continue

        print(f"\nFAIL {rel_source} -> {rel_target}")
        for err in errors:
            print(f" - {err}")

    if not total_errors:
        print(f"\n[i18n-qa] All checks passed with {total_warnings} warning(s).")
        return 0

    print(f"\n[i18n-qa] FAILED with {total_errors} issue(s) and {total_warnings} warning(s).")
    return 1
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())