feat(i18n): add multilingual wiki scaffolding, language switcher, and… (#212)

* feat(i18n): add multilingual wiki scaffolding, language switcher, and translation QA pipeline

* docs(readme): adopt picoclaw-style multilingual link bar

* fix(i18n): repair localized index links and tighten partial-pair QA

* ci(i18n): fail on broken markdown links in README/wiki

* ci(i18n): add changed-files mode for markdown link checks

* i18n(de): use local Argos MT to fill untranslated German sections

* i18n(es,fr): fill untranslated sections via local Argos workflow

* i18n(ja): fill untranslated sections with scoped local Argos pass

* i18n(ko): fill untranslated sections with scoped local Argos pass

* fix(i18n): address review feedback

---------

Co-authored-by: David Abutbul <David.a@prompt.security>
This commit is contained in:
David Abutbul
2026-04-29 09:00:31 +03:00
committed by GitHub
parent 627d20b7e1
commit b37162a33d
105 changed files with 13084 additions and 2 deletions
@@ -0,0 +1,90 @@
#!/usr/bin/env python3
"""Create missing translated wiki pages from English source.
Usage:
python scripts/i18n/bootstrap_language_from_en.py --lang ko
python scripts/i18n/bootstrap_language_from_en.py --lang ko --dry-run
python scripts/i18n/bootstrap_language_from_en.py --lang ko --overwrite
"""
from __future__ import annotations
import argparse
from pathlib import Path
# Top-level directory names under the wiki root that are never treated as
# English source pages when discovering files to scaffold.
SKIP_TOP_LEVEL_DIRS = {"assets", "i18n", "modules"}
def build_header(lang: str, rel_path: str) -> str:
    """Return the HTML comment banner prepended to a scaffolded page.

    Args:
        lang: Language code recorded in the banner's first line.
        rel_path: Path text interpolated after ``../`` on the ``Source:`` line.

    Returns:
        A four-line HTML comment followed by one blank line.
    """
    banner_lines = [
        f"<!-- AUTO-GENERATED TRANSLATION SCAFFOLD ({lang})",
        f"Source: ../{rel_path}",
        "Review status: draft",
        "-->",
    ]
    return "\n".join(banner_lines) + "\n\n"
def discover_source_pages(wiki_root: Path, lang_dirs: set[str]) -> list[Path]:
    """Return every markdown page under *wiki_root* that counts as a source page.

    A page is excluded when the first component of its path relative to
    *wiki_root* is one of ``SKIP_TOP_LEVEL_DIRS`` or one of *lang_dirs*.

    Args:
        wiki_root: Directory searched recursively for ``*.md`` files.
        lang_dirs: Names of top-level directories that hold translations.

    Returns:
        The remaining pages, sorted.
    """

    def _is_source(candidate: Path) -> bool:
        parts = candidate.relative_to(wiki_root).parts
        if not parts:
            return False
        top = parts[0]
        return not (top in SKIP_TOP_LEVEL_DIRS or top in lang_dirs)

    return sorted(filter(_is_source, wiki_root.rglob("*.md")))
def main() -> int:
    """Scaffold missing translated wiki pages for one language.

    Each discovered English source page is copied to ``wiki/<lang>/<same
    relative path>`` with a draft banner prepended. Existing targets are left
    alone unless ``--overwrite`` is given; ``--dry-run`` only reports counts.

    Returns:
        Process exit status (always 0 on normal completion). Invalid
        ``--lang`` values terminate through ``parser.error``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--lang", required=True, help="language code, e.g. ko, es, fr")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--overwrite", action="store_true")
    args = parser.parse_args()

    # The language code becomes a directory name directly under wiki/. Reject
    # values that would escape that directory or collide with a reserved one.
    lang = args.lang
    if (
        not lang
        or lang in {".", ".."}
        or "/" in lang
        or "\\" in lang
        or lang in SKIP_TOP_LEVEL_DIRS
    ):
        parser.error(f"invalid --lang value: {lang!r}")

    repo_root = Path(__file__).resolve().parents[2]
    wiki_root = repo_root / "wiki"
    target_root = wiki_root / lang

    # A directory counts as a language root when it contains INDEX.md.
    lang_dirs = {
        p.name
        for p in wiki_root.iterdir()
        if p.is_dir() and (p / "INDEX.md").exists() and p.name not in SKIP_TOP_LEVEL_DIRS
    }
    # Always exclude the target language itself, even if it has no INDEX.md
    # yet; otherwise a re-run would treat wiki/<lang>/... as English source
    # and nest copies under wiki/<lang>/<lang>/...
    lang_dirs.add(lang)

    source_pages = discover_source_pages(wiki_root, lang_dirs)
    created = 0
    skipped = 0
    overwritten = 0
    for src in source_pages:
        rel = src.relative_to(wiki_root)
        dst = target_root / rel
        existed_before = dst.exists()
        if existed_before and not args.overwrite:
            skipped += 1
            continue
        src_text = src.read_text(encoding="utf-8")
        header = build_header(lang, rel.as_posix())
        out = header + src_text
        if not args.dry_run:
            dst.parent.mkdir(parents=True, exist_ok=True)
            dst.write_text(out, encoding="utf-8")
        # Counters reflect what would happen, so dry-run output is meaningful.
        if existed_before:
            overwritten += 1
        else:
            created += 1

    mode = "DRY-RUN" if args.dry_run else "WRITE"
    print(f"[{mode}] lang={args.lang} source_pages={len(source_pages)} created={created} overwritten={overwritten} skipped={skipped}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,207 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import re
from collections.abc import Iterable
from pathlib import Path
from argostranslate import translate
# Matches a backtick-delimited inline code span on a single line.
RE_INLINE_CODE = re.compile(r"`[^`]*`")
# Matches a markdown link of the form [text](target).
RE_MD_LINK = re.compile(r"\[[^\]]*\]\([^\)]*\)")
# Common English function words. _looks_like_english() counts how many words
# of a line appear in this set and treats two or more hits as "still English".
ENGLISH_HINTS = {
"a",
"an",
"and",
"are",
"as",
"at",
"be",
"before",
"by",
"for",
"from",
"if",
"in",
"is",
"of",
"on",
"or",
"the",
"this",
"to",
"use",
"using",
"when",
"with",
"you",
"your",
}
def _protect_tokens(line: str) -> tuple[str, dict[str, str]]:
    """Replace markdown links and inline code spans with opaque placeholders.

    Links are masked first, then inline code, so each protected span can be
    restored verbatim after translation. Placeholders are named
    ``ZXQTOKEN<n>QXZ`` with ``n`` counting up from 0 in replacement order.

    Returns:
        The masked line and a mapping of placeholder -> original text.
    """
    placeholders: dict[str, str] = {}

    def _stash(match: re.Match[str]) -> str:
        # The next index always equals the number of placeholders issued so far.
        token = f"ZXQTOKEN{len(placeholders)}QXZ"
        placeholders[token] = match.group(0)
        return token

    masked = line
    for pattern in (RE_MD_LINK, RE_INLINE_CODE):
        masked = pattern.sub(_stash, masked)
    return masked, placeholders
def _restore_tokens(line: str, mapping: dict[str, str]) -> str:
out = line
for key, value in mapping.items():
out = out.replace(key, value)
old_style = re.fullmatch(r"__TOK_(\d+)__", key)
if old_style:
idx = old_style.group(1)
out = re.sub(rf"_{{1,2}}TOK_{idx}_{{1,2}}", value, out)
continue
new_style = re.fullmatch(r"ZXQTOKEN(\d+)QXZ", key)
if new_style:
idx = new_style.group(1)
out = re.sub(rf"ZXQTOKEN{idx}\s+QXZ", value, out)
return out
def _should_translate(line: str) -> bool:
s = line.strip()
if not s:
return False
if s.startswith("<!--") or s.endswith("-->"):
return False
return True
def _translate_line(tr, line: str) -> str:
    """Translate one line while keeping links and inline code untouched.

    Args:
        tr: Object exposing ``translate(str) -> str``.
        line: The line to translate.

    Returns:
        The translated line with protected spans restored.
    """
    masked, tokens = _protect_tokens(line)
    return _restore_tokens(tr.translate(masked), tokens)
def _normalize_line(line: str) -> str:
return re.sub(r"\s+", " ", line.strip())
def _looks_like_english(line: str) -> bool:
    """Heuristically decide whether *line* still contains English text.

    The line is lower-cased and split into ASCII-letter words; it counts as
    English when at least two of those words appear in ``ENGLISH_HINTS``.
    """
    hits = 0
    for word in re.findall(r"[A-Za-z]+", line.lower()):
        if word in ENGLISH_HINTS:
            hits += 1
            if hits >= 2:
                return True
    return False
def _should_process_target_line(target_line: str, source_lines: set[str]) -> bool:
    """Return True when a target line still needs machine translation.

    A line qualifies if its normalized form is identical to a normalized
    source line, or if it looks like English.
    """
    if _normalize_line(target_line) in source_lines:
        return True
    return _looks_like_english(target_line)
def _process_pair(source: Path, target: Path, tr) -> int:
    """Translate the still-English lines of *target* in place.

    Lines inside fenced code blocks and HTML comments are never touched.
    Other lines are translated only when they are unchanged from *source* or
    visibly retain English fragments.

    Args:
        source: English markdown file used as the reference.
        target: Translated markdown file that is rewritten when lines change.
        tr: Object exposing ``translate(str) -> str``.

    Returns:
        Number of lines that were replaced. The file is rewritten (with a
        trailing newline) only when this is non-zero.
    """
    src_lines = source.read_text(encoding="utf-8").splitlines()
    src_set = {_normalize_line(line) for line in src_lines if line.strip()}
    tgt_lines = target.read_text(encoding="utf-8").splitlines()
    changed = 0
    in_code = False
    in_comment = False
    for i, tgt in enumerate(tgt_lines):
        stripped = tgt.strip()
        if in_comment:
            # Interior lines of a multi-line HTML comment (such as an
            # auto-generated scaffold banner) must be left verbatim.
            if "-->" in stripped:
                in_comment = False
            continue
        if stripped.startswith("```"):
            in_code = not in_code
            continue
        if in_code:
            continue
        if stripped.startswith("<!--") and "-->" not in stripped:
            # Comment opens here and closes on a later line.
            in_comment = True
            continue
        if not _should_translate(tgt):
            continue
        # Only fill lines that are still unchanged or visibly retain English fragments.
        if not _should_process_target_line(tgt, src_set):
            continue
        new = _translate_line(tr, tgt)
        if new and new != tgt:
            tgt_lines[i] = new
            changed += 1
    if changed:
        target.write_text("\n".join(tgt_lines) + "\n", encoding="utf-8")
    return changed
def _normalize_only(values: Iterable[str] | None) -> set[str]:
normalized: set[str] = set()
for value in values or []:
item = value.strip().replace("\\", "/")
if not item:
continue
normalized.add(item)
normalized.add(Path(item).name)
return normalized
def _matches_only(path: Path, repo: Path, only: set[str]) -> bool:
if not only:
return True
rel = path.relative_to(repo).as_posix()
candidates = {path.name, rel}
return bool(candidates & only)
def main() -> int:
    """Fill untranslated lines in README and wiki pages for one language.

    The README pair and every ``wiki/<lang>/*.md`` file that has a same-named
    English page directly under ``wiki/`` are processed, optionally restricted
    by ``--only``. ``INDEX.md`` and ``GENERATION.md`` are never processed.

    Returns:
        Process exit status (0 on success).

    Raises:
        SystemExit: When the Argos ``en -> <lang>`` model is unavailable.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--lang", required=True, choices=["de", "es", "fr", "ja", "ko"])
    parser.add_argument(
        "--only",
        nargs="*",
        default=None,
        help="Optional list of target markdown filenames to process (e.g. README.ja.md overview.md security.md)",
    )
    args = parser.parse_args()
    repo = Path(__file__).resolve().parents[2]
    tr = translate.get_translation_from_codes("en", args.lang)
    if tr is None:
        raise SystemExit(f"Missing Argos en->{args.lang} model. Install first.")
    total = 0
    only = _normalize_only(args.only)

    # README
    readme_source = repo / "README.md"
    readme_target = repo / f"README.{args.lang}.md"
    # Skip the README pair when either side is missing, the same way wiki
    # pages are skipped below, instead of crashing inside read_text().
    if (
        _matches_only(readme_target, repo, only)
        and readme_source.exists()
        and readme_target.exists()
    ):
        total += _process_pair(readme_source, readme_target, tr)

    # wiki/<lang>
    lang_root = repo / "wiki" / args.lang
    for lang_file in sorted(lang_root.glob("*.md")):
        if not _matches_only(lang_file, repo, only):
            continue
        if lang_file.name in {"INDEX.md", "GENERATION.md"}:
            continue
        src = repo / "wiki" / lang_file.name
        if src.exists():
            total += _process_pair(src, lang_file, tr)

    print(f"Updated translated lines for {args.lang}: {total}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
+98
View File
@@ -0,0 +1,98 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import re
import subprocess
from pathlib import Path
# Repository root: two directory levels above the directory holding this script.
ROOT = Path(__file__).resolve().parents[2]
# Matches [text](destination); group 1 is everything between the parentheses.
MARKDOWN_LINK_RE = re.compile(r"\[[^\]]+\]\(([^)]+)\)")
def _all_docs() -> set[Path]:
    """Return every README*.md at the repo root plus all markdown under wiki/.

    Only regular files are included.
    """
    candidates = [*ROOT.glob("README*.md"), *ROOT.glob("wiki/**/*.md")]
    return {doc for doc in candidates if doc.is_file()}
def _changed_docs(base_ref: str) -> set[Path]:
    """Return checked docs that differ between *base_ref* and HEAD.

    Runs ``git diff --name-only <base_ref>...HEAD`` in the repo root and keeps
    the ``.md`` paths that belong to the set returned by ``_all_docs()``.

    Args:
        base_ref: Git ref to diff against.

    Returns:
        Resolved paths of changed, still-existing docs.

    Raises:
        subprocess.CalledProcessError: When the git command fails.
    """
    cmd = ["git", "diff", "--name-only", f"{base_ref}...HEAD"]
    output = subprocess.check_output(cmd, cwd=ROOT, text=True)
    # Glob the tree once rather than once per changed path.
    known_docs = _all_docs()
    docs: set[Path] = set()
    for rel in output.splitlines():
        if not rel.endswith(".md"):
            continue
        p = (ROOT / rel).resolve()
        if p in known_docs and p.exists():
            docs.add(p)
    return docs
def _should_skip(link: str) -> bool:
return (
not link
or link.startswith("#")
or "://" in link
or link.startswith("mailto:")
or link.startswith("tel:")
)
def _resolve_target(doc: Path, link: str) -> Path:
clean = link.split("#", 1)[0].strip()
return (doc.parent / clean).resolve()
def _select_docs(changed_only: bool, base_ref: str) -> list[Path]:
    """Choose which docs to check.

    Args:
        changed_only: When True, restrict to docs changed against *base_ref*.
        base_ref: Git ref used for the changed-files comparison.

    Returns:
        A sorted list of docs. If changed-only detection fails for any reason
        a warning is printed and the full set is returned. If it succeeds but
        finds nothing, the list is empty.
    """
    full_scan = sorted(_all_docs())
    if not changed_only:
        return full_scan
    try:
        changed = _changed_docs(base_ref)
    except Exception as exc:  # noqa: BLE001
        print(f"[link-check] WARN: changed-only mode failed ({exc}); falling back to full scan")
        return full_scan
    if changed:
        return sorted(changed)
    print("[link-check] No changed markdown docs detected; nothing to check.")
    return []
def main() -> int:
    """Check local markdown links and report the broken ones.

    Returns:
        1 when at least one local link target does not exist, otherwise 0.
    """
    parser = argparse.ArgumentParser(description="Check local markdown links in README/wiki docs")
    parser.add_argument("--changed-only", action="store_true", help="Check only changed markdown docs against a base ref")
    parser.add_argument("--base-ref", default="origin/main", help="Base ref for --changed-only (default: origin/main)")
    args = parser.parse_args()

    failures: list[str] = []
    for doc in _select_docs(args.changed_only, args.base_ref):
        rel_doc = doc.relative_to(ROOT)
        text = doc.read_text(encoding="utf-8")
        links = (found.group(1).strip() for found in MARKDOWN_LINK_RE.finditer(text))
        failures.extend(
            f"{rel_doc}: broken link -> {link}"
            for link in links
            if not _should_skip(link) and not _resolve_target(doc, link).exists()
        )

    if not failures:
        scope = "changed docs" if args.changed_only else "all docs"
        print(f"[link-check] PASS: no broken local markdown links found ({scope}).")
        return 0

    print(f"[link-check] FAILED: {len(failures)} broken link(s) found.")
    for item in failures:
        print(f" - {item}")
    return 1


if __name__ == "__main__":
    raise SystemExit(main())
+208
View File
@@ -0,0 +1,208 @@
#!/usr/bin/env python3
"""Translation QA checks for ClawSec docs.
Validates markdown translation pairs with a focus on technical integrity:
- fenced code blocks are preserved exactly
- key inline technical tokens are preserved
- absolute URLs from source are preserved
- non-translatable product/skill terms are preserved
This script checks only pairs that already exist (partial translation is allowed).
"""
from __future__ import annotations
import re
import sys
from dataclasses import dataclass
from pathlib import Path
@dataclass(frozen=True)
class Pair:
    """An immutable (source, target) pair of markdown files to compare."""

    # Reference document the translation is checked against.
    source: Path
    # Translated document being validated.
    target: Path
# Product and skill names that must appear verbatim in a translation whenever
# they appear in the source; a missing term is reported as an error.
NON_TRANSLATABLE_TERMS = (
"ClawSec",
"OpenClaw",
"NanoClaw",
"Hermes",
"Picoclaw",
"clawsec-suite",
)
def _extract_fenced_blocks(text: str) -> list[str]:
return re.findall(r"```[^\n]*\n.*?```", text, flags=re.DOTALL)
def _extract_inline_code(text: str) -> list[str]:
return re.findall(r"`([^`\n]+)`", text)
def _extract_absolute_urls(text: str) -> set[str]:
return set(re.findall(r"https?://[^\s)>'\"]+", text))
def _is_technical_inline_token(token: str) -> bool:
checks = (
"/" in token,
token.startswith("./"),
token.startswith("../"),
token.endswith(".md"),
token.endswith(".yml"),
token.endswith(".json"),
token.startswith("npx "),
token.startswith("npm "),
token.startswith("python "),
token.startswith("node "),
"--" in token,
bool(re.search(r"\$[A-Z_][A-Z0-9_]*", token)),
)
return any(checks)
def _collect_pairs(repo_root: Path) -> list[Pair]:
pairs: list[Pair] = []
readme_en = repo_root / "README.md"
for translated_readme in sorted(repo_root.glob("README.*.md")):
if translated_readme.name == "README.md":
continue
if readme_en.exists():
pairs.append(Pair(readme_en, translated_readme))
wiki_root = repo_root / "wiki"
language_dirs = {
p.name
for p in wiki_root.iterdir()
if p.is_dir() and (p / "INDEX.md").exists() and p.name not in {"modules", "i18n", "assets"}
}
for source in wiki_root.rglob("*.md"):
rel = source.relative_to(wiki_root)
rel_parts = rel.parts
if not rel_parts:
continue
# Skip language roots and i18n metadata as source files.
if rel_parts[0] in language_dirs or rel_parts[0] == "i18n":
continue
for lang in sorted(language_dirs):
target = wiki_root / lang / rel
if target.exists():
pairs.append(Pair(source, target))
return sorted(pairs, key=lambda p: str(p.source))
def _extract_command_lines_from_fence(block: str) -> list[str]:
lines = block.splitlines()[1:-1]
cleaned: list[str] = []
for line in lines:
candidate = line.strip()
if not candidate or candidate.startswith("#"):
continue
cleaned.append(candidate)
return cleaned
def _check_pair(pair: Pair) -> tuple[list[str], list[str]]:
    """Run all translation-integrity checks on one source/target pair.

    Checks, in order:

    1. Fenced code blocks. With equal block counts, the command lines of each
       block must match the source (error otherwise). With differing counts
       the pair is treated as partial: each command line in a target fence
       must occur somewhere in the source text (error otherwise), and a
       warning records the partial state.
    2. Technical inline code tokens missing from the target (warning).
    3. Absolute URLs missing from the target (warning).
    4. Non-translatable terms present in the source but not the target (error).

    Returns:
        ``(errors, warnings)`` as lists of human-readable messages.
    """

    def _preview(items: list[str], limit: int) -> str:
        # Show at most `limit` items and summarize the remainder.
        overflow = len(items) - limit
        tail = f" (+{overflow} more)" if overflow > 0 else ""
        return ", ".join(items[:limit]) + tail

    errors: list[str] = []
    warnings: list[str] = []
    source_text = pair.source.read_text(encoding="utf-8")
    target_text = pair.target.read_text(encoding="utf-8")
    source_blocks = _extract_fenced_blocks(source_text)
    target_blocks = _extract_fenced_blocks(target_text)
    partial_pair = len(source_blocks) != len(target_blocks)
    partial_note = " (partial pair)" if partial_pair else ""

    if not partial_pair:
        for idx, (src_block, tgt_block) in enumerate(zip(source_blocks, target_blocks), start=1):
            if _extract_command_lines_from_fence(src_block) != _extract_command_lines_from_fence(tgt_block):
                errors.append(f"code fence #{idx} command lines differ from source")
    else:
        # Allow partial translations, but preserve command lines in translated fences.
        for idx, target_block in enumerate(target_blocks, start=1):
            errors.extend(
                f"translated code fence #{idx} contains command line not found in source: {command_line}"
                for command_line in _extract_command_lines_from_fence(target_block)
                if command_line not in source_text
            )
        warnings.append(
            f"partial translation detected (code fences source={len(source_blocks)} target={len(target_blocks)})"
        )

    technical_tokens = {tok for tok in _extract_inline_code(source_text) if _is_technical_inline_token(tok)}
    missing_inline = sorted(tok for tok in technical_tokens if tok not in target_text)
    if missing_inline:
        msg = f"missing inline technical tokens: {_preview(missing_inline, 8)}"
        warnings.append(msg + partial_note)

    missing_urls = sorted(url for url in _extract_absolute_urls(source_text) if url not in target_text)
    if missing_urls:
        msg = f"missing absolute URLs: {_preview(missing_urls, 5)}"
        warnings.append(msg + partial_note)

    errors.extend(
        f"non-translatable term missing: {term}"
        for term in NON_TRANSLATABLE_TERMS
        if term in source_text and term not in target_text
    )
    return errors, warnings
def main() -> int:
    """Run translation QA over every existing pair and print a report.

    Returns:
        1 when any pair produced an error, otherwise 0 (warnings do not fail).
    """
    repo_root = Path(__file__).resolve().parents[2]
    pairs = _collect_pairs(repo_root)
    if not pairs:
        print("[i18n-qa] No translation pairs found. Nothing to check.")
        return 0

    print(f"[i18n-qa] Checking {len(pairs)} translation pairs...")
    error_total = 0
    warning_total = 0
    for pair in pairs:
        label = f"{pair.source.relative_to(repo_root)} -> {pair.target.relative_to(repo_root)}"
        errors, warnings = _check_pair(pair)
        warning_total += len(warnings)
        for warn in warnings:
            print(f"WARN {label} :: {warn}")
        if not errors:
            print(f"PASS {label}")
            continue
        error_total += len(errors)
        print(f"\nFAIL {label}")
        for err in errors:
            print(f" - {err}")

    if error_total:
        print(f"\n[i18n-qa] FAILED with {error_total} issue(s) and {warning_total} warning(s).")
        return 1
    print(f"\n[i18n-qa] All checks passed with {warning_total} warning(s).")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,88 @@
from __future__ import annotations
import importlib.util
import sys
import tempfile
import types
import unittest
from pathlib import Path
def _load_module():
    """Import ``fill_missing_translations_argos.py`` from this file's directory.

    A stub ``argostranslate`` module is registered first (unless one is
    already present in ``sys.modules``) so the script's import succeeds.

    Raises:
        RuntimeError: When no import spec or loader can be built for the file.
    """
    stub = types.SimpleNamespace(
        translate=types.SimpleNamespace(get_translation_from_codes=lambda *_args: None)
    )
    sys.modules.setdefault("argostranslate", stub)

    module_path = Path(__file__).with_name("fill_missing_translations_argos.py")
    spec = importlib.util.spec_from_file_location("fill_missing_translations_argos", module_path)
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Unable to load {module_path}")
    loaded = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(loaded)
    return loaded
class FakeTranslator:
    """Deterministic stand-in translator that swaps a few fixed phrases."""

    # (needle, replacement) pairs, applied in this order.
    _REPLACEMENTS = (
        ("Brought to you von", "Bereitgestellt von"),
        ("the Platform of AI Security", "die Plattform fuer KI-Sicherheit"),
        ("requires WSL or Git Bash", "erfordern WSL oder Git Bash"),
    )

    def translate(self, line: str) -> str:
        """Return *line* with each known phrase replaced by its translation."""
        result = line
        for needle, replacement in self._REPLACEMENTS:
            result = result.replace(needle, replacement)
        return result
class FillMissingTranslationsArgosTests(unittest.TestCase):
    """Unit tests for the Argos fill-missing-translations helpers."""

    @classmethod
    def setUpClass(cls) -> None:
        # Load the script once; it is imported by file path, not as a package.
        cls.module = _load_module()

    def test_restore_tokens_handles_placeholder_variants(self) -> None:
        """Placeholders damaged by the translator are still restored."""
        mapping = {
            "__TOK_0__": "`bash`",
            "__TOK_1__": "[Prompt Security](https://prompt.security)",
            "ZXQTOKEN2QXZ": "`node`",
        }
        restored = self.module._restore_tokens(
            "Use __TOK_0_, _TOK_1__, and ZXQTOKEN2 QXZ before running.",
            mapping,
        )
        self.assertEqual(
            restored,
            "Use `bash`, [Prompt Security](https://prompt.security), and `node` before running.",
        )

    def test_process_pair_translates_lines_that_still_contain_english_fragments(self) -> None:
        """A half-translated line is re-translated even though it differs from source."""
        with tempfile.TemporaryDirectory() as tmpdir:
            root = Path(tmpdir)
            source = root / "source.md"
            target = root / "target.md"
            source.write_text(
                '<h4>Brought to you by <a href="https://prompt.security">Prompt Security</a>, '
                "the Platform of AI Security</h4>\n",
                encoding="utf-8",
            )
            # Fixture uses a well-formed closing tag (was the malformed ">/h4>").
            target.write_text(
                '<h4>Brought to you von <a href="https://prompt.security">Prompt Security</a>, '
                "the Platform of AI Security</h4>\n",
                encoding="utf-8",
            )
            changed = self.module._process_pair(source, target, FakeTranslator())
            self.assertEqual(changed, 1)
            self.assertIn("Bereitgestellt von", target.read_text(encoding="utf-8"))
            self.assertIn("die Plattform fuer KI-Sicherheit", target.read_text(encoding="utf-8"))

    def test_only_matching_accepts_repo_relative_wiki_targets(self) -> None:
        """``--only`` entries match by repo-relative path or by bare filename."""
        repo = Path("/repo")
        target = repo / "wiki" / "ja" / "overview.md"
        # If the helper is missing, the fallback returns False so the
        # assertions below fail instead of raising AttributeError.
        matches_only = getattr(self.module, "_matches_only", lambda *_args: False)
        self.assertTrue(matches_only(target, repo, {"wiki/ja/overview.md"}))
        self.assertTrue(matches_only(target, repo, {"overview.md"}))
        self.assertFalse(matches_only(target, repo, {"wiki/ja/security.md"}))


if __name__ == "__main__":
    unittest.main()
+41
View File
@@ -0,0 +1,41 @@
from __future__ import annotations
import importlib.util
import sys
import tempfile
import unittest
from pathlib import Path
def _load_module():
    """Import ``qa_check.py`` from this file's directory and return the module.

    The module is registered in ``sys.modules`` under its spec name before it
    is executed.

    Raises:
        RuntimeError: When no import spec or loader can be built for the file.
    """
    script = Path(__file__).with_name("qa_check.py")
    spec = importlib.util.spec_from_file_location("qa_check", script)
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Unable to load {script}")
    loaded = importlib.util.module_from_spec(spec)
    sys.modules[spec.name] = loaded
    spec.loader.exec_module(loaded)
    return loaded
class QaCheckTests(unittest.TestCase):
    """Unit tests for the translation QA checker."""

    @classmethod
    def setUpClass(cls) -> None:
        # Load the script once; it is imported by file path, not as a package.
        cls.module = _load_module()

    def test_partial_pairs_still_fail_when_non_translatable_terms_are_missing(self) -> None:
        """A partial pair warns about fences but still errors on a dropped term."""
        with tempfile.TemporaryDirectory() as workdir:
            base = Path(workdir)
            english = base / "source.md"
            translated = base / "target.md"
            english.write_text("ClawSec keeps this term.\n```sh\nnpm run build\n```\n", encoding="utf-8")
            translated.write_text("Translated text without the product term.\n", encoding="utf-8")
            pair = self.module.Pair(english, translated)
            errors, warnings = self.module._check_pair(pair)
        self.assertIn("non-translatable term missing: ClawSec", errors)
        self.assertTrue(any("partial translation detected" in warning for warning in warnings))


if __name__ == "__main__":
    unittest.main()