Files
clawsec/scripts/i18n/fill_missing_translations_argos.py
T
David Abutbul b37162a33d feat(i18n): add multilingual wiki scaffolding, language switcher, and… (#212)
* feat(i18n): add multilingual wiki scaffolding, language switcher, and translation QA pipeline

* docs(readme): adopt picoclaw-style multilingual link bar

* fix(i18n): repair localized index links and tighten partial-pair QA

* ci(i18n): fail on broken markdown links in README/wiki

* ci(i18n): add changed-files mode for markdown link checks

* i18n(de): use local Argos MT to fill untranslated German sections

* i18n(es,fr): fill untranslated sections via local Argos workflow

* i18n(ja): fill untranslated sections with scoped local Argos pass

* i18n(ko): fill untranslated sections with scoped local Argos pass

* fix(i18n): address review feedback

---------

Co-authored-by: David Abutbul <David.a@prompt.security>
2026-04-29 09:00:31 +03:00

208 lines
5.3 KiB
Python

#!/usr/bin/env python3
from __future__ import annotations
import argparse
import re
from collections.abc import Iterable
from pathlib import Path
from argostranslate import translate
# Inline code spans delimited by single backticks, e.g. `pip install x`.
RE_INLINE_CODE = re.compile(r"`[^`]*`")

# Markdown links of the form [text](target); the whole link is matched,
# link text included.
RE_MD_LINK = re.compile(r"\[[^\]]*\]\([^\)]*\)")

# Lower-case English function words. A line containing at least two of
# these (counted per occurrence) is treated as still being English.
ENGLISH_HINTS = {
    "a", "an", "and", "are", "as", "at",
    "be", "before", "by",
    "for", "from",
    "if", "in", "is",
    "of", "on", "or",
    "the", "this", "to",
    "use", "using",
    "when", "with",
    "you", "your",
}
def _protect_tokens(line: str) -> tuple[str, dict[str, str]]:
    """Swap markdown links and inline code spans for opaque placeholders.

    Links are masked first, then inline code spans, so a code span that
    wraps an already-masked link is stored with the link's placeholder
    inside it.

    Args:
        line: A single line of markdown text.

    Returns:
        A ``(masked_line, placeholders)`` pair, where ``placeholders`` maps
        each ``ZXQTOKEN<n>QXZ`` key to the exact text it replaced.
    """
    placeholders: dict[str, str] = {}

    def _stash(match: re.Match[str]) -> str:
        # Every key is unique, so the mapping size doubles as the running
        # counter for the next placeholder number.
        token = f"ZXQTOKEN{len(placeholders)}QXZ"
        placeholders[token] = match.group(0)
        return token

    masked = line
    for pattern in (RE_MD_LINK, RE_INLINE_CODE):
        masked = pattern.sub(_stash, masked)
    return masked, placeholders
def _restore_tokens(line: str, mapping: dict[str, str]) -> str:
out = line
for key, value in mapping.items():
out = out.replace(key, value)
old_style = re.fullmatch(r"__TOK_(\d+)__", key)
if old_style:
idx = old_style.group(1)
out = re.sub(rf"_{{1,2}}TOK_{idx}_{{1,2}}", value, out)
continue
new_style = re.fullmatch(r"ZXQTOKEN(\d+)QXZ", key)
if new_style:
idx = new_style.group(1)
out = re.sub(rf"ZXQTOKEN{idx}\s+QXZ", value, out)
return out
def _should_translate(line: str) -> bool:
s = line.strip()
if not s:
return False
if s.startswith("<!--") or s.endswith("-->"):
return False
return True
def _translate_line(tr, line: str) -> str:
    """Translate one line while keeping links and inline code intact.

    Args:
        tr: Translator object; only its ``translate(text)`` method is used.
        line: The line to translate.

    Returns:
        The translated line with the masked spans restored.
    """
    masked, placeholders = _protect_tokens(line)
    return _restore_tokens(tr.translate(masked), placeholders)
def _normalize_line(line: str) -> str:
return re.sub(r"\s+", " ", line.strip())
def _looks_like_english(line: str) -> bool:
    """Heuristically detect leftover English in a line.

    The line is lower-cased and split into runs of ASCII letters; each run
    found in ``ENGLISH_HINTS`` counts as one hit (repeats count again).

    Returns:
        ``True`` once two hits have been seen, ``False`` otherwise.
    """
    hits = 0
    for token in re.findall(r"[A-Za-z]+", line.lower()):
        if token not in ENGLISH_HINTS:
            continue
        hits += 1
        if hits == 2:
            return True
    return False
def _should_process_target_line(target_line: str, source_lines: set[str]) -> bool:
    """Tell whether a target line still needs a translation pass.

    Args:
        target_line: A line from the localized file.
        source_lines: Whitespace-normalized lines of the English source.

    Returns:
        ``True`` when the normalized line is identical to a source line, or
        when the line looks like English; ``False`` otherwise.
    """
    if _normalize_line(target_line) in source_lines:
        return True
    return _looks_like_english(target_line)
def _process_pair(source: Path, target: Path, tr) -> int:
    """Translate the still-English lines of ``target`` in place.

    A target line is translated only when it is outside fenced code, is
    either identical (after whitespace normalization) to a line of
    ``source`` or looks like English, and passes ``_should_translate``.

    Args:
        source: English source file.
        target: Localized file to update.
        tr: Translator object exposing ``translate(text)``.

    Returns:
        Number of lines replaced. ``target`` is rewritten (UTF-8, lines
        joined with ``"\\n"`` plus a final newline) only when this is
        non-zero.
    """
    src_lines = source.read_text(encoding="utf-8").splitlines()
    src_set = {_normalize_line(line) for line in src_lines if line.strip()}
    tgt_lines = target.read_text(encoding="utf-8").splitlines()
    changed = 0
    # Marker ("```" or "~~~") of the fenced code block currently open, or
    # None outside code. Remembering the marker means a tilde fence is
    # honoured and a fence of the other kind inside it does not end the
    # block.
    open_fence: str | None = None
    for i, tgt in enumerate(tgt_lines):
        stripped = tgt.strip()
        if open_fence is not None:
            if stripped.startswith(open_fence):
                open_fence = None
            continue
        if stripped.startswith(("```", "~~~")):
            open_fence = stripped[:3]
            continue
        # Only fill lines that are still unchanged or visibly retain English fragments.
        if not _should_process_target_line(tgt, src_set):
            continue
        if not _should_translate(tgt):
            continue
        new = _translate_line(tr, tgt)
        # An empty result is ignored rather than blanking the line.
        if new and new != tgt:
            tgt_lines[i] = new
            changed += 1
    if changed:
        target.write_text("\n".join(tgt_lines) + "\n", encoding="utf-8")
    return changed
def _normalize_only(values: Iterable[str] | None) -> set[str]:
normalized: set[str] = set()
for value in values or []:
item = value.strip().replace("\\", "/")
if not item:
continue
normalized.add(item)
normalized.add(Path(item).name)
return normalized
def _matches_only(path: Path, repo: Path, only: set[str]) -> bool:
if not only:
return True
rel = path.relative_to(repo).as_posix()
candidates = {path.name, rel}
return bool(candidates & only)
def main() -> int:
    """Fill untranslated lines in one language's README and wiki pages.

    Command line:
        --lang  Target language code (required): de, es, fr, ja or ko.
        --only  Optional target markdown filenames or repo-relative paths
                restricting which files are processed.

    Returns:
        ``0`` on completion.

    Raises:
        SystemExit: When no Argos en-><lang> translation is available.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--lang", required=True, choices=["de", "es", "fr", "ja", "ko"])
    parser.add_argument(
        "--only",
        nargs="*",
        default=None,
        help="Optional list of target markdown filenames to process (e.g. README.ja.md overview.md security.md)",
    )
    args = parser.parse_args()

    # Repository root: two levels above the directory holding this script.
    repo = Path(__file__).resolve().parents[2]

    missing_model = f"Missing Argos en->{args.lang} model. Install first."
    try:
        tr = translate.get_translation_from_codes("en", args.lang)
    except AttributeError:
        # NOTE(review): when a language package is not installed the lookup
        # inside argostranslate appears to hit a None object and raise
        # AttributeError instead of returning None -- confirm against the
        # pinned argostranslate version. Either way the user gets the same
        # actionable message rather than a traceback.
        raise SystemExit(missing_model) from None
    if tr is None:
        raise SystemExit(missing_model)

    total = 0
    only = _normalize_only(args.only)

    # README
    readme_target = repo / f"README.{args.lang}.md"
    if _matches_only(readme_target, repo, only):
        total += _process_pair(repo / "README.md", readme_target, tr)

    # wiki/<lang>
    lang_root = repo / "wiki" / args.lang
    for lang_file in sorted(lang_root.glob("*.md")):
        if not _matches_only(lang_file, repo, only):
            continue
        # These two pages are never machine-filled.
        if lang_file.name in {"INDEX.md", "GENERATION.md"}:
            continue
        # The English counterpart lives directly under wiki/ with the same
        # name; localized pages without one are skipped.
        src = repo / "wiki" / lang_file.name
        if src.exists():
            total += _process_pair(src, lang_file, tr)

    print(f"Updated translated lines for {args.lang}: {total}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())