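"""Build a five-letter German word list from OpenThesaurus and wordfreq.

Extracts five-letter words from an OpenThesaurus plain-text dump (if
present), merges them with five-letter words from wordfreq's German
frequency list, and writes the merged list plus a per-word source map
under data/.
"""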
from __future__ import annotations

import json
import re
from pathlib import Path

try:
    # Optional: wordfreq as an additional word source
    from wordfreq import top_n_list  # type: ignore
except Exception:  # pragma: no cover
    top_n_list = None  # type: ignore

# Exactly five lowercase letters, including the German umlauts and ß.
ALLOWED_FIVE_LETTER = re.compile(r"^[a-zäöüß]{5}$")

def is_valid_five_letter_word(word: str) -> bool:
    return bool(ALLOWED_FIVE_LETTER.match(word))

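# Illustrative examples, derived from the regex above:
#   is_valid_five_letter_word("äpfel")  -> True
#   is_valid_five_letter_word("Apfel")  -> False  (uppercase; clean_token lowercases first)
#   is_valid_five_letter_word("müßig")  -> True   (ß counts as a letter)
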
def clean_token(token: str) -> str:
    # Remove parenthesised qualifiers such as (ugs.), (fachspr.), etc.
    no_paren = re.sub(r"\([^)]*\)", "", token)
    # Remove ellipsis characters (prefix/suffix markers)
    no_ellipsis = no_paren.replace("...", "").replace("…", "")
    # Keep letters only (including äöüß)
    letters_only = re.sub(r"[^A-Za-zÄÖÜäöüß]", "", no_ellipsis)
    return letters_only.strip().lower()

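# Illustrative examples, derived from the cleaning rules above:
#   clean_token("Tisch (ugs.)")  -> "tisch"
#   clean_token("Hallo!")        -> "hallo"
#   clean_token("ver...")        -> "ver"
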
def extract_from_openthesaurus(path: Path) -> list[str]:
    words: set[str] = set()
    with path.open("r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            parts = line.split(";")
            for part in parts:
                # Skip entries with ellipses (they are not complete words)
                if "..." in part or "…" in part:
                    continue
                token = clean_token(part)
                if is_valid_five_letter_word(token):
                    words.add(token)
    return sorted(words)

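# The parser above expects the OpenThesaurus plain-text export: one synonym
# group per line, terms separated by semicolons, '#' marking comment lines.
# Illustrative line (not from the original file):
#   Klapprechner;Laptop (ugs.);Notebook
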
def extract_from_wordfreq(limit: int = 500_000) -> list[str]:
    if top_n_list is None:
        return []
    candidates = top_n_list("de", limit)
    words = {clean_token(w) for w in candidates}
    return sorted(w for w in words if is_valid_five_letter_word(w))

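# Note: wordfreq's top_n_list(lang, n) returns the n most frequent tokens for
# the language, already lowercased; clean_token is still applied to strip any
# stray punctuation before the five-letter filter runs.
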
def main() -> None:
    root = Path(__file__).resolve().parents[1]
    source_ot = root / "data" / "openthesaurus.txt"
    out_txt = root / "data" / "words_de_5.txt"
    out_json = root / "data" / "words_de_5_sources.json"
    out_txt.parent.mkdir(parents=True, exist_ok=True)

    ot_words: list[str] = []
    if source_ot.exists():
        ot_words = extract_from_openthesaurus(source_ot)

    wf_words = extract_from_wordfreq()

    ot_set = set(ot_words)
    wf_set = set(wf_words)

    merged = sorted(ot_set | wf_set)

    # Write the plain-text word list
    with out_txt.open("w", encoding="utf-8") as f:
        for w in merged:
            f.write(w + "\n")

    # Write the word-to-sources map
    sources_map: dict[str, list[str]] = {}
    for w in merged:
        srcs: list[str] = []
        if w in ot_set:
            srcs.append("ot")
        if w in wf_set:
            srcs.append("wf")
        sources_map[w] = srcs

    with out_json.open("w", encoding="utf-8") as jf:
        json.dump(sources_map, jf, ensure_ascii=False)

    print(
        " | ".join(
            [
                f"OpenThesaurus: {len(ot_set)}",
                f"wordfreq: {len(wf_set)}",
                f"total (deduplicated): {len(merged)}",
                f"→ {out_txt} / {out_json}",
            ]
        )
    )

if __name__ == "__main__":
    main()
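# Usage sketch (the script name is hypothetical; the code resolves the project
# root as the parent of the script's own directory, e.g. scripts/):
#
#   python scripts/build_wordlist.py
#
# Output: data/words_de_5.txt (one word per line) and
# data/words_de_5_sources.json (word -> ["ot"], ["wf"], or ["ot", "wf"]).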