"""Build a list of German five-letter words.

Prefers an OpenThesaurus plain-text dump; falls back to the wordfreq
frequency list when the dump is not present.
"""

from __future__ import annotations

import re
from pathlib import Path

try:  # optional dependency, used only as a fallback source
    from wordfreq import top_n_list  # type: ignore
except Exception:  # pragma: no cover
    top_n_list = None  # type: ignore

# Exactly five lowercase letters, including German umlauts and sharp s.
ALLOWED_FIVE_LETTER = re.compile(r"^[a-zäöüß]{5}$")


def is_valid_five_letter_word(word: str) -> bool:
    return bool(ALLOWED_FIVE_LETTER.match(word))


def clean_token(token: str) -> str:
    # Remove parenthesized qualifiers such as (ugs.), (fachspr.), etc.
    no_paren = re.sub(r"\([^)]*\)", "", token)
    # Remove ellipses (prefix/suffix markers in OpenThesaurus entries).
    no_ellipsis = no_paren.replace("...", "").replace("…", "")
    # Keep letters only (including äöüß).
    letters_only = re.sub(r"[^A-Za-zÄÖÜäöüß]", "", no_ellipsis)
    return letters_only.strip().lower()


def extract_from_openthesaurus(path: Path) -> list[str]:
    words: set[str] = set()
    with path.open("r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            for part in line.split(";"):
                # Skip entries containing ellipses (they are not complete words).
                if "..." in part or "…" in part:
                    continue
                token = clean_token(part)
                if is_valid_five_letter_word(token):
                    words.add(token)
    return sorted(words)


def extract_from_wordfreq(limit: int = 500_000) -> list[str]:
    if top_n_list is None:
        return []
    candidates = top_n_list("de", limit)
    words = {clean_token(w) for w in candidates}
    return sorted(w for w in words if is_valid_five_letter_word(w))


def main() -> None:
    root = Path(__file__).resolve().parents[1]
    source_ot = root / "data" / "openthesaurus.txt"
    out_path = root / "data" / "words_de_5.txt"
    out_path.parent.mkdir(parents=True, exist_ok=True)

    if source_ot.exists():
        words = extract_from_openthesaurus(source_ot)
        source = "OpenThesaurus"
    else:
        words = extract_from_wordfreq()
        source = "wordfreq"

    with out_path.open("w", encoding="utf-8") as f:
        for w in words:
            f.write(w + "\n")

    print(f"Saved: {len(words)} words (source: {source}) -> {out_path}")


if __name__ == "__main__":
    main()
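# ---------------------------------------------------------------------------
# Illustrative examples (a minimal sketch, shown as comments so the script's
# behaviour is unchanged; the sample entry below mimics the OpenThesaurus
# semicolon-separated synonym-line format):
#
#     >>> clean_token("Tisch (Möbel)")
#     'tisch'
#     >>> is_valid_five_letter_word("tisch")
#     True
#     >>> is_valid_five_letter_word("haus")    # only four letters
#     False
#     >>> is_valid_five_letter_word("straße")  # six letters
#     False
#
# A line such as "Apfel;Paradiesfrucht (ugs.)" is split on ";", each part is
# cleaned with clean_token(), and only tokens matching ALLOWED_FIVE_LETTER
# are kept ("apfel" passes; "paradiesfrucht" is too long and is dropped).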