"""Build a deduplicated list of German five-letter words.

Sources: an OpenThesaurus plain-text dump and, if installed, the
wordfreq frequency lists.
"""
from __future__ import annotations

import json
import re
from pathlib import Path

try:  # optional: wordfreq as an additional source
    from wordfreq import top_n_list  # type: ignore
except Exception:  # pragma: no cover
    top_n_list = None  # type: ignore

ALLOWED_FIVE_LETTER = re.compile(r"^[a-zäöüß]{5}$")


def is_valid_five_letter_word(word: str) -> bool:
    return bool(ALLOWED_FIVE_LETTER.match(word))


def clean_token(token: str) -> str:
    # remove parenthesized qualifiers such as (ugs.), (fachspr.), etc.
    no_paren = re.sub(r"\([^)]*\)", "", token)
    # remove ellipses (prefix/suffix markers)
    no_ellipsis = no_paren.replace("...", "").replace("…", "")
    # keep letters only (including äöüß)
    letters_only = re.sub(r"[^A-Za-zÄÖÜäöüß]", "", no_ellipsis)
    return letters_only.strip().lower()


def extract_from_openthesaurus(path: Path) -> list[str]:
    words: set[str] = set()
    with path.open("r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            for part in line.split(";"):
                # skip entries with ellipses (they are not complete words)
                if "..." in part or "…" in part:
                    continue
                token = clean_token(part)
                if is_valid_five_letter_word(token):
                    words.add(token)
    return sorted(words)


def extract_from_wordfreq(limit: int = 500_000) -> list[str]:
    if top_n_list is None:
        return []
    candidates = top_n_list("de", limit)
    words = {clean_token(w) for w in candidates}
    return sorted(w for w in words if is_valid_five_letter_word(w))


def main() -> None:
    root = Path(__file__).resolve().parents[1]
    source_ot = root / "data" / "openthesaurus.txt"
    out_txt = root / "data" / "words_de_5.txt"
    out_json = root / "data" / "words_de_5_sources.json"
    out_txt.parent.mkdir(parents=True, exist_ok=True)

    ot_words: list[str] = []
    if source_ot.exists():
        ot_words = extract_from_openthesaurus(source_ot)
    wf_words = extract_from_wordfreq()

    ot_set = set(ot_words)
    wf_set = set(wf_words)
    merged = sorted(ot_set | wf_set)

    # write the plain-text word list
    with out_txt.open("w", encoding="utf-8") as f:
        for w in merged:
            f.write(w + "\n")

    # write the per-word source map ("ot" = OpenThesaurus, "wf" = wordfreq)
    sources_map: dict[str, list[str]] = {}
    for w in merged:
        srcs: list[str] = []
        if w in ot_set:
            srcs.append("ot")
        if w in wf_set:
            srcs.append("wf")
        sources_map[w] = srcs
    with out_json.open("w", encoding="utf-8") as jf:
        json.dump(sources_map, jf, ensure_ascii=False)

    print(
        " | ".join(
            [
                f"OpenThesaurus: {len(ot_set)}",
                f"wordfreq: {len(wf_set)}",
                f"total (deduplicated): {len(merged)}",
                f"→ {out_txt} / {out_json}",
            ]
        )
    )


if __name__ == "__main__":
    main()
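
# ---------------------------------------------------------------------------
# Illustrative sketch of the cleaning pipeline. The input line below is a
# hypothetical example of the semicolon-separated OpenThesaurus plain-text
# format this script consumes; it is not taken from the real dump.
#
#     "Apfel;Paradiesapfel (ugs.)"  ->  tokens "Apfel", "Paradiesapfel (ugs.)"
#
#     >>> clean_token("Apfel (ugs.)")
#     'apfel'
#     >>> is_valid_five_letter_word("äpfel")
#     True
#     >>> is_valid_five_letter_word("apple!")
#     False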