Quelle-Badges (OT/WF), Legende verschoben, Footer mit Emojis, A11y-Verbesserungen; Generator: Merge OpenThesaurus+wordfreq; Dockerfile/Gunicorn hinzugefügt
This commit is contained in:
@@ -1,11 +1,12 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
try:
    # Optional: wordfreq as an additional word source. If the package is
    # missing (or import fails for any reason), fall back to None so callers
    # can feature-detect with `if top_n_list is not None:`.
    from wordfreq import top_n_list  # type: ignore
except Exception:  # pragma: no cover
    top_n_list = None  # type: ignore
|
||||
@@ -57,21 +58,49 @@ def extract_from_wordfreq(limit: int = 500_000) -> list[str]:
|
||||
def main() -> None:
    """Build the merged five-letter German word list.

    Reads the optional OpenThesaurus dump at ``data/openthesaurus.txt`` and
    always queries wordfreq, then writes two artifacts under ``data/``:

    * ``words_de_5.txt``          -- sorted, deduplicated plain-text word list
    * ``words_de_5_sources.json`` -- map of word -> source tags (``"ot"``
      for OpenThesaurus, ``"wf"`` for wordfreq)

    Finally prints a one-line summary with per-source and merged counts.
    """
    root = Path(__file__).resolve().parents[1]
    source_ot = root / "data" / "openthesaurus.txt"
    out_txt = root / "data" / "words_de_5.txt"
    out_json = root / "data" / "words_de_5_sources.json"
    out_txt.parent.mkdir(parents=True, exist_ok=True)

    # OpenThesaurus is optional: only parse the dump if it is present.
    ot_words: list[str] = []
    if source_ot.exists():
        ot_words = extract_from_openthesaurus(source_ot)

    wf_words = extract_from_wordfreq()

    # Sets for O(1) membership tests when tagging sources below.
    ot_set = set(ot_words)
    wf_set = set(wf_words)

    merged = sorted(ot_set | wf_set)

    # Write the plain-text word list, one word per line.
    with out_txt.open("w", encoding="utf-8") as f:
        f.writelines(w + "\n" for w in merged)

    # Write the source map: which source(s) each merged word came from.
    sources_map: dict[str, list[str]] = {}
    for w in merged:
        srcs: list[str] = []
        if w in ot_set:
            srcs.append("ot")
        if w in wf_set:
            srcs.append("wf")
        sources_map[w] = srcs

    # ensure_ascii=False keeps umlauts/ß readable in the JSON file.
    with out_json.open("w", encoding="utf-8") as jf:
        json.dump(sources_map, jf, ensure_ascii=False)

    print(
        " | ".join(
            [
                f"OpenThesaurus: {len(ot_set)}",
                f"wordfreq: {len(wf_set)}",
                f"gesamt (dedupliziert): {len(merged)}",
                f"→ {out_txt} / {out_json}",
            ]
        )
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
Reference in New Issue
Block a user