Quelle-Badges (OT/WF), Legende verschoben, Footer mit Emojis, A11y-Verbesserungen; Generator: Merge OpenThesaurus+wordfreq; Dockerfile/Gunicorn hinzugefügt

This commit is contained in:
2025-08-19 11:26:02 +02:00
parent d6d23a230e
commit 916f6510d8
5 changed files with 26823 additions and 60 deletions

View File

@@ -1,11 +1,12 @@
from __future__ import annotations
import json
import re
from pathlib import Path
from typing import Iterable
try:
# Optional, nur Fallback
# Optional: wordfreq als zusätzliche Quelle
from wordfreq import top_n_list # type: ignore
except Exception: # pragma: no cover
top_n_list = None # type: ignore
@@ -57,21 +58,49 @@ def extract_from_wordfreq(limit: int = 500_000) -> list[str]:
def main() -> None:
# Build the merged word list from OpenThesaurus and wordfreq and write two
# files into <root>/data: words_de_5.txt (one word per line) and
# words_de_5_sources.json (word -> list of source tags "ot" / "wf").
# NOTE(review): this listing appears to be a diff view whose +/- markers and
# indentation were stripped, so superseded and current lines are interleaved.
# The lines using `out_path`, `words` and `source` look like the removed
# pre-change version -- confirm against the actual file before editing.
# parents[1] is the directory one level above the folder containing this file.
root = Path(__file__).resolve().parents[1]
source_ot = root / "data" / "openthesaurus.txt"
out_path = root / "data" / "words_de_5.txt"
out_path.parent.mkdir(parents=True, exist_ok=True)
out_txt = root / "data" / "words_de_5.txt"
out_json = root / "data" / "words_de_5_sources.json"
# Create the data directory (and any missing parents) before writing.
out_txt.parent.mkdir(parents=True, exist_ok=True)
# Starts empty; filled from the OpenThesaurus file below when that file exists.
ot_words: list[str] = []
if source_ot.exists():
words = extract_from_openthesaurus(source_ot)
source = "OpenThesaurus"
else:
words = extract_from_wordfreq()
source = "wordfreq"
ot_words = extract_from_openthesaurus(source_ot)
with out_path.open("w", encoding="utf-8") as f:
for w in words:
# wordfreq words are collected regardless of whether OpenThesaurus was found
# (presumably empty when the optional wordfreq import failed -- verify in
# extract_from_wordfreq).
wf_words = extract_from_wordfreq()
# Sets give de-duplication and fast membership tests for the source map.
ot_set = set(ot_words)
wf_set = set(wf_words)
# Sorted union of both sources, without duplicates.
merged = sorted(ot_set | wf_set)
# Write the plain-text word list, one word per line
with out_txt.open("w", encoding="utf-8") as f:
for w in merged:
f.write(w + "\n")
print(f"Gespeichert: {len(words)} Wörter (Quelle: {source}) -> {out_path}")
# Write the source map: for each word, which source(s) it came from
sources_map: dict[str, list[str]] = {}
for w in merged:
srcs: list[str] = []
if w in ot_set:
srcs.append("ot")
if w in wf_set:
srcs.append("wf")
sources_map[w] = srcs
with out_json.open("w", encoding="utf-8") as jf:
# ensure_ascii=False keeps non-ASCII characters unescaped in the JSON output.
json.dump(sources_map, jf, ensure_ascii=False)
# Print a one-line summary: per-source counts, merged count and output paths.
print(
" | ".join(
[
f"OpenThesaurus: {len(ot_set)}",
f"wordfreq: {len(wf_set)}",
f"gesamt (dedupliziert): {len(merged)}",
f"{out_txt} / {out_json}",
]
)
)
if __name__ == "__main__":