Quelle-Badges (OT/WF), Legende verschoben, Footer mit Emojis, A11y-Verbesserungen; Generator: Merge OpenThesaurus+wordfreq; Dockerfile/Gunicorn hinzugefügt
This commit is contained in:
@@ -1,11 +1,12 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
try:
    # Optional: wordfreq as an additional word source. If the package is
    # missing (or import fails for any reason), fall back to None so callers
    # can feature-detect with `if top_n_list is not None:`.
    from wordfreq import top_n_list  # type: ignore
except Exception:  # pragma: no cover
    top_n_list = None  # type: ignore
|
||||
@@ -57,21 +58,49 @@ def extract_from_wordfreq(limit: int = 500_000) -> list[str]:
|
||||
def main() -> None:
    """Build the merged five-letter German word list.

    Reads the optional OpenThesaurus dump at ``data/openthesaurus.txt`` and
    always queries wordfreq, then writes two artifacts under ``data/``:

    * ``words_de_5.txt``          -- sorted, deduplicated plain-text word list
    * ``words_de_5_sources.json`` -- map of word -> source tags (``"ot"``
      for OpenThesaurus, ``"wf"`` for wordfreq)

    Finally prints a one-line summary with per-source and merged counts.
    """
    root = Path(__file__).resolve().parents[1]
    source_ot = root / "data" / "openthesaurus.txt"
    out_txt = root / "data" / "words_de_5.txt"
    out_json = root / "data" / "words_de_5_sources.json"
    out_txt.parent.mkdir(parents=True, exist_ok=True)

    # OpenThesaurus is optional: only parse the dump if it is present.
    ot_words: list[str] = []
    if source_ot.exists():
        ot_words = extract_from_openthesaurus(source_ot)

    wf_words = extract_from_wordfreq()

    # Sets for O(1) membership tests when tagging sources below.
    ot_set = set(ot_words)
    wf_set = set(wf_words)

    merged = sorted(ot_set | wf_set)

    # Write the plain-text word list, one word per line.
    with out_txt.open("w", encoding="utf-8") as f:
        f.writelines(w + "\n" for w in merged)

    # Write the source map: which source(s) each merged word came from.
    sources_map: dict[str, list[str]] = {}
    for w in merged:
        srcs: list[str] = []
        if w in ot_set:
            srcs.append("ot")
        if w in wf_set:
            srcs.append("wf")
        sources_map[w] = srcs

    # ensure_ascii=False keeps umlauts/ß readable in the JSON file.
    with out_json.open("w", encoding="utf-8") as jf:
        json.dump(sources_map, jf, ensure_ascii=False)

    print(
        " | ".join(
            [
                f"OpenThesaurus: {len(ot_set)}",
                f"wordfreq: {len(wf_set)}",
                f"gesamt (dedupliziert): {len(merged)}",
                f"→ {out_txt} / {out_json}",
            ]
        )
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
Reference in New Issue
Block a user