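"""Build a five-letter German word list from OpenThesaurus and wordfreq.

Extracts five-letter words from an OpenThesaurus plain-text dump (if
present), merges them with five-letter words from wordfreq's German
frequency list, and writes the merged list plus a per-word source map
under data/.
"""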
from __future__ import annotations

import json
import re
from pathlib import Path

try:
    # Optional: wordfreq as an additional word source
    from wordfreq import top_n_list  # type: ignore
except Exception:  # pragma: no cover
    top_n_list = None  # type: ignore

# Exactly five lowercase letters, including the German umlauts and ß.
ALLOWED_FIVE_LETTER = re.compile(r"^[a-zäöüß]{5}$")

def is_valid_five_letter_word(word: str) -> bool:
    return bool(ALLOWED_FIVE_LETTER.match(word))

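# Illustrative examples, derived from the regex above:
#   is_valid_five_letter_word("äpfel")  -> True
#   is_valid_five_letter_word("Apfel")  -> False  (uppercase; clean_token lowercases first)
#   is_valid_five_letter_word("müßig")  -> True   (ß counts as a letter)
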
def clean_token(token: str) -> str:
    # Remove parenthesised qualifiers such as (ugs.), (fachspr.), etc.
    no_paren = re.sub(r"\([^)]*\)", "", token)
    # Remove ellipsis characters (prefix/suffix markers)
    no_ellipsis = no_paren.replace("...", "").replace("…", "")
    # Keep letters only (including äöüß)
    letters_only = re.sub(r"[^A-Za-zÄÖÜäöüß]", "", no_ellipsis)
    return letters_only.strip().lower()

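# Illustrative examples, derived from the cleaning rules above:
#   clean_token("Tisch (ugs.)")  -> "tisch"
#   clean_token("Hallo!")        -> "hallo"
#   clean_token("ver...")        -> "ver"
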
def extract_from_openthesaurus(path: Path) -> list[str]:
    words: set[str] = set()
    with path.open("r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            parts = line.split(";")
            for part in parts:
                # Skip entries with ellipses (they are not complete words)
                if "..." in part or "…" in part:
                    continue
                token = clean_token(part)
                if is_valid_five_letter_word(token):
                    words.add(token)
    return sorted(words)

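# The parser above expects the OpenThesaurus plain-text export: one synonym
# group per line, terms separated by semicolons, '#' marking comment lines.
# Illustrative line (not from the original file):
#   Klapprechner;Laptop (ugs.);Notebook
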
def extract_from_wordfreq(limit: int = 500_000) -> list[str]:
    if top_n_list is None:
        return []
    candidates = top_n_list("de", limit)
    words = {clean_token(w) for w in candidates}
    return sorted(w for w in words if is_valid_five_letter_word(w))

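# Note: wordfreq's top_n_list(lang, n) returns the n most frequent tokens for
# the language, already lowercased; clean_token is still applied to strip any
# stray punctuation before the five-letter filter runs.
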
def main() -> None:
    root = Path(__file__).resolve().parents[1]
    source_ot = root / "data" / "openthesaurus.txt"
    out_txt = root / "data" / "words_de_5.txt"
    out_json = root / "data" / "words_de_5_sources.json"
    out_txt.parent.mkdir(parents=True, exist_ok=True)

    ot_words: list[str] = []
    if source_ot.exists():
        ot_words = extract_from_openthesaurus(source_ot)

    wf_words = extract_from_wordfreq()

    ot_set = set(ot_words)
    wf_set = set(wf_words)

    merged = sorted(ot_set | wf_set)

    # Write the plain-text word list
    with out_txt.open("w", encoding="utf-8") as f:
        for w in merged:
            f.write(w + "\n")

    # Write the word-to-sources map
    sources_map: dict[str, list[str]] = {}
    for w in merged:
        srcs: list[str] = []
        if w in ot_set:
            srcs.append("ot")
        if w in wf_set:
            srcs.append("wf")
        sources_map[w] = srcs

    with out_json.open("w", encoding="utf-8") as jf:
        json.dump(sources_map, jf, ensure_ascii=False)

    print(
        " | ".join(
            [
                f"OpenThesaurus: {len(ot_set)}",
                f"wordfreq: {len(wf_set)}",
                f"total (deduplicated): {len(merged)}",
                f"→ {out_txt} / {out_json}",
            ]
        )
    )

if __name__ == "__main__":
    main()
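# Usage sketch (the script name is hypothetical; the code resolves the project
# root as the parent of the script's own directory, e.g. scripts/):
#
#   python scripts/build_wordlist.py
#
# Output: data/words_de_5.txt (one word per line) and
# data/words_de_5_sources.json (word -> ["ot"], ["wf"], or ["ot", "wf"]).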