Initial commit
This commit is contained in:
78
scripts/generate_wordlist.py
Normal file
78
scripts/generate_wordlist.py
Normal file
@@ -0,0 +1,78 @@
|
||||
from __future__ import annotations

import re
import unicodedata
from pathlib import Path
from typing import Iterable
|
||||
|
||||
try:
|
||||
# Optional, nur Fallback
|
||||
from wordfreq import top_n_list # type: ignore
|
||||
except Exception: # pragma: no cover
|
||||
top_n_list = None # type: ignore
|
||||
|
||||
|
||||
# Exactly five lowercase letters from the German alphabet (a-z, umlauts, sharp s).
ALLOWED_FIVE_LETTER = re.compile(r"^[a-zäöüß]{5}$")


def is_valid_five_letter_word(word: str) -> bool:
    """Return True if *word* consists of exactly five lowercase German letters.

    Args:
        word: Candidate token; expected to be lowercased already.

    Returns:
        True when the whole string is five characters drawn from ``a-z``,
        ``ä``, ``ö``, ``ü`` and ``ß``; False otherwise.
    """
    # fullmatch() instead of match(): with match(), the pattern's trailing "$"
    # also matches just before a final newline, so "apfel\n" was accepted.
    return ALLOWED_FIVE_LETTER.fullmatch(word) is not None
|
||||
|
||||
|
||||
# Parenthesised annotations such as "(ugs.)" or "(fachspr.)".
_PAREN_ANNOTATION = re.compile(r"\([^)]*\)")
# Any character that is not a German letter. The capital sharp s (U+1E9E) is
# listed explicitly so that it survives the filter and lowercases to "ß".
_NON_GERMAN_LETTER = re.compile(r"[^A-Za-zÄÖÜẞäöüß]")


def clean_token(token: str) -> str:
    """Reduce a raw entry to its lowercase German letters.

    Steps, in order:
      1. Normalise to NFC so that decomposed umlauts (base letter followed by
         a combining diaeresis) become single code points; otherwise the
         letter filter would drop the diacritic and turn "äpfel" into "apfel".
      2. Remove parenthesised annotations such as "(ugs.)".
      3. Remove every character that is not a German letter. This also strips
         whitespace, digits, punctuation and ellipsis markers ("..." / "…").
      4. Lowercase the result.

    Args:
        token: Raw entry text.

    Returns:
        The cleaned, lowercased token; may be empty.
    """
    composed = unicodedata.normalize("NFC", token)
    without_annotations = _PAREN_ANNOTATION.sub("", composed)
    return _NON_GERMAN_LETTER.sub("", without_annotations).lower()
|
||||
|
||||
|
||||
# Parenthesised annotations such as "(ugs.)" attached to an entry.
_OT_ANNOTATION = re.compile(r"\([^)]*\)")
# Whitespace or a hyphen inside an entry marks a phrase or a hyphenated
# compound rather than a single word.
_OT_WORD_BREAK = re.compile(r"[\s-]")


def extract_from_openthesaurus(path: Path) -> list[str]:
    """Collect all five-letter words from an OpenThesaurus text export.

    The file is read line by line as UTF-8. Blank lines and lines starting
    with "#" are ignored; every other line is split on ";" into entries.

    An entry is skipped when it
      * contains an ellipsis ("..." or "…"), i.e. it is not a complete word, or
      * still contains whitespace or a hyphen after its parenthesised
        annotations are removed. Such entries are phrases or compounds
        ("zu Fuß", "ad hoc", "U-Bahn") which ``clean_token`` would otherwise
        collapse into bogus five-letter words ("zufuß", "adhoc", "ubahn").

    Args:
        path: Location of the export file.

    Returns:
        The distinct accepted words, sorted.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    words: set[str] = set()
    with path.open("r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            for part in line.split(";"):
                # Entries with ellipses are prefix/suffix fragments, not words.
                if "..." in part or "…" in part:
                    continue
                # Judge the entry without its annotations, so that
                # "Wort (ugs.)" still counts as a single word.
                entry = _OT_ANNOTATION.sub("", part).strip()
                if _OT_WORD_BREAK.search(entry):
                    continue
                token = clean_token(part)
                if is_valid_five_letter_word(token):
                    words.add(token)
    return sorted(words)
|
||||
|
||||
|
||||
def extract_from_wordfreq(limit: int = 500_000) -> list[str]:
    """Collect five-letter words from the optional ``wordfreq`` package.

    Args:
        limit: Passed to ``top_n_list("de", limit)`` as the number of entries
            to request.

    Returns:
        The distinct cleaned tokens that pass ``is_valid_five_letter_word``,
        sorted. An empty list when ``wordfreq`` could not be imported
        (``top_n_list`` is None).
    """
    accepted: set[str] = set()
    if top_n_list is not None:
        for entry in top_n_list("de", limit):
            cleaned = clean_token(entry)
            if is_valid_five_letter_word(cleaned):
                accepted.add(cleaned)
    result = list(accepted)
    result.sort()
    return result
|
||||
|
||||
|
||||
def main() -> None:
    """Generate ``data/words_de_5.txt`` next to the ``scripts`` directory.

    Words come from ``data/openthesaurus.txt`` when that file exists and from
    the ``wordfreq`` fallback otherwise. The result is written one word per
    line as UTF-8, and a summary is printed.

    Raises:
        SystemExit: If the chosen source yields no words. The output file is
            left untouched in that case, so an existing word list is not
            truncated to an empty file (e.g. when the OpenThesaurus export is
            missing and ``wordfreq`` is not installed).
    """
    # Repository root: the parent of the directory containing this script.
    root = Path(__file__).resolve().parents[1]
    source_ot = root / "data" / "openthesaurus.txt"
    out_path = root / "data" / "words_de_5.txt"

    if source_ot.exists():
        words = extract_from_openthesaurus(source_ot)
        source = "OpenThesaurus"
    else:
        words = extract_from_wordfreq()
        source = "wordfreq"

    # Bail out before opening the output: opening with "w" truncates the file.
    if not words:
        raise SystemExit(
            f"Keine Wörter gefunden (Quelle: {source}); "
            f"{out_path} wurde nicht geschrieben."
        )

    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as f:
        f.writelines(f"{w}\n" for w in words)

    print(f"Gespeichert: {len(words)} Wörter (Quelle: {source}) -> {out_path}")


if __name__ == "__main__":
    main()
|
Reference in New Issue
Block a user