#!/usr/bin/env python3
"""
Read the menu plan from kantine-bhz.de PDFs and export it as iCal (.ics).

Creates one event per day at 12:00 containing all daily dishes (I. to V.).
"""
|
||
|
||
import argparse
|
||
import io
|
||
import logging
|
||
import re
|
||
from datetime import date, datetime, time
|
||
from zoneinfo import ZoneInfo
|
||
from urllib.parse import urljoin
|
||
|
||
import pdfplumber
|
||
import requests
|
||
from bs4 import BeautifulSoup
|
||
from icalendar import Calendar, Event
|
||
|
||
# Constants
BASE_URL = "http://kantine-bhz.de"
DEFAULT_OUTPUT = "kantine_speiseplan.ics"
SUMMARY = "Kantine BHZ Kiel-Wik"
TIMEZONE = "Europe/Berlin"
EVENT_START = time(12, 0)
EVENT_END = time(13, 0)

# Regex for the German day header: "Wochentag, den DD.MM.YYYY"
# (captures weekday, day, month, year)
DATE_LINE_RE = re.compile(
    r"(Montag|Dienstag|Mittwoch|Donnerstag|Freitag),\s*den\s+(\d{2})\.(\d{2})\.(\d{4})",
    re.IGNORECASE,
)
# Lines that start with I., II., III., IV., V.
# (IV/V must be tried before I/II/III, otherwise "IV." would match as "I")
DISH_LINE_RE = re.compile(r"^\s*(IV\.?|V\.?|I{1,3}\.?)\s*(.*)$", re.IGNORECASE)

# Maps an uppercase Roman numeral (without trailing dot) to its dish slot 1..5.
ROMAN_ORDER: dict[str, int] = {"I": 1, "II": 2, "III": 3, "IV": 4, "V": 5}

_log = logging.getLogger(__name__)
|
||
|
||
|
||
def fetch_speiseplan_pdf_urls(base_url: str = BASE_URL) -> list[str]:
    """Load the start page and collect all menu-plan PDF links (no duplicates).

    Args:
        base_url: Site root whose anchors are scanned for PDF links.

    Returns:
        Absolute PDF URLs in the order they first appear on the page.

    Raises:
        requests.HTTPError: If fetching the start page fails.
    """
    resp = requests.get(base_url, timeout=30)
    resp.raise_for_status()
    # apparent_encoding can be None (e.g. empty body); fall back to UTF-8.
    resp.encoding = resp.apparent_encoding or "utf-8"
    soup = BeautifulSoup(resp.text, "html.parser")
    seen: set[str] = set()
    urls: list[str] = []
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        href_lower = href.lower()
        if not href_lower.endswith(".pdf"):
            continue
        # Bugfix: match case-insensitively, consistent with the ".pdf" check
        # above — previously lowercase links such as "speiseplan_kw12.pdf"
        # were silently skipped.
        if "speiseplan" not in href_lower:
            continue
        full = urljoin(base_url, href)
        if full in seen:
            continue
        seen.add(full)
        urls.append(full)
    return urls
|
||
|
||
|
||
def download_pdf(url: str) -> bytes:
    """Download the PDF at *url* and return its raw bytes.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.content
|
||
|
||
|
||
def extract_text_from_pdf(pdf_bytes: bytes) -> str:
    """Extract the text of every page from a PDF via pdfplumber.

    Pages that yield no text are skipped; the rest are joined with newlines.
    """
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    return "\n".join(t for t in page_texts if t)
|
||
|
||
|
||
def _normalize_roman(roman: str) -> int | None:
    """Map a Roman numeral string (e.g. "IV.") to its dish slot 1..5, else None."""
    key = roman.rstrip(".").upper()
    return ROMAN_ORDER.get(key)
|
||
|
||
|
||
def parse_speiseplan_text(text: str) -> list[tuple[date, list[str]]]:
    """
    Extract the five dishes I.-V. per day (date) from the PDF text.

    Returns a list of (date, [dish_i, ..., dish_v]).
    Missing dishes are filled in as "–".
    """
    # Drop empty lines up front so the scanner only sees content lines.
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    result: list[tuple[date, list[str]]] = []
    i = 0
    while i < len(lines):
        m = DATE_LINE_RE.match(lines[i])
        if not m:
            i += 1
            continue
        day_name, d, mo, y = m.groups()
        try:
            dt = date(int(y), int(mo), int(d))
        except ValueError:
            # Impossible calendar date (e.g. 31.02.) — skip this header line.
            i += 1
            continue
        dishes: dict[int, str] = {}
        i += 1
        # Inner scan: collect dish lines until the next day header or a footer.
        while i < len(lines):
            line = lines[i]
            if DATE_LINE_RE.match(line):
                break
            # Footer markers: underscore separator rule, opening hours,
            # allergen note — everything after them belongs to no day.
            if line.startswith("_" * 10) or "Öffnungszeiten" in line or "Speisen enthalten" in line:
                break
            dish_m = DISH_LINE_RE.match(line)
            if dish_m:
                roman_part, rest = dish_m.groups()
                idx = _normalize_roman(roman_part.strip().rstrip("."))
                # First occurrence per slot wins; the stored text keeps the
                # Roman numeral prefix (e.g. "III. Gemüseauflauf").
                if idx is not None and 1 <= idx <= 5 and idx not in dishes:
                    dishes[idx] = f"{roman_part.strip()} {rest}".strip() if rest else roman_part.strip()
            i += 1
        # Always exactly 5 entries; missing slots become "–".
        ordered = [dishes.get(j, "–") for j in range(1, 6)]
        result.append((dt, ordered))
    return result
|
||
|
||
|
||
def merge_day_events(all_parsed: list[list[tuple[date, list[str]]]]) -> dict[date, list[str]]:
    """Merge all PDF results; for a duplicate date the last occurrence wins."""
    merged: dict[date, list[str]] = {}
    for parsed in all_parsed:
        # dict.update accepts an iterable of (key, value) pairs; later PDFs
        # overwrite earlier ones for the same date.
        merged.update(parsed)
    return merged
|
||
|
||
|
||
def empty_ical_bytes() -> bytes:
    """Return a minimal empty iCal calendar (no events) as bytes."""
    calendar = Calendar()
    for prop, value in (
        ("prodid", "-//Kantine BHZ Kiel-Wik Speiseplan//kantine2ical//DE"),
        ("version", "2.0"),
        ("CALSCALE", "GREGORIAN"),
    ):
        calendar.add(prop, value)
    return calendar.to_ical()
|
||
|
||
|
||
def build_ical_bytes(by_date: dict[date, list[str]]) -> bytes:
    """Build the iCal calendar as bytes (events 12:00-13:00, Europe/Berlin).

    One VEVENT per date, sorted chronologically; the dishes form the
    description, one per line.
    """
    tz = ZoneInfo(TIMEZONE)

    def _day_event(day: date, dishes: list[str]) -> Event:
        # Stable UID per date so re-imports update events instead of duplicating.
        ev = Event()
        ev.add("uid", f"kantine-bhz-{day.isoformat()}@kantine2ical.local")
        ev.add("summary", SUMMARY)
        ev.add("description", "\n".join(dishes))
        ev.add("dtstart", datetime.combine(day, EVENT_START, tzinfo=tz))
        ev.add("dtend", datetime.combine(day, EVENT_END, tzinfo=tz))
        return ev

    cal = Calendar()
    cal.add("prodid", "-//Kantine BHZ Kiel-Wik Speiseplan//kantine2ical//DE")
    cal.add("version", "2.0")
    cal.add("CALSCALE", "GREGORIAN")
    for day, dishes in sorted(by_date.items()):
        cal.add_component(_day_event(day, dishes))
    return cal.to_ical()
|
||
|
||
|
||
def build_ical(by_date: dict[date, list[str]], output_path: str) -> None:
    """Create the iCal calendar and write it to output_path (12:00, Europe/Berlin)."""
    ical = build_ical_bytes(by_date)
    with open(output_path, "wb") as out:
        out.write(ical)
|
||
|
||
|
||
def refresh_speiseplan(base_url: str = BASE_URL) -> tuple[dict[date, list[str]], bytes] | None:
    """
    Run the full pipeline: fetch URLs, download PDFs, parse, merge, build iCal.

    Returns (by_date, ical_bytes); on any failure or when no data was found,
    returns None (errors are logged, never raised to the caller).
    """
    try:
        urls = fetch_speiseplan_pdf_urls(base_url)
        if not urls:
            _log.warning("refresh_speiseplan: Keine PDF-URLs gefunden auf %s", base_url)
            return None
        _log.info("refresh_speiseplan: %d PDF(s) gefunden", len(urls))

        all_parsed: list[list[tuple[date, list[str]]]] = []
        for url in urls:
            try:
                days = parse_speiseplan_text(extract_text_from_pdf(download_pdf(url)))
            except Exception as e:
                # A single broken PDF must not abort the whole refresh.
                _log.exception("refresh_speiseplan: PDF fehlgeschlagen %s: %s", url, e)
                continue
            all_parsed.append(days)
            _log.info("refresh_speiseplan: %s -> %d Tage", url.split("/")[-1], len(days))

        if not all_parsed:
            _log.warning("refresh_speiseplan: Kein PDF erfolgreich gelesen")
            return None

        by_date = merge_day_events(all_parsed)
        ical_bytes = build_ical_bytes(by_date)
        _log.info("refresh_speiseplan: %d Termine im Kalender", len(by_date))
        return (by_date, ical_bytes)
    except Exception as e:
        # Outer safety net: this function is a background-refresh entry point
        # and must never propagate an exception.
        _log.exception("refresh_speiseplan: Ablauf fehlgeschlagen: %s", e)
        return None
|
||
|
||
|
||
def main() -> None:
    """CLI entry point: fetch the menu PDFs, parse them, write the .ics file."""
    parser = argparse.ArgumentParser(
        description="Speiseplan von kantine-bhz.de als iCal exportieren."
    )
    parser.add_argument(
        "-o",
        "--output",
        default=DEFAULT_OUTPUT,
        metavar="FILE",
        help=f"Ausgabedatei .ics (Standard: {DEFAULT_OUTPUT})",
    )
    parser.add_argument(
        "--url",
        default=BASE_URL,
        metavar="URL",
        help=f"Basis-URL der Kantine (Standard: {BASE_URL})",
    )
    args = parser.parse_args()

    # Make sure the output file always carries the .ics extension.
    output_path: str = args.output
    if not output_path.lower().endswith(".ics"):
        output_path += ".ics"

    print("Speiseplan-PDFs von", args.url, "laden ...")
    urls = fetch_speiseplan_pdf_urls(args.url)
    if not urls:
        print("Keine Speiseplan-PDFs gefunden.")
        return
    print(f" {len(urls)} PDF(s) gefunden.")

    all_parsed: list[list[tuple[date, list[str]]]] = []
    for url in urls:
        try:
            all_parsed.append(parse_speiseplan_text(extract_text_from_pdf(download_pdf(url))))
        except Exception as e:
            # Report and keep going; other PDFs may still parse fine.
            print(f" Fehler bei {url}: {e}")

    if not all_parsed:
        print("Keine Daten aus PDFs gelesen.")
        return

    by_date = merge_day_events(all_parsed)
    build_ical(by_date, output_path)
    print(f"Kalender mit {len(by_date)} Terminen geschrieben: {output_path}")
|
||
|
||
|
||
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|