Initial commit: kantine2ical CLI, Flask-Server, Docker
This commit is contained in:
249
kantine2ical.py
Normal file
249
kantine2ical.py
Normal file
@@ -0,0 +1,249 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Speiseplan von kantine-bhz.de aus PDFs auslesen und als iCal (.ics) exportieren.
|
||||
Termine täglich 12:00 mit allen Tagesgerichten (I. bis V.).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import io
|
||||
import re
|
||||
from datetime import date, datetime, time
|
||||
from zoneinfo import ZoneInfo
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import pdfplumber
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from icalendar import Calendar, Event
|
||||
|
||||
# Constants
BASE_URL = "http://kantine-bhz.de"
DEFAULT_OUTPUT = "kantine_speiseplan.ics"
SUMMARY = "Kantine BHZ Kiel-Wik"
TIMEZONE = "Europe/Berlin"
EVENT_START = time(12, 0)  # daily lunch event starts at 12:00 local time
EVENT_END = time(13, 0)

# Regex: weekday header line, e.g. "Montag, den 05.02.2024"
# Groups: (weekday, DD, MM, YYYY)
DATE_LINE_RE = re.compile(
    r"(Montag|Dienstag|Mittwoch|Donnerstag|Freitag),\s*den\s+(\d{2})\.(\d{2})\.(\d{4})",
    re.IGNORECASE,
)
# Dish lines start with a roman numeral I.–V. followed by a dot.
# The dot is mandatory: with an optional dot, ordinary words beginning with
# I or V (e.g. "Italienischer Salat", "Vanillesoße") would be misdetected
# as dish headers. IV must be tried before I{1,3} so "IV." is not read as "I".
# Group 1 keeps the trailing dot so the displayed dish label is unchanged.
DISH_LINE_RE = re.compile(r"^\s*((?:IV|V|I{1,3})\.)\s*(.*)$", re.IGNORECASE)

# Maps a normalized roman numeral (no dot, uppercase) to its dish slot 1–5.
ROMAN_ORDER = {"I": 1, "II": 2, "III": 3, "IV": 4, "V": 5}
|
||||
|
||||
|
||||
def fetch_speiseplan_pdf_urls(base_url: str = BASE_URL) -> list[str]:
    """Load the start page and collect all Speiseplan PDF links (deduplicated, in order)."""
    response = requests.get(base_url, timeout=30)
    response.raise_for_status()
    # Some servers misreport the charset; fall back to the detected one.
    response.encoding = response.apparent_encoding or "utf-8"
    soup = BeautifulSoup(response.text, "html.parser")

    collected: list[str] = []
    known: set[str] = set()
    for anchor in soup.find_all("a", href=True):
        link = anchor["href"].strip()
        # Only PDF links whose path mentions "Speiseplan" are of interest.
        if not link.lower().endswith(".pdf") or "Speiseplan" not in link:
            continue
        absolute = urljoin(base_url, link)
        if absolute not in known:
            known.add(absolute)
            collected.append(absolute)
    return collected
|
||||
|
||||
|
||||
def download_pdf(url: str) -> bytes:
    """Download the PDF at *url* and return its raw bytes.

    Raises requests.HTTPError on a non-success status code.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.content
|
||||
|
||||
|
||||
def extract_text_from_pdf(pdf_bytes: bytes) -> str:
    """Extract text from all pages of a PDF using pdfplumber.

    Pages without extractable text are skipped; pages are joined with newlines.
    """
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        pages = [page.extract_text() for page in pdf.pages]
    # extract_text() may return None/"" for image-only pages; drop those.
    return "\n".join(text for text in pages if text)
|
||||
|
||||
|
||||
def _normalize_roman(roman: str) -> int | None:
    """Map a roman numeral token ("I"–"V", optional trailing dot, any case) to 1–5, else None."""
    key = roman.rstrip(".").upper()
    return ROMAN_ORDER.get(key)
|
||||
|
||||
|
||||
def parse_speiseplan_text(text: str) -> list[tuple[date, list[str]]]:
    """
    Extract the five dishes I.–V. per day (date) from the PDF text.

    Returns a list of (date, [dish_i, ..., dish_v]) tuples in the order
    the day headers appear in the text. Missing dishes are filled as "–".
    """
    # Drop blank lines and surrounding whitespace up front.
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    result: list[tuple[date, list[str]]] = []
    i = 0
    while i < len(lines):
        # Scan forward until a "Wochentag, den DD.MM.YYYY" header line.
        m = DATE_LINE_RE.match(lines[i])
        if not m:
            i += 1
            continue
        day_name, d, mo, y = m.groups()  # day_name itself is not used further
        try:
            dt = date(int(y), int(mo), int(d))
        except ValueError:
            # Implausible date (e.g. "31.02.") — skip this header line.
            i += 1
            continue
        dishes: dict[int, str] = {}
        i += 1
        # Collect dish lines until the next date header or a footer marker.
        while i < len(lines):
            line = lines[i]
            if DATE_LINE_RE.match(line):
                break
            # Separator rules, opening hours, or allergen notes end the day block.
            if line.startswith("_" * 10) or "Öffnungszeiten" in line or "Speisen enthalten" in line:
                break
            dish_m = DISH_LINE_RE.match(line)
            if dish_m:
                roman_part, rest = dish_m.groups()
                idx = _normalize_roman(roman_part.strip().rstrip("."))
                # First occurrence of each numeral wins; later duplicates
                # (e.g. continuation lines that happen to match) are ignored.
                if idx is not None and 1 <= idx <= 5 and idx not in dishes:
                    dishes[idx] = f"{roman_part.strip()} {rest}".strip() if rest else roman_part.strip()
            i += 1
        # Always exactly 5 entries; missing slots become "–".
        ordered = [dishes.get(j, "–") for j in range(1, 6)]
        result.append((dt, ordered))
    return result
|
||||
|
||||
|
||||
def merge_day_events(all_parsed: list[list[tuple[date, list[str]]]]) -> dict[date, list[str]]:
    """Merge the per-PDF results; for a duplicate date the last occurrence wins."""
    merged: dict[date, list[str]] = {}
    for day_list in all_parsed:
        # dict.update over (date, dishes) pairs keeps the last value per key,
        # matching the "last occurrence wins" contract.
        merged.update(day_list)
    return merged
|
||||
|
||||
|
||||
def empty_ical_bytes() -> bytes:
    """Return a minimal empty iCal calendar (no events) as bytes.

    Delegates to build_ical_bytes with an empty mapping so the calendar
    headers (PRODID/VERSION/CALSCALE) are defined in exactly one place
    instead of being duplicated here.
    """
    return build_ical_bytes({})
|
||||
|
||||
|
||||
def build_ical_bytes(by_date: dict[date, list[str]]) -> bytes:
    """Serialize the menu mapping to an iCal calendar (events 12:00–13:00, Europe/Berlin)."""
    calendar = Calendar()
    calendar.add("prodid", "-//Kantine BHZ Kiel-Wik Speiseplan//kantine2ical//DE")
    calendar.add("version", "2.0")
    calendar.add("CALSCALE", "GREGORIAN")

    zone = ZoneInfo(TIMEZONE)
    # Emit events in chronological order for a stable, diff-friendly output.
    for day, menu in sorted(by_date.items()):
        entry = Event()
        # Deterministic UID per day so re-imports update instead of duplicate.
        entry.add("uid", f"kantine-bhz-{day.isoformat()}@kantine2ical.local")
        entry.add("summary", SUMMARY)
        entry.add("description", "\n".join(menu))
        entry.add("dtstart", datetime.combine(day, EVENT_START, tzinfo=zone))
        entry.add("dtend", datetime.combine(day, EVENT_END, tzinfo=zone))
        calendar.add_component(entry)

    return calendar.to_ical()
|
||||
|
||||
|
||||
def build_ical(by_date: dict[date, list[str]], output_path: str) -> None:
    """Build the iCal calendar and write it to output_path (12:00, Europe/Berlin)."""
    payload = build_ical_bytes(by_date)
    with open(output_path, "wb") as out:
        out.write(payload)
|
||||
|
||||
|
||||
def refresh_speiseplan(base_url: str = BASE_URL) -> tuple[dict[date, list[str]], bytes] | None:
    """
    Run the full pipeline: fetch URLs, download PDFs, parse, merge, build iCal.

    Returns (by_date, ical_bytes); on any error or when no data is found,
    returns None instead of raising (best-effort server-side refresh).
    """
    try:
        pdf_urls = fetch_speiseplan_pdf_urls(base_url)
        if not pdf_urls:
            return None
        parsed_days: list[list[tuple[date, list[str]]]] = []
        for pdf_url in pdf_urls:
            try:
                raw = download_pdf(pdf_url)
                days = parse_speiseplan_text(extract_text_from_pdf(raw))
            except Exception:
                # Best effort: skip individual PDFs that fail to download or parse.
                continue
            parsed_days.append(days)
        if not parsed_days:
            return None
        merged = merge_day_events(parsed_days)
        return (merged, build_ical_bytes(merged))
    except Exception:
        # Deliberately swallow everything: callers treat None as "no update".
        return None
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: parse arguments, fetch and parse the PDFs, write the .ics file."""
    parser = argparse.ArgumentParser(
        description="Speiseplan von kantine-bhz.de als iCal exportieren."
    )
    parser.add_argument(
        "-o", "--output",
        default=DEFAULT_OUTPUT,
        metavar="FILE",
        help=f"Ausgabedatei .ics (Standard: {DEFAULT_OUTPUT})",
    )
    parser.add_argument(
        "--url",
        default=BASE_URL,
        metavar="URL",
        help=f"Basis-URL der Kantine (Standard: {BASE_URL})",
    )
    args = parser.parse_args()

    # Guarantee the output file carries the .ics extension.
    out_file: str = args.output
    if not out_file.lower().endswith(".ics"):
        out_file += ".ics"

    print("Speiseplan-PDFs von", args.url, "laden ...")
    pdf_urls = fetch_speiseplan_pdf_urls(args.url)
    if not pdf_urls:
        print("Keine Speiseplan-PDFs gefunden.")
        return
    print(f" {len(pdf_urls)} PDF(s) gefunden.")

    parsed_days: list[list[tuple[date, list[str]]]] = []
    for url in pdf_urls:
        try:
            raw = download_pdf(url)
            parsed_days.append(parse_speiseplan_text(extract_text_from_pdf(raw)))
        except Exception as e:
            # Report and keep going; a single broken PDF should not abort the run.
            print(f" Fehler bei {url}: {e}")

    if not parsed_days:
        print("Keine Daten aus PDFs gelesen.")
        return

    merged = merge_day_events(parsed_days)
    build_ical(merged, out_file)
    print(f"Kalender mit {len(merged)} Terminen geschrieben: {out_file}")
|
||||
|
||||
|
||||
# Allow running the module directly as a CLI script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user