#!/usr/bin/env python3
"""Scrape the menu ("Speiseplan") of kantine-bhz.de from its PDFs and export it as iCal (.ics).

One event per day at 12:00 (Europe/Berlin) containing the day's dishes I. through V.
"""
import argparse
import io
import logging
import re
from datetime import date, datetime, time
from zoneinfo import ZoneInfo
from urllib.parse import urljoin

import pdfplumber
import requests
from bs4 import BeautifulSoup
from icalendar import Calendar, Event

# Constants
BASE_URL = "http://kantine-bhz.de"
DEFAULT_OUTPUT = "kantine_speiseplan.ics"
SUMMARY = "Kantine BHZ Kiel-Wik"
TIMEZONE = "Europe/Berlin"
EVENT_START = time(12, 0)
EVENT_END = time(13, 0)

# Matches a day header line: "Wochentag, den DD.MM.YYYY"
DATE_LINE_RE = re.compile(
    r"(Montag|Dienstag|Mittwoch|Donnerstag|Freitag),\s*den\s+(\d{2})\.(\d{2})\.(\d{4})",
    re.IGNORECASE,
)

# Matches dish lines starting with a Roman numeral I.–V. The numeral must be
# followed by a literal dot OR by whitespace/end-of-line; this prevents false
# positives on ordinary words starting with I or V (e.g. "Vollkornbrot" would
# otherwise be parsed as dish V with text "ollkornbrot"). The alternation
# tries IV/V before I{1,3} so "IV" is not consumed as "I".
DISH_LINE_RE = re.compile(
    r"^\s*((?:IV|V|I{1,3})(?:\.|(?=\s|$)))\s*(.*)$",
    re.IGNORECASE,
)

# Roman numeral -> dish slot (1..5)
ROMAN_ORDER = {"I": 1, "II": 2, "III": 3, "IV": 4, "V": 5}

_log = logging.getLogger(__name__)


def fetch_speiseplan_pdf_urls(base_url: str = BASE_URL) -> list[str]:
    """Load the landing page and collect all Speiseplan PDF links (deduplicated, in order).

    Only hrefs ending in ".pdf" (case-insensitive) that contain the literal
    substring "Speiseplan" are kept; relative links are resolved against base_url.
    Raises requests.HTTPError on a non-2xx response.
    """
    resp = requests.get(base_url, timeout=30)
    resp.raise_for_status()
    # The site may not declare its encoding; fall back to the detected one.
    resp.encoding = resp.apparent_encoding or "utf-8"
    soup = BeautifulSoup(resp.text, "html.parser")
    seen = set()
    urls = []
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        if not href.lower().endswith(".pdf"):
            continue
        if "Speiseplan" not in href:
            continue
        full = urljoin(base_url, href)
        if full in seen:
            continue
        seen.add(full)
        urls.append(full)
    return urls


def download_pdf(url: str) -> bytes:
    """Download a PDF from url and return its raw bytes. Raises on HTTP errors."""
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    return resp.content


def extract_text_from_pdf(pdf_bytes: bytes) -> str:
    """Extract text from all pages of a PDF (via pdfplumber), joined with newlines.

    Pages that yield no text are skipped.
    """
    text_parts = []
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        for page in pdf.pages:
            t = page.extract_text()
            if t:
                text_parts.append(t)
    return "\n".join(text_parts)


def _normalize_roman(roman: str) -> int | None:
    """Map a Roman numeral token ("I".."V", optional trailing dot, any case) to 1..5.

    Returns None for anything unrecognized.
    """
    r = roman.upper().rstrip(".")
    return ROMAN_ORDER.get(r)


def parse_speiseplan_text(text: str) -> list[tuple[date, list[str]]]:
    """Extract, per day found in the PDF text, the five dishes I.–V.

    Returns a list of (date, [dish_i, ..., dish_v]). Missing dishes are
    filled with "–" so every entry has exactly five strings. Day headers
    with an invalid calendar date are skipped. A day's dish section ends at
    the next day header, a long underscore rule, or the footer lines
    ("Öffnungszeiten" / "Speisen enthalten").
    """
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    result: list[tuple[date, list[str]]] = []
    i = 0
    while i < len(lines):
        m = DATE_LINE_RE.match(lines[i])
        if not m:
            i += 1
            continue
        day_name, d, mo, y = m.groups()
        try:
            dt = date(int(y), int(mo), int(d))
        except ValueError:
            # e.g. "31.02." — skip this header and keep scanning.
            i += 1
            continue
        dishes: dict[int, str] = {}
        i += 1
        while i < len(lines):
            line = lines[i]
            if DATE_LINE_RE.match(line):
                break
            if line.startswith("_" * 10) or "Öffnungszeiten" in line or "Speisen enthalten" in line:
                break
            dish_m = DISH_LINE_RE.match(line)
            if dish_m:
                roman_part, rest = dish_m.groups()
                idx = _normalize_roman(roman_part)
                # First occurrence of each numeral wins; later duplicates ignored.
                if idx is not None and 1 <= idx <= 5 and idx not in dishes:
                    dishes[idx] = f"{roman_part.strip()} {rest}".strip() if rest else roman_part.strip()
            i += 1
        # Always exactly 5 entries; fill gaps with "–".
        ordered = [dishes.get(j, "–") for j in range(1, 6)]
        result.append((dt, ordered))
    return result


def merge_day_events(all_parsed: list[list[tuple[date, list[str]]]]) -> dict[date, list[str]]:
    """Merge all per-PDF results; on duplicate dates the last occurrence wins."""
    by_date: dict[date, list[str]] = {}
    for day_list in all_parsed:
        for d, dishes in day_list:
            by_date[d] = dishes
    return by_date


def empty_ical_bytes() -> bytes:
    """Return a minimal empty iCal calendar (no events) as bytes."""
    cal = Calendar()
    cal.add("prodid", "-//Kantine BHZ Kiel-Wik Speiseplan//kantine2ical//DE")
    cal.add("version", "2.0")
    cal.add("CALSCALE", "GREGORIAN")
    return cal.to_ical()


def build_ical_bytes(by_date: dict[date, list[str]]) -> bytes:
    """Build the iCal calendar as bytes (events 12:00–13:00, Europe/Berlin).

    One event per date, sorted chronologically, with a stable UID derived
    from the ISO date and the dishes joined into the description.
    """
    cal = Calendar()
    cal.add("prodid", "-//Kantine BHZ Kiel-Wik Speiseplan//kantine2ical//DE")
    cal.add("version", "2.0")
    cal.add("CALSCALE", "GREGORIAN")
    tz_berlin = ZoneInfo(TIMEZONE)
    for d, dishes in sorted(by_date.items()):
        event = Event()
        # Deterministic UID so re-imports update instead of duplicating events.
        event.add("uid", f"kantine-bhz-{d.isoformat()}@kantine2ical.local")
        event.add("summary", SUMMARY)
        desc = "\n".join(dishes)
        event.add("description", desc)
        start_dt = datetime.combine(d, EVENT_START, tzinfo=tz_berlin)
        end_dt = datetime.combine(d, EVENT_END, tzinfo=tz_berlin)
        event.add("dtstart", start_dt)
        event.add("dtend", end_dt)
        cal.add_component(event)
    return cal.to_ical()


def build_ical(by_date: dict[date, list[str]], output_path: str) -> None:
    """Build the iCal calendar and write it to output_path (12:00, Europe/Berlin)."""
    with open(output_path, "wb") as f:
        f.write(build_ical_bytes(by_date))


def refresh_speiseplan(base_url: str = BASE_URL) -> tuple[dict[date, list[str]], bytes] | None:
    """Run the full pipeline: fetch URLs, download PDFs, parse, merge, build iCal.

    Returns (by_date, ical_bytes), or None on failure / no data. Individual
    PDF failures are logged and skipped; only a total failure returns None.
    This is a top-level boundary, hence the broad except with logging.
    """
    try:
        urls = fetch_speiseplan_pdf_urls(base_url)
        if not urls:
            _log.warning("refresh_speiseplan: Keine PDF-URLs gefunden auf %s", base_url)
            return None
        _log.info("refresh_speiseplan: %d PDF(s) gefunden", len(urls))
        all_parsed: list[list[tuple[date, list[str]]]] = []
        for url in urls:
            try:
                pdf_bytes = download_pdf(url)
                text = extract_text_from_pdf(pdf_bytes)
                days = parse_speiseplan_text(text)
                all_parsed.append(days)
                _log.info("refresh_speiseplan: %s -> %d Tage", url.split("/")[-1], len(days))
            except Exception as e:
                _log.exception("refresh_speiseplan: PDF fehlgeschlagen %s: %s", url, e)
                continue
        if not all_parsed:
            _log.warning("refresh_speiseplan: Kein PDF erfolgreich gelesen")
            return None
        by_date = merge_day_events(all_parsed)
        ical_bytes = build_ical_bytes(by_date)
        _log.info("refresh_speiseplan: %d Termine im Kalender", len(by_date))
        return (by_date, ical_bytes)
    except Exception as e:
        _log.exception("refresh_speiseplan: Ablauf fehlgeschlagen: %s", e)
        return None


def main() -> None:
    """CLI entry point: fetch, parse and write the .ics file, reporting progress on stdout."""
    parser = argparse.ArgumentParser(
        description="Speiseplan von kantine-bhz.de als iCal exportieren."
    )
    parser.add_argument(
        "-o",
        "--output",
        default=DEFAULT_OUTPUT,
        metavar="FILE",
        help=f"Ausgabedatei .ics (Standard: {DEFAULT_OUTPUT})",
    )
    parser.add_argument(
        "--url",
        default=BASE_URL,
        metavar="URL",
        help=f"Basis-URL der Kantine (Standard: {BASE_URL})",
    )
    args = parser.parse_args()
    output_path: str = args.output
    # Ensure the output file carries the .ics extension.
    if not output_path.lower().endswith(".ics"):
        output_path = output_path + ".ics"
    print("Speiseplan-PDFs von", args.url, "laden ...")
    urls = fetch_speiseplan_pdf_urls(args.url)
    if not urls:
        print("Keine Speiseplan-PDFs gefunden.")
        return
    print(f" {len(urls)} PDF(s) gefunden.")
    all_parsed: list[list[tuple[date, list[str]]]] = []
    for url in urls:
        try:
            pdf_bytes = download_pdf(url)
            text = extract_text_from_pdf(pdf_bytes)
            days = parse_speiseplan_text(text)
            all_parsed.append(days)
        except Exception as e:
            # Best-effort: report and continue with the remaining PDFs.
            print(f" Fehler bei {url}: {e}")
    if not all_parsed:
        print("Keine Daten aus PDFs gelesen.")
        return
    by_date = merge_day_events(all_parsed)
    build_ical(by_date, output_path)
    print(f"Kalender mit {len(by_date)} Terminen geschrieben: {output_path}")


if __name__ == "__main__":
    main()