Files
kantine2ical/kantine2ical.py

250 lines
7.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Speiseplan von kantine-bhz.de aus PDFs auslesen und als iCal (.ics) exportieren.
Termine täglich 12:00 mit allen Tagesgerichten (I. bis V.).
"""
import argparse
import io
import re
from datetime import date, datetime, time
from zoneinfo import ZoneInfo
from urllib.parse import urljoin
import pdfplumber
import requests
from bs4 import BeautifulSoup
from icalendar import Calendar, Event
# Constants
BASE_URL = "http://kantine-bhz.de"  # canteen homepage that links the weekly menu PDFs
DEFAULT_OUTPUT = "kantine_speiseplan.ics"  # default output filename for the calendar
SUMMARY = "Kantine BHZ Kiel-Wik"  # SUMMARY field used for every generated event
TIMEZONE = "Europe/Berlin"  # IANA timezone name used for event times
EVENT_START = time(12, 0)  # lunch event starts at 12:00 local time ...
EVENT_END = time(13, 0)  # ... and ends at 13:00
# Regex for day-header lines: "<Weekday>, den DD.MM.YYYY"
DATE_LINE_RE = re.compile(
    r"(Montag|Dienstag|Mittwoch|Donnerstag|Freitag),\s*den\s+(\d{2})\.(\d{2})\.(\d{4})",
    re.IGNORECASE,
)
# Lines beginning with I., II., III., IV., V. — IV/V are listed before I{1,3}
# in the alternation so "IV" is not matched as "I" plus remainder.
DISH_LINE_RE = re.compile(r"^\s*(IV\.?|V\.?|I{1,3}\.?)\s*(.*)$", re.IGNORECASE)
# Roman numeral (without trailing dot) -> dish slot 1..5
ROMAN_ORDER = {"I": 1, "II": 2, "III": 3, "IV": 4, "V": 5}
def fetch_speiseplan_pdf_urls(base_url: str = BASE_URL) -> list[str]:
    """Load the start page and collect all Speiseplan PDF links (deduplicated)."""
    page = requests.get(base_url, timeout=30)
    page.raise_for_status()
    page.encoding = page.apparent_encoding or "utf-8"
    soup = BeautifulSoup(page.text, "html.parser")
    collected: list[str] = []
    known: set[str] = set()
    for anchor in soup.find_all("a", href=True):
        link = anchor["href"].strip()
        # NOTE(review): the extension check is case-insensitive but the
        # "Speiseplan" substring check is not — confirm the site never
        # lowercases the filename.
        if not (link.lower().endswith(".pdf") and "Speiseplan" in link):
            continue
        absolute = urljoin(base_url, link)
        if absolute not in known:
            known.add(absolute)
            collected.append(absolute)
    return collected
def download_pdf(url: str) -> bytes:
    """Fetch *url* and return the raw PDF bytes; raises on HTTP errors."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.content
def extract_text_from_pdf(pdf_bytes: bytes) -> str:
    """Return the concatenated text of all pages, extracted with pdfplumber.

    Pages that yield no text are skipped; pages are joined with newlines.
    """
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
        page_texts = [
            content
            for content in (page.extract_text() for page in document.pages)
            if content
        ]
    return "\n".join(page_texts)
def _normalize_roman(roman: str) -> int | None:
    """Map a roman-numeral token ('I'..'V', optional trailing dot, any case) to 1-5, else None."""
    return ROMAN_ORDER.get(roman.upper().rstrip("."))
def parse_speiseplan_text(text: str) -> list[tuple[date, list[str]]]:
    """Extract, per day, the five dishes (I.–V.) from the PDF text.

    Returns a list of (date, [dish_1, ..., dish_5]); slots without a parsed
    dish are filled with "".
    """
    lines = [raw.strip() for raw in text.splitlines() if raw.strip()]
    total = len(lines)
    parsed: list[tuple[date, list[str]]] = []
    pos = 0
    while pos < total:
        header = DATE_LINE_RE.match(lines[pos])
        if header is None:
            pos += 1
            continue
        _day_name, day, month, year = header.groups()
        try:
            day_date = date(int(year), int(month), int(day))
        except ValueError:
            # Impossible calendar date — skip this header line.
            pos += 1
            continue
        found: dict[int, str] = {}
        pos += 1
        while pos < total:
            current = lines[pos]
            # Stop at the next day header or at footer material.
            if DATE_LINE_RE.match(current):
                break
            if (current.startswith("_" * 10)
                    or "Öffnungszeiten" in current
                    or "Speisen enthalten" in current):
                break
            hit = DISH_LINE_RE.match(current)
            if hit:
                numeral, remainder = hit.groups()
                slot = _normalize_roman(numeral.strip().rstrip("."))
                # First occurrence of each slot wins.
                if slot is not None and 1 <= slot <= 5 and slot not in found:
                    prefix = numeral.strip()
                    found[slot] = f"{prefix} {remainder}".strip() if remainder else prefix
            pos += 1
        # Always exactly 5 entries; missing ones become "".
        parsed.append((day_date, [found.get(k, "") for k in range(1, 6)]))
    return parsed
def merge_day_events(all_parsed: list[list[tuple[date, list[str]]]]) -> dict[date, list[str]]:
    """Merge all PDF results; on a duplicate date the last occurrence wins."""
    merged: dict[date, list[str]] = {}
    for parsed in all_parsed:
        merged.update(dict(parsed))
    return merged
def empty_ical_bytes() -> bytes:
    """Return a minimal empty iCal calendar (no events) as bytes."""
    calendar = Calendar()
    for prop, value in (
        ("prodid", "-//Kantine BHZ Kiel-Wik Speiseplan//kantine2ical//DE"),
        ("version", "2.0"),
        ("CALSCALE", "GREGORIAN"),
    ):
        calendar.add(prop, value)
    return calendar.to_ical()
def build_ical_bytes(by_date: dict[date, list[str]]) -> bytes:
    """Build the iCal calendar as bytes (events 12:00-13:00, Europe/Berlin).

    One VEVENT per date, sorted chronologically; the dishes become the
    event DESCRIPTION, one per line. UIDs are derived from the date so a
    re-export updates rather than duplicates events in subscribing clients.
    """
    cal = Calendar()
    cal.add("prodid", "-//Kantine BHZ Kiel-Wik Speiseplan//kantine2ical//DE")
    cal.add("version", "2.0")
    cal.add("CALSCALE", "GREGORIAN")
    tz_berlin = ZoneInfo(TIMEZONE)
    # RFC 5545 requires a DTSTAMP in every VEVENT and the icalendar library
    # does not add one automatically; one shared timestamp per run keeps the
    # events consistent with each other.
    stamp = datetime.now(tz=ZoneInfo("UTC"))
    for d, dishes in sorted(by_date.items()):
        event = Event()
        event.add("uid", f"kantine-bhz-{d.isoformat()}@kantine2ical.local")
        event.add("dtstamp", stamp)
        event.add("summary", SUMMARY)
        event.add("description", "\n".join(dishes))
        event.add("dtstart", datetime.combine(d, EVENT_START, tzinfo=tz_berlin))
        event.add("dtend", datetime.combine(d, EVENT_END, tzinfo=tz_berlin))
        cal.add_component(event)
    return cal.to_ical()
def build_ical(by_date: dict[date, list[str]], output_path: str) -> None:
    """Build the iCal calendar and write it to *output_path* (12:00, Europe/Berlin)."""
    payload = build_ical_bytes(by_date)
    with open(output_path, "wb") as fh:
        fh.write(payload)
def refresh_speiseplan(base_url: str = BASE_URL) -> tuple[dict[date, list[str]], bytes] | None:
    """Run the whole pipeline: fetch URLs, download PDFs, parse, merge, build iCal.

    Returns (by_date, ical_bytes); returns None on any error or when no
    data could be extracted.
    """
    try:
        pdf_urls = fetch_speiseplan_pdf_urls(base_url)
        if not pdf_urls:
            return None
        parsed_per_pdf: list[list[tuple[date, list[str]]]] = []
        for pdf_url in pdf_urls:
            try:
                raw = download_pdf(pdf_url)
                parsed_per_pdf.append(parse_speiseplan_text(extract_text_from_pdf(raw)))
            except Exception:
                # Best effort: a PDF that fails to download or parse is skipped.
                continue
        if not parsed_per_pdf:
            return None
        merged = merge_day_events(parsed_per_pdf)
        return (merged, build_ical_bytes(merged))
    except Exception:
        # Deliberate catch-all: callers only distinguish "data" from None.
        return None
def main() -> None:
    """CLI entry point: download the menu PDFs and write an .ics file."""
    parser = argparse.ArgumentParser(
        description="Speiseplan von kantine-bhz.de als iCal exportieren."
    )
    parser.add_argument(
        "-o",
        "--output",
        default=DEFAULT_OUTPUT,
        metavar="FILE",
        help=f"Ausgabedatei .ics (Standard: {DEFAULT_OUTPUT})",
    )
    parser.add_argument(
        "--url",
        default=BASE_URL,
        metavar="URL",
        help=f"Basis-URL der Kantine (Standard: {BASE_URL})",
    )
    args = parser.parse_args()

    # Ensure the output path always carries the .ics extension.
    target: str = args.output
    if not target.lower().endswith(".ics"):
        target += ".ics"

    print("Speiseplan-PDFs von", args.url, "laden ...")
    pdf_urls = fetch_speiseplan_pdf_urls(args.url)
    if not pdf_urls:
        print("Keine Speiseplan-PDFs gefunden.")
        return
    print(f" {len(pdf_urls)} PDF(s) gefunden.")

    parsed_per_pdf: list[list[tuple[date, list[str]]]] = []
    for pdf_url in pdf_urls:
        try:
            raw = download_pdf(pdf_url)
            parsed_per_pdf.append(parse_speiseplan_text(extract_text_from_pdf(raw)))
        except Exception as e:
            print(f" Fehler bei {pdf_url}: {e}")
    if not parsed_per_pdf:
        print("Keine Daten aus PDFs gelesen.")
        return

    merged = merge_day_events(parsed_per_pdf)
    build_ical(merged, target)
    print(f"Kalender mit {len(merged)} Terminen geschrieben: {target}")
if __name__ == "__main__":
    main()