Initial commit: kantine2ical CLI, Flask-Server, Docker
This commit is contained in:
249
kantine2ical.py
Normal file
249
kantine2ical.py
Normal file
@@ -0,0 +1,249 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Speiseplan von kantine-bhz.de aus PDFs auslesen und als iCal (.ics) exportieren.
|
||||
Termine täglich 12:00 mit allen Tagesgerichten (I. bis V.).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import io
|
||||
import re
|
||||
from datetime import date, datetime, time
|
||||
from zoneinfo import ZoneInfo
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import pdfplumber
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from icalendar import Calendar, Event
|
||||
|
||||
# Constants
BASE_URL = "http://kantine-bhz.de"
DEFAULT_OUTPUT = "kantine_speiseplan.ics"
SUMMARY = "Kantine BHZ Kiel-Wik"
TIMEZONE = "Europe/Berlin"
EVENT_START = time(12, 0)  # daily lunch event starts at 12:00 local time
EVENT_END = time(13, 0)

# Regex: weekday header line, e.g. "Montag, den 05.02.2024"
# Groups: (weekday, DD, MM, YYYY)
DATE_LINE_RE = re.compile(
    r"(Montag|Dienstag|Mittwoch|Donnerstag|Freitag),\s*den\s+(\d{2})\.(\d{2})\.(\d{4})",
    re.IGNORECASE,
)
# Dish lines start with a roman numeral I.–V. followed by a dot.
# The dot is mandatory: with an optional dot, ordinary words beginning with
# I or V (e.g. "Italienischer Salat", "Vanillesoße") would be misdetected
# as dish headers. IV must be tried before I{1,3} so "IV." is not read as "I".
# Group 1 keeps the trailing dot so the displayed dish label is unchanged.
DISH_LINE_RE = re.compile(r"^\s*((?:IV|V|I{1,3})\.)\s*(.*)$", re.IGNORECASE)

# Maps a normalized roman numeral (no dot, uppercase) to its dish slot 1–5.
ROMAN_ORDER = {"I": 1, "II": 2, "III": 3, "IV": 4, "V": 5}
|
||||
|
||||
|
||||
def fetch_speiseplan_pdf_urls(base_url: str = BASE_URL) -> list[str]:
    """Load the start page and collect all Speiseplan PDF links (deduplicated, in order)."""
    response = requests.get(base_url, timeout=30)
    response.raise_for_status()
    # Some servers misreport the charset; fall back to the detected one.
    response.encoding = response.apparent_encoding or "utf-8"
    soup = BeautifulSoup(response.text, "html.parser")

    collected: list[str] = []
    known: set[str] = set()
    for anchor in soup.find_all("a", href=True):
        link = anchor["href"].strip()
        # Only PDF links whose path mentions "Speiseplan" are of interest.
        if not link.lower().endswith(".pdf") or "Speiseplan" not in link:
            continue
        absolute = urljoin(base_url, link)
        if absolute not in known:
            known.add(absolute)
            collected.append(absolute)
    return collected
|
||||
|
||||
|
||||
def download_pdf(url: str) -> bytes:
    """Download the PDF at *url* and return its raw bytes.

    Raises requests.HTTPError on a non-success status code.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.content
|
||||
|
||||
|
||||
def extract_text_from_pdf(pdf_bytes: bytes) -> str:
    """Extract text from all pages of a PDF using pdfplumber.

    Pages without extractable text are skipped; pages are joined with newlines.
    """
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        pages = [page.extract_text() for page in pdf.pages]
    # extract_text() may return None/"" for image-only pages; drop those.
    return "\n".join(text for text in pages if text)
|
||||
|
||||
|
||||
def _normalize_roman(roman: str) -> int | None:
    """Map a roman numeral token ("I"–"V", optional trailing dot, any case) to 1–5, else None."""
    key = roman.rstrip(".").upper()
    return ROMAN_ORDER.get(key)
|
||||
|
||||
|
||||
def parse_speiseplan_text(text: str) -> list[tuple[date, list[str]]]:
    """
    Extract the five dishes I.–V. per day (date) from the PDF text.

    Returns a list of (date, [dish_i, ..., dish_v]) tuples in the order
    the day headers appear in the text. Missing dishes are filled as "–".
    """
    # Drop blank lines and surrounding whitespace up front.
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    result: list[tuple[date, list[str]]] = []
    i = 0
    while i < len(lines):
        # Scan forward until a "Wochentag, den DD.MM.YYYY" header line.
        m = DATE_LINE_RE.match(lines[i])
        if not m:
            i += 1
            continue
        day_name, d, mo, y = m.groups()  # day_name itself is not used further
        try:
            dt = date(int(y), int(mo), int(d))
        except ValueError:
            # Implausible date (e.g. "31.02.") — skip this header line.
            i += 1
            continue
        dishes: dict[int, str] = {}
        i += 1
        # Collect dish lines until the next date header or a footer marker.
        while i < len(lines):
            line = lines[i]
            if DATE_LINE_RE.match(line):
                break
            # Separator rules, opening hours, or allergen notes end the day block.
            if line.startswith("_" * 10) or "Öffnungszeiten" in line or "Speisen enthalten" in line:
                break
            dish_m = DISH_LINE_RE.match(line)
            if dish_m:
                roman_part, rest = dish_m.groups()
                idx = _normalize_roman(roman_part.strip().rstrip("."))
                # First occurrence of each numeral wins; later duplicates
                # (e.g. continuation lines that happen to match) are ignored.
                if idx is not None and 1 <= idx <= 5 and idx not in dishes:
                    dishes[idx] = f"{roman_part.strip()} {rest}".strip() if rest else roman_part.strip()
            i += 1
        # Always exactly 5 entries; missing slots become "–".
        ordered = [dishes.get(j, "–") for j in range(1, 6)]
        result.append((dt, ordered))
    return result
|
||||
|
||||
|
||||
def merge_day_events(all_parsed: list[list[tuple[date, list[str]]]]) -> dict[date, list[str]]:
    """Merge the per-PDF results; for a duplicate date the last occurrence wins."""
    merged: dict[date, list[str]] = {}
    for day_list in all_parsed:
        # dict.update over (date, dishes) pairs keeps the last value per key,
        # matching the "last occurrence wins" contract.
        merged.update(day_list)
    return merged
|
||||
|
||||
|
||||
def empty_ical_bytes() -> bytes:
    """Return a minimal empty iCal calendar (no events) as bytes.

    Delegates to build_ical_bytes with an empty mapping so the calendar
    headers (PRODID/VERSION/CALSCALE) are defined in exactly one place
    instead of being duplicated here.
    """
    return build_ical_bytes({})
|
||||
|
||||
|
||||
def build_ical_bytes(by_date: dict[date, list[str]]) -> bytes:
    """Serialize the menu mapping to an iCal calendar (events 12:00–13:00, Europe/Berlin)."""
    calendar = Calendar()
    calendar.add("prodid", "-//Kantine BHZ Kiel-Wik Speiseplan//kantine2ical//DE")
    calendar.add("version", "2.0")
    calendar.add("CALSCALE", "GREGORIAN")

    zone = ZoneInfo(TIMEZONE)
    # Emit events in chronological order for a stable, diff-friendly output.
    for day, menu in sorted(by_date.items()):
        entry = Event()
        # Deterministic UID per day so re-imports update instead of duplicate.
        entry.add("uid", f"kantine-bhz-{day.isoformat()}@kantine2ical.local")
        entry.add("summary", SUMMARY)
        entry.add("description", "\n".join(menu))
        entry.add("dtstart", datetime.combine(day, EVENT_START, tzinfo=zone))
        entry.add("dtend", datetime.combine(day, EVENT_END, tzinfo=zone))
        calendar.add_component(entry)

    return calendar.to_ical()
|
||||
|
||||
|
||||
def build_ical(by_date: dict[date, list[str]], output_path: str) -> None:
    """Build the iCal calendar and write it to output_path (12:00, Europe/Berlin)."""
    payload = build_ical_bytes(by_date)
    with open(output_path, "wb") as out:
        out.write(payload)
|
||||
|
||||
|
||||
def refresh_speiseplan(base_url: str = BASE_URL) -> tuple[dict[date, list[str]], bytes] | None:
    """
    Run the full pipeline: fetch URLs, download PDFs, parse, merge, build iCal.

    Returns (by_date, ical_bytes); on any error or when no data is found,
    returns None instead of raising (best-effort server-side refresh).
    """
    try:
        pdf_urls = fetch_speiseplan_pdf_urls(base_url)
        if not pdf_urls:
            return None
        parsed_days: list[list[tuple[date, list[str]]]] = []
        for pdf_url in pdf_urls:
            try:
                raw = download_pdf(pdf_url)
                days = parse_speiseplan_text(extract_text_from_pdf(raw))
            except Exception:
                # Best effort: skip individual PDFs that fail to download or parse.
                continue
            parsed_days.append(days)
        if not parsed_days:
            return None
        merged = merge_day_events(parsed_days)
        return (merged, build_ical_bytes(merged))
    except Exception:
        # Deliberately swallow everything: callers treat None as "no update".
        return None
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: parse arguments, fetch and parse the PDFs, write the .ics file."""
    parser = argparse.ArgumentParser(
        description="Speiseplan von kantine-bhz.de als iCal exportieren."
    )
    parser.add_argument(
        "-o", "--output",
        default=DEFAULT_OUTPUT,
        metavar="FILE",
        help=f"Ausgabedatei .ics (Standard: {DEFAULT_OUTPUT})",
    )
    parser.add_argument(
        "--url",
        default=BASE_URL,
        metavar="URL",
        help=f"Basis-URL der Kantine (Standard: {BASE_URL})",
    )
    args = parser.parse_args()

    # Guarantee the output file carries the .ics extension.
    out_file: str = args.output
    if not out_file.lower().endswith(".ics"):
        out_file += ".ics"

    print("Speiseplan-PDFs von", args.url, "laden ...")
    pdf_urls = fetch_speiseplan_pdf_urls(args.url)
    if not pdf_urls:
        print("Keine Speiseplan-PDFs gefunden.")
        return
    print(f" {len(pdf_urls)} PDF(s) gefunden.")

    parsed_days: list[list[tuple[date, list[str]]]] = []
    for url in pdf_urls:
        try:
            raw = download_pdf(url)
            parsed_days.append(parse_speiseplan_text(extract_text_from_pdf(raw)))
        except Exception as e:
            # Report and keep going; a single broken PDF should not abort the run.
            print(f" Fehler bei {url}: {e}")

    if not parsed_days:
        print("Keine Daten aus PDFs gelesen.")
        return

    merged = merge_day_events(parsed_days)
    build_ical(merged, out_file)
    print(f"Kalender mit {len(merged)} Terminen geschrieben: {out_file}")
|
||||
|
||||
|
||||
# Allow running the module directly as a CLI script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user