diff --git a/.env.example b/.env.example index 2850ee5..5798de9 100755 --- a/.env.example +++ b/.env.example @@ -6,6 +6,10 @@ OpenRouterAPIKey= # Valid examples: anthropic/claude-3.5-haiku, anthropic/claude-3-haiku, anthropic/claude-haiku-4.5 # OpenRouterModel=anthropic/claude-3.5-haiku +# Speech-to-Text Transcription Service (local Parakeet container endpoint) +# Defaults to: http://localhost:5092/v1/audio/transcriptions (or http://parakeet:5092/v1/audio/transcriptions in Docker) +# PARAKEET_URL=http://localhost:5092/v1/audio/transcriptions + # DeepL API (for scripts/translate-locales.mjs and scripts/translate-flyer.mjs) # Free plan keys use api-free.deepl.com automatically (suffix :fx) DeepLAPIKey= diff --git a/client/src/components/EventRemarksCell.tsx b/client/src/components/EventRemarksCell.tsx index 211c83e..dfa83d8 100644 --- a/client/src/components/EventRemarksCell.tsx +++ b/client/src/components/EventRemarksCell.tsx @@ -1,24 +1,80 @@ +import { useState, useEffect } from 'react' import { useTranslation } from 'react-i18next' +import { Mic, Loader2 } from 'lucide-react' import type { LogEventPayload } from '../utils/logEntryPayload.js' import { parseLiveVoiceRemark } from '../utils/liveEventCodes.js' import { formatEventSummary } from '../utils/formatEventSummary.js' import VoiceMemoPlayer, { type PreloadedVoiceMemo } from './VoiceMemoPlayer.tsx' +import { useDialog } from './ModalDialog.tsx' +import { updateVoiceMemoTranscript } from '../services/voiceAttachments.js' interface EventRemarksCellProps { event: LogEventPayload logbookId: string voiceMemoLookup?: Map + readOnly?: boolean } export default function EventRemarksCell({ event, logbookId, - voiceMemoLookup + voiceMemoLookup, + readOnly = false }: EventRemarksCellProps) { const { t } = useTranslation() + const { showAlert } = useDialog() const voiceId = parseLiveVoiceRemark(event.remarks.trim()) const preloaded = voiceId ? 
/**
 * Click handler for the "transcribe" button rendered next to a voice memo.
 *
 * Sends the preloaded audio data URL to `/api/ai/transcribe` and, when a non-empty
 * transcript comes back, stores it on the memo through `updateVoiceMemoTranscript`.
 * Every failure (HTTP error status, 15 s timeout, empty or malformed transcript,
 * storage error) is logged to the console and reported to the user with an alert.
 * Does nothing while a previous request is still running or when no audio is preloaded.
 */
const handleTranscribe = async (e: React.MouseEvent) => {
  // Suppress the default action and stop the click from bubbling to parent handlers.
  e.preventDefault()
  e.stopPropagation()
  if (transcribing || !preloaded?.audio || !voiceId) return

  setTranscribing(true)
  const controller = new AbortController()
  const timeoutId = setTimeout(() => controller.abort(), 15000)
  try {
    const res = await fetch('/api/ai/transcribe', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({ audioDataUrl: preloaded.audio }),
      signal: controller.signal
    })
    if (!res.ok) {
      throw new Error(`Server returned status ${res.status}`)
    }

    // The abort timer is intentionally still armed here: reading the body can stall
    // as well, and aborting the signal cancels the pending body read.
    const data: unknown = await res.json()
    const rawText =
      typeof data === 'object' && data !== null ? (data as { text?: unknown }).text : undefined
    const text = typeof rawText === 'string' ? rawText.trim() : ''
    if (!text) {
      throw new Error('Transcription returned empty text')
    }

    await updateVoiceMemoTranscript(logbookId, voiceId, text)
  } catch (err: unknown) {
    console.error('[EventRemarksCell] Transcription failed:', err)
    void showAlert(t('logs.live_voice_transcribe_failed'), t('logs.live_voice_btn'))
  } finally {
    // Single cleanup point for both the success and the failure path.
    clearTimeout(timeoutId)
    setTranscribing(false)
  }
}
{summary} {voiceId && ( - +
+ + {!readOnly && preloaded && preloaded.transcribed === false && isOnline && ( + + )} +
)}
) diff --git a/client/src/components/LiveLogView.tsx b/client/src/components/LiveLogView.tsx index 7544dbe..3a8624a 100644 --- a/client/src/components/LiveLogView.tsx +++ b/client/src/components/LiveLogView.tsx @@ -31,7 +31,6 @@ import { removeLastEvent } from '../services/quickEventLog.js' import CreatorAvatar from './CreatorAvatar.tsx' -import { formatEventSummary } from '../utils/formatEventSummary.js' import { getLastAutoPositionMs, getLastLoggedPositionWithin, @@ -43,7 +42,6 @@ import { liveFuelRemark, livePhotoRemark, liveVoiceRemark, - parseLiveVoiceRemark, livePrecipRemark, liveSailsRemark, liveSogRemark, @@ -80,7 +78,7 @@ import CourseDialInput from './CourseDialInput.tsx' import GpsSignalHint from './GpsSignalHint.tsx' import LiveCameraCapture from './LiveCameraCapture.tsx' import LiveVoiceCapture from './LiveVoiceCapture.tsx' -import VoiceMemoPlayer from './VoiceMemoPlayer.tsx' +import EventRemarksCell from './EventRemarksCell.tsx' import { saveEntryPhoto, deleteEntryPhoto } from '../services/photoAttachments.js' import { saveEntryVoiceMemo, deleteEntryVoiceMemo } from '../services/voiceAttachments.js' import { blobToCompressedJpegDataUrl } from '../utils/imageCompress.js' @@ -836,13 +834,46 @@ export default function LiveLogView({ void (async () => { try { const audioDataUrl = await blobToAudioDataUrl(blob) + + let transcriptionText = '' + let transcribed = true + let transcriptionError = false + + try { + const controller = new AbortController() + const timeoutId = setTimeout(() => controller.abort(), 4000) + + const res = await fetch('/api/ai/transcribe', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ audioDataUrl }), + signal: controller.signal + }) + clearTimeout(timeoutId) + if (!res.ok) throw new Error(`Status ${res.status}`) + const data = await res.json() + transcriptionText = (data.text || '').trim() + } catch (err) { + console.warn('[LiveLogView] Automatic transcription failed or timed out:', 
err) + transcriptionError = true + transcribed = false + } + + let finalCaption = caption + if (transcriptionText) { + finalCaption = caption + ? `${caption}\n(Transkript: ${transcriptionText})` + : transcriptionText + } + const voiceId = await saveEntryVoiceMemo({ logbookId, entryId, audioDataUrl, mimeType, durationSec, - caption, + caption: finalCaption, + transcribed, analyticsContext: 'live_log' }) await appendQuickEvent(logbookId, entryId, { @@ -854,6 +885,10 @@ export default function LiveLogView({ setVoiceCaption('') showUndo('voice') trackPlausibleEvent(PlausibleEvents.LIVE_LOG_EVENT_LOGGED, { action: 'voice' }) + + if (transcriptionError) { + void showAlert(t('logs.live_voice_transcribe_failed'), t('logs.live_voice_btn')) + } } catch (err: unknown) { console.error('Live log voice save failed:', err) const msg = err instanceof Error && err.message === 'VOICE_MEMO_TOO_LARGE' @@ -1225,12 +1260,6 @@ export default function LiveLogView({ ) : (
    {events.map((event, index) => { - const voiceId = parseLiveVoiceRemark(event.remarks.trim()) - const voicePreloaded = voiceId ? voiceMemoLookup.get(voiceId) : undefined - let summary = formatEventSummary(event, t) - if (voiceId && voicePreloaded?.caption) { - summary = t('logs.live_voice_entry', { caption: voicePreloaded.caption }) - } return (
  1. @@ -1240,15 +1269,12 @@ export default function LiveLogView({ size={24} />
    - {summary} - {voiceId && ( - - )} +
  2. ) diff --git a/client/src/components/LogEntryEditor.tsx b/client/src/components/LogEntryEditor.tsx index 4c9be29..88324a4 100644 --- a/client/src/components/LogEntryEditor.tsx +++ b/client/src/components/LogEntryEditor.tsx @@ -1909,6 +1909,7 @@ export default function LogEntryEditor({ event={ev} logbookId={logbookId} voiceMemoLookup={voiceMemoLookup} + readOnly={readOnly} /> {!readOnly && ( diff --git a/client/src/components/VoiceMemoPlayer.tsx b/client/src/components/VoiceMemoPlayer.tsx index a104cf4..ed9fdf1 100644 --- a/client/src/components/VoiceMemoPlayer.tsx +++ b/client/src/components/VoiceMemoPlayer.tsx @@ -11,6 +11,7 @@ export interface PreloadedVoiceMemo { mimeType?: string durationSec?: number caption?: string + transcribed?: boolean } interface VoiceMemoPlayerProps { diff --git a/client/src/hooks/useEntryVoiceMemos.ts b/client/src/hooks/useEntryVoiceMemos.ts index 518c0e4..0554872 100644 --- a/client/src/hooks/useEntryVoiceMemos.ts +++ b/client/src/hooks/useEntryVoiceMemos.ts @@ -48,7 +48,8 @@ export function useEntryVoiceMemos( audio: String(decrypted.audio), mimeType: decrypted.mimeType ? String(decrypted.mimeType) : undefined, durationSec: typeof decrypted.durationSec === 'number' ? decrypted.durationSec : undefined, - caption: decrypted.caption ? String(decrypted.caption) : '' + caption: decrypted.caption ? String(decrypted.caption) : '', + transcribed: decrypted.transcribed !== false }) } catch { // skip corrupt memo diff --git a/client/src/i18n/locales/da.json b/client/src/i18n/locales/da.json index 2bc837f..2959502 100644 --- a/client/src/i18n/locales/da.json +++ b/client/src/i18n/locales/da.json @@ -297,6 +297,9 @@ "live_voice_entry_plain": "Stemmenotat", "live_voice_caption_label": "Billedtekst (valgfrit)", "live_voice_caption_placeholder": "f.eks. 
radiokontakt med havnemester", + "live_voice_transcribe_action": "Transkribere", + "live_voice_transcribing": "Transkriberer…", + "live_voice_transcribe_failed": "Stemmenotat gemt, men transkribering mislykkedes.", "live_undo_voice_hint": "Stemmenotat gemt", "live_comment_btn": "Kommentar", "live_comment_placeholder": "Indtast tekst…", diff --git a/client/src/i18n/locales/de.json b/client/src/i18n/locales/de.json index b938cc4..6308e12 100644 --- a/client/src/i18n/locales/de.json +++ b/client/src/i18n/locales/de.json @@ -297,6 +297,9 @@ "live_voice_entry_plain": "Sprachnotiz", "live_voice_caption_label": "Beschriftung (optional)", "live_voice_caption_placeholder": "z. B. Funkverkehr mit Hafenmeister", + "live_voice_transcribe_action": "Transkribieren", + "live_voice_transcribing": "Transkribiere…", + "live_voice_transcribe_failed": "Sprachnotiz gespeichert, aber Transkription fehlgeschlagen.", "live_undo_voice_hint": "Sprachnotiz gespeichert", "live_comment_btn": "Kommentar", "live_comment_placeholder": "Freitext eingeben…", diff --git a/client/src/i18n/locales/en.json b/client/src/i18n/locales/en.json index 1340d29..1728dbe 100644 --- a/client/src/i18n/locales/en.json +++ b/client/src/i18n/locales/en.json @@ -297,6 +297,9 @@ "live_voice_entry_plain": "Voice memo", "live_voice_caption_label": "Caption (optional)", "live_voice_caption_placeholder": "e.g. 
radio call with harbour master", + "live_voice_transcribe_action": "Transcribe", + "live_voice_transcribing": "Transcribing…", + "live_voice_transcribe_failed": "Voice memo saved, but transcription failed.", "live_undo_voice_hint": "Voice memo saved", "live_comment_btn": "Comment", "live_comment_placeholder": "Enter text…", diff --git a/client/src/i18n/locales/nb.json b/client/src/i18n/locales/nb.json index 669768f..a3de24e 100644 --- a/client/src/i18n/locales/nb.json +++ b/client/src/i18n/locales/nb.json @@ -297,6 +297,9 @@ "live_voice_entry_plain": "Talemelding", "live_voice_caption_label": "Bildetekst (valgfritt)", "live_voice_caption_placeholder": "f.eks. radiokontakt med havnesjef", + "live_voice_transcribe_action": "Transkribere", + "live_voice_transcribing": "Transkriberer…", + "live_voice_transcribe_failed": "Talemelding lagret, men transkribering mislyktes.", "live_undo_voice_hint": "Talemelding lagret", "live_comment_btn": "Kommentar", "live_comment_placeholder": "Skriv inn tekst…", diff --git a/client/src/i18n/locales/sv.json b/client/src/i18n/locales/sv.json index 179fbbf..30b95ae 100644 --- a/client/src/i18n/locales/sv.json +++ b/client/src/i18n/locales/sv.json @@ -297,6 +297,9 @@ "live_voice_entry_plain": "Röstanteckning", "live_voice_caption_label": "Bildtext (valfritt)", "live_voice_caption_placeholder": "t.ex. 
radiokontakt med hamnmästare", + "live_voice_transcribe_action": "Transkribera", + "live_voice_transcribing": "Transkriberar…", + "live_voice_transcribe_failed": "Röstanteckning sparad, men transkribering misslyckades.", "live_undo_voice_hint": "Röstanteckning sparad", "live_comment_btn": "Kommentar", "live_comment_placeholder": "Ange text…", diff --git a/client/src/services/voiceAttachments.ts b/client/src/services/voiceAttachments.ts index 72d2344..fb28fd1 100644 --- a/client/src/services/voiceAttachments.ts +++ b/client/src/services/voiceAttachments.ts @@ -1,7 +1,7 @@ import { db } from './db.js' import { getActiveMasterKey } from './auth.js' import { getLogbookKey } from './logbookKeys.js' -import { encryptJson } from './crypto.js' +import { encryptJson, decryptJson } from './crypto.js' import { syncLogbook } from './sync.js' import { PlausibleEvents, trackPlausibleEvent } from './analytics.js' @@ -18,6 +18,7 @@ export async function saveEntryVoiceMemo(options: { mimeType: string durationSec: number caption?: string + transcribed?: boolean analyticsContext?: string }): Promise { const { @@ -27,6 +28,7 @@ export async function saveEntryVoiceMemo(options: { mimeType, durationSec, caption = '', + transcribed = true, analyticsContext = 'logbook' } = options const masterKey = await getEncryptionKey(logbookId) @@ -35,7 +37,8 @@ export async function saveEntryVoiceMemo(options: { audio: audioDataUrl, mimeType, durationSec, - caption: caption.trim() + caption: caption.trim(), + transcribed: !!transcribed } const encrypted = await encryptJson(voicePayload, masterKey) @@ -98,3 +101,55 @@ export async function removeLastVoiceMemoForEntry( await deleteEntryVoiceMemo(logbookId, lastId) return lastId } + +/** Updates an existing voice memo payload with a new transcript and sets transcribed: true. 
/**
 * Merges a transcript into an existing voice memo's caption and marks the memo as transcribed.
 *
 * The stored memo is decrypted, its caption is rewritten, and the payload is re-encrypted,
 * written back to `db.voiceMemos` and queued in `db.syncQueue` as an `update`; a background
 * sync of the logbook is then started without being awaited.
 *
 * Caption rules: an existing caption is kept and the transcript is appended on a new line as
 * `(Transkript: …)`; without an existing caption the transcript becomes the caption.
 *
 * The call is a no-op when the memo is already flagged `transcribed: true`, so a repeated
 * invocation (second click, second view showing the same memo) cannot append the transcript
 * to the caption twice.
 *
 * @param logbookId - Logbook the memo belongs to; used to resolve the encryption key and to
 *   address the sync queue entry and the background sync.
 * @param voiceId - Key of the memo record in `db.voiceMemos`.
 * @param transcript - Transcribed text; surrounding whitespace is trimmed before use.
 * @throws Error when the transcript is blank, the memo does not exist, or it cannot be decrypted.
 */
export async function updateVoiceMemoTranscript(
  logbookId: string,
  voiceId: string,
  transcript: string
): Promise<void> {
  // Reject blank input up front so a memo is never marked transcribed with nothing in it.
  const cleanTranscript = transcript.trim()
  if (!cleanTranscript) throw new Error('Transcript is empty')

  const masterKey = await getEncryptionKey(logbookId)
  const record = await db.voiceMemos.get(voiceId)
  if (!record) throw new Error('Voice memo not found')

  const decrypted = await decryptJson(record.encryptedData, record.iv, record.tag, masterKey)
  if (!decrypted) throw new Error('Failed to decrypt voice memo')

  // Idempotency guard: the caption of a transcribed memo already contains its transcript.
  if (decrypted.transcribed === true) return

  const manualCaption = decrypted.caption ? String(decrypted.caption).trim() : ''
  const finalCaption = manualCaption
    ? `${manualCaption}\n(Transkript: ${cleanTranscript})`
    : cleanTranscript

  const updatedPayload = {
    ...decrypted,
    caption: finalCaption,
    transcribed: true
  }

  const encrypted = await encryptJson(updatedPayload, masterKey)
  const now = new Date().toISOString()

  // Local copy first, so the new caption is available even before the sync runs.
  await db.voiceMemos.put({
    ...record,
    encryptedData: encrypted.ciphertext,
    iv: encrypted.iv,
    tag: encrypted.tag,
    updatedAt: now
  })

  await db.syncQueue.put({
    action: 'update',
    type: 'voiceMemo',
    payloadId: voiceId,
    logbookId,
    data: JSON.stringify({
      encryptedData: encrypted.ciphertext,
      iv: encrypted.iv,
      tag: encrypted.tag,
      entryId: record.entryId
    }),
    updatedAt: now
  })

  // Fire-and-forget: a failed sync is only logged, the queue entry stays for a later attempt.
  syncLogbook(logbookId).catch((err) => console.warn('Background sync failed:', err))
}
// Without a session the transcription endpoint must answer 401 with an "Unauthorized" error.
it('POST /api/ai/transcribe requires session', async () => {
  const payload = { audioDataUrl: 'data:audio/webm;base64,abcdef' }
  const response = await request(app).post('/api/ai/transcribe').send(payload)

  expect(response.status).toBe(401)
  expect(response.body.error).toMatch(/Unauthorized/i)
})
/**
 * POST /transcribe — speech-to-text proxy for voice memos.
 *
 * Expects a JSON body `{ audioDataUrl }` containing a base64 `data:` URL. The audio is
 * decoded and forwarded as multipart form data (field `file`) to the endpoint configured
 * in `PARAKEET_URL`.
 *
 * Responses:
 * - 200 `{ text }`  — trimmed transcript (empty string when upstream returned no text)
 * - 400 `{ error }` — `audioDataUrl` missing, not a string, or not a base64 data URL
 * - 504 `{ error }` — upstream did not answer within 15 seconds
 * - 503 `{ error }` — any other upstream or internal failure
 */
router.post('/transcribe', async (req: any, res) => {
  try {
    const { audioDataUrl } = req.body ?? {}
    if (!audioDataUrl || typeof audioDataUrl !== 'string') {
      return res.status(400).json({ error: 'audioDataUrl is required' })
    }

    // Split at the first ";base64," instead of matching `[^;]+`: the media type of a
    // recorded blob may carry parameters (e.g. "audio/webm;codecs=opus"), and those
    // data URLs must not be rejected.
    const scheme = 'data:'
    const marker = ';base64,'
    const markerAt = audioDataUrl.indexOf(marker)
    const hasValidShape = audioDataUrl.startsWith(scheme) && markerAt > scheme.length
    const mediaType = hasValidShape ? audioDataUrl.slice(scheme.length, markerAt) : ''
    const base64Data = hasValidShape ? audioDataUrl.slice(markerAt + marker.length) : ''

    // Only the bare MIME type (without parameters) is used for the upload.
    const mimeType = (mediaType.split(';')[0] ?? '').trim().toLowerCase()
    if (!mimeType || !base64Data) {
      return res.status(400).json({ error: 'Invalid audio data URL format' })
    }

    // Buffer.from never throws on bad base64; an empty result means nothing was decodable.
    const buffer = Buffer.from(base64Data, 'base64')
    if (buffer.length === 0) {
      return res.status(400).json({ error: 'Invalid audio data URL format' })
    }

    let ext = 'webm'
    if (mimeType.includes('mp4')) ext = 'mp4'
    else if (mimeType.includes('ogg')) ext = 'ogg'
    else if (mimeType.includes('wav')) ext = 'wav'

    const filename = `audio.${ext}`
    const file = new File([buffer], filename, { type: mimeType })

    const formData = new FormData()
    formData.append('file', file)

    console.log(`[server] Forwarding ASR request to ${PARAKEET_URL} (${filename}, ${buffer.length} bytes)`)

    const controller = new AbortController()
    const timeoutId = setTimeout(() => controller.abort(), 15000)

    try {
      const parakeetRes = await fetch(PARAKEET_URL, {
        method: 'POST',
        body: formData,
        signal: controller.signal
      })

      if (!parakeetRes.ok) {
        const errorText = await parakeetRes.text().catch(() => '')
        console.error(`[server] Parakeet ASR error response (status=${parakeetRes.status}):`, errorText)
        throw new Error(`Parakeet returned status ${parakeetRes.status}`)
      }

      // Upstream JSON is untrusted: accept `text` only when it really is a string.
      const data: unknown = await parakeetRes.json()
      const rawText =
        typeof data === 'object' && data !== null ? (data as { text?: unknown }).text : undefined
      const text = typeof rawText === 'string' ? rawText.trim() : ''

      // Log the size only: the transcript is user content and is stored encrypted on the
      // client, so it must not end up in plaintext server logs.
      console.log(`[server] ASR completed successfully (${text.length} chars)`)
      return res.json({ text })
    } catch (error: unknown) {
      if (error instanceof Error && error.name === 'AbortError') {
        console.error('[server] Parakeet ASR request timed out')
        return res.status(504).json({ error: 'Transcription request timed out' })
      }
      // Everything else is reported as 503 by the outer handler.
      throw error
    } finally {
      clearTimeout(timeoutId)
    }
  } catch (error: unknown) {
    console.error('ASR transcription failed:', error)
    return res.status(503).json({ error: 'Transcription service unavailable' })
  }
})