From 1d948242b379547aafc52b3350e7397546c401a6 Mon Sep 17 00:00:00 2001 From: elpatron Date: Fri, 6 Mar 2026 20:50:30 +0100 Subject: [PATCH] Add drag-and-drop audio transcription, fix model download, widen window - Add audio module: WAV/MP3 conversion to 16kHz mono for Parakeet - Add drop zone for audio files (WAV, MP3) at bottom of settings UI - Enable drag-and-drop in viewport, process dropped files - Fix model download: use altunenes/parakeet-rs/tdt with correct filenames - Move drop zone above status line - Increase window width by ~1/3 (640px default, 500px min) Made-with: Cursor --- HotKeet/Cargo.lock | 162 ++++++++++++++++++++++++++++++++++++ HotKeet/Cargo.toml | 1 + HotKeet/src/audio.rs | 191 +++++++++++++++++++++++++++++++++++++++++++ HotKeet/src/main.rs | 87 +++++++++++++++++++- HotKeet/src/model.rs | 9 +- HotKeet/src/ui.rs | 31 +++++++ 6 files changed, 475 insertions(+), 6 deletions(-) create mode 100644 HotKeet/src/audio.rs diff --git a/HotKeet/Cargo.lock b/HotKeet/Cargo.lock index 04e59d3..8568fd1 100644 --- a/HotKeet/Cargo.lock +++ b/HotKeet/Cargo.lock @@ -1013,6 +1013,15 @@ dependencies = [ "bytemuck", ] +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + [[package]] name = "endi" version = "1.1.1" @@ -1153,6 +1162,12 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "extended" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af9673d8203fcb076b19dfd17e38b3d4ae9f44959416ea532ce72415a6020365" + [[package]] name = "fastrand" version = "2.3.0" @@ -1617,6 +1632,7 @@ dependencies = [ "rfd", "serde", "serde_json", + "symphonia", "tokio", "tray-item", "winapi", @@ -3508,6 +3524,152 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "symphonia" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5773a4c030a19d9bfaa090f49746ff35c75dfddfa700df7a5939d5e076a57039" +dependencies = [ + "lazy_static", + "symphonia-bundle-flac", + "symphonia-bundle-mp3", + "symphonia-codec-adpcm", + "symphonia-codec-pcm", + "symphonia-codec-vorbis", + "symphonia-core", + "symphonia-format-mkv", + "symphonia-format-ogg", + "symphonia-format-riff", + "symphonia-metadata", +] + +[[package]] +name = "symphonia-bundle-flac" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c91565e180aea25d9b80a910c546802526ffd0072d0b8974e3ebe59b686c9976" +dependencies = [ + "log", + "symphonia-core", + "symphonia-metadata", + "symphonia-utils-xiph", +] + +[[package]] +name = "symphonia-bundle-mp3" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4872dd6bb56bf5eac799e3e957aa1981086c3e613b27e0ac23b176054f7c57ed" +dependencies = [ + "lazy_static", + "log", + "symphonia-core", + "symphonia-metadata", +] + +[[package]] +name = "symphonia-codec-adpcm" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dddc50e2bbea4cfe027441eece77c46b9f319748605ab8f3443350129ddd07f" +dependencies = [ + "log", + "symphonia-core", +] + +[[package]] +name = "symphonia-codec-pcm" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e89d716c01541ad3ebe7c91ce4c8d38a7cf266a3f7b2f090b108fb0cb031d95" +dependencies = [ + "log", + "symphonia-core", +] + +[[package]] +name = "symphonia-codec-vorbis" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f025837c309cd69ffef572750b4a2257b59552c5399a5e49707cc5b1b85d1c73" +dependencies = [ + "log", + "symphonia-core", + "symphonia-utils-xiph", +] + +[[package]] +name = "symphonia-core" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea00cc4f79b7f6bb7ff87eddc065a1066f3a43fe1875979056672c9ef948c2af" +dependencies = [ + "arrayvec", + "bitflags 1.3.2", + "bytemuck", + "lazy_static", + "log", +] + +[[package]] +name = "symphonia-format-mkv" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "122d786d2c43a49beb6f397551b4a050d8229eaa54c7ddf9ee4b98899b8742d0" +dependencies = [ + "lazy_static", + "log", + "symphonia-core", + "symphonia-metadata", + "symphonia-utils-xiph", +] + +[[package]] +name = "symphonia-format-ogg" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b4955c67c1ed3aa8ae8428d04ca8397fbef6a19b2b051e73b5da8b1435639cb" +dependencies = [ + "log", + "symphonia-core", + "symphonia-metadata", + "symphonia-utils-xiph", +] + +[[package]] +name = "symphonia-format-riff" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2d7c3df0e7d94efb68401d81906eae73c02b40d5ec1a141962c592d0f11a96f" +dependencies = [ + "extended", + "log", + "symphonia-core", + "symphonia-metadata", +] + +[[package]] +name = "symphonia-metadata" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36306ff42b9ffe6e5afc99d49e121e0bd62fe79b9db7b9681d48e29fa19e6b16" +dependencies = [ + "encoding_rs", + "lazy_static", + "log", + "symphonia-core", +] + +[[package]] +name = "symphonia-utils-xiph" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee27c85ab799a338446b68eec77abf42e1a6f1bb490656e121c6e27bfbab9f16" +dependencies = [ + "symphonia-core", + "symphonia-metadata", +] + [[package]] name = "syn" version = "1.0.109" diff --git a/HotKeet/Cargo.toml b/HotKeet/Cargo.toml index 9f5497d..d911fdf 100644 --- a/HotKeet/Cargo.toml +++ b/HotKeet/Cargo.toml @@ -28,6 +28,7 @@ chrono = "0.4" raw-window-handle = "0.6" rfd = "0.14" reqwest = { version = "0.12", default-features = false, features = ["blocking", "rustls-tls"] } +symphonia = { version = "0.5", features = ["mp3"] } [build-dependencies] winresource = "0.1" diff --git a/HotKeet/src/audio.rs b/HotKeet/src/audio.rs new file mode 100644 index 0000000..2228555 --- /dev/null +++ b/HotKeet/src/audio.rs @@ -0,0 +1,191 @@ +//! Audio-Dateien (WAV, MP3) für Transkription vorbereiten. +//! Parakeet erwartet: 16 kHz, Mono, 16-bit PCM. + +use std::path::Path; +use symphonia::core::audio::Signal; + +const TARGET_SAMPLE_RATE: u32 = 16000; +const TARGET_CHANNELS: u16 = 1; + +/// Bereitet eine Audio-Datei für die Transkription vor. +/// Gibt den Pfad zu einer temporären WAV-Datei zurück (16 kHz, Mono, 16-bit). +/// Der Aufrufer sollte die Temp-Datei nach der Transkription löschen. +pub fn prepare_for_transcription(path: &Path) -> Result { + let ext = path + .extension() + .and_then(|e| e.to_str()) + .map(|s| s.to_lowercase()) + .unwrap_or_default(); + + let samples = match ext.as_str() { + "wav" => decode_wav(path)?, + "mp3" => decode_mp3(path)?, + _ => return Err(format!("Unsupported format: .{} (use .wav or .mp3)", ext)), + }; + + let temp_path = std::env::temp_dir().join(format!( + "hotkeet-transcribe-{}.wav", + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_nanos()) + .unwrap_or(0) + )); + + write_wav(&temp_path, &samples)?; + Ok(temp_path) +} + +fn decode_wav(path: &Path) -> Result, String> { + let reader = hound::WavReader::open(path).map_err(|e| format!("WAV read: {}", e))?; + let spec = reader.spec(); + let sample_rate = spec.sample_rate; + let channels = spec.channels; + + let samples: Vec = match spec.sample_format { + hound::SampleFormat::Int => reader + .into_samples::() + .filter_map(|s| s.ok()) + .collect(), + hound::SampleFormat::Float => reader + .into_samples::() + .filter_map(|s| s.ok()) + .map(|f| (f.clamp(-1.0, 1.0) * 32767.0) as i16) + .collect(), + }; + + let mono = if channels > 1 { + samples + .chunks(channels as usize) + .map(|c| { + let sum: i32 = c.iter().map(|&s| s as i32).sum(); + (sum / channels as i32).clamp(-32768, 32767) as i16 + }) + .collect() + } else { + samples + }; + + let resampled = resample_i16(&mono, sample_rate, TARGET_SAMPLE_RATE); + Ok(resampled) +} + +fn decode_mp3(path: &Path) -> Result, String> { + let file = std::fs::File::open(path).map_err(|e| format!("Open MP3: {}", e))?; + let source = symphonia::core::io::MediaSourceStream::new( + Box::new(file), + symphonia::core::io::MediaSourceStreamOptions::default(), + ); + + let probe = symphonia::default::get_probe(); + let result = probe + .format( + &Default::default(), + source, + &Default::default(), + &Default::default(), + ) + .map_err(|e| format!("MP3 probe: {}", e))?; + + let mut format = result.format; + + let track = format + .tracks() + .iter() + .find(|t| t.codec_params.codec != symphonia::core::codecs::CODEC_TYPE_NULL) + .ok_or("No audio track in MP3")? + .clone(); + + let mut decoder = symphonia::default::get_codecs() + .make(&track.codec_params, &Default::default()) + .map_err(|e| format!("MP3 decoder: {}", e))?; + + let sample_rate = track + .codec_params + .sample_rate + .ok_or("MP3: no sample rate")? as u32; + let channels = track + .codec_params + .channels + .ok_or("MP3: no channels")? + .count() as usize; + + let mut all_samples: Vec = Vec::new(); + + while let Ok(packet) = format.next_packet() { + if let Ok(decoded) = decoder.decode(&packet) { + match decoded { + symphonia::core::audio::AudioBufferRef::F32(buf) => { + let n_frames = buf.frames(); + for i in 0..n_frames { + let mut sum = 0.0f32; + for c in 0..channels { + sum += buf.chan(c)[i]; + } + sum /= channels as f32; + all_samples.push((sum.clamp(-1.0, 1.0) * 32767.0) as i16); + } + } + symphonia::core::audio::AudioBufferRef::S16(buf) => { + let n_frames = buf.frames(); + for i in 0..n_frames { + let mut sum = 0i32; + for c in 0..channels { + sum += buf.chan(c)[i] as i32; + } + all_samples.push((sum / channels as i32).clamp(-32768, 32767) as i16); + } + } + symphonia::core::audio::AudioBufferRef::U16(buf) => { + let n_frames = buf.frames(); + for i in 0..n_frames { + let mut sum = 0i32; + for c in 0..channels { + sum += (buf.chan(c)[i] as i32) - 32768; + } + all_samples.push((sum / channels as i32).clamp(-32768, 32767) as i16); + } + } + _ => return Err("MP3: unsupported sample format".to_string()), + } + } + } + + let resampled = resample_i16(&all_samples, sample_rate, TARGET_SAMPLE_RATE); + Ok(resampled) +} + +fn resample_i16(samples: &[i16], from_rate: u32, to_rate: u32) -> Vec { + if from_rate == to_rate { + return samples.to_vec(); + } + let ratio = from_rate as f64 / to_rate as f64; + let out_len = (samples.len() as f64 / ratio) as usize; + let mut out = Vec::with_capacity(out_len); + for i in 0..out_len { + let src_idx = i as f64 * ratio; + let idx0 = src_idx.floor() as usize; + let idx1 = (idx0 + 1).min(samples.len().saturating_sub(1)); + let frac = src_idx - idx0 as f64; + let s0 = samples.get(idx0).copied().unwrap_or(0) as f64; + let s1 = samples.get(idx1).copied().unwrap_or(0) as f64; + let s = s0 * (1.0 - frac) + s1 * frac; + out.push(s.clamp(-32768.0, 32767.0) as i16); + } + out +} + +fn write_wav(path: &Path, samples: &[i16]) -> Result<(), String> { + let spec = hound::WavSpec { + channels: TARGET_CHANNELS, + sample_rate: TARGET_SAMPLE_RATE, + bits_per_sample: 16, + sample_format: hound::SampleFormat::Int, + }; + let mut writer = hound::WavWriter::create(path, spec) + .map_err(|e| format!("Create WAV: {}", e))?; + for &s in samples { + writer.write_sample(s).map_err(|e| format!("Write WAV: {}", e))?; + } + writer.finalize().map_err(|e| format!("Finalize WAV: {}", e))?; + Ok(()) +} diff --git a/HotKeet/src/main.rs b/HotKeet/src/main.rs index fd57142..db3da17 100644 --- a/HotKeet/src/main.rs +++ b/HotKeet/src/main.rs @@ -57,6 +57,7 @@ fn show_main_window() { } } } +mod audio; mod config; mod hotkey; mod paste; @@ -149,6 +150,29 @@ impl eframe::App for AppState { ctx.send_viewport_cmd(egui::ViewportCommand::Visible(false)); } + // Dropped audio files: transcribe and paste + let dropped: Vec<_> = ctx.input(|i| i.raw.dropped_files.clone()); + for file in dropped { + if let Some(ref path) = file.path { + let ext = path + .extension() + .and_then(|e| e.to_str()) + .map(|s| s.to_lowercase()) + .unwrap_or_default(); + if ext == "wav" || ext == "mp3" { + let path = path.clone(); + let cfg = self.config.read().unwrap().clone(); + let status = self.status.clone(); + let status_detail = self.status_detail.clone(); + let paste_tx = self.paste_tx.clone(); + std::thread::spawn(move || { + run_file_transcription(path, cfg, status, status_detail, paste_tx); + }); + break; // Nur eine Datei pro Drop verarbeiten + } + } + } + // Download request: spawn model download if let Ok(path) = self.download_request_rx.try_recv() { model::download_model_async(path, self.download_progress_tx.clone()); @@ -419,8 +443,9 @@ fn main() -> eframe::Result<()> { }; let mut viewport = egui::ViewportBuilder::default() - .with_inner_size([400.0, 400.0]) - .with_min_inner_size([300.0, 300.0]); + .with_inner_size([660.0, 520.0]) + .with_min_inner_size([500.0, 420.0]) + .with_drag_and_drop(true); if start_minimized { viewport = viewport.with_visible(false); } @@ -454,6 +479,64 @@ fn main() -> eframe::Result<()> { Ok(()) } +fn run_file_transcription( + path: std::path::PathBuf, + config: DictateConfig, + status: Arc>, + status_detail: Arc>, + paste_tx: Sender, +) { + set_status(&status, &status_detail, AppStatus::Transkribieren, "Converting…"); + + let wav_path = match audio::prepare_for_transcription(&path) { + Ok(p) => p, + Err(e) => { + eprintln!("Audio prepare: {}", e); + set_status(&status, &status_detail, AppStatus::Fehler, &format!("Audio: {}", e)); + return; + } + }; + + let _defer = Defer(Some(|| { + let _ = std::fs::remove_file(&wav_path); + })); + + set_status(&status, &status_detail, AppStatus::Transkribieren, "Transcribing…"); + + let text = match transcribe( + &config.parakeet_cli_path, + &config.model_path, + &wav_path, + ) { + Ok(t) => t, + Err(e) => { + eprintln!("Transcription: {}", e); + set_status(&status, &status_detail, AppStatus::Fehler, &format!("Transcription: {}", e)); + return; + } + }; + + let req = PasteRequest { + text, + method: config.paste_method_enum(), + target_hwnd: None, + debug_logging: config.debug_logging, + }; + if paste_tx.send(req).is_err() { + eprintln!("Paste channel closed"); + set_status(&status, &status_detail, AppStatus::Fehler, "Paste channel failed"); + } +} + +struct Defer(Option); +impl Drop for Defer { + fn drop(&mut self) { + if let Some(f) = self.0.take() { + f(); + } + } +} + fn set_status( status: &Arc>, detail: &Arc>, diff --git a/HotKeet/src/model.rs b/HotKeet/src/model.rs index 16fb80e..8ba13bb 100644 --- a/HotKeet/src/model.rs +++ b/HotKeet/src/model.rs @@ -4,14 +4,15 @@ use std::io::Read; use std::path::Path; use std::sync::mpsc::Sender; -const HF_BASE: &str = "https://huggingface.co/nasedkinpv/parakeet-tdt-0.6b-v3-onnx-int8/resolve/main"; +/// transcribe-rs compatible model (encoder-model.int8.onnx, decoder_joint-model.int8.onnx, nemo128.onnx) +const HF_BASE: &str = "https://huggingface.co/altunenes/parakeet-rs/resolve/main/tdt"; /// Required files for Parakeet INT8 model (transcribe-rs). const REQUIRED_FILES: &[&str] = &[ "vocab.txt", - "encoder-int8.onnx", - "encoder-int8.onnx.data", - "decoder_joint-int8.onnx", + "nemo128.onnx", + "encoder-model.int8.onnx", + "decoder_joint-model.int8.onnx", ]; /// Progress message during download. diff --git a/HotKeet/src/ui.rs b/HotKeet/src/ui.rs index 8088545..ce2eeac 100644 --- a/HotKeet/src/ui.rs +++ b/HotKeet/src/ui.rs @@ -318,6 +318,37 @@ impl SettingsApp { } } + ui.add_space(8.0); + + // Drag-and-Drop-Bereich für Audio-Dateien (über der Statuszeile) + let drop_rect = ui.available_rect_before_wrap(); + let drop_height = 56.0; + let drop_rect = egui::Rect::from_min_size( + drop_rect.min, + egui::vec2(drop_rect.width(), drop_height), + ); + let (rect, _) = ui.allocate_exact_size(drop_rect.size(), egui::Sense::hover()); + let is_hovered = ctx.input(|i| i.pointer.hover_pos()) + .map(|p| rect.contains(p)) + .unwrap_or(false); + let stroke = if is_hovered { + egui::Stroke::new(2.0, egui::Color32::from_rgb(100, 150, 255)) + } else { + egui::Stroke::new(1.0, egui::Color32::from_rgb(120, 120, 120)) + }; + ui.painter().rect_stroke(rect, 4.0, stroke); + ui.allocate_new_ui(egui::UiBuilder::new().max_rect(rect), |ui| { + ui.vertical_centered(|ui| { + ui.add_space(8.0); + ui.label( + egui::RichText::new("Drop WAV or MP3 here for transcription") + .color(egui::Color32::from_rgb(140, 140, 140)), + ); + }); + }); + + ui.add_space(4.0); + if !self.status.is_empty() { ui.add_space(8.0); ui.label(&self.status);