Add drag-and-drop audio transcription, fix model download, widen window

- Add audio module: WAV/MP3 conversion to 16kHz mono for Parakeet
- Add drop zone for audio files (WAV, MP3) at bottom of settings UI
- Enable drag-and-drop in viewport, process dropped files
- Fix model download: use altunenes/parakeet-rs/tdt with correct filenames
- Move drop zone above status line
- Enlarge window (660x520 px default, 500x420 px minimum)

Made-with: Cursor
This commit is contained in:
2026-03-06 20:50:30 +01:00
parent 882224f26c
commit 1d948242b3
6 changed files with 475 additions and 6 deletions

162
HotKeet/Cargo.lock generated
View File

@@ -1013,6 +1013,15 @@ dependencies = [
"bytemuck",
]
[[package]]
name = "encoding_rs"
version = "0.8.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
dependencies = [
"cfg-if",
]
[[package]]
name = "endi"
version = "1.1.1"
@@ -1153,6 +1162,12 @@ dependencies = [
"pin-project-lite",
]
[[package]]
name = "extended"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af9673d8203fcb076b19dfd17e38b3d4ae9f44959416ea532ce72415a6020365"
[[package]]
name = "fastrand"
version = "2.3.0"
@@ -1617,6 +1632,7 @@ dependencies = [
"rfd",
"serde",
"serde_json",
"symphonia",
"tokio",
"tray-item",
"winapi",
@@ -3508,6 +3524,152 @@ version = "2.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
[[package]]
name = "symphonia"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5773a4c030a19d9bfaa090f49746ff35c75dfddfa700df7a5939d5e076a57039"
dependencies = [
"lazy_static",
"symphonia-bundle-flac",
"symphonia-bundle-mp3",
"symphonia-codec-adpcm",
"symphonia-codec-pcm",
"symphonia-codec-vorbis",
"symphonia-core",
"symphonia-format-mkv",
"symphonia-format-ogg",
"symphonia-format-riff",
"symphonia-metadata",
]
[[package]]
name = "symphonia-bundle-flac"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c91565e180aea25d9b80a910c546802526ffd0072d0b8974e3ebe59b686c9976"
dependencies = [
"log",
"symphonia-core",
"symphonia-metadata",
"symphonia-utils-xiph",
]
[[package]]
name = "symphonia-bundle-mp3"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4872dd6bb56bf5eac799e3e957aa1981086c3e613b27e0ac23b176054f7c57ed"
dependencies = [
"lazy_static",
"log",
"symphonia-core",
"symphonia-metadata",
]
[[package]]
name = "symphonia-codec-adpcm"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dddc50e2bbea4cfe027441eece77c46b9f319748605ab8f3443350129ddd07f"
dependencies = [
"log",
"symphonia-core",
]
[[package]]
name = "symphonia-codec-pcm"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e89d716c01541ad3ebe7c91ce4c8d38a7cf266a3f7b2f090b108fb0cb031d95"
dependencies = [
"log",
"symphonia-core",
]
[[package]]
name = "symphonia-codec-vorbis"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f025837c309cd69ffef572750b4a2257b59552c5399a5e49707cc5b1b85d1c73"
dependencies = [
"log",
"symphonia-core",
"symphonia-utils-xiph",
]
[[package]]
name = "symphonia-core"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea00cc4f79b7f6bb7ff87eddc065a1066f3a43fe1875979056672c9ef948c2af"
dependencies = [
"arrayvec",
"bitflags 1.3.2",
"bytemuck",
"lazy_static",
"log",
]
[[package]]
name = "symphonia-format-mkv"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "122d786d2c43a49beb6f397551b4a050d8229eaa54c7ddf9ee4b98899b8742d0"
dependencies = [
"lazy_static",
"log",
"symphonia-core",
"symphonia-metadata",
"symphonia-utils-xiph",
]
[[package]]
name = "symphonia-format-ogg"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b4955c67c1ed3aa8ae8428d04ca8397fbef6a19b2b051e73b5da8b1435639cb"
dependencies = [
"log",
"symphonia-core",
"symphonia-metadata",
"symphonia-utils-xiph",
]
[[package]]
name = "symphonia-format-riff"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2d7c3df0e7d94efb68401d81906eae73c02b40d5ec1a141962c592d0f11a96f"
dependencies = [
"extended",
"log",
"symphonia-core",
"symphonia-metadata",
]
[[package]]
name = "symphonia-metadata"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "36306ff42b9ffe6e5afc99d49e121e0bd62fe79b9db7b9681d48e29fa19e6b16"
dependencies = [
"encoding_rs",
"lazy_static",
"log",
"symphonia-core",
]
[[package]]
name = "symphonia-utils-xiph"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee27c85ab799a338446b68eec77abf42e1a6f1bb490656e121c6e27bfbab9f16"
dependencies = [
"symphonia-core",
"symphonia-metadata",
]
[[package]]
name = "syn"
version = "1.0.109"

View File

@@ -28,6 +28,7 @@ chrono = "0.4"
raw-window-handle = "0.6"
rfd = "0.14"
reqwest = { version = "0.12", default-features = false, features = ["blocking", "rustls-tls"] }
symphonia = { version = "0.5", features = ["mp3"] }
[build-dependencies]
winresource = "0.1"

191
HotKeet/src/audio.rs Normal file
View File

@@ -0,0 +1,191 @@
//! Prepares audio files (WAV, MP3) for transcription.
//! Parakeet expects 16 kHz, mono, 16-bit PCM.
use std::path::Path;
use symphonia::core::audio::Signal;
/// Sample rate, in Hz, that decoded audio is resampled to and that the output WAV declares.
const TARGET_SAMPLE_RATE: u32 = 16000;
/// Channel count declared by the output WAV (mono).
const TARGET_CHANNELS: u16 = 1;
/// Prepares an audio file for transcription.
///
/// The input format is chosen from the file extension (case-insensitive):
/// `.wav` and `.mp3` are supported. The decoded audio is written to a new
/// WAV file in the system temp directory and that file's path is returned.
/// The caller should delete the temp file once transcription is done.
///
/// # Errors
///
/// Returns a human-readable message when the extension is unsupported, the
/// file cannot be decoded, no samples were decoded, or the temp file cannot
/// be written.
pub fn prepare_for_transcription(path: &Path) -> Result<std::path::PathBuf, String> {
    let ext = path
        .extension()
        .and_then(|e| e.to_str())
        .map(|s| s.to_lowercase())
        .unwrap_or_default();

    let samples = match ext.as_str() {
        "wav" => decode_wav(path)?,
        "mp3" => decode_mp3(path)?,
        _ => return Err(format!("Unsupported format: .{} (use .wav or .mp3)", ext)),
    };

    // An empty WAV gives the transcriber nothing to work with; report it here
    // instead of handing over a zero-length file.
    if samples.is_empty() {
        return Err("No audio samples could be decoded".to_string());
    }

    // Nanosecond timestamp keeps concurrent conversions from colliding on a name.
    let stamp = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map(|d| d.as_nanos())
        .unwrap_or(0);
    let temp_path = std::env::temp_dir().join(format!("hotkeet-transcribe-{}.wav", stamp));

    if let Err(e) = write_wav(&temp_path, &samples) {
        // The caller never learns this path on failure, so remove any partial
        // file now rather than leaking it in the temp directory.
        let _ = std::fs::remove_file(&temp_path);
        return Err(e);
    }
    Ok(temp_path)
}
/// Collects converted samples from a fallible sample iterator.
///
/// Reading stops at the first error. Samples decoded before the error are
/// kept (so a truncated file is still usable); the error is returned only
/// when nothing at all could be decoded.
fn collect_wav_samples<T, E, I, F>(samples: I, convert: F) -> Result<Vec<i16>, String>
where
    I: Iterator<Item = Result<T, E>>,
    E: std::fmt::Display,
    F: Fn(T) -> i16,
{
    let mut out = Vec::new();
    for sample in samples {
        match sample {
            Ok(value) => out.push(convert(value)),
            Err(e) if out.is_empty() => return Err(format!("WAV read: {}", e)),
            Err(_) => break,
        }
    }
    Ok(out)
}

/// Decodes a WAV file into mono 16-bit samples at [`TARGET_SAMPLE_RATE`].
///
/// Integer PCM of any depth up to 32 bits is rescaled to the 16-bit range;
/// float PCM is clamped to `[-1.0, 1.0]` and scaled. Multi-channel audio is
/// downmixed by averaging the channels of each frame.
///
/// # Errors
///
/// Returns a message when the file cannot be opened or parsed, declares zero
/// channels, a zero sample rate or an unsupported bit depth, or when the
/// first sample already fails to decode.
fn decode_wav(path: &Path) -> Result<Vec<i16>, String> {
    let reader = hound::WavReader::open(path).map_err(|e| format!("WAV read: {}", e))?;
    let spec = reader.spec();
    let channels = usize::from(spec.channels);
    let bits = u32::from(spec.bits_per_sample);

    if channels == 0 {
        return Err("WAV read: file declares zero channels".to_string());
    }
    if spec.sample_rate == 0 {
        return Err("WAV read: file declares a zero sample rate".to_string());
    }

    let samples: Vec<i16> = match spec.sample_format {
        hound::SampleFormat::Int => {
            if bits == 0 || bits > 32 {
                return Err(format!("WAV read: unsupported bit depth {}", bits));
            }
            // Read as i32 so 24/32-bit files decode instead of erroring on
            // every sample, then shift to the 16-bit range.
            collect_wav_samples(reader.into_samples::<i32>(), |v| {
                let scaled = if bits > 16 {
                    v >> (bits - 16)
                } else {
                    v << (16 - bits)
                };
                scaled.clamp(-32768, 32767) as i16
            })?
        }
        hound::SampleFormat::Float => collect_wav_samples(reader.into_samples::<f32>(), |f| {
            (f.clamp(-1.0, 1.0) * 32767.0) as i16
        })?,
    };

    let mono: Vec<i16> = if channels > 1 {
        // `chunks_exact` drops an incomplete trailing frame instead of
        // averaging it with the wrong divisor.
        samples
            .chunks_exact(channels)
            .map(|frame| {
                let sum: i32 = frame.iter().map(|&s| i32::from(s)).sum();
                // The mean of i16 values always fits in i16.
                (sum / channels as i32) as i16
            })
            .collect()
    } else {
        samples
    };

    Ok(resample_i16(&mono, spec.sample_rate, TARGET_SAMPLE_RATE))
}
/// Decodes an MP3 file into mono 16-bit samples at [`TARGET_SAMPLE_RATE`].
///
/// The first track with a non-null codec is decoded. Each decoded buffer is
/// downmixed by averaging its own channels, then the whole stream is
/// resampled from the track's declared sample rate.
///
/// # Errors
///
/// Returns a message when the file cannot be opened or probed, has no
/// decodable track, declares no (or a zero) sample rate, or decodes to a
/// sample format other than `f32`, `i16` or `u16`.
fn decode_mp3(path: &Path) -> Result<Vec<i16>, String> {
    let file = std::fs::File::open(path).map_err(|e| format!("Open MP3: {}", e))?;
    let source = symphonia::core::io::MediaSourceStream::new(
        Box::new(file),
        symphonia::core::io::MediaSourceStreamOptions::default(),
    );

    let probe = symphonia::default::get_probe();
    let result = probe
        .format(
            &Default::default(),
            source,
            &Default::default(),
            &Default::default(),
        )
        .map_err(|e| format!("MP3 probe: {}", e))?;
    let mut format = result.format;

    let track = format
        .tracks()
        .iter()
        .find(|t| t.codec_params.codec != symphonia::core::codecs::CODEC_TYPE_NULL)
        .ok_or("No audio track in MP3")?
        .clone();
    let track_id = track.id;

    let mut decoder = symphonia::default::get_codecs()
        .make(&track.codec_params, &Default::default())
        .map_err(|e| format!("MP3 decoder: {}", e))?;

    let sample_rate = track
        .codec_params
        .sample_rate
        .ok_or("MP3: no sample rate")?;
    if sample_rate == 0 {
        return Err("MP3: invalid sample rate".to_string());
    }

    let mut all_samples: Vec<i16> = Vec::new();
    // Any error from `next_packet` (including end of stream) ends the loop.
    while let Ok(packet) = format.next_packet() {
        // The decoder was built for one track; packets of other tracks are not ours.
        if packet.track_id() != track_id {
            continue;
        }
        // A packet that fails to decode is skipped; later packets may still be fine.
        let decoded = match decoder.decode(&packet) {
            Ok(decoded) => decoded,
            Err(_) => continue,
        };
        match decoded {
            symphonia::core::audio::AudioBufferRef::F32(buf) => {
                // Use the buffer's own channel count: indexing with the count
                // declared in the track header panics if the two disagree.
                let channels = buf.spec().channels.count();
                if channels == 0 {
                    continue;
                }
                for i in 0..buf.frames() {
                    let mut sum = 0.0f32;
                    for c in 0..channels {
                        sum += buf.chan(c)[i];
                    }
                    sum /= channels as f32;
                    all_samples.push((sum.clamp(-1.0, 1.0) * 32767.0) as i16);
                }
            }
            symphonia::core::audio::AudioBufferRef::S16(buf) => {
                let channels = buf.spec().channels.count();
                if channels == 0 {
                    continue;
                }
                for i in 0..buf.frames() {
                    let mut sum = 0i32;
                    for c in 0..channels {
                        sum += i32::from(buf.chan(c)[i]);
                    }
                    all_samples.push((sum / channels as i32).clamp(-32768, 32767) as i16);
                }
            }
            symphonia::core::audio::AudioBufferRef::U16(buf) => {
                let channels = buf.spec().channels.count();
                if channels == 0 {
                    continue;
                }
                for i in 0..buf.frames() {
                    let mut sum = 0i32;
                    for c in 0..channels {
                        // Re-centre unsigned samples around zero before averaging.
                        sum += i32::from(buf.chan(c)[i]) - 32768;
                    }
                    all_samples.push((sum / channels as i32).clamp(-32768, 32767) as i16);
                }
            }
            _ => return Err("MP3: unsupported sample format".to_string()),
        }
    }

    Ok(resample_i16(&all_samples, sample_rate, TARGET_SAMPLE_RATE))
}
/// Resamples mono 16-bit audio from `from_rate` to `to_rate` by linear interpolation.
///
/// Returns a copy of `samples` when the rates are equal. Returns an empty
/// vector when either rate is zero (no meaningful conversion exists) or when
/// `samples` is empty. No low-pass filter is applied before downsampling.
fn resample_i16(samples: &[i16], from_rate: u32, to_rate: u32) -> Vec<i16> {
    if from_rate == to_rate {
        return samples.to_vec();
    }
    // A zero source rate would make the output length infinite (saturating to
    // `usize::MAX` and aborting in the allocation); treat it as "no audio".
    if from_rate == 0 || to_rate == 0 || samples.is_empty() {
        return Vec::new();
    }

    let ratio = f64::from(from_rate) / f64::from(to_rate);
    let out_len = (samples.len() as f64 / ratio) as usize;
    let last = samples.len() - 1;

    (0..out_len)
        .map(|i| {
            let src_pos = i as f64 * ratio;
            let idx0 = src_pos.floor() as usize;
            // Clamp the right-hand neighbour so the final output samples
            // interpolate against the last input sample.
            let idx1 = (idx0 + 1).min(last);
            let frac = src_pos - idx0 as f64;
            let s0 = f64::from(samples.get(idx0).copied().unwrap_or(0));
            let s1 = f64::from(samples.get(idx1).copied().unwrap_or(0));
            let s = s0 * (1.0 - frac) + s1 * frac;
            s.clamp(-32768.0, 32767.0) as i16
        })
        .collect()
}
/// Writes `samples` to `path` as a 16-bit integer PCM WAV file.
///
/// The header declares [`TARGET_CHANNELS`] channels at [`TARGET_SAMPLE_RATE`] Hz.
///
/// # Errors
///
/// Returns a message naming the stage that failed: creating the file,
/// writing a sample, or finalizing the file.
fn write_wav(path: &Path, samples: &[i16]) -> Result<(), String> {
    let mut wav = hound::WavWriter::create(
        path,
        hound::WavSpec {
            channels: TARGET_CHANNELS,
            sample_rate: TARGET_SAMPLE_RATE,
            bits_per_sample: 16,
            sample_format: hound::SampleFormat::Int,
        },
    )
    .map_err(|err| format!("Create WAV: {}", err))?;

    // Stops at the first sample that fails to write.
    samples
        .iter()
        .try_for_each(|&sample| wav.write_sample(sample))
        .map_err(|err| format!("Write WAV: {}", err))?;

    wav.finalize()
        .map_err(|err| format!("Finalize WAV: {}", err))
}

View File

@@ -57,6 +57,7 @@ fn show_main_window() {
}
}
}
mod audio;
mod config;
mod hotkey;
mod paste;
@@ -149,6 +150,29 @@ impl eframe::App for AppState {
ctx.send_viewport_cmd(egui::ViewportCommand::Visible(false));
}
// Dropped audio files: transcribe and paste
let dropped: Vec<_> = ctx.input(|i| i.raw.dropped_files.clone());
for file in dropped {
if let Some(ref path) = file.path {
let ext = path
.extension()
.and_then(|e| e.to_str())
.map(|s| s.to_lowercase())
.unwrap_or_default();
if ext == "wav" || ext == "mp3" {
let path = path.clone();
let cfg = self.config.read().unwrap().clone();
let status = self.status.clone();
let status_detail = self.status_detail.clone();
let paste_tx = self.paste_tx.clone();
std::thread::spawn(move || {
run_file_transcription(path, cfg, status, status_detail, paste_tx);
});
break; // Nur eine Datei pro Drop verarbeiten
}
}
}
// Download request: spawn model download
if let Ok(path) = self.download_request_rx.try_recv() {
model::download_model_async(path, self.download_progress_tx.clone());
@@ -419,8 +443,9 @@ fn main() -> eframe::Result<()> {
};
let mut viewport = egui::ViewportBuilder::default()
.with_inner_size([400.0, 400.0])
.with_min_inner_size([300.0, 300.0]);
.with_inner_size([660.0, 520.0])
.with_min_inner_size([500.0, 420.0])
.with_drag_and_drop(true);
if start_minimized {
viewport = viewport.with_visible(false);
}
@@ -454,6 +479,64 @@ fn main() -> eframe::Result<()> {
Ok(())
}
/// Converts an audio file, transcribes it and queues the text for pasting.
///
/// Progress and failures are published through `status` / `status_detail`;
/// failures are additionally logged to stderr. The intermediate WAV produced
/// by the conversion step is removed when this function returns, whichever
/// path is taken.
fn run_file_transcription(
    path: std::path::PathBuf,
    config: DictateConfig,
    status: Arc<std::sync::RwLock<AppStatus>>,
    status_detail: Arc<std::sync::RwLock<String>>,
    paste_tx: Sender<PasteRequest>,
) {
    // Publishes an in-progress phase to the UI.
    let announce = |phase: &str| {
        set_status(&status, &status_detail, AppStatus::Transkribieren, phase);
    };
    // Logs a failure to stderr and publishes it to the UI.
    let fail = |log_line: &str, detail: &str| {
        eprintln!("{}", log_line);
        set_status(&status, &status_detail, AppStatus::Fehler, detail);
    };

    announce("Converting…");
    let wav_path = match audio::prepare_for_transcription(&path) {
        Ok(converted) => converted,
        Err(err) => {
            fail(
                &format!("Audio prepare: {}", err),
                &format!("Audio: {}", err),
            );
            return;
        }
    };

    // Best-effort removal of the temp WAV on every exit path below.
    let _cleanup = Defer(Some(|| {
        let _ = std::fs::remove_file(&wav_path);
    }));

    announce("Transcribing…");
    let transcript = transcribe(&config.parakeet_cli_path, &config.model_path, &wav_path);
    let text = match transcript {
        Ok(text) => text,
        Err(err) => {
            fail(
                &format!("Transcription: {}", err),
                &format!("Transcription: {}", err),
            );
            return;
        }
    };

    let request = PasteRequest {
        text,
        method: config.paste_method_enum(),
        target_hwnd: None,
        debug_logging: config.debug_logging,
    };
    if let Err(_) = paste_tx.send(request) {
        fail("Paste channel closed", "Paste channel failed");
    }
}
/// Scope guard that runs the wrapped closure once when the guard is dropped.
///
/// A guard holding `None` does nothing on drop.
struct Defer<F>(Option<F>)
where
    F: FnOnce();

impl<F> Drop for Defer<F>
where
    F: FnOnce(),
{
    fn drop(&mut self) {
        // Move the closure out of the slot: an `FnOnce` must be owned to be
        // called, and `drop` only has `&mut self`.
        if let Some(action) = std::mem::take(&mut self.0) {
            action();
        }
    }
}
fn set_status(
status: &Arc<std::sync::RwLock<AppStatus>>,
detail: &Arc<std::sync::RwLock<String>>,

View File

@@ -4,14 +4,15 @@ use std::io::Read;
use std::path::Path;
use std::sync::mpsc::Sender;
const HF_BASE: &str = "https://huggingface.co/nasedkinpv/parakeet-tdt-0.6b-v3-onnx-int8/resolve/main";
/// transcribe-rs compatible model (encoder-model.int8.onnx, decoder_joint-model.int8.onnx, nemo128.onnx)
const HF_BASE: &str = "https://huggingface.co/altunenes/parakeet-rs/resolve/main/tdt";
/// Required files for Parakeet INT8 model (transcribe-rs).
const REQUIRED_FILES: &[&str] = &[
"vocab.txt",
"encoder-int8.onnx",
"encoder-int8.onnx.data",
"decoder_joint-int8.onnx",
"nemo128.onnx",
"encoder-model.int8.onnx",
"decoder_joint-model.int8.onnx",
];
/// Progress message during download.

View File

@@ -318,6 +318,37 @@ impl SettingsApp {
}
}
ui.add_space(8.0);
// Drag-and-Drop-Bereich für Audio-Dateien (über der Statuszeile)
let drop_rect = ui.available_rect_before_wrap();
let drop_height = 56.0;
let drop_rect = egui::Rect::from_min_size(
drop_rect.min,
egui::vec2(drop_rect.width(), drop_height),
);
let (rect, _) = ui.allocate_exact_size(drop_rect.size(), egui::Sense::hover());
let is_hovered = ctx.input(|i| i.pointer.hover_pos())
.map(|p| rect.contains(p))
.unwrap_or(false);
let stroke = if is_hovered {
egui::Stroke::new(2.0, egui::Color32::from_rgb(100, 150, 255))
} else {
egui::Stroke::new(1.0, egui::Color32::from_rgb(120, 120, 120))
};
ui.painter().rect_stroke(rect, 4.0, stroke);
ui.allocate_new_ui(egui::UiBuilder::new().max_rect(rect), |ui| {
ui.vertical_centered(|ui| {
ui.add_space(8.0);
ui.label(
egui::RichText::new("Drop WAV or MP3 here for transcription")
.color(egui::Color32::from_rgb(140, 140, 140)),
);
});
});
ui.add_space(4.0);
if !self.status.is_empty() {
ui.add_space(8.0);
ui.label(&self.status);