File transcription: save to text file, chunking, ffmpeg, logging

- Transcribe dropped files to text file via Save dialog (no paste)
- Default save folder: source folder of audio file
- Chunk long audio (45s) to avoid ORT errors with Parakeet
- Prefer ffmpeg for WAV/MP3 conversion (better compatibility)
- Add transcription-debug.log for debugging
- Support 24/32-bit WAV, ffmpeg fallback for exotic formats

Made-with: Cursor
This commit is contained in:
2026-03-06 21:37:29 +01:00
parent 1d948242b3
commit c5625d9b39
5 changed files with 315 additions and 42 deletions

View File

@@ -10,19 +10,16 @@ const TARGET_CHANNELS: u16 = 1;
/// Bereitet eine Audio-Datei für die Transkription vor.
/// Gibt den Pfad zu einer temporären WAV-Datei zurück (16 kHz, Mono, 16-bit).
/// Der Aufrufer sollte die Temp-Datei nach der Transkription löschen.
pub fn prepare_for_transcription(path: &Path) -> Result<std::path::PathBuf, String> {
pub fn prepare_for_transcription(
path: &Path,
debug_logging: bool,
) -> Result<std::path::PathBuf, String> {
let ext = path
.extension()
.and_then(|e| e.to_str())
.map(|s| s.to_lowercase())
.unwrap_or_default();
let samples = match ext.as_str() {
"wav" => decode_wav(path)?,
"mp3" => decode_mp3(path)?,
_ => return Err(format!("Unsupported format: .{} (use .wav or .mp3)", ext)),
};
let temp_path = std::env::temp_dir().join(format!(
"hotkeet-transcribe-{}.wav",
std::time::SystemTime::now()
@@ -31,21 +28,176 @@ pub fn prepare_for_transcription(path: &Path) -> Result<std::path::PathBuf, Stri
.unwrap_or(0)
));
write_wav(&temp_path, &samples)?;
match ext.as_str() {
"wav" | "mp3" => {
// ffmpeg erzeugt WAV, das transcribe-rs zuverlässig akzeptiert (vermeidet ORT-Fehler)
if let Ok(()) = convert_with_ffmpeg(path, &temp_path, debug_logging) {
if debug_logging {
let size = std::fs::metadata(&temp_path).map(|m| m.len()).unwrap_or(0);
crate::transcription_log::log(
&format!("ffmpeg OK: {} -> {} ({} bytes)", path.display(), temp_path.display(), size),
true,
);
}
} else {
if debug_logging {
crate::transcription_log::log("ffmpeg failed, fallback to native decode", true);
}
// Fallback: native Dekodierung
let samples = match ext.as_str() {
"wav" => decode_wav(path)?,
"mp3" => decode_mp3(path)?,
_ => unreachable!(),
};
if samples.is_empty() {
return Err("Audio file is empty or could not be decoded. Install ffmpeg for better compatibility.".to_string());
}
if debug_logging {
crate::transcription_log::log(
&format!("native decode OK: {} samples -> {}", samples.len(), temp_path.display()),
true,
);
}
write_wav(&temp_path, &samples)?;
}
}
_ => return Err(format!("Unsupported format: .{} (use .wav or .mp3)", ext)),
}
Ok(temp_path)
}
/// Maximum chunk duration in seconds (Parakeet/ORT has problems with long files).
pub const MAX_CHUNK_SEC: u32 = 45;
/// Splits a WAV file into chunks of at most `max_sec` seconds for transcription.
///
/// The input must already match `TARGET_SAMPLE_RATE` and `TARGET_CHANNELS`;
/// any other format is rejected.
///
/// Returns the paths to transcribe, in order. If the audio fits into a single
/// chunk, the result contains only `wav_path` itself and no file is written.
/// Otherwise every entry is a newly written temporary chunk WAV in the system
/// temp directory, and the caller is responsible for deleting those files.
///
/// # Errors
/// Returns an error if `max_sec` is 0, if the WAV cannot be opened, if its
/// sample rate or channel count does not match the target format, or if a
/// chunk cannot be written. When writing a chunk fails, the chunk files
/// created so far are removed again before the error is returned.
pub fn split_wav_into_chunks(
    wav_path: &Path,
    max_sec: u32,
    debug_logging: bool,
) -> Result<Vec<std::path::PathBuf>, String> {
    // `slice::chunks` panics on a chunk size of 0, so reject it up front.
    if max_sec == 0 {
        return Err("Chunk length must be at least 1 second".to_string());
    }

    let reader = hound::WavReader::open(wav_path).map_err(|e| format!("WAV read: {}", e))?;
    let spec = reader.spec();
    if spec.sample_rate != TARGET_SAMPLE_RATE || spec.channels != TARGET_CHANNELS {
        return Err(format!(
            "WAV must be {} Hz mono, got {} Hz {} ch",
            TARGET_SAMPLE_RATE, spec.sample_rate, spec.channels
        ));
    }

    // Samples that fail to decode are skipped (best effort).
    let samples: Vec<i16> = match spec.sample_format {
        // Integer samples are clamped (not rescaled) into the i16 range.
        hound::SampleFormat::Int => reader
            .into_samples::<i32>()
            .filter_map(|s| s.ok())
            .map(|s| s.clamp(-32768, 32767) as i16)
            .collect(),
        hound::SampleFormat::Float => reader
            .into_samples::<f32>()
            .filter_map(|s| s.ok())
            .map(|f| (f.clamp(-1.0, 1.0) * 32767.0) as i16)
            .collect(),
    };

    // Saturate instead of overflowing on targets where `usize` is 32 bits wide.
    let samples_per_chunk = (max_sec as usize).saturating_mul(TARGET_SAMPLE_RATE as usize);
    if samples.len() <= samples_per_chunk {
        return Ok(vec![wav_path.to_path_buf()]);
    }

    // Nanosecond timestamp keeps chunk names of separate runs apart.
    let base = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map(|d| d.as_nanos())
        .unwrap_or(0);

    let mut paths = Vec::new();
    for (i, chunk) in samples.chunks(samples_per_chunk).enumerate() {
        let path = std::env::temp_dir().join(format!("hotkeet-chunk-{}-{}.wav", base, i));
        write_wav(&path, chunk).map_err(|e| {
            // The caller only receives the chunk paths on success, so remove
            // everything written so far, including the possibly partial
            // current file, instead of leaking it in the temp directory.
            for stale in paths.iter().chain(std::iter::once(&path)) {
                let _ = std::fs::remove_file(stale);
            }
            e
        })?;
        paths.push(path);
    }

    if debug_logging {
        crate::transcription_log::log(
            &format!("split into {} chunks ({} sec each)", paths.len(), max_sec),
            true,
        );
    }
    Ok(paths)
}
/// Converts an audio file to a 16-bit PCM WAV by running the `ffmpeg` executable.
///
/// The output uses `TARGET_SAMPLE_RATE` and `TARGET_CHANNELS`. An existing
/// file at `output` is overwritten (`-y`).
///
/// When `debug_logging` is set, the invocation and, on failure, ffmpeg's
/// stderr are written to the transcription log.
///
/// # Errors
/// Returns an error if `ffmpeg` cannot be started, exits with a non-zero
/// status, or does not produce the output file. After a non-zero exit any
/// partially written output file is removed on a best-effort basis.
fn convert_with_ffmpeg(input: &Path, output: &Path, debug_logging: bool) -> Result<(), String> {
    let mut cmd = std::process::Command::new("ffmpeg");
    cmd.arg("-y")
        .arg("-i")
        .arg(input)
        .arg("-acodec")
        .arg("pcm_s16le")
        .arg("-ar")
        .arg(TARGET_SAMPLE_RATE.to_string())
        .arg("-ac")
        .arg(TARGET_CHANNELS.to_string())
        .arg("-f")
        .arg("wav")
        .arg(output);

    #[cfg(windows)]
    {
        use std::os::windows::process::CommandExt;
        // Process creation flag so that no console window is opened for ffmpeg.
        const CREATE_NO_WINDOW: u32 = 0x0800_0000;
        cmd.creation_flags(CREATE_NO_WINDOW);
    }

    if debug_logging {
        crate::transcription_log::log(
            &format!("ffmpeg -i {} -> {}", input.display(), output.display()),
            true,
        );
    }

    let output_result = cmd.output().map_err(|e| format!("ffmpeg: {}", e))?;
    if !output_result.status.success() {
        let stderr = String::from_utf8_lossy(&output_result.stderr);
        if debug_logging {
            crate::transcription_log::log(&format!("ffmpeg stderr: {}", stderr), true);
        }
        // ffmpeg may have written part of the file before failing; do not
        // leave that fragment behind for the caller to trip over or leak.
        let _ = std::fs::remove_file(output);
        // Only the last stderr line is surfaced in the returned error.
        return Err(format!("ffmpeg failed: {}", stderr.lines().last().unwrap_or("")));
    }
    if !output.exists() {
        return Err("ffmpeg did not create output file".to_string());
    }
    Ok(())
}
fn decode_wav(path: &Path) -> Result<Vec<i16>, String> {
let reader = hound::WavReader::open(path).map_err(|e| format!("WAV read: {}", e))?;
let spec = reader.spec();
let sample_rate = spec.sample_rate;
let channels = spec.channels;
// i32 für 16/24/32-bit Int, f32 für Float
let samples: Vec<i16> = match spec.sample_format {
hound::SampleFormat::Int => reader
.into_samples::<i16>()
.filter_map(|s| s.ok())
.collect(),
hound::SampleFormat::Int => {
let raw: Vec<i32> = reader
.into_samples::<i32>()
.filter_map(|s| s.ok())
.collect();
// Skalieren auf i16 (24/32-bit haben größeren Wertebereich)
let max_val = 1 << (spec.bits_per_sample.saturating_sub(1));
raw.into_iter()
.map(|s| {
let scaled = (s as f64 / max_val as f64).clamp(-1.0, 1.0) * 32767.0;
scaled as i16
})
.collect()
}
hound::SampleFormat::Float => reader
.into_samples::<f32>()
.filter_map(|s| s.ok())

View File

@@ -60,6 +60,7 @@ fn show_main_window() {
mod audio;
mod config;
mod hotkey;
mod transcription_log;
mod paste;
mod recording;
mod transcription;
@@ -164,9 +165,8 @@ impl eframe::App for AppState {
let cfg = self.config.read().unwrap().clone();
let status = self.status.clone();
let status_detail = self.status_detail.clone();
let paste_tx = self.paste_tx.clone();
std::thread::spawn(move || {
run_file_transcription(path, cfg, status, status_detail, paste_tx);
run_file_transcription(path, cfg, status, status_detail);
});
break; // Nur eine Datei pro Drop verarbeiten
}
@@ -443,7 +443,7 @@ fn main() -> eframe::Result<()> {
};
let mut viewport = egui::ViewportBuilder::default()
.with_inner_size([660.0, 520.0])
.with_inner_size([680.0, 520.0])
.with_min_inner_size([500.0, 420.0])
.with_drag_and_drop(true);
if start_minimized {
@@ -484,47 +484,125 @@ fn run_file_transcription(
config: DictateConfig,
status: Arc<std::sync::RwLock<AppStatus>>,
status_detail: Arc<std::sync::RwLock<String>>,
paste_tx: Sender<PasteRequest>,
) {
let debug = config.debug_logging;
if debug {
transcription_log::log(&format!("file transcription start: {}", path.display()), true);
}
set_status(&status, &status_detail, AppStatus::Transkribieren, "Converting…");
let wav_path = match audio::prepare_for_transcription(&path) {
let wav_path = match audio::prepare_for_transcription(&path, debug) {
Ok(p) => p,
Err(e) => {
eprintln!("Audio prepare: {}", e);
transcription_log::log(&format!("audio prepare error: {}", e), debug);
set_status(&status, &status_detail, AppStatus::Fehler, &format!("Audio: {}", e));
return;
}
};
let _defer = Defer(Some(|| {
let _ = std::fs::remove_file(&wav_path);
}));
if debug {
let size = std::fs::metadata(&wav_path).map(|m| m.len()).unwrap_or(0);
transcription_log::log(
&format!("wav ready: {} ({} bytes)", wav_path.display(), size),
true,
);
}
set_status(&status, &status_detail, AppStatus::Transkribieren, "Transcribing…");
let text = match transcribe(
&config.parakeet_cli_path,
&config.model_path,
&wav_path,
) {
Ok(t) => t,
let chunks = match audio::split_wav_into_chunks(&wav_path, audio::MAX_CHUNK_SEC, debug) {
Ok(c) => c,
Err(e) => {
eprintln!("Transcription: {}", e);
set_status(&status, &status_detail, AppStatus::Fehler, &format!("Transcription: {}", e));
transcription_log::log(&format!("split error: {}", e), debug);
set_status(&status, &status_detail, AppStatus::Fehler, &format!("Audio: {}", e));
return;
}
};
let req = PasteRequest {
text,
method: config.paste_method_enum(),
target_hwnd: None,
debug_logging: config.debug_logging,
};
if paste_tx.send(req).is_err() {
eprintln!("Paste channel closed");
set_status(&status, &status_detail, AppStatus::Fehler, "Paste channel failed");
let mut paths_to_cleanup = vec![wav_path.clone()];
for p in &chunks {
if p != &wav_path {
paths_to_cleanup.push(p.clone());
}
}
let _defer = Defer(Some(move || {
for p in paths_to_cleanup {
let _ = std::fs::remove_file(&p);
}
}));
set_status(&status, &status_detail, AppStatus::Transkribieren, "Transcribing…");
let mut text_parts = Vec::new();
for (i, chunk_path) in chunks.iter().enumerate() {
if debug && chunks.len() > 1 {
transcription_log::log(&format!("chunk {}/{}", i + 1, chunks.len()), true);
}
match transcribe(
&config.parakeet_cli_path,
&config.model_path,
chunk_path,
debug,
) {
Ok(t) if !t.is_empty() => text_parts.push(t),
Ok(_) => {}
Err(e) => {
transcription_log::log(&format!("transcription error: {}", e), debug);
set_status(&status, &status_detail, AppStatus::Fehler, &format!("Transcription: {}", e));
return;
}
}
}
let text = text_parts.join(" ");
if debug {
transcription_log::log(&format!("transcription OK: {} chars", text.len()), true);
}
// Save-Dialog: Benutzer wählt Zieldatei
let default_name = path
.file_stem()
.and_then(|s| s.to_str())
.unwrap_or("transcription")
.to_string();
let mut dialog = rfd::FileDialog::new()
.set_title("Save transcription as")
.add_filter("Text", &["txt"])
.set_file_name(&format!("{}.txt", default_name));
if let Some(dir) = path.parent() {
dialog = dialog.set_directory(dir);
}
let save_path = dialog.save_file();
match save_path {
Some(p) => {
match std::fs::write(&p, &text) {
Ok(()) => {
let msg = format!("Saved to {}", p.display());
set_status(&status, &status_detail, AppStatus::Fertig, &msg);
if debug {
transcription_log::log(&format!("saved to {}", p.display()), true);
}
let status_reset = status.clone();
let detail_reset = status_detail.clone();
std::thread::spawn(move || {
std::thread::sleep(std::time::Duration::from_secs(5));
let _ = status_reset.write().map(|mut w| *w = AppStatus::Bereit);
let _ = detail_reset.write().map(|mut w| *w = String::new());
});
}
Err(e) => {
set_status(
&status,
&status_detail,
AppStatus::Fehler,
&format!("Write error: {}", e),
);
}
}
}
None => {
set_status(&status, &status_detail, AppStatus::Bereit, "Save cancelled");
}
}
}
@@ -624,10 +702,11 @@ fn run_recording(
&config.parakeet_cli_path,
&config.model_path,
&wav_path,
config.debug_logging,
) {
Ok(t) => t,
Err(e) => {
eprintln!("Transkription: {}", e);
transcription_log::log(&format!("transcription error: {}", e), config.debug_logging);
let _ = std::fs::remove_file(&wav_path);
set_status(&status, &status_detail, AppStatus::Fehler, &format!("Transcription: {}", e));
return;

View File

@@ -23,6 +23,7 @@ pub fn transcribe(
parakeet_cli_path: &str,
model_dir: &str,
wav_path: &Path,
debug_logging: bool,
) -> Result<String, String> {
let cli = if parakeet_cli_path.is_empty() {
"parakeet-cli"
@@ -36,6 +37,13 @@ pub fn transcribe(
model_dir
};
if debug_logging {
crate::transcription_log::log(
&format!("parakeet-cli \"{}\" \"{}\"", model, wav_path.display()),
true,
);
}
let mut cmd = Command::new(cli);
cmd.arg(model).arg(wav_path);
#[cfg(windows)]
@@ -47,6 +55,9 @@ pub fn transcribe(
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
if debug_logging {
crate::transcription_log::log(&format!("parakeet-cli stderr: {}", stderr), true);
}
return Err(format!("parakeet-cli error: {}", stderr));
}

View File

@@ -0,0 +1,31 @@
//! Debug-Logging für Transkription (Konsole + Datei).
use std::io::Write;
/// Emits a debug message to stderr and appends it to `transcription-debug.log`.
///
/// Does nothing unless `enabled` is true. The log file is placed in the
/// directory that contains the configuration file, falling back to `.` when
/// that path has no parent. Each line in the file is prefixed with the local
/// time (`HH:MM:SS.mmm`). File-system errors are ignored so that logging can
/// never fail the caller.
pub fn log(msg: &str, enabled: bool) {
    if enabled {
        eprintln!("[transcription] {}", msg);

        let config_file = crate::config::DictateConfig::config_path();
        let base_dir = match config_file.parent() {
            Some(dir) => dir.to_path_buf(),
            None => std::path::PathBuf::from("."),
        };
        let log_file = base_dir.join("transcription-debug.log");

        // Make sure the target directory exists; failures are ignored.
        if let Some(dir) = log_file.parent() {
            let _ = std::fs::create_dir_all(dir);
        }

        let opened = std::fs::OpenOptions::new()
            .create(true)
            .append(true)
            .open(&log_file);
        if let Ok(mut file) = opened {
            let _ = writeln!(
                file,
                "{} {}",
                chrono::Local::now().format("%H:%M:%S%.3f"),
                msg
            );
        }
    }
}

View File

@@ -302,7 +302,7 @@ impl SettingsApp {
ui.checkbox(&mut self.config.start_minimized, "Minimize on start");
ui.checkbox(&mut self.config.minimize_to_tray, "Minimize to tray");
ui.checkbox(&mut self.config.sound_on_start_end, "Audio feedback on record start/end");
ui.checkbox(&mut self.config.debug_logging, "Debug logging (paste-debug.log, console)");
ui.checkbox(&mut self.config.debug_logging, "Debug logging (paste-debug.log, transcription-debug.log, console)");
ui.add_space(16.0);