Add vocabulary extension support in configuration and UI

- Introduced a new configuration field for the vocabulary extension path in DictateConfig.
- Updated the transcription functions to apply vocabulary extensions from the specified file.
- Enhanced the UI to allow users to browse and set the vocabulary extension file, with clear instructions on the format for replacement rules.
2026-03-08 10:28:02 +01:00
parent 3bef69fb5a
commit c296e2afe9
4 changed files with 95 additions and 2 deletions
@@ -51,6 +51,10 @@ pub struct DictateConfig {
    #[serde(default)]
    pub model_path: String,

+    /// Pfad zu Ersetzungsregeln (eine Zeile: \"Quelltext→Ersetzung\" oder \"Quelltext=Ersetzung\")
+    #[serde(default)]
+    pub vocabulary_extension_path: String,
+
    #[serde(default = "default_true")]
    pub start_minimized: bool,

@@ -92,6 +96,7 @@ impl Default for DictateConfig {
            input_device_name: String::new(),
            parakeet_cli_path: String::new(),
            model_path: String::new(),
+            vocabulary_extension_path: String::new(),
            start_minimized: true,
            minimize_to_tray: true,
            paste_method: "Auto".to_string(),
@@ -61,6 +61,7 @@ mod audio;
 mod config;
 mod hotkey;
 mod transcription_log;
+mod vocabulary_extension;
 mod paste;
 mod recording;
 mod transcription;
@@ -552,7 +553,8 @@ fn run_file_transcription(
            }
        }
    }
-    let text = text_parts.join(" ");
+    let mut text = text_parts.join(" ");
+    text = vocabulary_extension::apply_from_file(&text, &config.vocabulary_extension_path);

    if debug {
        transcription_log::log(&format!("transcription OK: {} chars", text.len()), true);
@@ -698,7 +700,7 @@ fn run_recording(
        return;
    }

-    let text = match transcribe(
+    let mut text = match transcribe(
        &config.parakeet_cli_path,
        &config.model_path,
        &wav_path,
@@ -715,6 +717,8 @@ fn run_recording(

    let _ = std::fs::remove_file(&wav_path);

+    text = vocabulary_extension::apply_from_file(&text, &config.vocabulary_extension_path);
+
    if config.debug_logging {
        eprintln!(
            "[recording] Transkription: {} Zeichen, Text: {:?}",
@@ -240,6 +240,31 @@ impl SettingsApp {
                }
            });

+            ui.horizontal(|ui| {
+                ui.label("Vocabulary extension:");
+                let display = if self.config.vocabulary_extension_path.is_empty() {
+                    "(empty = no replacements)".to_string()
+                } else {
+                    self.config.vocabulary_extension_path.clone()
+                };
+                ui.label(egui::RichText::new(&display).color(egui::Color32::GRAY));
+                if ui.button("Browse…").clicked() {
+                    if let Some(p) = rfd::FileDialog::new()
+                        .set_title("Select vocabulary extension file")
+                        .add_filter("Text", &["txt"])
+                        .pick_file()
+                    {
+                        self.config.vocabulary_extension_path = p.display().to_string();
+                        self.status = "Remember to save changes.".to_string();
+                    }
+                }
+                if !self.config.vocabulary_extension_path.is_empty() && ui.small_button("✕").clicked() {
+                    self.config.vocabulary_extension_path.clear();
+                    self.status = "Remember to save changes.".to_string();
+                }
+            });
+            ui.label(egui::RichText::new("Format: one rule per line, \"source→replacement\" or \"source=replacement\"").small().color(egui::Color32::GRAY));
+
            let model_path = Path::new(&self.config.model_path);
            let model_valid = !self.config.model_path.is_empty() && model::is_model_valid(model_path);
            let downloading = self.download_progress.as_ref().and_then(|p| p.read().ok()).and_then(|g| g.clone());
@@ -0,0 +1,59 @@
+//! Vokabular-Erweiterung: Ersetzungsregeln für Transkription.
+//! Siehe https://deepwiki.com/altunenes/parakeet-rs/7.3-vocabulary-and-tokenization
+//!
+//! Format: Eine Zeile pro Regel, "Quelltext→Ersetzung" oder "Quelltext=Ersetzung".
+//! Leere Zeilen und Zeilen mit # werden ignoriert.
+
+use std::path::Path;
+
+/// Loads replacement rules from a vocabulary extension file.
+///
+/// Every line is trimmed; empty lines and lines starting with `#` are skipped.
+/// A rule has the form `source→replacement` or `source=replacement`; when a
+/// line contains both separators, the first `→` is the one that splits it.
+/// Both sides are trimmed, and lines without a separator or with an empty
+/// source are ignored. A leading UTF-8 byte-order mark is dropped so that the
+/// first line of a BOM-prefixed file is parsed like any other line.
+///
+/// Returns the `(from, to)` pairs sorted by the byte length of `from`, longest
+/// first; rules of equal length keep their order from the file.
+///
+/// # Errors
+///
+/// Returns a message starting with `Read vocabulary:` when the file cannot be
+/// read as UTF-8 text.
+pub fn load_rules(path: &Path) -> Result<Vec<(String, String)>, String> {
+    let content = std::fs::read_to_string(path).map_err(|e| format!("Read vocabulary: {}", e))?;
+    // `str::trim` does not remove U+FEFF (it is not Unicode White_Space), so a
+    // BOM would otherwise end up glued to the first pattern.
+    let content = content.strip_prefix('\u{feff}').unwrap_or(&content);
+    let mut rules: Vec<(String, String)> = content
+        .lines()
+        .map(str::trim)
+        .filter(|line| !line.is_empty() && !line.starts_with('#'))
+        // The arrow wins over '=' so that replacements may contain '='.
+        .filter_map(|line| line.split_once('→').or_else(|| line.split_once('=')))
+        .map(|(from, to)| (from.trim(), to.trim()))
+        .filter(|(from, _)| !from.is_empty())
+        .map(|(from, to)| (from.to_string(), to.to_string()))
+        .collect();
+    // Longest patterns first, so "MediTech" is considered before "Medi".
+    rules.sort_by(|a, b| b.0.len().cmp(&a.0.len()));
+    Ok(rules)
+}
+
+/// Applies replacement rules to `text` in a single left-to-right pass.
+///
+/// At every position the rules are tried in slice order and the first rule
+/// whose `from` matches there is used; its `to` is appended to the output and
+/// scanning continues after the matched source text. Replacement text is never
+/// scanned again, so the output of one rule cannot be rewritten by another
+/// (for example `MediTech→MediTech GmbH` followed by `Medi→Medical`). Rules
+/// with an empty `from` are ignored.
+///
+/// Returns the rewritten text; `text` is returned unchanged when no rule
+/// matches.
+pub fn apply(text: &str, rules: &[(String, String)]) -> String {
+    let mut result = String::with_capacity(text.len());
+    let mut rest = text;
+    while let Some(ch) = rest.chars().next() {
+        let hit = rules
+            .iter()
+            .find(|(from, _)| !from.is_empty() && rest.starts_with(from.as_str()));
+        match hit {
+            Some((from, to)) => {
+                result.push_str(to);
+                // `from` is a prefix of `rest`, so its length is a char boundary.
+                rest = &rest[from.len()..];
+            }
+            None => {
+                result.push(ch);
+                rest = &rest[ch.len_utf8()..];
+            }
+        }
+    }
+    result
+}
+
+/// Loads the rules stored at `path` and applies them to `text`.
+///
+/// This is best-effort: when `path` is empty, does not point to a regular
+/// file, cannot be loaded, or yields no rules, a copy of `text` is returned
+/// unchanged.
+pub fn apply_from_file(text: &str, path: &str) -> String {
+    let file = Path::new(path);
+    let usable = !path.is_empty() && file.is_file();
+    // A load error is treated like an empty rule set.
+    let rules = if usable {
+        load_rules(file).unwrap_or_default()
+    } else {
+        Vec::new()
+    };
+    if rules.is_empty() {
+        text.to_string()
+    } else {
+        apply(text, &rules)
+    }
+}