From 0f424a84db74c7f57553c4827dd4071c90c37f8b Mon Sep 17 00:00:00 2001 From: Pinapelz Date: Thu, 7 May 2026 19:07:49 -0700 Subject: runtime GUI, configurable audio RMS value --- gui.py | 332 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- server.py | 201 ++++++++++++++++++++++++++++++------- 2 files changed, 494 insertions(+), 39 deletions(-) diff --git a/gui.py b/gui.py index 89817e5..d112469 100644 --- a/gui.py +++ b/gui.py @@ -1,7 +1,9 @@ -from typing import Iterable, List, Tuple, Dict, Any, cast +from typing import Iterable, List, Tuple, Dict, Any, Callable, cast, Optional +import time +import numpy as np import sounddevice as sd from PySide6.QtGui import QFont -from PySide6.QtCore import Qt +from PySide6.QtCore import Qt, QTimer from PySide6.QtWidgets import ( QApplication, QCheckBox, @@ -16,6 +18,7 @@ from PySide6.QtWidgets import ( QLineEdit, QMessageBox, QTabWidget, + QTextEdit, QVBoxLayout, QWidget, ) @@ -41,8 +44,15 @@ class _SettingsDialog(QDialog): def get_value(key: str, fallback: Any) -> Any: return settings.get(key, default_settings.get(key, fallback)) + self.device_indices = [idx for idx, _dev in input_devices] self.device_names = [dev["name"] for _idx, dev in input_devices] + self._monitor_stream: Optional[sd.InputStream] = None + self._monitor_rms: float = 0.0 + self._monitor_active_until: float = 0.0 + self._monitor_error: str = "" + self._monitor_threshold: float = float(get_value("audio_activity_threshold", 0.003)) + root_layout = QVBoxLayout(self) tabs = QTabWidget(self) @@ -127,6 +137,13 @@ class _SettingsDialog(QDialog): self.update_interval_edit = QLineEdit(str(get_value("update_interval_seconds", 2)), whisper_tab) whisper_advanced_layout.addRow(QLabel("Update interval (s):"), self.update_interval_edit) + self.audio_activity_threshold_edit = QLineEdit(str(get_value("audio_activity_threshold", 0.003)), whisper_tab) + whisper_advanced_layout.addRow(QLabel("Audio activity threshold (RMS):"), self.audio_activity_threshold_edit) + + self.audio_indicator_label = QLabel("⚪ Idle", whisper_tab) + self.audio_indicator_label.setAlignment(Qt.AlignmentFlag.AlignLeft | Qt.AlignmentFlag.AlignVCenter) + whisper_advanced_layout.addRow(QLabel("Live input indicator:"), self.audio_indicator_label) + whisper_tab_layout.addWidget(whisper_advanced_group) tabs.addTab(whisper_tab, "Whisper") @@ -177,9 +194,125 @@ class _SettingsDialog(QDialog): button_box.rejected.connect(self.reject) button_layout.addWidget(button_box) + self.device_combo.currentIndexChanged.connect(self._restart_monitor_stream) + self.audio_activity_threshold_edit.textChanged.connect(self._on_threshold_changed) + + self._monitor_timer = QTimer(self) + self._monitor_timer.setInterval(120) + self._monitor_timer.timeout.connect(self._refresh_audio_indicator) + self._monitor_timer.start() + + self._restart_monitor_stream() + self._refresh_audio_indicator() + def _warn(self, title: str, text: str) -> None: QMessageBox.warning(self, title, text) + def _on_threshold_changed(self, text: str) -> None: + try: + parsed = float(text.strip()) + if parsed > 0: + self._monitor_threshold = parsed + except ValueError: + pass + + def _pick_monitor_sample_rate(self, device_index: int, preferred_rate: int) -> Optional[int]: + common_rates: List[int] = [48000, 44100, 32000, 24000, 22050, 16000, 12000, 8000] + tried = set() + for rate in [preferred_rate] + common_rates: + if rate in tried or rate <= 0: + continue + tried.add(rate) + try: + sd.check_input_settings(device=device_index, channels=1, samplerate=rate, dtype="float32") + return rate + except sd.PortAudioError: + continue + return None + + def _monitor_callback(self, indata: np.ndarray, frames: int, time_info: Any, status: Any) -> None: + if status: + self._monitor_error = f"Audio status: {status}" + if indata is None or len(indata) == 0: + return + + chunk = indata[:, 0] + rms = float(np.sqrt(np.mean(np.square(chunk)))) + self._monitor_rms = rms + if rms >= self._monitor_threshold: + self._monitor_active_until = time.monotonic() + 0.6 + + def _refresh_audio_indicator(self) -> None: + if self._monitor_error: + self.audio_indicator_label.setText(f"⚠ {self._monitor_error}") + self.audio_indicator_label.setStyleSheet("color: #f28b82;") + return + + active = time.monotonic() <= self._monitor_active_until + rms_text = f"{self._monitor_rms:.5f}" + if active: + self.audio_indicator_label.setText(f"🟢 Audio detected (RMS {rms_text})") + self.audio_indicator_label.setStyleSheet("color: #8fd18f;") + else: + self.audio_indicator_label.setText(f"⚪ Idle (RMS {rms_text})") + self.audio_indicator_label.setStyleSheet("color: #b0b0b0;") + + def _stop_monitor_stream(self) -> None: + stream = self._monitor_stream + self._monitor_stream = None + if stream is None: + return + try: + stream.stop() + except Exception: + pass + try: + stream.close() + except Exception: + pass + + def _restart_monitor_stream(self, *_args: Any) -> None: + self._stop_monitor_stream() + self._monitor_error = "" + self._monitor_rms = 0.0 + self._monitor_active_until = 0.0 + + selection = self.device_combo.currentIndex() + if selection < 0 or selection >= len(self.device_indices): + self._monitor_error = "No input device selected." + return + + device_index = self.device_indices[selection] + try: + device_info = sd.query_devices(device_index) + except Exception as exc: + self._monitor_error = f"Could not read device info: {exc}" + return + + preferred_rate = int(float(device_info.get("default_samplerate", 48000))) + if preferred_rate <= 0: + preferred_rate = 48000 + + sample_rate = self._pick_monitor_sample_rate(device_index, preferred_rate) + if sample_rate is None: + self._monitor_error = "No supported sample rate for monitor stream." + return + + blocksize = max(256, int(sample_rate * 0.1)) + try: + stream = sd.InputStream( + device=device_index, + channels=1, + samplerate=sample_rate, + dtype="float32", + callback=self._monitor_callback, + blocksize=blocksize, + ) + stream.start() + self._monitor_stream = stream + except Exception as exc: + self._monitor_error = f"Unable to start monitor: {exc}" + def accept(self) -> None: selection = self.device_combo.currentIndex() if selection < 0: @@ -215,6 +348,14 @@ class _SettingsDialog(QDialog): self._warn("Invalid update interval", "Update interval must be a positive number.") return + try: + audio_activity_threshold = float(self.audio_activity_threshold_edit.text().strip()) + if audio_activity_threshold <= 0: + raise ValueError + except ValueError: + self._warn("Invalid audio threshold", "Audio activity threshold must be a positive number.") + return + try: ollama_context_window = int(self.ollama_context_edit.text().strip()) if ollama_context_window <= 0: @@ -241,14 +382,22 @@ class _SettingsDialog(QDialog): "language": self.language_edit.text().strip(), "context_seconds": context_seconds, "update_interval_seconds": update_interval_seconds, + "audio_activity_threshold": audio_activity_threshold, "use_ollama_cleanup": self.use_ollama_cleanup_checkbox.isChecked(), "ollama_device": self.ollama_device_combo.currentText(), "ollama_model": self.ollama_model_edit.text().strip(), "ollama_context_window": ollama_context_window, "ollama_raw_batch_size": ollama_raw_batch_size, } + self._monitor_timer.stop() + self._stop_monitor_stream() super().accept() + def reject(self) -> None: + self._monitor_timer.stop() + self._stop_monitor_stream() + super().reject() + def select_settings( settings: Dict[str, Any], @@ -284,6 +433,185 @@ def select_settings( return dialog.selected_settings +AudioActivityProvider = Callable[[], Dict[str, Any]] +RuntimeLogLinesProvider = Callable[[], List[str]] +SubtitleLinesProvider = Callable[[], List[str]] + + +class _RuntimeDashboard(QWidget): + def __init__( + self, + get_audio_activity: AudioActivityProvider, + get_runtime_logs: RuntimeLogLinesProvider, + get_subtitle_lines: SubtitleLinesProvider, + on_close: Callable[[], None], + ) -> None: + super().__init__() + self._get_audio_activity = get_audio_activity + self._get_runtime_logs = get_runtime_logs + self._get_subtitle_lines = get_subtitle_lines + self._on_close = on_close + self._closed = False + self._last_rendered_runtime_logs: str = "" + self._last_rendered_final_logs: str = "" + + self.setWindowTitle("auto-live-tl") + self.setMinimumSize(1100, 700) + + layout = QVBoxLayout(self) + + title = QLabel("auto-live-tl", self) + title.setStyleSheet("font-size: 22px; font-weight: 700; color: #000000;") + layout.addWidget(title) + + self.audio_indicator = QLabel("⚪ Idle", self) + self.audio_indicator.setStyleSheet("font-size: 16px; color: #b0b0b0; font-weight: 600;") + layout.addWidget(self.audio_indicator) + + self.audio_details = QLabel("RMS 0.00000 | threshold 0.00300", self) + self.audio_details.setStyleSheet("font-size: 13px; color: #9aa0a6;") + layout.addWidget(self.audio_details) + + raw_group = QGroupBox("Debug Log (It's recommended to fetch the final data via the SSE API, see the README)", self) + raw_group_layout = QVBoxLayout(raw_group) + + raw_title = QLabel("System / Raw Output", raw_group) + raw_group_layout.addWidget(raw_title) + + self.runtime_log_view = QTextEdit(raw_group) + self.runtime_log_view.setReadOnly(True) + self.runtime_log_view.setPlaceholderText("Waiting for raw Whisper output...") + self.runtime_log_view.setStyleSheet( + """ + QTextEdit { + background: #111417; + color: #d8dee9; + border: 1px solid #2f3742; + border-radius: 8px; + padding: 8px; + font-family: 'Consolas', 'Monaco', monospace; + font-size: 13px; + line-height: 1.4; + } + """ + ) + raw_group_layout.addWidget(self.runtime_log_view, 3) + + final_title = QLabel("Final (Sent via SSE)", raw_group) + raw_group_layout.addWidget(final_title) + + self.final_log_view = QTextEdit(raw_group) + self.final_log_view.setReadOnly(True) + self.final_log_view.setPlaceholderText("Waiting for FINAL output...") + self.final_log_view.setStyleSheet( + """ + QTextEdit { + background: #0f1410; + color: #dcf9dd; + border: 1px solid #2f4a35; + border-radius: 8px; + padding: 8px; + font-family: 'Consolas', 'Monaco', monospace; + font-size: 14px; + font-weight: 700; + line-height: 1.6; + } + """ + ) + raw_group_layout.addWidget(self.final_log_view, 2) + + layout.addWidget(raw_group, 1) + + self._timer = QTimer(self) + self._timer.setInterval(150) + self._timer.timeout.connect(self._refresh) + self._timer.start() + self._refresh() + + def _shutdown(self) -> None: + if self._closed: + return + self._closed = True + self._timer.stop() + try: + self._on_close() + except Exception: + pass + + def closeEvent(self, event: Any) -> None: # type: ignore[override] + self._shutdown() + super().closeEvent(event) + + def _refresh(self) -> None: + try: + activity = self._get_audio_activity() + except Exception: + activity = {} + + active = bool(activity.get("active", False)) + try: + rms = float(activity.get("rms", 0.0)) + except (TypeError, ValueError): + rms = 0.0 + try: + threshold = float(activity.get("threshold", 0.0)) + except (TypeError, ValueError): + threshold = 0.0 + + if active: + self.audio_indicator.setText("🟢 Audio detected") + self.audio_indicator.setStyleSheet("font-size: 16px; color: #8fd18f; font-weight: 600;") + else: + self.audio_indicator.setText("⚪ Idle") + self.audio_indicator.setStyleSheet("font-size: 16px; color: #b0b0b0; font-weight: 600;") + self.audio_details.setText(f"RMS {rms:.5f} | threshold {threshold:.5f}") + + try: + logs = self._get_runtime_logs() + except Exception: + logs = [] + runtime_lines = [line for line in logs if "[FINAL]" not in line] + final_lines = [line for line in logs if "[FINAL]" in line] + + joined_runtime_logs = "\n".join(runtime_lines) + if joined_runtime_logs != self._last_rendered_runtime_logs: + self._last_rendered_runtime_logs = joined_runtime_logs + self.runtime_log_view.setPlainText(joined_runtime_logs) + log_scroll = self.runtime_log_view.verticalScrollBar() + log_scroll.setValue(log_scroll.maximum()) + + joined_final_logs = "\n\n".join(final_lines) + if joined_final_logs != self._last_rendered_final_logs: + self._last_rendered_final_logs = joined_final_logs + self.final_log_view.setPlainText(joined_final_logs) + final_scroll = self.final_log_view.verticalScrollBar() + final_scroll.setValue(final_scroll.maximum()) + + + + +def run_runtime_dashboard( + get_audio_activity: AudioActivityProvider, + get_runtime_logs: RuntimeLogLinesProvider, + get_subtitle_lines: SubtitleLinesProvider, + on_close: Callable[[], None], +) -> None: + app = QApplication.instance() + if app is None: + app = QApplication([]) + app = cast(QApplication, app) + app.setFont(QFont("Calibri", 12)) + + dashboard = _RuntimeDashboard( + get_audio_activity=get_audio_activity, + get_runtime_logs=get_runtime_logs, + get_subtitle_lines=get_subtitle_lines, + on_close=on_close, + ) + dashboard.show() + app.exec() + + def prompt_input_sample_rate(device_index: int, common_rates: Iterable[int]) -> int: rates = list(common_rates) while True: diff --git a/server.py b/server.py index 21d292f..7c6b67e 100644 --- a/server.py +++ b/server.py @@ -14,7 +14,7 @@ from ollama import ChatResponse import numpy as np import sounddevice as sd from faster_whisper import WhisperModel -from gui import select_settings, prompt_input_sample_rate +from gui import select_settings, prompt_input_sample_rate, run_runtime_dashboard from routes import register_routes from config import _SYSTEM_PROMPT, _LLM_EMPTY_SENTINELS, _HALLUCINATION_PHRASES @@ -24,7 +24,10 @@ BUFFER_SECONDS: float = 10 MAX_SAMPLES: int = 0 PROCESS_INTERVAL_SECONDS: float = 2 SSE_EVENT_SUBTITLE: str = "subtitle" +SSE_EVENT_AUDIO_ACTIVITY: str = "audio_activity" SSE_KEEPALIVE_SECONDS: int = 15 +RUNTIME_SUBTITLE_LINES_MAX: int = 120 +RUNTIME_LOG_LINES_MAX: int = 300 USE_OLLAMA_CLEANUP: bool = True OLLAMA_MODEL: str = "qwen2.5:7b-instruct" @@ -44,6 +47,7 @@ DEFAULT_SETTINGS: Dict[str, Any] = { "language": "", "context_seconds": 10, "update_interval_seconds": 2, + "audio_activity_threshold": 0.003, "use_ollama_cleanup": True, "ollama_device": "GPU", "ollama_model": "qwen2.5:7b-instruct", @@ -62,8 +66,23 @@ model: Optional[WhisperModel] = None WHISPER_TASK: str = DEFAULT_SETTINGS["task"] WHISPER_BEAM_SIZE: int = DEFAULT_SETTINGS["beam_size"] WHISPER_LANGUAGE: str = DEFAULT_SETTINGS["language"] - -last_payload: Optional[Dict[str, Any]] = None +AUDIO_ACTIVITY_THRESHOLD: float = float(DEFAULT_SETTINGS["audio_activity_threshold"]) +AUDIO_ACTIVITY_HOLD_SECONDS: float = 0.75 +AUDIO_ACTIVITY_REPORT_INTERVAL_SECONDS: float = 0.5 + +last_subtitle_payload: Optional[Dict[str, Any]] = None +last_audio_activity_payload: Dict[str, Any] = { + "active": False, + "rms": 0.0, + "threshold": AUDIO_ACTIVITY_THRESHOLD, +} +_audio_active_until: float = 0.0 +_audio_last_emit: float = 0.0 +_audio_state_lock: threading.Lock = threading.Lock() +recent_subtitle_lines: Deque[str] = deque(maxlen=RUNTIME_SUBTITLE_LINES_MAX) +recent_subtitle_lines_lock: threading.Lock = threading.Lock() +runtime_logs: Deque[str] = deque(maxlen=RUNTIME_LOG_LINES_MAX) +runtime_logs_lock: threading.Lock = threading.Lock() clients: Set[queue.Queue] = set() clients_lock: threading.Lock = threading.Lock() SERVER_HOST: str = "127.0.0.1" @@ -181,7 +200,14 @@ def normalize_llm_output(text: str) -> str: return text -def is_hallucination(text: str) -> bool: +def add_runtime_log(kind: str, message: str) -> None: + timestamp = time.strftime("%H:%M:%S") + line = f"[{timestamp}] [{kind.upper()}] {message}" + with runtime_logs_lock: + runtime_logs.append(line) + + +def is_hallucination(text: str) -> Optional[str]: """ Algorithmic hallucination detection by checking if the output from whisper is unusually long given sliding window length, or if there are too many repeating words/phrases @@ -191,11 +217,10 @@ def is_hallucination(text: str) -> bool: """ words = text.split() if not words: - return False + return None max_expected = int(BUFFER_SECONDS * 4.5) if len(words) > max_expected: - print(f"🔴 Hallucination (too long: {len(words)} words > {max_expected}): {text[:60]!r}") - return True + return f"too long: {len(words)} words > {max_expected}" clean = [re.sub(r"[^\w']+", "", w).lower() for w in words] clean = [w for w in clean if w] for n in [2, 3]: @@ -206,19 +231,16 @@ def is_hallucination(text: str) -> bool: if count >= 3: tokens_covered = count * n if tokens_covered / max(1, len(clean)) > 0.35: - print(f"🔴 Hallucination (\'{top}\' x{count}, covers {tokens_covered}/{len(clean)} tokens): {text[:60]!r}") - return True + return f"repeating phrase '{top}' x{count} (covers {tokens_covered}/{len(clean)} tokens)" top, count = Counter(clean).most_common(1)[0] if count >= 4 and count / len(clean) > 0.40: - print(f"🔴 Hallucination (\'{top}\' x{count}, {count/len(clean):.0%}): {text[:60]!r}") - return True + return f"repeating token '{top}' x{count} ({count/len(clean):.0%})" normalized = re.sub(r"[^\w\s]", "", text.lower()).strip() if normalized in _HALLUCINATION_PHRASES: - print(f"🔴 Hallucination (blocked phrase): {text!r}") - return True + return "blocked phrase pattern" - return False + return None def llm_processing_loop() -> None: @@ -235,6 +257,7 @@ def llm_processing_loop() -> None: cleaned: Optional[str] = cleanup_subtitle_with_ollama(raw_text, context) if cleaned is None: + add_runtime_log("LLM", "cleanup failed, falling back to raw text") cleaned = raw_text else: cleaned = normalize_llm_output(cleaned) @@ -242,10 +265,10 @@ def llm_processing_loop() -> None: if cleaned: with subtitle_context_lock: subtitle_context.append(cleaned) - print(f"🔵 (cleaned) {cleaned}") + add_runtime_log("FINAL", cleaned) broadcast_subtitle(cleaned) else: - print("🟡 (LLM: no new content)") + add_runtime_log("LLM", "no new content from cleanup") def run_whisper(audio_np: np.ndarray) -> str: @@ -258,9 +281,11 @@ def run_whisper(audio_np: np.ndarray) -> str: if not text: return text - print(f"🟢 (raw) {text}") + add_runtime_log("RAW", text) - if is_hallucination(text): + hallucination_reason = is_hallucination(text) + if hallucination_reason: + add_runtime_log("HALLUCINATION", f"{hallucination_reason} | text={text}") return text if USE_OLLAMA_CLEANUP: @@ -272,9 +297,11 @@ def run_whisper(audio_np: np.ndarray) -> str: else: batch_text = None if batch_text is not None: + add_runtime_log("RAW->LLM", batch_text.replace("\n", " || ")) try: llm_input_queue.put_nowait(batch_text) except queue.Full: + add_runtime_log("LLM", "queue full, dropping previous batch") try: llm_input_queue.get_nowait() except queue.Empty: @@ -282,26 +309,50 @@ def run_whisper(audio_np: np.ndarray) -> str: try: llm_input_queue.put_nowait(batch_text) except queue.Full: - pass + add_runtime_log("LLM", "queue still full, skipped batch") else: + add_runtime_log("FINAL", text) broadcast_subtitle(text) return text -def broadcast_subtitle(text: str) -> None: - global last_payload - payload: Dict[str, Any] = {"text": text} - last_payload = payload +def broadcast_event(event: str, payload: Dict[str, Any]) -> None: + message: Dict[str, Any] = {"event": event, "payload": payload} with clients_lock: targets = list(clients) for client_queue in targets: try: - client_queue.put_nowait(payload) + client_queue.put_nowait(message) except queue.Full: pass +def broadcast_subtitle(text: str) -> None: + global last_subtitle_payload + payload: Dict[str, Any] = {"text": text} + last_subtitle_payload = payload + with recent_subtitle_lines_lock: + if not recent_subtitle_lines or recent_subtitle_lines[-1] != text: + recent_subtitle_lines.append(text) + broadcast_event(SSE_EVENT_SUBTITLE, payload) + + +def get_audio_activity_snapshot() -> Dict[str, Any]: + with _audio_state_lock: + return dict(last_audio_activity_payload) + + +def get_recent_subtitle_lines_snapshot() -> List[str]: + with recent_subtitle_lines_lock: + return list(recent_subtitle_lines) + + +def get_runtime_logs_snapshot() -> List[str]: + with runtime_logs_lock: + return list(runtime_logs) + + def format_sse_event(event: str, payload: Dict[str, Any]) -> str: """ Creates an SSE event raw payload @@ -311,21 +362,28 @@ def format_sse_event(event: str, payload: Dict[str, Any]) -> str: def event_stream() -> Iterator[str]: - client_queue: queue.Queue = queue.Queue(maxsize=10) + client_queue: queue.Queue = queue.Queue(maxsize=20) with clients_lock: clients.add(client_queue) - if last_payload: - yield format_sse_event(SSE_EVENT_SUBTITLE, last_payload) + if last_subtitle_payload: + yield format_sse_event(SSE_EVENT_SUBTITLE, last_subtitle_payload) + yield format_sse_event(SSE_EVENT_AUDIO_ACTIVITY, last_audio_activity_payload) try: while True: try: - payload_data = client_queue.get(timeout=SSE_KEEPALIVE_SECONDS) + event_data = client_queue.get(timeout=SSE_KEEPALIVE_SECONDS) except queue.Empty: yield ": keep-alive\n\n" continue - yield format_sse_event(SSE_EVENT_SUBTITLE, payload_data) + if not isinstance(event_data, dict): + continue + event_name = str(event_data.get("event", SSE_EVENT_SUBTITLE)) + payload = event_data.get("payload", {}) + if not isinstance(payload, dict): + payload = {} + yield format_sse_event(event_name, payload) finally: with clients_lock: clients.discard(client_queue) @@ -367,6 +425,33 @@ def list_audio_devices() -> None: print(f"[{idx}] {dev['name']} ({io_str})") +def publish_audio_activity(chunk_rms: float) -> None: + global _audio_active_until, _audio_last_emit, last_audio_activity_payload + + now_mono = time.monotonic() + if chunk_rms >= AUDIO_ACTIVITY_THRESHOLD: + _audio_active_until = now_mono + AUDIO_ACTIVITY_HOLD_SECONDS + + with _audio_state_lock: + active = now_mono <= _audio_active_until + previous_active = bool(last_audio_activity_payload.get("active", False)) + state_changed = active != previous_active + report_due = (now_mono - _audio_last_emit) >= AUDIO_ACTIVITY_REPORT_INTERVAL_SECONDS + + if not state_changed and not report_due: + return + + payload: Dict[str, Any] = { + "active": active, + "rms": round(chunk_rms, 6), + "threshold": AUDIO_ACTIVITY_THRESHOLD, + } + last_audio_activity_payload = payload + _audio_last_emit = now_mono + + broadcast_event(SSE_EVENT_AUDIO_ACTIVITY, payload) + + def audio_callback(indata: np.ndarray, frames: int, time_info: Any, status: Any) -> None: """ Callback definition for audio sink. Unload all data into global audio_buffer @@ -375,6 +460,8 @@ def audio_callback(indata: np.ndarray, frames: int, time_info: Any, status: Any) print(f"Audio status: {status}") # Take first channel chunk: np.ndarray = indata[:, 0].copy() + chunk_rms: float = float(np.sqrt(np.mean(np.square(chunk)))) if len(chunk) > 0 else 0.0 + publish_audio_activity(chunk_rms) global audio_buffer with lock: @@ -390,7 +477,7 @@ def is_silent(audio_16k: Optional[np.ndarray]) -> bool: if audio_16k is None or len(audio_16k) == 0: return False rms: float = float(np.sqrt(np.mean(np.square(audio_16k)))) # root mean square - return rms < 0.003 + return rms < AUDIO_ACTIVITY_THRESHOLD def processing_loop() -> None: @@ -433,6 +520,7 @@ def main() -> None: global CAPTURE_SAMPLE_RATE, MAX_SAMPLES, model, WHISPER_TASK, WHISPER_BEAM_SIZE, WHISPER_LANGUAGE global BUFFER_SECONDS, PROCESS_INTERVAL_SECONDS, USE_OLLAMA_CLEANUP global OLLAMA_MODEL, OLLAMA_CONTEXT_WINDOW, RAW_BATCH_SIZE, subtitle_context + global AUDIO_ACTIVITY_THRESHOLD, last_audio_activity_payload, _audio_active_until, _audio_last_emit start_subtitle_server() settings: Dict[str, Any] = load_settings() @@ -478,10 +566,22 @@ def main() -> None: WHISPER_LANGUAGE = settings["language"].strip() if settings["language"] else "" BUFFER_SECONDS = float(settings.get("context_seconds", BUFFER_SECONDS)) PROCESS_INTERVAL_SECONDS = float(settings.get("update_interval_seconds", PROCESS_INTERVAL_SECONDS)) + AUDIO_ACTIVITY_THRESHOLD = float(settings.get("audio_activity_threshold", AUDIO_ACTIVITY_THRESHOLD)) if BUFFER_SECONDS <= 0: BUFFER_SECONDS = DEFAULT_SETTINGS["context_seconds"] if PROCESS_INTERVAL_SECONDS <= 0: PROCESS_INTERVAL_SECONDS = DEFAULT_SETTINGS["update_interval_seconds"] + if AUDIO_ACTIVITY_THRESHOLD <= 0: + AUDIO_ACTIVITY_THRESHOLD = float(DEFAULT_SETTINGS["audio_activity_threshold"]) + + last_audio_activity_payload = { + "active": False, + "rms": 0.0, + "threshold": AUDIO_ACTIVITY_THRESHOLD, + } + _audio_active_until = 0.0 + _audio_last_emit = 0.0 + broadcast_event(SSE_EVENT_AUDIO_ACTIVITY, last_audio_activity_payload) model = WhisperModel(model_name, device=whisper_device, compute_type=compute_type) @@ -495,24 +595,51 @@ def main() -> None: print(f"Model: {model_name} | task={WHISPER_TASK} | beam_size={WHISPER_BEAM_SIZE}") print(f"Compute: device={whisper_device} | compute_type={compute_type}") print(f"Capture sample rate: {CAPTURE_SAMPLE_RATE} Hz (resampling to {TARGET_SAMPLE_RATE} Hz)") + print(f"Audio activity threshold (RMS): {AUDIO_ACTIVITY_THRESHOLD}") print(f"Ollama cleanup: {'enabled' if USE_OLLAMA_CLEANUP else 'disabled'} (model={OLLAMA_MODEL})") processing_thread = threading.Thread(target=processing_loop, daemon=True) processing_thread.start() - with sd.InputStream( + + with recent_subtitle_lines_lock: + recent_subtitle_lines.clear() + with runtime_logs_lock: + runtime_logs.clear() + + add_runtime_log("SYSTEM", "Runtime dashboard started") + add_runtime_log("SYSTEM", f"Device: {device_info['name']} @ {CAPTURE_SAMPLE_RATE} Hz") + add_runtime_log("SYSTEM", f"Task={WHISPER_TASK} | Beam={WHISPER_BEAM_SIZE} | Cleanup={'on' if USE_OLLAMA_CLEANUP else 'off'}") + + stream = sd.InputStream( device=device_index, channels=1, samplerate=CAPTURE_SAMPLE_RATE, dtype="float32", callback=audio_callback, blocksize=int(CAPTURE_SAMPLE_RATE * 0.5), - ): - print("Listening... Press Ctrl+C to stop.") + ) + + def _on_dashboard_close() -> None: + print("Stopping.") + + try: + stream.start() + print("Listening... Close the runtime window to stop.") + run_runtime_dashboard( + get_audio_activity=get_audio_activity_snapshot, + get_runtime_logs=get_runtime_logs_snapshot, + get_subtitle_lines=get_recent_subtitle_lines_snapshot, + on_close=_on_dashboard_close, + ) + finally: + try: + stream.stop() + except Exception: + pass try: - while True: - time.sleep(1) - except KeyboardInterrupt: - print("Stopping.") + stream.close() + except Exception: + pass if __name__ == "__main__": -- cgit v1.2.3