aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPinapelz <yukais@pinapelz.com>2026-05-09 00:27:38 -0700
committerPinapelz <yukais@pinapelz.com>2026-05-09 00:27:57 -0700
commitebc390f9e74378fdaba3e79e9da4a76c436464e2 (patch)
tree0b2829e10d0c846902eb2bf53ad05dcdb8cff3ce
parent41c4a2b287030f93b96db27dc6783e2b12aab99e (diff)
implement gpt-realtime-translate
-rw-r--r--README.md12
-rw-r--r--gui/gui_settings.py2
-rw-r--r--openai_realtime.py294
-rw-r--r--pyproject.toml1
-rw-r--r--server.py104
-rw-r--r--uv.lock11
-rw-r--r--youtube-subtitle.user.js2
7 files changed, 402 insertions, 24 deletions
diff --git a/README.md b/README.md
index c92f653..f880bf9 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,14 @@ A GUI is available for configuration
`server.py` serves a backend for translating incoming audio data. It expects some other client to hit the `/events` endpoint to fetch the translated data.
+## Translation backends
+
+You can now choose between two subtitle backends in the settings dialog:
+
+- **Whisper + optional Ollama cleanup**
+- **OpenAI Realtime translation** using `gpt-realtime-translate` (requires an OpenAI API key; usage is billed)
+ - This bypasses `faster-whisper` and `ollama`. In this mode auto-live-tl only serves to encode PCM data into the appropriate format for `gpt-realtime-translate`.
+
# Clients:
@@ -103,7 +111,3 @@ https://github.com/user-attachments/assets/db602a11-2d13-4e58-a5e8-1d4a71c1be0e
Example 2:
https://github.com/user-attachments/assets/a480809e-77f7-4b66-9686-aa2ffea8333d
-
-
-
-
diff --git a/gui/gui_settings.py b/gui/gui_settings.py
index c6d98b5..66f9a6a 100644
--- a/gui/gui_settings.py
+++ b/gui/gui_settings.py
@@ -216,7 +216,7 @@ class _SettingsDialog(QDialog):
openai_tab_layout.addLayout(openai_layout)
self.openai_hint_label = QLabel(
- "When enabled, source audio is streamed to OpenAI /v1/realtime/translations and subtitle SSE events are produced from realtime transcript output. Ollama cleanup is bypassed.",
+ "When enabled, source audio is streamed to OpenAI /v1/realtime/translations (gpt-realtime-translate) and subtitle SSE events are produced from realtime transcript output. Ollama cleanup is bypassed.",
openai_tab,
)
self.openai_hint_label.setWordWrap(True)
diff --git a/openai_realtime.py b/openai_realtime.py
new file mode 100644
index 0000000..56ba849
--- /dev/null
+++ b/openai_realtime.py
@@ -0,0 +1,294 @@
+import base64
+import json
+import queue
+import re
+import threading
+import time
+from typing import Any, Callable, List, Optional, Tuple
+
+import numpy as np
+import websocket
+
+
+# Log callback; the translator calls it as (tag, message), e.g. ("FINAL", text) or ("OPENAI", status).
+AddRuntimeLogFn = Callable[[str, str], None]
+# Subtitle callback; receives one finished subtitle string per call.
+BroadcastSubtitleFn = Callable[[str], None]
+# Resampler callback; called as (audio, source_rate, target_rate) and returns the resampled array.
+ResampleAudioFn = Callable[[np.ndarray, int, int], np.ndarray]
+
+
+class OpenAIRealtimeTranslator:
+    """Stream captured audio to the OpenAI realtime translation socket and emit subtitles.
+
+    ``start()`` launches a daemon thread that keeps one WebSocket session open
+    and reconnects after failures. ``enqueue_audio_chunk()`` feeds audio into a
+    bounded queue that a per-session sender thread drains. Transcript deltas
+    read from the socket are cut into sentence-sized subtitles and handed to
+    the ``add_runtime_log`` and ``broadcast_subtitle`` callbacks.
+    """
+
+    # Socket read timeout; every timeout is also a chance to flush stale text.
+    _RECV_TIMEOUT_SECONDS = 0.6
+    # How long the sender blocks on the queue before re-checking its stop flags.
+    _SENDER_POLL_SECONDS = 0.2
+    # Pending text longer than this is split even without sentence punctuation.
+    _MAX_PENDING_CHARS = 180
+    # A space is only used as the overflow split point when it lies past this index.
+    _MIN_SPLIT_INDEX = 80
+    _WHITESPACE_RE = re.compile(r"\s+")
+    # ASCII terminators must be followed by whitespace or end of text (so "3.14"
+    # is not split). The ideographic full stop and full-width !/? end a sentence
+    # on their own because no space follows them in CJK text.
+    _SENTENCE_RE = re.compile(
+        r"(.+?(?:[.!?](?=\s|$)|[\u3002\uff01\uff1f]+))",
+        flags=re.DOTALL,
+    )
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        model: str,
+        output_language: str,
+        safety_identifier: str,
+        add_runtime_log: AddRuntimeLogFn,
+        broadcast_subtitle: BroadcastSubtitleFn,
+        resample_audio: ResampleAudioFn,
+        target_sample_rate: int = 24000,
+        reconnect_seconds: float = 2.0,
+        buffer_stale_seconds: float = 1.1,
+        ws_url_template: str = "wss://api.openai.com/v1/realtime/translations?model={model}",
+        queue_maxsize: int = 200,
+    ) -> None:
+        """Store the configuration; no connection is opened until ``start()``.
+
+        Args:
+            api_key: Sent as the bearer token in the ``Authorization`` header.
+            model: Substituted for ``{model}`` in ``ws_url_template``.
+            output_language: Sent as ``session.audio.output.language`` in the
+                initial ``session.update`` message.
+            safety_identifier: When non-empty, sent as the
+                ``OpenAI-Safety-Identifier`` header.
+            add_runtime_log: Called as ``(tag, message)`` for status lines and
+                finished subtitles.
+            broadcast_subtitle: Called with each finished subtitle string.
+            resample_audio: Called as ``(audio, source_rate, target_rate)``.
+            target_sample_rate: Rate audio is resampled to before encoding.
+            reconnect_seconds: Pause between connection attempts.
+            buffer_stale_seconds: Pending text is flushed once this long has
+                passed since the last transcript delta.
+            ws_url_template: WebSocket URL containing a ``{model}`` placeholder.
+            queue_maxsize: Capacity of the outgoing audio queue.
+        """
+        self.api_key = api_key
+        self.model = model
+        self.output_language = output_language
+        self.safety_identifier = safety_identifier
+
+        self.target_sample_rate = target_sample_rate
+        self.reconnect_seconds = reconnect_seconds
+        self.buffer_stale_seconds = buffer_stale_seconds
+        self.ws_url_template = ws_url_template
+
+        self._add_runtime_log = add_runtime_log
+        self._broadcast_subtitle = broadcast_subtitle
+        self._resample_audio = resample_audio
+
+        self._audio_queue: queue.Queue = queue.Queue(maxsize=queue_maxsize)
+        # Unique object pushed onto the queue by stop() to wake the sender.
+        self._stop_sentinel: object = object()
+        self._stop_event: threading.Event = threading.Event()
+        self._transcript_buffer: str = ""
+        self._last_delta_monotonic: float = 0.0
+        self._transcript_lock: threading.Lock = threading.Lock()
+        self._thread: Optional[threading.Thread] = None
+
+    @staticmethod
+    def _float_audio_to_pcm16_base64(audio_np: np.ndarray) -> str:
+        """Encode float samples as base64 little-endian 16-bit PCM.
+
+        Samples are clipped to [-1.0, 1.0] before scaling. NaN samples become
+        silence instead of going through an undefined float-to-int cast.
+        Returns ``""`` for empty input.
+        """
+        samples = np.asarray(audio_np)
+        if samples.size == 0:
+            return ""
+        clipped = np.clip(np.nan_to_num(samples, nan=0.0), -1.0, 1.0)
+        # "<i2" pins little-endian byte order regardless of the host platform.
+        pcm16 = (clipped * 32767.0).astype("<i2")
+        return base64.b64encode(pcm16.tobytes()).decode("ascii")
+
+    @classmethod
+    def _normalize_subtitle_chunk(cls, text: str) -> str:
+        """Collapse every whitespace run to one space and trim both ends."""
+        return cls._WHITESPACE_RE.sub(" ", text).strip()
+
+    def _extract_completed_sentences(self, buffer: str) -> Tuple[List[str], str]:
+        """Split ``buffer`` into finished subtitles and the unfinished tail.
+
+        Returns:
+            ``(completed, remaining)``: ``completed`` holds normalized
+            sentences and newline-terminated lines in order; ``remaining`` is
+            the text that must stay buffered for the next delta.
+        """
+        completed: List[str] = []
+        remaining = buffer
+
+        while True:
+            match = self._SENTENCE_RE.search(remaining)
+            if match is None:
+                break
+            sentence = self._normalize_subtitle_chunk(match.group(1))
+            if sentence:
+                completed.append(sentence)
+            remaining = remaining[match.end():].lstrip()
+
+        if "\n" in remaining:
+            *finished_lines, tail = remaining.split("\n")
+            for line in finished_lines:
+                normalized_line = self._normalize_subtitle_chunk(line)
+                if normalized_line:
+                    completed.append(normalized_line)
+            # Keep trailing whitespace: it separates this word from the next delta.
+            remaining = tail.lstrip()
+
+        return completed, remaining
+
+    @staticmethod
+    def _clear_queue(target_queue: queue.Queue) -> None:
+        """Discard every item currently in ``target_queue`` without blocking."""
+        while True:
+            try:
+                target_queue.get_nowait()
+            except queue.Empty:
+                break
+
+    def _emit_subtitle(self, text: str) -> None:
+        """Log a finished subtitle under the FINAL tag and broadcast it."""
+        self._add_runtime_log("FINAL", text)
+        self._broadcast_subtitle(text)
+
+    def _flush_transcript_buffer(self, force: bool = False) -> None:
+        """Emit the pending transcript text as one subtitle and clear it.
+
+        Args:
+            force: When False, text shorter than two characters is left in the
+                buffer so it can be joined with later deltas.
+        """
+        with self._transcript_lock:
+            text = self._normalize_subtitle_chunk(self._transcript_buffer)
+            if not text:
+                self._transcript_buffer = ""
+                return
+            if not force and len(text) < 2:
+                return
+            self._transcript_buffer = ""
+
+        # Callbacks run outside the lock so they cannot block delta handling.
+        self._emit_subtitle(text)
+
+    def _flush_transcript_buffer_if_stale(self) -> None:
+        """Flush pending text once ``buffer_stale_seconds`` passed since the last delta."""
+        with self._transcript_lock:
+            if not self._transcript_buffer:
+                return
+            idle_seconds = time.monotonic() - self._last_delta_monotonic
+            if idle_seconds < self.buffer_stale_seconds:
+                return
+            # Take the text under the same lock that made the staleness decision.
+            text = self._normalize_subtitle_chunk(self._transcript_buffer)
+            self._transcript_buffer = ""
+
+        if text:
+            self._emit_subtitle(text)
+
+    def _handle_transcript_delta(self, delta: str) -> None:
+        """Append ``delta`` to the pending text and emit whatever is now complete."""
+        if not delta:
+            return
+
+        delta = delta.replace("\r", "")
+
+        with self._transcript_lock:
+            self._last_delta_monotonic = time.monotonic()
+            completed, remaining = self._extract_completed_sentences(self._transcript_buffer + delta)
+
+            if len(remaining) > self._MAX_PENDING_CHARS:
+                split_idx = remaining.rfind(" ")
+                if split_idx <= self._MIN_SPLIT_INDEX:
+                    # No usable space (e.g. scripts written without spaces):
+                    # cut at the limit so the pending text stays bounded.
+                    split_idx = self._MAX_PENDING_CHARS
+                overflow = self._normalize_subtitle_chunk(remaining[:split_idx])
+                if overflow:
+                    completed.append(overflow)
+                remaining = remaining[split_idx:].lstrip()
+
+            self._transcript_buffer = remaining
+
+        for sentence in completed:
+            self._emit_subtitle(sentence)
+
+    def _audio_sender_loop(
+        self,
+        ws: websocket.WebSocket,
+        connection_closed: Optional[threading.Event] = None,
+    ) -> None:
+        """Forward queued base64 audio to ``ws`` until stopped.
+
+        Args:
+            ws: Connected socket to send on.
+            connection_closed: Set by the owner of ``ws`` when that connection
+                ends, so this thread exits instead of taking audio meant for
+                the next connection.
+        """
+        while not self._stop_event.is_set():
+            if connection_closed is not None and connection_closed.is_set():
+                break
+            try:
+                item = self._audio_queue.get(timeout=self._SENDER_POLL_SECONDS)
+            except queue.Empty:
+                continue
+
+            if item is self._stop_sentinel:
+                break
+            if not isinstance(item, str) or not item:
+                continue
+
+            payload = {
+                "type": "session.input_audio_buffer.append",
+                "audio": item,
+            }
+            try:
+                ws.send(json.dumps(payload))
+            except Exception as exc:
+                shutting_down = self._stop_event.is_set() or (
+                    connection_closed is not None and connection_closed.is_set()
+                )
+                # A failed send during shutdown is expected; report only real failures.
+                if not shutting_down:
+                    self._add_runtime_log("OPENAI", f"Audio send failed: {exc}")
+                break
+
+    def _handle_server_event(self, raw: Any) -> None:
+        """Decode one socket message and dispatch it by its ``type`` field."""
+        try:
+            event = json.loads(raw)
+        except ValueError:
+            # Covers JSONDecodeError and invalid UTF-8 in binary frames.
+            self._add_runtime_log("OPENAI", "Received non-JSON event from realtime translation socket")
+            return
+
+        if not isinstance(event, dict):
+            self._add_runtime_log("OPENAI", "Ignoring non-object event from realtime translation socket")
+            return
+
+        event_type = str(event.get("type", ""))
+        if event_type == "session.output_transcript.delta":
+            delta = event.get("delta")
+            # A null or non-string delta must not be rendered as the text "None".
+            if isinstance(delta, str):
+                self._handle_transcript_delta(delta)
+        elif event_type in {"session.output_transcript.done", "session.output_transcript.completed"}:
+            self._flush_transcript_buffer(force=True)
+        elif event_type in {"error", "session.error"}:
+            self._add_runtime_log("OPENAI", f"Realtime API error: {json.dumps(event, ensure_ascii=False)}")
+        elif event_type == "session.updated":
+            self._add_runtime_log("OPENAI", "Realtime session configured")
+
+    def _run_session(self, ws_url: str) -> None:
+        """Open one connection and process its events until it ends or stop is requested.
+
+        Connection and protocol errors propagate to the caller. The socket is
+        closed, pending text is flushed and the sender thread is joined in all
+        cases.
+        """
+        headers: List[str] = [f"Authorization: Bearer {self.api_key}"]
+        if self.safety_identifier:
+            headers.append(f"OpenAI-Safety-Identifier: {self.safety_identifier}")
+
+        ws = websocket.WebSocket()
+        connection_closed = threading.Event()
+        sender_thread: Optional[threading.Thread] = None
+
+        try:
+            ws.connect(ws_url, header=headers)
+            ws.settimeout(self._RECV_TIMEOUT_SECONDS)
+
+            session_update = {
+                "type": "session.update",
+                "session": {
+                    "audio": {
+                        "output": {
+                            "language": self.output_language,
+                        },
+                    },
+                },
+            }
+            ws.send(json.dumps(session_update))
+            self._add_runtime_log(
+                "OPENAI",
+                f"Connected to realtime translation (lang={self.output_language}, model={self.model})",
+            )
+
+            sender_thread = threading.Thread(
+                target=self._audio_sender_loop,
+                args=(ws, connection_closed),
+                daemon=True,
+            )
+            sender_thread.start()
+
+            while not self._stop_event.is_set():
+                try:
+                    incoming = ws.recv()
+                except websocket.WebSocketTimeoutException:
+                    self._flush_transcript_buffer_if_stale()
+                    continue
+
+                if incoming is None:
+                    break
+                incoming = incoming.strip()
+                if incoming:
+                    self._handle_server_event(incoming)
+                # Also check here: a steady stream of other events would
+                # otherwise prevent the timeout branch from ever running.
+                self._flush_transcript_buffer_if_stale()
+        finally:
+            connection_closed.set()
+            try:
+                ws.close()
+            except Exception:
+                pass  # best-effort close; the socket may already be gone
+            self._flush_transcript_buffer(force=True)
+            if sender_thread is not None and sender_thread.is_alive():
+                sender_thread.join(timeout=1.0)
+
+    def _run_loop(self) -> None:
+        """Run sessions back to back until stop is requested, pausing between attempts."""
+        ws_url = self.ws_url_template.format(model=self.model)
+
+        while not self._stop_event.is_set():
+            try:
+                self._run_session(ws_url)
+            except Exception as exc:
+                if self._stop_event.is_set():
+                    break
+                self._add_runtime_log("OPENAI", f"Realtime connection failed: {exc}")
+
+            # Wait on the event rather than sleeping so stop() ends the back-off at once.
+            if self._stop_event.wait(self.reconnect_seconds):
+                break
+
+    def start(self) -> None:
+        """Start the background connection thread; does nothing if it is already running."""
+        if self._thread is not None and self._thread.is_alive():
+            return
+
+        self._clear_queue(self._audio_queue)
+        self._stop_event.clear()
+        with self._transcript_lock:
+            self._transcript_buffer = ""
+            self._last_delta_monotonic = 0.0
+
+        self._thread = threading.Thread(target=self._run_loop, daemon=True)
+        self._thread.start()
+
+    def stop(self) -> None:
+        """Request shutdown and wait up to two seconds for the background thread."""
+        self._stop_event.set()
+        try:
+            self._audio_queue.put_nowait(self._stop_sentinel)
+        except queue.Full:
+            # Queued audio is worthless once stopping; make room for the sentinel.
+            self._clear_queue(self._audio_queue)
+            try:
+                self._audio_queue.put_nowait(self._stop_sentinel)
+            except queue.Full:
+                pass  # a producer refilled the queue; the stop event still ends the sender
+
+        if self._thread is not None and self._thread.is_alive():
+            self._thread.join(timeout=2.0)
+
+    def enqueue_audio_chunk(self, chunk: np.ndarray, capture_sample_rate: int) -> None:
+        """Resample, encode and queue one captured chunk without blocking.
+
+        When the queue is full the oldest queued chunk is dropped to make room.
+        Empty chunks and non-positive sample rates are ignored.
+        """
+        if capture_sample_rate <= 0 or len(chunk) == 0:
+            return
+
+        resampled = self._resample_audio(chunk, capture_sample_rate, self.target_sample_rate)
+        encoded = self._float_audio_to_pcm16_base64(resampled)
+        if not encoded:
+            return
+
+        try:
+            self._audio_queue.put_nowait(encoded)
+        except queue.Full:
+            try:
+                self._audio_queue.get_nowait()
+            except queue.Empty:
+                pass
+            try:
+                self._audio_queue.put_nowait(encoded)
+            except queue.Full:
+                pass  # lost the race for the freed slot; drop this chunk
diff --git a/pyproject.toml b/pyproject.toml
index 0b7da0b..686ded2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,4 +11,5 @@ dependencies = [
"ollama>=0.6.1",
"pyside6>=6.11.0",
"sounddevice>=0.5.5",
+ "websocket-client>=1.8.0",
]
diff --git a/server.py b/server.py
index d1da010..fefe61e 100644
--- a/server.py
+++ b/server.py
@@ -6,6 +6,7 @@ import os
from collections import Counter, deque
import re
from typing import Any, Deque, Dict, Optional, Set, List, Iterator, Callable
+
from flask import Flask
from flask_cors import CORS
import ollama as _ollama
@@ -14,7 +15,9 @@ from ollama import ChatResponse
import numpy as np
import sounddevice as sd
from faster_whisper import WhisperModel
-from gui import select_settings, prompt_input_sample_rate, run_runtime_dashboard, run_with_loading_popup
+
+from gui.gui import select_settings, prompt_input_sample_rate, run_runtime_dashboard, run_with_loading_popup
+from openai_realtime import OpenAIRealtimeTranslator
from routes import register_routes
from config import _SYSTEM_PROMPT, _LLM_EMPTY_SENTINELS, _HALLUCINATION_PHRASES
@@ -30,11 +33,17 @@ RUNTIME_SUBTITLE_LINES_MAX: int = 120
RUNTIME_LOG_LINES_MAX: int = 300
USE_OLLAMA_CLEANUP: bool = True
+USE_OPENAI_REALTIME_TRANSLATE: bool = False
OLLAMA_MODEL: str = "qwen2.5:7b-instruct"
OLLAMA_CONTEXT_WINDOW: int = 6 # number of recent cleaned segments kept as context
OLLAMA_OPTIONS: Dict[str, Any] = {"num_gpu": 1}
RAW_BATCH_SIZE: int = 2 # accumulate this many raw Whisper lines before calling the LLM
+OPENAI_REALTIME_MODEL: str = "gpt-realtime-translate"
+OPENAI_API_KEY: str = ""
+OPENAI_OUTPUT_LANGUAGE: str = "es"
+OPENAI_SAFETY_IDENTIFIER: str = ""
+
SETTINGS_PATH: str = os.path.join(os.path.dirname(__file__), "settings.json")
DEFAULT_SETTINGS: Dict[str, Any] = {
@@ -53,6 +62,11 @@ DEFAULT_SETTINGS: Dict[str, Any] = {
"ollama_model": "qwen2.5:7b-instruct",
"ollama_context_window": 5,
"ollama_raw_batch_size": 1,
+ "use_openai_realtime_translate": False,
+ "openai_api_key": "",
+ "openai_output_language": "en",
+ "openai_model": "gpt-realtime-translate",
+ "openai_safety_identifier": "",
}
MODEL_CHOICES: List[str] = ["tiny", "base", "small", "medium", "large-v2", "large-v3", "distil-large-v3"]
@@ -92,11 +106,13 @@ CORS(app)
# OLLAMA stuff
llm_input_queue: queue.Queue = queue.Queue(maxsize=1)
-subtitle_context: Deque[str] = deque(maxlen=OLLAMA_CONTEXT_WINDOW) # sliding window context
+subtitle_context: Deque[str] = deque(maxlen=OLLAMA_CONTEXT_WINDOW) # sliding window context
subtitle_context_lock: threading.Lock = threading.Lock()
_raw_batch: List[str] = []
_raw_batch_lock: threading.Lock = threading.Lock()
+openai_realtime_client: Optional[OpenAIRealtimeTranslator] = None
+
def resample_audio(audio_np: np.ndarray, src_rate: int, dst_rate: int) -> np.ndarray:
"""
Resamples audio to TARGET_SAMPLE_RATE (default is 16000hz), speeds up inference time, fetched as a nd array
@@ -113,6 +129,8 @@ def resample_audio(audio_np: np.ndarray, src_rate: int, dst_rate: int) -> np.nda
return np.interp(x_new, x_old, audio_np).astype(np.float32)
+
+
def load_settings() -> Dict[str, Any]:
if not os.path.exists(SETTINGS_PATH):
return DEFAULT_SETTINGS.copy()
@@ -462,7 +480,7 @@ def publish_audio_activity(chunk_rms: float) -> None:
def audio_callback(indata: np.ndarray, frames: int, time_info: Any, status: Any) -> None:
"""
- Callback definition for audio sink. Unload all data into global audio_buffer
+ Callback definition for audio sink. Sends audio to local Whisper buffer or OpenAI realtime queue.
"""
if status:
print(f"Audio status: {status}")
@@ -471,6 +489,11 @@ def audio_callback(indata: np.ndarray, frames: int, time_info: Any, status: Any)
chunk_rms: float = float(np.sqrt(np.mean(np.square(chunk)))) if len(chunk) > 0 else 0.0
publish_audio_activity(chunk_rms)
+ if USE_OPENAI_REALTIME_TRANSLATE:
+ if openai_realtime_client is not None:
+ openai_realtime_client.enqueue_audio_chunk(chunk, CAPTURE_SAMPLE_RATE)
+ return
+
global audio_buffer
with lock:
audio_buffer = np.concatenate([audio_buffer, chunk])
@@ -526,9 +549,12 @@ def select_input_sample_rate(device_index: int, preferred_rate: int) -> int:
def main() -> None:
global CAPTURE_SAMPLE_RATE, MAX_SAMPLES, model, WHISPER_TASK, WHISPER_BEAM_SIZE, WHISPER_LANGUAGE
- global BUFFER_SECONDS, PROCESS_INTERVAL_SECONDS, USE_OLLAMA_CLEANUP
+ global BUFFER_SECONDS, PROCESS_INTERVAL_SECONDS, USE_OLLAMA_CLEANUP, USE_OPENAI_REALTIME_TRANSLATE
global OLLAMA_MODEL, OLLAMA_CONTEXT_WINDOW, RAW_BATCH_SIZE, subtitle_context
global AUDIO_ACTIVITY_THRESHOLD, last_audio_activity_payload, _audio_active_until, _audio_last_emit
+ global OPENAI_API_KEY, OPENAI_OUTPUT_LANGUAGE, OPENAI_REALTIME_MODEL, OPENAI_SAFETY_IDENTIFIER
+ global openai_realtime_client
+
start_subtitle_server()
settings: Dict[str, Any] = load_settings()
@@ -545,12 +571,22 @@ def main() -> None:
)
save_settings(settings)
- USE_OLLAMA_CLEANUP = bool(settings.get("use_ollama_cleanup", True))
+ USE_OPENAI_REALTIME_TRANSLATE = bool(settings.get("use_openai_realtime_translate", False))
+ OPENAI_REALTIME_MODEL = str(settings.get("openai_model", OPENAI_REALTIME_MODEL)).strip() or OPENAI_REALTIME_MODEL
+ OPENAI_OUTPUT_LANGUAGE = str(settings.get("openai_output_language", "es")).strip() or "es"
+ OPENAI_SAFETY_IDENTIFIER = str(settings.get("openai_safety_identifier", "")).strip()
+ OPENAI_API_KEY = str(settings.get("openai_api_key", "")).strip() or str(os.environ.get("OPENAI_API_KEY", "")).strip()
+
+ USE_OLLAMA_CLEANUP = bool(settings.get("use_ollama_cleanup", True)) and not USE_OPENAI_REALTIME_TRANSLATE
OLLAMA_OPTIONS["num_gpu"] = 0 if settings.get("ollama_device", "CPU").upper() == "CPU" else 1
- OLLAMA_MODEL = "qwen2.5:7b-instruct" if str(settings.get("ollama_model", OLLAMA_MODEL)) is None else str(settings.get("ollama_model", OLLAMA_MODEL))
+ OLLAMA_MODEL = str(settings.get("ollama_model", OLLAMA_MODEL)).strip() or OLLAMA_MODEL
OLLAMA_CONTEXT_WINDOW = int(settings.get("ollama_context_window", 6))
subtitle_context = deque(maxlen=OLLAMA_CONTEXT_WINDOW)
RAW_BATCH_SIZE = int(settings.get("ollama_raw_batch_size", 3))
+
+ if USE_OPENAI_REALTIME_TRANSLATE and not OPENAI_API_KEY:
+ raise RuntimeError("OpenAI realtime translation is enabled, but no API key was provided.")
+
if USE_OLLAMA_CLEANUP:
run_with_loading_popup(
title="Preparing Ollama model",
@@ -595,23 +631,12 @@ def main() -> None:
_audio_last_emit = 0.0
broadcast_event(SSE_EVENT_AUDIO_ACTIVITY, last_audio_activity_payload)
- model = WhisperModel(model_name, device=whisper_device, compute_type=compute_type)
-
device_info = sd.query_devices(device_index)
preferred_rate: int = int(device_info["default_samplerate"])
if preferred_rate <= 0:
preferred_rate = 48000
CAPTURE_SAMPLE_RATE = select_input_sample_rate(device_index, preferred_rate)
MAX_SAMPLES = int(CAPTURE_SAMPLE_RATE * BUFFER_SECONDS)
- print(f"Using device {device_index}: {device_info['name']}")
- print(f"Model: {model_name} | task={WHISPER_TASK} | beam_size={WHISPER_BEAM_SIZE}")
- print(f"Compute: device={whisper_device} | compute_type={compute_type}")
- print(f"Capture sample rate: {CAPTURE_SAMPLE_RATE} Hz (resampling to {TARGET_SAMPLE_RATE} Hz)")
- print(f"Audio activity threshold (RMS): {AUDIO_ACTIVITY_THRESHOLD}")
- print(f"Ollama cleanup: {'enabled' if USE_OLLAMA_CLEANUP else 'disabled'} (model={OLLAMA_MODEL})")
-
- processing_thread = threading.Thread(target=processing_loop, daemon=True)
- processing_thread.start()
with recent_subtitle_lines_lock:
recent_subtitle_lines.clear()
@@ -620,7 +645,46 @@ def main() -> None:
add_runtime_log("SYSTEM", "Runtime dashboard started")
add_runtime_log("SYSTEM", f"Device: {device_info['name']} @ {CAPTURE_SAMPLE_RATE} Hz")
- add_runtime_log("SYSTEM", f"Task={WHISPER_TASK} | Beam={WHISPER_BEAM_SIZE} | Cleanup={'on' if USE_OLLAMA_CLEANUP else 'off'}")
+
+ processing_thread: Optional[threading.Thread] = None
+
+ if USE_OPENAI_REALTIME_TRANSLATE:
+ openai_realtime_client = OpenAIRealtimeTranslator(
+ api_key=OPENAI_API_KEY,
+ model=OPENAI_REALTIME_MODEL,
+ output_language=OPENAI_OUTPUT_LANGUAGE,
+ safety_identifier=OPENAI_SAFETY_IDENTIFIER,
+ add_runtime_log=add_runtime_log,
+ broadcast_subtitle=broadcast_subtitle,
+ resample_audio=resample_audio,
+ )
+ openai_realtime_client.start()
+
+ print(f"Using device {device_index}: {device_info['name']}")
+ print(f"Realtime translation backend: OpenAI ({OPENAI_REALTIME_MODEL})")
+ print(
+ f"Capture sample rate: {CAPTURE_SAMPLE_RATE} Hz "
+ f"(resampling to {openai_realtime_client.target_sample_rate} Hz for OpenAI)"
+ )
+ print(f"Audio activity threshold (RMS): {AUDIO_ACTIVITY_THRESHOLD}")
+ print("Ollama cleanup: disabled (OpenAI realtime translation selected)")
+
+ add_runtime_log("SYSTEM", f"Backend=OpenAI realtime | model={OPENAI_REALTIME_MODEL} | lang={OPENAI_OUTPUT_LANGUAGE}")
+ else:
+ openai_realtime_client = None
+ model = WhisperModel(model_name, device=whisper_device, compute_type=compute_type)
+
+ print(f"Using device {device_index}: {device_info['name']}")
+ print(f"Model: {model_name} | task={WHISPER_TASK} | beam_size={WHISPER_BEAM_SIZE}")
+ print(f"Compute: device={whisper_device} | compute_type={compute_type}")
+ print(f"Capture sample rate: {CAPTURE_SAMPLE_RATE} Hz (resampling to {TARGET_SAMPLE_RATE} Hz)")
+ print(f"Audio activity threshold (RMS): {AUDIO_ACTIVITY_THRESHOLD}")
+ print(f"Ollama cleanup: {'enabled' if USE_OLLAMA_CLEANUP else 'disabled'} (model={OLLAMA_MODEL})")
+
+ add_runtime_log("SYSTEM", f"Task={WHISPER_TASK} | Beam={WHISPER_BEAM_SIZE} | Cleanup={'on' if USE_OLLAMA_CLEANUP else 'off'}")
+
+ processing_thread = threading.Thread(target=processing_loop, daemon=True)
+ processing_thread.start()
stream = sd.InputStream(
device=device_index,
@@ -644,6 +708,10 @@ def main() -> None:
on_close=_on_dashboard_close,
)
finally:
+ if USE_OPENAI_REALTIME_TRANSLATE and openai_realtime_client is not None:
+ openai_realtime_client.stop()
+ openai_realtime_client = None
+
try:
stream.stop()
except Exception:
diff --git a/uv.lock b/uv.lock
index 05b823b..cce3d4a 100644
--- a/uv.lock
+++ b/uv.lock
@@ -43,6 +43,7 @@ dependencies = [
{ name = "ollama" },
{ name = "pyside6" },
{ name = "sounddevice" },
+ { name = "websocket-client" },
]
[package.metadata]
@@ -53,6 +54,7 @@ requires-dist = [
{ name = "ollama", specifier = ">=0.6.1" },
{ name = "pyside6", specifier = ">=6.11.0" },
{ name = "sounddevice", specifier = ">=0.5.5" },
+ { name = "websocket-client", specifier = ">=1.8.0" },
]
[[package]]
@@ -897,6 +899,15 @@ wheels = [
]
[[package]]
+name = "websocket-client"
+version = "1.9.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/2c/41/aa4bf9664e4cda14c3b39865b12251e8e7d239f4cd0e3cc1b6c2ccde25c1/websocket_client-1.9.0.tar.gz", hash = "sha256:9e813624b6eb619999a97dc7958469217c3176312b3a16a4bd1bc7e08a46ec98", size = 70576, upload-time = "2025-10-07T21:16:36.495Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/34/db/b10e48aa8fff7407e67470363eac595018441cf32d5e1001567a7aeba5d2/websocket_client-1.9.0-py3-none-any.whl", hash = "sha256:af248a825037ef591efbf6ed20cc5faa03d3b47b9e5a2230a529eeee1c1fc3ef", size = 82616, upload-time = "2025-10-07T21:16:34.951Z" },
+]
+
+[[package]]
name = "werkzeug"
version = "3.1.8"
source = { registry = "https://pypi.org/simple" }
diff --git a/youtube-subtitle.user.js b/youtube-subtitle.user.js
index 1d3b687..e915c94 100644
--- a/youtube-subtitle.user.js
+++ b/youtube-subtitle.user.js
@@ -147,7 +147,7 @@
const footerText = document.createElement("div");
footerText.id = FOOTER_TEXT_ID;
- footerText.textContent = "auto live tl - message - drag me";
+ footerText.textContent = "Auto-Live-TL - Machine Translated - no translation should be taken as authoritative or quoted verbatim - (drag me)";
footerText.style.marginTop = "8px";
footerText.style.fontSize = "12px";
footerText.style.opacity = "0.7";
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage