From a191bcd1ebbd1e4a1b8870efdf69a09710bcd68a Mon Sep 17 00:00:00 2001
From: Pheby
Date: Mon, 8 Jun 2026 06:29:29 +0000
Subject: [PATCH] fix: decode opus manually and use json audio transcription

---
Review notes (ignored by git am; not part of the commit message):
- The line structure of this patch was rebuilt from a whitespace-collapsed
  copy. Indentation of context and removed lines is assumed (for example the
  `try:` that follows the `async with` block in transcribe) and should be
  checked against the tree before applying.
- bot.py no longer defines `log`; confirm nothing else in bot.py uses it.
- OpusWaveSink.write now creates each per-SSRC decoder lazily instead of
  building a throwaway Decoder() for every packet via dict.setdefault.
- transcribe now reads/encodes the recording in the default executor so the
  event loop is not blocked while a large file is processed.
- The `index` blob ids are those of the original commit and do not match the
  amended post-images of openrouter_client.py and voice.py.

 bot.py                          | 26 +------------
 openrouter_client.py            | 55 ++++++++++++++++++++++------
 tests/test_openrouter_client.py | 25 +++++++++++++
 voice.py                        | 71 +++++++++++++++++++++++++++++++++++-
 4 files changed, 140 insertions(+), 37 deletions(-)
 create mode 100644 tests/test_openrouter_client.py

diff --git a/bot.py b/bot.py
index 289b367..83a06f4 100644
--- a/bot.py
+++ b/bot.py
@@ -10,8 +10,6 @@ from pathlib import Path
 import discord
 from discord import app_commands
 from discord.ext import commands, voice_recv
-from discord.ext.voice_recv import opus as voice_recv_opus
-from discord.opus import OpusError
 from dotenv import load_dotenv
 
 import config
@@ -19,28 +17,8 @@ from helpers import chunk_message, command_channel_error
 from openrouter_client import summarize, transcribe
 from voice import MeetingRecorder
 
-log = logging.getLogger(__name__)
-
-_original_decode_packet = voice_recv_opus.PacketDecoder._decode_packet
-
-
-def _safe_decode_packet(self, packet):
-    try:
-        return _original_decode_packet(self, packet)
-    except OpusError as exc:
-        log.warning("Dropping corrupted opus packet for ssrc %s: %s", self.ssrc, exc)
-        if packet:
-            return packet, b""
-        try:
-            next_packet = self._buffer.peek_next()
-            if next_packet is not None:
-                return packet, self._decoder.decode(next_packet.decrypted_data, fec=True)
-            return packet, self._decoder.decode(None, fec=False)
-        except Exception:
-            return packet, b""
-
-
-voice_recv_opus.PacketDecoder._decode_packet = _safe_decode_packet
+logging.getLogger("discord.ext.voice_recv.reader").setLevel(logging.WARNING)
+logging.getLogger("discord.ext.voice_recv.gateway").setLevel(logging.WARNING)
 
 load_dotenv()
 
diff --git a/openrouter_client.py b/openrouter_client.py
index 1b0fbd0..14e671b 100644
--- a/openrouter_client.py
+++ b/openrouter_client.py
@@ -1,6 +1,10 @@
 from __future__ import annotations
 
+import asyncio
+import base64
+import json
 import os
+from pathlib import Path
 from typing import Any
 
 import httpx
@@ -26,19 +30,48 @@ def _auth_headers() -> dict[str, str]:
     }
 
 
+def _audio_format(audio_path: str) -> str:
+    """Return the lowercase file extension of *audio_path*, or "wav" if it has none."""
+    suffix = Path(audio_path).suffix.lower().lstrip(".")
+    return suffix or "wav"
+
+
+def _build_transcription_payload(audio_path: str) -> dict[str, Any]:
+    """Read *audio_path* and build the JSON body for the transcription request.
+
+    The audio bytes are base64-encoded into ``input_audio.data``; the format is
+    taken from the file extension. This reads the whole file synchronously.
+    """
+    with open(audio_path, "rb") as audio_file:
+        encoded = base64.b64encode(audio_file.read()).decode("ascii")
+
+    return {
+        "model": "openai/whisper-large-v3",
+        "input_audio": {
+            "data": encoded,
+            "format": _audio_format(audio_path),
+        },
+    }
+
+
 async def transcribe(audio_path: str) -> str:
-    """Send a WAV file to OpenRouter's whisper model and return transcript text."""
+    """Send audio to OpenRouter's whisper model and return transcript text."""
+    headers = _auth_headers()
+    headers["Content-Type"] = "application/json"
+
+    # Reading, base64-encoding and serializing a long recording is blocking
+    # work; run it in the default executor so the event loop keeps running.
+    loop = asyncio.get_running_loop()
+    body = await loop.run_in_executor(
+        None, lambda: json.dumps(_build_transcription_payload(audio_path))
+    )
+
     async with httpx.AsyncClient(timeout=300) as client:
-        with open(audio_path, "rb") as audio_file:
-            files = {
-                "file": (os.path.basename(audio_path), audio_file, "audio/wav"),
-                "model": (None, "openai/whisper-large-v3"),
-            }
-            resp = await client.post(
-                f"{OPENROUTER_BASE}/audio/transcriptions",
-                headers=_auth_headers(),
-                files=files,
-            )
+        resp = await client.post(
+            f"{OPENROUTER_BASE}/audio/transcriptions",
+            headers=headers,
+            content=body,
+        )
     try:
         resp.raise_for_status()
     except httpx.HTTPStatusError as exc:
diff --git a/tests/test_openrouter_client.py b/tests/test_openrouter_client.py
new file mode 100644
index 0000000..fc79e56
--- /dev/null
+++ b/tests/test_openrouter_client.py
@@ -0,0 +1,25 @@
+import base64
+import os
+import sys
+from pathlib import Path
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
+
+from openrouter_client import _audio_format, _build_transcription_payload
+
+
+def test_audio_format_defaults_to_wav_when_missing_suffix(tmp_path: Path):
+    path = tmp_path / "recording"
+    path.write_bytes(b"abc")
+    assert _audio_format(str(path)) == "wav"
+
+
+def test_build_transcription_payload_uses_base64_json_shape(tmp_path: Path):
+    path = tmp_path / "meeting.wav"
+    path.write_bytes(b"RIFFdemo")
+
+    payload = _build_transcription_payload(str(path))
+
+    assert payload["model"] == "openai/whisper-large-v3"
+    assert payload["input_audio"]["format"] == "wav"
+    assert payload["input_audio"]["data"] == base64.b64encode(b"RIFFdemo").decode("ascii")
diff --git a/voice.py b/voice.py
index 2ddf2d1..df8541d 100644
--- a/voice.py
+++ b/voice.py
@@ -1,10 +1,77 @@
 from __future__ import annotations
 
+import logging
+import wave
 from pathlib import Path
 from typing import Callable
 
+from discord.opus import Decoder, OpusError
 from discord.ext import voice_recv
 
+log = logging.getLogger(__name__)
+
+
+class OpusWaveSink(voice_recv.AudioSink):
+    """Decode raw opus packets per-SSRC and write PCM to a wav file."""
+
+    def __init__(self, output_path: str):
+        """Open *output_path* for writing and set the wav parameters from ``Decoder``."""
+        super().__init__()
+        self.output_path = output_path
+        self._file: wave.Wave_write = wave.open(output_path, "wb")
+        self._file.setnchannels(Decoder.CHANNELS)
+        self._file.setsampwidth(Decoder.SAMPLE_SIZE // Decoder.CHANNELS)
+        self._file.setframerate(Decoder.SAMPLING_RATE)
+        self._decoders: dict[int, Decoder] = {}
+        self._error_counts: dict[int, int] = {}
+
+    def wants_opus(self) -> bool:
+        """Return True so the sink is handed opus data; decoding happens in ``write``."""
+        return True
+
+    def write(self, user, data) -> None:
+        """Decode one packet with its SSRC's decoder and append the PCM to the file.
+
+        Packets that raise ``OpusError`` are dropped; a warning is logged on the
+        1st, 10th and 100th failure per SSRC.
+        """
+        packet = data.packet
+        ssrc = getattr(packet, "ssrc", 0)
+        decoder = self._decoders.get(ssrc)
+        if decoder is None:
+            # Create lazily: setdefault(ssrc, Decoder()) would build a new
+            # opus decoder for every packet, even when one is already cached.
+            decoder = self._decoders[ssrc] = Decoder()
+
+        try:
+            if packet:
+                opus_bytes = data.opus
+                if not opus_bytes:
+                    return
+                pcm = decoder.decode(opus_bytes, fec=False)
+            else:
+                pcm = decoder.decode(None, fec=False)
+        except OpusError as exc:
+            count = self._error_counts.get(ssrc, 0) + 1
+            self._error_counts[ssrc] = count
+            if count in {1, 10, 100}:
+                log.warning(
+                    "Dropping corrupted opus packet for ssrc %s: %s (count=%s)",
+                    ssrc,
+                    exc,
+                    count,
+                )
+            return
+
+        self._file.writeframes(pcm)
+
+    def cleanup(self) -> None:
+        """Close the wav file, logging (not raising) any error."""
+        try:
+            self._file.close()
+        except Exception:
+            log.warning("OpusWaveSink got error closing file on cleanup", exc_info=True)
+
 
 class MeetingRecorder:
     """Wrapper around discord-ext-voice-recv's listen/stop_listening API."""
@@ -13,14 +80,14 @@ class MeetingRecorder:
         self.vc = voice_client
         self.output_path = output_path
         self.recording = False
-        self.sink: voice_recv.WaveSink | None = None
+        self.sink: OpusWaveSink | None = None
 
     async def start(self, after_callback: Callable[[Exception | None], None]) -> None:
         if self.vc.is_listening():
             raise RuntimeError("Voice client is already listening")
 
         Path(self.output_path).parent.mkdir(parents=True, exist_ok=True)
-        self.sink = voice_recv.WaveSink(self.output_path)
+        self.sink = OpusWaveSink(self.output_path)
         self.vc.listen(self.sink, after=after_callback)
         self.recording = True
 