fix: decode opus manually and use json audio transcription

2026-06-08 06:29:29 +00:00
parent e6aac63aca
commit a191bcd1eb
4 changed files with 114 additions and 37 deletions
@@ -10,8 +10,6 @@ from pathlib import Path
 import discord
 from discord import app_commands
 from discord.ext import commands, voice_recv
 from discord.ext.voice_recv import opus as voice_recv_opus
 from discord.opus import OpusError
 from dotenv import load_dotenv
 import config
@@ -19,28 +17,8 @@ from helpers import chunk_message, command_channel_error
 from openrouter_client import summarize, transcribe
 from voice import MeetingRecorder
-log = logging.getLogger(__name__)
+logging.getLogger("discord.ext.voice_recv.reader").setLevel(logging.WARNING)
-
+logging.getLogger("discord.ext.voice_recv.gateway").setLevel(logging.WARNING)
 # Keep a reference to the library implementation so the wrapper can delegate to it.
 _original_decode_packet = voice_recv_opus.PacketDecoder._decode_packet
 def _safe_decode_packet(self, packet):
    """Delegate to the saved ``_decode_packet`` and fall back when it raises ``OpusError``.

    On ``OpusError`` a warning is logged. A truthy ``packet`` is then returned
    together with empty bytes. For a falsy ``packet`` the next buffered packet
    is decoded with ``fec=True`` if one is available, otherwise ``None`` is
    decoded with ``fec=False``; any exception during that fallback also
    yields empty bytes.
    """
    try:
        return _original_decode_packet(self, packet)
    except OpusError as exc:
        log.warning("Dropping corrupted opus packet for ssrc %s: %s", self.ssrc, exc)
    # Only reached after an OpusError was logged above.
    if packet:
        return packet, b""
    try:
        upcoming = self._buffer.peek_next()
        if upcoming is None:
            recovered = self._decoder.decode(None, fec=False)
        else:
            recovered = self._decoder.decode(upcoming.decrypted_data, fec=True)
        return packet, recovered
    except Exception:
        # Best-effort recovery: hand back empty audio rather than propagate.
        return packet, b""
 # Install the wrapper in place of the library method.
 voice_recv_opus.PacketDecoder._decode_packet = _safe_decode_packet
 load_dotenv()
@@ -1,6 +1,9 @@
 from __future__ import annotations
 import base64
 import json
 import os
 from pathlib import Path
 from typing import Any
 import httpx
@@ -26,19 +29,35 @@ def _auth_headers() -> dict[str, str]:
    }
 def _audio_format(audio_path: str) -> str:
    suffix = Path(audio_path).suffix.lower().lstrip(".")
    return suffix or "wav"
 def _build_transcription_payload(audio_path: str) -> dict[str, Any]:
    """Build the JSON-serialisable request body for transcribing ``audio_path``.

    The file's bytes are base64-encoded into ``input_audio.data`` and the
    format string comes from ``_audio_format``.
    """
    raw_audio = Path(audio_path).read_bytes()
    input_audio = {
        "data": base64.b64encode(raw_audio).decode("ascii"),
        "format": _audio_format(audio_path),
    }
    return {"model": "openai/whisper-large-v3", "input_audio": input_audio}
 async def transcribe(audio_path: str) -> str:
-    """Send a WAV file to OpenRouter's whisper model and return transcript text."""
+    """Send audio to OpenRouter's whisper model and return transcript text."""
    headers = _auth_headers()
    headers["Content-Type"] = "application/json"
    async with httpx.AsyncClient(timeout=300) as client:
-        with open(audio_path, "rb") as audio_file:
+        resp = await client.post(
-            files = {
+            f"{OPENROUTER_BASE}/audio/transcriptions",
-                "file": (os.path.basename(audio_path), audio_file, "audio/wav"),
+            headers=headers,
-                "model": (None, "openai/whisper-large-v3"),
+            content=json.dumps(_build_transcription_payload(audio_path)),
-            }
+        )
            resp = await client.post(
                f"{OPENROUTER_BASE}/audio/transcriptions",
                headers=_auth_headers(),
                files=files,
            )
        try:
            resp.raise_for_status()
        except httpx.HTTPStatusError as exc:
@@ -0,0 +1,25 @@
 import base64
 import os
 import sys
 from pathlib import Path
 sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
 from openrouter_client import _audio_format, _build_transcription_payload
 def test_audio_format_defaults_to_wav_when_missing_suffix(tmp_path: Path):
    """A file name with no extension is reported as ``"wav"``."""
    extensionless = tmp_path / "recording"
    extensionless.write_bytes(b"abc")
    detected = _audio_format(str(extensionless))
    assert detected == "wav"
 def test_build_transcription_payload_uses_base64_json_shape(tmp_path: Path):
    """The payload carries the model name, the file format and base64 audio data."""
    raw = b"RIFFdemo"
    wav_path = tmp_path / "meeting.wav"
    wav_path.write_bytes(raw)
    payload = _build_transcription_payload(str(wav_path))
    assert payload["model"] == "openai/whisper-large-v3"
    audio = payload["input_audio"]
    assert audio["format"] == "wav"
    assert audio["data"] == base64.b64encode(raw).decode("ascii")
@@ -1,10 +1,65 @@
 from __future__ import annotations
 import logging
 import wave
 from pathlib import Path
 from typing import Callable
 from discord.opus import Decoder, OpusError
 from discord.ext import voice_recv
 log = logging.getLogger(__name__)
 class OpusWaveSink(voice_recv.AudioSink):
    """Decode raw opus packets per-SSRC and write PCM to a wav file.

    One ``Decoder`` is kept per SSRC. Decoded PCM from every SSRC is appended
    to the same wav file in the order ``write`` is called.
    """
    def __init__(self, output_path: str):
        """Open ``output_path`` for writing and set the wav parameters.

        Args:
            output_path: Path of the wav file to create.
        """
        super().__init__()
        self.output_path = output_path
        self._file: wave.Wave_write = wave.open(output_path, "wb")
        self._file.setnchannels(Decoder.CHANNELS)
        # Sample width per channel, derived from the decoder's constants.
        self._file.setsampwidth(Decoder.SAMPLE_SIZE // Decoder.CHANNELS)
        self._file.setframerate(Decoder.SAMPLING_RATE)
        # Decoder instances keyed by SSRC, created on first use in write().
        self._decoders: dict[int, Decoder] = {}
        # Number of OpusError failures seen so far, keyed by SSRC.
        self._error_counts: dict[int, int] = {}
    def wants_opus(self) -> bool:
        """Return True: this sink asks for opus data and decodes it in ``write``."""
        return True
    def write(self, user, data) -> None:
        """Decode one piece of received voice data and append its PCM to the file.

        Data that raises ``OpusError`` is dropped; a warning is logged on the
        1st, 10th and 100th failure for a given SSRC.

        Args:
            user: Not used by this sink.
            data: Object providing ``packet`` and ``opus`` attributes.
        """
        packet = data.packet
        ssrc = getattr(packet, "ssrc", 0)
        # dict.setdefault(ssrc, Decoder()) evaluates its default eagerly, which
        # would construct (and discard) a new Decoder on every call. Create one
        # only the first time an SSRC is seen.
        decoder = self._decoders.get(ssrc)
        if decoder is None:
            decoder = self._decoders[ssrc] = Decoder()
        try:
            if packet:
                opus_bytes = data.opus
                if not opus_bytes:
                    return
                pcm = decoder.decode(opus_bytes, fec=False)
            else:
                # Falsy packet: let the decoder produce output without input data.
                pcm = decoder.decode(None, fec=False)
        except OpusError as exc:
            count = self._error_counts.get(ssrc, 0) + 1
            self._error_counts[ssrc] = count
            # Log only at selected counts to avoid flooding the log.
            if count in {1, 10, 100}:
                log.warning(
                    "Dropping corrupted opus packet for ssrc %s: %s (count=%s)",
                    ssrc,
                    exc,
                    count,
                )
            return
        self._file.writeframes(pcm)
    def cleanup(self) -> None:
        """Close the wav file, logging (not raising) any error from the close."""
        try:
            self._file.close()
        except Exception:
            log.warning("OpusWaveSink got error closing file on cleanup", exc_info=True)
 class MeetingRecorder:
    """Wrapper around discord-ext-voice-recv's listen/stop_listening API."""
@@ -13,14 +68,14 @@ class MeetingRecorder:
        self.vc = voice_client
        self.output_path = output_path
        self.recording = False
-        self.sink: voice_recv.WaveSink | None = None
+        self.sink: OpusWaveSink | None = None
    async def start(self, after_callback: Callable[[Exception | None], None]) -> None:
        if self.vc.is_listening():
            raise RuntimeError("Voice client is already listening")
        Path(self.output_path).parent.mkdir(parents=True, exist_ok=True)
-        self.sink = voice_recv.WaveSink(self.output_path)
+        self.sink = OpusWaveSink(self.output_path)
        self.vc.listen(self.sink, after=after_callback)
        self.recording = True