fix: decode opus manually and use json audio transcription

This commit is contained in:
2026-06-08 06:29:29 +00:00
parent e6aac63aca
commit a191bcd1eb
4 changed files with 114 additions and 37 deletions
+2 -24
View File
@@ -10,8 +10,6 @@ from pathlib import Path
import discord import discord
from discord import app_commands from discord import app_commands
from discord.ext import commands, voice_recv from discord.ext import commands, voice_recv
from discord.ext.voice_recv import opus as voice_recv_opus
from discord.opus import OpusError
from dotenv import load_dotenv from dotenv import load_dotenv
import config import config
@@ -19,28 +17,8 @@ from helpers import chunk_message, command_channel_error
from openrouter_client import summarize, transcribe from openrouter_client import summarize, transcribe
from voice import MeetingRecorder from voice import MeetingRecorder
log = logging.getLogger(__name__) logging.getLogger("discord.ext.voice_recv.reader").setLevel(logging.WARNING)
logging.getLogger("discord.ext.voice_recv.gateway").setLevel(logging.WARNING)
# Keep a reference to the library's decoder so the wrapper below can delegate to it.
_original_decode_packet = voice_recv_opus.PacketDecoder._decode_packet


def _safe_decode_packet(self, packet):
    """Decode *packet*, degrading gracefully when opus decoding fails.

    Delegates to the original ``PacketDecoder._decode_packet``. If that raises
    ``OpusError`` the failure is logged and a fallback result is produced
    instead of letting the error propagate.

    Returns:
        A ``(packet, pcm)`` tuple; ``pcm`` is ``b""`` when nothing could be
        decoded.
    """
    try:
        return _original_decode_packet(self, packet)
    except OpusError as err:
        log.warning("Dropping corrupted opus packet for ssrc %s: %s", self.ssrc, err)

    # A truthy packet whose payload failed to decode is dropped outright.
    if packet:
        return packet, b""

    # NOTE(review): a falsy packet presumably stands in for a lost frame --
    # confirm against voice_recv. Try to rebuild it from the next buffered
    # packet (FEC), or let the decoder fill the gap when nothing is queued.
    try:
        upcoming = self._buffer.peek_next()
        if upcoming is None:
            pcm = self._decoder.decode(None, fec=False)
        else:
            pcm = self._decoder.decode(upcoming.decrypted_data, fec=True)
    except Exception:
        # Best effort only: any failure here yields empty audio.
        return packet, b""
    return packet, pcm


# Install the tolerant wrapper in place of the library implementation.
voice_recv_opus.PacketDecoder._decode_packet = _safe_decode_packet
load_dotenv() load_dotenv()
+30 -11
View File
@@ -1,6 +1,9 @@
from __future__ import annotations from __future__ import annotations
import base64
import json
import os import os
from pathlib import Path
from typing import Any from typing import Any
import httpx import httpx
@@ -26,19 +29,35 @@ def _auth_headers() -> dict[str, str]:
} }
def _audio_format(audio_path: str) -> str:
suffix = Path(audio_path).suffix.lower().lstrip(".")
return suffix or "wav"
def _build_transcription_payload(audio_path: str) -> dict[str, Any]:
    """Build the JSON request body for transcribing the file at *audio_path*.

    The whole file is read into memory and embedded as base64 text under
    ``input_audio.data``; ``input_audio.format`` comes from ``_audio_format``.
    """
    raw_audio = Path(audio_path).read_bytes()
    audio_block = {
        "data": base64.b64encode(raw_audio).decode("ascii"),
        "format": _audio_format(audio_path),
    }
    return {"model": "openai/whisper-large-v3", "input_audio": audio_block}
async def transcribe(audio_path: str) -> str: async def transcribe(audio_path: str) -> str:
"""Send a WAV file to OpenRouter's whisper model and return transcript text.""" """Send audio to OpenRouter's whisper model and return transcript text."""
headers = _auth_headers()
headers["Content-Type"] = "application/json"
async with httpx.AsyncClient(timeout=300) as client: async with httpx.AsyncClient(timeout=300) as client:
with open(audio_path, "rb") as audio_file: resp = await client.post(
files = { f"{OPENROUTER_BASE}/audio/transcriptions",
"file": (os.path.basename(audio_path), audio_file, "audio/wav"), headers=headers,
"model": (None, "openai/whisper-large-v3"), content=json.dumps(_build_transcription_payload(audio_path)),
} )
resp = await client.post(
f"{OPENROUTER_BASE}/audio/transcriptions",
headers=_auth_headers(),
files=files,
)
try: try:
resp.raise_for_status() resp.raise_for_status()
except httpx.HTTPStatusError as exc: except httpx.HTTPStatusError as exc:
+25
View File
@@ -0,0 +1,25 @@
import base64
import os
import sys
from pathlib import Path
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from openrouter_client import _audio_format, _build_transcription_payload
def test_audio_format_defaults_to_wav_when_missing_suffix(tmp_path: Path):
    """A file name without an extension is reported as ``wav``."""
    extensionless = tmp_path / "recording"
    extensionless.write_bytes(b"abc")

    detected = _audio_format(str(extensionless))

    assert detected == "wav"
def test_build_transcription_payload_uses_base64_json_shape(tmp_path: Path):
    """The payload names the model and embeds the audio as base64 plus format."""
    wav_file = tmp_path / "meeting.wav"
    wav_file.write_bytes(b"RIFFdemo")
    expected_data = base64.b64encode(b"RIFFdemo").decode("ascii")

    payload = _build_transcription_payload(str(wav_file))

    assert payload["model"] == "openai/whisper-large-v3"
    audio = payload["input_audio"]
    assert audio["format"] == "wav"
    assert audio["data"] == expected_data
+57 -2
View File
@@ -1,10 +1,65 @@
from __future__ import annotations from __future__ import annotations
import logging
import wave
from pathlib import Path from pathlib import Path
from typing import Callable from typing import Callable
from discord.opus import Decoder, OpusError
from discord.ext import voice_recv from discord.ext import voice_recv
log = logging.getLogger(__name__)
class OpusWaveSink(voice_recv.AudioSink):
    """Decode raw opus packets per-SSRC and write PCM to a wav file.

    Every SSRC gets its own ``Decoder`` instance, created the first time a
    packet for that SSRC is seen. Decoded PCM from all SSRCs is appended to a
    single wav file in the order ``write`` is called.
    """

    def __init__(self, output_path: str):
        """Open *output_path* for writing and configure the wav header.

        Channel count, sample width and frame rate are taken from the
        ``Decoder`` class constants.

        Raises:
            Whatever ``wave.open`` raises if the file cannot be opened.
        """
        super().__init__()
        self.output_path = output_path
        self._file: wave.Wave_write = wave.open(output_path, "wb")
        self._file.setnchannels(Decoder.CHANNELS)
        # SAMPLE_SIZE divided by the channel count gives the width of one sample.
        self._file.setsampwidth(Decoder.SAMPLE_SIZE // Decoder.CHANNELS)
        self._file.setframerate(Decoder.SAMPLING_RATE)
        # One decoder per SSRC, created on first use.
        self._decoders: dict[int, Decoder] = {}
        # Number of decode failures seen so far, per SSRC.
        self._error_counts: dict[int, int] = {}

    def wants_opus(self) -> bool:
        """Return True: ``write`` reads ``data.opus`` and decodes it itself."""
        return True

    def write(self, user, data) -> None:
        """Decode one received packet and append the PCM to the wav file.

        Args:
            user: Unused by this sink.
            data: Object exposing ``packet`` and ``opus`` attributes.

        Packets that fail to decode are dropped; the failure is logged only
        on the 1st, 10th and 100th occurrence per SSRC.
        """
        packet = data.packet
        ssrc = getattr(packet, "ssrc", 0)

        # Look up before constructing: ``setdefault(ssrc, Decoder())`` would
        # build (and discard) a fresh decoder on every single packet.
        decoder = self._decoders.get(ssrc)
        if decoder is None:
            decoder = self._decoders[ssrc] = Decoder()

        try:
            if packet:
                opus_bytes = data.opus
                if not opus_bytes:
                    return
                pcm = decoder.decode(opus_bytes, fec=False)
            else:
                # NOTE(review): a falsy packet presumably marks a lost frame;
                # decoding None lets the decoder fill the gap -- confirm
                # against discord.opus.Decoder.decode.
                pcm = decoder.decode(None, fec=False)
        except OpusError as exc:
            count = self._error_counts.get(ssrc, 0) + 1
            self._error_counts[ssrc] = count
            # Log only at a few milestones so a bad stream cannot flood the log.
            if count in {1, 10, 100}:
                log.warning(
                    "Dropping corrupted opus packet for ssrc %s: %s (count=%s)",
                    ssrc,
                    exc,
                    count,
                )
            return

        self._file.writeframes(pcm)

    def cleanup(self) -> None:
        """Close the wav file, logging (not raising) any error from closing."""
        try:
            self._file.close()
        except Exception:
            log.warning("OpusWaveSink got error closing file on cleanup", exc_info=True)
class MeetingRecorder: class MeetingRecorder:
"""Wrapper around discord-ext-voice-recv's listen/stop_listening API.""" """Wrapper around discord-ext-voice-recv's listen/stop_listening API."""
@@ -13,14 +68,14 @@ class MeetingRecorder:
self.vc = voice_client self.vc = voice_client
self.output_path = output_path self.output_path = output_path
self.recording = False self.recording = False
self.sink: voice_recv.WaveSink | None = None self.sink: OpusWaveSink | None = None
async def start(self, after_callback: Callable[[Exception | None], None]) -> None: async def start(self, after_callback: Callable[[Exception | None], None]) -> None:
if self.vc.is_listening(): if self.vc.is_listening():
raise RuntimeError("Voice client is already listening") raise RuntimeError("Voice client is already listening")
Path(self.output_path).parent.mkdir(parents=True, exist_ok=True) Path(self.output_path).parent.mkdir(parents=True, exist_ok=True)
self.sink = voice_recv.WaveSink(self.output_path) self.sink = OpusWaveSink(self.output_path)
self.vc.listen(self.sink, after=after_callback) self.vc.listen(self.sink, after=after_callback)
self.recording = True self.recording = True