fix: decode opus manually and use json audio transcription
This commit is contained in:
@@ -10,8 +10,6 @@ from pathlib import Path
|
|||||||
import discord
|
import discord
|
||||||
from discord import app_commands
|
from discord import app_commands
|
||||||
from discord.ext import commands, voice_recv
|
from discord.ext import commands, voice_recv
|
||||||
from discord.ext.voice_recv import opus as voice_recv_opus
|
|
||||||
from discord.opus import OpusError
|
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
import config
|
import config
|
||||||
@@ -19,28 +17,8 @@ from helpers import chunk_message, command_channel_error
|
|||||||
from openrouter_client import summarize, transcribe
|
from openrouter_client import summarize, transcribe
|
||||||
from voice import MeetingRecorder
|
from voice import MeetingRecorder
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
logging.getLogger("discord.ext.voice_recv.reader").setLevel(logging.WARNING)
|
||||||
|
logging.getLogger("discord.ext.voice_recv.gateway").setLevel(logging.WARNING)
|
||||||
_original_decode_packet = voice_recv_opus.PacketDecoder._decode_packet
|
|
||||||
|
|
||||||
|
|
||||||
def _safe_decode_packet(self, packet):
    """Call the original ``_decode_packet`` and survive opus decode errors.

    Returns whatever the wrapped ``_original_decode_packet`` returns when it
    succeeds. If it raises ``OpusError`` a warning is logged and a
    ``(packet, pcm)`` tuple is returned instead:

    * truthy ``packet`` -> ``pcm`` is ``b""`` (the packet is dropped);
    * falsy ``packet`` -> one more decode is attempted, using the next
      buffered packet with ``fec=True`` when one is available, otherwise
      ``decode(None, fec=False)``; any failure there also yields ``b""``.
    """
    try:
        return _original_decode_packet(self, packet)
    except OpusError as exc:
        log.warning("Dropping corrupted opus packet for ssrc %s: %s", self.ssrc, exc)

    # Only reached after an OpusError from the wrapped decoder.
    if packet:
        return packet, b""

    # NOTE(review): a falsy packet presumably stands in for a lost one, hence
    # the recovery attempt below - confirm against voice_recv.
    try:
        upcoming = self._buffer.peek_next()
        if upcoming is None:
            recovered = self._decoder.decode(None, fec=False)
        else:
            recovered = self._decoder.decode(upcoming.decrypted_data, fec=True)
    except Exception:
        # Best effort: the recovery attempt itself failed, emit nothing.
        return packet, b""
    return packet, recovered
|
|
||||||
|
|
||||||
|
|
||||||
voice_recv_opus.PacketDecoder._decode_packet = _safe_decode_packet
|
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
|
|||||||
+27
-8
@@ -1,6 +1,9 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
@@ -26,18 +29,34 @@ def _auth_headers() -> dict[str, str]:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
async def transcribe(audio_path: str) -> str:
|
def _audio_format(audio_path: str) -> str:
|
||||||
"""Send a WAV file to OpenRouter's whisper model and return transcript text."""
|
suffix = Path(audio_path).suffix.lower().lstrip(".")
|
||||||
async with httpx.AsyncClient(timeout=300) as client:
|
return suffix or "wav"
|
||||||
|
|
||||||
|
|
||||||
|
def _build_transcription_payload(audio_path: str) -> dict[str, Any]:
|
||||||
with open(audio_path, "rb") as audio_file:
|
with open(audio_path, "rb") as audio_file:
|
||||||
files = {
|
encoded = base64.b64encode(audio_file.read()).decode("ascii")
|
||||||
"file": (os.path.basename(audio_path), audio_file, "audio/wav"),
|
|
||||||
"model": (None, "openai/whisper-large-v3"),
|
return {
|
||||||
|
"model": "openai/whisper-large-v3",
|
||||||
|
"input_audio": {
|
||||||
|
"data": encoded,
|
||||||
|
"format": _audio_format(audio_path),
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def transcribe(audio_path: str) -> str:
|
||||||
|
"""Send audio to OpenRouter's whisper model and return transcript text."""
|
||||||
|
headers = _auth_headers()
|
||||||
|
headers["Content-Type"] = "application/json"
|
||||||
|
|
||||||
|
async with httpx.AsyncClient(timeout=300) as client:
|
||||||
resp = await client.post(
|
resp = await client.post(
|
||||||
f"{OPENROUTER_BASE}/audio/transcriptions",
|
f"{OPENROUTER_BASE}/audio/transcriptions",
|
||||||
headers=_auth_headers(),
|
headers=headers,
|
||||||
files=files,
|
content=json.dumps(_build_transcription_payload(audio_path)),
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
|
|||||||
@@ -0,0 +1,25 @@
|
|||||||
|
import base64
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
||||||
|
|
||||||
|
from openrouter_client import _audio_format, _build_transcription_payload
|
||||||
|
|
||||||
|
|
||||||
|
def test_audio_format_defaults_to_wav_when_missing_suffix(tmp_path: Path):
    """A path without a file extension is reported as the ``wav`` format."""
    extensionless = tmp_path / "recording"
    extensionless.write_bytes(b"abc")

    detected = _audio_format(str(extensionless))

    assert detected == "wav"
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_transcription_payload_uses_base64_json_shape(tmp_path: Path):
    """The payload holds the model name, the audio format and base64 audio."""
    raw_audio = b"RIFFdemo"
    wav_file = tmp_path / "meeting.wav"
    wav_file.write_bytes(raw_audio)

    result = _build_transcription_payload(str(wav_file))
    audio_part = result["input_audio"]

    assert result["model"] == "openai/whisper-large-v3"
    assert audio_part["format"] == "wav"
    assert audio_part["data"] == base64.b64encode(raw_audio).decode("ascii")
|
||||||
@@ -1,10 +1,65 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import wave
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Callable
|
from typing import Callable
|
||||||
|
|
||||||
|
from discord.opus import Decoder, OpusError
|
||||||
from discord.ext import voice_recv
|
from discord.ext import voice_recv
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class OpusWaveSink(voice_recv.AudioSink):
    """Decode raw opus packets per-SSRC and write PCM to a wav file.

    The sink requests undecoded opus data (``wants_opus`` returns ``True``)
    and decodes it itself with one ``Decoder`` per SSRC. A packet that raises
    ``OpusError`` is dropped instead of aborting the recording.
    """

    # Per-SSRC failure counts at which a warning is emitted; all other
    # failures are dropped silently to keep the log readable.
    _WARN_AT_COUNTS = frozenset({1, 10, 100})

    def __init__(self, output_path: str):
        """Open ``output_path`` for writing and configure the wav header.

        Channel count, sample width and frame rate are taken from the
        ``Decoder`` class constants.
        """
        super().__init__()
        self.output_path = output_path
        self._file: wave.Wave_write = wave.open(output_path, "wb")
        self._file.setnchannels(Decoder.CHANNELS)
        self._file.setsampwidth(Decoder.SAMPLE_SIZE // Decoder.CHANNELS)
        self._file.setframerate(Decoder.SAMPLING_RATE)
        # One decoder per SSRC, created on first use.
        self._decoders: dict[int, Decoder] = {}
        # Number of packets dropped because of OpusError, keyed by SSRC.
        self._error_counts: dict[int, int] = {}

    def wants_opus(self) -> bool:
        """Return ``True``: this sink decodes opus data itself."""
        return True

    def write(self, user, data) -> None:
        """Decode one received packet and append its PCM to the wav file.

        ``user`` is not used. A truthy packet with empty opus data is skipped;
        a falsy packet is passed to the decoder as ``None``. Packets that
        raise ``OpusError`` are counted per SSRC and dropped.
        """
        packet = data.packet
        ssrc = getattr(packet, "ssrc", 0)

        # Look up first and create only on a miss: ``setdefault(ssrc,
        # Decoder())`` would build (and immediately discard) a new decoder
        # for every packet once the SSRC is known.
        decoder = self._decoders.get(ssrc)
        if decoder is None:
            decoder = self._decoders[ssrc] = Decoder()

        try:
            if packet:
                opus_bytes = data.opus
                if not opus_bytes:
                    return
                pcm = decoder.decode(opus_bytes, fec=False)
            else:
                # NOTE(review): a falsy packet presumably marks a lost packet
                # and decode(None) conceals the gap - confirm against
                # voice_recv / discord.opus.
                pcm = decoder.decode(None, fec=False)
        except OpusError as exc:
            count = self._error_counts.get(ssrc, 0) + 1
            self._error_counts[ssrc] = count
            if count in self._WARN_AT_COUNTS:
                log.warning(
                    "Dropping corrupted opus packet for ssrc %s: %s (count=%s)",
                    ssrc,
                    exc,
                    count,
                )
            return

        self._file.writeframes(pcm)

    def cleanup(self) -> None:
        """Close the wav file, logging (not raising) any error from close."""
        try:
            self._file.close()
        except Exception:
            log.warning("OpusWaveSink got error closing file on cleanup", exc_info=True)
|
||||||
|
|
||||||
|
|
||||||
class MeetingRecorder:
|
class MeetingRecorder:
|
||||||
"""Wrapper around discord-ext-voice-recv's listen/stop_listening API."""
|
"""Wrapper around discord-ext-voice-recv's listen/stop_listening API."""
|
||||||
@@ -13,14 +68,14 @@ class MeetingRecorder:
|
|||||||
self.vc = voice_client
|
self.vc = voice_client
|
||||||
self.output_path = output_path
|
self.output_path = output_path
|
||||||
self.recording = False
|
self.recording = False
|
||||||
self.sink: voice_recv.WaveSink | None = None
|
self.sink: OpusWaveSink | None = None
|
||||||
|
|
||||||
async def start(self, after_callback: Callable[[Exception | None], None]) -> None:
|
async def start(self, after_callback: Callable[[Exception | None], None]) -> None:
|
||||||
if self.vc.is_listening():
|
if self.vc.is_listening():
|
||||||
raise RuntimeError("Voice client is already listening")
|
raise RuntimeError("Voice client is already listening")
|
||||||
|
|
||||||
Path(self.output_path).parent.mkdir(parents=True, exist_ok=True)
|
Path(self.output_path).parent.mkdir(parents=True, exist_ok=True)
|
||||||
self.sink = voice_recv.WaveSink(self.output_path)
|
self.sink = OpusWaveSink(self.output_path)
|
||||||
self.vc.listen(self.sink, after=after_callback)
|
self.vc.listen(self.sink, after=after_callback)
|
||||||
self.recording = True
|
self.recording = True
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user