fix: normalize meeting audio before transcription

This commit is contained in:
2026-06-08 06:37:59 +00:00
parent a191bcd1eb
commit f629794a50
2 changed files with 62 additions and 16 deletions
+61 -16
View File
@@ -1,8 +1,10 @@
from __future__ import annotations from __future__ import annotations
import asyncio
import base64 import base64
import json import json
import os import os
import tempfile
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
@@ -44,31 +46,74 @@ def _build_transcription_payload(audio_path: str) -> dict[str, Any]:
"data": encoded, "data": encoded,
"format": _audio_format(audio_path), "format": _audio_format(audio_path),
}, },
"language": "en",
} }
async def _normalize_audio_for_transcription(audio_path: str) -> str:
    """Convert *audio_path* to a mono, 16 kHz, 16-bit PCM WAV file via ffmpeg.

    The converted audio is written to a fresh temporary file (prefix
    ``meeting-normalized-``, suffix ``.wav``). On success the caller owns that
    file and is responsible for deleting it.

    Args:
        audio_path: Path of the source audio file handed to ffmpeg as input.

    Returns:
        Path of the temporary normalized WAV file.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status. The message
            carries ffmpeg's stderr, or the exit code when stderr is empty.

    Any other exception (for example the ffmpeg executable not being found,
    or task cancellation) propagates unchanged, but only after the temporary
    file has been removed and a still-running ffmpeg has been killed.
    """
    source = Path(audio_path)
    fd, normalized_path = tempfile.mkstemp(prefix="meeting-normalized-", suffix=".wav")
    # Only the reserved path is needed; ffmpeg opens the file itself.
    os.close(fd)

    proc = None
    try:
        proc = await asyncio.create_subprocess_exec(
            "ffmpeg",
            "-y",  # mkstemp already created the output file, so allow overwrite
            "-i",
            str(source),
            "-ac",
            "1",
            "-ar",
            "16000",
            "-c:a",
            "pcm_s16le",
            normalized_path,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.PIPE,
        )
        _, stderr = await proc.communicate()
        if proc.returncode != 0:
            detail = stderr.decode("utf-8", errors="replace").strip()
            raise RuntimeError(
                "Audio normalization failed: " + (detail or f"ffmpeg exited {proc.returncode}")
            )
    except BaseException:
        # Covers launch failures and cancellation as well as a bad exit code:
        # never leave the temp file (or a running ffmpeg) behind on failure.
        try:
            if proc is not None and proc.returncode is None:
                try:
                    proc.kill()
                except ProcessLookupError:
                    pass  # the process exited between the check and the kill
                await proc.wait()
        finally:
            try:
                os.remove(normalized_path)
            except OSError:
                pass  # best-effort cleanup; the original error matters more
        raise
    return normalized_path
async def transcribe(audio_path: str) -> str:
    """Send audio to OpenRouter's whisper model and return transcript text.

    The audio is first normalized into a temporary WAV file with
    ``_normalize_audio_for_transcription``; that file is what gets encoded
    into the request payload, and it is always deleted before returning.

    Args:
        audio_path: Path of the recorded audio to transcribe.

    Returns:
        The transcript text with surrounding whitespace stripped.

    Raises:
        RuntimeError: If normalization fails, if the API responds with an
            HTTP error status, or if the response carries no usable text.
    """
    headers = _auth_headers()
    headers["Content-Type"] = "application/json"
    normalized_path = await _normalize_audio_for_transcription(audio_path)
    try:
        async with httpx.AsyncClient(timeout=300) as client:
            resp = await client.post(
                f"{OPENROUTER_BASE}/audio/transcriptions",
                headers=headers,
                content=json.dumps(_build_transcription_payload(normalized_path)),
            )
            try:
                resp.raise_for_status()
            except httpx.HTTPStatusError as exc:
                detail = summarize_error(_safe_json(resp), fallback=resp.text)
                raise RuntimeError(
                    f"OpenRouter transcription failed ({resp.status_code}): {detail}"
                ) from exc
        data = resp.json()
        # Treat a missing, null, or non-string "text" like empty text so the
        # caller gets the RuntimeError below instead of an AttributeError.
        raw_text = data.get("text") if isinstance(data, dict) else None
        text = raw_text.strip() if isinstance(raw_text, str) else ""
        if not text:
            raise RuntimeError("OpenRouter transcription returned empty text")
        return text
    finally:
        # The normalized copy is only needed for the upload.
        try:
            os.remove(normalized_path)
        except OSError:
            pass  # best-effort cleanup; never mask the real outcome
async def summarize(transcript: str) -> str: async def summarize(transcript: str) -> str:
+1
View File
@@ -21,5 +21,6 @@ def test_build_transcription_payload_uses_base64_json_shape(tmp_path: Path):
payload = _build_transcription_payload(str(path)) payload = _build_transcription_payload(str(path))
assert payload["model"] == "openai/whisper-large-v3" assert payload["model"] == "openai/whisper-large-v3"
assert payload["language"] == "en"
assert payload["input_audio"]["format"] == "wav" assert payload["input_audio"]["format"] == "wav"
assert payload["input_audio"]["data"] == base64.b64encode(b"RIFFdemo").decode("ascii") assert payload["input_audio"]["data"] == base64.b64encode(b"RIFFdemo").decode("ascii")