fix: normalize meeting audio before transcription

2026-06-08 06:37:59 +00:00
parent a191bcd1eb
commit f629794a50
2 changed files with 62 additions and 16 deletions
@@ -1,8 +1,10 @@
 from __future__ import annotations
 import asyncio
 import base64
 import json
 import os
 import tempfile
 from pathlib import Path
 from typing import Any
@@ -44,31 +46,74 @@ def _build_transcription_payload(audio_path: str) -> dict[str, Any]:
            "data": encoded,
            "format": _audio_format(audio_path),
        },
        "language": "en",
    }
 async def _normalize_audio_for_transcription(audio_path: str) -> str:
     """Transcode *audio_path* to a mono, 16 kHz, 16-bit PCM WAV using ffmpeg.

     The result is written to a freshly created temporary file. On success the
     caller owns that file and is responsible for deleting it; on any failure
     (including cancellation) the temporary file is removed before the
     exception propagates.

     Args:
         audio_path: Path of the source audio file passed to ``ffmpeg -i``.

     Returns:
         Path of the normalized temporary ``.wav`` file.

     Raises:
         RuntimeError: If ffmpeg exits with a non-zero status. The message
             carries ffmpeg's stderr, or the exit code when stderr is empty.
         OSError: If the ffmpeg executable cannot be started (for example
             because it is not installed).
     """
     source = Path(audio_path)
     # mkstemp is used only to reserve a unique name: ffmpeg opens the path
     # itself ("-y" lets it overwrite the empty placeholder), so our own
     # descriptor is closed straight away.
     fd, normalized_path = tempfile.mkstemp(prefix="meeting-normalized-", suffix=".wav")
     os.close(fd)
     proc = None
     try:
         proc = await asyncio.create_subprocess_exec(
             "ffmpeg",
             "-y",
             "-i",
             str(source),
             "-ac",
             "1",
             "-ar",
             "16000",
             "-c:a",
             "pcm_s16le",
             normalized_path,
             # ffmpeg reads stdin for interactive commands by default; detach
             # it so it can neither block on nor consume the parent's stdin.
             stdin=asyncio.subprocess.DEVNULL,
             stdout=asyncio.subprocess.DEVNULL,
             stderr=asyncio.subprocess.PIPE,
         )
         _, stderr = await proc.communicate()
         if proc.returncode != 0:
             raise RuntimeError(
                 "Audio normalization failed: " + (stderr.decode("utf-8", errors="replace").strip() or f"ffmpeg exited {proc.returncode}")
             )
     except BaseException:
         # BaseException so that task cancellation and a failed spawn are
         # covered too: without this the placeholder file would be leaked.
         if proc is not None and proc.returncode is None:
             # Interrupted while ffmpeg was still running; stop it so it does
             # not keep writing to a file nobody will read.
             try:
                 proc.kill()
             except ProcessLookupError:
                 pass
         try:
             os.remove(normalized_path)
         except OSError:
             # Best-effort cleanup; the original error is the one that matters.
             pass
         raise
     return normalized_path
 async def transcribe(audio_path: str) -> str:
    """Send audio to OpenRouter's whisper model and return transcript text.

    The audio is first passed through ``_normalize_audio_for_transcription``;
    the resulting temporary file, not ``audio_path`` itself, is what gets
    encoded into the JSON payload and POSTed to
    ``{OPENROUTER_BASE}/audio/transcriptions`` (300 second client timeout).
    The temporary file is removed afterwards whether or not the request
    succeeded.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The ``"text"`` field of the JSON response, stripped of surrounding
        whitespace.

    Raises:
        RuntimeError: If the response has an error status (the message
            includes the status code and a summarized error detail) or if
            the returned text is empty after stripping.
    """
    headers = _auth_headers()
    headers["Content-Type"] = "application/json"
-    async with httpx.AsyncClient(timeout=300) as client:
+    normalized_path = await _normalize_audio_for_transcription(audio_path)
-        resp = await client.post(
+    try:
-            f"{OPENROUTER_BASE}/audio/transcriptions",
+        async with httpx.AsyncClient(timeout=300) as client:
-            headers=headers,
+            resp = await client.post(
-            content=json.dumps(_build_transcription_payload(audio_path)),
+                f"{OPENROUTER_BASE}/audio/transcriptions",
-        )
+                headers=headers,
-        try:
+                content=json.dumps(_build_transcription_payload(normalized_path)),
-            resp.raise_for_status()
+            )
-        except httpx.HTTPStatusError as exc:
+            try:
-            detail = summarize_error(_safe_json(resp), fallback=resp.text)
+                resp.raise_for_status()
-            raise RuntimeError(f"OpenRouter transcription failed: {detail}") from exc
+            except httpx.HTTPStatusError as exc:
                detail = summarize_error(_safe_json(resp), fallback=resp.text)
                raise RuntimeError(
                    f"OpenRouter transcription failed ({resp.status_code}): {detail}"
                ) from exc
-        data = resp.json()
+            data = resp.json()
-        text = data.get("text", "")
+            text = data.get("text", "")
-        if not text.strip():
+            if not text.strip():
-            raise RuntimeError("OpenRouter transcription returned empty text")
+                raise RuntimeError("OpenRouter transcription returned empty text")
-        return text.strip()
+            return text.strip()
    finally:
        # Best-effort removal of the normalized temp file on every exit path;
        # a failure to delete it must not mask the transcription result/error.
        try:
            os.remove(normalized_path)
        except OSError:
            pass
 async def summarize(transcript: str) -> str:
@@ -21,5 +21,6 @@ def test_build_transcription_payload_uses_base64_json_shape(tmp_path: Path):
    payload = _build_transcription_payload(str(path))
    assert payload["model"] == "openai/whisper-large-v3"
    assert payload["language"] == "en"
    assert payload["input_audio"]["format"] == "wav"
    assert payload["input_audio"]["data"] == base64.b64encode(b"RIFFdemo").decode("ascii")