fix: normalize meeting audio before transcription
This commit is contained in:
+61
-16
@@ -1,8 +1,10 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
import base64
|
import base64
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import tempfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
@@ -44,31 +46,74 @@ def _build_transcription_payload(audio_path: str) -> dict[str, Any]:
|
|||||||
"data": encoded,
|
"data": encoded,
|
||||||
"format": _audio_format(audio_path),
|
"format": _audio_format(audio_path),
|
||||||
},
|
},
|
||||||
|
"language": "en",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def _normalize_audio_for_transcription(audio_path: str) -> str:
    """Re-encode *audio_path* with ffmpeg into a temporary WAV file.

    ffmpeg is invoked with ``-ac 1 -ar 16000 -c:a pcm_s16le``, i.e. one
    audio channel, a 16000 Hz sample rate and the ``pcm_s16le`` codec.

    Args:
        audio_path: Path of the recording to convert.

    Returns:
        Path of a newly created temporary ``.wav`` file. The caller owns
        the file and is responsible for deleting it.

    Raises:
        RuntimeError: ffmpeg exited with a non-zero status; the message
            carries ffmpeg's stderr (or the exit code when stderr is empty).
        OSError: ffmpeg could not be started (for example
            ``FileNotFoundError`` when the binary is not on ``PATH``).
    """
    source = Path(audio_path)
    # mkstemp is used only to reserve a unique file name; ffmpeg opens the
    # path itself, so our descriptor is released immediately.
    fd, normalized_path = tempfile.mkstemp(prefix="meeting-normalized-", suffix=".wav")
    os.close(fd)

    proc = None
    try:
        proc = await asyncio.create_subprocess_exec(
            "ffmpeg",
            "-y",  # overwrite the placeholder file created by mkstemp
            "-i",
            str(source),
            "-ac",
            "1",
            "-ar",
            "16000",
            "-c:a",
            "pcm_s16le",
            normalized_path,
            # ffmpeg reads interactive commands from stdin by default; give
            # it an empty stdin so it never consumes the parent's input.
            stdin=asyncio.subprocess.DEVNULL,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.PIPE,
        )
        _, stderr = await proc.communicate()
        if proc.returncode != 0:
            detail = stderr.decode("utf-8", errors="replace").strip()
            raise RuntimeError(
                "Audio normalization failed: " + (detail or f"ffmpeg exited {proc.returncode}")
            )
    except BaseException:
        # Any failure (ffmpeg missing, non-zero exit, task cancellation) must
        # not leave the temporary file or a running ffmpeg behind.
        if proc is not None and proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass
        try:
            os.remove(normalized_path)
        except OSError:
            pass
        raise

    return normalized_path
|
||||||
|
|
||||||
|
|
||||||
async def transcribe(audio_path: str) -> str:
    """Send audio to OpenRouter's whisper model and return transcript text.

    The recording is first converted by
    ``_normalize_audio_for_transcription``; the converted temporary file is
    what gets uploaded, and it is deleted again before this function
    returns or raises.

    Args:
        audio_path: Path of the recording to transcribe.

    Returns:
        The transcript text with surrounding whitespace stripped.

    Raises:
        RuntimeError: Audio normalization failed, the API answered with an
            HTTP error status, or the response contained no usable text.
    """
    headers = _auth_headers()
    headers["Content-Type"] = "application/json"

    normalized_path = await _normalize_audio_for_transcription(audio_path)
    try:
        async with httpx.AsyncClient(timeout=300) as client:
            resp = await client.post(
                f"{OPENROUTER_BASE}/audio/transcriptions",
                headers=headers,
                content=json.dumps(_build_transcription_payload(normalized_path)),
            )

        try:
            resp.raise_for_status()
        except httpx.HTTPStatusError as exc:
            detail = summarize_error(_safe_json(resp), fallback=resp.text)
            raise RuntimeError(
                f"OpenRouter transcription failed ({resp.status_code}): {detail}"
            ) from exc

        data = resp.json()
        # A body such as {"text": null} or a non-object JSON value must be
        # reported as "empty text" instead of crashing with AttributeError.
        raw_text = data.get("text") if isinstance(data, dict) else None
        text = raw_text.strip() if isinstance(raw_text, str) else ""
        if not text:
            raise RuntimeError("OpenRouter transcription returned empty text")
        return text
    finally:
        # Best-effort removal of the temporary normalized file; a failure to
        # delete it must not mask the transcription result or error.
        try:
            os.remove(normalized_path)
        except OSError:
            pass
|
||||||
|
|
||||||
|
|
||||||
async def summarize(transcript: str) -> str:
|
async def summarize(transcript: str) -> str:
|
||||||
|
|||||||
@@ -21,5 +21,6 @@ def test_build_transcription_payload_uses_base64_json_shape(tmp_path: Path):
|
|||||||
payload = _build_transcription_payload(str(path))
|
payload = _build_transcription_payload(str(path))
|
||||||
|
|
||||||
assert payload["model"] == "openai/whisper-large-v3"
|
assert payload["model"] == "openai/whisper-large-v3"
|
||||||
|
assert payload["language"] == "en"
|
||||||
assert payload["input_audio"]["format"] == "wav"
|
assert payload["input_audio"]["format"] == "wav"
|
||||||
assert payload["input_audio"]["data"] == base64.b64encode(b"RIFFdemo").decode("ascii")
|
assert payload["input_audio"]["data"] == base64.b64encode(b"RIFFdemo").decode("ascii")
|
||||||
|
|||||||
Reference in New Issue
Block a user