"""
Speech-to-Text input – record from microphone and transcribe.
"""
from __future__ import annotations
from pathlib import Path
[docs]
def record_and_transcribe(
    seconds: int = 30,
    samplerate: int = 16000,
    model_size: str = "base",
    device: str = "cpu",
) -> str:
    """
    Record audio from the default microphone and return the transcription.

    Recording lasts at most ``seconds`` seconds and can be stopped early with
    Ctrl-C. When stopped early, only the part of the buffer that was
    (approximately) filled is transcribed, so the untouched tail of the
    pre-allocated buffer is not fed to the model as silence.

    Parameters
    ----------
    seconds: Maximum recording duration.
    samplerate: Audio sample rate (16 kHz is recommended for Whisper).
    model_size: faster-whisper model: tiny | base | small | medium | large-v3
    device: 'cpu' or 'cuda'

    Returns
    -------
    The transcribed text, or an empty string if no audio was captured.

    Raises
    ------
    ValueError: if ``seconds`` or ``samplerate`` is not positive.
    ImportError: if the optional STT dependencies are not installed.
    """
    if seconds <= 0:
        raise ValueError(f"seconds must be positive, got {seconds!r}")
    if samplerate <= 0:
        raise ValueError(f"samplerate must be positive, got {samplerate!r}")

    _ensure_stt_deps()

    import time

    import numpy as np
    import sounddevice as sd  # type: ignore

    print(f"[audia] Recording for up to {seconds} seconds… (Ctrl-C to stop early)")

    total_frames = int(seconds * samplerate)
    started_at = time.monotonic()
    # sd.rec() returns immediately; the buffer is filled in the background.
    audio = sd.rec(
        total_frames,
        samplerate=samplerate,
        channels=1,
        dtype="float32",
    )

    recorded_frames = total_frames
    try:
        sd.wait()
    except KeyboardInterrupt:
        sd.stop()
        # The buffer was allocated for the full duration; estimate how much of
        # it was actually recorded from the elapsed wall-clock time.
        elapsed = time.monotonic() - started_at
        recorded_frames = max(0, min(total_frames, int(elapsed * samplerate)))
    print("[audia] Recording finished.")

    audio_1d: np.ndarray = audio.flatten()[:recorded_frames]
    if audio_1d.size == 0:
        # Nothing was captured (e.g. interrupted immediately).
        return ""
    return _transcribe_array(audio_1d, samplerate, model_size, device)
[docs]
def transcribe_file(
    audio_path: str | Path,
    model_size: str = "base",
    device: str = "cpu",
) -> str:
    """
    Transcribe an existing audio file (wav, mp3, …).

    Only faster-whisper is needed here; the microphone dependency
    (``sounddevice``) is deliberately not required for file transcription.

    Parameters
    ----------
    audio_path: Path of the audio file to transcribe.
    model_size: faster-whisper model: tiny | base | small | medium | large-v3
    device: 'cpu' or 'cuda'

    Returns
    -------
    The stripped text of all transcribed segments, joined by single spaces.

    Raises
    ------
    ImportError: if faster-whisper is not installed.
    """
    try:
        from faster_whisper import WhisperModel  # type: ignore
    except ImportError as err:
        raise ImportError(
            "STT requires extra dependencies: pip install audia[stt]\n"
            "Missing: faster-whisper"
        ) from err

    model = WhisperModel(model_size, device=device, compute_type="int8")
    segments, _ = model.transcribe(str(audio_path), beam_size=5)
    return " ".join(seg.text.strip() for seg in segments)
def _transcribe_array(
    audio: object,
    samplerate: int,
    model_size: str,
    device: str,
) -> str:
    """
    Transcribe a NumPy float32 array using faster-whisper.

    The samples are written to a temporary ``.wav`` file, which is passed to
    the model and removed afterwards, even if writing or transcription fails.

    Parameters
    ----------
    audio: Audio samples to transcribe (passed unchanged to ``soundfile.write``).
    samplerate: Sample rate of ``audio``.
    model_size: faster-whisper model name.
    device: Device passed to ``WhisperModel``.

    Returns
    -------
    The stripped text of all transcribed segments, joined by single spaces.
    """
    import tempfile

    import soundfile as sf  # type: ignore
    from faster_whisper import WhisperModel  # type: ignore

    # Only reserve a unique file name here; the handle is closed before the
    # file is written by name below.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        tmp_path = handle.name

    try:
        # The write lives inside the try so a failed write cannot leak the
        # temporary file (delete=False means nothing else cleans it up).
        sf.write(tmp_path, audio, samplerate)
        model = WhisperModel(model_size, device=device, compute_type="int8")
        segments, _ = model.transcribe(tmp_path, beam_size=5)
        return " ".join(seg.text.strip() for seg in segments)
    finally:
        Path(tmp_path).unlink(missing_ok=True)
[docs]
def distill_search_query(speech: str) -> str:
    """
    Use the configured LLM to extract a concise ArXiv search query from raw speech.

    Blank input is returned as an empty string without calling the LLM.

    Parameters
    ----------
    speech: Raw transcribed speech.

    Returns
    -------
    The model's reply with surrounding whitespace and periods removed.

    Example
    -------
    >>> distill_search_query("I would like to research about agentic AI.")
    'agentic AI research'
    """
    # Nothing to distil: skip the LLM round-trip for empty/whitespace input.
    if not speech or not speech.strip():
        return ""

    from langchain_core.messages import HumanMessage, SystemMessage  # type: ignore

    from audia.agents.text_cleaner import _build_llm
    from audia.config import get_settings

    cfg = get_settings()
    llm = _build_llm(cfg)
    messages = [
        SystemMessage(
            content=(
                "You extract a short, precise academic search query from spoken input. "
                "Return ONLY the query – no explanation, no punctuation at the end. "
                "4–7 words maximum, suitable for searching ArXiv."
            )
        ),
        HumanMessage(content=speech),
    ]
    result = llm.invoke(messages)

    content = result.content
    if not isinstance(content, str):
        # Message content may also be a list of strings / content-block dicts
        # (NOTE(review): assumes text blocks carry a "text" key — confirm
        # against the configured provider). Keep only the textual parts.
        parts = []
        for block in content or []:
            if isinstance(block, str):
                parts.append(block)
            elif isinstance(block, dict) and isinstance(block.get("text"), str):
                parts.append(block["text"])
        content = "".join(parts)

    # The final strip removes whitespace exposed by dropping the periods
    # (e.g. "query ." -> "query").
    return content.strip().strip(".").strip()
def _ensure_stt_deps() -> None:
    """
    Verify that the optional speech-to-text dependencies can be imported.

    Checks every third-party package the STT helpers in this module import:
    ``sounddevice`` (recording), ``faster_whisper`` (transcription) and
    ``soundfile`` (writing the temporary wav file).

    Raises
    ------
    ImportError: listing every package that could not be imported.
    """
    import importlib

    # (importable module name, pip distribution name)
    required = (
        ("sounddevice", "sounddevice"),
        ("faster_whisper", "faster-whisper"),
        ("soundfile", "soundfile"),
    )

    missing = []
    for module_name, package_name in required:
        try:
            importlib.import_module(module_name)
        except ImportError:
            missing.append(package_name)
        except OSError as err:
            # The audio bindings can fail at import time with OSError when
            # their native library cannot be loaded; report that as an
            # unusable dependency instead of letting the raw error escape.
            missing.append(f"{package_name} ({err})")

    if missing:
        deps = " ".join(missing)
        raise ImportError(
            f"STT requires extra dependencies: pip install audia[stt]\nMissing: {deps}"
        )