# Source code for audia.agents.text_cleaner

"""
Text curation pipeline – the core intelligence of audia.

Two-stage process:
  1. heuristic_clean()  – fast regex pre-pass: removes citations, LaTeX artefacts,
                          collapses whitespace. Reduces LLM token cost.
  2. llm_curate()       – LLM pass (ALWAYS required): rewrites math in plain English,
                          summarises tables, condenses acknowledgements,
                          ensures smooth spoken-word flow.
"""

from __future__ import annotations

import re

from rich.console import Console

from audia.config import Settings, get_settings

console = Console(stderr=True)


# ──────────────────────────────────────────────────────────── regex patterns

# (Author et al., 2023)  /  (Smith & Jones, 2022)  /  (Smith and Jones, 2022)
# /  (see Wang 2021)
# The four-digit year is mandatory: with an optional year the pattern also
# swallowed ordinary parentheticals such as "(LSTM)" or "(Figure)".
# The comma before the year is optional so "(see Wang 2021)" is matched too.
_AUTHOR_CITATION = re.compile(
    r"\(\s*(?:see\s+)?[A-Z][A-Za-zÀ-ÿ\-]+(?:\s+et\s+al\.?)?"
    r"(?:(?:\s*[,&]\s*|\s+and\s+)[A-Z][A-Za-zÀ-ÿ\-]+)*"
    r",?\s*\d{4}[a-z]?\s*\)",
)
# [3], [3,4], [3-5], [3, 4, 5]
_NUMERIC_CITATION = re.compile(r"\[\s*\d+(?:\s*[,\-–]\s*\d+)*\s*\]")

# Detect an acknowledgements section heading (not used by heuristic_clean).
_ACK_HEADING = re.compile(
    r"(^|\n)\s*(?:acknowledgements?|acknowledgments?)\s*\n",
    re.IGNORECASE,
)
# Content of acknowledgements section (everything to next all-caps heading or end)
# (not used by heuristic_clean).
_ACK_SECTION = re.compile(
    r"(?:acknowledgements?|acknowledgments?)\s*\n(.*?)(?=\n[A-Z][A-Z\s]{3,}\n|\Z)",
    re.IGNORECASE | re.DOTALL,
)

# Isolated figure / table captions to remove.
# Anchored to the start of a line (MULTILINE) so an in-sentence reference such
# as "... shown in Table 2. Our method ..." no longer loses the rest of its
# line, and confined to a single line so a bare "Table 2:" label cannot
# swallow the paragraph that follows it.
_FIGURE_TABLE_LABEL = re.compile(
    r"^[ \t]*(Figure|Fig\.|Table)[ \t]+\d+[.:][^\n]*\n?",
    re.IGNORECASE | re.MULTILINE,
)

# Text-formatting LaTeX commands whose argument is real prose: \emph{not},
# \textbf{key result}, ...  These are unwrapped (argument kept) rather than
# deleted, otherwise words silently vanish from the sentence.
_LATEX_TEXT_WRAPPER = re.compile(
    r"\\(?:textbf|textit|textsl|textsc|texttt|textrm|textsf|textup"
    r"|emph|underline|text|mbox)\s*\{([^{}]*)\}"
)

# LaTeX commands: \cite{bar}, \label{x}, standalone \cmd, etc.
# Removed together with their brace arguments.
_LATEX_CMD = re.compile(r"\\[a-zA-Z]+(?:\{[^}]*\})*")

# Excessive blank lines
_MULTI_BLANK = re.compile(r"\n{3,}")


def heuristic_clean(text: str) -> str:
    """
    Fast regex pre-pass – always runs before the LLM call to reduce token cost.

    Steps, in order:
      1. drop numeric citations such as ``[3, 4]``;
      2. drop author/year citations such as ``(Smith et al., 2020)``;
      3. unwrap text-formatting LaTeX commands (``\\emph{x}`` becomes ``x``),
         then drop every remaining LaTeX command with its arguments;
      4. drop caption lines starting with ``Figure N:`` / ``Fig. N.`` / ``Table N:``;
      5. collapse runs of blank lines and strip each paragraph.

    Parameters
    ----------
    text : str
        Raw extracted text.

    Returns
    -------
    str
        Cleaned text whose non-empty paragraphs are separated by exactly one
        blank line. Horizontal whitespace left behind by a removal is not
        collapsed.
    """
    text = _NUMERIC_CITATION.sub("", text)
    text = _AUTHOR_CITATION.sub("", text)

    # Unwrap innermost formatting commands repeatedly so nested wrappers like
    # \textbf{\emph{x}} are fully resolved. Every substitution shortens the
    # text, so the loop terminates.
    unwrapped = 1
    while unwrapped:
        text, unwrapped = _LATEX_TEXT_WRAPPER.subn(r"\1", text)

    text = _LATEX_CMD.sub("", text)
    text = _FIGURE_TABLE_LABEL.sub("", text)
    text = _MULTI_BLANK.sub("\n\n", text)

    paragraphs = (p.strip() for p in text.split("\n\n"))
    return "\n\n".join(p for p in paragraphs if p)
# ──────────────────────────────────────────────────────────── LLM curation

# System prompt sent with every chunk. It lists the rewriting rules the model
# must apply (maths, tables, removals, acknowledgements, artefacts, section
# transitions, flow) and asks for plain curated text only.
# NOTE(review): the literal is reproduced exactly as it appears in this view;
# any line breaks it originally contained are not visible here — confirm
# against the repository before reflowing it.
_SYSTEM_PROMPT = """ You are an expert academic editor preparing a research paper for text-to-speech conversion. Transform the text so it reads naturally and clearly when spoken aloud. Apply ALL rules without exception: 1. **Mathematical notation**: Never read symbol sequences. Replace with clear spoken English. Example: "∇L = Σᵢ αᵢ yᵢ xᵢ" → "the gradient of L equals the weighted sum of training examples" Example: "f(x) = x²" → "the function f of x equals x squared" Example: "p < 0.05" → "p less than 0.05" 2. **Tables**: Replace every raw table with ONE sentence summarising what it shows. If the surrounding text already summarises the table, remove the table entirely. 3. **Remove entirely**: equation labels like (1) or (2), author affiliations, email addresses, DOI/URL lines, running headers, page numbers. 4. **Acknowledgements**: Condense to one sentence: "The authors acknowledge support from [institutions]." 5. **Residual artefacts**: Remove leftover bullet symbols (• ‣), hyphenated line-breaks (e.g. algo- rithm), and any remaining LaTeX commands. 6. **Section transitions**: Preserve section headings as natural spoken transitions, e.g. "Section 3, Methodology:" or "Moving on to the results:" 7. **Flow**: Merge orphaned fragments into complete sentences. Ensure natural spoken rhythm. Return ONLY the curated spoken text. No markdown, no commentary. """

# How many chars of the previous curated chunk to pass as transition context.
# Large enough to cover 1-2 paragraphs; small enough not to waste tokens.
_CONTEXT_TAIL_CHARS = 600

# User message for the first chunk; ``{chunk}`` is filled via str.format.
_USER_TEMPLATE = "Curate the following text for text-to-speech:\n\n{chunk}"

# Used for chunks 2, 3, … – the tail of the previous curated chunk is injected
# so the LLM can write a smooth transition without re-reading the whole chunk.
# Placeholders: ``{tail}`` (context from the previous curated chunk) and
# ``{chunk}`` (the text to curate now).
_USER_TEMPLATE_WITH_CONTEXT = (
    "[CONTEXT — already curated text that immediately precedes this chunk. "
    "Do NOT repeat or continue it.]\n{tail}\n\n"
    "[CURRENT CHUNK — curate this for text-to-speech, "
    "ensuring a smooth spoken transition from the context above]\n{chunk}"
)
def _message_text(result) -> str:
    """Return the plain text carried by an LLM response, stripped.

    Chat-model responses may expose ``content`` either as a string or as a
    list of content blocks (plain strings and/or dicts carrying a ``"text"``
    entry). Objects without a ``content`` attribute are stringified.
    """
    if not hasattr(result, "content"):
        return str(result).strip()

    content = result.content
    if isinstance(content, str):
        return content.strip()
    if isinstance(content, (list, tuple)):
        parts: list[str] = []
        for block in content:
            if isinstance(block, str):
                parts.append(block)
            elif isinstance(block, dict) and isinstance(block.get("text"), str):
                parts.append(block["text"])
            # Non-text blocks carry nothing that can be spoken; skip them.
        return "".join(parts).strip()
    return str(content).strip()


def llm_curate(
    text: str,
    settings: Settings | None = None,
    progress_cb=None,
) -> str:
    """
    LLM curation pass – ALWAYS required, always runs.

    The text is split into chunks of at most ``cfg.llm_max_chunk_chars``
    characters. Each chunk is sent to the model together with the system
    prompt; from the second chunk on, the tail of the previously curated
    chunk is supplied as read-only context for a smooth transition.

    Parameters
    ----------
    text : str
        Text to curate (normally the output of the heuristic pre-pass).
    settings : Settings | None
        Configuration; ``get_settings()`` is used when omitted.
    progress_cb : callable(str) | None
        Optional callback invoked with a plain-text progress line for each
        chunk so callers (e.g. the web job runner) can surface per-chunk
        progress without parsing Rich markup.

    Returns
    -------
    str
        The curated chunks joined by blank lines; ``""`` when *text* is empty
        or whitespace-only.

    Raises
    ------
    RuntimeError, ImportError, ValueError
        Propagated from the LLM factory on misconfiguration (missing API key,
        missing provider package, unknown provider).
    """
    cfg = settings or get_settings()
    # Build the client first so misconfiguration is still reported for empty input.
    llm = _build_llm(cfg)

    # Nothing to curate: avoid an LLM call whose reply would be narrated verbatim.
    if not text.strip():
        return ""

    chunks = _split_text(text, max_chars=cfg.llm_max_chunk_chars)
    total = len(chunks)
    curated: list[str] = []
    prev_tail = ""

    for i, chunk in enumerate(chunks, 1):
        msg = f"LLM curation chunk {i}/{total} ({len(chunk):,} chars)\u2026"
        console.print(f" [dim] {msg}[/dim]")
        if progress_cb:
            progress_cb(msg)

        if prev_tail:
            user_msg = _USER_TEMPLATE_WITH_CONTEXT.format(tail=prev_tail, chunk=chunk)
        else:
            user_msg = _USER_TEMPLATE.format(chunk=chunk)

        result = llm.invoke(
            [
                {"role": "system", "content": _SYSTEM_PROMPT},
                {"role": "user", "content": user_msg},
            ]
        )

        curated_chunk = _message_text(result)
        if not curated_chunk:
            # Keep the previous tail as context and avoid a stray blank paragraph.
            continue
        curated.append(curated_chunk)
        prev_tail = _extract_tail(curated_chunk, _CONTEXT_TAIL_CHARS)

    return "\n\n".join(curated)
# back-compat alias
llm_clean = llm_curate


# ──────────────────────────────────────────────────────────── LLM factory

def _build_llm(cfg: Settings):
    """Instantiate a LangChain chat model; raises clearly on bad config.

    The provider is selected by ``cfg.llm_provider`` (``"openai"``,
    ``"anthropic"`` or ``"google"``). The integration package is imported
    lazily so only the selected provider needs to be installed.

    Raises
    ------
    ImportError
        The integration package for the selected provider is not installed.
    RuntimeError
        The API key for the selected provider is not set.
    ValueError
        ``cfg.llm_provider`` is not one of the supported providers.
    """
    provider = cfg.llm_provider
    kwargs: dict  # declared once; each branch assigns its own keyword set

    if provider == "openai":
        try:
            from langchain_openai import ChatOpenAI  # type: ignore
        except ImportError as e:
            raise ImportError("OpenAI support requires: pip install audia[openai]") from e
        if not cfg.openai_api_key:
            raise RuntimeError(
                "AUDIA_OPENAI_API_KEY is not set.\n"
                "Add it to your .env file: AUDIA_OPENAI_API_KEY=sk-..."
            )
        kwargs = dict(
            model=cfg.llm_model,
            temperature=cfg.llm_temperature,
            api_key=cfg.openai_api_key,
        )
        if cfg.openai_api_base:
            kwargs["base_url"] = cfg.openai_api_base
        return ChatOpenAI(**kwargs)

    if provider == "anthropic":
        try:
            from langchain_anthropic import ChatAnthropic  # type: ignore
        except ImportError as e:
            raise ImportError("Anthropic support requires: pip install audia[anthropic]") from e
        if not cfg.anthropic_api_key:
            raise RuntimeError(
                "AUDIA_ANTHROPIC_API_KEY is not set.\n"
                "Add it to your .env file: AUDIA_ANTHROPIC_API_KEY=sk-ant-..."
            )
        kwargs = dict(
            model=cfg.llm_model,
            temperature=cfg.llm_temperature,
            api_key=cfg.anthropic_api_key,
        )
        if cfg.anthropic_api_base:
            kwargs["base_url"] = cfg.anthropic_api_base
        return ChatAnthropic(**kwargs)

    if provider == "google":
        try:
            from langchain_google_genai import ChatGoogleGenerativeAI  # type: ignore
        except ImportError as e:
            raise ImportError("Google Gemini support requires: pip install audia[gemini]") from e
        if not cfg.google_api_key:
            raise RuntimeError(
                "AUDIA_GOOGLE_API_KEY is not set.\n"
                "Add it to your .env file: AUDIA_GOOGLE_API_KEY=AIza..."
            )
        kwargs = dict(
            model=cfg.llm_model,
            temperature=cfg.llm_temperature,
            google_api_key=cfg.google_api_key,
        )
        if cfg.google_api_base:
            # This client receives a custom endpoint through client_options.
            kwargs["client_options"] = {"api_endpoint": cfg.google_api_base}
        return ChatGoogleGenerativeAI(**kwargs)

    raise ValueError(
        f"Unknown LLM provider: '{cfg.llm_provider}'. Valid: openai | anthropic | google"
    )


def _extract_tail(text: str, max_chars: int) -> str:
    """
    Return the last complete paragraph(s) of *text* up to *max_chars* chars.

    Used to give the next chunk's LLM call just enough context for a smooth
    spoken transition without re-processing the full previous chunk.

    When the final *max_chars* characters contain a paragraph break, the tail
    starts right after the first such break; otherwise the raw character tail
    is returned. A non-positive *max_chars* yields ``""``.
    """
    if max_chars <= 0:
        # text[-0:] would be the *whole* text, not an empty tail.
        return ""
    if len(text) <= max_chars:
        return text

    tail = text[-max_chars:]
    # Trim to the first paragraph boundary so we don't start mid-sentence.
    first_break = tail.find("\n\n")
    return tail[first_break + 2 :] if first_break != -1 else tail


def _split_long_paragraph(para: str, max_chars: int) -> list[str]:
    """Break one paragraph into pieces of at most *max_chars* characters.

    Cuts after the last sentence end inside the window when there is one,
    otherwise at the last whitespace, otherwise exactly at *max_chars*.
    """
    pieces: list[str] = []
    rest = para.strip()
    while len(rest) > max_chars:
        window = rest[:max_chars]
        cut = max(
            window.rfind(". "),
            window.rfind("? "),
            window.rfind("! "),
            window.rfind(".\n"),
        )
        if cut != -1:
            cut += 1  # keep the punctuation mark with the left-hand piece
        else:
            cut = max(window.rfind(" "), window.rfind("\n"), window.rfind("\t"))
        if cut <= 0:
            cut = max_chars  # no usable boundary: hard cut
        pieces.append(rest[:cut].rstrip())
        rest = rest[cut:].lstrip()
    if rest:
        pieces.append(rest)
    return pieces


def _split_text(text: str, max_chars: int = 8000) -> list[str]:
    """
    Split text into chunks of at most max_chars, breaking at paragraph boundaries.

    Paragraphs (separated by a blank line) are packed greedily into chunks.
    A single paragraph longer than *max_chars* is itself split at sentence or
    whitespace boundaries so that no chunk exceeds the limit. Empty chunks are
    never emitted, except that text not longer than *max_chars* is returned
    unchanged as a single chunk.

    Raises
    ------
    ValueError
        If *max_chars* is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")
    if len(text) <= max_chars:
        return [text]

    # Normalise into units that individually fit the limit.
    units: list[str] = []
    for para in text.split("\n\n"):
        if len(para) > max_chars:
            units.extend(_split_long_paragraph(para, max_chars))
        else:
            units.append(para)

    chunks: list[str] = []
    current: list[str] = []
    size = 0  # length of "\n\n".join(current)
    for unit in units:
        added = len(unit) + (2 if current else 0)
        if current and size + added > max_chars:
            chunk = "\n\n".join(current).strip()
            if chunk:
                chunks.append(chunk)
            current, size = [unit], len(unit)
        else:
            current.append(unit)
            size += added

    last = "\n\n".join(current).strip()
    if last:
        chunks.append(last)
    return chunks or [text]


# ──────────────────────────────────────────────────────────── main entry
def curate_text(text: str, settings: Settings | None = None, progress_cb=None) -> str:
    """
    Full curation pipeline: heuristic pre-pass → LLM curation.

    Transition guarantee
    --------------------
    Each chunk (from chunk 2 onward) receives the tail of the previous curated
    chunk as read-only context so the LLM can write a smooth spoken transition
    without re-processing or re-outputting already-curated text.

    The full paper content is preserved as-is after the LLM pass — no content
    is dropped or deduplicated.

    Parameters
    ----------
    text : str
        Raw extracted text.
    settings : Settings | None
        Configuration; ``get_settings()`` is used when omitted.
    progress_cb : callable(str) | None
        Optional per-chunk progress callback, forwarded to ``llm_curate``.

    Returns
    -------
    str
        The curated text returned by ``llm_curate``.
    """
    cfg = settings or get_settings()

    console.print(" [dim]Heuristic pre-pass (citations, LaTeX artefacts)…[/dim]")
    preprocessed = heuristic_clean(text)
    # Separate the before/after sizes; printed back-to-back they read as one number.
    console.print(f" [dim]Pre-pass: {len(text):,} → {len(preprocessed):,} chars[/dim]")

    return llm_curate(preprocessed, cfg, progress_cb=progress_cb)
# Back-compat alias
def clean_text(text: str, settings: Settings | None = None) -> str:
    """Delegate to :func:`curate_text` with the same arguments and return its result."""
    curated = curate_text(text, settings)
    return curated