Source code for audia.agents.text_cleaner

"""
Text curation pipeline – the core intelligence of audia.

Two-stage process:
  1. heuristic_clean()  – fast regex pre-pass: removes citations, LaTeX artefacts,
                          collapses whitespace. Reduces LLM token cost.
  2. llm_curate()       – LLM pass (ALWAYS required): rewrites math in plain English,
                          summarises tables, condenses acknowledgements,
                          ensures smooth spoken-word flow.
"""

from __future__ import annotations

import re

from rich.console import Console

from audia.config import Settings, get_settings

# Module-wide Rich console used for the progress lines printed by this module;
# stderr=True sends its output to standard error.
console = Console(stderr=True)


# ──────────────────────────────────────────────────────────── regex patterns

# Parenthesised author–year citations, e.g.
#   (Author et al., 2023)  /  (Smith & Jones, 2022)  /  (see Wang 2021)
# The four-digit year is mandatory: without it the pattern would also delete
# ordinary capitalised parentheticals such as "(PCA)" or "(Methods)".
# The comma before the year is optional so comma-less styles are caught too.
_AUTHOR_CITATION = re.compile(
    r"\(\s*(?:see\s+)?[A-Z][A-Za-zÀ-ÿ\-]+(?:\s+et\s+al\.?)?(?:\s*[,&]\s*[A-Z][A-Za-zÀ-ÿ\-]+)*"
    r",?\s*\d{4}[a-z]?\s*\)",
)
# Bracketed numeric citations: [3], [3,4], [3-5], [3, 4, 5]
_NUMERIC_CITATION = re.compile(r"\[\s*\d+(?:\s*[,\-–]\s*\d+)*\s*\]")

# Detect an acknowledgements section heading: the word (either spelling, any
# case) on a line of its own, i.e. preceded by start-of-text or a newline and
# followed by a newline.
# NOTE(review): neither _ACK_* pattern is referenced by heuristic_clean() –
# confirm whether other callers still rely on them.
_ACK_HEADING = re.compile(
    r"(^|\n)\s*(?:acknowledgements?|acknowledgments?)\s*\n",
    re.IGNORECASE,
)
# Content of acknowledgements section (everything to next all-caps heading or end).
# Case-insensitivity is scoped to the heading word with (?i:...): a global
# IGNORECASE would let [A-Z] match lowercase letters, so the "all-caps heading"
# lookahead would stop at the first ordinary lowercase line.
_ACK_SECTION = re.compile(
    r"(?i:acknowledgements?|acknowledgments?)\s*\n(.*?)(?=\n[A-Z][A-Z\s]{3,}\n|\Z)",
    re.DOTALL,
)

# Isolated figure / table caption lines to remove ("Figure 2: …", "Table 1. …").
# Anchored to the start of a line (MULTILINE) so an in-sentence reference like
# "…shown in Table 3. We observe…" does not lose the rest of its line, and kept
# on a single line ([^\S\n] = whitespace except newline) so a bare "Figure 2:"
# cannot reach across a blank line into the next paragraph.
_FIGURE_TABLE_LABEL = re.compile(
    r"^[^\S\n]*(Figure|Fig\.|Table)[^\S\n]+\d+[.:][^\n]*\n?",
    re.IGNORECASE | re.MULTILINE,
)

# LaTeX commands: \textbf{foo}, \cite{bar}, \emph{x}, standalone \cmd, etc.
# Any directly following {...} groups are removed together with their contents.
_LATEX_CMD = re.compile(r"\\[a-zA-Z]+(?:\{[^}]*\})*")

# Excessive blank lines (three or more consecutive newlines)
_MULTI_BLANK = re.compile(r"\n{3,}")



[docs]
def heuristic_clean(text: str) -> str:
    """
    Fast regex pre-pass – always runs before the LLM call to reduce token cost.

    Steps, in order:
      1. delete numeric citations, author citations, LaTeX commands and
         figure/table caption labels (module-level patterns);
      2. tidy the horizontal whitespace those deletions leave behind
         (a stray space before closing punctuation, doubled spaces);
      3. collapse runs of blank lines, strip every paragraph and drop the
         empty ones.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned paragraphs joined by ``"\\n\\n"``; an empty string when
        nothing is left.
    """
    for pattern in (_NUMERIC_CITATION, _AUTHOR_CITATION, _LATEX_CMD, _FIGURE_TABLE_LABEL):
        text = pattern.sub("", text)

    # Deleting "(Smith, 2020)" from "as shown (Smith, 2020)." leaves "as shown ."
    # Only punctuation that is itself followed by whitespace / end of text is
    # pulled back, so leading-dot decimals such as "p < .05" stay untouched.
    text = re.sub(r"[ \t]+(?=[,.;:!?](?:\s|$))", "", text)
    # Deleting a mid-sentence citation leaves two adjacent spaces.
    text = re.sub(r"[ \t]{2,}", " ", text)

    text = _MULTI_BLANK.sub("\n\n", text)
    paragraphs = (p.strip() for p in text.split("\n\n"))
    return "\n\n".join(p for p in paragraphs if p)



# ──────────────────────────────────────────────────────────── LLM curation

# System message sent with every chunk by llm_curate(): the complete rule set
# for rewriting paper text into prose that can be read aloud.
_SYSTEM_PROMPT = """
You are an expert academic editor preparing a research paper for text-to-speech conversion.
Transform the text so it reads naturally and clearly when spoken aloud.

Apply ALL rules without exception:
1. **Mathematical notation**: Never read symbol sequences. Replace with clear spoken English.
   Example: "∇L = Σᵢ αᵢ yᵢ xᵢ" → "the gradient of L equals the weighted sum of training examples"
   Example: "f(x) = x²" → "the function f of x equals x squared"
   Example: "p < 0.05" → "p less than 0.05"
2. **Tables**: Replace every raw table with ONE sentence summarising what it shows.
   If the surrounding text already summarises the table, remove the table entirely.
3. **Remove entirely**: equation labels like (1) or (2), author affiliations,
   email addresses, DOI/URL lines, running headers, page numbers.
4. **Acknowledgements**: Condense to one sentence:
   "The authors acknowledge support from [institutions]."
5. **Residual artefacts**: Remove leftover bullet symbols (•  ‣), hyphenated line-breaks (e.g. algo-
   rithm), and any remaining LaTeX commands.
6. **Section transitions**: Preserve section headings as natural spoken transitions,
   e.g. "Section 3, Methodology:" or "Moving on to the results:"
7. **Flow**: Merge orphaned fragments into complete sentences. Ensure natural spoken rhythm.

Return ONLY the curated spoken text. No markdown, no commentary.
"""

# How many chars of the previous curated chunk to pass as transition context.
# Large enough to cover 1-2 paragraphs; small enough not to waste tokens.
# Handed to _extract_tail() as its character budget.
_CONTEXT_TAIL_CHARS = 600

# User message used when there is no previous tail to inject (the first chunk,
# or a chunk that follows an empty curated result). `{chunk}` is filled in with
# str.format().
_USER_TEMPLATE = "Curate the following text for text-to-speech:\n\n{chunk}"

# Used for chunks 2, 3, … – the tail of the previous curated chunk is injected
# so the LLM can write a smooth transition without re-reading the whole chunk.
# Placeholders: `{tail}` (previous curated tail) and `{chunk}` (text to curate).
_USER_TEMPLATE_WITH_CONTEXT = (
    "[CONTEXT — already curated text that immediately precedes this chunk. "
    "Do NOT repeat or continue it.]\n{tail}\n\n"
    "[CURRENT CHUNK — curate this for text-to-speech, "
    "ensuring a smooth spoken transition from the context above]\n{chunk}"
)



[docs]
def llm_curate(
    text: str,
    settings: Settings | None = None,
    progress_cb=None,
) -> str:
    """
    Run the LLM curation pass over *text*, one chunk at a time.

    The text is cut into chunks of ``settings.llm_max_chunk_chars`` characters.
    Each chunk is sent to the chat model together with the system prompt; from
    the second chunk on, the tail of the previous curated chunk is included as
    read-only context. The curated chunks are joined with a blank line.

    Parameters
    ----------
    text : str
        Text to curate.
    settings : Settings | None
        Configuration to use; ``get_settings()`` is consulted when omitted.
    progress_cb : callable(str) | None
        Optional callback that receives one plain-text progress line per
        chunk (the same line that is printed to the console, without Rich
        markup).

    Returns
    -------
    str
        The curated chunks joined by ``"\\n\\n"``.

    Raises
    ------
    RuntimeError
        Propagated from ``_build_llm`` when the provider's API key is missing.
    """
    cfg = settings or get_settings()
    model = _build_llm(cfg)

    pieces = _split_text(text, max_chars=cfg.llm_max_chunk_chars)
    piece_count = len(pieces)

    outputs: list[str] = []
    context_tail = ""  # empty for the first chunk -> plain template is used

    for index, piece in enumerate(pieces, start=1):
        status = f"LLM curation chunk {index}/{piece_count} ({len(piece):,} chars)\u2026"
        console.print(f"  [dim]  {status}[/dim]")
        if progress_cb:
            progress_cb(status)

        user_content = (
            _USER_TEMPLATE_WITH_CONTEXT.format(tail=context_tail, chunk=piece)
            if context_tail
            else _USER_TEMPLATE.format(chunk=piece)
        )
        messages = [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
        ]
        response = model.invoke(messages)

        # Fall back to str(response) for return values without a .content attribute.
        spoken = getattr(response, "content", str(response)).strip()
        outputs.append(spoken)
        context_tail = _extract_tail(spoken, _CONTEXT_TAIL_CHARS)

    return "\n\n".join(outputs)



# Back-compat alias: `llm_clean` is bound to the same function object as
# `llm_curate`, so both names accept identical arguments.
llm_clean = llm_curate


# ──────────────────────────────────────────────────────────── LLM factory


def _build_llm(cfg: Settings):
    """
    Create the chat-model client selected by ``cfg.llm_provider``.

    Each provider branch imports its integration package lazily, checks that
    the matching API key is configured, and only then constructs the client
    with the model name, temperature, key and – if configured – a custom
    endpoint.

    Raises
    ------
    ImportError
        The provider's integration package is not installed.
    RuntimeError
        The provider's API key is empty / unset.
    ValueError
        ``cfg.llm_provider`` is not one of openai, anthropic, google, qwen.
    """
    provider = cfg.llm_provider

    if provider == "openai":
        try:
            from langchain_openai import ChatOpenAI  # type: ignore
        except ImportError as exc:
            raise ImportError("OpenAI support requires: pip install audia[openai]") from exc
        if not cfg.openai_api_key:
            raise RuntimeError(
                "AUDIA_OPENAI_API_KEY is not set.\n"
                "Add it to your .env file:  AUDIA_OPENAI_API_KEY=sk-..."
            )
        options = {
            "model": cfg.llm_model,
            "temperature": cfg.llm_temperature,
            "api_key": cfg.openai_api_key,
        }
        if cfg.openai_api_base:
            options["base_url"] = cfg.openai_api_base
        return ChatOpenAI(**options)

    if provider == "anthropic":
        try:
            from langchain_anthropic import ChatAnthropic  # type: ignore
        except ImportError as exc:
            raise ImportError("Anthropic support requires: pip install audia[anthropic]") from exc
        if not cfg.anthropic_api_key:
            raise RuntimeError(
                "AUDIA_ANTHROPIC_API_KEY is not set.\n"
                "Add it to your .env file:  AUDIA_ANTHROPIC_API_KEY=sk-ant-..."
            )
        options = {
            "model": cfg.llm_model,
            "temperature": cfg.llm_temperature,
            "api_key": cfg.anthropic_api_key,
        }
        if cfg.anthropic_api_base:
            options["base_url"] = cfg.anthropic_api_base
        return ChatAnthropic(**options)

    if provider == "google":
        try:
            from langchain_google_genai import ChatGoogleGenerativeAI  # type: ignore
        except ImportError as exc:
            raise ImportError("Google Gemini support requires: pip install audia[gemini]") from exc
        if not cfg.google_api_key:
            raise RuntimeError(
                "AUDIA_GOOGLE_API_KEY is not set.\n"
                "Add it to your .env file:  AUDIA_GOOGLE_API_KEY=AIza..."
            )
        options = {
            "model": cfg.llm_model,
            "temperature": cfg.llm_temperature,
            "google_api_key": cfg.google_api_key,
        }
        if cfg.google_api_base:
            # The Gemini client takes a custom endpoint through client_options.
            options["client_options"] = {"api_endpoint": cfg.google_api_base}
        return ChatGoogleGenerativeAI(**options)

    if provider == "qwen":
        # Qwen goes through the same ChatOpenAI class, with its own key / base URL.
        try:
            from langchain_openai import ChatOpenAI  # type: ignore
        except ImportError as exc:
            raise ImportError("Qwen support requires: pip install audia[openai]") from exc
        if not cfg.qwen_api_key:
            raise RuntimeError(
                "AUDIA_QWEN_API_KEY is not set.\n"
                "Add it to your .env file:  AUDIA_QWEN_API_KEY=sk-..."
            )
        options = {
            "model": cfg.llm_model,
            "temperature": cfg.llm_temperature,
            "api_key": cfg.qwen_api_key,
        }
        if cfg.qwen_api_base:
            options["base_url"] = cfg.qwen_api_base
        return ChatOpenAI(**options)

    raise ValueError(
        f"Unknown LLM provider: '{provider}'. Valid: openai | anthropic | google | qwen"
    )


def _extract_tail(text: str, max_chars: int) -> str:
    """
    Return the trailing part of *text*, at most *max_chars* characters long.

    Used to give the next chunk's LLM call just enough context for a smooth
    spoken transition without re-processing the full previous chunk.

    Parameters
    ----------
    text : str
        The previously curated chunk.
    max_chars : int
        Character budget for the returned tail. A budget of zero or less
        yields an empty string.

    Returns
    -------
    str
        *text* itself when it fits the budget. Otherwise the last *max_chars*
        characters, with everything up to and including the first paragraph
        break (``"\\n\\n"``) inside that window dropped, so the result starts
        at a paragraph boundary. When the window contains no paragraph break
        it is returned as-is and may begin mid-sentence.
    """
    # Guard first: text[-0:] is the WHOLE string and a negative budget would
    # turn the slice below into "drop a prefix", so both must short-circuit.
    if max_chars <= 0:
        return ""
    if len(text) <= max_chars:
        return text
    tail = text[-max_chars:]
    # Trim to the first paragraph boundary so we don't start mid-sentence.
    first_break = tail.find("\n\n")
    return tail[first_break + 2 :] if first_break != -1 else tail


def _split_text(text: str, max_chars: int = 8000) -> list[str]:
    """
    Split text into chunks of at most max_chars, breaking at paragraph boundaries.

    Paragraphs (separated by ``"\\n\\n"``) are packed greedily into chunks.
    A single paragraph that is longer than *max_chars* on its own is first cut
    into smaller pieces (see ``_split_oversized``) so that it cannot produce an
    over-long chunk.

    Parameters
    ----------
    text : str
        Text to split.
    max_chars : int
        Upper bound on the length of each chunk. With a non-positive value no
        paragraph is cut and every paragraph ends up in its own chunk.

    Returns
    -------
    list[str]
        ``[text]`` unchanged when it already fits; otherwise the stripped,
        non-empty chunks in order (or ``[text]`` if everything was whitespace).
    """
    if len(text) <= max_chars:
        return [text]

    chunks: list[str] = []
    buffer: list[str] = []
    # Length of the buffered paragraphs, each counted with a 2-char separator.
    buffered_len = 0

    for para in text.split("\n\n"):
        for piece in _split_oversized(para, max_chars):
            cost = len(piece) + 2
            if buffer and buffered_len + cost > max_chars:
                chunk = "\n\n".join(buffer).strip()
                if chunk:  # never emit an empty chunk for the LLM to "curate"
                    chunks.append(chunk)
                buffer, buffered_len = [], 0
            buffer.append(piece)
            buffered_len += cost

    last = "\n\n".join(buffer).strip()
    if last:
        chunks.append(last)
    return chunks or [text]


def _split_oversized(para: str, max_chars: int) -> list[str]:
    """Cut one paragraph into pieces of at most *max_chars*, preferring sentence ends, then whitespace."""
    if max_chars <= 0 or len(para) <= max_chars:
        return [para]

    pieces: list[str] = []
    rest = para
    while len(rest) > max_chars:
        window = rest[:max_chars]
        # Latest sentence-ending punctuation that is followed by whitespace.
        sentence_end = max(
            window.rfind(mark) for mark in (". ", "! ", "? ", ".\n", "!\n", "?\n")
        )
        if sentence_end != -1:
            cut = sentence_end + 1  # keep the punctuation with the left piece
        else:
            space = max(window.rfind(ws) for ws in (" ", "\n", "\t"))
            # No usable whitespace: hard cut. cut is always >= 1, so the loop advances.
            cut = space if space > 0 else max_chars
        head = rest[:cut].strip()
        if head:
            pieces.append(head)
        rest = rest[cut:].lstrip()
    if rest:
        pieces.append(rest)
    return pieces


# ──────────────────────────────────────────────────────────── main entry



[docs]
def curate_text(
    text: str,
    settings: Settings | None = None,
    progress_cb=None,
) -> str:
    """
    Full curation pipeline: heuristic pre-pass → LLM curation.

    Transition guarantee
    --------------------
    Each chunk (from chunk 2 onward) receives the tail of the previous curated
    chunk as read-only context so the LLM can write a smooth spoken transition
    without re-processing or re-outputting already-curated text.
    The curated chunks are concatenated in order by ``llm_curate``; nothing is
    dropped or deduplicated after the LLM pass.

    Parameters
    ----------
    text : str
        Text to curate.
    settings : Settings | None
        Configuration to use; ``get_settings()`` is consulted when omitted.
    progress_cb : callable(str) | None
        Optional per-chunk progress callback, forwarded unchanged to
        ``llm_curate`` so callers of the full pipeline can surface progress
        as well. Defaults to ``None`` (no callback).

    Returns
    -------
    str
        The curated text returned by ``llm_curate``.
    """
    cfg = settings or get_settings()
    console.print("  [dim]Heuristic pre-pass (citations, LaTeX artefacts)…[/dim]")
    preprocessed = heuristic_clean(text)
    console.print(f"  [dim]Pre-pass: {len(text):,} → {len(preprocessed):,} chars[/dim]")
    return llm_curate(preprocessed, cfg, progress_cb=progress_cb)



# Back-compat alias

[docs]
def clean_text(text: str, settings: Settings | None = None) -> str:
    """Back-compat entry point: delegate to ``curate_text`` and return its result."""
    curated = curate_text(text, settings)
    return curated