Source code for audia.agents.pdf_processor

"""
PDF text extraction using PyMuPDF (fitz).

Handles:
- Multi-page PDFs
- Basic heuristic removal of headers, footers and page numbers.
- Trimming of the references section and everything after it
  (an acknowledgements section that precedes it is kept).
"""

from __future__ import annotations

import re
from pathlib import Path
from typing import NamedTuple


class ExtractionResult(NamedTuple):
    """Immutable record returned by ``extract_text``."""

    # Cleaned document text (headers/footers, page numbers and the
    # references section already stripped).
    text: str
    # Number of pages in the source PDF, counted before any cleaning.
    num_pages: int
    # Best-guess document title (falls back to the file stem).
    title: str
# ──────────────────────────────────────────────────────── constants

# A line consisting solely of the heading "Reference" / "References".
_REFERENCES_PATTERNS = re.compile(
    r"^\s*references?\s*$",
    re.IGNORECASE | re.MULTILINE,
)

# A line consisting solely of an acknowledgement heading. The optional "e"
# accepts both the British "Acknowledgements" and the US "Acknowledgments".
_ACKNOWLEDGEMENTS_PATTERNS = re.compile(
    r"^\s*acknowledge?ments?\s*$",
    re.IGNORECASE | re.MULTILINE,
)

# Lines that look like isolated page numbers (e.g. "— 3 —", "3", "Page 3").
# The first alternative covers numbers wrapped in hyphens / en / em dashes;
# a dash is required on BOTH sides so a lone negative number such as "-3"
# is not mistaken for a page number.
_PAGE_NUMBER_LINE = re.compile(
    r"^\s*(?:[-–—]\s*\d+\s*[-–—]|(?:page\s+)?\d+)\s*$",
    re.IGNORECASE,
)

# Repeated short lines across pages are likely headers/footers;
# collect them and strip later. Lines longer than this are never
# considered header/footer candidates.
_MAX_HEADER_FOOTER_LENGTH = 80
def extract_text(pdf_path: str | Path) -> ExtractionResult:
    """
    Extract and pre-clean text from a PDF.

    Pipeline: read the plain text of every page, drop lines that repeat
    across pages (running headers/footers) and lone page numbers, join the
    non-empty pages, guess a title, then cut the References section and
    everything after it.

    Args:
        pdf_path: Location of the PDF file on disk.

    Returns:
        An ExtractionResult with cleaned text, page count, and guessed title.
        The page count is the number of pages in the PDF, including pages
        that end up empty after cleaning.

    Raises:
        ImportError: If PyMuPDF is not installed.
        FileNotFoundError: If the PDF does not exist.
    """
    try:
        import fitz  # PyMuPDF
    except ImportError as e:
        raise ImportError("PyMuPDF is required: pip install PyMuPDF") from e

    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    # 1. Extract text per page. The context manager closes the document even
    #    when a page fails to extract, so the file handle is never leaked.
    with fitz.open(str(pdf_path)) as doc:
        num_pages = len(doc)
        page_texts: list[str] = [
            page.get_text("text")  # type: ignore[attr-defined]
            for page in doc
        ]

    # 2. Detect repeated short lines (header / footer candidates)
    candidate_hf = _detect_header_footer_lines(page_texts)

    # 3. Clean each page, skipping pages that are blank after cleaning
    cleaned_pages: list[str] = []
    for text in page_texts:
        cleaned = _clean_page(text, candidate_hf)
        if cleaned.strip():
            cleaned_pages.append(cleaned)

    full_text = "\n\n".join(cleaned_pages)

    # 4. Try to guess title from first non-empty line (done before trimming
    #    so the title is taken from the complete cleaned text)
    title = _guess_title(full_text, pdf_path.stem)

    # 5. Trim everything after References section
    full_text = _trim_references_and_beyond(full_text)

    return ExtractionResult(
        text=full_text,
        num_pages=num_pages,
        title=title,
    )
# ──────────────────────────────────────────────────────── helpers


def _detect_header_footer_lines(page_texts: list[str]) -> set[str]:
    """
    Find short lines that recur across pages (likely running headers/footers).

    A line qualifies when, after stripping whitespace, it is non-empty, at
    most ``_MAX_HEADER_FOOTER_LENGTH`` characters long, and present on at
    least ``max(2, 40% of the pages)`` pages. Each page contributes at most
    one count per distinct line. Inputs with fewer than two pages yield an
    empty set.
    """
    if len(page_texts) < 2:
        return set()

    from collections import Counter

    pages_containing: Counter[str] = Counter()
    for page in page_texts:
        # A set de-duplicates within the page, so every distinct short line
        # is counted once per page no matter how often it occurs there.
        short_lines = {
            candidate
            for candidate in (raw.strip() for raw in page.splitlines())
            if candidate and len(candidate) <= _MAX_HEADER_FOOTER_LENGTH
        }
        pages_containing.update(short_lines)

    min_pages = max(2, len(page_texts) * 0.4)
    return {
        line for line, hits in pages_containing.items() if hits >= min_pages
    }


def _clean_page(text: str, header_footer_lines: set[str]) -> str:
    """
    Return *text* without header/footer lines and lone page numbers.

    Lines are compared in stripped form, but surviving lines are emitted
    with their original whitespace and re-joined with newlines.
    """

    def _is_noise(raw_line: str) -> bool:
        # True for a known header/footer line or an isolated page number.
        bare = raw_line.strip()
        if bare in header_footer_lines:
            return True
        return bool(_PAGE_NUMBER_LINE.match(bare))

    kept = [raw_line for raw_line in text.splitlines() if not _is_noise(raw_line)]
    return "\n".join(kept)


def _trim_references_and_beyond(text: str) -> str:
    """
    Cut the text at the first References heading.

    Everything from that heading onward is discarded and trailing
    whitespace is removed from what remains. Text without such a heading
    is returned unchanged. Acknowledgements are deliberately left in place
    so the LLM can summarise them.
    """
    heading = _REFERENCES_PATTERNS.search(text)
    if heading is None:
        return text
    return text[: heading.start()].rstrip()


def _guess_title(text: str, fallback: str) -> str:
    """
    Pick the first meaningful line of the document as its title.

    The first line whose stripped form is longer than 8 characters is
    returned, capped at 200 characters. If no line qualifies, *fallback*
    is returned.
    """
    stripped_lines = (raw.strip() for raw in text.splitlines())
    return next(
        (candidate[:200] for candidate in stripped_lines if len(candidate) > 8),
        fallback,
    )