Source code for audia.agents.research

"""
ArXiv paper search and download.

Primary:  arxiv Python SDK.
Fallback: HTML scrape of arxiv.org/search (used when the API call fails, e.g. with HTTP 429).
"""

from __future__ import annotations

import calendar
import html as _html
import re
import urllib.parse
import urllib.request
from dataclasses import dataclass
from pathlib import Path

from rich import print as rprint

from audia.config import get_settings


@dataclass
class ArxivPaper:
    """Plain record describing a single ArXiv search hit.

    Attributes:
        arxiv_id: ArXiv identifier of the paper.
        title: Paper title.
        authors: Author names, in listing order.
        abstract: Abstract / summary text.
        pdf_url: URL of the PDF.
        published: Publication date as text (see the field comment).
        local_pdf_path: Filesystem path of a downloaded copy; ``None`` until
            one has been stored.
    """

    # Identification and bibliographic metadata.
    arxiv_id: str
    title: str
    authors: list[str]
    abstract: str
    pdf_url: str
    # ISO date string (YYYY-MM-DD) when built from the SDK result; the HTML
    # fallback in this module stores "Mon YYYY" or "" instead.
    published: str
    # Filled in once the PDF is available on disk.
    local_pdf_path: str | None = None
class ArxivSearcher:
    """Search ArXiv and download PDFs.

    ``search`` tries the ``arxiv`` SDK first and, if that call raises,
    scrapes the public search page on arxiv.org instead. ``download_pdf``
    fetches ``https://arxiv.org/pdf/<id>`` directly.
    """

    # Patterns used by the HTML fallback, compiled once rather than per result.
    _RESULT_RE = re.compile(r'<li class="arxiv-result">(.*?)</li>', re.DOTALL)
    # Accepts new-style ids ("2301.00001v2") as well as old-style ids with an
    # archive prefix ("hep-th/9901001"); download_pdf already expects that an
    # id may contain "/".
    _ID_RE = re.compile(r"arxiv\.org/abs/([\w.\-]+(?:/[\w.\-]+)?)")
    _TITLE_RE = re.compile(r'<p class="title[^"]*">(.*?)</p>', re.DOTALL)
    _AUTHORS_RE = re.compile(r'<p class="authors">(.*?)</p>', re.DOTALL)
    _ABSTRACT_RE = re.compile(r'<span class="abstract-[^"]*">(.*?)</span>', re.DOTALL)
    _TAG_RE = re.compile(r"<[^>]+>")
    _AUTHORS_LABEL_RE = re.compile(r"^\s*Authors:\s*")
    _NEW_STYLE_DATE_RE = re.compile(r"(\d{2})(\d{2})\.")

    def __init__(self, max_results: int | None = None):
        """Create a searcher.

        Args:
            max_results: Upper bound on papers returned by ``search``. A falsy
                value (``None`` or ``0``) selects ``arxiv_max_results`` from
                the application settings.
        """
        self._max_results = max_results or get_settings().arxiv_max_results

    def search(self, query: str) -> list[ArxivPaper]:
        """Search ArXiv for *query* and return up to ``max_results`` papers.

        Falls back to HTML scraping if the SDK call raises (e.g. HTTP 429).

        Args:
            query: Free-text search query.

        Returns:
            Papers in the order delivered by the backend that answered.

        Raises:
            ImportError: If the ``arxiv`` package is not installed.
        """
        try:
            import arxiv  # type: ignore
        except ImportError as e:
            raise ImportError("arxiv package required: pip install arxiv") from e

        try:
            client = arxiv.Client()
            search = arxiv.Search(
                query=query,
                max_results=self._max_results,
                sort_by=arxiv.SortCriterion.Relevance,
            )
            # The comprehension stays inside the try block: the SDK fetches
            # lazily, so errors may surface while iterating.
            return [
                ArxivPaper(
                    arxiv_id=r.get_short_id(),
                    title=r.title,
                    authors=[a.name for a in r.authors],
                    abstract=r.summary,
                    pdf_url=r.pdf_url,
                    published=r.published.date().isoformat() if r.published else "",
                )
                for r in client.results(search)
            ]
        except Exception as exc:
            # Deliberately broad: any SDK failure switches to the HTML fallback.
            if "429" in str(exc):
                rprint(
                    "[yellow]Page request resulted in HTTP 429"
                    " — starting alternative search…[/yellow]"
                )
            else:
                rprint("[yellow]ArXiv API unavailable — starting alternative search…[/yellow]")
            return self._html_search(query)

    def _html_search(self, query: str) -> list[ArxivPaper]:
        """Fallback: scrape arxiv.org/search HTML when the API is unavailable.

        Exceptions raised by ``urllib.request.urlopen`` propagate to the caller.
        """
        q = urllib.parse.quote_plus(query)
        url = f"https://arxiv.org/search/?query={q}&searchtype=all&source=header&start=0"
        req = urllib.request.Request(url, headers={"User-Agent": "audia/0.1 (research fallback)"})
        with urllib.request.urlopen(req, timeout=40) as resp:
            body = resp.read().decode("utf-8", errors="replace")
        return self._parse_results(body)

    def _parse_results(self, body: str) -> list[ArxivPaper]:
        """Turn the HTML of a search results page into ``ArxivPaper`` records."""
        papers: list[ArxivPaper] = []
        for block in self._RESULT_RE.findall(body):
            id_m = self._ID_RE.search(block)
            if not id_m:
                # Without an id there is nothing to download later; skip it.
                continue
            arxiv_id = id_m.group(1)

            title_m = self._TITLE_RE.search(block)
            title = self._TAG_RE.sub("", title_m.group(1)).strip() if title_m else arxiv_id

            authors_m = self._AUTHORS_RE.search(block)
            authors = self._parse_authors(authors_m.group(1)) if authors_m else []

            abstract_m = self._ABSTRACT_RE.search(block)
            abstract = self._TAG_RE.sub("", abstract_m.group(1)).strip() if abstract_m else ""

            papers.append(
                ArxivPaper(
                    arxiv_id=arxiv_id,
                    # Entities are decoded only after tags were stripped so an
                    # escaped "<" is never mistaken for markup.
                    title=_html.unescape(title),
                    authors=authors,
                    abstract=_html.unescape(abstract),
                    pdf_url=f"https://arxiv.org/pdf/{arxiv_id}",
                    published=self._published_from_id(arxiv_id),
                )
            )
            if len(papers) >= self._max_results:
                break
        return papers

    @classmethod
    def _parse_authors(cls, fragment: str) -> list[str]:
        """Extract author names from the inner HTML of the authors paragraph."""
        text = cls._TAG_RE.sub("", fragment)
        # NOTE(review): arXiv's result markup appears to prefix the list with
        # an "Authors:" label; drop it so it does not end up glued to the
        # first name. No-op when the label is absent.
        text = cls._AUTHORS_LABEL_RE.sub("", text)
        names = (part.strip() for part in text.split(","))
        return [_html.unescape(name) for name in names if name]

    @classmethod
    def _published_from_id(cls, arxiv_id: str) -> str:
        """Derive "Mon YYYY" from a new-style id (YYMM.number), else ``""``."""
        date_m = cls._NEW_STYLE_DATE_RE.match(arxiv_id)
        if not date_m:
            return ""
        yy, mm = int(date_m.group(1)), int(date_m.group(2))
        if not 1 <= mm <= 12:
            # month_abbr[0] is "" and indexes above 12 raise IndexError.
            return ""
        return f"{calendar.month_abbr[mm]} {2000 + yy}"

    def download_pdf(self, paper: ArxivPaper, dest_dir: str | Path | None = None) -> Path:
        """Download the PDF for *paper* directly from arxiv.org/pdf/<id>.

        Bypasses the arxiv SDK export API entirely to avoid HTTP 429
        rate-limits. Skips the download if the file already exists.

        Args:
            paper: Paper to fetch; its ``local_pdf_path`` is updated in place.
            dest_dir: Target directory. A falsy value selects ``upload_dir``
                from the application settings.

        Returns:
            Path of the PDF on disk.

        Exceptions raised by ``urllib.request.urlopen`` propagate to the caller.
        """
        # Settings are only consulted when no explicit directory was given.
        dest = Path(dest_dir) if dest_dir else get_settings().upload_dir
        dest.mkdir(parents=True, exist_ok=True)

        # Old-style ids contain "/", which must not create a sub-directory.
        filename = f"{paper.arxiv_id.replace('/', '_')}.pdf"
        target = dest / filename
        if target.exists():
            paper.local_pdf_path = str(target)
            return target

        pdf_url = f"https://arxiv.org/pdf/{paper.arxiv_id}"
        req = urllib.request.Request(
            pdf_url,
            headers={
                "User-Agent": "audia/0.1 (PDF download)",
                "Accept": "application/pdf,*/*",
            },
        )

        # Write to a sibling ".part" file and rename on success. Writing to
        # `target` directly would leave a truncated PDF behind after a failed
        # transfer, which the exists() shortcut above would then treat as a
        # finished download.
        partial = target.with_name(target.name + ".part")
        try:
            with urllib.request.urlopen(req, timeout=30) as resp:
                partial.write_bytes(resp.read())
            partial.replace(target)
        finally:
            # Only still present when the download or rename failed.
            if partial.exists():
                partial.unlink()

        paper.local_pdf_path = str(target)
        return target