bennybot/src/preview/arxiv.py at main

  1#!/usr/bin/env python
  2# -*- coding: utf-8 -*-
  3import json
  4import re
  5import shutil
  6from email.utils import parsedate_to_datetime
  7from pathlib import Path
  8
  9import feedparser
 10from bs4 import BeautifulSoup
 11from glom import Coalesce, glom
 12from loguru import logger
 13from pyrogram.client import Client
 14from pyrogram.types import InputMediaDocument, Message
 15
 16from config import AI, CAPTION_LENGTH, DOWNLOAD_DIR, PROXY
 17from messages.progress import modify_progress
 18from messages.sender import send2tg, send_blockquote_texts
 19from messages.utils import smart_split
 20from networking import download_file, hx_req
 21from preview.utils import add_summary_url
 22from summarize.summarize import summarize
 23from utils import nowdt
 24
 25HEADERS = {
 26    "User-Agent": "feedparser/6.0.11 +https://github.com/kurtmckee/feedparser/",
 27    "Accept": "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1",
 28}
 29
 30
 31async def preview_arxiv(
 32    client: Client,
 33    message: Message,
 34    url: str,
 35    arxiv_id: str,
 36    *,
 37    summary_arxiv: bool = True,
 38    summary_arxiv_model: str = AI.AI_SUMMARY_MODEL_ALIAS,
 39    **kwargs,
 40) -> None:
 41    """Preview arxiv in the message."""
 42    status_msg = (await send2tg(client, message, texts=f"🔗正在解析arXiv链接\n{url}", **kwargs))[0]
 43    kwargs["send_from_user"] = ""  # disable @send_user
 44    if not isinstance(status_msg, Message):
 45        return
 46
 47    # First, get the PDF and send it.
 48    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}"
 49    pdf = await download_file(pdf_url, suffix=".pdf", proxy=PROXY.ARXIV, stream=True)
 50    if not pdf:
 51        await modify_progress(status_msg, text="❌下载PDF失败", force_update=True)
 52        return
 53
 54    status_msg = await status_msg.edit_media(media=InputMediaDocument(pdf, caption=f"arXiv: [{arxiv_id}]({url})"))
 55    file_id = glom(status_msg, "document.file_id", default=pdf)
 56    arxiv_info = await get_arxiv_meta(arxiv_id)
 57    if not arxiv_info:
 58        logger.error("❌arXiv API调用失败")
 59        if summary_arxiv:
 60            sources = await extract_arxiv_tex(arxiv_id) + [{"type": "file", "path": pdf, "mime_type": "application/pdf"}]
 61            summary = await summarize(sources=sources, title=f"arXiv-{arxiv_id}", model=summary_arxiv_model, url=url, force_r2_page=True)
 62            await send_blockquote_texts(client, status_msg, texts=summary["texts"], **kwargs)
 63        Path(pdf).unlink(missing_ok=True)
 64        return
 65
 66    title = arxiv_info["title"]
 67    authors = arxiv_info["authors"]
 68    updated = arxiv_info["updated"]
 69    comment = arxiv_info["comment"]
 70    abstract = arxiv_info["abstract"]
 71
 72    texts = f"📄**[{title}]({url})**\n🕒{updated}\n👥{authors}\n"
 73    if comment:
 74        texts += f"📝{comment}"
 75
 76    caption = (await smart_split(texts, CAPTION_LENGTH))[0]
 77    status_msg = await status_msg.edit_media(media=InputMediaDocument(file_id, caption=caption))
 78    if not summary_arxiv:
 79        await send_blockquote_texts(client, status_msg, texts=f"**Abstract**\n{abstract}", **kwargs)
 80    else:
 81        iframe = f'<iframe src="{pdf_url}" width="100%" height="800px" style="border: none; border-radius: 8px;"></iframe>'
 82        ptag = f'<p style="text-align: center;"><a href="{pdf_url}" target="_blank">在新标签页中打开论文</a></p>'
 83        sources = [{"type": "file", "path": pdf, "mime_type": "application/pdf"}, {"type": "text", "text": json.dumps(arxiv_info)}] + await extract_arxiv_tex(arxiv_id)
 84        summary = await summarize(
 85            sources=sources,
 86            title=title,
 87            model=summary_arxiv_model,
 88            author=authors,
 89            date=updated,
 90            url=url,
 91            description={"emoji": "📄", "name": "原始论文", "html": iframe + ptag},
 92            force_r2_page=True,
 93        )
 94        telegraph_url = summary.get("telegraph_url")
 95        if not telegraph_url:
 96            await send_blockquote_texts(client, status_msg, texts=summary.get("texts", ""), **kwargs)
 97        else:
 98            await add_summary_url(telegraph_url, status_msg)
 99
100    Path(pdf).unlink(missing_ok=True)
101
102
async def get_arxiv_meta(arxiv_id: str) -> dict:
    """Fetch metadata for an arXiv paper.

    The standard arXiv Atom API is queried first. When that request does not
    return HTTP 200, or the feed contains no entry, the OAI-PMH endpoint
    (``arXivRaw`` metadata) is used as a fallback.

    Args:
        arxiv_id: arXiv identifier. A trailing version suffix such as ``v2`` is
            removed for the OAI-PMH request.

    Returns:
        A dict with the string keys ``title``, ``authors``, ``published``,
        ``updated``, ``comment`` and ``abstract``; an empty dict when neither
        endpoint yields a record.
    """

    def one_line(text: str) -> str:
        # Collapse line breaks and runs of whitespace so the value fits on a single caption line.
        return " ".join(text.split())

    def to_timestamp(raw: str) -> str:
        # Convert an RFC 2822 style date to "YYYY-MM-DD HH:MM:SS".
        # A missing or unparsable date falls back to the current UTC time.
        moment = nowdt("UTC")
        if raw:
            try:
                moment = parsedate_to_datetime(raw)
            except (TypeError, ValueError):
                logger.warning(f"arXiv {arxiv_id} date parse failed: {raw!r}")
        return moment.strftime("%Y-%m-%d %H:%M:%S")

    # first, get the metadata from the standard arXiv API
    api = f"https://export.arxiv.org/api/query?id_list={arxiv_id}"
    resp = await hx_req(api, headers=HEADERS, proxy=PROXY.ARXIV, rformat="text", timeout=3, max_retry=1)
    if resp.get("status_code") == 200:
        feed = feedparser.parse(resp["text"])
        entry = glom(feed, "entries.0", default={})
        # A 200 response without any entry carries no metadata; returning a dict
        # of empty strings would look like success to the caller, so fall through
        # to the OAI-PMH lookup instead.
        if entry:
            published = glom(entry, "published", default="")
            updated = glom(entry, Coalesce("updated", "published"), default="")
            names = [name for author in glom(entry, "authors", default=[]) if (name := author.get("name"))]
            return {
                "title": one_line(glom(entry, "title", default="")),
                "authors": ", ".join(names),
                # "2024-01-01T00:00:00Z" -> "2024-01-01 00:00:00"
                "published": published.replace("T", " ").rstrip("Z"),
                "updated": updated.replace("T", " ").rstrip("Z"),
                "comment": one_line(glom(entry, "arxiv_comment", default="")),
                "abstract": glom(entry, "summary", default=""),
            }

    logger.warning("❌arXiv standard API调用失败，回退到Open Archives Initiative")
    clean_id = re.sub(r"v\d+$", "", arxiv_id)
    api = f"https://oaipmh.arxiv.org/oai?verb=GetRecord&identifier=oai:arXiv.org:{clean_id}&metadataPrefix=arXivRaw"
    resp = await hx_req(api, headers=HEADERS, proxy=PROXY.ARXIV, rformat="text", timeout=3, max_retry=1)
    if resp.get("status_code") != 200:
        return {}

    soup = BeautifulSoup(resp["text"], "xml")
    title = one_line(glom(soup, "title.text", default=""))
    if not title:
        # NOTE(review): presumably an OAI-PMH error payload (e.g. unknown identifier)
        # delivered with HTTP 200 - there is no record to report.
        return {}

    # The first <version> element gives the publication date, the last one the latest update.
    versions = soup.find_all("version")
    return {
        "title": title,
        "authors": glom(soup, "authors.text", default="").strip(),
        "published": to_timestamp(glom(versions, "0.date.text", default="")),
        "updated": to_timestamp(glom(versions, "-1.date.text", default="")),
        "comment": one_line(glom(soup, "comments.text", default="")),
        "abstract": glom(soup, "abstract.text", default="").strip(),
    }
151
152
async def extract_arxiv_tex(arxiv_id: str) -> list[dict]:
    """Download the arXiv source archive and return its TeX/BibTeX files as text sources.

    The archive is saved under ``DOWNLOAD_DIR``, unpacked into a directory named
    after ``arxiv_id``, and both are removed again before returning.

    Args:
        arxiv_id: arXiv identifier used to build the source URL and local paths.

    Returns:
        A list of ``{"type": "text", "text": <json>}`` dicts, where ``<json>`` is a
        JSON object with ``filename`` and ``content``. Files named ``main.tex``
        come first, then the remaining ``.tex`` files sorted by name, then ``.bib``
        files; each file appears once. An empty list is returned when the download
        or extraction fails.
    """

    def remove_comments(content: str) -> str:
        # Drop everything from an unescaped "%" to the end of the line,
        # then squeeze the blank-line runs this leaves behind.
        content = re.sub(r"(?<!\\)%.*$", "", content, flags=re.MULTILINE)
        return re.sub(r"\n\s*\n", "\n\n", content)

    arxiv_dir = Path(DOWNLOAD_DIR) / arxiv_id
    save_path = Path(DOWNLOAD_DIR) / f"{arxiv_id}.tar.gz"
    sources: list[dict] = []
    try:
        await download_file(f"https://arxiv.org/src/{arxiv_id}", save_path, proxy=PROXY.ARXIV, stream=True)
        shutil.rmtree(arxiv_dir, ignore_errors=True)
        arxiv_dir.mkdir(parents=True, exist_ok=True)
        # NOTE(review): the archive is unpacked without an extraction filter;
        # consider passing filter="data" where the running Python supports it.
        shutil.unpack_archive(save_path, arxiv_dir)

        # Walk the tree once. is_file() must be *called*: the bare attribute is
        # always truthy and would let directories through to read_text().
        regular_files = [f for f in arxiv_dir.rglob("*") if f.is_file()]
        main_tex = [f for f in regular_files if f.name == "main.tex"]
        tex_files = sorted((f for f in regular_files if f.suffix == ".tex"), key=lambda x: x.name)
        bib_files = [f for f in regular_files if f.suffix == ".bib"]

        # main.tex is also part of tex_files; dict.fromkeys keeps the first
        # occurrence of each path so it is emitted once, at the front.
        for f in dict.fromkeys(main_tex + tex_files + bib_files):
            # Decode leniently so a single non-UTF-8 file cannot abort the extraction.
            content = f.read_text(encoding="utf-8", errors="replace").strip()
            if f.suffix == ".tex":
                content = remove_comments(content)
            sources.append({"type": "text", "text": json.dumps({"filename": f.name, "content": content})})
    except Exception as e:
        logger.error(f"❌arXiv {arxiv_id} 提取 tex 失败: {e}")
        sources = []
    finally:
        save_path.unlink(missing_ok=True)
        shutil.rmtree(arxiv_dir, ignore_errors=True)
    return sources