Commit 247a3d8

benny-dou <60535774+benny-dou@users.noreply.github.com>
2026-01-25 14:18:24
feat(arxiv): add arXiv link preview support
1 parent 0a7a1f5
Changed files (5)
src/messages/main.py
@@ -32,6 +32,7 @@ from others.search_ytb import search_youtube
 from others.tmdb import search_tmdb
 from others.version import get_bot_version
 from others.watermark import add_watermark
+from preview.arxiv import preview_arxiv
 from preview.bilibili import preview_bilibili
 from preview.douyin import preview_douyin
 from preview.github import preview_github
@@ -168,7 +169,8 @@ async def preview_social_media(
     v2ex: bool = True,  # Parse V2EX
     music163: bool = True,  # Parse Music163
     spotify: bool = True,  # Parse Spotify
-    ytdlp: bool = True,  # Parse YT-DLP
+    ytdlp: bool = True,  # Parse YT-DLP link
+    arxiv: bool = True,  # Parse arXiv
     **kwargs,
 ):
     """Preview social media link in the message.
@@ -270,6 +272,8 @@ async def preview_social_media(
             return await preview_spotify(client, message, **kwargs)
         if v2ex and matched["platform"] == "v2ex":
             return await preview_v2ex(client, message, **kwargs)
+        if arxiv and matched["platform"] == "arxiv":
+            return await preview_arxiv(client, message, **kwargs)
         if matched["platform"].startswith("bilibili-"):  # this is not bilibili video, for videos, use yt-dlp
             return await preview_bilibili(client, message, **kwargs)
 
src/preview/arxiv.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import feedparser
+from glom import Coalesce, glom
+from pyrogram.client import Client
+from pyrogram.types import Message
+
+from config import PROXY
+from messages.progress import modify_progress
+from messages.sender import send2tg
+from networking import download_file, hx_req
+
+HEADERS = {  # feedparser's own User-Agent/Accept values, sent with the arXiv API request below
+    "User-Agent": "feedparser/6.0.11 +https://github.com/kurtmckee/feedparser/",
+    "Accept": "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1",
+}
+
+
+async def preview_arxiv(client: Client, message: Message, url: str, arxiv_id: str, **kwargs):
+    """Preview an arXiv paper in the message.
+
+    Queries the arXiv export API for ``arxiv_id``, downloads the paper's PDF
+    and sends it with a caption (title, authors, date, optional comment and
+    abstract) built from the first entry of the returned feed.
+
+    Args:
+        client: Pyrogram client, forwarded to ``send2tg``.
+        message: Message being answered, forwarded to ``send2tg``.
+        url: arXiv link; shown in the progress text and used as the title link.
+        arxiv_id: arXiv identifier used for the API query and the PDF URL.
+        **kwargs: Forwarded to ``send2tg`` and ``modify_progress``. When
+            ``show_progress`` is truthy and no ``progress`` entry is present,
+            a progress message is sent and stored under ``progress``.
+
+    Returns:
+        None. Returns early when the API request reports ``hx_error`` or
+        yields no response text.
+    """
+    if kwargs.get("show_progress") and "progress" not in kwargs:
+        res = await send2tg(client, message, texts=f"🔗正在解析arXiv链接\n{url}", **kwargs)
+        kwargs["progress"] = res[0]
+    kwargs["send_from_user"] = ""  # disable @send_user
+
+    api = f"https://export.arxiv.org/api/query?id_list={arxiv_id}"
+    resp = await hx_req(api, headers=HEADERS, proxy=PROXY.ARXIV, rformat="text")
+    if "hx_error" in resp:
+        return
+    feed_text = resp.get("text")
+    if not feed_text:
+        await modify_progress(text=f"❌arXiv解析失败: {resp}", force_update=True, **kwargs)
+        # Stop here: without feed text there is nothing to parse, and
+        # indexing resp["text"] below would raise KeyError if the key is absent.
+        return
+    arxiv = feedparser.parse(feed_text)
+
+    # Only the first feed entry is used; every field falls back to an empty value.
+    entry = glom(arxiv, "entries.0", default={})
+
+    title = glom(entry, "title", default="")
+    updated = glom(entry, Coalesce("updated", "published"), default="")
+    abstract = glom(entry, "summary", default="")
+    comment = glom(entry, "arxiv_comment", default="")
+    # Authors without a usable name are skipped.
+    authors = ", ".join(name for author in glom(entry, "authors", default=[]) if (name := author.get("name")))
+    await modify_progress(text="⏬正在下载PDF", force_update=True, **kwargs)
+    pdf = await download_file(f"https://arxiv.org/pdf/{arxiv_id}", suffix=".pdf", proxy=PROXY.ARXIV, stream=True)
+    texts = f"📄**[{title}]({url})**\n👥{authors}\n🕒{updated}\n"
+    if comment:
+        texts += f"📝{comment}\n"
+    texts += f"\n**Abstract**\n{abstract}"
+    await send2tg(client, message, texts=texts, media=[{"document": pdf}], **kwargs)
+    await modify_progress(del_status=True, **kwargs)
src/config.py
@@ -55,6 +55,7 @@ class ENABLE:  # see fine-grained permission in `src/permission.py`
     WECHAT = os.getenv("ENABLE_WECHAT", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
     REDDIT = os.getenv("ENABLE_REDDIT", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
     V2EX = os.getenv("ENABLE_V2EX", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
+    ARXIV = os.getenv("ENABLE_ARXIV", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
     WGET = os.getenv("ENABLE_WGET", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
     GITHUB = os.getenv("ENABLE_GITHUB", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
     MUSIC163 = os.getenv("ENABLE_MUSIC163", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
@@ -207,6 +208,7 @@ class PROXY:  # format: socks5://127.0.0.1:7890
     WARP = os.getenv("WARP_PROXY", None)
     WECHAT = os.getenv("WECHAT_PROXY", None)
     WEIBO = os.getenv("WEIBO_PROXY", None)
+    ARXIV = os.getenv("ARXIV_PROXY", None)
     XHS = os.getenv("XHS_PROXY", None)  # Banned VPS IP, need residential proxy
     YTDLP = os.getenv("YTDLP_PROXY", None)  # general proxy for ytdlp
     YTDLP_FALLBACK = os.getenv("YTDLP_PROXY_FALLBACK", None)  # fallback proxy for ytdlp
src/networking.py
@@ -12,7 +12,7 @@ from urllib.parse import parse_qs, urlparse
 import anyio
 from curl_cffi.requests.impersonate import BrowserTypeLiteral
 from httpx import AsyncClient, AsyncHTTPTransport, HTTPStatusError, Request, RequestError, Response
-from httpx._types import RequestContent, RequestData, RequestFiles  # type: ignore
+from httpx._types import RequestContent, RequestData, RequestFiles
 from httpx_curl_cffi import AsyncCurlTransport, CurlOpt
 from loguru import logger
 
@@ -130,9 +130,9 @@ async def hx_req(
     except Exception as e:
         error = f"{type(e).__name__}[{retry + 1}/{max_retry + 1}]: Failed to request {url}, {e}"
         with contextlib.suppress(Exception):
-            hx_raw = response.json()  # type: ignore
+            hx_raw = response.json()
         if "res" in locals():
-            error += f"\n{res}"  # type: ignore
+            error += f"\n{res}"
         elif "data" in locals():
             error += f"\n{data}"
         logger.error(error)
@@ -443,6 +443,16 @@ async def match_social_media_link(text: str, *, flatten_first: bool = True) -> d
     if matched := re.search(r"(https?://)?(:?m\.|www\.)?reddit\.com/comments/([_A-Za-z0-9]+)", text):
         return {"url": matched.group(0).rstrip("/"), "db_key": bare_url(matched.group(0).rstrip("/")), "platform": "reddit"}
 
+    # https://arxiv.org/abs/2301.12345
+    # https://arxiv.org/pdf/2301.12345v3
+    if matched := re.search(r"(https?://)?arxiv\.org/(abs|pdf)/(\d{4}\.\d{4,5}(?:v\d+)?)", text):
+        url = matched.group(0)
+        arxiv_id = matched.group(3)
+        if "v" not in arxiv_id:
+            arxiv_id += "v1"
+            url += "v1"
+        return {"url": url, "arxiv_id": arxiv_id, "db_key": f"arxiv.org/abs/{arxiv_id}", "platform": "arxiv"}
+
     # if all above pre-defined patterns failed, try to match ytdlp link
     if urls := match_urls(text):
         for url in urls:
src/permission.py
@@ -127,6 +127,7 @@ def check_service(cid: int | str, ctype: str) -> dict:
         "convert_img": True,
         "tts": True,
         "ytb": True,
+        "arxiv": True,
         "google_search": True,
         "show_progress": True,
         "detail_progress": True,
@@ -182,6 +183,8 @@ def check_service(cid: int | str, ctype: str) -> dict:
         permission["reddit"] = False
     if not ENABLE.YTDLP:
         permission["ytdlp"] = False
+    if not ENABLE.ARXIV:
+        permission["arxiv"] = False
     if not ENABLE.AI:
         permission["ai"] = False
     if not ENABLE.ASR: