Commit 247a3d8
Changed files (5)
src
src/messages/main.py
@@ -32,6 +32,7 @@ from others.search_ytb import search_youtube
from others.tmdb import search_tmdb
from others.version import get_bot_version
from others.watermark import add_watermark
+from preview.arxiv import preview_arxiv
from preview.bilibili import preview_bilibili
from preview.douyin import preview_douyin
from preview.github import preview_github
@@ -168,7 +169,8 @@ async def preview_social_media(
v2ex: bool = True, # Parse V2EX
music163: bool = True, # Parse Music163
spotify: bool = True, # Parse Spotify
- ytdlp: bool = True, # Parse YT-DLP
+ ytdlp: bool = True, # Parse YT-DLP link
+ arxiv: bool = True, # Parse arXiv
**kwargs,
):
"""Preview social media link in the message.
@@ -270,6 +272,8 @@ async def preview_social_media(
return await preview_spotify(client, message, **kwargs)
if v2ex and matched["platform"] == "v2ex":
return await preview_v2ex(client, message, **kwargs)
+ if arxiv and matched["platform"] == "arxiv":
+ return await preview_arxiv(client, message, **kwargs)
if matched["platform"].startswith("bilibili-"): # this is not bilibili video, for videos, use yt-dlp
return await preview_bilibili(client, message, **kwargs)
src/preview/arxiv.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import feedparser
+from glom import Coalesce, glom
+from pyrogram.client import Client
+from pyrogram.types import Message
+
+from config import PROXY
+from messages.progress import modify_progress
+from messages.sender import send2tg
+from networking import download_file, hx_req
+
+HEADERS = {  # sent with the arXiv API request below; identifies the client as feedparser and asks for Atom/RDF/RSS/XML
+ "User-Agent": "feedparser/6.0.11 +https://github.com/kurtmckee/feedparser/",
+ "Accept": "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1",
+}
+
+
+async def preview_arxiv(client: Client, message: Message, url: str, arxiv_id: str, **kwargs):
+    """Preview an arXiv paper: reply with its metadata and the PDF as a document.
+
+    Queries the arXiv export API for ``arxiv_id``, parses the response with
+    feedparser, downloads the PDF and sends title, authors, date, optional
+    comment and abstract together with the file via ``send2tg``.
+
+    Args:
+        client: Pyrogram client, passed through to ``send2tg``.
+        message: Message being processed, passed through to ``send2tg``.
+        url: arXiv link shown in the progress text and used as the title link.
+        arxiv_id: Paper id used for the API query and for the PDF URL.
+        **kwargs: Forwarded to ``send2tg`` / ``modify_progress``. ``show_progress``
+            and ``progress`` are read here; ``progress`` and ``send_from_user``
+            may be overwritten.
+
+    Returns:
+        None. Returns early, without sending the paper, when the API request
+        fails, the response has no text, or the feed contains no entry.
+    """
+    if kwargs.get("show_progress") and "progress" not in kwargs:
+        res = await send2tg(client, message, texts=f"🔗正在解析arXiv链接\n{url}", **kwargs)
+        kwargs["progress"] = res[0]
+        # NOTE(review): the diff rendering lost indentation; this assumes the reset
+        # belongs to the progress branch - confirm against the real file.
+        kwargs["send_from_user"] = ""  # disable @send_user
+
+    api = f"https://export.arxiv.org/api/query?id_list={arxiv_id}"
+    resp = await hx_req(api, headers=HEADERS, proxy=PROXY.ARXIV, rformat="text")
+    if "hx_error" in resp:
+        return
+    if not resp.get("text"):
+        await modify_progress(text=f"❌arXiv解析失败: {resp}", force_update=True, **kwargs)
+        # Stop here: without this return the code below would index resp["text"]
+        # and go on to send a preview with no metadata.
+        return
+
+    arxiv = feedparser.parse(resp["text"])
+    entry = glom(arxiv, "entries.0", default={})
+    if not entry:
+        # The feed parsed but holds no paper (e.g. unknown id): nothing to preview.
+        await modify_progress(text=f"❌arXiv解析失败: {arxiv_id}", force_update=True, **kwargs)
+        return
+
+    title = glom(entry, "title", default="")
+    # Prefer the last-updated timestamp, fall back to the original publication date.
+    updated = glom(entry, Coalesce("updated", "published"), default="")
+    abstract = glom(entry, "summary", default="")
+    comment = glom(entry, "arxiv_comment", default="")
+    # Skip author records that carry no name.
+    authors = ", ".join(name for author in glom(entry, "authors", default=[]) if (name := author.get("name")))
+
+    await modify_progress(text="⏬正在下载PDF", force_update=True, **kwargs)
+    pdf = await download_file(f"https://arxiv.org/pdf/{arxiv_id}", suffix=".pdf", proxy=PROXY.ARXIV, stream=True)
+
+    texts = f"📄**[{title}]({url})**\n👥{authors}\n🕒{updated}\n"
+    if comment:
+        texts += f"📝{comment}\n"
+    texts += f"\n**Abstract**\n{abstract}"
+    await send2tg(client, message, texts=texts, media=[{"document": pdf}], **kwargs)
+    await modify_progress(del_status=True, **kwargs)
src/config.py
@@ -55,6 +55,7 @@ class ENABLE: # see fine-grained permission in `src/permission.py`
WECHAT = os.getenv("ENABLE_WECHAT", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
REDDIT = os.getenv("ENABLE_REDDIT", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
V2EX = os.getenv("ENABLE_V2EX", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
+ ARXIV = os.getenv("ENABLE_ARXIV", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
WGET = os.getenv("ENABLE_WGET", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
GITHUB = os.getenv("ENABLE_GITHUB", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
MUSIC163 = os.getenv("ENABLE_MUSIC163", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
@@ -207,6 +208,7 @@ class PROXY: # format: socks5://127.0.0.1:7890
WARP = os.getenv("WARP_PROXY", None)
WECHAT = os.getenv("WECHAT_PROXY", None)
WEIBO = os.getenv("WEIBO_PROXY", None)
+ ARXIV = os.getenv("ARXIV_PROXY", None)
XHS = os.getenv("XHS_PROXY", None) # Banned VPS IP, need residential proxy
YTDLP = os.getenv("YTDLP_PROXY", None) # general proxy for ytdlp
YTDLP_FALLBACK = os.getenv("YTDLP_PROXY_FALLBACK", None) # fallback proxy for ytdlp
src/networking.py
@@ -12,7 +12,7 @@ from urllib.parse import parse_qs, urlparse
import anyio
from curl_cffi.requests.impersonate import BrowserTypeLiteral
from httpx import AsyncClient, AsyncHTTPTransport, HTTPStatusError, Request, RequestError, Response
-from httpx._types import RequestContent, RequestData, RequestFiles # type: ignore
+from httpx._types import RequestContent, RequestData, RequestFiles
from httpx_curl_cffi import AsyncCurlTransport, CurlOpt
from loguru import logger
@@ -130,9 +130,9 @@ async def hx_req(
except Exception as e:
error = f"{type(e).__name__}[{retry + 1}/{max_retry + 1}]: Failed to request {url}, {e}"
with contextlib.suppress(Exception):
- hx_raw = response.json() # type: ignore
+ hx_raw = response.json()
if "res" in locals():
- error += f"\n{res}" # type: ignore
+ error += f"\n{res}"
elif "data" in locals():
error += f"\n{data}"
logger.error(error)
@@ -443,6 +443,16 @@ async def match_social_media_link(text: str, *, flatten_first: bool = True) -> d
if matched := re.search(r"(https?://)?(:?m\.|www\.)?reddit\.com/comments/([_A-Za-z0-9]+)", text):
return {"url": matched.group(0).rstrip("/"), "db_key": bare_url(matched.group(0).rstrip("/")), "platform": "reddit"}
+ # https://arxiv.org/abs/2301.12345
+ # https://arxiv.org/pdf/2301.12345v3
+ if matched := re.search(r"(https?://)?arxiv\.org/(abs|pdf)/(\d{4}\.\d{4,5}(?:v\d+)?)", text):
+ url = matched.group(0)
+ arxiv_id = matched.group(3)
+ if "v" not in arxiv_id:
+ arxiv_id += "v1"
+ url += "v1"
+ return {"url": url, "arxiv_id": arxiv_id, "db_key": f"arxiv.org/abs/{arxiv_id}", "platform": "arxiv"}
+
# if all above pre-defined patterns failed, try to match ytdlp link
if urls := match_urls(text):
for url in urls:
src/permission.py
@@ -127,6 +127,7 @@ def check_service(cid: int | str, ctype: str) -> dict:
"convert_img": True,
"tts": True,
"ytb": True,
+ "arxiv": True,
"google_search": True,
"show_progress": True,
"detail_progress": True,
@@ -182,6 +183,8 @@ def check_service(cid: int | str, ctype: str) -> dict:
permission["reddit"] = False
if not ENABLE.YTDLP:
permission["ytdlp"] = False
+ if not ENABLE.ARXIV:
+ permission["arxiv"] = False
if not ENABLE.AI:
permission["ai"] = False
if not ENABLE.ASR: