main
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3import json
4import re
5import shutil
6from email.utils import parsedate_to_datetime
7from pathlib import Path
8
9import feedparser
10from bs4 import BeautifulSoup
11from glom import Coalesce, glom
12from loguru import logger
13from pyrogram.client import Client
14from pyrogram.types import InputMediaDocument, Message
15
16from config import AI, CAPTION_LENGTH, DOWNLOAD_DIR, PROXY
17from messages.progress import modify_progress
18from messages.sender import send2tg, send_blockquote_texts
19from messages.utils import smart_split
20from networking import download_file, hx_req
21from preview.utils import add_summary_url
22from summarize.summarize import summarize
23from utils import nowdt
24
# Request headers sent with the arXiv metadata requests in this module.
# The User-Agent string identifies the client as feedparser 6.0.11; the Accept
# value lists the feed/XML media types, most preferred first.
HEADERS = {
    "User-Agent": "feedparser/6.0.11 +https://github.com/kurtmckee/feedparser/",
    "Accept": ",".join(
        (
            "application/atom+xml",
            "application/rdf+xml",
            "application/rss+xml",
            "application/x-netcdf",
            "application/xml;q=0.9",
            "text/xml;q=0.2",
            "*/*;q=0.1",
        )
    ),
}
29
30
async def preview_arxiv(
    client: Client,
    message: Message,
    url: str,
    arxiv_id: str,
    *,
    summary_arxiv: bool = True,
    summary_arxiv_model: str = AI.AI_SUMMARY_MODEL_ALIAS,
    **kwargs,
) -> None:
    """Preview an arXiv paper in reply to ``message``.

    Sends a status message, downloads the paper's PDF and puts it into that
    status message as a document, then fetches the paper metadata to build the
    caption. Depending on ``summary_arxiv`` it either sends the abstract or an
    AI summary built from the PDF, the metadata and the TeX sources.

    Args:
        client: Client used to send the messages.
        message: Message that triggered the preview.
        url: arXiv URL as given by the user; used in captions and the summary.
        arxiv_id: arXiv identifier used to build the PDF/API/source URLs.
        summary_arxiv: When True, summarize the paper; when False, send the
            abstract as a blockquote (only possible when metadata is available).
        summary_arxiv_model: Model alias passed to ``summarize``.
        **kwargs: Forwarded to ``send2tg`` and ``send_blockquote_texts``.
            ``send_from_user`` is overwritten with ``""`` after the first send.
    """
    status_msg = (await send2tg(client, message, texts=f"🔗正在解析arXiv链接\n{url}", **kwargs))[0]
    kwargs["send_from_user"] = ""  # disable @send_user
    if not isinstance(status_msg, Message):
        return

    # First, get the PDF and send it.
    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}"
    pdf = await download_file(pdf_url, suffix=".pdf", proxy=PROXY.ARXIV, stream=True)
    if not pdf:
        await modify_progress(status_msg, text="❌下载PDF失败", force_update=True)
        return

    pdf_source = {"type": "file", "path": pdf, "mime_type": "application/pdf"}
    # Everything below works with the downloaded file; the ``finally`` clause
    # guarantees it is removed on every exit path, including exceptions raised
    # by Telegram, the metadata lookup or the summarizer.
    try:
        status_msg = await status_msg.edit_media(media=InputMediaDocument(pdf, caption=f"arXiv: [{arxiv_id}]({url})"))
        # Reuse the uploaded document's file_id for later edits when present,
        # otherwise fall back to the local path.
        file_id = glom(status_msg, "document.file_id", default=pdf)
        arxiv_info = await get_arxiv_meta(arxiv_id)
        if not arxiv_info:
            # No metadata: the caption stays as is; optionally summarize from
            # the TeX sources and the PDF alone.
            logger.error("❌arXiv API调用失败")
            if summary_arxiv:
                sources = await extract_arxiv_tex(arxiv_id) + [pdf_source]
                summary = await summarize(sources=sources, title=f"arXiv-{arxiv_id}", model=summary_arxiv_model, url=url, force_r2_page=True)
                await send_blockquote_texts(client, status_msg, texts=summary.get("texts", ""), **kwargs)
            return

        title = arxiv_info["title"]
        authors = arxiv_info["authors"]
        updated = arxiv_info["updated"]
        comment = arxiv_info["comment"]
        abstract = arxiv_info["abstract"]

        texts = f"📄**[{title}]({url})**\n🕒{updated}\n👥{authors}\n"
        if comment:
            texts += f"📝{comment}"

        # Only the first chunk fits into the document caption.
        caption = (await smart_split(texts, CAPTION_LENGTH))[0]
        status_msg = await status_msg.edit_media(media=InputMediaDocument(file_id, caption=caption))
        if not summary_arxiv:
            await send_blockquote_texts(client, status_msg, texts=f"**Abstract**\n{abstract}", **kwargs)
            return

        # HTML snippet embedding the original PDF, handed to the summarizer as
        # the page description.
        iframe = f'<iframe src="{pdf_url}" width="100%" height="800px" style="border: none; border-radius: 8px;"></iframe>'
        ptag = f'<p style="text-align: center;"><a href="{pdf_url}" target="_blank">在新标签页中打开论文</a></p>'
        sources = [pdf_source, {"type": "text", "text": json.dumps(arxiv_info)}] + await extract_arxiv_tex(arxiv_id)
        summary = await summarize(
            sources=sources,
            title=title,
            model=summary_arxiv_model,
            author=authors,
            date=updated,
            url=url,
            description={"emoji": "📄", "name": "原始论文", "html": iframe + ptag},
            force_r2_page=True,
        )
        telegraph_url = summary.get("telegraph_url")
        if not telegraph_url:
            # No summary page URL: send the summary text directly.
            await send_blockquote_texts(client, status_msg, texts=summary.get("texts", ""), **kwargs)
        else:
            await add_summary_url(telegraph_url, status_msg)
    finally:
        Path(pdf).unlink(missing_ok=True)
101
102
async def get_arxiv_meta(arxiv_id: str) -> dict:
    """Get arXiv metadata for ``arxiv_id``.

    The standard arXiv query API is tried first. When it does not answer with
    HTTP 200, or its feed contains no entry, the OAI-PMH ``GetRecord`` endpoint
    (``arXivRaw`` metadata) is queried instead.

    Args:
        arxiv_id: arXiv identifier. A trailing version suffix such as ``v2`` is
            stripped before the OAI-PMH request.

    Returns:
        A dict with the string keys ``title``, ``authors``, ``published``,
        ``updated``, ``comment`` and ``abstract``, or an empty dict when
        neither endpoint yields a result.
    """
    # First, get the metadata from the standard arXiv API.
    api = f"https://export.arxiv.org/api/query?id_list={arxiv_id}"
    resp = await hx_req(api, headers=HEADERS, proxy=PROXY.ARXIV, rformat="text", timeout=3, max_retry=1)
    if resp.get("status_code") == 200:
        arxiv = feedparser.parse(resp["text"])
        entry = glom(arxiv, "entries.0", default={})
        # A 200 response whose feed has no entry carries no metadata. Returning
        # a dict of empty strings would look like a success to the caller, so
        # fall through to the OAI-PMH fallback instead.
        if entry:
            title = glom(entry, "title", default="")
            published = glom(entry, "published", default="")
            updated = glom(entry, Coalesce("updated", "published"), default="")
            # Turn "2020-01-01T12:00:00Z" style timestamps into "2020-01-01 12:00:00".
            published = published.replace("T", " ").rstrip("Z")
            updated = updated.replace("T", " ").rstrip("Z")
            abstract = glom(entry, "summary", default="")
            comment = glom(entry, "arxiv_comment", default="")
            authors = ", ".join(name for author in glom(entry, "authors", default=[]) if (name := author.get("name")))
            return {"title": title, "authors": authors, "published": published, "updated": updated, "comment": comment, "abstract": abstract}

    logger.warning("❌arXiv standard API调用失败,回退到Open Archives Initiative")
    # The OAI identifier is built from the id without its version suffix.
    clean_id = re.sub(r"v\d+$", "", arxiv_id)
    api = f"https://oaipmh.arxiv.org/oai?verb=GetRecord&identifier=oai:arXiv.org:{clean_id}&metadataPrefix=arXivRaw"
    resp = await hx_req(api, headers=HEADERS, proxy=PROXY.ARXIV, rformat="text", timeout=3, max_retry=1)
    if resp.get("status_code") == 200:
        soup = BeautifulSoup(resp["text"], "xml")
        title = glom(soup, "title.text", default="").strip().replace("\n", " ")
        authors = glom(soup, "authors.text", default="").strip()
        abstract = glom(soup, "abstract.text", default="").strip()
        comment = glom(soup, "comments.text", default="").strip().replace("\n", " ")
        versions = soup.find_all("version")
        # Dates default to the current UTC time when the record has no version
        # date; the first version gives "published", the last one "updated".
        pub_dt = nowdt("UTC")
        upd_dt = nowdt("UTC")
        if published := glom(versions, "0.date.text", default=""):
            pub_dt = parsedate_to_datetime(published)
        if updated := glom(versions, "-1.date.text", default=""):
            upd_dt = parsedate_to_datetime(updated)
        return {
            "title": title,
            "authors": authors,
            "published": pub_dt.strftime("%Y-%m-%d %H:%M:%S"),
            "updated": upd_dt.strftime("%Y-%m-%d %H:%M:%S"),
            "comment": comment,
            "abstract": abstract,
        }
    return {}
151
152
async def extract_arxiv_tex(arxiv_id: str) -> list[dict]:
    """Download the arXiv source archive and return its TeX/BibTeX files as text sources.

    The archive is saved under ``DOWNLOAD_DIR``, unpacked into a directory named
    after ``arxiv_id`` and removed again (archive and directory) before returning.

    Args:
        arxiv_id: arXiv identifier used to build the source URL and local paths.

    Returns:
        A list of ``{"type": "text", "text": <json>}`` dicts, where the JSON
        holds ``filename`` and ``content``. Files named ``main.tex`` come first,
        then the remaining ``.tex`` files sorted by name, then ``.bib`` files.
        An empty list is returned when downloading, unpacking or reading fails.
    """

    def remove_comments(content: str) -> str:
        """Drop unescaped ``%`` comments and collapse whitespace-only gaps to one blank line."""
        content = re.sub(r"(?<!\\)%.*$", "", content, flags=re.MULTILINE)
        return re.sub(r"\n\s*\n", "\n\n", content)

    arxiv_dir = Path(DOWNLOAD_DIR) / arxiv_id
    save_path = Path(DOWNLOAD_DIR) / f"{arxiv_id}.tar.gz"
    sources: list[dict] = []
    try:
        await download_file(f"https://arxiv.org/src/{arxiv_id}", save_path, proxy=PROXY.ARXIV, stream=True)
        # Start from an empty directory so leftovers from an earlier run are not picked up.
        shutil.rmtree(arxiv_dir, ignore_errors=True)
        arxiv_dir.mkdir(parents=True, exist_ok=True)
        # NOTE(review): the archive is author-uploaded content. Consider passing
        # an extraction filter (``filter="data"``) once the minimum supported
        # Python version is confirmed to accept it.
        shutil.unpack_archive(save_path, arxiv_dir)

        # Walk the tree once; ``is_file()`` must be called, otherwise directories
        # named like "*.tex" would pass the filter and break reading.
        all_files = [f for f in arxiv_dir.rglob("*") if f.is_file()]
        main_tex = [f for f in all_files if f.name == "main.tex"]
        # main.tex is already listed first, so leave it out here to avoid
        # sending the same file twice.
        other_tex = sorted(
            (f for f in all_files if f.suffix == ".tex" and f.name != "main.tex"),
            key=lambda x: x.name,
        )
        bib_files = [f for f in all_files if f.suffix == ".bib"]

        for f in main_tex + other_tex + bib_files:
            # Decode explicitly and tolerate stray bytes: one legacy-encoded
            # file should not discard the whole extraction.
            content = f.read_text(encoding="utf-8", errors="replace").strip()
            if f.suffix == ".tex":
                content = remove_comments(content)
            sources.append({"type": "text", "text": json.dumps({"filename": f.name, "content": content})})
    except Exception as e:
        # Best effort: the caller can still work from the PDF alone.
        logger.error(f"❌arXiv {arxiv_id} 提取 tex 失败: {e}")
        sources = []
    finally:
        Path(save_path).unlink(missing_ok=True)
        shutil.rmtree(arxiv_dir, ignore_errors=True)
    return sources
186 return sources