main
1#!/venv/bin/python
2# -*- coding: utf-8 -*-
3import asyncio
4import json
5import warnings
6from pathlib import Path
7from typing import Any
8
9from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
10from glom import Coalesce, glom
11from loguru import logger
12from pyrogram.client import Client
13from pyrogram.types import Message
14
15from ai.texts.contexts import MARKDOWN_EXT, TXT_EXT, full_chain_contexts, is_multi_user_chat, message_bytes
16from config import AI, ASR, PREFIX, PROXY, TZ
17from database.r2 import get_cf_r2
18from messages.database import copy_messages_from_db
19from messages.help import social_media_help
20from messages.parser import parse_msg
21from messages.sender import send2tg, send_blockquote_texts
22from messages.utils import delete_message, equal_prefix, set_reaction, startswith_prefix
23from networking import match_social_media_link
24from others.download_external import AUDIO_FORMAT, VIDEO_FORMAT
25from preview.arxiv import preview_arxiv
26from preview.bilibili import make_bvid_clickable, preview_bilibili
27from preview.douyin import preview_douyin
28from preview.instagram import preview_instagram
29from preview.twitter import preview_twitter
30from preview.v2ex import preview_v2ex
31from preview.wechat import preview_wechat
32from preview.weibo import preview_weibo
33from preview.xiaohongshu import preview_xhs
34from summarize.summarize import summarize
35from utils import convert2md, nowstr, read_text, soup_to_text, ts_to_dt
36from ytdlp.download import ytdlp_download
37from ytdlp.utils import ProxyError, get_subtitles
38
39
40# ruff: noqa: RET502,RET503
async def ai_summary(client: Client, message: Message, summary_model_id: str = AI.AI_SUMMARY_MODEL_ALIAS, *, mermaid: bool = False, **kwargs) -> Any:
    """Handle the AI-summary command for a message or for the message it replies to.

    Flow:
        1. Do nothing unless the message content starts with ``PREFIX.AI_SUMMARY``.
        2. When the content is exactly the prefix, the replied-to message becomes
           the summary target; without a reply, the social-media help text is sent.
        3. Collect the full context chain of the target and refuse to continue
           when its total size exceeds 512 MiB.
        4. If the target content matches a platform that has a dedicated preview
           handler, delegate to that handler with all ``summary_*`` flags enabled.
        5. Otherwise build sources from the chain (plus a yt-dlp download for
           bilibili/youtube/ytdlp links, unless an already-summarized copy can be
           re-sent from R2) and call ``summarize``.

    Args:
        client: Pyrogram client used for every Telegram call.
        message: The incoming command message.
        summary_model_id: Model identifier forwarded to ``summarize``.
        mermaid: Forwarded to ``summarize`` unchanged.
        **kwargs: Extra options forwarded to the senders / preview handlers; the
            summary flags and the link-match result are merged into it.

    Returns:
        The result of ``send2tg`` or of the delegated preview handler on those
        paths; ``None`` on every other path.
    """
    if not startswith_prefix(message.content, prefix=PREFIX.AI_SUMMARY):
        return

    # Keep a handle on the command itself: reactions and warnings go to it even
    # after `message` is re-pointed at the replied-to message.
    command_msg = message
    if equal_prefix(command_msg.content, PREFIX.AI_SUMMARY):
        replied = command_msg.reply_to_message
        if not replied:
            return await send2tg(client, command_msg, texts=social_media_help(command_msg), **kwargs)
        message = replied

    chains = await full_chain_contexts(client, message, order="asc")  # old to new
    size_limit = 512 * 1024 * 1024
    total_bytes = sum(map(message_bytes, chains))
    if total_bytes > size_limit:
        logger.warning(f"file_bytes: {total_bytes} > 512MB, skip")
        await command_msg.reply_text("❌上下文大小超过512MB,不支持总结")
        await asyncio.sleep(5)
        # NOTE(review): `message` may be the replied-to message at this point, not
        # the warning that was just sent — confirm this is the intended target.
        await delete_message(message)
        return

    await set_reaction(client, command_msg, "👌")
    matched = await match_social_media_link(str(message.content))

    # Enable summarization in every preview handler; values from the link match
    # take precedence over these defaults.
    summary_flags = dict.fromkeys(
        (
            "summary_twitter",
            "summary_douyin",
            "summary_xhs",
            "summary_weibo",
            "summary_wechat",
            "summary_instagram",
            "summary_v2ex",
            "summary_ytdlp",
        ),
        True,
    )
    summary_flags["enable_corrector"] = False
    kwargs |= summary_flags | matched

    platform = matched["platform"]
    preview_handlers = {
        "arxiv": preview_arxiv,
        "douyin": preview_douyin,
        "tiktok": preview_douyin,
        "instagram": preview_instagram,
        "x": preview_twitter,
        "twitter": preview_twitter,
        "fxtwitter": preview_twitter,
        "fixupx": preview_twitter,
        "weibo": preview_weibo,
        "xiaohongshu": preview_xhs,
        "wechat": preview_wechat,
        "v2ex": preview_v2ex,
    }
    if handler := preview_handlers.get(platform):
        return await handler(client, message, **kwargs)
    if platform.startswith("bilibili-"):  # this is not bilibili video, for videos, use yt-dlp
        return await preview_bilibili(client, message, **kwargs)

    sources = await get_sources(client, chains)
    info = {}
    if platform in ["bilibili", "youtube", "ytdlp"]:
        db_key = matched["db_key"]
        r2 = await get_cf_r2(db_key)
        cached_text = "".join(glom(r2, "data.*.text", default=[]))
        # Re-send the stored copy when it already contains an AI summary.
        if "🤖AI导读" in cached_text and await copy_messages_from_db(client, message, key=db_key, kv=r2, **kwargs):
            await set_reaction(client, command_msg, "🎉")
            return
        info = await download_ytdlp(**kwargs)
        if info:
            sources.extend(info.get("sources", []))

    logger.debug(f"Summary sources: {sources}")
    summary = await summarize(
        sources=sources,
        model=summary_model_id,
        title=info.get("title") or "AI导读",
        author=info.get("author") or "Anonymous",
        url=matched.get("url"),
        date=info.get("created_at") or nowstr(TZ),
        description=info.get("description"),
        force_r2_page=bool(kwargs.get("force_r2_page")),
        mermaid=mermaid,
    )

    texts = summary.get("texts")
    if not texts:
        await set_reaction(client, command_msg, "💔")
        return
    await send_blockquote_texts(client, message, texts=texts, **kwargs)
    await set_reaction(client, command_msg, "🎉")
    return
117
118
async def get_sources(client: Client, chains: list[Message]) -> list[dict]:
    """Convert a chain of messages into the source dicts consumed by ``summarize``.

    For every message:
        * attached media (audio / photo / video / document) is downloaded and
          added as an ``image``, ``video``, ``audio``, ``text`` or ``file``
          source, chosen from the media kind, MIME type and file-name suffix;
        * the message text (first available of ``content.html``, ``content``,
          ``text``, ``caption``) is added as a ``text`` source, and a ``youtube``
          source is added when that text matches a YouTube link.

    When ``is_multi_user_chat(chains)`` is true, text sources are JSON objects
    that also carry the sender's full name.

    Args:
        client: Pyrogram client used to download media.
        chains: Messages to convert, in the order they should appear.

    Returns:
        The list of source dicts, in message order.
    """
    sources: list[dict] = []
    add_sender = is_multi_user_chat(chains)
    for msg in chains:
        info = parse_msg(msg, silent=True, use_cache=False)
        meta: dict = {"message_sender": info["full_name"]} if add_sender else {}

        if msg.audio or msg.photo or msg.video or msg.document:
            fpath = await client.download_media(msg)
            # download_media yields None when the download fails or is stopped;
            # guard before building a Path so that case is skipped, not a TypeError.
            # As before, a message whose media is unavailable is skipped entirely.
            if not isinstance(fpath, str) or not Path(fpath).is_file():
                continue
            if msg.photo:
                sources.append({"type": "image", "path": fpath})
            elif msg.video:
                sources.append({"type": "video", "path": fpath, "mime_type": msg.video.mime_type})
            elif msg.audio:
                sources.append({"type": "audio", "path": fpath, "mime_type": msg.audio.mime_type})
            elif msg.document:
                mime = glom(msg, "document.mime_type", default="") or ""
                fname = glom(msg, "document.file_name", default="") or ""
                suffix = Path(fname).suffix
                # MIME type wins; the file-name suffix is the fallback classifier.
                if mime.startswith("image/"):
                    sources.append({"type": "image", "path": fpath, "mime_type": mime})
                elif mime.startswith("audio/") or suffix in AUDIO_FORMAT:
                    sources.append({"type": "audio", "path": fpath, "mime_type": mime})
                elif mime.startswith("video/") or suffix in VIDEO_FORMAT:
                    sources.append({"type": "video", "path": fpath, "mime_type": mime})
                elif mime.startswith("text/") or suffix in TXT_EXT:
                    txt = {"file_name": fname, "file_content": read_text(fpath)}
                    sources.append({"type": "text", "text": json.dumps(meta | txt, ensure_ascii=False)})
                elif suffix in MARKDOWN_EXT:
                    txt = {"file_name": fname, "file_content": convert2md(path=fpath)}
                    sources.append({"type": "text", "text": json.dumps(meta | txt, ensure_ascii=False)})
                else:
                    sources.append({"type": "file", "path": fpath, "mime_type": mime})

        if txt := glom(msg, Coalesce("content.html", "content", "text", "caption"), default=""):
            # Plain text in single-user chats; JSON with the sender name otherwise.
            texts = json.dumps(meta | {"message": txt}, ensure_ascii=False) if add_sender else txt
            sources.append({"type": "text", "text": texts})
            matched = await match_social_media_link(txt)
            if matched["platform"] == "youtube":
                sources.append({"type": "youtube", "url": matched["url"]})
    return sources
160
161
async def download_ytdlp(url: str, **kwargs) -> dict:
    """Download a video with yt-dlp and return its summary info.

    The download is always requested with video enabled and progress hidden.
    On ``ProxyError`` the download is retried once through
    ``PROXY.YTDLP_FALLBACK`` (when configured).

    Args:
        url: Link to download.
        **kwargs: Options forwarded to ``ytdlp_download``; ``platform`` must be
            present because it is used for logging and passed to ``ytdlp_info``.

    Returns:
        The dict built by ``ytdlp_info`` when a video file was downloaded,
        otherwise an empty dict.
    """
    kwargs |= {"ytdlp_download_video": True, "show_progress": False}
    try:
        resp = await ytdlp_download(url, **kwargs)
        if resp["video_path"].is_file():
            return await ytdlp_info(resp, url, kwargs["platform"])
    except ProxyError:
        logger.error(f"🚫{kwargs['platform']}代理错误")
        fallback = PROXY.YTDLP_FALLBACK
        # Only fall back if this attempt was not already using the fallback proxy;
        # otherwise a broken fallback would make this function recurse forever.
        if fallback and kwargs.get("proxy") != fallback:
            logger.warning(f"🔄使用备用代理{fallback}")
            return await download_ytdlp(url, **(kwargs | {"proxy": fallback}))
    return {}
175
176
async def ytdlp_info(info: dict, url: str, platform: str) -> dict:
    """Turn a yt-dlp download result into metadata plus summarizer sources.

    Args:
        info: Download result. ``video_path`` and ``audio_path`` are required and
            must expose ``is_file()`` / ``as_posix()`` (presumably ``pathlib.Path``
            objects). Optional keys read here: ``author``, ``title``, ``pubdate``,
            ``timestamp``, ``upload_date`` and ``description``.
        url: Original link; stored in the result and passed to ``get_subtitles``.
        platform: Platform name; its title-cased form is stored and also used as
            the fallback title.

    Returns:
        A dict with ``platform``, ``author``, ``title``, ``url``, ``created_at``,
        optionally ``description``, and ``sources``: the video (or, if there is
        no video file, the audio), the transcripts when available, and a JSON
        text source holding the metadata.
    """
    data: dict[str, Any] = {
        "platform": platform.title(),
        "author": info.get("author") or "Anonymous",
        "title": info.get("title") or platform.title(),
        "url": url,
    }
    sources: list[dict] = []
    video = info["video_path"]
    audio = info["audio_path"]
    # Prefer the audio track for speech recognition when it exists.
    asr_path = audio if audio.is_file() else video
    if video.is_file():
        sources.append({"type": "video", "path": video.as_posix()})
    elif audio.is_file():
        sources.append({"type": "audio", "path": audio.as_posix()})

    if subtitles := await get_subtitles(asr_path, url, asr_engine=ASR.DEFAULT_ENGINE, vinfo=info):
        sources.append({"type": "transcripts", "text": subtitles})

    # date: pubdate > timestamp > upload_date > now
    if info.get("pubdate"):
        data["created_at"] = info["pubdate"].removeprefix("🕒")
    elif dt := ts_to_dt(info.get("timestamp")):
        data["created_at"] = f"{dt:%Y-%m-%d %H:%M:%S}"
    elif info.get("upload_date"):
        # Read the same key that was just checked ("update_date" raised KeyError).
        data["created_at"] = info["upload_date"]
    else:
        data["created_at"] = nowstr(TZ)

    # desc: "-" is treated the same as a missing description
    if (desc := info.get("description")) and (desc != "-"):
        # Descriptions are often plain text or URLs, which makes BeautifulSoup warn.
        warnings.simplefilter("ignore", MarkupResemblesLocatorWarning)
        soup = BeautifulSoup(desc, "html.parser")
        desc_text = soup_to_text(soup)
        data["description"] = make_bvid_clickable(desc_text)

    # Serialize the metadata before "sources" is attached so it is not nested in itself.
    sources.append({"type": "text", "text": json.dumps(data, ensure_ascii=False)})
    data["sources"] = sources
    return data