main
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3import warnings
4from pathlib import Path
5from typing import Literal
6
7from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
8from glom import Coalesce, glom
9from loguru import logger
10from pyrogram.client import Client
11from pyrogram.types import Message
12
13from config import AI, ASR, CAPTION_LENGTH, MAX_FILE_BYTES, YTDLP_RE_ENCODING_MAX_FILE_BYTES
14from database.r2 import get_cf_r2
15from messages.database import copy_messages_from_db, save_messages
16from messages.preprocess import preprocess_media
17from messages.progress import modify_progress, telegram_uploading
18from messages.sender import send2tg
19from messages.utils import better_blockquote, count_without_entities, get_reply_to, smart_split
20from multimedia import convert_to_h264
21from preview.bilibili import get_bilibili_comments, get_bilibili_vinfo, make_bvid_clickable
22from preview.youtube import get_youtube_comments, get_youtube_vinfo
23from publish import publish_telegraph
24from summarize.summarize import summarize
25from utils import convert2html, readable_size, soup_to_text, to_int, true, ts_to_dt, unicode_to_ascii
26from ytdlp.download import ytdlp_download
27from ytdlp.utils import append_tag, cleanup_ytdlp, generate_prompt, get_subtitles, platform_emoji
28
29
async def preview_ytdlp(
    client: Client,
    message: Message,
    url: str = "",
    *,
    platform: Literal["youtube", "bilibili", "ytdlp"] = "ytdlp",
    vid: str = "",
    bvid: str = "",
    use_db: bool = True,
    ytdlp_download_video: bool = True,
    use_aria2: bool = False,
    ytdlp_send_video: bool = True,
    ytdlp_send_audio: bool = True,
    bilibili_comments: bool = True,
    youtube_comments: bool = True,
    proxy: str | None = None,
    ytdlp_video_target: str | int | None = None,
    ytdlp_audio_target: str | int | None = None,
    ytdlp_send_subtitle: bool = False,
    summary_ytdlp: bool = False,
    summary_ytdlp_model: str = AI.SUBTITLE_SUMMARY_MODEL_ALIAS,
    enable_corrector: bool = False,
    show_author: bool = True,
    show_title: bool = True,
    show_pubdate: bool = True,
    show_statistics: bool = True,
    show_description: bool = True,
    **kwargs,
) -> list[Message]:
    """Preview ytdlp link in the message.

    The flow is: try the cache, fetch platform video info, download with
    yt-dlp, build the caption, upload the media, optionally attach a
    subtitle / AI-summary link, then cache the sent messages and clean up.

    Args:
        client (Client): The Pyrogram client.
        message (Message): The trigger message object.
        url (str, optional): ytdlp link. Also used as the cache key.
        platform (str, optional): The platform of the video.
        vid (str, optional): The YouTube video id.
        bvid (str, optional): The Bilibili video id.
        use_db (bool, optional): Whether to use database to cache the result. Defaults to True.
        ytdlp_download_video (bool, optional): Download video. Defaults to True.
        use_aria2 (bool, optional): Whether to use aria2 to download the video. Defaults to False.
        ytdlp_send_video (bool, optional): Send video. Defaults to True.
        ytdlp_send_audio (bool, optional): Send audio. Defaults to True.
        bilibili_comments (bool, optional): Enable bilibili comments
        youtube_comments (bool, optional): Enable youtube comments
        proxy (str, optional): Proxy to use. Defaults to None.
        ytdlp_video_target (str | int, optional): The target chat id to send video.
        ytdlp_audio_target (str | int, optional): The target chat id to send audio.
        ytdlp_send_subtitle (bool, optional): Send subtitle. Defaults to False.
        summary_ytdlp (bool, optional): Send AI summary. Defaults to False.
        summary_ytdlp_model (str, optional): The model id to use for AI summary.
        enable_corrector (bool, optional): Forwarded to ``get_subtitles``. Defaults to False.
        show_author (bool, optional): Include the author line in the caption.
        show_title (bool, optional): Include the title line in the caption.
        show_pubdate (bool, optional): Include the publish-date line in the caption.
        show_statistics (bool, optional): Include the statistics line in the caption.
        show_description (bool, optional): Include the description in the caption.
        **kwargs: Extra options. This function reads ``copy_video_msg``,
            ``copy_audio_msg``, ``show_progress``, ``progress``,
            ``send_from_user`` and ``asr_engine``; everything is also
            forwarded to the helper calls below.

    Returns:
        list[Message]: Messages copied from the cache, or the flattened list of
        newly sent video/audio messages. An empty list when the video is
        reported as not downloadable or no media file was produced.
    """
    logger.trace(f"{url=} {kwargs=}")
    # try cache
    db_key = url
    if true(use_db) and (kv := await get_cf_r2(db_key)):
        logger.debug(f"YT-DLP preview cache hit for key={db_key}")
        kwargs |= {"copy_video_msg": kwargs.get("copy_video_msg", ytdlp_send_video), "copy_audio_msg": kwargs.get("copy_audio_msg", ytdlp_send_audio)}
        if db_msgs := await copy_messages_from_db(client, message, key=db_key, kv=kv, **kwargs):
            return db_msgs
        logger.warning("❌从缓存中转发失败, 尝试重新解析...")

    if kwargs.get("show_progress") and not kwargs.get("progress"):
        res = await send2tg(client, message, texts=f"🔗正在解析链接\n{url}", **kwargs)
        # Only register a progress message when one was actually sent.
        if res:
            kwargs["progress"] = res[0]

    # get video info from API first
    if platform == "youtube":
        vinfo = await get_youtube_vinfo(vid)
    elif platform == "bilibili":
        vinfo = await get_bilibili_vinfo(bvid)
    else:
        vinfo = {}
    if platform in ["youtube", "bilibili"] and not vinfo.get("downloadable"):
        await modify_progress(text=vinfo.get("error_msg") or "❌视频无法下载", force_update=True, **kwargs)
        return []

    info = await ytdlp_download(
        url,
        proxy=proxy,
        platform=platform,
        ytdlp_download_video=ytdlp_download_video,
        use_aria2=use_aria2,
        **kwargs,
    )
    if not (info["video_path"].is_file() or info["audio_path"].is_file()):
        return []
    info |= vinfo  # merge video info (API values override yt-dlp values)
    captions = await generate_captions(
        info,
        url=url,
        platform=platform,
        vid=vid,
        bvid=bvid,
        bilibili_comments=bilibili_comments,
        youtube_comments=youtube_comments,
        show_author=show_author,
        show_title=show_title,
        show_pubdate=show_pubdate,
        show_statistics=show_statistics,
        show_description=show_description,
    )
    # add send_from_user prefix to caption
    prefix = kwargs.get("send_from_user", "")
    texts = f"{prefix}{captions['caption']}"
    info["caption"] = texts
    sent_messages = await send_media(client, message, info, ytdlp_video_target, ytdlp_audio_target, ytdlp_send_video=ytdlp_send_video, ytdlp_send_audio=ytdlp_send_audio, **kwargs)

    # get subtitles (needed both for the subtitle page and for the AI summary)
    subtitles = ""
    if true(ytdlp_send_subtitle) or true(summary_ytdlp):
        # prefer the audio file as input when it exists
        fpath = info["audio_path"] if info["audio_path"].is_file() else info["video_path"]
        asr_engine = kwargs.get("asr_engine", "uncensored") if platform == "youtube" else ASR.DEFAULT_ENGINE
        subtitles = await get_subtitles(fpath, url, asr_engine, info, enable_corrector=enable_corrector)

    # get ai summary
    telegraph_ai = ""
    if subtitles and true(summary_ytdlp):
        # `description` may be present but None; treat that as empty text
        desc = info.get("description") or ""
        desc_html = desc if desc.startswith("<") else convert2html(desc)
        if platform == "bilibili":
            desc_html = f'<iframe src="https://player.bilibili.com/player.html?isOutside=true&bvid={bvid}&p=1&autoplay=0&poster=1&danmaku=1" frameborder="0" scrolling="no" border="0" framespacing="0" allowfullscreen="true" style="width: 100%; aspect-ratio: 16/9;"></iframe>{desc_html}'
        elif platform == "youtube":
            desc_html = f'<iframe src="https://www.youtube.com/embed/{vid}" frameborder="0" scrolling="no" border="0" framespacing="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen="true" style="width: 100%; aspect-ratio: 16/9;"></iframe>{desc_html}'
        desc_page = {"emoji": "🎬", "name": "视频详情", "html": desc_html}
        summary = await summarize(
            sources=[{"type": "system_prompt", "text": generate_prompt(info)}, {"type": "transcripts", "text": subtitles}],
            model=summary_ytdlp_model,
            title=info.get("title"),
            description=desc_page,
            author=info.get("author"),
            url=url,
            date=glom(info, Coalesce("pubdate", "upload_date"), default=""),
            min_text_length=200,
        )
        telegraph_ai = summary.get("telegraph_url", "")

    if telegraph_ai:  # ai summary with subtitles
        sent_messages = await append_tag(f'<a href="{telegraph_ai}">🤖AI导读</a>', sent_messages)
    elif subtitles:  # subtitles only
        html = "\n".join([f"<p>{s}</p>" for s in subtitles.split("\n")]).replace("<p></p>", "")
        if telegraph_url := await publish_telegraph(title=info["title"], html=html, author=info["author"], url=url):
            sent_messages = await append_tag(f'<a href="{telegraph_url}">🔤字幕</a>', sent_messages)

    # save messages when video is uploaded
    # `sent_messages` maps "video" -> list[Message] and "audio" -> Message; flatten both shapes.
    messages = [msg for msgs in sent_messages.values() for msg in (msgs if isinstance(msgs, list) else [msgs])]
    # Use the same `true()` coercion as the cache lookup so both ends of the cache agree.
    if true(use_db) and info["video_path"].is_file():
        metadata = {}
        # NOTE(review): of the keys below only `url` is a local variable in this
        # function, so `locals().get(k)` yields a value for `url` alone and the
        # stored metadata effectively contains just the URL. The other fields
        # presumably were meant to come from `info` — confirm what
        # `save_messages` accepts (types/size) before switching the lookup.
        for k in ["author", "author_url", "title", "url", "create_time", "duration", "description"]:
            if v := locals().get(k):
                metadata[k] = unicode_to_ascii(v)
        await save_messages(messages=messages, key=url, metadata=metadata)

    Path(info["json_path"]).unlink(missing_ok=True)
    cleanup_ytdlp(info["id"])
    return messages
187
188
async def generate_captions(
    info: dict,
    url: str,
    platform: str,
    vid: str,
    bvid: str,
    *,
    bilibili_comments: bool,
    youtube_comments: bool,
    show_author: bool = True,
    show_title: bool = True,
    show_pubdate: bool = True,
    show_statistics: bool = True,
    show_description: bool = True,
) -> dict:
    """Generate captions.

    Every caption fragment is always computed and returned; the ``show_*``
    flags only decide whether a fragment is appended to the combined caption.

    Args:
        info (dict): Video info. Reads ``extractor`` (required) and the optional
            keys ``author``, ``author_url``, ``pubdate``, ``timestamp``,
            ``upload_date``, ``statistics``, ``title`` and ``description``.
        url (str): Link used for the title and as author-link fallback.
        platform (str): Comments are fetched only for "bilibili" / "youtube".
        vid (str): The YouTube video id (used to fetch YouTube comments).
        bvid (str): The Bilibili video id (used to fetch Bilibili comments).
        bilibili_comments (bool): Enable bilibili comments.
        youtube_comments (bool): Enable youtube comments.
        show_author (bool, optional): Append the author line to the caption.
        show_title (bool, optional): Append the title line to the caption.
        show_pubdate (bool, optional): Append the date line to the caption.
        show_statistics (bool, optional): Append the statistics line to the caption.
        show_description (bool, optional): Append the description to the caption.

    Returns:
        dict: Keys ``emoji``, ``author``, ``create_time``, ``statistics``,
        ``title``, ``description``, ``comments``, ``caption_without_comments``
        and ``caption``.
    """
    captions = ""
    results = {}
    emoji = platform_emoji(info["extractor"])
    results["emoji"] = emoji

    # author: link to the author page when known, otherwise to the video itself
    if info.get("author") and info.get("author_url"):
        results["author"] = f"{emoji}[{info['author']}]({info['author_url']})"
    elif info.get("author"):
        results["author"] = f"{emoji}[{info['author']}]({url})"
    else:
        results["author"] = f"{emoji}[原始链接]({url})"
    if true(show_author):
        captions += f"{results['author']}\n"

    # date: pubdate, then timestamp, then upload_date
    if info.get("pubdate"):
        results["create_time"] = "🕒" + info["pubdate"]
    elif dt := ts_to_dt(info.get("timestamp")):
        results["create_time"] = f"🕒{dt:%Y-%m-%d %H:%M:%S}"
    elif info.get("upload_date"):
        # read the same key that was just tested (was the typo "update_date")
        results["create_time"] = "🕒" + info["upload_date"]
    else:
        results["create_time"] = ""
    if true(show_pubdate) and results["create_time"]:
        captions += f"{results['create_time']}\n"

    results["statistics"] = info.get("statistics", "")
    if true(show_statistics) and results["statistics"]:
        captions += f"{results['statistics']}\n"

    # title
    if info.get("title"):
        results["title"] = f"📝[{info['title']}]({url})"
    else:
        results["title"] = ""
    if true(show_title) and results["title"]:
        captions += f"{results['title']}\n"

    # desc ("-" is treated as an empty description)
    if (desc := info.get("description")) and (desc != "-"):
        # plain-text descriptions make BeautifulSoup emit MarkupResemblesLocatorWarning
        warnings.simplefilter("ignore", MarkupResemblesLocatorWarning)
        soup = BeautifulSoup(desc, "html.parser")
        desc_text = soup_to_text(soup)
        results["description"] = make_bvid_clickable(desc_text)
    else:
        results["description"] = ""
    if true(show_description) and results["description"]:
        captions += f"{results['description']}\n"

    # comments
    comment_list = []
    comments = ""
    if true(bilibili_comments) and platform == "bilibili":
        comment_list = await get_bilibili_comments(bvid)
    elif true(youtube_comments) and platform == "youtube":
        comment_list = await get_youtube_comments(vid)
    for comment in comment_list:
        # Count the comments accepted so far as well, so the combined caption
        # stays within the budget instead of each comment fitting only alone.
        if await count_without_entities(f"{captions}{comments}{comment}") < CAPTION_LENGTH - 15:  # leave some margin for other info
            comments += comment
    comments = comments.strip()
    results["comments"] = comments
    results["caption_without_comments"] = captions.strip()
    results["caption"] = f"{captions}{comments}".strip()
    return results
270
271
def get_target_chats(message: Message, video_target: str | int | None = None, audio_target: str | int | None = None, **kwargs) -> tuple[int | str, int | str]:
    """Resolve the destination chats for the video and the audio message.

    A target left as ``None`` falls back to ``kwargs["target_chat"]`` when that
    is truthy, otherwise to the chat of the triggering message. Both results
    are passed through ``to_int``.

    Returns:
        (video_target_chat, audio_target_chat)
    """
    fallback_chat = kwargs.get("target_chat") or message.chat.id
    resolved_video = fallback_chat if video_target is None else video_target
    resolved_audio = fallback_chat if audio_target is None else audio_target
    return to_int(resolved_video), to_int(resolved_audio)
284
285
async def send_media(
    client: Client,
    message: Message,
    info: dict,
    ytdlp_video_target: str | int | None = None,
    ytdlp_audio_target: str | int | None = None,
    *,
    ytdlp_send_video: bool = True,
    ytdlp_send_audio: bool = False,
    **kwargs,
) -> dict:
    """Send media to target chats.

    Args:
        client (Client): The Pyrogram client.
        message (Message): The trigger message; used for the reply target and
            as fallback destination chat.
        info (dict): Download info. Reads ``video_path``, ``audio_path``,
            ``thumb`` and ``caption`` (required) plus the optional ``author``,
            ``title`` and ``duration``.
        ytdlp_video_target (str | int, optional): The target chat id to send video.
        ytdlp_audio_target (str | int, optional): The target chat id to send audio.
        ytdlp_send_video (bool, optional): Upload the video file if it exists.
        ytdlp_send_audio (bool, optional): Upload the audio file if it exists.
        **kwargs: Reads ``target_chat`` (via ``get_target_chats``),
            ``reply_msg_id``, ``progress`` and ``detail_progress``; also
            forwarded to ``modify_progress``.

    Returns:
        {
            "video": list[Message],
            "audio": Message,
        }
        "audio" is present only when an audio message was sent. "video" is
        present whenever every collected item is a Message, which includes the
        empty list when no video was uploaded.
    """
    video_path: Path = info["video_path"]
    audio_path: Path = info["audio_path"]
    thumb = info["thumb"]
    video_messages = []
    audio_message = None
    video_target, audio_target = get_target_chats(message, ytdlp_video_target, ytdlp_audio_target, **kwargs)
    reply_msg_id = kwargs.get("reply_msg_id", 0)
    reply_parameters = get_reply_to(message.id, reply_msg_id)

    # videos larger than MAX_FILE_BYTES are uploaded as multiple parts
    if true(ytdlp_send_video) and video_path.is_file():
        video_path = await convert_to_h264(video_path, allow_re_encoding=True, max_file_size=YTDLP_RE_ENCODING_MAX_FILE_BYTES, skip_h264=True)
        if video_path.stat().st_size > MAX_FILE_BYTES:
            await modify_progress(text=f"🎬视频大小超过Telegram限制({MAX_FILE_BYTES / 1024 / 1024:.0f}MB), 正在切分...", **kwargs)
        # presumably preprocess_media performs the actual split and returns one dict per part — confirm
        videos = await preprocess_media([{"video": video_path, "thumb": thumb}])
        for idx, video in enumerate(videos):
            video["thumb"] = thumb  # use the same thumb for all videos
            # tag the title with the part number only when there are several parts
            caption = info["caption"].replace("📝[", f"📝[P{idx + 1}-") if len(videos) > 1 else info["caption"]
            caption = (await smart_split(caption, CAPTION_LENGTH))[0]
            await modify_progress(text=f"🎬视频上传中-P{idx + 1}: {readable_size(path=video['video'])}", force_update=True, **kwargs)
            video_messages.append(
                await client.send_video(
                    chat_id=to_int(video_target),
                    caption=better_blockquote(caption),
                    reply_parameters=reply_parameters,
                    progress=telegram_uploading,
                    progress_args=(kwargs.get("progress", False), video["video"], true(kwargs.get("detail_progress"))),  # message, path, detail_progress
                    **video,
                )
            )
    # don't need to split audio
    if true(ytdlp_send_audio) and audio_path.is_file():
        await modify_progress(text=f"🎧音频上传中: {readable_size(path=audio_path)}", force_update=True, **kwargs)
        caption = (await smart_split(info["caption"], CAPTION_LENGTH))[0]
        audio_message = await client.send_audio(
            chat_id=to_int(audio_target),
            audio=audio_path.as_posix(),
            caption=better_blockquote(caption),
            # author/title are optional elsewhere in this file, so don't raise when absent
            performer=info.get("author"),
            title=info.get("title"),
            # `duration` may be missing or None; both fall back to 0
            duration=round(float(info.get("duration") or 0)),
            reply_parameters=reply_parameters,
            progress=telegram_uploading,
            progress_args=(kwargs.get("progress", False), audio_path, true(kwargs.get("detail_progress"))),  # message, path, detail_progress
            thumb=info["thumb"],
        )
    await modify_progress(del_status=True, **kwargs)
    sent_messages = {}
    # all() over an empty list is True, so "video" is also set (to []) when nothing was uploaded
    if all(isinstance(x, Message) for x in video_messages):
        sent_messages["video"] = video_messages
    if isinstance(audio_message, Message):
        sent_messages["audio"] = audio_message
    return sent_messages