main
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3import io
4import warnings
5from pathlib import Path
6from typing import Literal
7
8import markdown
9from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
10from loguru import logger
11from pyrogram.client import Client
12from pyrogram.types import Message
13from pyrogram.types.messages_and_media.message import Str
14
15from ai.main import ai_text_generation
16from config import AI, ASR, CAPTION_LENGTH, DB, MAX_FILE_BYTES, PREFIX, READING_SPEED, YTDLP_RE_ENCODING_MAX_FILE_BYTES
17from database.database import get_db
18from messages.database import copy_messages_from_db, save_messages
19from messages.preprocess import preprocess_media
20from messages.progress import modify_progress, telegram_uploading
21from messages.sender import send2tg
22from messages.utils import count_without_entities, get_reply_to, smart_split
23from multimedia import convert_to_h264
24from preview.bilibili import get_bilibili_comments, get_bilibili_vinfo, make_bvid_clickable
25from preview.youtube import get_youtube_comments, get_youtube_vinfo
26from publish import publish_telegraph
27from utils import count_subtitles, rand_number, readable_size, readable_time, soup_to_text, to_int, true, ts_to_dt, unicode_to_ascii
28from ytdlp.download import ytdlp_download
29from ytdlp.utils import append_subtitle, cleanup_ytdlp, generate_prompt, get_subtitles, platform_emoji
30
31
async def preview_ytdlp(
    client: Client,
    message: Message,
    url: str = "",
    *,
    platform: Literal["youtube", "bilibili", "ytdlp"] = "ytdlp",
    vid: str = "",
    bvid: str = "",
    use_db: bool = True,
    ytdlp_download_video: bool = True,
    ytdlp_send_video: bool = True,
    ytdlp_send_audio: bool = True,
    bilibili_comments: bool = True,
    youtube_comments: bool = True,
    proxy: str | None = None,
    ytdlp_video_target: str | int | None = None,
    ytdlp_audio_target: str | int | None = None,
    ytdlp_subtitle_target: str | int | None = None,
    ytdlp_send_subtitle: bool = False,
    ytdlp_send_summary: bool = False,
    summary_model_id: str = AI.SUBTITLE_SUMMARY_MODEL_ALIAS,
    to_telegraph: bool = True,
    show_author: bool = True,
    show_title: bool = True,
    show_pubdate: bool = True,
    show_statistics: bool = True,
    show_description: bool = True,
    **kwargs,
) -> list[Message]:
    """Preview ytdlp link in the message.

    Tries the database cache first, then downloads the media, sends video/audio,
    optionally attaches subtitles and an AI summary, and finally caches the sent
    messages.

    Args:
        client (Client): The Pyrogram client.
        message (Message): The trigger message object.
        url (str, optional): ytdlp link. Also used as the cache key.
        platform (str, optional): The platform of the video.
        vid (str, optional): The YouTube video id.
        bvid (str, optional): The Bilibili video id.
        use_db (bool, optional): Whether to use database to cache the result. Defaults to True.
        ytdlp_download_video (bool, optional): Download video. Defaults to True.
        ytdlp_send_video (bool, optional): Send video. Defaults to True.
        ytdlp_send_audio (bool, optional): Send audio. Defaults to True.
        bilibili_comments (bool, optional): Enable bilibili comments
        youtube_comments (bool, optional): Enable youtube comments
        proxy (str, optional): Proxy to use. Defaults to None.
        ytdlp_video_target (str | int, optional): The target chat id to send video.
        ytdlp_audio_target (str | int, optional): The target chat id to send audio.
        ytdlp_subtitle_target (str | int, optional): The target chat id for the subtitle document.
            Falls back to ``kwargs["target_chat"]`` and then to the chat of ``message``.
        ytdlp_send_subtitle (bool, optional): Send subtitle. Defaults to False.
        ytdlp_send_summary (bool, optional): Send AI summary. Defaults to False.
        summary_model_id (str, optional): The model id to use for AI summary.
        to_telegraph (bool, optional): Whether to publish the subtitle or transcription to telegraph.
        show_author, show_title, show_pubdate, show_statistics, show_description (bool, optional):
            Forwarded to ``generate_captions`` to toggle the matching caption section.
        **kwargs: Forwarded to the sending/progress/download helpers. Keys read here:
            ``show_progress``, ``progress``, ``copy_video_msg``, ``copy_audio_msg``,
            ``send_from_user``, ``asr_engine`` and ``target_chat``.

    Returns:
        list[Message]: The messages copied from the cache, or the newly sent messages.
        An empty list when the video is not downloadable or no media file was produced.
    """
    logger.trace(f"{url=} {kwargs=}")
    if kwargs.get("show_progress") and not kwargs.get("progress"):
        res = await send2tg(client, message, texts=f"🔗正在解析链接\n{url}", **kwargs)
        kwargs["progress"] = res[0]
    # try cache
    db_key = url
    if true(use_db) and (kv := await get_db(db_key)):
        logger.debug(f"YT-DLP preview {DB.ENGINE} cache hit for key={db_key}")
        kwargs |= {"copy_video_msg": kwargs.get("copy_video_msg", ytdlp_send_video), "copy_audio_msg": kwargs.get("copy_audio_msg", ytdlp_send_audio)}
        if db_msgs := await copy_messages_from_db(client, message, key=db_key, kv=kv, **kwargs):
            return db_msgs
        # cache hit but copying failed: fall through and process the link again
        await modify_progress(text=f"❌从{DB.ENGINE}缓存中转发失败, 尝试重新解析...", **kwargs)

    # get video info from API first
    if platform == "youtube":
        vinfo = await get_youtube_vinfo(vid)
    elif platform == "bilibili":
        vinfo = await get_bilibili_vinfo(bvid)
    else:
        vinfo = {}
    if platform in ["youtube", "bilibili"] and not vinfo.get("downloadable"):
        await modify_progress(text=vinfo.get("error_msg") or "❌视频无法下载", force_update=True, **kwargs)
        return []

    info = await ytdlp_download(url, proxy=proxy, platform=platform, ytdlp_download_video=ytdlp_download_video, **kwargs)
    if not (info["video_path"].is_file() or info["audio_path"].is_file()):
        return []
    info |= vinfo  # merge video info
    captions = await generate_captions(
        info,
        url=url,
        platform=platform,
        vid=vid,
        bvid=bvid,
        bilibili_comments=bilibili_comments,
        youtube_comments=youtube_comments,
        show_author=show_author,
        show_title=show_title,
        show_pubdate=show_pubdate,
        show_statistics=show_statistics,
        show_description=show_description,
    )
    # add send_from_user prefix to caption
    prefix = kwargs.get("send_from_user", "")
    texts = f"{prefix}{captions['caption']}"
    info["caption"] = texts
    sent_messages = await send_media(client, message, info, ytdlp_video_target, ytdlp_audio_target, ytdlp_send_video=ytdlp_send_video, ytdlp_send_audio=ytdlp_send_audio, **kwargs)

    # get subtitles (also needed as the input of the AI summary)
    subtitles = ""
    if true(ytdlp_send_subtitle) or true(ytdlp_send_summary):
        # prefer the audio file when it exists, otherwise use the video file
        fpath = info["audio_path"] if info["audio_path"].is_file() else info["video_path"]
        asr_engine = kwargs.get("asr_engine", "uncensored") if platform == "youtube" else ASR.DEFAULT_ENGINE
        if sub := await get_subtitles(fpath, url, asr_engine, info):
            subtitles = f"🔤<b>字幕:</b>\n{sub}"

    # get ai summary
    summary = ""
    if subtitles and true(ytdlp_send_summary):
        prompt = generate_prompt(info, target="summary")
        ai_msg = Message(  # Construct a message for AI
            id=rand_number(),
            chat=message.chat,
            text=Str(f"{PREFIX.AI_TEXT_GENERATION} @{summary_model_id} {prompt}"),
            reply_to_message=Message(id=rand_number(), chat=message.chat, text=Str(subtitles)),
        )
        aires = await ai_text_generation(client, ai_msg, silent=True)
        if aires.get("texts"):
            summary = f"🤖<b>{aires['model_name']}总结:</b>\n{markdown.markdown(aires['texts'])}\n"

    if summary_with_subtitle := f"{summary}{subtitles}":
        telegraph_name = "🤖总结 & 🔤字幕" if summary and subtitles else "🔤字幕" if subtitles else "🤖AI总结"
        caption = f"{captions['caption_without_comments']}\n"
        caption += f"#️⃣字符数: {count_subtitles(summary_with_subtitle)}\n"
        caption += f"⏳阅读时长: {readable_time(60 * count_subtitles(summary_with_subtitle) / READING_SPEED)}"
        # one <p> per line; empty lines are dropped
        html = "\n".join([f"<p>{s}</p>" for s in summary_with_subtitle.split("\n")]).replace("<p></p>", "")
        if true(to_telegraph) and (telegraph_url := await publish_telegraph(title=info["title"], html=html, author=info["author"], url=url)):
            caption += f"\n⚡️[即时预览]({telegraph_url})"
            sent_messages = await append_subtitle(f'<a href="{telegraph_url}">{telegraph_name}</a>', sent_messages)
        else:
            # telegraph disabled or publishing failed: send the subtitles as a text document
            subtitle_target = ytdlp_subtitle_target or kwargs.get("target_chat") or message.chat.id
            with io.BytesIO(subtitles.encode("utf-8")) as f:
                subtitle_msg = await client.send_document(to_int(subtitle_target), f, file_name=f"{info['title']}.txt", caption=caption)
            if isinstance(subtitle_msg, Message):
                sent_messages["caption"] = subtitle_msg

    # save messages when video is uploaded
    messages = [msg for msgs in sent_messages.values() for msg in (msgs if isinstance(msgs, list) else [msgs])]
    # use the same `true()` interpretation of `use_db` as the cache lookup above
    if true(use_db) and info["video_path"].is_file():
        metadata = {}
        for k in ["author", "author_url", "title", "url", "create_time", "duration", "description"]:
            # These fields are not local variables of this function (only `url` is),
            # so read them from the merged info dict instead of `locals()`.
            v = url if k == "url" else info.get(k)
            if v:
                # NOTE(review): assumes `unicode_to_ascii` expects text; non-string values are stored as-is
                metadata[k] = unicode_to_ascii(v) if isinstance(v, str) else v
        await save_messages(messages=messages, key=url, metadata=metadata)

    Path(info["json_path"]).unlink(missing_ok=True)
    cleanup_ytdlp(info["id"])
    return messages
182
183
async def generate_captions(
    info: dict,
    url: str,
    platform: str,
    vid: str,
    bvid: str,
    *,
    bilibili_comments: bool,
    youtube_comments: bool,
    show_author: bool = True,
    show_title: bool = True,
    show_pubdate: bool = True,
    show_statistics: bool = True,
    show_description: bool = True,
) -> dict:
    """Generate captions.

    Every section is always computed and stored in the result; the ``show_*``
    flags only decide whether the section is appended to the combined caption.

    Args:
        info (dict): The video info dict. ``info["extractor"]`` is required.
        url (str): The original link, used as the link target for title/author.
        platform (str): ``"bilibili"`` or ``"youtube"`` enables comment fetching.
        vid (str): The YouTube video id (used for YouTube comments).
        bvid (str): The Bilibili video id (used for Bilibili comments).
        bilibili_comments (bool): Fetch Bilibili comments.
        youtube_comments (bool): Fetch YouTube comments.
        show_author, show_title, show_pubdate, show_statistics, show_description (bool):
            Include the matching section in the combined caption.

    Returns:
        dict: Keys ``emoji``, ``author``, ``create_time``, ``statistics``, ``title``,
        ``description``, ``comments``, ``caption_without_comments`` and ``caption``.
    """
    captions = ""
    results = {}
    emoji = platform_emoji(info["extractor"])
    results["emoji"] = emoji

    # author (use .get so a missing author_url falls back to the video link instead of raising)
    if info.get("author") and info.get("author_url"):
        results["author"] = f"{emoji}[{info['author']}]({info['author_url']})"
    elif info.get("author"):
        results["author"] = f"{emoji}[{info['author']}]({url})"
    else:
        results["author"] = f"{emoji}[原始链接]({url})"
    if true(show_author):
        captions += f"{results['author']}\n"

    # date
    if info.get("pubdate"):
        results["create_time"] = "🕒" + info["pubdate"]
    elif dt := ts_to_dt(info.get("timestamp")):
        results["create_time"] = f"🕒{dt:%Y-%m-%d %H:%M:%S}"
    elif info.get("upload_date"):
        # read the same key that was just checked
        results["create_time"] = "🕒" + info["upload_date"]
    else:
        results["create_time"] = ""
    if true(show_pubdate) and results["create_time"]:
        captions += f"{results['create_time']}\n"

    results["statistics"] = info.get("statistics", "")
    if true(show_statistics) and results["statistics"]:
        captions += f"{results['statistics']}\n"

    # title
    if info.get("title"):
        results["title"] = f"📝[{info['title']}]({url})"
    else:
        results["title"] = ""
    if true(show_title) and results["title"]:
        captions += f"{results['title']}\n"

    # desc ("-" is treated as an empty description)
    if (desc := info.get("description")) and (desc != "-"):
        warnings.simplefilter("ignore", MarkupResemblesLocatorWarning)
        soup = BeautifulSoup(desc, "html.parser")
        desc_text = soup_to_text(soup)
        results["description"] = make_bvid_clickable(desc_text)
    else:
        results["description"] = ""
    if true(show_description) and results["description"]:
        captions += f"{results['description']}\n"

    # comments
    comment_list = []
    comments = ""
    if true(bilibili_comments) and platform == "bilibili":
        comment_list = await get_bilibili_comments(bvid)
    elif true(youtube_comments) and platform == "youtube":
        comment_list = await get_youtube_comments(vid)
    for comment in comment_list:
        # Count the comments accepted so far as well, so the combined caption stays
        # within the limit; a comment that does not fit is skipped.
        if await count_without_entities(f"{captions}{comments}{comment}") < CAPTION_LENGTH - 15:  # leave some margin for other info
            comments += comment
    comments = comments.strip()
    results["comments"] = comments
    results["caption_without_comments"] = captions.strip()
    results["caption"] = f"{captions}{comments}".strip()
    return results
265
266
def get_target_chats(message: Message, video_target: str | int | None = None, audio_target: str | int | None = None, **kwargs) -> tuple[int | str, int | str]:
    """Resolve the chats that video and audio messages are sent to.

    A target that is ``None`` falls back to ``kwargs["target_chat"]`` when that is
    truthy, otherwise to the chat of ``message``. Both results go through ``to_int``.

    Returns:
        (video_target_chat, audio_target_chat)
    """
    fallback_chat = kwargs.get("target_chat") or message.chat.id
    video_chat = fallback_chat if video_target is None else video_target
    audio_chat = fallback_chat if audio_target is None else audio_target
    return to_int(video_chat), to_int(audio_chat)
279
280
async def send_media(
    client: Client,
    message: Message,
    info: dict,
    ytdlp_video_target: str | int | None = None,
    ytdlp_audio_target: str | int | None = None,
    *,
    ytdlp_send_video: bool = True,
    ytdlp_send_audio: bool = False,
    **kwargs,
) -> dict:
    """Send media to target chats.

    Args:
        client (Client): The Pyrogram client.
        message (Message): The trigger message; used for the reply target and the default chat.
        info (dict): Media info. Reads ``video_path``, ``audio_path``, ``thumb``, ``caption``,
            ``author``, ``title`` and (optionally) ``duration``.
        ytdlp_video_target (str | int, optional): The target chat id to send video.
        ytdlp_audio_target (str | int, optional): The target chat id to send audio.
        ytdlp_send_video (bool, optional): Send the video when the file exists. Defaults to True.
        ytdlp_send_audio (bool, optional): Send the audio when the file exists. Defaults to False.
        **kwargs: Forwarded to ``get_target_chats`` and ``modify_progress``. Keys read here:
            ``reply_msg_id``, ``progress`` and ``detail_progress``.

    Returns:
        {
            "video": list[Message],  # only present when every sent part is a Message
            "audio": Message,  # only present when the audio was sent
        }
    """
    video_path: Path = info["video_path"]
    audio_path: Path = info["audio_path"]
    thumb = info["thumb"]
    video_messages = []
    audio_message = None
    video_target, audio_target = get_target_chats(message, ytdlp_video_target, ytdlp_audio_target, **kwargs)
    reply_msg_id = kwargs.get("reply_msg_id", 0)
    reply_parameters = get_reply_to(message.id, reply_msg_id)

    # split large videos into multiple parts (>= 2GB)
    if true(ytdlp_send_video) and video_path.is_file():
        video_path = await convert_to_h264(video_path, allow_re_encoding=True, max_file_size=YTDLP_RE_ENCODING_MAX_FILE_BYTES, skip_h264=True)
        if video_path.stat().st_size > MAX_FILE_BYTES:
            await modify_progress(text=f"🎬视频大小超过Telegram限制({MAX_FILE_BYTES / 1024 / 1024:.0f}MB), 正在切分...", **kwargs)
        videos = await preprocess_media([{"video": video_path, "thumb": thumb}])
        for idx, video in enumerate(videos):
            video["thumb"] = thumb  # use the same thumb for all videos
            # mark the part number in the title when the video was split
            caption = info["caption"].replace("📝[", f"📝[P{idx + 1}-") if len(videos) > 1 else info["caption"]
            caption = (await smart_split(caption, CAPTION_LENGTH))[0]
            await modify_progress(text=f"🎬视频上传中-P{idx + 1}: {readable_size(path=video['video'])}", force_update=True, **kwargs)
            video_messages.append(
                await client.send_video(
                    chat_id=to_int(video_target),
                    caption=caption,
                    reply_parameters=reply_parameters,
                    progress=telegram_uploading,
                    progress_args=(kwargs.get("progress", False), video["video"], true(kwargs.get("detail_progress"))),  # message, path, detail_progress
                    **video,
                )
            )
    # don't need to split audio
    if true(ytdlp_send_audio) and audio_path.is_file():
        await modify_progress(text=f"🎧音频上传中: {readable_size(path=audio_path)}", force_update=True, **kwargs)
        caption = (await smart_split(info["caption"], CAPTION_LENGTH))[0]
        audio_message = await client.send_audio(
            chat_id=to_int(audio_target),
            audio=audio_path.as_posix(),
            caption=caption,
            performer=info["author"],
            title=info["title"],
            # `or 0` also covers a duration that is present but None/empty
            duration=round(float(info.get("duration") or 0)),
            reply_parameters=reply_parameters,
            progress=telegram_uploading,
            progress_args=(kwargs.get("progress", False), audio_path, true(kwargs.get("detail_progress"))),  # message, path, detail_progress
            thumb=info["thumb"],
        )
    await modify_progress(del_status=True, **kwargs)
    sent_messages = {}
    # the video list is kept only when every part is a Message
    if all(isinstance(x, Message) for x in video_messages):
        sent_messages["video"] = video_messages
    if isinstance(audio_message, Message):
        sent_messages["audio"] = audio_message
    return sent_messages