bennybot/src/ytdlp/main.py at main

  1#!/usr/bin/env python
  2# -*- coding: utf-8 -*-
  3import io
  4import warnings
  5from pathlib import Path
  6from typing import Literal
  7
  8import markdown
  9from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
 10from loguru import logger
 11from pyrogram.client import Client
 12from pyrogram.types import Message
 13from pyrogram.types.messages_and_media.message import Str
 14
 15from ai.main import ai_text_generation
 16from config import AI, ASR, CAPTION_LENGTH, DB, MAX_FILE_BYTES, PREFIX, READING_SPEED, YTDLP_RE_ENCODING_MAX_FILE_BYTES
 17from database.database import get_db
 18from messages.database import copy_messages_from_db, save_messages
 19from messages.preprocess import preprocess_media
 20from messages.progress import modify_progress, telegram_uploading
 21from messages.sender import send2tg
 22from messages.utils import count_without_entities, get_reply_to, smart_split
 23from multimedia import convert_to_h264
 24from preview.bilibili import get_bilibili_comments, get_bilibili_vinfo, make_bvid_clickable
 25from preview.youtube import get_youtube_comments, get_youtube_vinfo
 26from publish import publish_telegraph
 27from utils import count_subtitles, rand_number, readable_size, readable_time, soup_to_text, to_int, true, ts_to_dt, unicode_to_ascii
 28from ytdlp.download import ytdlp_download
 29from ytdlp.utils import append_subtitle, cleanup_ytdlp, generate_prompt, get_subtitles, platform_emoji
 30
 31
 32async def preview_ytdlp(
 33    client: Client,
 34    message: Message,
 35    url: str = "",
 36    *,
 37    platform: Literal["youtube", "bilibili", "ytdlp"] = "ytdlp",
 38    vid: str = "",
 39    bvid: str = "",
 40    use_db: bool = True,
 41    ytdlp_download_video: bool = True,
 42    ytdlp_send_video: bool = True,
 43    ytdlp_send_audio: bool = True,
 44    bilibili_comments: bool = True,
 45    youtube_comments: bool = True,
 46    proxy: str | None = None,
 47    ytdlp_video_target: str | int | None = None,
 48    ytdlp_audio_target: str | int | None = None,
 49    ytdlp_subtitle_target: str | int | None = None,
 50    ytdlp_send_subtitle: bool = False,
 51    ytdlp_send_summary: bool = False,
 52    summary_model_id: str = AI.SUBTITLE_SUMMARY_MODEL_ALIAS,
 53    to_telegraph: bool = True,
 54    show_author: bool = True,
 55    show_title: bool = True,
 56    show_pubdate: bool = True,
 57    show_statistics: bool = True,
 58    show_description: bool = True,
 59    **kwargs,
 60) -> list[Message]:
 61    """Preview ytdlp link in the message.
 62
 63    Args:
 64        client (Client): The Pyrogram client.
 65        message (Message): The trigger message object.
 66        url (str, optional): ytdlp link.
 67        platform (str, optional): The platform of the video.
 68        vid (str, optional): The YouTube video id.
 69        bvid (str, optional): The Bilibili video id.
 70        use_db (bool, optional): Whether to use database to cache the result. Defaults to True.
 71        ytdlp_download_video (bool, optional): Download video. Defaults to True.
 72        ytdlp_send_video (bool, optional): Send video. Defaults to True.
 73        ytdlp_send_audio (bool, optional): Send audio. Defaults to False.
 74        bilibili_comments (bool, optional): Enable bilibili comments
 75        youtube_comments (bool, optional): Enable youtube comments
 76        proxy (str, optional): Proxy to use. Defaults to None.
 77        ytdlp_video_target (str | int, optional): The target chat id to send video.
 78        ytdlp_audio_target (str | int, optional): The target chat id to send audio.
 79        ytdlp_send_subtitle (bool, optional): Send subtitle. Defaults to False.
 80        ytdlp_send_summary (bool, optional): Send AI summary. Defaults to False.
 81        summary_model_id (str, optional): The model id to use for AI summary.
 82        to_telegraph (bool, optional): Whether to publish the subtitle or transcription to telegraph.
 83    """
 84    logger.trace(f"{url=} {kwargs=}")
 85    if kwargs.get("show_progress") and not kwargs.get("progress"):
 86        res = await send2tg(client, message, texts=f"🔗正在解析链接\n{url}", **kwargs)
 87        kwargs["progress"] = res[0]
 88    # try cache
 89    db_key = url
 90    if true(use_db) and (kv := await get_db(db_key)):
 91        logger.debug(f"YT-DLP preview {DB.ENGINE} cache hit for key={db_key}")
 92        kwargs |= {"copy_video_msg": kwargs.get("copy_video_msg", ytdlp_send_video), "copy_audio_msg": kwargs.get("copy_audio_msg", ytdlp_send_audio)}
 93        if db_msgs := await copy_messages_from_db(client, message, key=db_key, kv=kv, **kwargs):
 94            return db_msgs
 95        await modify_progress(text=f"❌从{DB.ENGINE}缓存中转发失败, 尝试重新解析...", **kwargs)
 96
 97    # get video info from API first
 98    if platform == "youtube":
 99        vinfo = await get_youtube_vinfo(vid)
100    elif platform == "bilibili":
101        vinfo = await get_bilibili_vinfo(bvid)
102    else:
103        vinfo = {}
104    if platform in ["youtube", "bilibili"] and not vinfo.get("downloadable"):
105        await modify_progress(text=vinfo.get("error_msg") or "❌视频无法下载", force_update=True, **kwargs)
106        return []
107
108    info = await ytdlp_download(url, proxy=proxy, platform=platform, ytdlp_download_video=ytdlp_download_video, **kwargs)
109    if not (info["video_path"].is_file() or info["audio_path"].is_file()):
110        return []
111    info |= vinfo  # merge video info
112    captions = await generate_captions(
113        info,
114        url=url,
115        platform=platform,
116        vid=vid,
117        bvid=bvid,
118        bilibili_comments=bilibili_comments,
119        youtube_comments=youtube_comments,
120        show_author=show_author,
121        show_title=show_title,
122        show_pubdate=show_pubdate,
123        show_statistics=show_statistics,
124        show_description=show_description,
125    )
126    # add send_from_user prefix to caption
127    prefix = kwargs.get("send_from_user", "")
128    texts = f"{prefix}{captions['caption']}"
129    info["caption"] = texts
130    sent_messages = await send_media(client, message, info, ytdlp_video_target, ytdlp_audio_target, ytdlp_send_video=ytdlp_send_video, ytdlp_send_audio=ytdlp_send_audio, **kwargs)
131
132    # get subtitles
133    subtitles = ""
134    if true(ytdlp_send_subtitle) or true(ytdlp_send_summary):
135        fpath = info["audio_path"] if info["audio_path"].is_file() else info["video_path"]
136        asr_engine = kwargs.get("asr_engine", "uncensored") if platform == "youtube" else ASR.DEFAULT_ENGINE
137        if sub := await get_subtitles(fpath, url, asr_engine, info):
138            subtitles = f"🔤<b>字幕:</b>\n{sub}"
139
140    # get ai summary
141    summary = ""
142    if subtitles and true(ytdlp_send_summary):
143        prompt = generate_prompt(info, target="summary")
144        ai_msg = Message(  # Construct a message for AI
145            id=rand_number(),
146            chat=message.chat,
147            text=Str(f"{PREFIX.AI_TEXT_GENERATION} @{summary_model_id} {prompt}"),
148            reply_to_message=Message(id=rand_number(), chat=message.chat, text=Str(subtitles)),
149        )
150        aires = await ai_text_generation(client, ai_msg, silent=True)
151        if aires.get("texts"):
152            summary = f"🤖<b>{aires['model_name']}总结:</b>\n{markdown.markdown(aires['texts'])}\n"
153
154    if summary_with_subtitle := f"{summary}{subtitles}":
155        telegraph_name = "🤖总结 & 🔤字幕" if summary and subtitles else "🔤字幕" if subtitles else "🤖AI总结"
156        caption = f"{captions['caption_without_comments']}\n"
157        caption += f"#️⃣字符数: {count_subtitles(summary_with_subtitle)}\n"
158        caption += f"⏳阅读时长: {readable_time(60 * count_subtitles(summary_with_subtitle) / READING_SPEED)}"
159        html = "\n".join([f"<p>{s}</p>" for s in summary_with_subtitle.split("\n")]).replace("<p></p>", "")
160        if true(to_telegraph) and (telegraph_url := await publish_telegraph(title=info["title"], html=html, author=info["author"], url=url)):
161            caption += f"\n⚡️[即时预览]({telegraph_url})"
162            sent_messages = await append_subtitle(f'<a href="{telegraph_url}">{telegraph_name}</a>', sent_messages)
163        else:
164            subtitle_target = ytdlp_subtitle_target or kwargs.get("target_chat") or message.chat.id
165            with io.BytesIO(subtitles.encode("utf-8")) as f:
166                subtitle_msg = await client.send_document(to_int(subtitle_target), f, file_name=f"{info['title']}.txt", caption=caption)
167            if isinstance(subtitle_msg, Message):
168                sent_messages["caption"] = subtitle_msg
169
170    # save messages when video is uploaded
171    messages = [msg for msgs in sent_messages.values() for msg in (msgs if isinstance(msgs, list) else [msgs])]
172    if bool(use_db and info["video_path"].is_file()):
173        metadata = {}
174        for k in ["author", "author_url", "title", "url", "create_time", "duration", "description"]:
175            if v := locals().get(k):
176                metadata[k] = unicode_to_ascii(v)
177        await save_messages(messages=messages, key=url, metadata=metadata)
178
179    Path(info["json_path"]).unlink(missing_ok=True)
180    cleanup_ytdlp(info["id"])
181    return messages
182
183
async def generate_captions(
    info: dict,
    url: str,
    platform: str,
    vid: str,
    bvid: str,
    *,
    bilibili_comments: bool,
    youtube_comments: bool,
    show_author: bool = True,
    show_title: bool = True,
    show_pubdate: bool = True,
    show_statistics: bool = True,
    show_description: bool = True,
) -> dict:
    """Generate captions.

    Every caption part is always computed and returned under its own key; the
    ``show_*`` flags only control whether the part is included in the assembled
    caption text.

    Args:
        info (dict): Video info. ``extractor`` is required; ``author``,
            ``author_url``, ``pubdate``, ``timestamp``, ``upload_date``,
            ``statistics``, ``title`` and ``description`` are optional.
        url (str): Link used for the title and as the author-link fallback.
        platform (str): Comments are fetched only for "bilibili" / "youtube".
        vid (str): YouTube video id (for comments).
        bvid (str): Bilibili video id (for comments).
        bilibili_comments (bool): Fetch bilibili comments.
        youtube_comments (bool): Fetch youtube comments.
        show_author (bool): Include the author line.
        show_title (bool): Include the title line.
        show_pubdate (bool): Include the date line.
        show_statistics (bool): Include the statistics line.
        show_description (bool): Include the description.

    Returns:
        dict: Keys ``emoji``, ``author``, ``create_time``, ``statistics``,
        ``title``, ``description``, ``comments``, ``caption_without_comments``
        and ``caption``.
    """
    captions = ""
    results = {}
    emoji = platform_emoji(info["extractor"])
    results["emoji"] = emoji

    # author: link to the author page when known, otherwise to the video itself
    author = info.get("author")
    author_url = info.get("author_url")  # .get: the key may be absent
    if author and author_url:
        results["author"] = f"{emoji}[{author}]({author_url})"
    elif author:
        results["author"] = f"{emoji}[{author}]({url})"
    else:
        results["author"] = f"{emoji}[原始链接]({url})"
    if true(show_author):
        captions += f"{results['author']}\n"

    # date: first available of pubdate, timestamp, upload_date
    if info.get("pubdate"):
        results["create_time"] = "🕒" + info["pubdate"]
    elif dt := ts_to_dt(info.get("timestamp")):
        results["create_time"] = f"🕒{dt:%Y-%m-%d %H:%M:%S}"
    elif info.get("upload_date"):
        # must read the same key that was just tested
        results["create_time"] = "🕒" + info["upload_date"]
    else:
        results["create_time"] = ""
    if true(show_pubdate) and results["create_time"]:
        captions += f"{results['create_time']}\n"

    results["statistics"] = info.get("statistics", "")
    if true(show_statistics) and results["statistics"]:
        captions += f"{results['statistics']}\n"

    # title
    if info.get("title"):
        results["title"] = f"📝[{info['title']}]({url})"
    else:
        results["title"] = ""
    if true(show_title) and results["title"]:
        captions += f"{results['title']}\n"

    # desc ("-" is treated as an empty description)
    if (desc := info.get("description")) and (desc != "-"):
        # NOTE: this installs a process-wide "ignore" filter, not one scoped to this call
        warnings.simplefilter("ignore", MarkupResemblesLocatorWarning)
        soup = BeautifulSoup(desc, "html.parser")
        desc_text = soup_to_text(soup)
        results["description"] = make_bvid_clickable(desc_text)
    else:
        results["description"] = ""
    if true(show_description) and results["description"]:
        captions += f"{results['description']}\n"

    # comments
    comment_list = []
    comments = ""
    if true(bilibili_comments) and platform == "bilibili":
        comment_list = await get_bilibili_comments(bvid)
    elif true(youtube_comments) and platform == "youtube":
        comment_list = await get_youtube_comments(vid)
    for comment in comment_list:
        # Measure the caption plus the comments accepted so far, so that the
        # total (not each comment alone) stays under the limit.
        if await count_without_entities(f"{captions}{comments}{comment}") < CAPTION_LENGTH - 15:  # leave some margin for other info
            comments += comment
    comments = comments.strip()
    results["comments"] = comments
    results["caption_without_comments"] = captions.strip()
    results["caption"] = f"{captions}{comments}".strip()
    return results
265
266
def get_target_chats(message: Message, video_target: str | int | None = None, audio_target: str | int | None = None, **kwargs) -> tuple[int | str, int | str]:
    """Resolve the chats that video and audio messages are sent to.

    A target that is ``None`` falls back to ``kwargs["target_chat"]`` when that
    is truthy, otherwise to the chat of the trigger message. Both results are
    passed through ``to_int``.

    Returns:
        (video_target_chat, audio_target_chat)
    """
    fallback_chat = kwargs.get("target_chat") or message.chat.id
    resolved_video = fallback_chat if video_target is None else video_target
    resolved_audio = fallback_chat if audio_target is None else audio_target
    return to_int(resolved_video), to_int(resolved_audio)
279
280
async def send_media(
    client: Client,
    message: Message,
    info: dict,
    ytdlp_video_target: str | int | None = None,
    ytdlp_audio_target: str | int | None = None,
    *,
    ytdlp_send_video: bool = True,
    ytdlp_send_audio: bool = False,
    **kwargs,
) -> dict:
    """Send media to target chats.

    Args:
        client (Client): The Pyrogram client.
        message (Message): The trigger message; used for the reply target and the default chat.
        info (dict): Download info. Reads ``video_path``, ``audio_path``, ``thumb``
            and ``caption`` (required) plus ``author``, ``title`` and ``duration`` (optional).
        ytdlp_video_target (str | int, optional): Chat to send the video to.
        ytdlp_audio_target (str | int, optional): Chat to send the audio to.
        ytdlp_send_video (bool, optional): Upload the video when its file exists.
        ytdlp_send_audio (bool, optional): Upload the audio when its file exists.
        **kwargs: Forwarded to the progress helpers. Keys read here:
            ``target_chat``, ``reply_msg_id``, ``progress``, ``detail_progress``.

    Returns:
    {
        "video": list[Message],
        "audio": Message,
    }
    "video" is set only when every uploaded part came back as a Message (this
    includes the empty list when no video was sent); "audio" is set only when
    the audio upload came back as a Message.
    """
    video_path: Path = info["video_path"]
    audio_path: Path = info["audio_path"]
    thumb = info["thumb"]
    video_messages = []
    audio_message = None
    video_target, audio_target = get_target_chats(message, ytdlp_video_target, ytdlp_audio_target, **kwargs)
    reply_msg_id = kwargs.get("reply_msg_id", 0)
    reply_parameters = get_reply_to(message.id, reply_msg_id)

    # videos larger than MAX_FILE_BYTES are sent as multiple parts
    if true(ytdlp_send_video) and video_path.is_file():
        video_path = await convert_to_h264(video_path, allow_re_encoding=True, max_file_size=YTDLP_RE_ENCODING_MAX_FILE_BYTES, skip_h264=True)
        if video_path.stat().st_size > MAX_FILE_BYTES:
            await modify_progress(text=f"🎬视频大小超过Telegram限制({MAX_FILE_BYTES / 1024 / 1024:.0f}MB), 正在切分...", **kwargs)
        videos = await preprocess_media([{"video": video_path, "thumb": thumb}])
        for idx, video in enumerate(videos):
            video["thumb"] = thumb  # use the same thumb for all videos
            # tag the title with the part number when there are several parts
            caption = info["caption"].replace("📝[", f"📝[P{idx + 1}-") if len(videos) > 1 else info["caption"]
            caption = (await smart_split(caption, CAPTION_LENGTH))[0]  # keep only the first chunk
            await modify_progress(text=f"🎬视频上传中-P{idx + 1}: {readable_size(path=video['video'])}", force_update=True, **kwargs)
            video_messages.append(
                await client.send_video(
                    chat_id=to_int(video_target),
                    caption=caption,
                    reply_parameters=reply_parameters,
                    progress=telegram_uploading,
                    progress_args=(kwargs.get("progress", False), video["video"], true(kwargs.get("detail_progress"))),  # message, path, detail_progress
                    **video,
                )
            )
    # don't need to split audio
    if true(ytdlp_send_audio) and audio_path.is_file():
        await modify_progress(text=f"🎧音频上传中: {readable_size(path=audio_path)}", force_update=True, **kwargs)
        caption = (await smart_split(info["caption"], CAPTION_LENGTH))[0]
        audio_message = await client.send_audio(
            chat_id=to_int(audio_target),
            audio=audio_path.as_posix(),
            caption=caption,
            # author / title are optional in `info`; a missing one must not abort the upload
            performer=info.get("author"),
            title=info.get("title"),
            # `or 0` also covers a key that is present with a None / empty value,
            # where float() would raise
            duration=round(float(info.get("duration") or 0)),
            reply_parameters=reply_parameters,
            progress=telegram_uploading,
            progress_args=(kwargs.get("progress", False), audio_path, true(kwargs.get("detail_progress"))),  # message, path, detail_progress
            thumb=info["thumb"],
        )
    await modify_progress(del_status=True, **kwargs)
    sent_messages = {}
    if all(isinstance(x, Message) for x in video_messages):
        sent_messages["video"] = video_messages
    if isinstance(audio_message, Message):
        sent_messages["audio"] = audio_message
    return sent_messages
351        sent_messages["audio"] = audio_message
352    return sent_messages