bennybot/src/ytdlp/main.py at main

  1#!/usr/bin/env python
  2# -*- coding: utf-8 -*-
  3import warnings
  4from pathlib import Path
  5from typing import Literal
  6
  7from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
  8from glom import Coalesce, glom
  9from loguru import logger
 10from pyrogram.client import Client
 11from pyrogram.types import Message
 12
 13from config import AI, ASR, CAPTION_LENGTH, MAX_FILE_BYTES, YTDLP_RE_ENCODING_MAX_FILE_BYTES
 14from database.r2 import get_cf_r2
 15from messages.database import copy_messages_from_db, save_messages
 16from messages.preprocess import preprocess_media
 17from messages.progress import modify_progress, telegram_uploading
 18from messages.sender import send2tg
 19from messages.utils import better_blockquote, count_without_entities, get_reply_to, smart_split
 20from multimedia import convert_to_h264
 21from preview.bilibili import get_bilibili_comments, get_bilibili_vinfo, make_bvid_clickable
 22from preview.youtube import get_youtube_comments, get_youtube_vinfo
 23from publish import publish_telegraph
 24from summarize.summarize import summarize
 25from utils import convert2html, readable_size, soup_to_text, to_int, true, ts_to_dt, unicode_to_ascii
 26from ytdlp.download import ytdlp_download
 27from ytdlp.utils import append_tag, cleanup_ytdlp, generate_prompt, get_subtitles, platform_emoji
 28
 29
 30async def preview_ytdlp(
 31    client: Client,
 32    message: Message,
 33    url: str = "",
 34    *,
 35    platform: Literal["youtube", "bilibili", "ytdlp"] = "ytdlp",
 36    vid: str = "",
 37    bvid: str = "",
 38    use_db: bool = True,
 39    ytdlp_download_video: bool = True,
 40    use_aria2: bool = False,
 41    ytdlp_send_video: bool = True,
 42    ytdlp_send_audio: bool = True,
 43    bilibili_comments: bool = True,
 44    youtube_comments: bool = True,
 45    proxy: str | None = None,
 46    ytdlp_video_target: str | int | None = None,
 47    ytdlp_audio_target: str | int | None = None,
 48    ytdlp_send_subtitle: bool = False,
 49    summary_ytdlp: bool = False,
 50    summary_ytdlp_model: str = AI.SUBTITLE_SUMMARY_MODEL_ALIAS,
 51    enable_corrector: bool = False,
 52    show_author: bool = True,
 53    show_title: bool = True,
 54    show_pubdate: bool = True,
 55    show_statistics: bool = True,
 56    show_description: bool = True,
 57    **kwargs,
 58) -> list[Message]:
 59    """Preview ytdlp link in the message.
 60
 61    Args:
 62        client (Client): The Pyrogram client.
 63        message (Message): The trigger message object.
 64        url (str, optional): ytdlp link.
 65        platform (str, optional): The platform of the video.
 66        vid (str, optional): The YouTube video id.
 67        bvid (str, optional): The Bilibili video id.
 68        use_db (bool, optional): Whether to use database to cache the result. Defaults to True.
 69        ytdlp_download_video (bool, optional): Download video. Defaults to True.
 70        use_aria2 (bool, optional): Whether to use aria2 to download the video. Defaults to False.
 71        ytdlp_send_video (bool, optional): Send video. Defaults to True.
 72        ytdlp_send_audio (bool, optional): Send audio. Defaults to False.
 73        bilibili_comments (bool, optional): Enable bilibili comments
 74        youtube_comments (bool, optional): Enable youtube comments
 75        proxy (str, optional): Proxy to use. Defaults to None.
 76        ytdlp_video_target (str | int, optional): The target chat id to send video.
 77        ytdlp_audio_target (str | int, optional): The target chat id to send audio.
 78        ytdlp_send_subtitle (bool, optional): Send subtitle. Defaults to False.
 79        summary_ytdlp (bool, optional): Send AI summary. Defaults to False.
 80        summary_ytdlp_model (str, optional): The model id to use for AI summary.
 81        to_telegraph (bool, optional): Whether to publish the subtitle or transcription to telegraph.
 82    """
 83    logger.trace(f"{url=} {kwargs=}")
 84    # try cache
 85    db_key = url
 86    if true(use_db) and (kv := await get_cf_r2(db_key)):
 87        logger.debug(f"YT-DLP preview cache hit for key={db_key}")
 88        kwargs |= {"copy_video_msg": kwargs.get("copy_video_msg", ytdlp_send_video), "copy_audio_msg": kwargs.get("copy_audio_msg", ytdlp_send_audio)}
 89        if db_msgs := await copy_messages_from_db(client, message, key=db_key, kv=kv, **kwargs):
 90            return db_msgs
 91        logger.warning("❌从缓存中转发失败, 尝试重新解析...")
 92
 93    if kwargs.get("show_progress") and not kwargs.get("progress"):
 94        res = await send2tg(client, message, texts=f"🔗正在解析链接\n{url}", **kwargs)
 95        kwargs["progress"] = res[0]
 96
 97    # get video info from API first
 98    if platform == "youtube":
 99        vinfo = await get_youtube_vinfo(vid)
100    elif platform == "bilibili":
101        vinfo = await get_bilibili_vinfo(bvid)
102    else:
103        vinfo = {}
104    if platform in ["youtube", "bilibili"] and not vinfo.get("downloadable"):
105        await modify_progress(text=vinfo.get("error_msg") or "❌视频无法下载", force_update=True, **kwargs)
106        return []
107
108    info = await ytdlp_download(
109        url,
110        proxy=proxy,
111        platform=platform,
112        ytdlp_download_video=ytdlp_download_video,
113        use_aria2=use_aria2,
114        **kwargs,
115    )
116    if not (info["video_path"].is_file() or info["audio_path"].is_file()):
117        return []
118    info |= vinfo  # merge video info
119    captions = await generate_captions(
120        info,
121        url=url,
122        platform=platform,
123        vid=vid,
124        bvid=bvid,
125        bilibili_comments=bilibili_comments,
126        youtube_comments=youtube_comments,
127        show_author=show_author,
128        show_title=show_title,
129        show_pubdate=show_pubdate,
130        show_statistics=show_statistics,
131        show_description=show_description,
132    )
133    # add send_from_user prefix to caption
134    prefix = kwargs.get("send_from_user", "")
135    texts = f"{prefix}{captions['caption']}"
136    info["caption"] = texts
137    sent_messages = await send_media(client, message, info, ytdlp_video_target, ytdlp_audio_target, ytdlp_send_video=ytdlp_send_video, ytdlp_send_audio=ytdlp_send_audio, **kwargs)
138
139    # get subtitles
140    subtitles = ""
141    if true(ytdlp_send_subtitle) or true(summary_ytdlp):
142        fpath = info["audio_path"] if info["audio_path"].is_file() else info["video_path"]
143        asr_engine = kwargs.get("asr_engine", "uncensored") if platform == "youtube" else ASR.DEFAULT_ENGINE
144        subtitles = await get_subtitles(fpath, url, asr_engine, info, enable_corrector=enable_corrector)
145
146    # get ai summary
147    telegraph_ai = ""
148    if subtitles and true(summary_ytdlp):
149        desc = info.get("description", "")
150        desc_html = desc if desc.startswith("<") else convert2html(desc)
151        if platform == "bilibili":
152            desc_html = f'<iframe src="https://player.bilibili.com/player.html?isOutside=true&bvid={bvid}&p=1&autoplay=0&poster=1&danmaku=1" frameborder="0" scrolling="no" border="0" framespacing="0" allowfullscreen="true" style="width: 100%; aspect-ratio: 16/9;"></iframe>{desc_html}'
153        elif platform == "youtube":
154            desc_html = f'<iframe src="https://www.youtube.com/embed/{vid}" frameborder="0" scrolling="no" border="0" framespacing="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen="true" style="width: 100%; aspect-ratio: 16/9;"></iframe>{desc_html}'
155        desc_page = {"emoji": "🎬", "name": "视频详情", "html": desc_html}
156        summary = await summarize(
157            sources=[{"type": "system_prompt", "text": generate_prompt(info)}, {"type": "transcripts", "text": subtitles}],
158            model=summary_ytdlp_model,
159            title=info.get("title"),
160            description=desc_page,
161            author=info.get("author"),
162            url=url,
163            date=glom(info, Coalesce("pubdate", "upload_date"), default=""),
164            min_text_length=200,
165        )
166        telegraph_ai = summary.get("telegraph_url", "")
167
168    if telegraph_ai:  # ai summary with subtitles
169        sent_messages = await append_tag(f'<a href="{telegraph_ai}">🤖AI导读</a>', sent_messages)
170    elif subtitles:  # subtitles only
171        html = "\n".join([f"<p>{s}</p>" for s in subtitles.split("\n")]).replace("<p></p>", "")
172        if telegraph_url := await publish_telegraph(title=info["title"], html=html, author=info["author"], url=url):
173            sent_messages = await append_tag(f'<a href="{telegraph_url}">🔤字幕</a>', sent_messages)
174
175    # save messages when video is uploaded
176    messages = [msg for msgs in sent_messages.values() for msg in (msgs if isinstance(msgs, list) else [msgs])]
177    if bool(use_db and info["video_path"].is_file()):
178        metadata = {}
179        for k in ["author", "author_url", "title", "url", "create_time", "duration", "description"]:
180            if v := locals().get(k):
181                metadata[k] = unicode_to_ascii(v)
182        await save_messages(messages=messages, key=url, metadata=metadata)
183
184    Path(info["json_path"]).unlink(missing_ok=True)
185    cleanup_ytdlp(info["id"])
186    return messages
187
188
async def generate_captions(
    info: dict,
    url: str,
    platform: str,
    vid: str,
    bvid: str,
    *,
    bilibili_comments: bool,
    youtube_comments: bool,
    show_author: bool = True,
    show_title: bool = True,
    show_pubdate: bool = True,
    show_statistics: bool = True,
    show_description: bool = True,
) -> dict:
    """Generate captions.

    Every caption part is always computed and returned under its own key; the
    ``show_*`` flags only decide whether the part is appended to the combined
    caption text.

    Args:
        info (dict): Video info. ``extractor`` is required; ``author``,
            ``author_url``, ``pubdate``, ``timestamp``, ``upload_date``,
            ``statistics``, ``title`` and ``description`` are read when present.
        url (str): Link used for the title and as the author-link fallback.
        platform (str): Only ``"bilibili"`` and ``"youtube"`` fetch comments.
        vid (str): The YouTube video id (used to fetch comments).
        bvid (str): The Bilibili video id (used to fetch comments).
        bilibili_comments (bool): Enable bilibili comments.
        youtube_comments (bool): Enable youtube comments.
        show_author (bool, optional): Append the author line.
        show_title (bool, optional): Append the title line.
        show_pubdate (bool, optional): Append the date line.
        show_statistics (bool, optional): Append the statistics line.
        show_description (bool, optional): Append the description.

    Returns:
        dict: Keys ``emoji``, ``author``, ``create_time``, ``statistics``,
        ``title``, ``description``, ``comments``, ``caption_without_comments``
        and ``caption``.
    """
    captions = ""
    results = {}
    emoji = platform_emoji(info["extractor"])
    results["emoji"] = emoji

    # author: link to the author page when known, otherwise to the video itself
    if info.get("author") and info.get("author_url"):
        results["author"] = f"{emoji}[{info['author']}]({info['author_url']})"
    elif info.get("author"):
        results["author"] = f"{emoji}[{info['author']}]({url})"
    else:
        results["author"] = f"{emoji}[原始链接]({url})"
    if true(show_author):
        captions += f"{results['author']}\n"

    # date: pubdate > timestamp > upload_date
    if info.get("pubdate"):
        results["create_time"] = "🕒" + info["pubdate"]
    elif dt := ts_to_dt(info.get("timestamp")):
        results["create_time"] = f"🕒{dt:%Y-%m-%d %H:%M:%S}"
    elif info.get("upload_date"):
        results["create_time"] = "🕒" + info["upload_date"]
    else:
        results["create_time"] = ""
    if true(show_pubdate) and results["create_time"]:
        captions += f"{results['create_time']}\n"

    results["statistics"] = info.get("statistics", "")
    if true(show_statistics) and results["statistics"]:
        captions += f"{results['statistics']}\n"

    # title
    if info.get("title"):
        results["title"] = f"📝[{info['title']}]({url})"
    else:
        results["title"] = ""
    if true(show_title) and results["title"]:
        captions += f"{results['title']}\n"

    # desc ("-" is treated as an empty description)
    if (desc := info.get("description")) and (desc != "-"):
        # plain-text descriptions would otherwise trigger this BeautifulSoup warning
        warnings.simplefilter("ignore", MarkupResemblesLocatorWarning)
        soup = BeautifulSoup(desc, "html.parser")
        desc_text = soup_to_text(soup)
        results["description"] = make_bvid_clickable(desc_text)
    else:
        results["description"] = ""
    if true(show_description) and results["description"]:
        captions += f"{results['description']}\n"

    # comments
    comment_list = []
    comments = ""
    if true(bilibili_comments) and platform == "bilibili":
        comment_list = await get_bilibili_comments(bvid)
    elif true(youtube_comments) and platform == "youtube":
        comment_list = await get_youtube_comments(vid)
    for comment in comment_list:
        # Measure against the comments already accepted as well, so the running
        # total (not just caption + this one comment) stays within the limit.
        if await count_without_entities(f"{captions}{comments}{comment}") < CAPTION_LENGTH - 15:  # leave some margin for other info
            comments += comment
    comments = comments.strip()
    results["comments"] = comments
    results["caption_without_comments"] = captions.strip()
    results["caption"] = f"{captions}{comments}".strip()
    return results
270
271
def get_target_chats(message: Message, video_target: str | int | None = None, audio_target: str | int | None = None, **kwargs) -> tuple[int | str, int | str]:
    """Resolve the chats that receive the video and the audio message.

    A target left as ``None`` falls back to ``kwargs["target_chat"]`` when that
    value is truthy, and otherwise to the chat of ``message``.

    Returns:
        (video_target_chat, audio_target_chat), each passed through ``to_int``.
    """
    fallback_chat = kwargs.get("target_chat") or message.chat.id
    video_chat, audio_chat = (
        to_int(fallback_chat if requested is None else requested)
        for requested in (video_target, audio_target)
    )
    return video_chat, audio_chat
284
285
async def send_media(
    client: Client,
    message: Message,
    info: dict,
    ytdlp_video_target: str | int | None = None,
    ytdlp_audio_target: str | int | None = None,
    *,
    ytdlp_send_video: bool = True,
    ytdlp_send_audio: bool = False,
    **kwargs,
) -> dict:
    """Send media to target chats.

    Args:
        client (Client): The Pyrogram client.
        message (Message): The trigger message; used for the reply target and the default chat.
        info (dict): Download result. Reads ``video_path``, ``audio_path``,
            ``thumb`` and ``caption``, plus the optional ``author``, ``title``
            and ``duration``.
        ytdlp_video_target (str | int, optional): The target chat id to send video.
        ytdlp_audio_target (str | int, optional): The target chat id to send audio.
        ytdlp_send_video (bool, optional): Upload the video file if it exists. Defaults to True.
        ytdlp_send_audio (bool, optional): Upload the audio file if it exists. Defaults to False.
        **kwargs: Forwarded to ``get_target_chats`` and ``modify_progress``. Keys
            read here: ``reply_msg_id``, ``progress`` and ``detail_progress``.

    Returns:
    {
        "video": list[Message],  # always present, may be empty
        "audio": Message,        # only present when the audio was sent
    }
    """
    video_path: Path = info["video_path"]
    audio_path: Path = info["audio_path"]
    thumb = info["thumb"]
    video_messages = []
    audio_message = None
    video_target, audio_target = get_target_chats(message, ytdlp_video_target, ytdlp_audio_target, **kwargs)
    reply_msg_id = kwargs.get("reply_msg_id", 0)
    reply_parameters = get_reply_to(message.id, reply_msg_id)

    # split large videos into multiple parts (>= 2GB)
    if true(ytdlp_send_video) and video_path.is_file():
        video_path = await convert_to_h264(video_path, allow_re_encoding=True, max_file_size=YTDLP_RE_ENCODING_MAX_FILE_BYTES, skip_h264=True)
        if video_path.stat().st_size > MAX_FILE_BYTES:
            await modify_progress(text=f"🎬视频大小超过Telegram限制({MAX_FILE_BYTES / 1024 / 1024:.0f}MB), 正在切分...", **kwargs)
        videos = await preprocess_media([{"video": video_path, "thumb": thumb}])
        for idx, video in enumerate(videos):
            video["thumb"] = thumb  # use the same thumb for all videos
            # tag the title with the part number when the video was split
            caption = info["caption"].replace("📝[", f"📝[P{idx + 1}-") if len(videos) > 1 else info["caption"]
            caption = (await smart_split(caption, CAPTION_LENGTH))[0]
            await modify_progress(text=f"🎬视频上传中-P{idx + 1}: {readable_size(path=video['video'])}", force_update=True, **kwargs)
            video_messages.append(
                await client.send_video(
                    chat_id=to_int(video_target),
                    caption=better_blockquote(caption),
                    reply_parameters=reply_parameters,
                    progress=telegram_uploading,
                    progress_args=(kwargs.get("progress", False), video["video"], true(kwargs.get("detail_progress"))),  # message, path, detail_progress
                    **video,
                )
            )
    # don't need to split audio
    if true(ytdlp_send_audio) and audio_path.is_file():
        await modify_progress(text=f"🎧音频上传中: {readable_size(path=audio_path)}", force_update=True, **kwargs)
        caption = (await smart_split(info["caption"], CAPTION_LENGTH))[0]
        audio_message = await client.send_audio(
            chat_id=to_int(audio_target),
            audio=audio_path.as_posix(),
            caption=better_blockquote(caption),
            performer=info.get("author"),
            title=info.get("title"),
            # `or 0` also covers a duration that is present but None or empty
            duration=round(float(info.get("duration") or 0)),
            reply_parameters=reply_parameters,
            progress=telegram_uploading,
            progress_args=(kwargs.get("progress", False), audio_path, true(kwargs.get("detail_progress"))),  # message, path, detail_progress
            thumb=thumb,
        )
    await modify_progress(del_status=True, **kwargs)
    # Keep every part that really was sent: one non-Message result must not
    # discard the parts that uploaded successfully.
    sent_messages: dict = {"video": [msg for msg in video_messages if isinstance(msg, Message)]}
    if isinstance(audio_message, Message):
        sent_messages["audio"] = audio_message
    return sent_messages