Commit b140695

benny-dou <60535774+benny-dou@users.noreply.github.com>
2025-09-15 05:53:03
feat(ytdlp): add AI summary support
1 parent a83730f
Changed files (4)
src/ytdlp/download.py
@@ -68,6 +68,12 @@ async def ytdlp_download(
     msg = f"✅下载成功:\n{info['summary']}"
     logger.success(f"{msg!r}")
     info["thumb"] = find_thumbnail(info["video_path"], info["audio_path"])
+    # fix audio files saved with a .mp4 suffix: rename them to .m4a
+    if info["audio_path"].suffix == ".mp4":
+        new_path = info["audio_path"].with_suffix(".m4a")
+        info["audio_path"].rename(new_path)
+        info["audio_path"] = new_path
+    # summary
     await modify_progress(text=msg.strip(), force_update=True, **kwargs)
     return info
 
src/ytdlp/main.py
@@ -5,28 +5,29 @@ import warnings
 from pathlib import Path
 from typing import Literal
 
+import markdown
 from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
 from glom import Coalesce, glom
 from loguru import logger
 from pyrogram.client import Client
-from pyrogram.types import Message, ReplyParameters
+from pyrogram.types import Message
+from pyrogram.types.messages_and_media.message import Str
 
-from asr.voice_recognition import asr_file
-from config import ASR, CAPTION_LENGTH, DB, MAX_FILE_BYTES, READING_SPEED, TEXT_LENGTH, YTDLP_RE_ENCODING_MAX_FILE_BYTES
+from config import ASR, CAPTION_LENGTH, DB, GPT, MAX_FILE_BYTES, PREFIX, READING_SPEED, YTDLP_RE_ENCODING_MAX_FILE_BYTES
 from database.database import get_db
+from llm.gpt import gpt_response
 from messages.database import copy_messages_from_db, save_messages
 from messages.preprocess import preprocess_media
 from messages.progress import modify_progress, telegram_uploading
 from messages.sender import send2tg
-from messages.utils import blockquote, count_without_entities, get_reply_to, smart_split, warp_comments
+from messages.utils import count_without_entities, get_reply_to, smart_split, warp_comments
 from multimedia import convert_to_h264
 from preview.bilibili import get_bilibili_comments, get_bilibili_vinfo, make_bvid_clickable
 from preview.youtube import get_youtube_comments, get_youtube_vinfo
 from publish import publish_telegraph
-from subtitles.base import fetch_subtitle
-from utils import count_subtitles, readable_size, readable_time, soup_to_text, to_int, true, ts_to_dt, unicode_to_ascii
+from utils import count_subtitles, rand_number, readable_size, readable_time, soup_to_text, strings_list, to_int, true, ts_to_dt, unicode_to_ascii
 from ytdlp.download import ytdlp_download
-from ytdlp.utils import cleanup_ytdlp, platform_emoji
+from ytdlp.utils import append_subtitle, cleanup_ytdlp, get_subtitles, platform_emoji
 
 
 async def preview_ytdlp(
@@ -48,7 +49,7 @@ async def preview_ytdlp(
     ytdlp_audio_target: str | int | None = None,
     ytdlp_subtitle_target: str | int | None = None,
     ytdlp_send_subtitle: bool = False,
-    subtitle_force_file: bool = False,
+    ytdlp_send_summary: bool = False,
     to_telegraph: bool = True,
     show_author: bool = True,
     show_title: bool = True,
@@ -76,7 +77,7 @@ async def preview_ytdlp(
         ytdlp_video_target (str | int, optional): The target chat id to send video.
         ytdlp_audio_target (str | int, optional): The target chat id to send audio.
         ytdlp_send_subtitle (bool, optional): Send subtitle. Defaults to False.
-        subtitle_force_file (str, optional): If True, force to send transcription as file.
+        ytdlp_send_summary (bool, optional): Send AI summary. Defaults to False.
         to_telegraph (bool, optional): Whether to publish the subtitle or transcription to telegraph.
     """
     logger.trace(f"{url=} {kwargs=}")
@@ -125,41 +126,44 @@ async def preview_ytdlp(
     prefix = kwargs.get("send_from_user", "")
     texts = f"{prefix}{captions['caption']}"
     info["caption"] = texts
-
     sent_messages = await send_media(client, message, info, ytdlp_video_target, ytdlp_audio_target, ytdlp_send_video=ytdlp_send_video, ytdlp_send_audio=ytdlp_send_audio, **kwargs)
 
-    # send subtitles
+    # get subtitles
     subtitles = ""
-    if true(ytdlp_send_subtitle) and info["audio_path"].is_file():
-        if platform in ["bilibili", "youtube"]:  # get subtitle from API first
-            res = await fetch_subtitle(url=url)
-            subtitles = glom(res, Coalesce("full", "subtitles"), default="")
-        if not subtitles:
-            asr_engine = kwargs.get("asr_engine", "uncensored") if "youtube" in info["extractor"] else ASR.DEFAULT_ENGINE
-            res = await asr_file(info["audio_path"], asr_engine, client=client, message=message, silent=True)
-            subtitles = res.get("texts", "")
-            if count_subtitles(subtitles) < 20:
-                subtitles = ""  # ignore too  short transcription
-        if subtitles:
-            subtitle_msg = None
+    if info["audio_path"].is_file() and (true(ytdlp_send_subtitle) or true(ytdlp_send_summary)):
+        asr_engine = kwargs.get("asr_engine", "uncensored") if platform == "youtube" else ASR.DEFAULT_ENGINE
+        if sub := await get_subtitles(info["audio_path"], url, asr_engine):
+            subtitles = f"🔤<b>字幕:</b>\n{sub}"
+
+    # get ai summary
+    summary = ""
+    if subtitles and true(ytdlp_send_summary):
+        prompt = generate_prompt(info)
+        # Construct a message to call GPT
+        ai_msg = Message(
+            id=rand_number(),
+            chat=message.chat,
+            text=Str(f"{strings_list(PREFIX.GPT)[0]} {prompt}"),
+            reply_to_message=Message(id=rand_number(), chat=message.chat, text=Str(subtitles)),
+        )
+        params = {"include_thoughts": False, "append_grounding": False, "silent": True, "custom_model_id": GPT.SUBTITLE_SUMMARY_MODEL_ID, "custom_model_name": GPT.SUBTITLE_SUMMARY_MODEL_NAME}
+        aires = await gpt_response(client, ai_msg, **params)
+        if aires.get("texts"):
+            summary = f"🤖<b>{aires['model_name']}总结:</b>\n{markdown.markdown(aires['texts'])}\n"
+
+    if summary_with_subtitle := f"{summary}{subtitles}":
+        telegraph_name = "🤖总结 & 🔤字幕" if summary and subtitles else "🔤字幕" if subtitles else "🤖AI总结"
+        caption = f"{captions['caption_without_comments']}\n"
+        caption += f"#️⃣字符数: {count_subtitles(summary_with_subtitle)}\n"
+        caption += f"⏳阅读时长: {readable_time(60 * count_subtitles(summary_with_subtitle) / READING_SPEED)}"
+        html = "\n".join([f"<p>{s}</p>" for s in summary_with_subtitle.split("\n")]).replace("<p></p>", "")
+        if true(to_telegraph) and (telegraph_url := await publish_telegraph(title=info["title"], html=html, author=info["author"], url=url)):
+            caption += f"\n⚡️[即时预览]({telegraph_url})"
+            sent_messages = await append_subtitle(f'<a href="{telegraph_url}">{telegraph_name}</a>', sent_messages)
+        else:
             subtitle_target = ytdlp_subtitle_target or kwargs.get("target_chat") or message.chat.id
-            if len(subtitles) > TEXT_LENGTH or true(subtitle_force_file):
-                caption = f"{captions['caption_without_comments']}\n#️⃣字符数: {count_subtitles(subtitles)}\n⏳阅读时长: {readable_time(60 * count_subtitles(subtitles) / READING_SPEED)}"
-                if true(to_telegraph):
-                    html = "\n".join([f"<p>{s}</p>" for s in subtitles.split("\n")])
-                    if telegraph_url := await publish_telegraph(title=info["title"], html=html, author=info["author"], url=url):
-                        caption += f"\n⚡️[即时预览]({telegraph_url})"
-                with io.BytesIO(subtitles.encode("utf-8")) as f:
-                    subtitle_msg = await client.send_document(to_int(subtitle_target), f, file_name=f"{info['title']}.txt", caption=caption)
-            else:
-                # get reply msg id
-                if sent_messages.get("video"):
-                    reply_mid = sent_messages["video"][0].id
-                elif sent_messages.get("audio"):
-                    reply_mid = sent_messages["audio"].id
-                else:
-                    reply_mid = message.id
-                subtitle_msg = await client.send_message(subtitle_target, blockquote(subtitles), reply_parameters=ReplyParameters(message_id=reply_mid))
+            with io.BytesIO(subtitles.encode("utf-8")) as f:
+                subtitle_msg = await client.send_document(to_int(subtitle_target), f, file_name=f"{info['title']}.txt", caption=caption)
             if isinstance(subtitle_msg, Message):
                 sent_messages["caption"] = subtitle_msg
 
@@ -260,6 +264,23 @@ async def generate_captions(
     return results
 
 
+def generate_prompt(info: dict) -> str:
+    """Build the summary request sent to the LLM along with the transcript.
+
+    Reads ``extractor`` (required) plus optional ``author``, ``title``,
+    ``pubdate``/``upload_date`` and ``description``; empty fields are omitted.
+    """
+    byline = f"作者【{info['author']}】" if info.get("author") else ""
+    details = {
+        "节目标题": info.get("title"),
+        "发布日期": glom(info, Coalesce("pubdate", "upload_date"), default=""),
+        "节目简介": info.get("description"),
+    }
+    body = "".join(f"{label}: {value}\n" for label, value in details.items() if value)
+    header = f"以上是{info['extractor'].title()}视频{byline}的一期节目的文字稿。该期节目详情如下:\n"
+    return f"{header}{body}\n请解读本期节目内容。要求: 直接输出节目内容解读, 以“该节目讲述了”开头"
+
+
 def get_target_chats(message: Message, video_target: str | int | None = None, audio_target: str | int | None = None, **kwargs) -> tuple[int | str, int | str]:
     """Get target chats of video and audio messages.
 
src/ytdlp/utils.py
@@ -5,13 +5,18 @@ from pathlib import Path
 from typing import Literal
 from urllib.parse import urlparse
 
+from glom import glom
 from loguru import logger
+from pyrogram.types import Message
 from yt_dlp.utils import YoutubeDLError
 
+from asr.voice_recognition import asr_file
 from config import COOKIE, DOWNLOAD_DIR, PROXY
 from cookies import ytdlp_bilibili_cookie
 from multimedia import convert_img_to_telegram_format, generate_cover
-from utils import remove_none_values
+from networking import match_social_media_link
+from subtitles.base import fetch_subtitle
+from utils import count_subtitles, remove_none_values
 
 
 class ProxyError(Exception):
@@ -177,6 +182,58 @@ def find_thumbnail(video_path: str | Path, audio_path: str | Path) -> str | None
     return None
 
 
+async def get_subtitles(audio_path: str | Path, url: str, asr_engine: str) -> str:
+    """Return subtitle text for ``url``, or an empty string if none is usable.
+
+    Bilibili/YouTube links are looked up via ``fetch_subtitle`` first; otherwise
+    (or if that is empty) ``audio_path`` is transcribed with ``asr_engine``.
+    """
+    link = await match_social_media_link(url)
+    if link["platform"] in ["bilibili", "youtube"]:
+        # read only the "subtitles" field, not Bilibili's AI summary
+        if fetched := (await fetch_subtitle(url=url)).get("subtitles", ""):
+            return fetched
+    transcript = (await asr_file(audio_path, asr_engine, silent=True)).get("texts", "")
+    return "" if count_subtitles(transcript) < 20 else transcript  # drop too-short transcripts
+
+
+async def append_subtitle(name: str, sent_messages: dict) -> dict:
+    """Insert ``name`` into the captions of the already-sent media messages.
+
+    Args:
+        name: Caption line to insert, e.g. an HTML link.
+        sent_messages: ``{"video": list[Message], "audio": Message}``; either
+            entry may be missing or empty, and other keys are ignored.
+
+    Returns:
+        A new dict holding the edited ``"video"`` list (when every edit returned
+        a ``Message``) and the edited ``"audio"`` message (when there was one).
+    """
+
+    def new_caption(m: Message) -> str:
+        # Put ``name`` on the line right after the description line ("📝<a href=").
+        lines = glom(m, "content.html", default="").split("\n")
+        after_desc = (i + 1 for i, line in enumerate(lines) if line.startswith("📝<a href="))
+        # Without a description line the position is -1, i.e. ``name`` goes
+        # just before the last line.
+        lines.insert(next(after_desc, -1), name)
+        return "\n".join(lines)
+
+    modified = {}
+    # Only the two documented keys are edited, and empty/None entries are
+    # skipped so that a missing video or audio message does not raise.
+    videos = sent_messages.get("video") or []
+    video_msgs = [await msg.edit_caption(new_caption(msg)) for msg in videos]
+    # all() is vacuously true for an empty list, so "video" may map to [].
+    if all(isinstance(x, Message) for x in video_msgs):
+        modified["video"] = video_msgs
+    if audio := sent_messages.get("audio"):
+        audio_msg = await audio.edit_caption(new_caption(audio))
+        if isinstance(audio_msg, Message):
+            modified["audio"] = audio_msg
+    return modified
+
+
 def cleanup_ytdlp(vid: str):
     if not vid:
         return
src/publish.py
@@ -30,7 +30,7 @@ async def publish_telegraph(title: str, texts: str | None = None, html: str = ""
         return ""
     if not TOKEN.TELEGRAPH:
         return await publish_cf_r2(title, texts=texts, html=html, author=author, url=url)
-    if texts:
+    if texts and not html:
         html = markdown.markdown(texts)
     telegraph = Telegraph(access_token=TOKEN.TELEGRAPH)
     account_info = {}
@@ -60,7 +60,7 @@ async def publish_cf_r2(title: str, texts: str | None = None, html: str = "", au
     """Publish to CF R2."""
     if not (texts or html):
         return ""
-    if texts:
+    if texts and not html:
         html = markdown.markdown(texts)
     now = nowdt(TZ)
     today = f"{now:%Y-%m-%d}"
@@ -85,7 +85,7 @@ async def publish_neocities(title: str, texts: str | None = None, html: str = ""
         return ""
     if not (texts or html):
         return ""
-    if texts:
+    if texts and not html:
         html = markdown.markdown(texts)
     base_url = "https://neocities.org/api/upload"
     username, password = TOKEN.NEOCITIES.split(",")