bennybot/src/summarize/main.py at main

  1#!/venv/bin/python
  2# -*- coding: utf-8 -*-
  3import asyncio
  4import json
  5import warnings
  6from pathlib import Path
  7from typing import Any
  8
  9from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
 10from glom import Coalesce, glom
 11from loguru import logger
 12from pyrogram.client import Client
 13from pyrogram.types import Message
 14
 15from ai.texts.contexts import MARKDOWN_EXT, TXT_EXT, full_chain_contexts, is_multi_user_chat, message_bytes
 16from config import AI, ASR, PREFIX, PROXY, TZ
 17from database.r2 import get_cf_r2
 18from messages.database import copy_messages_from_db
 19from messages.help import social_media_help
 20from messages.parser import parse_msg
 21from messages.sender import send2tg, send_blockquote_texts
 22from messages.utils import delete_message, equal_prefix, set_reaction, startswith_prefix
 23from networking import match_social_media_link
 24from others.download_external import AUDIO_FORMAT, VIDEO_FORMAT
 25from preview.arxiv import preview_arxiv
 26from preview.bilibili import make_bvid_clickable, preview_bilibili
 27from preview.douyin import preview_douyin
 28from preview.instagram import preview_instagram
 29from preview.twitter import preview_twitter
 30from preview.v2ex import preview_v2ex
 31from preview.wechat import preview_wechat
 32from preview.weibo import preview_weibo
 33from preview.xiaohongshu import preview_xhs
 34from summarize.summarize import summarize
 35from utils import convert2md, nowstr, read_text, soup_to_text, ts_to_dt
 36from ytdlp.download import ytdlp_download
 37from ytdlp.utils import ProxyError, get_subtitles
 38
 39
 40# ruff: noqa: RET502,RET503
 41async def ai_summary(client: Client, message: Message, summary_model_id: str = AI.AI_SUMMARY_MODEL_ALIAS, *, mermaid: bool = False, **kwargs) -> Any:
 42    if not startswith_prefix(message.content, prefix=PREFIX.AI_SUMMARY):
 43        return
 44    this_msg = message
 45    if equal_prefix(message.content, PREFIX.AI_SUMMARY):
 46        if not message.reply_to_message:
 47            return await send2tg(client, message, texts=social_media_help(message), **kwargs)
 48        message = message.reply_to_message
 49
 50    chains = await full_chain_contexts(client, message, order="asc")  # old to new
 51    file_bytes = sum(message_bytes(m) for m in chains)
 52    if file_bytes > 512 * 1024 * 1024:
 53        logger.warning(f"file_bytes: {file_bytes} > 512MB, skip")
 54        await this_msg.reply_text("❌上下文大小超过512MB，不支持总结")
 55        await asyncio.sleep(5)
 56        await delete_message(message)
 57        return
 58    await set_reaction(client, this_msg, "👌")
 59    matched = await match_social_media_link(str(message.content))
 60    kwargs |= {
 61        "summary_twitter": True,
 62        "summary_douyin": True,
 63        "summary_xhs": True,
 64        "summary_weibo": True,
 65        "summary_wechat": True,
 66        "summary_instagram": True,
 67        "summary_v2ex": True,
 68        "summary_ytdlp": True,
 69        "enable_corrector": False,
 70    } | matched
 71    if matched["platform"] == "arxiv":
 72        return await preview_arxiv(client, message, **kwargs)
 73    if matched["platform"] in ["douyin", "tiktok"]:  # noqa: RET505
 74        return await preview_douyin(client, message, **kwargs)
 75    if matched["platform"] == "instagram":
 76        return await preview_instagram(client, message, **kwargs)
 77    if matched["platform"] in ["x", "twitter", "fxtwitter", "fixupx"]:
 78        return await preview_twitter(client, message, **kwargs)
 79    if matched["platform"] == "weibo":
 80        return await preview_weibo(client, message, **kwargs)
 81    if matched["platform"] == "xiaohongshu":
 82        return await preview_xhs(client, message, **kwargs)
 83    if matched["platform"] == "wechat":
 84        return await preview_wechat(client, message, **kwargs)
 85    if matched["platform"] == "v2ex":
 86        return await preview_v2ex(client, message, **kwargs)
 87    if matched["platform"].startswith("bilibili-"):  # this is not bilibili video, for videos, use yt-dlp
 88        return await preview_bilibili(client, message, **kwargs)
 89
 90    sources = await get_sources(client, chains)
 91    info = {}
 92    if matched["platform"] in ["bilibili", "youtube", "ytdlp"]:
 93        r2 = await get_cf_r2(matched["db_key"])
 94        if "🤖AI导读" in "".join(glom(r2, "data.*.text", default=[])) and await copy_messages_from_db(client, message, key=matched["db_key"], kv=r2, **kwargs):
 95            await set_reaction(client, this_msg, "🎉")
 96            return
 97        if info := await download_ytdlp(**kwargs):
 98            sources.extend(info.get("sources", []))
 99
100    logger.debug(f"Summary sources: {sources}")
101    summary = await summarize(
102        sources=sources,
103        model=summary_model_id,
104        title=info.get("title") or "AI导读",
105        author=info.get("author") or "Anonymous",
106        url=matched.get("url"),
107        date=info.get("created_at") or nowstr(TZ),
108        description=info.get("description"),
109        force_r2_page=bool(kwargs.get("force_r2_page")),
110        mermaid=mermaid,
111    )
112    if summary.get("texts"):
113        await send_blockquote_texts(client, message, texts=summary["texts"], **kwargs)
114        await set_reaction(client, this_msg, "🎉")
115        return
116    await set_reaction(client, this_msg, "💔")
117
118
async def get_sources(client: Client, chains: list[Message]) -> list[dict]:
    """Convert a chain of messages into the source list passed to ``summarize``.

    Media attached to a message is downloaded and classified as image, video,
    audio, text (file content inlined as JSON) or generic file. Message text
    is added as a text source; a YouTube link found in it adds an extra
    ``youtube`` source.

    Args:
        client: Pyrogram client used to download media.
        chains: Messages to convert, in the order they should be presented.

    Returns:
        A list of source dicts, each carrying a ``type`` key.
    """
    sources = []
    add_sender = is_multi_user_chat(chains)
    for msg in chains:
        info = parse_msg(msg, silent=True, use_cache=False)
        # The sender name is only attached when is_multi_user_chat() is true.
        meta: dict = {"message_sender": info["full_name"]} if add_sender else {}

        if msg.audio or msg.photo or msg.video or msg.document:
            fpath: str | None = await client.download_media(msg)  # ty:ignore[invalid-assignment]
            # download_media may yield None on failure; Path(None) would raise TypeError.
            # NOTE(review): skipping here also drops the message's caption/text — confirm intended.
            if not fpath or not Path(fpath).is_file():
                continue
            if msg.photo:
                sources.append({"type": "image", "path": fpath})
            elif msg.video:
                sources.append({"type": "video", "path": fpath, "mime_type": msg.video.mime_type})
            elif msg.audio:
                sources.append({"type": "audio", "path": fpath, "mime_type": msg.audio.mime_type})
            elif msg.document:
                mime = glom(msg, "document.mime_type", default="") or ""
                fname = glom(msg, "document.file_name", default="") or ""
                suffix = Path(fname).suffix
                # Classify by MIME type first, falling back to the file extension.
                if mime.startswith("image/"):
                    sources.append({"type": "image", "path": fpath, "mime_type": mime})
                elif mime.startswith("audio/") or suffix in AUDIO_FORMAT:
                    sources.append({"type": "audio", "path": fpath, "mime_type": mime})
                elif mime.startswith("video/") or suffix in VIDEO_FORMAT:
                    sources.append({"type": "video", "path": fpath, "mime_type": mime})
                elif mime.startswith("text/") or suffix in TXT_EXT:
                    txt = {"file_name": fname, "file_content": read_text(fpath)}
                    sources.append({"type": "text", "text": json.dumps(meta | txt, ensure_ascii=False)})
                elif suffix in MARKDOWN_EXT:
                    txt = {"file_name": fname, "file_content": convert2md(path=fpath)}
                    sources.append({"type": "text", "text": json.dumps(meta | txt, ensure_ascii=False)})
                else:
                    sources.append({"type": "file", "path": fpath, "mime_type": mime})
        if txt := glom(msg, Coalesce("content.html", "content", "text", "caption"), default=""):
            # With a sender to attach, wrap the text in JSON; otherwise pass it through as-is.
            texts = json.dumps(meta | {"message": txt}, ensure_ascii=False) if add_sender else txt
            sources.append({"type": "text", "text": texts})
            matched = await match_social_media_link(txt)
            if matched["platform"] == "youtube":
                sources.append({"type": "youtube", "url": matched["url"]})
    return sources
160
161
async def download_ytdlp(url: str, **kwargs) -> dict:
    """Download ``url`` with yt-dlp and build the summary info for it.

    On ``ProxyError`` the download is retried once through
    ``PROXY.YTDLP_FALLBACK`` (when configured).

    Args:
        url: The media URL to download.
        **kwargs: Options forwarded to ``ytdlp_download``; must include
            ``platform``.

    Returns:
        The dict produced by ``ytdlp_info``, or ``{}`` when no video file was
        downloaded or the proxy error could not be recovered from.
    """
    kwargs |= {"ytdlp_download_video": True, "show_progress": False}
    try:
        resp = await ytdlp_download(url, **kwargs)
        if resp["video_path"].is_file():
            return await ytdlp_info(resp, url, kwargs["platform"])
    except ProxyError:
        logger.error(f"🚫{kwargs['platform']}代理错误")
        fallback = PROXY.YTDLP_FALLBACK
        # Only retry if this attempt was not already using the fallback proxy;
        # otherwise a failing fallback would recurse without bound.
        if fallback and kwargs.get("proxy") != fallback:
            logger.warning(f"🔄使用备用代理{PROXY.YTDLP_FALLBACK}")
            kwargs |= {"proxy": fallback}
            return await download_ytdlp(url, **kwargs)
    return {}
175
176
async def ytdlp_info(info: dict, url: str, platform: str) -> dict:
    """Build summary metadata and sources from a yt-dlp download result.

    Args:
        info: Download result; must contain ``video_path`` and ``audio_path``
            (objects exposing ``is_file()`` / ``as_posix()``) and may contain
            ``author``, ``title``, ``pubdate``, ``timestamp``,
            ``upload_date`` and ``description``.
        url: The original media URL.
        platform: Platform name; its title-cased form is used as the
            ``platform`` value and as the fallback title.

    Returns:
        A dict with ``platform``, ``author``, ``title``, ``url``,
        ``created_at``, optionally ``description``, and ``sources`` (the list
        of source dicts for the summariser).
    """
    data = {
        "platform": platform.title(),
        "author": info.get("author") or "Anonymous",
        "title": info.get("title") or platform.title(),
        "url": url,
    }
    sources = []
    video = info["video_path"]
    audio = info["audio_path"]
    # The audio file is preferred for subtitles/ASR when it exists, else the video.
    asr_path = audio if audio.is_file() else video
    if video.is_file():
        sources.append({"type": "video", "path": video.as_posix()})
    elif audio.is_file():
        sources.append({"type": "audio", "path": audio.as_posix()})

    if subtitles := await get_subtitles(asr_path, url, asr_engine=ASR.DEFAULT_ENGINE, vinfo=info):
        sources.append({"type": "transcripts", "text": subtitles})

    # date: first available of pubdate, timestamp, upload_date, then "now"
    if info.get("pubdate"):
        data["created_at"] = info["pubdate"].removeprefix("🕒")
    elif dt := ts_to_dt(info.get("timestamp")):
        data["created_at"] = f"{dt:%Y-%m-%d %H:%M:%S}"
    elif info.get("upload_date"):
        # Read the same key that was just checked ("update_date" raised KeyError).
        data["created_at"] = info["upload_date"]
    else:
        data["created_at"] = nowstr(TZ)

    # desc: "-" is treated as "no description"
    if (desc := info.get("description")) and (desc != "-"):
        warnings.simplefilter("ignore", MarkupResemblesLocatorWarning)
        soup = BeautifulSoup(desc, "html.parser")
        desc_text = soup_to_text(soup)
        data["description"] = make_bvid_clickable(desc_text)
    # Serialise the metadata before attaching "sources" so the text source holds metadata only.
    sources.append({"type": "text", "text": json.dumps(data, ensure_ascii=False)})
    data["sources"] = sources
    return data