bennybot/src/ai/chat_summary.py at main

  1#!/usr/bin/env python
  2# -*- coding: utf-8 -*-
  3import io
  4import json
  5import re
  6from datetime import datetime, timedelta
  7from zoneinfo import ZoneInfo
  8
  9from glom import Coalesce, glom
 10from loguru import logger
 11from pyrogram.client import Client
 12from pyrogram.types import Chat, Message
 13from pyrogram.types.messages_and_media.message import Str
 14
 15from ai.main import ai_text_generation
 16from ai.utils import BOT_TIPS
 17from config import AI, MAX_MESSAGE_SUMMARY, PREFIX, TID, TZ, cache
 18from messages.chat_history import get_history_info_list
 19from messages.parser import parse_msg
 20from messages.progress import modify_progress
 21from messages.sender import send2tg
 22from messages.utils import equal_prefix, remove_prefix, startswith_prefix, to_int
 23from networking import match_social_media_link
 24from subtitles.subtitle import get_subtitle
 25from utils import nowdt, rand_number, strings_list
 26
 27# ruff: noqa: RUF001
# Usage text for the summary commands. `ai_chat_summary` sends it back when the
# command is issued bare (no arguments and not as a reply). The placeholders are
# filled once at import time from `MAX_MESSAGE_SUMMARY` and `PREFIX`.
HELP = f"""🤖**AI总结历史消息** (最多{MAX_MESSAGE_SUMMARY}条)
⚠️使用`{PREFIX.AI_SUMMARY}`命令生成聊天记录文件 + 聊天记录AI总结
⚠️使用`{PREFIX.COMBINATION}`命令只生成聊天记录文件, 不对聊天记录AI总结
⚠️额外功能: 使用`{PREFIX.AI_SUMMARY} + 油管或B站链接`对视频内容进行AI总结

{PREFIX.AI_SUMMARY}使用说明:
- # 后跟消息数量或时间范围
- @ 后跟用户名 (可多次使用@)

**1️⃣指定条目数**
- `{PREFIX.AI_SUMMARY} #N`: 总结最近的N条历史消息
- `{PREFIX.AI_SUMMARY} #N @User`: 总结最近只属于User的N条消息
- `{PREFIX.AI_SUMMARY} #N @User @User2`: 总结最近只属于User和User2的N条消息

示例:
- `{PREFIX.AI_SUMMARY} #10`: 总结最近的10条历史消息
- `{PREFIX.AI_SUMMARY} #20 @123456`: 总结最近UID为123456的20条消息
- `{PREFIX.AI_SUMMARY} #20 @John`: 总结最近用户John(大小写均可)的20条消息
- `{PREFIX.AI_SUMMARY} #20 @John @Bob`: 总结最近用户John和Bob的20条消息

**2️⃣指定最近时间段**
- `{PREFIX.AI_SUMMARY} #interval`: 总结最近interval时段内的消息
- `{PREFIX.AI_SUMMARY} #interval @User`: 总结最近interval时段内, 且只属于User的消息
示例:
- `{PREFIX.AI_SUMMARY} #10m`: 总结最近10分钟内的消息
- `{PREFIX.AI_SUMMARY} #2h`: 总结最近2小时内的消息
- `{PREFIX.AI_SUMMARY} #1d`: 总结最近1天的消息
- `{PREFIX.AI_SUMMARY} #2h @John`: 总结最近2小时内的消息, 且只属于John的消息

**3️⃣ 指定具体时间范围**
- `{PREFIX.AI_SUMMARY} #20240101123000`: 总结从2024-01-01 12:30:00开始到现在的消息
- `{PREFIX.AI_SUMMARY} #20240101123000-20240101124000`: 总结从2024-01-01 12:30:00开始到2024-01-01 12:40:00的消息
- `{PREFIX.AI_SUMMARY} #20240101123000 @User`: 总结从2024-01-01 12:30:00开始到现在的消息, 且只属于User的消息
- `{PREFIX.AI_SUMMARY} #20240101123000-20240101124000 @User`: 总结从2024-01-01 12:30:00开始到2024-01-01 12:40:00的消息, 且只属于User的消息
- 时间格式中没有任何分隔符, 必须为YYYYMMDDHHMMSS (14位纯数字)

注意:
- 用上述各种`{PREFIX.AI_SUMMARY}`命令回复消息M, 视为将截止时间设为消息M的发送时间
- 如果用户名中有空格, 请去除空格。例如: 想指定用户为John Doe请使用 `@JohnDoe`
"""
 68
# Instruction text placed in front of the chat transcript that `ai_chat_summary`
# sends to the model. It describes the tab-separated layout (same column header
# that `parse_history_list` emits) and asks for citations in the
# **username[message_id]** form, which `revert_to_original_url` later rewrites
# into message links. The `\t` sequences are real tab characters at runtime
# because this is a regular (non-raw) string.
SYSTEM_PROMPT = """总结以下群聊记录, 识别关键主题、争议话题以及重要观点。提供一个简明的总结, 保留原始意图和上下文。如有必要, 请注明消息username和message_id。
群聊记录以TSV格式发送(分隔符为\t), 列名为: message_id\tusername\tcontent\treply_to_message_id\treply_to_message_content
其中:
- message_id (int): 消息ID, 唯一标识每条消息
- username (str): 发送消息的用户用户名
- content (str): 消息内容
- reply_to_message_id (int | None): 该消息所回复的消息的message_id
- reply_to_message_content (str | None): 该消息所回复的消息的content

示例:
message_id\tusername\tcontent\treply_to_message_id\treply_to_message_content
123\tJohn\t今天好冷啊\t\t
124\tLily\t我这里还好\t\t
125\tAlice\t你那里下雪了吗\t123\t今天好冷啊
126\tJohn\t天气预报说有，但是还没下\t125\t你那里下雪了吗
127\tDavid\t我这里已经下了\t125\t你那里下雪了吗

# 步骤
1. 阅读聊天记录: 仔细查看对话内容, 了解讨论的流程和上下文。
2. 识别关键主题: 提取整个聊天中讨论的主要话题。
3. 忽略废话及无关内容, 专注于关键信息。
4. 突出争议话题: 记录任何分歧或意见不同的地方。
5. 识别重要观点: 捕捉参与者提出的重要观点或论点。
6. 保留意图和上下文: 确保总结反映对话的原始意义和上下文。
7. 引用用户名和消息ID: 在适当情况下, 引用username和message_id以为某些陈述提供上下文。
8. 撰写总结: 以简洁的语言编写总结, 同时包含必要的引用。

# 输出格式
- 使用中文撰写总结。
- 简明扼要地总结聊天记录的内容。
- 在必要时引用消息username和message_id。
- 保持清晰和简洁的表达。
- 引用username和message_id时, 务必使用 **username[message_id]** 格式。如: **John[123]**
- 如果需要同时引用多个message_id, 请使用 **username[id1, id2, ...]** 格式。如: **Alice[125, 126, 127]**

"""
105
# Heading that `daily_summary` passes as `summary_prefix` for scheduled summaries.
DAILY_SUMMARY_PREFIX = "🏪**#爬楼助手**\n"
# File name of the transcript document sent to the chat. History entries carrying
# this file name are skipped when building later transcripts/summaries.
CONTEXT_FILENAME = "聊天记录.txt"
108
109
async def ai_chat_summary(
    client: Client,
    message: Message,
    summary_prefix: str | None = None,
    summary_model_id: str = AI.CHAT_SUMMARY_MODEL_ALIAS,
    **kwargs,
):
    """Export recent chat history as a text file and optionally summarize it with AI.

    What happens depends on the command text:

    - The bare command (no arguments, not sent as a reply) answers with ``HELP``.
    - ``PREFIX.AI_SUMMARY`` whose argument does not start with ``#`` is scanned for a
      YouTube/Bilibili link (message content, replied-to content and the replied-to
      message's entity URLs). On a hit the request is handed to ``get_subtitle``
      with ``ai_summary=True`` and this function returns.
    - Otherwise the ``#...`` argument selects the history window (message count,
      relative interval, or absolute ``YYYYMMDDHHMMSS`` timestamps) and every
      ``@user`` token restricts the authors. The transcript is sent as a document
      named ``CONTEXT_FILENAME``; the AI summary follows only for
      ``PREFIX.AI_SUMMARY`` (``PREFIX.COMBINATION`` stops after the document).
    - Text that matches none of the patterns is ignored silently.

    Args:
        client (Client): The Pyrogram client.
        message (Message): The trigger message object.
        summary_prefix (str | None): Prefix string of the response message. Defaults
            to a heading built from the model name returned by the AI call.
        summary_model_id (str, optional): The model id to use for AI summary.
        **kwargs: Forwarded to ``send2tg``, ``modify_progress`` and ``get_subtitle``.
            ``show_progress``/``progress`` are read here; ``progress`` and
            ``reply_msg_id`` may be overwritten.
    """
    # A bare command that is not a reply only shows the usage text.
    if equal_prefix(message.text, prefix=[PREFIX.AI_SUMMARY, PREFIX.COMBINATION]) and not message.reply_to_message:
        await send2tg(client, message, texts=HELP, **kwargs)
        return
    if not startswith_prefix(message.content, prefix=[PREFIX.AI_SUMMARY, PREFIX.COMBINATION]):
        return
    # Summarize a YouTube/Bilibili video (only when the argument is not a `#...` history selector).
    if startswith_prefix(message.text, prefix=PREFIX.AI_SUMMARY) and not remove_prefix(message.text, prefix=PREFIX.AI_SUMMARY).strip().startswith("#"):
        # Candidate links: message content, replied-to content, replied-to entity URLs.
        links_to_check = [message.content, glom(message, Coalesce("reply_to_message.content"), default="")]
        if message.reply_to_message:
            reply_info = parse_msg(message.reply_to_message, use_cache=False, silent=True)
            links_to_check.extend(reply_info["entity_urls"])
        for link in links_to_check:
            matched = await match_social_media_link(link)
            if matched["platform"] in ["youtube", "bilibili"]:
                cache.delete(f"parse_msg-{message.chat.id}-{message.id}")
                # Re-issue the request as a subtitle command targeting the matched URL.
                msg = Message(id=glom(message, Coalesce("reply_to_message.id", "id")), chat=message.chat, text=Str(f"{PREFIX.SUBTITLE} {matched['url']}"))
                kwargs |= {"ai_summary": True, "send_subtitle_as": "none"}
                await get_subtitle(client, msg, **kwargs)
                return

    info = parse_msg(message, silent=True)
    need_summary = startswith_prefix(info["text"], prefix=PREFIX.AI_SUMMARY)
    # Rewrite the combination prefix to the summary prefix so one set of patterns
    # handles both commands. The prefix is escaped (it is literal text, not a
    # regex) and the replacement is a callable so it is inserted verbatim.
    info["text"] = re.sub(r"^" + re.escape(PREFIX.COMBINATION), lambda _: PREFIX.AI_SUMMARY, info["text"], flags=re.IGNORECASE)
    text = info["text"]

    num_history = MAX_MESSAGE_SUMMARY
    begin_time = datetime.fromtimestamp(0, tz=ZoneInfo(TZ))
    end_time = nowdt(tz=TZ)
    # When the command replies to a message, history is read up to that message.
    offset_id = 0
    if message.reply_to_message:
        offset_id = message.reply_to_message.id + 1  # include the reply message
        # NOTE(review): HELP says the end time becomes the replied-to message's send
        # time, but this reads the trigger message's date. Confirm the intent before
        # changing, since `get_history_info_list` also filters on `end_time`.
        end_time = message.date.astimezone(ZoneInfo(TZ)) if isinstance(message.date, datetime) else nowdt(TZ)

    # All selectors share the "<prefix> #" lead-in. Matching is case-insensitive to
    # agree with the case-insensitive prefix rewrite above.
    command = r"^" + re.escape(PREFIX.AI_SUMMARY) + r"\s+#"
    # 3️⃣ /summary #YYYYMMDDHHMMSS @user
    # 4️⃣ /summary #YYYYMMDDHHMMSS-YYYYMMDDHHMMSS @user
    if matched := re.match(command + r"(\d{14})-?(\d{14})?", text, re.IGNORECASE):
        time_format = "%Y%m%d%H%M%S"
        # NOTE(review): strptime raises ValueError for impossible timestamps
        # (e.g. month 13); that error is not handled here.
        begin_time = datetime.strptime(matched.group(1), time_format).replace(tzinfo=ZoneInfo(TZ))
        end_time = datetime.strptime(matched.group(2) or end_time.strftime(time_format), time_format).replace(tzinfo=ZoneInfo(TZ))
    # 2️⃣ /summary #interval @user  (/summary #4h @user)
    elif matched := re.match(command + r"(\d+)([mhd])", text, re.IGNORECASE):
        unit = {"m": "minutes", "h": "hours", "d": "days"}[matched.group(2).lower()]
        begin_time = end_time - timedelta(**{unit: int(matched.group(1))})
    # 1️⃣ /summary #N @user
    elif matched := re.match(command + r"(\d+)", text, re.IGNORECASE):
        num_history = min(int(matched.group(1)), MAX_MESSAGE_SUMMARY)
    else:
        return
    # Every `@token` in the command restricts the summary to that user.
    filter_users = re.findall(r"@([^\s]+)", text)

    # set custom chat_id and message_id (useful for debug)
    if matched := re.search(r"cid=(-?\w+)", text, re.IGNORECASE):
        info["cid"] = to_int(matched.group(1))
    if matched := re.search(r"mid=(\d+)", text, re.IGNORECASE):
        offset_id = int(matched.group(1)) + 1  # include this message
    if kwargs.get("show_progress") and "progress" not in kwargs:
        res = await send2tg(client, message, texts=f"📝正在获取历史消息...\n⏩开始时间: {begin_time:%m-%d %H:%M:%S}\n⏯️结束时间: {end_time:%m-%d %H:%M:%S}", **kwargs)
        kwargs["progress"] = res[0]
    history_list = await get_history_info_list(client, info["cid"], offset_id, num_history, begin_time, end_time, filter_users)
    parsed = await parse_history_list(history_list)  # TSV transcript for the model
    if parsed.get("num_message", 0) == 0:
        await send2tg(client, message, texts=f"{num_history}条历史消息中未找到符合条件的消息", **kwargs)
        await modify_progress(del_status=True, **kwargs)
        return
    msg = f"⏩开始时间: {parsed['begin_time']}\n"
    msg += f"⏯️结束时间: {parsed['end_time']}\n"
    msg += f"🔢消息条数: {parsed['num_message']}\n"
    # send contexts as txt file
    txt_format = get_txt_format(history_list)
    with io.BytesIO(txt_format.encode("utf-8")) as f:
        await client.send_document(to_int(message.chat.id), f, file_name=CONTEXT_FILENAME, caption=msg)
    if not need_summary:
        await modify_progress(del_status=True, **kwargs)
        return
    await modify_progress(text=f"🤖AI总结中...\n{msg}", force_update=True, **kwargs)
    ai_msg = Message(  # Construct a message for AI
        id=rand_number(),
        chat=message.chat,
        text=Str(f"{PREFIX.AI_TEXT_GENERATION} @{summary_model_id} {SYSTEM_PROMPT} {parsed['history']}"),
    )
    ai_res = await ai_text_generation(client, ai_msg, silent=True)
    if texts := ai_res.get("texts"):
        summary_prefix = summary_prefix or f"🤖**{ai_res['model_name']}**:\n"
        kwargs["reply_msg_id"] = -1  # DO NOT send as a reply message
        # Turn the model's relative `[id]` citations back into message links.
        texts = revert_to_original_url(texts, history_list, parsed["msg_offset"])
        await send2tg(client, message, texts=f"{summary_prefix}⏩开始时间: {begin_time:%m-%d %H:%M:%S}\n⏯️结束时间: {end_time:%m-%d %H:%M:%S}\n{texts}", **kwargs)
    else:
        logger.warning(f"AI summary returned no text for chat {info['cid']}")
    # Always clear the progress message, including when the model returned nothing;
    # otherwise the "AI总结中..." status would stay in the chat forever.
    await modify_progress(del_status=True, **kwargs)
221
222
async def parse_history_list(info_list: list[dict]) -> dict:
    """Render chat history entries as the tab-separated table sent to the model.

    Only text-bearing messages from non-bot senders are included; previously
    exported transcript documents (file name ``CONTEXT_FILENAME``) are skipped.
    Message ids are written relative to the first included message
    (``mid - msg_offset``), and the replied-to content is truncated to 30
    characters followed by ``...``.

    Args:
        info_list: History entries. Each entry is read for the keys ``file_name``,
            ``is_bot``, ``text``, ``mid``, ``datetime``, ``mtype``, ``full_name``
            and (optionally) ``reply_to_message_id``.

    Returns:
        ``{}`` when no entry qualifies, otherwise
        ``{"history": str, "num_message": int, "msg_offset": int, "begin_time": str, "end_time": str}``
        where ``history`` is the header line plus one row per message and the
        times are formatted as ``%m-%d %H:%M:%S``.
    """
    time_format = "%m-%d %H:%M:%S"
    begin_time = ""
    end_time = nowdt(tz=TZ).strftime(time_format)
    headers = "message_id\tusername\tcontent\treply_to_message_id\treply_to_message_content\n"
    rows: list[str] = []
    msg_offset = 0
    for info in info_list:
        if info["file_name"] == CONTEXT_FILENAME:
            continue
        if info["is_bot"]:  # bots
            continue
        if not info["text"]:  # currently, we only include texts
            continue
        # The first included message defines the offset for relative ids.
        msg_offset = msg_offset or info["mid"]
        stamp = info["datetime"].strftime(time_format)
        begin_time = begin_time or stamp
        end_time = stamp
        media_type = f"[{info['mtype']}] " if info["mtype"] != "text" else ""
        content = media_type + info["text"]
        reply_to_message_id = info.get("reply_to_message_id") or 0
        reply_msg_content = get_message_by_id(info_list, reply_to_message_id).get("message", "")
        if len(reply_msg_content) > 30:
            reply_msg_content = reply_msg_content[:30] + "..."
        # Leave the reply id cell empty for messages that are not replies, as the
        # prompt's example rows do. Subtracting the offset from the 0 placeholder
        # would put a meaningless negative id in the table.
        relative_reply_id = reply_to_message_id - msg_offset if reply_to_message_id else ""
        rows.append(f"{info['mid'] - msg_offset}\t{info['full_name']}\t{content}\t{relative_reply_id}\t{reply_msg_content}\n")
    if not rows:
        return {}

    # IMPORTANT: `BOT_TIPS` must be removed from the history.
    # `ai_text_generation` uses `BOT_TIPS` to decide whether a message came from the
    # model. If the history contained it, the whole context would be attributed to
    # `model` instead of `user`, and a `model`-only conversation is not allowed.
    history = "".join(rows).replace(BOT_TIPS, "")
    return {"history": headers + history, "num_message": len(rows), "msg_offset": msg_offset, "begin_time": begin_time, "end_time": end_time}
276
277
def get_txt_format(info_list: list[dict]) -> str:
    """Build the human-readable transcript that is sent as a text document.

    Each entry becomes a ``[time]name:`` header line followed by its media tags
    and text, an optional ``<quote>`` line for the replied-to message, and a blank
    line. A media group is written once, listing the type tag of every member;
    entries whose file name is ``CONTEXT_FILENAME`` are left out.
    """
    pieces: list[str] = []
    seen_groups = set()  # media groups that were already written
    for entry in info_list:
        if entry["file_name"] == CONTEXT_FILENAME:
            continue
        group_id = entry["media_group_id"]
        if group_id in seen_groups:
            continue
        pieces.append(f"[{entry['datetime']:%m-%d %H:%M:%S}]{entry['full_name']}:\n")
        if entry["mtype"] != "text":  # media message
            if group_id:
                # one tag per member of the group, in list order
                pieces.append(" ".join(f"[{peer['mtype']}]" for peer in info_list if peer["media_group_id"] == group_id))
                seen_groups.add(group_id)
            else:
                pieces.append(f"[{entry['mtype']}]")
        pieces.append(entry["text"])
        # quote the replied-to message when it is part of this history
        quoted = get_message_by_id(info_list, message_id=entry.get("reply_to_message_id") or 0)
        if quoted:
            pieces.append(f"\n<quote>{quoted['username']}: {quoted['message']}</quote>")
        pieces.append("\n\n")
    return "".join(pieces)
303
304
def get_message_by_id(info_list: list[dict], message_id: int = 0) -> dict:
    """Look up a history entry by message id and return a compact view of it.

    Returns ``{}`` when ``message_id`` is falsy or no entry has that ``mid``;
    otherwise a dict with ``username``, ``time`` (``%H:%M:%S``), ``url`` and
    ``message`` (text prefixed with ``[mtype] `` for non-text messages). The first
    matching entry wins.
    """
    if message_id:
        for entry in info_list:
            if entry["mid"] != message_id:
                continue
            kind = entry["mtype"]
            tag = "" if kind == "text" else f"[{kind}] "
            return {
                "username": entry["full_name"],
                "time": f"{entry['datetime']:%H:%M:%S}",
                "url": entry["message_url"],
                "message": tag + entry["text"],
            }
    return {}
319
320
321def get_media_group_by_id(info_list: list[dict], media_group_id: int | None = None) -> list[dict]:
322    if not media_group_id:
323        return []
324    return [x for x in info_list if x["media_group_id"] == media_group_id]
325
326
def revert_to_original_url(ai_texts: str, info_list: list[dict], msg_offset: int) -> str:
    """Replace the model's ``[id, ...]`` citations with links to the cited messages.

    The model sees message ids relative to ``msg_offset``; each cited id is mapped
    back (``id + msg_offset``), looked up in ``info_list`` and rendered as a
    Markdown link ``[HH:MM:SS](message_url)``. A citation group becomes
    ``(link, link, ...)``.

    Ids that cannot be resolved are dropped from their group, and a bracket in
    which nothing resolves (a hallucinated id, or ordinary text such as ``[2024]``
    or ``[ ]``) is left exactly as written instead of collapsing to ``()``.

    Args:
        ai_texts: Text returned by the model.
        info_list: History entries the relative ids refer to.
        msg_offset: Offset that was subtracted from the real message ids.

    Returns:
        ``ai_texts`` with resolvable citations rewritten.
    """

    def to_links(match: re.Match) -> str:
        links = []
        for token in strings_list(match.group(1)):
            relative_id = token.strip()
            if not relative_id.isdecimal():  # blank or malformed piece: nothing to resolve
                continue
            msg = get_message_by_id(info_list, int(relative_id) + msg_offset)
            if msg:
                links.append(f"[{msg['time']}]({msg['url']})")
        if not links:
            return match.group(0)
        return f"({', '.join(links)})"

    # A single pass over the text; every bracket made only of digits, commas and
    # spaces is a candidate citation.
    return re.sub(r"\[([\d, ]+)\]", to_links, ai_texts)
340
341
async def daily_summary(client: Client):
    """Post scheduled chat summaries for the chats configured in ``TID.DAILY_SUMMARY``.

    Does nothing unless the current hour (in ``TZ``) is one of the scheduled
    hours. ``TID.DAILY_SUMMARY`` must be a JSON object mapping a source chat id to
    the chat id that receives the summary; anything else is logged and ignored.
    For every pair a synthetic summary command covering the scheduled number of
    hours is run through ``ai_chat_summary``.

    Args:
        client (Client): The Pyrogram client.
    """
    # hour of day -> how many hours of history to summarize
    durations = {
        0: 12,
        12: 12,
        7: 24,
    }
    duration = durations.get(nowdt(TZ).hour)
    if duration is None:
        return
    try:
        mapping = json.loads(TID.DAILY_SUMMARY)  # summarize chat id -> send to chat id
    except (TypeError, ValueError):  # not a string / not valid JSON
        logger.warning(f"Invalid DAILY_SUMMARY: {TID.DAILY_SUMMARY}")
        return
    if not isinstance(mapping, dict):
        # Valid JSON of the wrong shape (e.g. a list) would otherwise fail on `.items()`.
        logger.warning(f"Invalid DAILY_SUMMARY: {TID.DAILY_SUMMARY}")
        return
    for source_chat_id, target_chat_id in mapping.items():
        logger.info(f"Summary chat {source_chat_id}, send results to {target_chat_id}")
        # Fake trigger message: it lives in the target chat and points at the
        # source chat through the `cid=` debug option.
        message = Message(
            id=rand_number(),
            chat=Chat(id=target_chat_id),
            text=f"{PREFIX.AI_SUMMARY} #{duration}h cid={to_int(source_chat_id)}",  # type: ignore
        )
        await ai_chat_summary(client, message, summary_prefix=DAILY_SUMMARY_PREFIX, target_chat=to_int(target_chat_id), reply_msg_id=-1)