bennybot/src/ai/chat_summary.py at main

  1#!/usr/bin/env python
  2# -*- coding: utf-8 -*-
  3import io
  4import json
  5import re
  6from datetime import datetime, timedelta
  7from zoneinfo import ZoneInfo
  8
  9from loguru import logger
 10from pyrogram.client import Client
 11from pyrogram.types import Chat, Message
 12from pyrogram.types.messages_and_media.message import Str
 13
 14from ai.main import ai_text_generation
 15from ai.utils import BOT_TIPS
 16from config import AI, MAX_MESSAGE_SUMMARY, PREFIX, TID, TZ
 17from messages.chat_history import get_history_info_list
 18from messages.parser import parse_msg
 19from messages.progress import modify_progress
 20from messages.sender import send2tg
 21from messages.utils import equal_prefix, startswith_prefix, to_int
 22from utils import nowdt, rand_number, strings_list
 23
 24# ruff: noqa: RUF001
# Usage text (Telegram Markdown, Chinese) for the chat-summary commands.
# `ai_chat_summary` sends it when the bare command is used without replying to a message.
# This is an f-string: MAX_MESSAGE_SUMMARY and the PREFIX.* command names are interpolated
# once, when the module is imported.
HELP = f"""🤖**AI总结历史消息** (最多{MAX_MESSAGE_SUMMARY}条)
⚠️使用`{PREFIX.CHAT_SUMMARY}`命令生成聊天记录文件 + 聊天记录AI总结
⚠️使用`{PREFIX.COMBINATION}`命令只生成聊天记录文件, 不对聊天记录AI总结
⚠️额外功能: 使用`{PREFIX.CHAT_SUMMARY} + 油管或B站链接`对视频内容进行AI总结

{PREFIX.CHAT_SUMMARY}使用说明:
- # 后跟消息数量或时间范围
- @ 后跟用户名 (可多次使用@)

**1️⃣指定条目数**
- `{PREFIX.CHAT_SUMMARY} #N`: 总结最近的N条历史消息
- `{PREFIX.CHAT_SUMMARY} #N @User`: 总结最近只属于User的N条消息
- `{PREFIX.CHAT_SUMMARY} #N @User @User2`: 总结最近只属于User和User2的N条消息

示例:
- `{PREFIX.CHAT_SUMMARY} #10`: 总结最近的10条历史消息
- `{PREFIX.CHAT_SUMMARY} #20 @123456`: 总结最近UID为123456的20条消息
- `{PREFIX.CHAT_SUMMARY} #20 @John`: 总结最近用户John(大小写均可)的20条消息
- `{PREFIX.CHAT_SUMMARY} #20 @John @Bob`: 总结最近用户John和Bob的20条消息

**2️⃣指定最近时间段**
- `{PREFIX.CHAT_SUMMARY} #interval`: 总结最近interval时段内的消息
- `{PREFIX.CHAT_SUMMARY} #interval @User`: 总结最近interval时段内, 且只属于User的消息
示例:
- `{PREFIX.CHAT_SUMMARY} #10m`: 总结最近10分钟内的消息
- `{PREFIX.CHAT_SUMMARY} #2h`: 总结最近2小时内的消息
- `{PREFIX.CHAT_SUMMARY} #1d`: 总结最近1天的消息
- `{PREFIX.CHAT_SUMMARY} #2h @John`: 总结最近2小时内的消息, 且只属于John的消息

**3️⃣ 指定具体时间范围**
- `{PREFIX.CHAT_SUMMARY} #20240101123000`: 总结从2024-01-01 12:30:00开始到现在的消息
- `{PREFIX.CHAT_SUMMARY} #20240101123000-20240101124000`: 总结从2024-01-01 12:30:00开始到2024-01-01 12:40:00的消息
- `{PREFIX.CHAT_SUMMARY} #20240101123000 @User`: 总结从2024-01-01 12:30:00开始到现在的消息, 且只属于User的消息
- `{PREFIX.CHAT_SUMMARY} #20240101123000-20240101124000 @User`: 总结从2024-01-01 12:30:00开始到2024-01-01 12:40:00的消息, 且只属于User的消息
- 时间格式中没有任何分隔符, 必须为YYYYMMDDHHMMSS (14位纯数字)

注意:
- 用上述各种`{PREFIX.CHAT_SUMMARY}`命令回复消息M, 视为将截止时间设为消息M的发送时间
- 如果用户名中有空格, 请去除空格。例如: 想指定用户为John Doe请使用 `@JohnDoe`
"""
 65
# Instructions (Chinese) placed in front of the TSV history when `ai_chat_summary` builds the AI request.
# It describes the five TSV columns, gives a sample table, and asks the model to cite messages as
# **username[message_id]**; `revert_to_original_url` later rewrites the bracketed ids in the AI output.
# This is a regular (non-raw) string, so every `\t` below is a real tab character at runtime.
SYSTEM_PROMPT = """总结以下群聊记录, 识别关键主题、争议话题以及重要观点。提供一个简明的总结, 保留原始意图和上下文。如有必要, 请注明消息username和message_id。
群聊记录以TSV格式发送(分隔符为\t), 列名为: message_id\tusername\tcontent\treply_to_message_id\treply_to_message_content
其中:
- message_id (int): 消息ID, 唯一标识每条消息
- username (str): 发送消息的用户用户名
- content (str): 消息内容
- reply_to_message_id (int | None): 该消息所回复的消息的message_id
- reply_to_message_content (str | None): 该消息所回复的消息的content

示例:
message_id\tusername\tcontent\treply_to_message_id\treply_to_message_content
123\tJohn\t今天好冷啊\t\t
124\tLily\t我这里还好\t\t
125\tAlice\t你那里下雪了吗\t123\t今天好冷啊
126\tJohn\t天气预报说有，但是还没下\t125\t你那里下雪了吗
127\tDavid\t我这里已经下了\t125\t你那里下雪了吗

# 步骤
1. 阅读聊天记录: 仔细查看对话内容, 了解讨论的流程和上下文。
2. 识别关键主题: 提取整个聊天中讨论的主要话题。
3. 忽略废话及无关内容, 专注于关键信息。
4. 突出争议话题: 记录任何分歧或意见不同的地方。
5. 识别重要观点: 捕捉参与者提出的重要观点或论点。
6. 保留意图和上下文: 确保总结反映对话的原始意义和上下文。
7. 引用用户名和消息ID: 在适当情况下, 引用username和message_id以为某些陈述提供上下文。
8. 撰写总结: 以简洁的语言编写总结, 同时包含必要的引用。

# 输出格式
- 使用中文撰写总结。
- 简明扼要地总结聊天记录的内容。
- 在必要时引用消息username和message_id。
- 保持清晰和简洁的表达。
- 引用username和message_id时, 务必使用 **username[message_id]** 格式。如: **John[123]**
- 如果需要同时引用多个message_id, 请使用 **username[id1, id2, ...]** 格式。如: **Alice[125, 126, 127]**

"""
102
# Prefix that `daily_summary` passes as `summary_prefix` for the scheduled summary message.
DAILY_SUMMARY_PREFIX = "🏪**#爬楼助手**\n"
# File name of the exported transcript document. History entries whose file name equals it
# are skipped by `parse_history_list` and `get_txt_format`.
CONTEXT_FILENAME = "聊天记录.txt"
105
106
async def ai_chat_summary(
    client: Client,
    message: Message,
    summary_prefix: str | None = None,
    summary_chat_model: str = AI.CHAT_SUMMARY_MODEL_ALIAS,
    **kwargs,
):
    """Export chat history as a text file and, for the summary command, summarize it with AI.

    The trigger text selects the history window in one of three ways
    (``@user`` filters may follow any of them):

    - ``#YYYYMMDDHHMMSS`` or ``#YYYYMMDDHHMMSS-YYYYMMDDHHMMSS``: explicit time range
    - ``#<n>m`` / ``#<n>h`` / ``#<n>d``: the last n minutes / hours / days
    - ``#N``: the latest N messages, capped at ``MAX_MESSAGE_SUMMARY``

    A bare command that is not a reply sends ``HELP``. Text that matches none of the
    patterns, or whose 14-digit timestamp is not a real date, is ignored.
    ``cid=<chat id>`` and ``mid=<message id>`` in the text override the source chat and
    the newest message to include (useful for debugging).

    Args:
        client (Client): The Pyrogram client.
        message (Message): The trigger message object.
        summary_prefix (str | None): Prefix string of the response message.
            Defaults to a line naming the model reported by the AI call.
        summary_chat_model (str, optional): The model id to use for AI summary.
        **kwargs: Forwarded to ``send2tg`` and ``modify_progress``. ``show_progress`` and
            ``progress`` are read here; ``reply_msg_id`` is set to -1 before the summary is sent.
    """
    # Bare command (nothing after the prefix) that is not a reply: show the usage text.
    if equal_prefix(message.text, prefix=[PREFIX.CHAT_SUMMARY, PREFIX.COMBINATION]) and not message.reply_to_message:
        await send2tg(client, message, texts=HELP, **kwargs)
        return
    if not startswith_prefix(message.content, prefix=[PREFIX.CHAT_SUMMARY, PREFIX.COMBINATION]):
        return

    info = parse_msg(message, silent=True)
    # Must be decided BEFORE the prefix rewrite below, otherwise /combine would look like /summary.
    need_summary = startswith_prefix(info["text"], prefix=PREFIX.CHAT_SUMMARY)
    # replace /combine with /summary, because we need to use `/summary` to match different patterns
    info["text"] = re.sub(r"^" + PREFIX.COMBINATION, PREFIX.CHAT_SUMMARY, info["text"], flags=re.IGNORECASE)
    tz = ZoneInfo(TZ)
    num_history = MAX_MESSAGE_SUMMARY
    filter_users = []
    begin_time = datetime.fromtimestamp(0, tz=tz)
    end_time = nowdt(tz=TZ)
    # reply to a message with /summary
    offset_id = 0
    if message.reply_to_message:
        offset_id = message.reply_to_message.id + 1  # include the reply message
        # NOTE(review): HELP states that the cutoff becomes the replied-to message's send time,
        # but this takes the date of the trigger message itself — confirm which one is intended.
        end_time = message.date.astimezone(tz) if isinstance(message.date, datetime) else nowdt(TZ)

    timestamp_format = "%Y%m%d%H%M%S"
    # 3️⃣ /summary #YYYYMMDDHHMMSS @user
    # 4️⃣ /summary #YYYYMMDDHHMMSS-YYYYMMDDHHMMSS @user
    if matched := re.match(r"^" + PREFIX.CHAT_SUMMARY + r"\s+#(\d{14})-?(\d{14})?(\s+)?(@\w+)?", info["text"]):
        try:
            begin_time = datetime.strptime(matched.group(1), timestamp_format).replace(tzinfo=tz)
            end_time = datetime.strptime(matched.group(2) or end_time.strftime(timestamp_format), timestamp_format).replace(tzinfo=tz)
        except ValueError:
            # 14 digits that do not form a real date (e.g. month 13): treat like any other
            # unrecognized command instead of letting the exception escape the handler.
            logger.warning(f"Invalid timestamp in summary command: {info['text']}")
            return
        filter_users = re.findall(r"@([^\s]+)", info["text"])
    # 2️⃣ /summary #interval @user  (/summary #4h @user)
    elif matched := re.match(r"^" + PREFIX.CHAT_SUMMARY + r"\s+#(\d+)([mMhHdD])(\s+)?(@\w+)?", info["text"]):
        interval = int(matched.group(1))
        unit_name = {"m": "minutes", "h": "hours", "d": "days"}[matched.group(2).lower()]
        begin_time = end_time - timedelta(**{unit_name: interval})
        filter_users = re.findall(r"@([^\s]+)", info["text"])
    # 1️⃣ /summary #N @user
    elif matched := re.match(r"^" + PREFIX.CHAT_SUMMARY + r"\s+#(\d+)(\s+)?(@\w+)?", info["text"]):
        num_history = min(int(matched.group(1)), MAX_MESSAGE_SUMMARY)
        filter_users = re.findall(r"@([^\s]+)", info["text"])
    else:
        return
    # set custom chat_id and message_id (useful for debug)
    if matched := re.search(r"cid=(-?\w+)", info["text"], re.IGNORECASE):
        info["cid"] = to_int(matched.group(1))
    if matched := re.search(r"mid=(\d+)", info["text"], re.IGNORECASE):
        offset_id = int(matched.group(1)) + 1  # include this message
    if kwargs.get("show_progress") and "progress" not in kwargs:
        res = await send2tg(client, message, texts=f"📝正在获取历史消息...\n⏩开始时间: {begin_time:%m-%d %H:%M:%S}\n⏯️结束时间: {end_time:%m-%d %H:%M:%S}", **kwargs)
        kwargs["progress"] = res[0]
    history_list = await get_history_info_list(client, info["cid"], offset_id, num_history, begin_time, end_time, filter_users)
    parsed = parse_history_list(history_list)  # parse the history as TSV
    if parsed.get("num_message", 0) == 0:
        await send2tg(client, message, texts=f"{num_history}条历史消息中未找到符合条件的消息", **kwargs)
        await modify_progress(del_status=True, **kwargs)
        return
    msg = f"⏩开始时间: {parsed['begin_time']}\n"
    msg += f"⏯️结束时间: {parsed['end_time']}\n"
    msg += f"🔢消息条数: {parsed['num_message']}\n"
    # send contexts as txt file
    txt_format = get_txt_format(history_list)
    with io.BytesIO(txt_format.encode("utf-8")) as f:
        await client.send_document(to_int(message.chat.id), f, file_name=CONTEXT_FILENAME, caption=msg)
    if not need_summary:
        await modify_progress(del_status=True, **kwargs)
        return
    await modify_progress(text=f"🤖AI总结中...\n{msg}", force_update=True, **kwargs)
    ai_msg = Message(  # Construct a message for AI
        id=rand_number(),
        chat=message.chat,
        text=Str(f"{PREFIX.AI_TEXT_GENERATION} @{summary_chat_model} {SYSTEM_PROMPT} {parsed['history']}"),
    )
    ai_res = await ai_text_generation(client, ai_msg, silent=True)
    if texts := ai_res.get("texts"):
        summary_prefix = summary_prefix or f"🤖**{ai_res['model_name']}**:\n"
        kwargs["reply_msg_id"] = -1  # DO NOT send as a reply message
        # The AI cites offset-relative ids; turn them back into links to the real messages.
        texts = revert_to_original_url(texts, history_list, parsed["msg_offset"])
        await send2tg(client, message, texts=f"{summary_prefix}⏩开始时间: {begin_time:%m-%d %H:%M:%S}\n⏯️结束时间: {end_time:%m-%d %H:%M:%S}\n{texts}", **kwargs)
    # Clear the progress status on every exit path, including an empty AI result;
    # otherwise the "AI summarizing" status would be left behind.
    await modify_progress(del_status=True, **kwargs)
203
204
def parse_history_list(info_list: list[dict]) -> dict:
    """Build the TSV transcript that is sent to the AI for summarization.

    Only entries with text are included; entries sent by bots and entries whose
    file name is ``CONTEXT_FILENAME`` are skipped.

    Each row is ``message_id\\tusername\\tcontent\\treply_to_message_id\\treply_to_message_content``.
    Message ids are written relative to the first included message (``mid - msg_offset``).
    The quoted reply content is cut to 30 characters. Tabs and line breaks inside a field
    are collapsed to a single space so that every message stays on exactly one row.

    Args:
        info_list: History entries. The keys read here are ``file_name``, ``is_bot``, ``text``,
            ``mid``, ``datetime``, ``mtype``, ``full_name`` and optionally ``reply_to_message_id``.

    Returns:
        ``{"history": str, "num_message": int, "msg_offset": int, "begin_time": str, "end_time": str}``,
        or an empty dict when no entry qualifies.
    """

    def clean(value) -> str:
        # IMPORTANT: `BOT_TIPS` must not reach the AI request. `ai_text_generation` uses it to
        # detect messages written by the model; a history containing it would be treated as a
        # `model` message, and a model-only conversation is not allowed.
        # Strip it BEFORE flattening whitespace, in case BOT_TIPS itself contains line breaks.
        return re.sub(r"[\t\r\n]+", " ", str(value).replace(BOT_TIPS, ""))

    time_format = "%m-%d %H:%M:%S"
    headers = "message_id\tusername\tcontent\treply_to_message_id\treply_to_message_content\n"
    begin_time = ""
    end_time = nowdt(tz=TZ).strftime(time_format)
    msg_offset = 0
    rows: list[str] = []
    for info in info_list:
        if info["file_name"] == CONTEXT_FILENAME:
            continue
        if info["is_bot"]:  # bots
            continue
        if not info["text"]:  # currently, we only include texts
            continue
        msg_offset = msg_offset or info["mid"]  # the first included message defines the offset
        stamp = info["datetime"].strftime(time_format)
        begin_time = begin_time or stamp
        end_time = stamp
        media_type = f"[{info['mtype']}] " if info["mtype"] != "text" else ""
        message_id = info["mid"]
        username = clean(info["full_name"])
        content = clean(media_type + info["text"])
        reply_to_message_id = info.get("reply_to_message_id") or 0
        reply_msg = get_message_by_id(info_list, reply_to_message_id)
        if reply_msg:
            reply_msg_content = clean(reply_msg.get("message", ""))
            if len(reply_msg_content) > 30:
                reply_msg_content = reply_msg_content[:30] + "..."
            rows.append(f"{message_id - msg_offset}\t{username}\t{content}\t{reply_to_message_id - msg_offset}\t{reply_msg_content}\n")
        else:
            rows.append(f"{message_id - msg_offset}\t{username}\t{content}\t \t \n")
    if not rows:
        return {}

    # Final safety net on the assembled text, same as the per-field removal above.
    history_csv = "".join(rows).replace(BOT_TIPS, "")
    return {"history": headers + history_csv, "num_message": len(rows), "msg_offset": msg_offset, "begin_time": begin_time, "end_time": end_time}
261
262
def get_txt_format(info_list: list[dict]) -> str:
    """Render the history as a plaintext transcript.

    Every entry becomes a ``[timestamp]name:`` header line followed by its media tag(s) and text,
    an optional ``<quote>`` line for the message it replies to, and a blank line.
    Entries whose file name is ``CONTEXT_FILENAME`` are skipped, and a media group is written
    once, at its first non-text entry, with one tag per entry of the group.
    """
    pieces: list[str] = []
    seen_groups = set()  # media groups that were already written
    for entry in info_list:
        if entry["file_name"] == CONTEXT_FILENAME:
            continue
        group_id = entry["media_group_id"]
        if group_id in seen_groups:
            continue
        pieces.append(f"[{entry['datetime']:%Y-%m-%d %H:%M:%S}]{entry['full_name']}:\n")
        if entry["mtype"] != "text":  # media message
            if group_id:
                # one tag for each member of the group, in history order
                pieces.append(" ".join(f"[{member['mtype']}]" for member in info_list if member["media_group_id"] == group_id))
                seen_groups.add(group_id)
            else:
                pieces.append(f"[{entry['mtype']}]")
        pieces.append(entry["text"])
        # quote the replied-to message when it is part of the history
        quoted = get_message_by_id(info_list, message_id=entry.get("reply_to_message_id") or 0)
        if quoted:
            pieces.append(f"\n<quote>{quoted['username']}: {quoted['message']}</quote>")
        pieces.append("\n\n")
    return "".join(pieces)
288
289
def get_message_by_id(info_list: list[dict], message_id: int = 0) -> dict:
    """Look up a history entry by message id and return a compact view of it.

    Returns:
        ``{"username", "time", "url", "message"}`` for the first entry whose ``mid`` equals
        ``message_id``; an empty dict when ``message_id`` is falsy or no entry matches.
        ``message`` is the entry text, prefixed with ``[mtype] `` for non-text entries.
    """
    if message_id:
        for entry in info_list:
            if entry["mid"] != message_id:
                continue
            tag = "" if entry["mtype"] == "text" else f"[{entry['mtype']}] "
            return {
                "username": entry["full_name"],
                "time": f"{entry['datetime']:%H:%M:%S}",
                "url": entry["message_url"],
                "message": tag + entry["text"],
            }
    return {}
304
305
306def get_media_group_by_id(info_list: list[dict], media_group_id: int | None = None) -> list[dict]:
307    if not media_group_id:
308        return []
309    return [x for x in info_list if x["media_group_id"] == media_group_id]
310
311
def revert_to_original_url(ai_texts: str, info_list: list[dict], msg_offset: int) -> str:
    """Replace ``[id, ...]`` citations in the AI output with Markdown links to the cited messages.

    The AI sees message ids relative to ``msg_offset``, so each cited id is shifted back
    (``id + msg_offset``) and looked up in ``info_list``. A resolved id becomes
    ``[HH:MM:SS](message url)``, and the links of one citation are wrapped in parentheses.

    Ids that match no history entry are dropped from the citation. A bracket group in which
    nothing resolves (for example a year such as ``[2024]``) is left exactly as written,
    instead of being replaced by empty parentheses.

    Args:
        ai_texts: Text returned by the AI.
        info_list: History entries the summary was generated from.
        msg_offset: Offset that was subtracted from the message ids sent to the AI.

    Returns:
        The text with resolvable citations turned into links.
    """

    def to_links(match: re.Match) -> str:
        links = []
        # Pull out digit runs directly so stray spaces or commas can never reach int().
        for mid in re.findall(r"\d+", match.group(1)):
            msg = get_message_by_id(info_list, int(mid) + msg_offset)
            if msg:
                links.append(f"[{msg['time']}]({msg['url']})")
        if not links:
            return match.group(0)  # not a citation we can resolve: keep the original text
        return f"({', '.join(links)})"

    return re.sub(r"\[([\d, ]+)\]", to_links, ai_texts)
325
326
async def daily_summary(client: Client):
    """Run the scheduled summary for every chat configured in ``TID.DAILY_SUMMARY``.

    Does nothing unless the current hour (in ``TZ``) is 0, 7 or 12. At 0 and 12 the last
    12 hours are summarized, at 7 the last 24 hours.

    ``TID.DAILY_SUMMARY`` must be a JSON object mapping source chat id -> target chat id.
    Anything else (invalid JSON, or JSON that is not an object) is logged and skipped.

    Args:
        client (Client): The Pyrogram client.
    """
    durations = {
        0: 12,
        12: 12,
        7: 24,
    }  # time in hour: duration in hours
    duration = durations.get(nowdt(TZ).hour)
    if duration is None:
        return
    try:
        mapping = json.loads(TID.DAILY_SUMMARY)  # summarize chat id -> send to chat id
    except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError
        mapping = None
    # Valid JSON of the wrong shape (list, string, number, null) has no usable .items().
    if not isinstance(mapping, dict):
        logger.warning(f"Invalid DAILY_SUMMARY: {TID.DAILY_SUMMARY}")
        return
    for source_chat_id, target_chat_id in mapping.items():
        logger.info(f"Summary chat {source_chat_id}, send results to {target_chat_id}")
        # Fake trigger message: it lives in the target chat, and `cid=` points the
        # history lookup at the source chat.
        message = Message(
            id=rand_number(),
            chat=Chat(id=target_chat_id),
            text=f"{PREFIX.CHAT_SUMMARY} #{duration}h cid={to_int(source_chat_id)}",  # type: ignore
        )
        await ai_chat_summary(client, message, summary_prefix=DAILY_SUMMARY_PREFIX, target_chat=to_int(target_chat_id), reply_msg_id=-1)