main
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3import io
4import json
5import re
6from datetime import datetime, timedelta
7from zoneinfo import ZoneInfo
8
9from loguru import logger
10from pyrogram.client import Client
11from pyrogram.types import Chat, Message
12from pyrogram.types.messages_and_media.message import Str
13
14from ai.main import ai_text_generation
15from ai.utils import BOT_TIPS
16from config import AI, MAX_MESSAGE_SUMMARY, PREFIX, TID, TZ
17from messages.chat_history import get_history_info_list
18from messages.parser import parse_msg
19from messages.progress import modify_progress
20from messages.sender import send2tg
21from messages.utils import equal_prefix, startswith_prefix, to_int
22from utils import nowdt, rand_number, strings_list
23
24# ruff: noqa: RUF001
# Usage text for the summary/combine commands. `ai_chat_summary` sends it when
# the bare command is issued without replying to a message. It is an f-string,
# so MAX_MESSAGE_SUMMARY and the PREFIX.* command names are interpolated once,
# at import time.
HELP = f"""🤖**AI总结历史消息** (最多{MAX_MESSAGE_SUMMARY}条)
⚠️使用`{PREFIX.CHAT_SUMMARY}`命令生成聊天记录文件 + 聊天记录AI总结
⚠️使用`{PREFIX.COMBINATION}`命令只生成聊天记录文件, 不对聊天记录AI总结
⚠️额外功能: 使用`{PREFIX.CHAT_SUMMARY} + 油管或B站链接`对视频内容进行AI总结

{PREFIX.CHAT_SUMMARY}使用说明:
- # 后跟消息数量或时间范围
- @ 后跟用户名 (可多次使用@)

**1️⃣指定条目数**
- `{PREFIX.CHAT_SUMMARY} #N`: 总结最近的N条历史消息
- `{PREFIX.CHAT_SUMMARY} #N @User`: 总结最近只属于User的N条消息
- `{PREFIX.CHAT_SUMMARY} #N @User @User2`: 总结最近只属于User和User2的N条消息

示例:
- `{PREFIX.CHAT_SUMMARY} #10`: 总结最近的10条历史消息
- `{PREFIX.CHAT_SUMMARY} #20 @123456`: 总结最近UID为123456的20条消息
- `{PREFIX.CHAT_SUMMARY} #20 @John`: 总结最近用户John(大小写均可)的20条消息
- `{PREFIX.CHAT_SUMMARY} #20 @John @Bob`: 总结最近用户John和Bob的20条消息

**2️⃣指定最近时间段**
- `{PREFIX.CHAT_SUMMARY} #interval`: 总结最近interval时段内的消息
- `{PREFIX.CHAT_SUMMARY} #interval @User`: 总结最近interval时段内, 且只属于User的消息
示例:
- `{PREFIX.CHAT_SUMMARY} #10m`: 总结最近10分钟内的消息
- `{PREFIX.CHAT_SUMMARY} #2h`: 总结最近2小时内的消息
- `{PREFIX.CHAT_SUMMARY} #1d`: 总结最近1天的消息
- `{PREFIX.CHAT_SUMMARY} #2h @John`: 总结最近2小时内的消息, 且只属于John的消息

**3️⃣ 指定具体时间范围**
- `{PREFIX.CHAT_SUMMARY} #20240101123000`: 总结从2024-01-01 12:30:00开始到现在的消息
- `{PREFIX.CHAT_SUMMARY} #20240101123000-20240101124000`: 总结从2024-01-01 12:30:00开始到2024-01-01 12:40:00的消息
- `{PREFIX.CHAT_SUMMARY} #20240101123000 @User`: 总结从2024-01-01 12:30:00开始到现在的消息, 且只属于User的消息
- `{PREFIX.CHAT_SUMMARY} #20240101123000-20240101124000 @User`: 总结从2024-01-01 12:30:00开始到2024-01-01 12:40:00的消息, 且只属于User的消息
- 时间格式中没有任何分隔符, 必须为YYYYMMDDHHMMSS (14位纯数字)

注意:
- 用上述各种`{PREFIX.CHAT_SUMMARY}`命令回复消息M, 视为将截止时间设为消息M的发送时间
- 如果用户名中有空格, 请去除空格。例如: 想指定用户为John Doe请使用 `@JohnDoe`
"""
65
# Instruction text that `ai_chat_summary` places in front of the tab-separated
# history when it builds the AI request. This is a regular (non-raw) string, so
# every `\t` below is an actual tab character at runtime. The column list must
# stay in sync with the header row written by `parse_history_list`.
SYSTEM_PROMPT = """总结以下群聊记录, 识别关键主题、争议话题以及重要观点。提供一个简明的总结, 保留原始意图和上下文。如有必要, 请注明消息username和message_id。
群聊记录以TSV格式发送(分隔符为\t), 列名为: message_id\tusername\tcontent\treply_to_message_id\treply_to_message_content
其中:
- message_id (int): 消息ID, 唯一标识每条消息
- username (str): 发送消息的用户用户名
- content (str): 消息内容
- reply_to_message_id (int | None): 该消息所回复的消息的message_id
- reply_to_message_content (str | None): 该消息所回复的消息的content

示例:
message_id\tusername\tcontent\treply_to_message_id\treply_to_message_content
123\tJohn\t今天好冷啊\t\t
124\tLily\t我这里还好\t\t
125\tAlice\t你那里下雪了吗\t123\t今天好冷啊
126\tJohn\t天气预报说有,但是还没下\t125\t你那里下雪了吗
127\tDavid\t我这里已经下了\t125\t你那里下雪了吗

# 步骤
1. 阅读聊天记录: 仔细查看对话内容, 了解讨论的流程和上下文。
2. 识别关键主题: 提取整个聊天中讨论的主要话题。
3. 忽略废话及无关内容, 专注于关键信息。
4. 突出争议话题: 记录任何分歧或意见不同的地方。
5. 识别重要观点: 捕捉参与者提出的重要观点或论点。
6. 保留意图和上下文: 确保总结反映对话的原始意义和上下文。
7. 引用用户名和消息ID: 在适当情况下, 引用username和message_id以为某些陈述提供上下文。
8. 撰写总结: 以简洁的语言编写总结, 同时包含必要的引用。

# 输出格式
- 使用中文撰写总结。
- 简明扼要地总结聊天记录的内容。
- 在必要时引用消息username和message_id。
- 保持清晰和简洁的表达。
- 引用username和message_id时, 务必使用 **username[message_id]** 格式。如: **John[123]**
- 如果需要同时引用多个message_id, 请使用 **username[id1, id2, ...]** 格式。如: **Alice[125, 126, 127]**

"""
102
# Header line that `daily_summary` passes as `summary_prefix` for scheduled summaries.
DAILY_SUMMARY_PREFIX = "🏪**#爬楼助手**\n"
# File name used for the exported history document. History entries carrying
# this file name are skipped by `parse_history_list` and `get_txt_format`, so
# earlier exports are not fed back into later ones.
CONTEXT_FILENAME = "聊天记录.txt"
105
106
async def ai_chat_summary(
    client: Client,
    message: Message,
    summary_prefix: str | None = None,
    summary_chat_model: str = AI.CHAT_SUMMARY_MODEL_ALIAS,
    **kwargs,
):
    """Export chat history as a text file and, for the summary command, post an AI summary.

    Flow:
      1. A bare summary/combine command that is not a reply only sends ``HELP``.
      2. The ``#...`` selector in the command picks the scope: a message count,
         a recent interval (``m``/``h``/``d``) or an absolute
         ``YYYYMMDDHHMMSS[-YYYYMMDDHHMMSS]`` range. ``@name`` tokens restrict the
         senders. Anything else is ignored silently.
      3. The matching history is sent to the chat as a plaintext document.
      4. Unless the combine command was used, the tab-separated history is
         passed to ``ai_text_generation`` and the answer is sent with message
         ids turned back into links.

    When the command replies to a message, history is read up to and including
    that message and the replied message's send time becomes the end of the
    time window (the behaviour described in ``HELP``).

    Args:
        client (Client): The Pyrogram client.
        message (Message): The trigger message object.
        summary_prefix (str | None): Prefix string of the response message.
            Defaults to a line with the model name returned by the AI call.
        summary_chat_model (str, optional): The model id to use for AI summary.
        **kwargs: Forwarded to ``send2tg`` and ``modify_progress``. The keys
            ``show_progress`` and ``progress`` are read here, and
            ``reply_msg_id`` is overwritten with ``-1`` before the summary is sent.
    """
    # send docs if message == "/summary"
    if equal_prefix(message.text, prefix=[PREFIX.CHAT_SUMMARY, PREFIX.COMBINATION]) and not message.reply_to_message:
        await send2tg(client, message, texts=HELP, **kwargs)
        return
    if not startswith_prefix(message.content, prefix=[PREFIX.CHAT_SUMMARY, PREFIX.COMBINATION]):
        return

    info = parse_msg(message, silent=True)
    need_summary = startswith_prefix(info["text"], prefix=PREFIX.CHAT_SUMMARY)
    # replace /combine with /summary, because we need to use `/summary` to match different patterns
    info["text"] = re.sub(r"^" + PREFIX.COMBINATION, PREFIX.CHAT_SUMMARY, info["text"], flags=re.IGNORECASE)
    text = info["text"]

    tz = ZoneInfo(TZ)
    timestamp_format = "%Y%m%d%H%M%S"
    num_history = MAX_MESSAGE_SUMMARY
    begin_time = datetime.fromtimestamp(0, tz=tz)
    end_time = nowdt(tz=TZ)
    offset_id = 0
    # reply to a message with /summary
    if message.reply_to_message:
        offset_id = message.reply_to_message.id + 1  # include the reply message
        # The cutoff is the send time of the *replied* message. The trigger
        # message's own date is effectively "now" and would make replying pointless.
        replied_at = message.reply_to_message.date
        end_time = replied_at.astimezone(tz) if isinstance(replied_at, datetime) else nowdt(TZ)

    # 3️⃣ /summary #YYYYMMDDHHMMSS @user
    # 4️⃣ /summary #YYYYMMDDHHMMSS-YYYYMMDDHHMMSS @user
    if matched := re.match(r"^" + PREFIX.CHAT_SUMMARY + r"\s+#(\d{14})-?(\d{14})?(\s+)?(@\w+)?", text):
        try:
            begin_time = datetime.strptime(matched.group(1), timestamp_format).replace(tzinfo=tz)
            end_time = datetime.strptime(matched.group(2) or end_time.strftime(timestamp_format), timestamp_format).replace(tzinfo=tz)
        except ValueError:
            # 14 digits that are not a real date/time (e.g. month 13): ignore the
            # command like any other unsupported selector instead of crashing.
            logger.warning(f"Invalid timestamp in summary command: {text}")
            return
    # 2️⃣ /summary #interval @user (/summary #4h @user)
    elif matched := re.match(r"^" + PREFIX.CHAT_SUMMARY + r"\s+#(\d+)([mMhHdD])(\s+)?(@\w+)?", text):
        unit_names = {"m": "minutes", "h": "hours", "d": "days"}
        unit = unit_names[matched.group(2).lower()]
        try:
            begin_time = end_time - timedelta(**{unit: int(matched.group(1))})
        except OverflowError:
            # Absurdly large interval: keep the epoch default, i.e. no lower bound.
            pass
    # 1️⃣ /summary #N @user
    elif matched := re.match(r"^" + PREFIX.CHAT_SUMMARY + r"\s+#(\d+)(\s+)?(@\w+)?", text):
        num_history = min(int(matched.group(1)), MAX_MESSAGE_SUMMARY)
    else:
        return
    # Every supported form accepts any number of @user filters.
    filter_users = re.findall(r"@([^\s]+)", text)

    # set custom chat_id and message_id (used by daily_summary, also useful for debug)
    if matched := re.search(r"cid=(-?\w+)", text, re.IGNORECASE):
        info["cid"] = to_int(matched.group(1))
    if matched := re.search(r"mid=(\d+)", text, re.IGNORECASE):
        offset_id = int(matched.group(1)) + 1  # include this message

    if kwargs.get("show_progress") and "progress" not in kwargs:
        res = await send2tg(client, message, texts=f"📝正在获取历史消息...\n⏩开始时间: {begin_time:%m-%d %H:%M:%S}\n⏯️结束时间: {end_time:%m-%d %H:%M:%S}", **kwargs)
        kwargs["progress"] = res[0]

    history_list = await get_history_info_list(client, info["cid"], offset_id, num_history, begin_time, end_time, filter_users)
    parsed = parse_history_list(history_list)  # tab-separated table for the AI
    if parsed.get("num_message", 0) == 0:
        await send2tg(client, message, texts=f"{num_history}条历史消息中未找到符合条件的消息", **kwargs)
        await modify_progress(del_status=True, **kwargs)
        return

    msg = f"⏩开始时间: {parsed['begin_time']}\n"
    msg += f"⏯️结束时间: {parsed['end_time']}\n"
    msg += f"🔢消息条数: {parsed['num_message']}\n"
    # send contexts as txt file
    txt_format = get_txt_format(history_list)
    with io.BytesIO(txt_format.encode("utf-8")) as f:
        await client.send_document(to_int(message.chat.id), f, file_name=CONTEXT_FILENAME, caption=msg)
    if not need_summary:
        # combine command: the exported file is the whole result
        await modify_progress(del_status=True, **kwargs)
        return

    await modify_progress(text=f"🤖AI总结中...\n{msg}", force_update=True, **kwargs)
    ai_msg = Message(  # Construct a message for AI
        id=rand_number(),
        chat=message.chat,
        text=Str(f"{PREFIX.AI_TEXT_GENERATION} @{summary_chat_model} {SYSTEM_PROMPT} {parsed['history']}"),
    )
    ai_res = await ai_text_generation(client, ai_msg, silent=True)
    if texts := ai_res.get("texts"):
        summary_prefix = summary_prefix or f"🤖**{ai_res['model_name']}**:\n"
        kwargs["reply_msg_id"] = -1  # DO NOT send as a reply message
        # The AI saw ids relative to the first message; map them back to message links.
        texts = revert_to_original_url(texts, history_list, parsed["msg_offset"])
        await send2tg(client, message, texts=f"{summary_prefix}⏩开始时间: {begin_time:%m-%d %H:%M:%S}\n⏯️结束时间: {end_time:%m-%d %H:%M:%S}\n{texts}", **kwargs)
    await modify_progress(del_status=True, **kwargs)
203
204
def parse_history_list(info_list: list[dict]) -> dict:
    """Build the tab-separated history table that is sent to the AI.

    Currently, we only summarize text contents: entries whose ``file_name`` is
    ``CONTEXT_FILENAME``, entries sent by bots and entries without text are
    skipped.

    The table has one row per kept message, with the columns
    ``message_id, username, content, reply_to_message_id,
    reply_to_message_content`` separated by tabs. Message ids are written
    relative to the first kept message (``mid - msg_offset``); the caller uses
    ``msg_offset`` to map them back. Quoted reply text is shortened to 30
    characters. Tabs and line breaks inside a cell are replaced by a single
    space so that every message stays on exactly one row.

    Args:
        info_list: History entries. Each entry is read for the keys
            ``file_name``, ``is_bot``, ``text``, ``mid``, ``datetime``,
            ``mtype``, ``full_name`` and, optionally, ``reply_to_message_id``.

    Returns:
        ``{}`` when no entry qualifies, otherwise
        {"history": str, "num_message": int, "msg_offset": int, "begin_time": str, "end_time": str}
        where the two times are the ``%m-%d %H:%M:%S`` stamps of the first and
        last kept message.
    """

    def clean_cell(value) -> str:
        # BOT_TIPS is stripped before whitespace is flattened, so a marker that
        # itself contains line breaks is still recognised and removed.
        return re.sub(r"[\t\r\n]+", " ", str(value).replace(BOT_TIPS, ""))

    headers = "message_id\tusername\tcontent\treply_to_message_id\treply_to_message_content\n"
    rows: list[str] = []
    begin_time = ""
    end_time = ""
    msg_offset = 0
    for info in info_list:
        if info["file_name"] == CONTEXT_FILENAME:
            continue
        if info["is_bot"]:  # bots
            continue
        if not info["text"]:  # currently, we only include texts
            continue
        msg_offset = msg_offset or info["mid"]  # the first kept message defines id 0
        stamp = info["datetime"].strftime("%m-%d %H:%M:%S")
        begin_time = begin_time or stamp
        end_time = stamp
        media_type = f"[{info['mtype']}] " if info["mtype"] != "text" else ""
        message_id = info["mid"]
        username = clean_cell(info["full_name"])
        content = clean_cell(media_type + info["text"])
        reply_to_message_id = info.get("reply_to_message_id") or 0
        reply_msg = get_message_by_id(info_list, reply_to_message_id)
        if reply_msg:
            reply_msg_content = clean_cell(reply_msg.get("message", ""))
            if len(reply_msg_content) > 30:
                reply_msg_content = reply_msg_content[:30] + "..."
            rows.append(f"{message_id - msg_offset}\t{username}\t{content}\t{reply_to_message_id - msg_offset}\t{reply_msg_content}\n")
        else:
            rows.append(f"{message_id - msg_offset}\t{username}\t{content}\t \t \n")
    if not rows:
        return {}

    # IMPORTANT: We need to remove `BOT_TIPS` in the history!
    #
    # Because we need to call `ai_text_generation` function,
    # it uses `BOT_TIPS` to check if the message is from GPT model.
    #
    # If the history contains `BOT_TIPS`, the context of this message will be `model` (not `user`)
    # But `model` only message is not allowed, so we need to remove `BOT_TIPS`
    #
    # Cells are already cleaned above; this final pass keeps the guarantee for
    # the assembled text as a whole.
    history_csv = "".join(rows).replace(BOT_TIPS, "")
    return {"history": headers + history_csv, "num_message": len(rows), "msg_offset": msg_offset, "begin_time": begin_time, "end_time": end_time}
261
262
def get_txt_format(info_list: list[dict]) -> str:
    """Render the history entries as a human-readable plaintext transcript.

    Each rendered entry is a ``[YYYY-MM-DD HH:MM:SS]name:`` header line followed
    by media tags (for non-text messages), the message text, an optional
    ``<quote>`` line for the replied message, and a blank line. Entries whose
    ``file_name`` is ``CONTEXT_FILENAME`` are skipped. A media group is written
    once, at its first non-text member, with one tag per member.
    """
    out = io.StringIO()
    handled_groups = set()  # media groups that were already written
    for entry in info_list:
        if entry["file_name"] == CONTEXT_FILENAME:
            continue
        group_id = entry["media_group_id"]
        if group_id in handled_groups:
            continue
        out.write(f"[{entry['datetime']:%Y-%m-%d %H:%M:%S}]{entry['full_name']}:\n")
        if entry["mtype"] != "text":  # not a plaintext message
            if not group_id:
                out.write(f"[{entry['mtype']}]")
            else:
                # one tag per member of the media group
                out.write(" ".join(f"[{peer['mtype']}]" for peer in info_list if peer["media_group_id"] == group_id))
                handled_groups.add(group_id)
        out.write(entry["text"])
        # append the quoted (replied-to) message, if it is part of this history
        quoted = get_message_by_id(info_list, message_id=entry.get("reply_to_message_id") or 0)
        if quoted:
            out.write(f"\n<quote>{quoted['username']}: {quoted['message']}</quote>")
        out.write("\n\n")
    return out.getvalue()
288
289
def get_message_by_id(info_list: list[dict], message_id: int = 0) -> dict:
    """Look up a history entry by message id and return a compact view of it.

    Args:
        info_list: History entries; each is read for ``mid`` and, on a match,
            ``mtype``, ``full_name``, ``datetime``, ``message_url`` and ``text``.
        message_id: Id to look for. A falsy id (0/None) never matches.

    Returns:
        ``{}`` when nothing matches, otherwise a dict with ``username``,
        ``time`` (``HH:MM:SS``), ``url`` and ``message`` (the text, prefixed
        with ``[mtype] `` for non-text messages) of the first matching entry.
    """
    if not message_id:
        return {}
    for entry in info_list:
        if entry["mid"] != message_id:
            continue
        tag = "" if entry["mtype"] == "text" else f"[{entry['mtype']}] "
        return {
            "username": entry["full_name"],
            "time": f"{entry['datetime']:%H:%M:%S}",
            "url": entry["message_url"],
            "message": tag + entry["text"],
        }
    return {}
304
305
306def get_media_group_by_id(info_list: list[dict], media_group_id: int | None = None) -> list[dict]:
307 if not media_group_id:
308 return []
309 return [x for x in info_list if x["media_group_id"] == media_group_id]
310
311
def revert_to_original_url(ai_texts: str, info_list: list[dict], msg_offset: int) -> str:
    """Turn the relative message ids cited by the AI into links to the real messages.

    The AI is given ids relative to ``msg_offset`` and asked to cite them as
    ``[id]`` or ``[id1, id2, ...]``. Each such bracket group is replaced by
    ``([HH:MM:SS](url), ...)``, with one link per id that resolves to an entry
    of ``info_list``.

    Ids that do not resolve are dropped from the group. A bracket group in
    which nothing resolves (an unknown id, an unrelated number such as
    ``[2024]``, or a digit-free group such as a markdown ``[ ]`` checkbox) is
    left exactly as it was instead of being replaced by empty parentheses.

    Args:
        ai_texts: Text returned by the AI.
        info_list: History entries, as accepted by ``get_message_by_id``.
        msg_offset: Value that was subtracted from the real message ids.

    Returns:
        The text with resolvable citations replaced by markdown links.
    """

    def to_markdown_link(relative_id: str) -> str:
        msg = get_message_by_id(info_list, int(relative_id) + msg_offset)
        return f"[{msg['time']}]({msg['url']})" if msg else ""

    def replace_citation(found: re.Match) -> str:
        # Only digit runs are ids, so stray commas or spaces cannot reach int().
        links = [link for link in map(to_markdown_link, re.findall(r"\d+", found.group(1))) if link]
        if not links:
            return found.group(0)
        return f"({', '.join(links)})"

    # Single pass: replacement text is never scanned again.
    return re.sub(r"\[([\d, ]+)\]", replace_citation, ai_texts)
325
326
async def daily_summary(client: Client):
    """Post scheduled chat summaries for the chats configured in ``TID.DAILY_SUMMARY``.

    The function only acts when the current hour (in ``TZ``) is one of the
    scheduled hours: at 00:00 and 12:00 it covers the last 12 hours, at 07:00
    the last 24 hours. At any other hour it returns immediately.

    ``TID.DAILY_SUMMARY`` must be a JSON object that maps a source chat id (the
    chat to summarize) to a target chat id (where the result is sent). For each
    pair a synthetic summary command is built and handed to ``ai_chat_summary``.
    An unparsable or non-object value is logged and skipped.

    Args:
        client (Client): The Pyrogram client.
    """
    durations = {
        0: 12,
        12: 12,
        7: 24,
    }  # time in hour: duration in hours
    duration = durations.get(nowdt(TZ).hour)
    if duration is None:
        return

    # summarize chat id -> send to chat id
    try:
        mapping = json.loads(TID.DAILY_SUMMARY)
    except (TypeError, ValueError):
        # TypeError: value is not str/bytes (e.g. None); ValueError covers json.JSONDecodeError
        logger.warning(f"Invalid DAILY_SUMMARY: {TID.DAILY_SUMMARY}")
        return
    if not isinstance(mapping, dict):
        # Valid JSON, but not an object: there are no source -> target pairs to iterate.
        logger.warning(f"DAILY_SUMMARY must be a JSON object: {TID.DAILY_SUMMARY}")
        return

    for source_chat_id, target_chat_id in mapping.items():
        logger.info(f"Summary chat {source_chat_id}, send results to {target_chat_id}")
        # fake message: addressed to the target chat, `cid=` points at the source chat
        message = Message(
            id=rand_number(),
            chat=Chat(id=target_chat_id),
            text=f"{PREFIX.CHAT_SUMMARY} #{duration}h cid={to_int(source_chat_id)}",  # type: ignore
        )
        await ai_chat_summary(client, message, summary_prefix=DAILY_SUMMARY_PREFIX, target_chat=to_int(target_chat_id), reply_msg_id=-1)