main
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3import io
4import json
5import re
6from datetime import datetime, timedelta
7from zoneinfo import ZoneInfo
8
9from glom import Coalesce, glom
10from loguru import logger
11from pyrogram.client import Client
12from pyrogram.types import Chat, Message
13from pyrogram.types.messages_and_media.message import Str
14
15from ai.main import ai_text_generation
16from ai.utils import BOT_TIPS
17from config import AI, MAX_MESSAGE_SUMMARY, PREFIX, TID, TZ, cache
18from messages.chat_history import get_history_info_list
19from messages.parser import parse_msg
20from messages.progress import modify_progress
21from messages.sender import send2tg
22from messages.utils import equal_prefix, remove_prefix, startswith_prefix, to_int
23from networking import match_social_media_link
24from subtitles.subtitle import get_subtitle
25from utils import nowdt, rand_number, strings_list
26
27# ruff: noqa: RUF001
# User-facing help text (Telegram markdown). `ai_chat_summary` sends it when the bare
# summary/combination command arrives without replying to any message. The command
# syntax described here (`#N`, `#interval`, `#YYYYMMDDHHMMSS[-YYYYMMDDHHMMSS]`, `@user`)
# mirrors the regular expressions matched in `ai_chat_summary`.
HELP = f"""🤖**AI总结历史消息** (最多{MAX_MESSAGE_SUMMARY}条)
⚠️使用`{PREFIX.AI_SUMMARY}`命令生成聊天记录文件 + 聊天记录AI总结
⚠️使用`{PREFIX.COMBINATION}`命令只生成聊天记录文件, 不对聊天记录AI总结
⚠️额外功能: 使用`{PREFIX.AI_SUMMARY} + 油管或B站链接`对视频内容进行AI总结

{PREFIX.AI_SUMMARY}使用说明:
- # 后跟消息数量或时间范围
- @ 后跟用户名 (可多次使用@)

**1️⃣指定条目数**
- `{PREFIX.AI_SUMMARY} #N`: 总结最近的N条历史消息
- `{PREFIX.AI_SUMMARY} #N @User`: 总结最近只属于User的N条消息
- `{PREFIX.AI_SUMMARY} #N @User @User2`: 总结最近只属于User和User2的N条消息

示例:
- `{PREFIX.AI_SUMMARY} #10`: 总结最近的10条历史消息
- `{PREFIX.AI_SUMMARY} #20 @123456`: 总结最近UID为123456的20条消息
- `{PREFIX.AI_SUMMARY} #20 @John`: 总结最近用户John(大小写均可)的20条消息
- `{PREFIX.AI_SUMMARY} #20 @John @Bob`: 总结最近用户John和Bob的20条消息

**2️⃣指定最近时间段**
- `{PREFIX.AI_SUMMARY} #interval`: 总结最近interval时段内的消息
- `{PREFIX.AI_SUMMARY} #interval @User`: 总结最近interval时段内, 且只属于User的消息
示例:
- `{PREFIX.AI_SUMMARY} #10m`: 总结最近10分钟内的消息
- `{PREFIX.AI_SUMMARY} #2h`: 总结最近2小时内的消息
- `{PREFIX.AI_SUMMARY} #1d`: 总结最近1天的消息
- `{PREFIX.AI_SUMMARY} #2h @John`: 总结最近2小时内的消息, 且只属于John的消息

**3️⃣ 指定具体时间范围**
- `{PREFIX.AI_SUMMARY} #20240101123000`: 总结从2024-01-01 12:30:00开始到现在的消息
- `{PREFIX.AI_SUMMARY} #20240101123000-20240101124000`: 总结从2024-01-01 12:30:00开始到2024-01-01 12:40:00的消息
- `{PREFIX.AI_SUMMARY} #20240101123000 @User`: 总结从2024-01-01 12:30:00开始到现在的消息, 且只属于User的消息
- `{PREFIX.AI_SUMMARY} #20240101123000-20240101124000 @User`: 总结从2024-01-01 12:30:00开始到2024-01-01 12:40:00的消息, 且只属于User的消息
- 时间格式中没有任何分隔符, 必须为YYYYMMDDHHMMSS (14位纯数字)

注意:
- 用上述各种`{PREFIX.AI_SUMMARY}`命令回复消息M, 视为将截止时间设为消息M的发送时间
- 如果用户名中有空格, 请去除空格。例如: 想指定用户为John Doe请使用 `@JohnDoe`
"""
68
# Instruction text placed in front of the TSV history when `ai_chat_summary` builds the
# AI request. The column list documented here matches the header row emitted by
# `parse_history_list`, and the bracketed `username[message_id]` citation format it asks
# for is what `revert_to_original_url` later looks for (`[...]` groups of digits).
# The `\t` sequences are real tab characters at runtime (this is not a raw string).
SYSTEM_PROMPT = """总结以下群聊记录, 识别关键主题、争议话题以及重要观点。提供一个简明的总结, 保留原始意图和上下文。如有必要, 请注明消息username和message_id。
群聊记录以TSV格式发送(分隔符为\t), 列名为: message_id\tusername\tcontent\treply_to_message_id\treply_to_message_content
其中:
- message_id (int): 消息ID, 唯一标识每条消息
- username (str): 发送消息的用户用户名
- content (str): 消息内容
- reply_to_message_id (int | None): 该消息所回复的消息的message_id
- reply_to_message_content (str | None): 该消息所回复的消息的content

示例:
message_id\tusername\tcontent\treply_to_message_id\treply_to_message_content
123\tJohn\t今天好冷啊\t\t
124\tLily\t我这里还好\t\t
125\tAlice\t你那里下雪了吗\t123\t今天好冷啊
126\tJohn\t天气预报说有,但是还没下\t125\t你那里下雪了吗
127\tDavid\t我这里已经下了\t125\t你那里下雪了吗

# 步骤
1. 阅读聊天记录: 仔细查看对话内容, 了解讨论的流程和上下文。
2. 识别关键主题: 提取整个聊天中讨论的主要话题。
3. 忽略废话及无关内容, 专注于关键信息。
4. 突出争议话题: 记录任何分歧或意见不同的地方。
5. 识别重要观点: 捕捉参与者提出的重要观点或论点。
6. 保留意图和上下文: 确保总结反映对话的原始意义和上下文。
7. 引用用户名和消息ID: 在适当情况下, 引用username和message_id以为某些陈述提供上下文。
8. 撰写总结: 以简洁的语言编写总结, 同时包含必要的引用。

# 输出格式
- 使用中文撰写总结。
- 简明扼要地总结聊天记录的内容。
- 在必要时引用消息username和message_id。
- 保持清晰和简洁的表达。
- 引用username和message_id时, 务必使用 **username[message_id]** 格式。如: **John[123]**
- 如果需要同时引用多个message_id, 请使用 **username[id1, id2, ...]** 格式。如: **Alice[125, 126, 127]**

"""
105
# Heading that `daily_summary` passes as `summary_prefix` for scheduled summaries.
DAILY_SUMMARY_PREFIX = "🏪**#爬楼助手**\n"
# File name of the exported history document; history entries whose `file_name` equals
# this value are skipped by `parse_history_list` and `get_txt_format`, so earlier
# exports are not fed back into new ones.
CONTEXT_FILENAME = "聊天记录.txt"
108
109
async def ai_chat_summary(
    client: Client,
    message: Message,
    summary_prefix: str | None = None,
    summary_model_id: str = AI.CHAT_SUMMARY_MODEL_ALIAS,
    **kwargs,
):
    """Export chat history as a text file and optionally post an AI summary of it.

    Flow:
      1. A bare command that is not a reply gets the ``HELP`` text.
      2. A summary command whose argument does not start with ``#`` is checked for a
         YouTube / Bilibili link; the first such link is handed to ``get_subtitle``.
      3. Otherwise the ``#...`` argument is parsed (count, interval, or absolute time
         range, plus optional ``@user`` filters), the history is fetched and sent as a
         document, and - for the summary command only - the AI summary is sent too.

    Args:
        client (Client): The Pyrogram client.
        message (Message): The trigger message object.
        summary_prefix (str | None): Prefix string of the response message. Defaults to
            a line naming the model that produced the summary.
        summary_model_id (str, optional): The model id to use for AI summary.
        **kwargs: ``show_progress`` and ``progress`` are read here; everything is
            forwarded to ``send2tg`` / ``modify_progress`` / ``get_subtitle``.
    """
    # Bare command without a replied-to message: show usage.
    if equal_prefix(message.text, prefix=[PREFIX.AI_SUMMARY, PREFIX.COMBINATION]) and not message.reply_to_message:
        await send2tg(client, message, texts=HELP, **kwargs)
        return
    if not startswith_prefix(message.content, prefix=[PREFIX.AI_SUMMARY, PREFIX.COMBINATION]):
        return

    # Video summary: only for the summary command when no `#...` argument follows it.
    if startswith_prefix(message.text, prefix=PREFIX.AI_SUMMARY) and not remove_prefix(message.text, prefix=PREFIX.AI_SUMMARY).strip().startswith("#"):
        # Links may live in this message, in the replied-to message, or in the
        # replied-to message's entity URLs.
        links_to_check = [message.content, glom(message, Coalesce("reply_to_message.content"), default="")]
        if message.reply_to_message:
            reply_info = parse_msg(message.reply_to_message, use_cache=False, silent=True)
            links_to_check.extend(reply_info["entity_urls"])
        for link in links_to_check:
            matched = await match_social_media_link(link)
            if matched["platform"] in ["youtube", "bilibili"]:
                cache.delete(f"parse_msg-{message.chat.id}-{message.id}")
                msg = Message(id=glom(message, Coalesce("reply_to_message.id", "id")), chat=message.chat, text=Str(f"{PREFIX.SUBTITLE} {matched['url']}"))
                kwargs |= {"ai_summary": True, "send_subtitle_as": "none"}
                await get_subtitle(client, msg, **kwargs)
                return

    info = parse_msg(message, silent=True)
    need_summary = startswith_prefix(info["text"], prefix=PREFIX.AI_SUMMARY)
    # Normalise the combination command to the summary command so the patterns below
    # only need to handle one prefix.
    info["text"] = re.sub(r"^" + PREFIX.COMBINATION, PREFIX.AI_SUMMARY, info["text"], flags=re.IGNORECASE)
    num_history = MAX_MESSAGE_SUMMARY
    begin_time = datetime.fromtimestamp(0, tz=ZoneInfo(TZ))
    end_time = nowdt(tz=TZ)

    offset_id = 0
    replied = message.reply_to_message
    if replied:
        offset_id = replied.id + 1  # +1 so the replied-to message itself is included
        # HELP states that replying to message M makes M's send time the cut-off, so
        # anchor on the replied-to message (previously the trigger message's date was
        # used, which shifted `#interval` windows to "now"). Fall back to the trigger's
        # date, then to the current time, when no usable date is present.
        # NOTE(review): confirm get_history_info_list treats end_time inclusively so
        # the replied-to message itself is still returned.
        anchor = next((d for d in (replied.date, message.date) if isinstance(d, datetime)), None)
        end_time = anchor.astimezone(ZoneInfo(TZ)) if anchor else nowdt(TZ)

    # Order matters: 14-digit timestamps must be tried before the plain `#N` pattern.
    # 3️⃣ /summary #YYYYMMDDHHMMSS @user
    # 4️⃣ /summary #YYYYMMDDHHMMSS-YYYYMMDDHHMMSS @user
    if matched := re.match(r"^" + PREFIX.AI_SUMMARY + r"\s+#(\d{14})-?(\d{14})?(\s+)?(@\w+)?", info["text"]):
        begin_time = datetime.strptime(matched.group(1), "%Y%m%d%H%M%S").replace(tzinfo=ZoneInfo(TZ))
        end_time = datetime.strptime(matched.group(2) or end_time.strftime("%Y%m%d%H%M%S"), "%Y%m%d%H%M%S").replace(tzinfo=ZoneInfo(TZ))
    # 2️⃣ /summary #interval @user (/summary #4h @user)
    elif matched := re.match(r"^" + PREFIX.AI_SUMMARY + r"\s+#(\d+)([mMhHdD])(\s+)?(@\w+)?", info["text"]):
        # The pattern only admits m/h/d, so the lookup cannot miss.
        unit_name = {"m": "minutes", "h": "hours", "d": "days"}[matched.group(2).lower()]
        begin_time = end_time - timedelta(**{unit_name: int(matched.group(1))})
    # 1️⃣ /summary #N @user
    elif matched := re.match(r"^" + PREFIX.AI_SUMMARY + r"\s+#(\d+)(\s+)?(@\w+)?", info["text"]):
        num_history = min(int(matched.group(1)), MAX_MESSAGE_SUMMARY)
    else:
        return
    # Every accepted form allows any number of `@user` filters.
    filter_users = re.findall(r"@([^\s]+)", info["text"])

    # Optional overrides of the source chat / starting message (useful for debugging).
    if matched := re.search(r"cid=(-?\w+)", info["text"], re.IGNORECASE):
        info["cid"] = to_int(matched.group(1))
    if matched := re.search(r"mid=(\d+)", info["text"], re.IGNORECASE):
        offset_id = int(matched.group(1)) + 1  # include this message
    if kwargs.get("show_progress") and "progress" not in kwargs:
        res = await send2tg(client, message, texts=f"📝正在获取历史消息...\n⏩开始时间: {begin_time:%m-%d %H:%M:%S}\n⏯️结束时间: {end_time:%m-%d %H:%M:%S}", **kwargs)
        kwargs["progress"] = res[0]

    history_list = await get_history_info_list(client, info["cid"], offset_id, num_history, begin_time, end_time, filter_users)
    parsed = await parse_history_list(history_list)  # TSV table for the AI
    if parsed.get("num_message", 0) == 0:
        await send2tg(client, message, texts=f"{num_history}条历史消息中未找到符合条件的消息", **kwargs)
        await modify_progress(del_status=True, **kwargs)
        return
    msg = f"⏩开始时间: {parsed['begin_time']}\n"
    msg += f"⏯️结束时间: {parsed['end_time']}\n"
    msg += f"🔢消息条数: {parsed['num_message']}\n"

    # Always deliver the plaintext transcript as a document.
    txt_format = get_txt_format(history_list)
    with io.BytesIO(txt_format.encode("utf-8")) as f:
        await client.send_document(to_int(message.chat.id), f, file_name=CONTEXT_FILENAME, caption=msg)
    if not need_summary:  # combination command: transcript only
        await modify_progress(del_status=True, **kwargs)
        return

    await modify_progress(text=f"🤖AI总结中...\n{msg}", force_update=True, **kwargs)
    ai_msg = Message(  # synthetic message carrying the prompt for the AI handler
        id=rand_number(),
        chat=message.chat,
        text=Str(f"{PREFIX.AI_TEXT_GENERATION} @{summary_model_id} {SYSTEM_PROMPT} {parsed['history']}"),
    )
    ai_res = await ai_text_generation(client, ai_msg, silent=True)
    if texts := ai_res.get("texts"):
        summary_prefix = summary_prefix or f"🤖**{ai_res['model_name']}**:\n"
        kwargs["reply_msg_id"] = -1  # DO NOT send as a reply message
        # Turn the offset-relative ids cited by the model back into message links.
        texts = revert_to_original_url(texts, history_list, parsed["msg_offset"])
        await send2tg(client, message, texts=f"{summary_prefix}⏩开始时间: {begin_time:%m-%d %H:%M:%S}\n⏯️结束时间: {end_time:%m-%d %H:%M:%S}\n{texts}", **kwargs)
    await modify_progress(del_status=True, **kwargs)
221
222
async def parse_history_list(info_list: list[dict]) -> dict:
    """Render chat history entries as the tab-separated table sent to the AI.

    Only human-authored entries with text are kept: entries whose ``file_name`` is
    ``CONTEXT_FILENAME``, entries flagged ``is_bot`` and entries without text are
    skipped. Message ids are emitted relative to the first kept message
    (``mid - msg_offset``) and the caller uses ``msg_offset`` to map them back.

    Args:
        info_list: History entries; each entry is read for ``file_name``, ``is_bot``,
            ``text``, ``mid``, ``datetime``, ``mtype``, ``full_name`` and optionally
            ``reply_to_message_id``.

    Returns:
        ``{}`` when nothing qualifies, otherwise
        ``{"history": str, "num_message": int, "msg_offset": int, "begin_time": str, "end_time": str}``
        where ``history`` is a header row plus one TSV row per message and the two
        times are the ``%m-%d %H:%M:%S`` stamps of the first and last kept message.
    """
    headers = "message_id\tusername\tcontent\treply_to_message_id\treply_to_message_content\n"
    rows: list[str] = []
    msg_offset = 0
    begin_time = ""
    end_time = nowdt(tz=TZ).strftime("%m-%d %H:%M:%S")
    for info in info_list:
        if info["file_name"] == CONTEXT_FILENAME:
            continue
        if info["is_bot"]:  # bots
            continue
        if not info["text"]:  # currently, we only include texts
            continue
        message_id = info["mid"]
        msg_offset = msg_offset or message_id  # first kept message defines the offset
        stamp = info["datetime"].strftime("%m-%d %H:%M:%S")
        begin_time = begin_time or stamp
        end_time = stamp
        media_type = f"[{info['mtype']}] " if info["mtype"] != "text" else ""
        content = media_type + info["text"]

        reply_to_message_id = info.get("reply_to_message_id") or 0
        reply_msg_content = get_message_by_id(info_list, reply_to_message_id).get("message", "")
        if len(reply_msg_content) > 30:
            reply_msg_content = reply_msg_content[:30] + "..."
        # Leave the column empty for non-replies, as SYSTEM_PROMPT documents; emitting
        # `0 - msg_offset` would hand the model a meaningless negative id.
        reply_field = reply_to_message_id - msg_offset if reply_to_message_id else ""
        rows.append(f"{message_id - msg_offset}\t{info['full_name']}\t{content}\t{reply_field}\t{reply_msg_content}\n")
    if not rows:
        return {}

    # IMPORTANT: strip `BOT_TIPS` from the history. `ai_text_generation` uses
    # `BOT_TIPS` to decide whether a message came from the model; if the history
    # contained it, the request would be treated as a `model` message, and a
    # model-only context is not allowed.
    history_csv = "".join(rows).replace(BOT_TIPS, "")
    return {"history": headers + history_csv, "num_message": len(rows), "msg_offset": msg_offset, "begin_time": begin_time, "end_time": end_time}
276
277
def get_txt_format(info_list: list[dict]) -> str:
    """Build the plaintext transcript that is sent as the history document.

    Each kept entry becomes a ``[MM-DD HH:MM:SS]name:`` header line, followed by its
    media tag(s) and text, an optional ``<quote>`` line for the message it replies to,
    and a blank line. Entries named ``CONTEXT_FILENAME`` are skipped, and a media group
    is rendered once (on its first non-text entry) with one tag per group member.
    """
    pieces: list[str] = []
    rendered_groups = set()  # media groups that were already written out
    for entry in info_list:
        if entry["file_name"] == CONTEXT_FILENAME:
            continue
        group_id = entry["media_group_id"]
        if group_id in rendered_groups:
            continue

        pieces.append(f"[{entry['datetime']:%m-%d %H:%M:%S}]{entry['full_name']}:\n")
        kind = entry["mtype"]
        if kind != "text":
            if group_id:
                # one tag per member of the album, e.g. "[photo] [photo] [video]"
                pieces.append(" ".join(f"[{member['mtype']}]" for member in info_list if member["media_group_id"] == group_id))
                rendered_groups.add(group_id)
            else:
                pieces.append(f"[{kind}]")
        pieces.append(entry["text"])

        # append the quoted (replied-to) message when it is part of the history
        quoted = get_message_by_id(info_list, message_id=entry.get("reply_to_message_id") or 0)
        if quoted:
            pieces.append(f"\n<quote>{quoted['username']}: {quoted['message']}</quote>")
        pieces.append("\n\n")
    return "".join(pieces)
303
304
def get_message_by_id(info_list: list[dict], message_id: int = 0) -> dict:
    """Look up a history entry by message id and return a compact view of it.

    Returns ``{}`` when ``message_id`` is falsy or no entry has that ``mid``;
    otherwise a dict with ``username``, ``time`` (``HH:MM:SS``), ``url`` and
    ``message`` (text prefixed with ``[mtype] `` for non-text entries). The first
    matching entry wins.
    """
    if message_id:
        for entry in info_list:
            if entry["mid"] != message_id:
                continue
            kind = entry["mtype"]
            tag = "" if kind == "text" else f"[{kind}] "
            return {
                "username": entry["full_name"],
                "time": f"{entry['datetime']:%H:%M:%S}",
                "url": entry["message_url"],
                "message": tag + entry["text"],
            }
    return {}
319
320
321def get_media_group_by_id(info_list: list[dict], media_group_id: int | None = None) -> list[dict]:
322 if not media_group_id:
323 return []
324 return [x for x in info_list if x["media_group_id"] == media_group_id]
325
326
def revert_to_original_url(ai_texts: str, info_list: list[dict], msg_offset: int) -> str:
    """Replace the model's bracketed message-id citations with markdown links.

    The model sees ids relative to ``msg_offset``; every ``[id]`` / ``[id1, id2]``
    group in ``ai_texts`` is mapped back to the real message and rewritten as
    ``([HH:MM:SS](url), ...)``.

    Ids that do not resolve to a message in ``info_list`` are dropped from the group,
    and a group in which nothing resolves (e.g. an unrelated ``[2024]`` or a markdown
    checkbox ``[ ]``) is left untouched instead of being replaced by empty parentheses.

    Args:
        ai_texts: Summary text produced by the model.
        info_list: History entries the summary was generated from.
        msg_offset: Offset that was subtracted from message ids in the prompt.

    Returns:
        The text with resolvable citations turned into links.
    """

    def get_message_markdown_url(mid: str) -> str:
        msg = get_message_by_id(info_list, int(mid) + msg_offset)
        if not msg:
            return ""
        return f"[{msg['time']}]({msg['url']})"

    def replace_citation(found: re.Match) -> str:
        # Only purely numeric tokens are converted, so stray separators or blanks
        # inside the brackets can never reach int().
        tokens = [str(token).strip() for token in strings_list(found.group(1))]
        links = [link for token in tokens if token.isdecimal() and (link := get_message_markdown_url(token))]
        if not links:
            return found.group(0)
        return f"({', '.join(links)})"

    # Single pass over the text: replacement output is never rescanned.
    return re.sub(r"\[([\d, ]+)\]", replace_citation, ai_texts)
340
341
async def daily_summary(client: Client):
    """Post scheduled summaries for the chats configured in ``TID.DAILY_SUMMARY``.

    Runs only when the current hour (in ``TZ``) is one of the configured slots; the
    slot decides how many hours of history are summarised. ``TID.DAILY_SUMMARY`` must
    be a JSON object mapping a source chat id to the chat id that receives the result;
    anything else is logged and ignored.

    Args:
        client (Client): The Pyrogram client.
    """
    # hour of day -> number of hours to look back
    durations = {
        0: 12,
        12: 12,
        7: 24,
    }
    duration = durations.get(nowdt(TZ).hour)
    if duration is None:
        return

    # json.loads raises TypeError for non-string input and ValueError
    # (JSONDecodeError) for malformed JSON.
    try:
        mapping = json.loads(TID.DAILY_SUMMARY)
    except (TypeError, ValueError):
        mapping = None
    # Valid JSON that is not an object (null, list, number, ...) has no `.items()`.
    if not isinstance(mapping, dict):
        logger.warning(f"Invalid DAILY_SUMMARY: {TID.DAILY_SUMMARY}")
        return

    for source_chat_id, target_chat_id in mapping.items():
        logger.info(f"Summary chat {source_chat_id}, send results to {target_chat_id}")
        # Synthetic trigger message: addressed to the target chat, while `cid=` points
        # the history lookup at the source chat.
        message = Message(
            id=rand_number(),
            chat=Chat(id=target_chat_id),
            text=f"{PREFIX.AI_SUMMARY} #{duration}h cid={to_int(source_chat_id)}",  # type: ignore
        )
        await ai_chat_summary(client, message, summary_prefix=DAILY_SUMMARY_PREFIX, target_chat=to_int(target_chat_id), reply_msg_id=-1)
367 await ai_chat_summary(client, message, summary_prefix=DAILY_SUMMARY_PREFIX, target_chat=to_int(target_chat_id), reply_msg_id=-1)