Commit 1f402d6
Changed files (2)
src
messages
src/ai/chat_summary.py
@@ -22,8 +22,9 @@ from messages.sender import send2tg
from messages.utils import equal_prefix, remove_prefix, startswith_prefix, to_int
from networking import match_social_media_link
from subtitles.subtitle import get_subtitle
-from utils import nowdt, rand_number
+from utils import nowdt, rand_number, strings_list
+# ruff: noqa: RUF001
HELP = f"""🤖**AI总结历史消息** (最多{MAX_MESSAGE_SUMMARY}条)
⚠️使用`{PREFIX.AI_SUMMARY}`命令生成聊天记录文件 + 聊天记录AI总结
⚠️使用`{PREFIX.COMBINATION}`命令只生成聊天记录文件, 不对聊天记录AI总结
@@ -65,15 +66,22 @@ HELP = f"""🤖**AI总结历史消息** (最多{MAX_MESSAGE_SUMMARY}条)
- 如果用户名中有空格, 请去除空格。例如: 想指定用户为John Doe请使用 `@JohnDoe`
"""
-SYSTEM_PROMPT = """总结以下网络聊天记录, 识别关键主题、争议话题以及重要观点。提供一个简明的总结, 保留原始意图和上下文。如有必要, 引用原始用户名和时间戳。
-每一条消息的格式如下:
-{
- "username": "消息发送者",
- "time": "消息发送时间",
- "url": "消息链接",
- "message": "本条消息内容",
- "reply_to_message": "被此条消息回复的消息"
-}
+SYSTEM_PROMPT = """总结以下群聊记录, 识别关键主题、争议话题以及重要观点。提供一个简明的总结, 保留原始意图和上下文。如有必要, 请注明消息username和message_id。
+群聊记录以TSV格式发送(分隔符为\t), 列名为: message_id\tusername\tcontent\treply_to_message_id\treply_to_message_content
+其中:
+- message_id (int): 消息ID, 唯一标识每条消息
+- username (str): 发送消息的用户用户名
+- content (str): 消息内容
+- reply_to_message_id (int | None): 该消息所回复的消息的message_id
+- reply_to_message_content (str | None): 该消息所回复的消息的content
+
+示例:
+message_id\tusername\tcontent\treply_to_message_id\treply_to_message_content
+123\tJohn\t今天好冷啊\t\t
+124\tLily\t我这里还好\t\t
+125\tAlice\t你那里下雪了吗\t123\t今天好冷啊
+126\tJohn\t天气预报说有,但是还没下\t125\t你那里下雪了吗
+127\tDavid\t我这里已经下了\t125\t你那里下雪了吗
# 步骤
1. 阅读聊天记录: 仔细查看对话内容, 了解讨论的流程和上下文。
@@ -82,16 +90,17 @@ SYSTEM_PROMPT = """总结以下网络聊天记录, 识别关键主题、争议
4. 突出争议话题: 记录任何分歧或意见不同的地方。
5. 识别重要观点: 捕捉参与者提出的重要观点或论点。
6. 保留意图和上下文: 确保总结反映对话的原始意义和上下文。
-7. 引用用户名和时间戳: 在适当情况下, 引用用户名和时间戳以为某些陈述提供上下文。
+7. 引用用户名和消息ID: 在适当情况下, 引用username和message_id以为某些陈述提供上下文。
8. 撰写总结: 以简洁的语言编写总结, 同时包含必要的引用。
# 输出格式
- 使用中文撰写总结。
- 简明扼要地总结聊天记录的内容。
-- 在必要时引用用户名和时间。
+- 在必要时引用消息username和message_id。
- 保持清晰和简洁的表达。
-- 引用用户名时, 请使用 **username** 格式。如: **username**
-- 引用时间时, 请使用 [HH:MM:SS](url) 格式。如: [12:30:00](https://t.me/c/1234/56789)
+- 引用username和message_id时, 务必使用 **username[message_id]** 格式。如: **John[123]**
+- 如果需要同时引用多个message_id, 请使用 **username[id1, id2, ...]** 格式。如: **Alice[125, 126, 127]**
+
"""
DAILY_SUMMARY_PREFIX = "🏪**#爬楼助手**\n"
@@ -181,17 +190,17 @@ async def ai_chat_summary(
res = await send2tg(client, message, texts=f"📝正在获取历史消息...\n⏩开始时间: {begin_time:%m-%d %H:%M:%S}\n⏯️结束时间: {end_time:%m-%d %H:%M:%S}", **kwargs)
kwargs["progress"] = res[0]
history_list = await get_history_info_list(client, info["cid"], offset_id, num_history, begin_time, end_time, filter_users)
- # parse the history contexts
- parsed = await parse_history_list(history_list)
- if parsed["num_message"] == 0:
+ parsed = await parse_history_list(history_list) # parse the history as TSV
+ if parsed.get("num_message", 0) == 0:
await send2tg(client, message, texts=f"{num_history}条历史消息中未找到符合条件的消息", **kwargs)
await modify_progress(del_status=True, **kwargs)
return
- msg = f"⏩开始时间: {parsed['begin_time']:%m-%d %H:%M:%S}\n"
- msg += f"⏯️结束时间: {parsed['end_time']:%m-%d %H:%M:%S}\n"
+ msg = f"⏩开始时间: {parsed['begin_time']}\n"
+ msg += f"⏯️结束时间: {parsed['end_time']}\n"
msg += f"🔢消息条数: {parsed['num_message']}\n"
# send contexts as txt file
- with io.BytesIO(parsed["txt_format"].encode("utf-8")) as f:
+ txt_format = get_txt_format(history_list)
+ with io.BytesIO(txt_format.encode("utf-8")) as f:
await client.send_document(to_int(message.chat.id), f, file_name=CONTEXT_FILENAME, caption=msg)
if not need_summay:
await modify_progress(del_status=True, **kwargs)
@@ -206,6 +215,7 @@ async def ai_chat_summary(
if texts := ai_res.get("texts"):
summary_prefix = summary_prefix or f"🤖**{ai_res['model_name']}**:\n"
kwargs["reply_msg_id"] = -1 # DO NOT send as a reply message
+ texts = revert_to_original_url(texts, history_list, parsed["msg_offset"])
await send2tg(client, message, texts=f"{summary_prefix}⏩开始时间: {begin_time:%m-%d %H:%M:%S}\n⏯️结束时间: {end_time:%m-%d %H:%M:%S}\n{texts}", **kwargs)
await modify_progress(del_status=True, **kwargs)
@@ -214,33 +224,45 @@ async def parse_history_list(info_list: list[dict]) -> dict:
"""Parse chat history info list.
Currently, we only summarize text contents.
+
+ Generate a history string in TSV format (columns separated by tabs).
+
+ Returns:
+ {"history": str, "num_message": int, "msg_offset": int, "begin_time": str, "end_time": str}, or an empty dict when no message qualifies.
+
"""
- begin_time = datetime.fromtimestamp(0, tz=ZoneInfo(TZ))
- end_time = nowdt(tz=TZ)
- messages: list[dict] = [] # hold user messages
+ now = nowdt(tz=TZ).strftime("%m-%d %H:%M:%S")
+ begin_time = ""
+ end_time = now
+ headers = "message_id\tusername\tcontent\treply_to_message_id\treply_to_message_content\n"
+ history_csv = ""
+ num_message = 0
+ msg_offset = 0
for info in info_list:
if info["file_name"] == CONTEXT_FILENAME:
continue
if info["is_bot"]: # bots
continue
- if info["text"]: # currently, we only include texts
- if len(messages) == 0:
- begin_time = info["datetime"]
- end_time = info["datetime"]
- media_type = f"[{info['mtype']}] " if info["mtype"] != "text" else ""
- content = {
- "username": info["full_name"],
- "time": f"{info['datetime']:%H:%M:%S}",
- "url": info["message_url"],
- "message": media_type + info["text"],
- }
- if reply_msg_content := get_message_by_id(info_list, info.get("reply_to_message_id")):
- content["reply_to_message"] = reply_msg_content
- messages.append(content)
- if not messages:
+ if not info["text"]: # currently, we only include texts
+ continue
+ num_message += 1
+ msg_offset = msg_offset or info["mid"]
+ dt = info["datetime"]
+ begin_time = begin_time or dt.strftime("%m-%d %H:%M:%S")
+ end_time = dt.strftime("%m-%d %H:%M:%S")
+ media_type = f"[{info['mtype']}] " if info["mtype"] != "text" else ""
+ message_id = info["mid"]
+ reply_to_message_id = info.get("reply_to_message_id") or 0
+ username = info["full_name"]
+ content = media_type + info["text"]
+ reply_msg = get_message_by_id(info_list, reply_to_message_id)
+ reply_msg_content = reply_msg.get("message", "")
+ if len(reply_msg_content) > 30:
+ reply_msg_content = reply_msg_content[:30] + "..."
+ history_csv += f"{message_id - msg_offset}\t{username}\t{content}\t{reply_to_message_id - msg_offset}\t{reply_msg_content}\n"
+ if not history_csv:
return {}
- history = json.dumps(messages, ensure_ascii=False)
"""IMPORTANT: We need to remove `BOT_TIPS` in the history!
Because we need to call `ai_text_generation` function,
@@ -249,8 +271,8 @@ async def parse_history_list(info_list: list[dict]) -> dict:
If the history contains `BOT_TIPS`, the context of this message will be `model` (not `user`)
But `model` only message is not allowed, so we need to remove `BOT_TIPS`
"""
- history = history.replace(BOT_TIPS, "")
- return {"history": history, "num_message": len(messages), "txt_format": get_txt_format(info_list), "begin_time": begin_time, "end_time": end_time}
+ history_csv = history_csv.replace(BOT_TIPS, "")
+ return {"history": headers + history_csv, "num_message": num_message, "msg_offset": msg_offset, "begin_time": begin_time, "end_time": end_time}
def get_txt_format(info_list: list[dict]) -> str:
@@ -274,13 +296,13 @@ def get_txt_format(info_list: list[dict]) -> str:
txt_format += f"[{info['mtype']}]"
txt_format += info["text"]
# append quote msg
- if reply_msg_content := get_message_by_id(info_list, info.get("reply_to_message_id")):
+ if reply_msg_content := get_message_by_id(info_list, message_id=info.get("reply_to_message_id") or 0):
txt_format += f"\n<quote>{reply_msg_content['username']}: {reply_msg_content['message']}</quote>"
txt_format += "\n\n"
return txt_format
-def get_message_by_id(info_list: list[dict], message_id: int | None = None) -> dict:
+def get_message_by_id(info_list: list[dict], message_id: int = 0) -> dict:
"""Get message by id."""
if not message_id:
return {}
@@ -302,6 +324,21 @@ def get_media_group_by_id(info_list: list[dict], media_group_id: int | None = No
return [x for x in info_list if x["media_group_id"] == media_group_id]
+def revert_to_original_url(ai_texts: str, info_list: list[dict], msg_offset: int) -> str:
+    """Replace `[id, ...]` message citations in the AI summary with markdown links.
+
+    The ids cited in the summary are relative ones (real message id minus `msg_offset`),
+    so every id is shifted back by `msg_offset` before it is looked up in `info_list`.
+
+    Args:
+        ai_texts: summary text returned by the AI model.
+        info_list: history info list used to look up the cited messages.
+        msg_offset: offset that was subtracted from the message ids sent to the model.
+
+    Returns:
+        The summary with every resolvable citation rewritten as `([time](url), ...)`.
+        A bracket group in which no id can be resolved is left untouched.
+    """
+
+    def get_message_markdown_url(mid: str) -> str:
+        mid = mid.strip()
+        if not mid.isdecimal():  # skip empty / malformed tokens instead of raising in int()
+            return ""
+        msg = get_message_by_id(info_list, int(mid) + msg_offset)
+        if not msg:
+            return ""
+        return f"[{msg['time']}]({msg['url']})"
+
+    def replace_group(match: re.Match) -> str:
+        # NOTE(review): assumes `strings_list` splits the comma separated ids into a list of str - confirm
+        urls = [url for mid in strings_list(match.group(1)) if (url := get_message_markdown_url(mid))]
+        # keep the original text when nothing resolves, e.g. a plain bracketed number like [2024]
+        return f"({', '.join(urls)})" if urls else match.group(0)
+
+    return re.sub(r"\[([\d, ]+)\]", replace_group, ai_texts)
+
+
async def daily_summary(client: Client):
"""Daily summary of the chat history."""
now = nowdt(TZ)
src/messages/chat_history.py
@@ -86,7 +86,7 @@ async def get_history_info_list_via_turso(
"mid": int(row["mid"]),
"full_name": row["fullname"],
"message_url": f"{message_url_prefix}/{row['mid']}",
- "reply_to_message_id": to_int(row["reply"]),
+ "reply_to_message_id": to_int(row["reply"]) or 0,
"media_group_id": row["gid"],
}
for row in rows
@@ -134,7 +134,7 @@ async def get_history_info_list_via_telegram(
if info["datetime"] > end_time:
continue
if msg.reply_to_message_id:
- info["reply_to_message_id"] = msg.reply_to_message_id
+ info["reply_to_message_id"] = msg.reply_to_message_id or 0
if not users:
history.append(info)
continue