Commit 1f402d6

benny-dou <60535774+benny-dou@users.noreply.github.com>
2026-01-19 06:34:54
chore(summary): update AI chat summary to use TSV format
1 parent b8be862
Changed files (2)
src/ai/chat_summary.py
@@ -22,8 +22,9 @@ from messages.sender import send2tg
 from messages.utils import equal_prefix, remove_prefix, startswith_prefix, to_int
 from networking import match_social_media_link
 from subtitles.subtitle import get_subtitle
-from utils import nowdt, rand_number
+from utils import nowdt, rand_number, strings_list
 
+# ruff: noqa: RUF001
 HELP = f"""🤖**AI总结历史消息** (最多{MAX_MESSAGE_SUMMARY}条)
 ⚠️使用`{PREFIX.AI_SUMMARY}`命令生成聊天记录文件 + 聊天记录AI总结
 ⚠️使用`{PREFIX.COMBINATION}`命令只生成聊天记录文件, 不对聊天记录AI总结
@@ -65,15 +66,22 @@ HELP = f"""🤖**AI总结历史消息** (最多{MAX_MESSAGE_SUMMARY}条)
 - 如果用户名中有空格, 请去除空格。例如: 想指定用户为John Doe请使用 `@JohnDoe`
 """
 
-SYSTEM_PROMPT = """总结以下网络聊天记录, 识别关键主题、争议话题以及重要观点。提供一个简明的总结, 保留原始意图和上下文。如有必要, 引用原始用户名和时间戳。
-每一条消息的格式如下:
-{
-    "username": "消息发送者",
-    "time": "消息发送时间",
-    "url": "消息链接",
-    "message": "本条消息内容",
-    "reply_to_message": "被此条消息回复的消息"
-}
+SYSTEM_PROMPT = """总结以下群聊记录, 识别关键主题、争议话题以及重要观点。提供一个简明的总结, 保留原始意图和上下文。如有必要, 请注明消息username和message_id。
+群聊记录以TSV格式发送(分隔符为\t), 列名为: message_id\tusername\tcontent\treply_to_message_id\treply_to_message_content
+其中:
+- message_id (int): 消息ID, 唯一标识每条消息
+- username (str): 发送消息的用户用户名
+- content (str): 消息内容
+- reply_to_message_id (int | None): 该消息所回复的消息的message_id
+- reply_to_message_content (str | None): 该消息所回复的消息的content
+
+示例:
+message_id\tusername\tcontent\treply_to_message_id\treply_to_message_content
+123\tJohn\t今天好冷啊\t\t
+124\tLily\t我这里还好\t\t
+125\tAlice\t你那里下雪了吗\t123\t今天好冷啊
+126\tJohn\t天气预报说有,但是还没下\t125\t你那里下雪了吗
+127\tDavid\t我这里已经下了\t125\t你那里下雪了吗
 
 # 步骤
 1. 阅读聊天记录: 仔细查看对话内容, 了解讨论的流程和上下文。
@@ -82,16 +90,17 @@ SYSTEM_PROMPT = """总结以下网络聊天记录, 识别关键主题、争议
 4. 突出争议话题: 记录任何分歧或意见不同的地方。
 5. 识别重要观点: 捕捉参与者提出的重要观点或论点。
 6. 保留意图和上下文: 确保总结反映对话的原始意义和上下文。
-7. 引用用户名和时间戳: 在适当情况下, 引用用户名和时间戳以为某些陈述提供上下文。
+7. 引用用户名和消息ID: 在适当情况下, 引用username和message_id以为某些陈述提供上下文。
 8. 撰写总结: 以简洁的语言编写总结, 同时包含必要的引用。
 
 # 输出格式
 - 使用中文撰写总结。
 - 简明扼要地总结聊天记录的内容。
-- 在必要时引用用户名和时间。
+- 在必要时引用消息username和message_id。
 - 保持清晰和简洁的表达。
-- 引用用户名时, 请使用 **username** 格式。如: **username**
-- 引用时间时, 请使用 [HH:MM:SS](url) 格式。如: [12:30:00](https://t.me/c/1234/56789)
+- 引用username和message_id时, 务必使用 **username[message_id]** 格式。如: **John[123]**
+- 如果需要同时引用多个message_id, 请使用 **username[id1, id2, ...]** 格式。如: **Alice[125, 126, 127]**
+
 """
 
 DAILY_SUMMARY_PREFIX = "🏪**#爬楼助手**\n"
@@ -181,17 +190,17 @@ async def ai_chat_summary(
         res = await send2tg(client, message, texts=f"📝正在获取历史消息...\n⏩开始时间: {begin_time:%m-%d %H:%M:%S}\n⏯️结束时间: {end_time:%m-%d %H:%M:%S}", **kwargs)
         kwargs["progress"] = res[0]
     history_list = await get_history_info_list(client, info["cid"], offset_id, num_history, begin_time, end_time, filter_users)
-    # parse the history contexts
-    parsed = await parse_history_list(history_list)
-    if parsed["num_message"] == 0:
+    parsed = await parse_history_list(history_list)  # parse the history as TSV
+    if parsed.get("num_message", 0) == 0:
         await send2tg(client, message, texts=f"{num_history}条历史消息中未找到符合条件的消息", **kwargs)
         await modify_progress(del_status=True, **kwargs)
         return
-    msg = f"⏩开始时间: {parsed['begin_time']:%m-%d %H:%M:%S}\n"
-    msg += f"⏯️结束时间: {parsed['end_time']:%m-%d %H:%M:%S}\n"
+    msg = f"⏩开始时间: {parsed['begin_time']}\n"
+    msg += f"⏯️结束时间: {parsed['end_time']}\n"
     msg += f"🔢消息条数: {parsed['num_message']}\n"
     # send contexts as txt file
-    with io.BytesIO(parsed["txt_format"].encode("utf-8")) as f:
+    txt_format = get_txt_format(history_list)
+    with io.BytesIO(txt_format.encode("utf-8")) as f:
         await client.send_document(to_int(message.chat.id), f, file_name=CONTEXT_FILENAME, caption=msg)
     if not need_summay:
         await modify_progress(del_status=True, **kwargs)
@@ -206,6 +215,7 @@ async def ai_chat_summary(
     if texts := ai_res.get("texts"):
         summary_prefix = summary_prefix or f"🤖**{ai_res['model_name']}**:\n"
         kwargs["reply_msg_id"] = -1  # DO NOT send as a reply message
+        texts = revert_to_original_url(texts, history_list, parsed["msg_offset"])
         await send2tg(client, message, texts=f"{summary_prefix}⏩开始时间: {begin_time:%m-%d %H:%M:%S}\n⏯️结束时间: {end_time:%m-%d %H:%M:%S}\n{texts}", **kwargs)
         await modify_progress(del_status=True, **kwargs)
 
@@ -214,33 +224,45 @@ async def parse_history_list(info_list: list[dict]) -> dict:
     """Parse chat history info list.
 
     Currently, we only summarize text contents.
+
+    Generate a history string in TSV format (tab-separated, with a header row).
+
+    Returns:
+        {"history": str, "num_message": int, "msg_offset": int, "begin_time": str, "end_time": str}
+
     """
-    begin_time = datetime.fromtimestamp(0, tz=ZoneInfo(TZ))
-    end_time = nowdt(tz=TZ)
-    messages: list[dict] = []  # hold user messages
+    now = nowdt(tz=TZ).strftime("%m-%d %H:%M:%S")
+    begin_time = ""
+    end_time = now
+    headers = "message_id\tusername\tcontent\treply_to_message_id\treply_to_message_content\n"
+    history_csv = ""
+    num_message = 0
+    msg_offset = 0
     for info in info_list:
         if info["file_name"] == CONTEXT_FILENAME:
             continue
         if info["is_bot"]:  # bots
             continue
-        if info["text"]:  # currently, we only include texts
-            if len(messages) == 0:
-                begin_time = info["datetime"]
-            end_time = info["datetime"]
-            media_type = f"[{info['mtype']}] " if info["mtype"] != "text" else ""
-            content = {
-                "username": info["full_name"],
-                "time": f"{info['datetime']:%H:%M:%S}",
-                "url": info["message_url"],
-                "message": media_type + info["text"],
-            }
-            if reply_msg_content := get_message_by_id(info_list, info.get("reply_to_message_id")):
-                content["reply_to_message"] = reply_msg_content
-            messages.append(content)
-    if not messages:
+        if not info["text"]:  # currently, we only include texts
+            continue
+        num_message += 1
+        msg_offset = msg_offset or info["mid"]
+        dt = info["datetime"]
+        begin_time = begin_time or dt.strftime("%m-%d %H:%M:%S")
+        end_time = dt.strftime("%m-%d %H:%M:%S")
+        media_type = f"[{info['mtype']}] " if info["mtype"] != "text" else ""
+        message_id = info["mid"]
+        reply_to_message_id = info.get("reply_to_message_id") or 0
+        username = info["full_name"]
+        content = media_type + info["text"]
+        reply_msg = get_message_by_id(info_list, reply_to_message_id)
+        reply_msg_content = reply_msg.get("message", "")
+        if len(reply_msg_content) > 30:
+            reply_msg_content = reply_msg_content[:30] + "..."
+        history_csv += f"{message_id - msg_offset}\t{username}\t{content}\t{reply_to_message_id - msg_offset}\t{reply_msg_content}\n"
+    if not history_csv:
         return {}
 
-    history = json.dumps(messages, ensure_ascii=False)
     """IMPORTANT: We need to remove `BOT_TIPS` in the history!
 
     Because we need to call `ai_text_generation` function,
@@ -249,8 +271,8 @@ async def parse_history_list(info_list: list[dict]) -> dict:
     If the history contains `BOT_TIPS`, the context of this message will be `model` (not `user`)
     But `model` only message is not allowed, so we need to remove `BOT_TIPS`
     """
-    history = history.replace(BOT_TIPS, "")
-    return {"history": history, "num_message": len(messages), "txt_format": get_txt_format(info_list), "begin_time": begin_time, "end_time": end_time}
+    history_csv = history_csv.replace(BOT_TIPS, "")
+    return {"history": headers + history_csv, "num_message": num_message, "msg_offset": msg_offset, "begin_time": begin_time, "end_time": end_time}
 
 
 def get_txt_format(info_list: list[dict]) -> str:
@@ -274,13 +296,13 @@ def get_txt_format(info_list: list[dict]) -> str:
                 txt_format += f"[{info['mtype']}]"
         txt_format += info["text"]
         # append quote msg
-        if reply_msg_content := get_message_by_id(info_list, info.get("reply_to_message_id")):
+        if reply_msg_content := get_message_by_id(info_list, message_id=info.get("reply_to_message_id") or 0):
             txt_format += f"\n<quote>{reply_msg_content['username']}: {reply_msg_content['message']}</quote>"
         txt_format += "\n\n"
     return txt_format
 
 
-def get_message_by_id(info_list: list[dict], message_id: int | None = None) -> dict:
+def get_message_by_id(info_list: list[dict], message_id: int = 0) -> dict:
     """Get message by id."""
     if not message_id:
         return {}
@@ -302,6 +324,21 @@ def get_media_group_by_id(info_list: list[dict], media_group_id: int | None = No
     return [x for x in info_list if x["media_group_id"] == media_group_id]
 
 
+def revert_to_original_url(ai_texts: str, info_list: list[dict], msg_offset: int) -> str:
+    def get_message_markdown_url(mid: str) -> str:
+        real_mid = int(mid) + msg_offset
+        msg = get_message_by_id(info_list, real_mid)
+        if not msg:
+            return ""
+        return f"[{msg['time']}]({msg['url']})"
+
+    for group in re.findall(r"\[([\d, ]+)\]", ai_texts):
+        url_list = [get_message_markdown_url(mid) for mid in strings_list(group)]
+        urls = ", ".join(url_list)
+        ai_texts = ai_texts.replace(f"[{group}]", f"({urls})")
+    return ai_texts
+
+
 async def daily_summary(client: Client):
     """Daily summary of the chat history."""
     now = nowdt(TZ)
src/messages/chat_history.py
@@ -86,7 +86,7 @@ async def get_history_info_list_via_turso(
             "mid": int(row["mid"]),
             "full_name": row["fullname"],
             "message_url": f"{message_url_prefix}/{row['mid']}",
-            "reply_to_message_id": to_int(row["reply"]),
+            "reply_to_message_id": to_int(row["reply"]) or 0,
             "media_group_id": row["gid"],
         }
         for row in rows
@@ -134,7 +134,7 @@ async def get_history_info_list_via_telegram(
         if info["datetime"] > end_time:
             continue
         if msg.reply_to_message_id:
-            info["reply_to_message_id"] = msg.reply_to_message_id
+            info["reply_to_message_id"] = msg.reply_to_message_id or 0
         if not users:
             history.append(info)
             continue