Commit 053bad0

benny-dou <60535774+benny-dou@users.noreply.github.com>
2025-04-03 10:56:40
refactor(combine): merge combine history function into AI summary
1 parent c7bdcf7
src/llm/summary.py
@@ -19,15 +19,15 @@ from messages.chat_history import get_parsed_chat_history
 from messages.parser import parse_msg
 from messages.progress import modify_progress
 from messages.sender import send2tg
-from messages.utils import equal_prefix, to_int
+from messages.utils import equal_prefix, startswith_prefix, to_int
 from utils import nowdt
 
-HELP = f"""🤖**GPT总结历史消息** (最多{MAX_MESSAGE_SUMMARY}条)
+HELP = f"""🤖**AI总结历史消息** (最多{MAX_MESSAGE_SUMMARY}条)
+⚠️使用`{PREFIX.COMBINATION}`命令只生成聊天记录文件, 不进行AI总结
 使用说明:
 # 后跟消息数量或时间范围
 @ 后跟用户名 (可多次使用@)
 
-
 **1️⃣指定条目数**
 - `{PREFIX.AI_SUMMARY} #N`: 总结最近的N条历史消息
 - `{PREFIX.AI_SUMMARY} #N @User`: 总结最近只属于User的N条消息
@@ -53,11 +53,11 @@ HELP = f"""🤖**GPT总结历史消息** (最多{MAX_MESSAGE_SUMMARY}条)
 - `{PREFIX.AI_SUMMARY} #20240101123000-20240101124000`: 总结从2024-01-01 12:30:00开始到2024-01-01 12:40:00的消息
 - `{PREFIX.AI_SUMMARY} #20240101123000 @User`: 总结从2024-01-01 12:30:00开始到现在的消息, 且只属于User的消息
 - `{PREFIX.AI_SUMMARY} #20240101123000-20240101124000 @User`: 总结从2024-01-01 12:30:00开始到2024-01-01 12:40:00的消息, 且只属于User的消息
+- 时间格式中没有任何分隔符, 必须为YYYYMMDDHHMMSS (14位纯数字)
 
 注意:
 - 用上述各种`{PREFIX.AI_SUMMARY}`命令回复消息M, 视为将截止时间设为消息M的发送时间
 - 如果用户名中有空格, 请去除空格。例如: 想指定用户为John Doe请使用 `@JohnDoe`
-- 3️⃣的时间格式中没有任何分隔符, 必须为YYYYMMDDHHMMSS (14位纯数字)
 """
 DAILY_SUMMARY_PREFIX = "🏪**#爬楼助手**\n"
 CONTEXT_FILENAME = "聊天记录.txt"
@@ -72,12 +72,14 @@ async def ai_summary(client: Client, message: Message, summary_prefix: str | Non
         summary_prefix (str | None): Prefix string of the response message.
     """
     # send docs if message == "/summary" or "/combine"
-    if equal_prefix(message.text, prefix=[PREFIX.AI_SUMMARY]):
+    if equal_prefix(message.text, prefix=[PREFIX.AI_SUMMARY, PREFIX.COMBINATION]):
         await send2tg(client, message, texts=HELP, **kwargs)
         return
 
-    # get the number of messages to combine
     info = parse_msg(message)
+    need_summay = startswith_prefix(info["text"], prefix=[PREFIX.AI_SUMMARY])
+    # replace /combine with /summary, because we need to use `/summary` to match different patterns
+    info["text"] = re.sub(r"^" + PREFIX.COMBINATION, PREFIX.AI_SUMMARY, info["text"], flags=re.IGNORECASE)
     num_history = MAX_MESSAGE_SUMMARY
     filter_users = []
     begin_time = datetime.fromtimestamp(0, tz=ZoneInfo(TZ))
@@ -146,6 +148,9 @@ async def ai_summary(client: Client, message: Message, summary_prefix: str | Non
     # send contexts as txt file
     with io.BytesIO(parsed["txt_format"].encode("utf-8")) as f:
         await client.send_document(to_int(message.chat.id), f, file_name=CONTEXT_FILENAME, caption=msg)
+    if not need_summay:
+        await modify_progress(del_status=True, **kwargs)
+        return
     await modify_progress(text=f"🤖**{summary_model_name}**总结中...\n{msg}", force_update=True, **kwargs)
     config = get_gpt_config(model_type="text", contexts=contexts, force_model=summary_model)
 
src/others/combine_history.py
@@ -1,100 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-import io
-import re
-
-from pyrogram.client import Client
-from pyrogram.types import Message
-
-from config import MAX_MESSAGE_RETRIEVED, PREFIX, READING_SPEED
-from llm.utils import count_tokens
-from messages.chat_history import get_parsed_chat_history
-from messages.parser import parse_msg
-from messages.sender import send2tg
-from messages.utils import equal_prefix, get_reply_to, startswith_prefix
-from utils import to_int
-
-HELP = f"""
-💬**合并对话历史** (最多{MAX_MESSAGE_RETRIEVED}条)
-使用说明:
-1. `{PREFIX.COMBINATION} + #N`
-将最近的N条消息文本合并为txt文件
-
-2. `{PREFIX.COMBINATION} + #N + @User`
-将最近只属于User的N条消息合并为txt文件
-
-如果以 `{PREFIX.COMBINATION} + #N` (或附加User) 回复消息M
-则合并消息M之前的N条消息文本 (包含M)
-
-示例:
-1. `{PREFIX.COMBINATION} #10`: 合并最近10条消息为txt文本
-2. `{PREFIX.COMBINATION} #20 @123456`: 合并最近UID为123456的20条消息为txt文本
-3. `{PREFIX.COMBINATION} #20 @John`: 合并最近用户John(大小写均可)的20条消息为txt文本
-如果用户名中有空格, 请去除空格。例如: 想指定用户为John Doe请使用 `@JohnDoe`
-"""
-
-
-async def combine_history(client: Client, message: Message, **kwargs):
-    """Merge recent chat history into a txt file and send it to the chat.
-
-    Handles `PREFIX.COMBINATION #N` and `PREFIX.COMBINATION #N @User`.
-    Messages that start with the prefix but match neither form get HELP.
-
-    Args:
-        client (Client): Pyrogram client used to read history and send the document.
-        message (Message): The command message. When it replies to another message,
-            that replied message becomes the anchor: history is fetched with an
-            offset of its id + 1 and the document is sent relative to it.
-        **kwargs: Forwarded to `send2tg`. `target_chat` and `reply_msg_id` are also
-            read here to pick the destination chat and the reply target.
-    """
-    if not startswith_prefix(message.text, prefix=[PREFIX.COMBINATION]):
-        return
-    # send docs if the message is exactly the combine prefix, without reply
-    if equal_prefix(message.text, prefix=[PREFIX.COMBINATION]) and not message.reply_to_message:
-        await send2tg(client, message, texts=HELP, **kwargs)
-        return
-
-    # get the number of messages to combine (capped) and the optional user filter
-    info = parse_msg(message)
-    if matched := re.match(r"^" + PREFIX.COMBINATION + r"\s+#(\d+)\s+@(\w+)", info["text"]):
-        num_history = min(int(matched.group(1)), MAX_MESSAGE_RETRIEVED)
-        filter_user = str(matched.group(2))
-        file_name = f"最近{num_history}条{filter_user}的消息.txt"
-    elif matched := re.match(r"^" + PREFIX.COMBINATION + r"\s+#(\d+)", info["text"]):
-        num_history = min(int(matched.group(1)), MAX_MESSAGE_RETRIEVED)
-        filter_user = ""
-        file_name = f"最近{num_history}条消息记录.txt"
-    else:
-        await send2tg(client, message, texts=HELP, **kwargs)
-        return
-
-    offset_id = info["mid"]
-    # reply a message with /combine
-    if message.reply_to_message:
-        message = message.reply_to_message
-        info = parse_msg(message, silent=True)
-        offset_id = info["mid"] + 1  # include the reply message
-    # set custom chat_id and message_id (useful for debug)
-    # NOTE(review): when replying, `info` was rebound above, so cid=/mid= are read
-    # from the replied message's text rather than the command's — confirm intended.
-    if matched := re.search(r"cid=(-?\w+)", info["text"], re.IGNORECASE):
-        info["cid"] = to_int(matched.group(1))
-    if matched := re.search(r"mid=(\d+)", info["text"], re.IGNORECASE):
-        info["mid"] = int(matched.group(1))
-        offset_id = info["mid"] + 1  # include this message
-
-    history = await get_parsed_chat_history(client, info["cid"], offset_id, num_history, users=filter_user)
-    if not history:
-        await send2tg(client, message, texts=f"最近{num_history}条消息中未找到符合条件的消息", **kwargs)
-        return
-    # Collect one chunk per message and join once; `item` avoids shadowing `info`.
-    parts = []
-    num_chars = 0
-    for item in history:
-        header = f"@{item['full_name']} " if item["full_name"] else ""
-        media = f"[{item['mtype']}]" if item["mtype"] != "text" else ""
-        body = f"{media}{item['text']}"
-        num_chars += len(body)  # only message bodies count towards reading time
-        parts.append(f"{header}{item['time']}\n{body}\n\n")
-    combination = "".join(parts)
-    num_tokens = count_tokens(combination)
-    reading_minutes = num_chars / READING_SPEED  # minutes
-    target_chat = kwargs.get("target_chat") or message.chat.id
-    reply_msg_id = kwargs.get("reply_msg_id", 0)
-    reply_parameters = get_reply_to(message.id, reply_msg_id)
-    with io.BytesIO(combination.encode("utf-8")) as f:
-        await client.send_document(
-            to_int(target_chat),
-            f,
-            file_name=file_name,
-            reply_parameters=reply_parameters,
-            caption=f"字符数: {num_chars}\nToken: {num_tokens}\n阅读时长: {reading_minutes:.1f}分钟",
-        )
src/config.py
@@ -19,7 +19,7 @@ TEXT_LENGTH = int(os.getenv("TEXT_LENGTH", "4096"))  # Maximum length of text me
 CAPTION_LENGTH = int(os.getenv("CAPTION_LENGTH", "1024"))  # 4096 for Premium user
 MAX_FILE_BYTES = int(os.getenv("MAX_FILE_BYTES", "2000")) * 1024 * 1024  # 4000 MB for Premium user
 ASR_MAX_DURATION = int(os.getenv("ASR_MAX_DURATION", "600"))
-MAX_MESSAGE_RETRIEVED = int(os.getenv("MAX_MESSAGE_RETRIEVED", "1000000"))  # Maximum number of messages to combine
+MAX_MESSAGE_RETRIEVED = int(os.getenv("MAX_MESSAGE_RETRIEVED", "1000000"))  # Maximum number of messages to retrieve
 MAX_MESSAGE_SUMMARY = int(os.getenv("MAX_MESSAGE_SUMMARY", "9999"))  # Maximum number of messages to summarize
 READING_SPEED = int(os.getenv("READING_SPEED", "300"))  # words per minute
 DAILY_MESSAGES = os.getenv("DAILY_MESSAGES", "{}")  # Useful for daily checkin for some services. Should be a json string: '{"chat-1": "msg-1", "chat-2": "msg-2"}'
@@ -34,7 +34,6 @@ class ENABLE:  # see fine-grained permission in `src/permission.py`
     AI_SUMMARY = os.getenv("ENABLE_AI_SUMMARY", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
     ASR = os.getenv("ENABLE_ASR", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
     AUDIO = os.getenv("ENABLE_AUDIO", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
-    COMBINATION = os.getenv("ENABLE_COMBINATION", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
     CRONTAB = os.getenv("ENABLE_CRONTAB", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
     DOUYIN = os.getenv("ENABLE_DOUYIN", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
     GPT = os.getenv("ENABLE_GPT", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
src/handler.py
@@ -17,7 +17,6 @@ from messages.parser import parse_msg
 from messages.sender import send2tg
 from messages.utils import equal_prefix, startswith_prefix
 from networking import match_social_media_link
-from others.combine_history import combine_history
 from others.download_external import download_url_in_message
 from others.extract_audio import extract_audio_file
 from others.raw_img_file import convert_raw_img_file
@@ -43,7 +42,6 @@ async def handle_utilities(
     ai: bool = True,
     asr: bool = True,
     audio: bool = True,
-    combine: bool = True,
     subtitle: bool = True,
     wget: bool = True,
     ocr: bool = True,
@@ -66,7 +64,6 @@ async def handle_utilities(
         ai (bool, optional): Enable GPT. Defaults to True.
         asr (bool, optional): Enable ASR. Defaults to True.
         audio (bool, optional): Enable Video -> Audio. Defaults to True.
-        combine (bool, optional): Enable History Combination. Defaults to True.
         subtitle (bool, optional): Enable YouTube subtitle. Defaults to True.
         wget (bool, optional): Enable WGET. Defaults to True.
         ocr (bool, optional): Enable OCR. Defaults to True.
@@ -83,8 +80,6 @@ async def handle_utilities(
         await voice_to_text(client, message, **kwargs)  # /asr
     if audio:
         await extract_audio_file(client, message, **kwargs)  # /audio
-    if combine:
-        await combine_history(client, message, **kwargs)  # /combine
     if subtitle:
         await get_subtitle(client, message, **kwargs)  # /subtitle
     if wget:
@@ -279,8 +274,6 @@ def get_social_media_help(chat_id: int | str, ctype: str, prefixes: list[str] |
         msg += f"\n🗣**语音转文字**: `{PREFIX.ASR}` 回复语音消息"
     if permission["audio"]:
         msg += f"\n🎧**提取音频或语音**: `{PREFIX.AUDIO}` `{PREFIX.VOICE}` 回复消息"
-    if permission["combine"]:
-        msg += f"\n💬**合并历史**: `{PREFIX.COMBINATION} #N` 合并最近N条对话历史"
     if permission["ocr"]:
         msg += f"\n🔤**图片转文字**: `{PREFIX.OCR}` 回复图片消息"
     if permission["price"]:
@@ -288,7 +281,7 @@ def get_social_media_help(chat_id: int | str, ctype: str, prefixes: list[str] |
     if permission["subtitle"]:
         msg += f"\n📃**提取字幕**: `{PREFIX.SUBTITLE}` + 油管链接 (或回复油管链接)"
     if permission["summary"] and permission["ai"]:  # summary depends on ai
-        msg += f"\n🤖**总结历史**: `{PREFIX.AI_SUMMARY} #N` AI总结最近N条对话历史"
+        msg += f"\n🤖**总结历史**: `{PREFIX.AI_SUMMARY}` AI总结历史聊天记录"
     if permission["wget"]:
         msg += f"\n⏬**下载文件**: `{PREFIX.WGET}` + URL"
 
src/permission.py
@@ -97,7 +97,6 @@ def check_service(cid: int | str, ctype: str) -> dict:
         "need_prefix": True,
         "asr": True,
         "audio": True,
-        "combine": True,
         "subtitle": True,
         "wget": True,
         "ocr": True,
@@ -140,8 +139,6 @@ def check_service(cid: int | str, ctype: str) -> dict:
         permission["asr"] = False
     if not ENABLE.AUDIO:
         permission["audio"] = False
-    if not ENABLE.COMBINATION:
-        permission["combine"] = False
     if not ENABLE.SUBTITLE:
         permission["subtitle"] = False
     if not ENABLE.WGET: