Commit 82aa5de
Changed files (7)
src
src/llm/response.py
@@ -12,7 +12,7 @@ from config import ENABLE, GPT
from llm.models import openrouter_hook
from llm.prompts import add_search_results_to_prompts
from llm.tools import add_tools, get_online_search_result
-from llm.utils import beautify_model_name, extract_reasoning
+from llm.utils import beautify_llm_response, beautify_model_name, extract_reasoning
from messages.progress import modify_progress
from utils import number_to_emoji
@@ -132,7 +132,11 @@ async def parse_response(config: dict, response: dict) -> dict[str, str]:
primary_model = glom(config, "completions.model", default="") or ""
used_model = glom(response, "model", default="") or ""
- response = {"content": content.strip(), "model": config["friendly_name"], "reasoning": reasoning.strip()}
+ response = {
+ "content": beautify_llm_response(content.strip()),
+ "model": config["friendly_name"],
+ "reasoning": reasoning.strip(),
+ }
if not (used_model in primary_model or primary_model in used_model):
# do not use `!=` to compare. (deepseek/deepseek-r1:free != deepseek/deepseek-r1, gpt-4o != gpt-4o-2024-07-18)
used_model = beautify_model_name(used_model)
src/llm/summary.py
@@ -1,12 +1,13 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
+from datetime import datetime, timedelta
+from zoneinfo import ZoneInfo
-from loguru import logger
from pyrogram.client import Client
from pyrogram.types import Message
-from config import ENABLE, GPT, MAX_MESSAGE_SUMMARY, PREFIX, cache
+from config import ENABLE, GPT, MAX_MESSAGE_SUMMARY, PREFIX, TZ, cache
from llm.models import get_model_config_with_contexts
from llm.prompts import refine_prompts
from llm.response import send_to_gpt
@@ -16,24 +17,39 @@ from messages.parser import parse_msg
from messages.progress import modify_progress
from messages.sender import send2tg
from messages.utils import equal_prefix, to_int
+from utils import nowdt
HELP = f"""🤖**GPT总结历史消息** (最多{MAX_MESSAGE_SUMMARY}条)
-当前模型: **{GPT.TEXT_MODEL_NAME}**
使用说明:
-1. `{PREFIX.AI_SUMMARY} + #N`
-GPT总结最近的N条历史消息
-2. `{PREFIX.AI_SUMMARY} + #N + @User`
-GPT总结最近只属于User的N条消息
+**1️⃣指定条目数**
+- `{PREFIX.AI_SUMMARY} #N`: 总结最近的N条历史消息
+- `{PREFIX.AI_SUMMARY} #N @User`: 总结最近只属于User的N条消息
-如果以 `{PREFIX.AI_SUMMARY} + #N` (或附加User) 回复消息M
-则总结消息M之前的N条消息文本 (包含M)
+示例:
+- `{PREFIX.AI_SUMMARY} #10`: 总结最近的10条历史消息
+- `{PREFIX.AI_SUMMARY} #20 @123456`: 总结最近UID为123456的20条消息
+- `{PREFIX.AI_SUMMARY} #20 @John`: 总结最近用户John(大小写均可)的20条消息
+**2️⃣指定最近时间段**
+- `{PREFIX.AI_SUMMARY} #interval`: 总结最近interval时段内的消息
+- `{PREFIX.AI_SUMMARY} #interval @User`: 总结最近interval时段内, 且只属于User的消息
示例:
-1. `{PREFIX.AI_SUMMARY} #10`: 总结最近的10条历史消息
-2. `{PREFIX.AI_SUMMARY} #20 @123456`: 总结最近UID为123456的20条消息
-3. `{PREFIX.AI_SUMMARY} #20 @John`: 总结最近用户John(大小写均可)的20条消息
-如果用户名中有空格, 请去除空格。例如: 想指定用户为John Doe请使用 `@JohnDoe`
+- `{PREFIX.AI_SUMMARY} #10m`: 总结最近10分钟内的消息
+- `{PREFIX.AI_SUMMARY} #2h`: 总结最近2小时内的消息
+- `{PREFIX.AI_SUMMARY} #1d`: 总结最近1天的消息
+- `{PREFIX.AI_SUMMARY} #2h @John`: 总结最近2小时内的消息, 且只属于John的消息
+
+**3️⃣ 指定具体时间范围**
+- `{PREFIX.AI_SUMMARY} #20240101123000`: 总结从2024-01-01 12:30:00开始到现在的消息
+- `{PREFIX.AI_SUMMARY} #20240101123000-20240101124000`: 总结从2024-01-01 12:30:00开始到2024-01-01 12:40:00的消息
+- `{PREFIX.AI_SUMMARY} #20240101123000 @User`: 总结从2024-01-01 12:30:00开始到现在的消息, 且只属于User的消息
+- `{PREFIX.AI_SUMMARY} #20240101123000-20240101124000 @User`: 总结从2024-01-01 12:30:00开始到2024-01-01 12:40:00的消息, 且只属于User的消息
+
+注意:
+- 用上述各种`{PREFIX.AI_SUMMARY}`命令回复消息M, 视为将截止时间设为消息M的发送时间
+- 如果用户名中有空格, 请去除空格。例如: 想指定用户为John Doe请使用 `@JohnDoe`
+- 3️⃣的时间格式中没有任何分隔符, 必须为YYYYMMDDHHMMSS (14位纯数字)
"""
@@ -54,41 +70,58 @@ async def ai_summary(client: Client, message: Message, **kwargs):
# get the number of messages to combine
info = parse_msg(message)
- num_history = 0
- if matched := re.match(r"^" + PREFIX.AI_SUMMARY + r"\s+#(\d+)\s+@(\w+)", info["text"]):
- num_history = int(matched.group(1))
- num_history = min(num_history, MAX_MESSAGE_SUMMARY)
- filter_user = str(matched.group(2))
- elif matched := re.match(r"^" + PREFIX.AI_SUMMARY + r"\s+#(\d+)", info["text"]):
- num_history = int(matched.group(1))
- num_history = min(num_history, MAX_MESSAGE_SUMMARY)
- filter_user = ""
- else:
- return
- # reply a message with /summary
- offset_id = info["mid"]
+ num_history = MAX_MESSAGE_SUMMARY
+ filter_user = ""
+ begin_time = datetime.fromtimestamp(0, tz=ZoneInfo(TZ))
+ end_time = nowdt(tz=TZ)
+ # reply to a message with /summary
+ offset_id = 0
if message.reply_to_message:
offset_id = message.reply_to_message.id + 1 # include the reply message
+ reply_info = parse_msg(message.reply_to_message)
+ end_time = reply_info["datetime"]
+
+    # 3️⃣ /summary #YYYYMMDDHHMMSS @user
+    # 4️⃣ /summary #YYYYMMDDHHMMSS-YYYYMMDDHHMMSS @user
+ if matched := re.match(r"^" + PREFIX.AI_SUMMARY + r"\s+#(\d{14})-?(\d{14})?(\s+)?(@\w+)?", info["text"]):
+ begin_time = datetime.strptime(matched.group(1), "%Y%m%d%H%M%S").replace(tzinfo=ZoneInfo(TZ))
+ end_time = datetime.strptime(matched.group(2) or end_time.strftime("%Y%m%d%H%M%S"), "%Y%m%d%H%M%S").replace(tzinfo=ZoneInfo(TZ))
+ filter_user = matched.group(4) or ""
+    # 2️⃣ /summary #interval @user (/summary #4h @user)
+ elif matched := re.match(r"^" + PREFIX.AI_SUMMARY + r"\s+#(\d+)([mMhHdD])(\s+)?(@\w+)?", info["text"]):
+ interval = int(matched.group(1))
+ unit = matched.group(2).lower()
+ filter_user = matched.group(4) or ""
+ if unit == "m":
+ begin_time = end_time - timedelta(minutes=interval)
+ elif unit == "h":
+ begin_time = end_time - timedelta(hours=interval)
+ elif unit == "d":
+ begin_time = end_time - timedelta(days=interval)
+    # 1️⃣ /summary #N @user
+ elif matched := re.match(r"^" + PREFIX.AI_SUMMARY + r"\s+#(\d+)(\s+)?(@\w+)?", info["text"]):
+ num_history = min(int(matched.group(1)), MAX_MESSAGE_SUMMARY)
+ filter_user = matched.group(3) or ""
+ else:
+ return
# set custom chat_id and message_id (useful for debug)
if matched := re.search(r"cid=(-?\w+)", info["text"], re.IGNORECASE):
info["cid"] = to_int(matched.group(1))
if matched := re.search(r"mid=(\d+)", info["text"], re.IGNORECASE):
- info["mid"] = int(matched.group(1))
- offset_id = info["mid"] + 1 # include this message
-
+ offset_id = int(matched.group(1)) + 1 # include this message
if kwargs.get("show_progress") and "progress" not in kwargs:
- res = await send2tg(client, message, texts=f"📝正在获取{num_history}条历史消息...", **kwargs)
+ res = await send2tg(client, message, texts=f"📝正在获取历史消息...\n⏩开始时间: {begin_time:%m-%d %H:%M:%S}\n⏯️结束时间: {end_time:%m-%d %H:%M:%S}", **kwargs)
kwargs["progress"] = res[0]
- history = await get_parsed_chat_history(client, info["cid"], offset_id, num_history, filter_user)
+ history = await get_parsed_chat_history(client, info["cid"], offset_id, num_history, begin_time, end_time, filter_user.removeprefix("@"))
+
+ # parse the history contexts
+ parsed = await get_contexts(history)
if not history:
- await send2tg(client, message, texts=f"最近{num_history}条消息中未找到符合条件的消息", **kwargs)
+ await send2tg(client, message, texts=f"{num_history}条历史消息中未找到符合条件的消息", **kwargs)
await modify_progress(del_status=True, **kwargs)
return
-
- # parse the history contexts
- parsed = await get_contexts(client, history, **kwargs)
contexts = refine_prompts(parsed["system_context"] + [{"role": "user", "content": parsed["user_context"]}])
sysmtem_tokens = count_tokens(contexts[0]["content"])
user_tokens = count_tokens(contexts[-1]["content"])
@@ -102,8 +135,10 @@ async def ai_summary(client: Client, message: Message, **kwargs):
summary_model_name = GPT.LONG_MODEL_NAME
max_tokens = int(GPT.LONG_MODEL_MAX_OUTPUT_LENGTH)
msg = f"🤖**{summary_model_name}**总结中...\n"
- msg += f"🔢有效消息条数: {len(parsed['user_context'])}\n"
- msg += f"🔠总Token数量: {total_tokens}"
+ msg += f"⏩开始时间: {begin_time:%m-%d %H:%M:%S}\n"
+ msg += f"⏯️结束时间: {end_time:%m-%d %H:%M:%S}\n"
+ msg += f"🔢有效消息: {len(parsed['user_context'])}\n"
+ msg += f"🔠总Token: {total_tokens}"
await modify_progress(text=msg, force_update=True, **kwargs)
config = get_model_config_with_contexts(model_type="text", contexts=contexts, force_model=summary_model, message_info=info)
@@ -112,15 +147,17 @@ async def ai_summary(client: Client, message: Message, **kwargs):
config["completions"]["max_completion_tokens"] = max_tokens
else:
config["completions"]["max_tokens"] = max_tokens
-
+ config["client"]["timeout"] = int(GPT.SUMMARY_TIMEOUT)
response = await send_to_gpt(config, **kwargs)
if texts := response.get("content"):
- logger.debug(response)
- await send2tg(client, message, texts=texts.strip("`"), **kwargs)
+ texts = texts.strip("`")
+ if reasoning := response.get("reasoning"):
+ texts = f"{reasoning}\n{texts}"
+ await send2tg(client, message, texts=f"🤖**{response['model']}**:\n{texts}", **kwargs)
await modify_progress(del_status=True, **kwargs)
-async def get_contexts(client: Client, history: list[dict], **kwargs) -> dict: # noqa: ARG001
+async def get_contexts(history: list[dict]) -> dict:
"""Get GPT contexts based on parsed chat history.
Currently, we only summarize text contents.
@@ -131,7 +168,17 @@ async def get_contexts(client: Client, history: list[dict], **kwargs) -> dict:
"content": [
{
"type": "text",
- "text": """总结在线休闲讨论组的聊天记录, 识别关键主题、争议话题以及重要观点。提供一个简明的总结, 保留原始意图和上下文。如有必要, 引用原始用户名和时间戳, 并使用清晰的语言。
+ "text": """总结以下网络聊天记录, 识别关键主题、争议话题以及重要观点。提供一个简明的总结, 保留原始意图和上下文。如有必要, 引用原始用户名和时间戳, 并使用清晰的语言。
+每一条消息的格式如下:
+{
+ "id": 消息ID, 按顺序递增,
+ "time": 消息发送时间,
+ "url": 消息链接,
+ "username": 消息发送者,
+ "content": 本条消息内容,
+ "reply_to_message": 回复消息的原始内容, 如果本消息并不回复其他消息, 则不存在该字段
+}
+
# 步骤
1. 阅读聊天记录: 仔细查看对话内容, 了解讨论的流程和上下文。
2. 识别关键主题: 提取整个聊天中讨论的主要话题。
@@ -145,16 +192,10 @@ async def get_contexts(client: Client, history: list[dict], **kwargs) -> dict:
# 输出格式
- 使用中文撰写总结。
- 简明扼要地总结聊天记录的内容。
-- 在必要时引用用户名和时间戳。
+- 在必要时引用用户名和时间。
- 保持清晰和简洁的表达。
-
-# 示例
-- 输入: [包含用户名和时间戳的聊天记录片段]
-- 输出:
- [10:23:30] Alice 提出关于气候变化的话题, 重点讨论其影响。
- [11:00:30] Bob 表示反对, 引用了相反的证据。
- [11:30:00] Charlie 提出了一个新的项目想法, 引起了大家的兴趣。
- [12:00:00] 大家讨论了项目的潜在挑战和机会。最终, 决定下次会议继续讨论这个项目。
+- 引用用户名时, 请使用 **username** 格式。如: **username**
+- 引用时间时, 请使用 [HH:MM:SS](url) 格式。如: [12:30:00](https://t.me/username/1234567890)
""",
}
],
@@ -169,28 +210,36 @@ async def get_contexts(client: Client, history: list[dict], **kwargs) -> dict:
continue
if info["text"]: # currently, we only include texts
+ if len(user_context) == 0:
+ begin_time = info["datetime"]
+ end_time = info["datetime"]
content = {
- "message_id": info["mid"],
+ "id": info["mid"],
"time": f"{info['datetime']:%H:%M:%S}",
+ "url": info["message_url"],
"username": info["full_name"],
"content": info["text"],
}
- if (reply_to_message_id := info.get("reply_to_message_id")) and (reply_msg_content := get_message_by_id(reply_to_message_id, history)):
+ if reply_msg_content := get_message_by_id(history, info.get("reply_to_message_id")):
content["reply_to_message"] = reply_msg_content
user_context.append({"type": "text", "text": str(content)})
-
- return {"system_context": system_context, "user_context": user_context}
+ if not user_context:
+ return {}
+ return {"system_context": system_context, "user_context": user_context, "begin_time": begin_time, "end_time": end_time}
-def get_message_by_id(message_id: int, history: list[dict]) -> dict:
+def get_message_by_id(history: list[dict], message_id: int | None = None) -> dict:
"""Get message by id."""
+ if not message_id:
+ return {}
info = next((info for info in history if info["mid"] == message_id), {})
if not info:
return {}
return {
- "message_id": info["mid"],
+ "id": info["mid"],
"time": f"{info['datetime']:%H:%M:%S}",
+ "url": info["message_url"],
"username": info["full_name"],
"content": info["text"],
}
src/llm/utils.py
@@ -7,6 +7,7 @@ import tiktoken
from loguru import logger
from config import DOWNLOAD_DIR, GPT
+from utils import remove_consecutive_newlines, remove_dash, remove_pound
BOT_TIPS = "回复以继续"
@@ -45,6 +46,8 @@ def llm_cleanup_files(messages: list[dict]):
def count_tokens(string: str, encoding_name: str | None = None) -> int:
"""Returns the number of tokens in a text string."""
+ if not string:
+ return 0
if encoding_name is None:
encoding_name = GPT.TOKEN_ENCODING
try:
@@ -80,6 +83,21 @@ def beautify_model_name(name: str) -> str:
return name.replace("gpt", "GPT").replace("gemini", "Gemini").replace("deepseek", "DeepSeek") # GPT-4o
+def beautify_llm_response(text: str) -> str:
+ """Beautify LLM response.
+
+ Args:
+ text: LLM response
+ Returns:
+ beautified LLM response
+ """
+ if not text:
+ return text
+ text = remove_pound(text)
+ text = remove_dash(text)
+ return remove_consecutive_newlines(text)
+
+
def extract_reasoning(text: str) -> tuple[str, str]:
"""Extract reasoning from text.
src/messages/chat_history.py
@@ -1,9 +1,12 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
+from datetime import datetime
+from zoneinfo import ZoneInfo
+
from pyrogram.client import Client
-from config import MAX_MESSAGE_COMBINATION, cache
+from config import MAX_MESSAGE_RETRIEVED, TZ, cache
from messages.parser import parse_msg
@@ -11,28 +14,37 @@ from messages.parser import parse_msg
async def get_parsed_chat_history(
client: Client,
chat_id: int | str,
- offset_id: int,
+ offset_id: int = 0,
num: int = 0,
+ begin_time: datetime | None = None,
+ end_time: datetime | None = None,
user: str = "",
) -> list[dict]:
    """Get given number of chat history from old to new in parsed json format.
If user is specified, number of messages from the user will be returned.
"""
- if num <= 0:
- return []
+ if begin_time is None:
+ begin_time = datetime.fromtimestamp(0, tz=ZoneInfo(TZ))
+ if end_time is None:
+ end_time = datetime.now(tz=ZoneInfo(TZ))
history = []
retrieved = 0
user = user.replace(" ", "").lower()
async for msg in client.get_chat_history(chat_id=chat_id, offset_id=offset_id): # type: ignore
+ # iterate messages from new to old
retrieved += 1
- if retrieved > MAX_MESSAGE_COMBINATION:
+ if retrieved > MAX_MESSAGE_RETRIEVED:
break
if len(history) >= num:
break
if msg.empty:
break
info = parse_msg(msg, silent=True)
+ if info["datetime"] < begin_time:
+ break
+ if info["datetime"] > end_time:
+ continue
if msg.reply_to_message_id:
info["reply_to_message_id"] = msg.reply_to_message_id
if not user:
src/others/combine_history.py
@@ -6,7 +6,7 @@ import re
from pyrogram.client import Client
from pyrogram.types import Message
-from config import ENABLE, MAX_MESSAGE_COMBINATION, PREFIX, READING_SPEED
+from config import ENABLE, MAX_MESSAGE_RETRIEVED, PREFIX, READING_SPEED
from llm.utils import count_tokens
from messages.chat_history import get_parsed_chat_history
from messages.parser import parse_msg
@@ -15,7 +15,7 @@ from messages.utils import equal_prefix, get_reply_to, startswith_prefix
from utils import to_int
HELP = f"""
-💬**合并对话历史** (最多{MAX_MESSAGE_COMBINATION}条)
+💬**合并对话历史** (最多{MAX_MESSAGE_RETRIEVED}条)
使用说明:
1. `{PREFIX.COMBINATION} + #N`
将最近的N条消息文本合并为txt文件
@@ -47,15 +47,13 @@ async def combine_history(client: Client, message: Message, **kwargs):
# get the number of messages to combine
info = parse_msg(message)
- num_history = 0
+ num_history = MAX_MESSAGE_RETRIEVED
if matched := re.match(r"^" + PREFIX.COMBINATION + r"\s+#(\d+)\s+@(\w+)", info["text"]):
- num_history = int(matched.group(1))
- num_history = min(num_history, MAX_MESSAGE_COMBINATION)
+ num_history = min(int(matched.group(1)), MAX_MESSAGE_RETRIEVED)
filter_user = str(matched.group(2))
file_name = f"最近{num_history}条{filter_user}的消息.txt"
elif matched := re.match(r"^" + PREFIX.COMBINATION + r"\s+#(\d+)", info["text"]):
- num_history = int(matched.group(1))
- num_history = min(num_history, MAX_MESSAGE_COMBINATION)
+ num_history = min(int(matched.group(1)), MAX_MESSAGE_RETRIEVED)
filter_user = ""
file_name = f"最近{num_history}条消息记录.txt"
else:
@@ -75,7 +73,7 @@ async def combine_history(client: Client, message: Message, **kwargs):
info["mid"] = int(matched.group(1))
offset_id = info["mid"] + 1 # include this message
- history = await get_parsed_chat_history(client, info["cid"], offset_id, num_history, filter_user)
+ history = await get_parsed_chat_history(client, info["cid"], offset_id, num_history, user=filter_user)
if not history:
await send2tg(client, message, texts=f"最近{num_history}条消息中未找到符合条件的消息", **kwargs)
return
src/config.py
@@ -18,7 +18,7 @@ TEXT_LENGTH = int(os.getenv("TEXT_LENGTH", "4096")) # Maximum length of text me
CAPTION_LENGTH = int(os.getenv("CAPTION_LENGTH", "1024")) # 4096 for Premium user
MAX_FILE_BYTES = int(os.getenv("MAX_FILE_BYTES", "2000")) * 1024 * 1024 # 4000 MB for Premium user
ASR_MAX_DURATION = int(os.getenv("ASR_MAX_DURATION", "600"))
-MAX_MESSAGE_COMBINATION = int(os.getenv("MAX_MESSAGE_COMBINATION", "5000")) # Maximum number of messages to combine
+MAX_MESSAGE_RETRIEVED = int(os.getenv("MAX_MESSAGE_RETRIEVED", "5000"))  # Maximum number of messages to retrieve
MAX_MESSAGE_SUMMARY = int(os.getenv("MAX_MESSAGE_SUMMARY", "5000"))  # Maximum number of messages to summarize
READING_SPEED = int(os.getenv("READING_SPEED", "300")) # words per minute
DAILY_MESSAGES = os.getenv("DAILY_MESSAGES", "{}") # Useful for daily checkin for some services. Should be a json string: '{"chat-1": "msg-1", "chat-2": "msg-2"}'
@@ -192,6 +192,7 @@ class GPT: # see `llm/README.md`
SUMMARY_MODEL_MAX_OUTPUT_LENGTH = os.getenv("GPT_SUMMARY_MODEL_MAX_OUTPUT_LENGTH", "8192") # 8K
SUMMARY_API_KEY = os.getenv("GPT_SUMMARY_API_KEY", "")
SUMMARY_BASE_URL = os.getenv("GPT_SUMMARY_BASE_URL", "https://api.openai.com/v1")
+ SUMMARY_TIMEOUT = os.getenv("GPT_SUMMARY_TIMEOUT", "600") # should be larger than default timeout
# long context model
LONG_MODEL = os.getenv("GPT_LONG_MODEL", "gemini-1.5-pro")
LONG_MODEL_NAME = os.getenv("GPT_LONG_MODEL_NAME", "Gemini-1.5-Pro")
src/utils.py
@@ -239,6 +239,32 @@ def match_urls(text: str) -> list[str]:
return [https_url(x[0]) for x in res]
+def remove_dash(text: str) -> str:
+ if not text:
+ return ""
+ while "---" in text:
+ text = text.replace("---", "")
+ while "--" in text:
+ text = text.replace("--", "")
+ return text
+
+
+def remove_pound(text: str) -> str:
+ if not text:
+ return ""
+ while "# " in text:
+ text = text.replace("# ", " ")
+ return text
+
+
+def remove_consecutive_newlines(text: str) -> str:
+ if not text:
+ return ""
+ while "\n\n\n" in text:
+ text = text.replace("\n\n\n", "\n\n")
+ return text
+
+
def is_supported_by_ytdlp(url: str) -> bool:
"""Check if this url is supported by ytdlp."""
if "t.me" in url: # tg link