Commit 82aa5de
Changed files (7)
src
src/llm/response.py
@@ -12,7 +12,7 @@ from config import ENABLE, GPT
from llm.models import openrouter_hook
from llm.prompts import add_search_results_to_prompts
from llm.tools import add_tools, get_online_search_result
-from llm.utils import beautify_model_name, extract_reasoning
+from llm.utils import beautify_llm_response, beautify_model_name, extract_reasoning
from messages.progress import modify_progress
from utils import number_to_emoji
@@ -132,7 +132,11 @@ async def parse_response(config: dict, response: dict) -> dict[str, str]:
primary_model = glom(config, "completions.model", default="") or ""
used_model = glom(response, "model", default="") or ""
- response = {"content": content.strip(), "model": config["friendly_name"], "reasoning": reasoning.strip()}
+ response = {
+ "content": beautify_llm_response(content.strip()),
+ "model": config["friendly_name"],
+ "reasoning": reasoning.strip(),
+ }
if not (used_model in primary_model or primary_model in used_model):
# do not use `!=` to compare. (deepseek/deepseek-r1:free != deepseek/deepseek-r1, gpt-4o != gpt-4o-2024-07-18)
used_model = beautify_model_name(used_model)
src/llm/summary.py
@@ -1,12 +1,13 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
+from datetime import datetime, timedelta
+from zoneinfo import ZoneInfo
-from loguru import logger
from pyrogram.client import Client
from pyrogram.types import Message
-from config import ENABLE, GPT, MAX_MESSAGE_SUMMARY, PREFIX, cache
+from config import ENABLE, GPT, MAX_MESSAGE_SUMMARY, PREFIX, TZ, cache
from llm.models import get_model_config_with_contexts
from llm.prompts import refine_prompts
from llm.response import send_to_gpt
@@ -16,24 +17,39 @@ from messages.parser import parse_msg
from messages.progress import modify_progress
from messages.sender import send2tg
from messages.utils import equal_prefix, to_int
+from utils import nowdt
HELP = f"""🤖**GPT总结历史消息** (最多{MAX_MESSAGE_SUMMARY}条)
-当前模型: **{GPT.TEXT_MODEL_NAME}**
使用说明:
-1. `{PREFIX.AI_SUMMARY} + #N`
-GPT总结最近的N条历史消息
-2. `{PREFIX.AI_SUMMARY} + #N + @User`
-GPT总结最近只属于User的N条消息
+**1️⃣指定条目数**
+- `{PREFIX.AI_SUMMARY} #N`: 总结最近的N条历史消息
+- `{PREFIX.AI_SUMMARY} #N @User`: 总结最近只属于User的N条消息
-如果以 `{PREFIX.AI_SUMMARY} + #N` (或附加User) 回复消息M
-则总结消息M之前的N条消息文本 (包含M)
+示例:
+- `{PREFIX.AI_SUMMARY} #10`: 总结最近的10条历史消息
+- `{PREFIX.AI_SUMMARY} #20 @123456`: 总结最近UID为123456的20条消息
+- `{PREFIX.AI_SUMMARY} #20 @John`: 总结最近用户John(大小写均可)的20条消息
+**2️⃣指定最近时间段**
+- `{PREFIX.AI_SUMMARY} #interval`: 总结最近interval时段内的消息
+- `{PREFIX.AI_SUMMARY} #interval @User`: 总结最近interval时段内, 且只属于User的消息
示例:
-1. `{PREFIX.AI_SUMMARY} #10`: 总结最近的10条历史消息
-2. `{PREFIX.AI_SUMMARY} #20 @123456`: 总结最近UID为123456的20条消息
-3. `{PREFIX.AI_SUMMARY} #20 @John`: 总结最近用户John(大小写均可)的20条消息
-如果用户名中有空格, 请去除空格。例如: 想指定用户为John Doe请使用 `@JohnDoe`
+- `{PREFIX.AI_SUMMARY} #10m`: 总结最近10分钟内的消息
+- `{PREFIX.AI_SUMMARY} #2h`: 总结最近2小时内的消息
+- `{PREFIX.AI_SUMMARY} #1d`: 总结最近1天的消息
+- `{PREFIX.AI_SUMMARY} #2h @John`: 总结最近2小时内的消息, 且只属于John的消息
+
+**3️⃣ 指定具体时间范围**
+- `{PREFIX.AI_SUMMARY} #20240101123000`: 总结从2024-01-01 12:30:00开始到现在的消息
+- `{PREFIX.AI_SUMMARY} #20240101123000-20240101124000`: 总结从2024-01-01 12:30:00开始到2024-01-01 12:40:00的消息
+- `{PREFIX.AI_SUMMARY} #20240101123000 @User`: 总结从2024-01-01 12:30:00开始到现在的消息, 且只属于User的消息
+- `{PREFIX.AI_SUMMARY} #20240101123000-20240101124000 @User`: 总结从2024-01-01 12:30:00开始到2024-01-01 12:40:00的消息, 且只属于User的消息
+
+注意:
+- 用上述各种`{PREFIX.AI_SUMMARY}`命令回复消息M, 视为将截止时间设为消息M的发送时间
+- 如果用户名中有空格, 请去除空格。例如: 想指定用户为John Doe请使用 `@JohnDoe`
+- 3️⃣的时间格式中没有任何分隔符, 必须为YYYYMMDDHHMMSS (14位纯数字)
"""
@@ -54,41 +70,58 @@ async def ai_summary(client: Client, message: Message, **kwargs):
# get the number of messages to combine
info = parse_msg(message)
- num_history = 0
- if matched := re.match(r"^" + PREFIX.AI_SUMMARY + r"\s+#(\d+)\s+@(\w+)", info["text"]):
- num_history = int(matched.group(1))
- num_history = min(num_history, MAX_MESSAGE_SUMMARY)
- filter_user = str(matched.group(2))
- elif matched := re.match(r"^" + PREFIX.AI_SUMMARY + r"\s+#(\d+)", info["text"]):
- num_history = int(matched.group(1))
- num_history = min(num_history, MAX_MESSAGE_SUMMARY)
- filter_user = ""
- else:
- return
- # reply a message with /summary
- offset_id = info["mid"]
+ num_history = MAX_MESSAGE_SUMMARY
+ filter_user = ""
+ begin_time = datetime.fromtimestamp(0, tz=ZoneInfo(TZ))
+ end_time = nowdt(tz=TZ)
+ # reply to a message with /summary
+ offset_id = 0
if message.reply_to_message:
offset_id = message.reply_to_message.id + 1 # include the reply message
+ reply_info = parse_msg(message.reply_to_message)
+ end_time = reply_info["datetime"]
+
+    # 3️⃣ /summary #YYYYMMDDHHMMSS @user
+    # 4️⃣ /summary #YYYYMMDDHHMMSS-YYYYMMDDHHMMSS @user
+ if matched := re.match(r"^" + PREFIX.AI_SUMMARY + r"\s+#(\d{14})-?(\d{14})?(\s+)?(@\w+)?", info["text"]):
+ begin_time = datetime.strptime(matched.group(1), "%Y%m%d%H%M%S").replace(tzinfo=ZoneInfo(TZ))
+ end_time = datetime.strptime(matched.group(2) or end_time.strftime("%Y%m%d%H%M%S"), "%Y%m%d%H%M%S").replace(tzinfo=ZoneInfo(TZ))
+ filter_user = matched.group(4) or ""
+    # 2️⃣ /summary #interval @user (/summary #4h @user)
+ elif matched := re.match(r"^" + PREFIX.AI_SUMMARY + r"\s+#(\d+)([mMhHdD])(\s+)?(@\w+)?", info["text"]):
+ interval = int(matched.group(1))
+ unit = matched.group(2).lower()
+ filter_user = matched.group(4) or ""
+ if unit == "m":
+ begin_time = end_time - timedelta(minutes=interval)
+ elif unit == "h":
+ begin_time = end_time - timedelta(hours=interval)
+ elif unit == "d":
+ begin_time = end_time - timedelta(days=interval)
+    # 1️⃣ /summary #N @user
+ elif matched := re.match(r"^" + PREFIX.AI_SUMMARY + r"\s+#(\d+)(\s+)?(@\w+)?", info["text"]):
+ num_history = min(int(matched.group(1)), MAX_MESSAGE_SUMMARY)
+ filter_user = matched.group(3) or ""
+ else:
+ return
# set custom chat_id and message_id (useful for debug)
if matched := re.search(r"cid=(-?\w+)", info["text"], re.IGNORECASE):
info["cid"] = to_int(matched.group(1))
if matched := re.search(r"mid=(\d+)", info["text"], re.IGNORECASE):
- info["mid"] = int(matched.group(1))
- offset_id = info["mid"] + 1 # include this message
-
+ offset_id = int(matched.group(1)) + 1 # include this message
if kwargs.get("show_progress") and "progress" not in kwargs:
- res = await send2tg(client, message, texts=f"📝正在获取{num_history}条历史消息...", **kwargs)
+ res = await send2tg(client, message, texts=f"📝正在获取历史消息...\n⏩开始时间: {begin_time:%m-%d %H:%M:%S}\n⏯️结束时间: {end_time:%m-%d %H:%M:%S}", **kwargs)
kwargs["progress"] = res[0]
- history = await get_parsed_chat_history(client, info["cid"], offset_id, num_history, filter_user)
+ history = await get_parsed_chat_history(client, info["cid"], offset_id, num_history, begin_time, end_time, filter_user.removeprefix("@"))
+
+ # parse the history contexts
+ parsed = await get_contexts(history)
if not history:
- await send2tg(client, message, texts=f"最近{num_history}条消息中未找到符合条件的消息", **kwargs)
+ await send2tg(client, message, texts=f"{num_history}条历史消息中未找到符合条件的消息", **kwargs)
await modify_progress(del_status=True, **kwargs)
return
-
- # parse the history contexts
- parsed = await get_contexts(client, history, **kwargs)
contexts = refine_prompts(parsed["system_context"] + [{"role": "user", "content": parsed["user_context"]}])
sysmtem_tokens = count_tokens(contexts[0]["content"])
user_tokens = count_tokens(contexts[-1]["content"])
@@ -102,8 +135,10 @@ async def ai_summary(client: Client, message: Message, **kwargs):
summary_model_name = GPT.LONG_MODEL_NAME
max_tokens = int(GPT.LONG_MODEL_MAX_OUTPUT_LENGTH)
msg = f"🤖**{summary_model_name}**总结中...\n"
- msg += f"🔢有效消息条数: {len(parsed['user_context'])}\n"
- msg += f"🔠总Token数量: {total_tokens}"
+ msg += f"⏩开始时间: {begin_time:%m-%d %H:%M:%S}\n"
+ msg += f"⏯️结束时间: {end_time:%m-%d %H:%M:%S}\n"
+ msg += f"🔢有效消息: {len(parsed['user_context'])}\n"
+ msg += f"🔠总Token: {total_tokens}"
await modify_progress(text=msg, force_update=True, **kwargs)
config = get_model_config_with_contexts(model_type="text", contexts=contexts, force_model=summary_model, message_info=info)
@@ -112,15 +147,17 @@ async def ai_summary(client: Client, message: Message, **kwargs):
config["completions"]["max_completion_tokens"] = max_tokens
else:
config["completions"]["max_tokens"] = max_tokens
-
+ config["client"]["timeout"] = int(GPT.SUMMARY_TIMEOUT)
response = await send_to_gpt(config, **kwargs)
if texts := response.get("content"):
- logger.debug(response)
- await send2tg(client, message, texts=texts.strip("`"), **kwargs)
+ texts = texts.strip("`")
+ if reasoning := response.get("reasoning"):
+ texts = f"{reasoning}\n{texts}"
+ await send2tg(client, message, texts=f"🤖**{response['model']}**:\n{texts}", **kwargs)
await modify_progress(del_status=True, **kwargs)
-async def get_contexts(client: Client, history: list[dict], **kwargs) -> dict: # noqa: ARG001
+async def get_contexts(history: list[dict]) -> dict:
"""Get GPT contexts based on parsed chat history.
Currently, we only summarize text contents.
@@ -131,7 +168,17 @@ async def get_contexts(client: Client, history: list[dict], **kwargs) -> dict:
"content": [
{
"type": "text",
- "text": """总结在线休闲讨论组的聊天记录, 识别关键主题、争议话题以及重要观点。提供一个简明的总结, 保留原始意图和上下文。如有必要, 引用原始用户名和时间戳, 并使用清晰的语言。
+ "text": """总结以下网络聊天记录, 识别关键主题、争议话题以及重要观点。提供一个简明的总结, 保留原始意图和上下文。如有必要, 引用原始用户名和时间戳, 并使用清晰的语言。
+每一条消息的格式如下:
+{
+ "id": 消息ID, 按顺序递增,
+ "time": 消息发送时间,
+ "url": 消息链接,
+ "username": 消息发送者,
+ "content": 本条消息内容,
+ "reply_to_message": 回复消息的原始内容, 如果本消息并不回复其他消息, 则不存在该字段
+}
+
# 步骤
1. 阅读聊天记录: 仔细查看对话内容, 了解讨论的流程和上下文。
2. 识别关键主题: 提取整个聊天中讨论的主要话题。
@@ -145,16 +192,10 @@ async def get_contexts(client: Client, history: list[dict], **kwargs) -> dict:
# 输出格式
- 使用中文撰写总结。
- 简明扼要地总结聊天记录的内容。
-- 在必要时引用用户名和时间戳。
+- 在必要时引用用户名和时间。
- 保持清晰和简洁的表达。
-
-# 示例
-- 输入: [包含用户名和时间戳的聊天记录片段]
-- 输出:
- [10:23:30] Alice 提出关于气候变化的话题, 重点讨论其影响。
- [11:00:30] Bob 表示反对, 引用了相反的证据。
- [11:30:00] Charlie 提出了一个新的项目想法, 引起了大家的兴趣。
- [12:00:00] 大家讨论了项目的潜在挑战和机会。最终, 决定下次会议继续讨论这个项目。
+- 引用用户名时, 请使用 **username** 格式。如: **username**
+- 引用时间时, 请使用 [HH:MM:SS](url) 格式。如: [12:30:00](https://t.me/username/1234567890)
""",
}
],
@@ -169,28 +210,36 @@ async def get_contexts(client: Client, history: list[dict], **kwargs) -> dict:
continue
if info["text"]: # currently, we only include texts
+ if len(user_context) == 0:
+ begin_time = info["datetime"]
+ end_time = info["datetime"]
content = {
- "message_id": info["mid"],
+ "id": info["mid"],
"time": f"{info['datetime']:%H:%M:%S}",
+ "url": info["message_url"],
"username": info["full_name"],
"content": info["text"],
}
- if (reply_to_message_id := info.get("reply_to_message_id")) and (reply_msg_content := get_message_by_id(reply_to_message_id, history)):
+ if reply_msg_content := get_message_by_id(history, info.get("reply_to_message_id")):
content["reply_to_message"] = reply_msg_content
user_context.append({"type": "text", "text": str(content)})
-
- return {"system_context": system_context, "user_context": user_context}
+ if not user_context:
+ return {}
+ return {"system_context": system_context, "user_context": user_context, "begin_time": begin_time, "end_time": end_time}
-def get_message_by_id(message_id: int, history: list[dict]) -> dict:
+def get_message_by_id(history: list[dict], message_id: int | None = None) -> dict:
"""Get message by id."""
+ if not message_id:
+ return {}
info = next((info for info in history if info["mid"] == message_id), {})
if not info:
return {}
return {
- "message_id": info["mid"],
+ "id": info["mid"],
"time": f"{info['datetime']:%H:%M:%S}",
+ "url": info["message_url"],
"username": info["full_name"],
"content": info["text"],
}
src/llm/utils.py
@@ -7,6 +7,7 @@ import tiktoken
from loguru import logger
from config import DOWNLOAD_DIR, GPT
+from utils import remove_consecutive_newlines, remove_dash, remove_pound
BOT_TIPS = "回复以继续"
@@ -45,6 +46,8 @@ def llm_cleanup_files(messages: list[dict]):
def count_tokens(string: str, encoding_name: str | None = None) -> int:
"""Returns the number of tokens in a text string."""
+ if not string:
+ return 0
if encoding_name is None:
encoding_name = GPT.TOKEN_ENCODING
try:
@@ -80,6 +83,21 @@ def beautify_model_name(name: str) -> str:
return name.replace("gpt", "GPT").replace("gemini", "Gemini").replace("deepseek", "DeepSeek") # GPT-4o
+def beautify_llm_response(text: str) -> str:
+ """Beautify LLM response.
+
+ Args:
+ text: LLM response
+ Returns:
+ beautified LLM response
+ """
+ if not text:
+ return text
+ text = remove_pound(text)
+ text = remove_dash(text)
+ return remove_consecutive_newlines(text)
+
+
def extract_reasoning(text: str) -> tuple[str, str]:
"""Extract reasoning from text.
src/messages/chat_history.py
@@ -1,9 +1,12 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
+from datetime import datetime
+from zoneinfo import ZoneInfo
+
from pyrogram.client import Client
-from config import MAX_MESSAGE_COMBINATION, cache
+from config import MAX_MESSAGE_RETRIEVED, TZ, cache
from messages.parser import parse_msg
@@ -11,28 +14,37 @@ from messages.parser import parse_msg
async def get_parsed_chat_history(
client: Client,
chat_id: int | str,
- offset_id: int,
+ offset_id: int = 0,
num: int = 0,
+ begin_time: datetime | None = None,
+ end_time: datetime | None = None,
user: str = "",
) -> list[dict]:
    """Get given number of chat history from old to new in parsed json format.
If user is specified, number of messages from the user will be returned.
"""
- if num <= 0:
- return []
+ if begin_time is None:
+ begin_time = datetime.fromtimestamp(0, tz=ZoneInfo(TZ))
+ if end_time is None:
+ end_time = datetime.now(tz=ZoneInfo(TZ))
history = []
retrieved = 0
user = user.replace(" ", "").lower()
async for msg in client.get_chat_history(chat_id=chat_id, offset_id=offset_id): # type: ignore
+ # iterate messages from new to old
retrieved += 1
- if retrieved > MAX_MESSAGE_COMBINATION:
+ if retrieved > MAX_MESSAGE_RETRIEVED:
break
if len(history) >= num:
break
if msg.empty:
break
info = parse_msg(msg, silent=True)
+ if info["datetime"] < begin_time:
+ break
+ if info["datetime"] > end_time:
+ continue
if msg.reply_to_message_id:
info["reply_to_message_id"] = msg.reply_to_message_id
if not user:
src/others/combine_history.py
@@ -6,7 +6,7 @@ import re
from pyrogram.client import Client
from pyrogram.types import Message
-from config import ENABLE, MAX_MESSAGE_COMBINATION, PREFIX, READING_SPEED
+from config import ENABLE, MAX_MESSAGE_RETRIEVED, PREFIX, READING_SPEED
from llm.utils import count_tokens
from messages.chat_history import get_parsed_chat_history
from messages.parser import parse_msg
@@ -15,7 +15,7 @@ from messages.utils import equal_prefix, get_reply_to, startswith_prefix
from utils import to_int
HELP = f"""
-💬**合并对话历史** (最多{MAX_MESSAGE_COMBINATION}条)
+💬**合并对话历史** (最多{MAX_MESSAGE_RETRIEVED}条)
使用说明:
1. `{PREFIX.COMBINATION} + #N`
将最近的N条消息文本合并为txt文件
@@ -47,15 +47,13 @@ async def combine_history(client: Client, message: Message, **kwargs):
# get the number of messages to combine
info = parse_msg(message)
- num_history = 0
+ num_history = MAX_MESSAGE_RETRIEVED
if matched := re.match(r"^" + PREFIX.COMBINATION + r"\s+#(\d+)\s+@(\w+)", info["text"]):
- num_history = int(matched.group(1))
- num_history = min(num_history, MAX_MESSAGE_COMBINATION)
+ num_history = min(int(matched.group(1)), MAX_MESSAGE_RETRIEVED)
filter_user = str(matched.group(2))
file_name = f"最近{num_history}条{filter_user}的消息.txt"
elif matched := re.match(r"^" + PREFIX.COMBINATION + r"\s+#(\d+)", info["text"]):
- num_history = int(matched.group(1))
- num_history = min(num_history, MAX_MESSAGE_COMBINATION)
+ num_history = min(int(matched.group(1)), MAX_MESSAGE_RETRIEVED)
filter_user = ""
file_name = f"最近{num_history}条消息记录.txt"
else:
@@ -75,7 +73,7 @@ async def combine_history(client: Client, message: Message, **kwargs):
info["mid"] = int(matched.group(1))
offset_id = info["mid"] + 1 # include this message
- history = await get_parsed_chat_history(client, info["cid"], offset_id, num_history, filter_user)
+ history = await get_parsed_chat_history(client, info["cid"], offset_id, num_history, user=filter_user)
if not history:
await send2tg(client, message, texts=f"最近{num_history}条消息中未找到符合条件的消息", **kwargs)
return
src/config.py
@@ -18,7 +18,7 @@ TEXT_LENGTH = int(os.getenv("TEXT_LENGTH", "4096")) # Maximum length of text me
CAPTION_LENGTH = int(os.getenv("CAPTION_LENGTH", "1024")) # 4096 for Premium user
MAX_FILE_BYTES = int(os.getenv("MAX_FILE_BYTES", "2000")) * 1024 * 1024 # 4000 MB for Premium user
ASR_MAX_DURATION = int(os.getenv("ASR_MAX_DURATION", "600"))
-MAX_MESSAGE_COMBINATION = int(os.getenv("MAX_MESSAGE_COMBINATION", "5000")) # Maximum number of messages to combine
+MAX_MESSAGE_RETRIEVED = int(os.getenv("MAX_MESSAGE_RETRIEVED", "5000"))  # Maximum number of messages to retrieve
MAX_MESSAGE_SUMMARY = int(os.getenv("MAX_MESSAGE_SUMMARY", "5000"))  # Maximum number of messages to summarize
READING_SPEED = int(os.getenv("READING_SPEED", "300")) # words per minute
DAILY_MESSAGES = os.getenv("DAILY_MESSAGES", "{}") # Useful for daily checkin for some services. Should be a json string: '{"chat-1": "msg-1", "chat-2": "msg-2"}'
@@ -192,6 +192,7 @@ class GPT: # see `llm/README.md`
SUMMARY_MODEL_MAX_OUTPUT_LENGTH = os.getenv("GPT_SUMMARY_MODEL_MAX_OUTPUT_LENGTH", "8192") # 8K
SUMMARY_API_KEY = os.getenv("GPT_SUMMARY_API_KEY", "")
SUMMARY_BASE_URL = os.getenv("GPT_SUMMARY_BASE_URL", "https://api.openai.com/v1")
+ SUMMARY_TIMEOUT = os.getenv("GPT_SUMMARY_TIMEOUT", "600") # should be larger than default timeout
# long context model
LONG_MODEL = os.getenv("GPT_LONG_MODEL", "gemini-1.5-pro")
LONG_MODEL_NAME = os.getenv("GPT_LONG_MODEL_NAME", "Gemini-1.5-Pro")
src/utils.py
@@ -239,6 +239,32 @@ def match_urls(text: str) -> list[str]:
return [https_url(x[0]) for x in res]
+def remove_dash(text: str) -> str:
+ if not text:
+ return ""
+ while "---" in text:
+ text = text.replace("---", "")
+ while "--" in text:
+ text = text.replace("--", "")
+ return text
+
+
+def remove_pound(text: str) -> str:
+ if not text:
+ return ""
+ while "# " in text:
+ text = text.replace("# ", " ")
+ return text
+
+
+def remove_consecutive_newlines(text: str) -> str:
+ if not text:
+ return ""
+ while "\n\n\n" in text:
+ text = text.replace("\n\n\n", "\n\n")
+ return text
+
+
def is_supported_by_ytdlp(url: str) -> bool:
"""Check if this url is supported by ytdlp."""
if "t.me" in url: # tg link