Commit 4bb9274
Changed files (4)
src/llm/contexts.py
@@ -61,9 +61,9 @@ async def single_context(client: Client, message: Message) -> dict:
{
"role": "user or assistant",
"content": [
- {"type": "image_url", "image_url": {"url": "https://server.com/dir/image.jpg"}},
- {"type": "video_url", "video_url": {"url": "https://server.com/dir/video.mp4"}}, # 智谱
- # {"type": "video", "video": ["https://server.com/dir/1.jpg","https://server.com/dir/2.jpg","https://server.com/dir/3.jpg"]}, # 千问
+ {"type": "text", "text": "caption this img"},
+ {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,base64_image"}},
+ {"type": "image_url", "image_url": {"url": "https://server.com/dir/image.jpg"}},
],
}
"""
@@ -84,9 +84,8 @@ async def single_context(client: Client, message: Message) -> dict:
if info["mtype"] not in ["text", "photo", "voice", "video", "document"]:
return {}
- # has media
messages = await client.get_media_group(message.chat.id, message.id) if message.media_group_id else [message]
- media = []
+ contexts = []
for msg in messages:
info = parse_msg(msg, silent=True)
msg_text = clean_text(info["text"])
@@ -94,12 +93,12 @@ async def single_context(client: Client, message: Message) -> dict:
if GPT.MEDIA_FORMAT == "base64":
if info["mtype"] == "photo":
res = await base64_media(client, msg)
- media.append({"type": "image_url", "image_url": {"url": f"data:image/{res['ext']};base64,{res['base64']}"}})
+ contexts.append({"type": "image_url", "image_url": {"url": f"data:image/{res['ext']};base64,{res['base64']}"}})
# elif info["mtype"] == "video":
# media.append({"type": "video_url", "video_url": {"url": b64}})
elif info["mtype"] == "document" and info["mime_type"] == "text/plain":
res = await base64_media(client, msg)
- media.append(
+ contexts.append(
{
"type": "text",
"text": f"[from user]: {info['full_name']}\n[file name]: {info['file_name']}\n[file content begin]\n{res['value'].strip()}\n[file content end]",
@@ -109,11 +108,11 @@ async def single_context(client: Client, message: Message) -> dict:
path: str = await client.download_media(msg) # type: ignore
logger.debug(f"Downloaded GPT media: {path}")
if info["mtype"] == "photo":
- media.append({"type": "image_url", "image_url": {"url": f"{GPT.MEDIA_SERVER}/{Path(path).name}"}})
+ contexts.append({"type": "image_url", "image_url": {"url": f"{GPT.MEDIA_SERVER}/{Path(path).name}"}})
# elif info["mtype"] == "video":
# media.append({"type": "video_url", "video_url": {"url": f"{GPT.MEDIA_SERVER}/{Path(path).name}"}})
elif info["mtype"] == "document" and info["mime_type"] == "text/plain":
- media.append(
+ contexts.append(
{
"type": "text",
"text": f"[from user]: {info['full_name']}\n[file name]: {info['file_name']}\n[file content begin]\n{Path(path).read_text().strip()}\n[file content end]",
@@ -121,11 +120,14 @@ async def single_context(client: Client, message: Message) -> dict:
)
Path(path).unlink(missing_ok=True)
if msg_text:
- media.append({"type": "text", "text": f"[from user]: {info['full_name']}\n[message begin]\n{msg_text}\n[message end]"})
+ if role == "user":
+ contexts.append({"type": "text", "text": f"[from user]: {info['full_name']}\n[message begin]\n{msg_text}\n[message end]"})
+ else:
+ contexts.append({"type": "text", "text": msg_text})
except Exception as e:
logger.warning(f"Download media from message failed: {e}")
continue
- return {"role": role, "content": media}
+ return {"role": role, "content": contexts}
async def base64_media(client: Client, message: Message) -> dict:
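Putting the renames together: a user message carrying one photo plus a caption would now produce a context roughly like the dict below (values are illustrative). Note that per the role check above, assistant messages get bare msg_text without the [from user] wrapper.

# Illustrative output of single_context for a user photo + caption
# (base64 payload and user name are placeholders):
{
    "role": "user",
    "content": [
        {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}},
        {"type": "text", "text": "[from user]: Alice\n[message begin]\ncaption this img\n[message end]"},
    ],
}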
src/llm/gpt.py
@@ -7,7 +7,7 @@ from pyrogram.types import Message
from config import GPT, PREFIX, TEXT_LENGTH, cache
from llm.contexts import get_conversation_contexts, get_conversations
-from llm.models import get_model_config_with_contexts, get_model_type
+from llm.models import get_context_type, get_model_config_with_contexts
from llm.response import send_to_gpt
from llm.response_stream import send_to_gpt_stream
from llm.tools import merge_tools_response
@@ -31,12 +31,14 @@ HELP = f"""🤖**GPT对话**
使用说明:
1. 在 `{PREFIX.GPT}` 后接提示词即可与GPT对话
2. 以 `{PREFIX.GPT}` 回复消息可将其加入上下文
-3. 暂不支持音频模型, 可以先用 `{PREFIX.ASR}` 命令转为文字后再使用 `{PREFIX.GPT}`
+3. 暂不支持视频/音频模型, 可以先用 `{PREFIX.ASR}` 命令转为文字后再使用 `{PREFIX.GPT}`
"""
def is_gpt_conversation(message: Message) -> bool:
info = parse_msg(message)
+ if info["is_bot"]: # do not process bot message
+ return False
if startswith_prefix(info["text"], prefix=[PREFIX.GPT, "/gpt", "/gemini", "/ds", "/qwen", "/doubao"]):
return True
# is replying to gpt-bot response message?
@@ -45,7 +47,8 @@ def is_gpt_conversation(message: Message) -> bool:
reply_msg = message.reply_to_message
reply_info = parse_msg(reply_msg, silent=True)
- return reply_info["text"].startswith("🤖")
+ model_names = [GPT.OPENAI_MODEL_NAME, GPT.GEMINI_MODEL_NAME, GPT.DEEPSEEK_MODEL_NAME, GPT.QWEN_MODEL_NAME, GPT.DOUBAO_MODEL_NAME]
+ return startswith_prefix(reply_info["text"], prefix=[f"🤖{x}".lower() for x in model_names])
@cache.memoize(ttl=60)
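The stricter reply check above relies on the project's startswith_prefix helper; assuming it performs a case-insensitive prefix match (a hypothetical stand-in below, not the repo's code), the logic is roughly:

def startswith_prefix(text: str, prefix: list[str]) -> bool:
    # Hypothetical equivalent of the repo's helper: case-insensitive prefix test.
    # The call site passes the 🤖-prefixed model names already lowercased.
    lowered = text.lower()
    return any(lowered.startswith(p) for p in prefix)

# A reply whose text starts with "🤖GPT-4o" (any case) now counts as a GPT
# conversation; a bare "🤖" from some unrelated bot no longer does.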
@@ -69,23 +72,27 @@ async def gpt_response(client: Client, message: Message, *, gpt_stream: bool = G
# /gpt = OpenAI, /gemini = Gemini, /ds = DeepSeek, /qwen = Qwen, /doubao = Doubao
force_model = "N/A"
- if startswith_prefix(info["text"], prefix=["/gpt"]):
+ reply_text = ""
+ if message.reply_to_message:
+ reply_info = parse_msg(message.reply_to_message, silent=True)
+ reply_text = reply_info["text"]
+ if startswith_prefix(info["text"], prefix=["/gpt"]) or reply_text.startswith(f"🤖{GPT.OPENAI_MODEL_NAME}"):
force_model = GPT.OPENAI_MODEL
if not GPT.OPENAI_API_KEY:
return await send2tg(client, message, texts=f"⚠️GPT未配置API Key, 请尝试其他命令\n\n{HELP}", **kwargs)
- elif startswith_prefix(info["text"], prefix=["/gemini"]):
+ elif startswith_prefix(info["text"], prefix=["/gemini"]) or reply_text.startswith(f"🤖{GPT.GEMINI_MODEL_NAME}"):
force_model = GPT.GEMINI_MODEL
if not GPT.GEMINI_API_KEY:
return await send2tg(client, message, texts=f"⚠️Gemini未配置API Key, 请尝试其他命令\n\n{HELP}", **kwargs)
- elif startswith_prefix(info["text"], prefix=["/ds"]):
+ elif startswith_prefix(info["text"], prefix=["/ds"]) or reply_text.startswith(f"🤖{GPT.DEEPSEEK_MODEL_NAME}"):
force_model = GPT.DEEPSEEK_MODEL
if not GPT.DEEPSEEK_API_KEY:
return await send2tg(client, message, texts=f"⚠️DeepSeek未配置API Key, 请尝试其他命令\n\n{HELP}", **kwargs)
- elif startswith_prefix(info["text"], prefix=["/qwen"]):
+ elif startswith_prefix(info["text"], prefix=["/qwen"]) or reply_text.startswith(f"🤖{GPT.QWEN_MODEL_NAME}"):
force_model = GPT.QWEN_MODEL
if not GPT.QWEN_API_KEY:
return await send2tg(client, message, texts=f"⚠️通义千问未配置API Key, 请尝试其他命令\n\n{HELP}", **kwargs)
- elif startswith_prefix(info["text"], prefix=["/doubao"]):
+ elif startswith_prefix(info["text"], prefix=["/doubao"]) or reply_text.startswith(f"🤖{GPT.DOUBAO_MODEL_NAME}"):
force_model = GPT.DOUBAO_MODEL
if not GPT.DOUBAO_API_KEY:
return await send2tg(client, message, texts=f"⚠️豆包未配置API Key, 请尝试其他命令\n\n{HELP}", **kwargs)
@@ -96,17 +103,15 @@ async def gpt_response(client: Client, message: Message, *, gpt_stream: bool = G
return
cache.set(f"gpt-{info['cid']}-{media_group_id}", "1", ttl=120)
conversations = get_conversations(message)
- model_type = get_model_type(conversations)
- if model_type.startswith("ERROR"):
- logger.error(model_type)
- await send2tg(client, message, texts=model_type, **kwargs)
- return
+ context_type = get_context_type(conversations)
contexts = await get_conversation_contexts(client, conversations)
- config = get_model_config_with_contexts(model_type, contexts, force_model, info)
+ config = get_model_config_with_contexts(context_type["type"], contexts, force_model, info)
msg = f"🤖**{config['friendly_name']}**: 思考中..."
status_msg = (await send2tg(client, message, texts=msg, **kwargs))[0]
kwargs["progress"] = status_msg
-
+ if context_type.get("error"):
+ logger.warning(context_type["error"])
+ await modify_progress(message=status_msg, text=context_type["error"], force_update=True, **kwargs)
config, response = await merge_tools_response(config, **kwargs)
# skip sending a new request if tool_model is the same as the current model
if response and config["completions"]["model"] == GPT.TOOLS_MODEL and response.get("content"):
src/llm/models.py
@@ -5,28 +5,28 @@ from typing import Any
from openai import DefaultAsyncHttpxClient
from pyrogram.types import Message
-from config import GPT, PROXY
+from config import GPT, PREFIX, PROXY
from llm.prompts import refine_prompts
from messages.parser import parse_msg
from utils import unicode_to_ascii
-def get_model_type(conversations: list[Message]) -> str:
+def get_context_type(conversations: list[Message]) -> dict:
"""Get model type based on conversation messages."""
- has_image = False
has_video = False
- model_type = "text"
+ has_audio = False
+ res = {"type": "text"}
for message in conversations:
info = parse_msg(message, silent=True)
if info["mtype"] == "photo":
- model_type = "image"
- has_image = True
- # if info["mtype"] == "video": # disable video
- # model_type = "video"
- # has_video = True
- if has_image and has_video:
- model_type = "ERROR: this conversation have both image and video."
- return model_type
+ res["type"] = "image"
+ if info["mtype"] == "video":
+ has_video = True
+ if info["mtype"] == "audio":
+ has_audio = True
+ if has_audio or has_video:
+ res["error"] = f"⚠️暂不支持视频/音频模型, 已忽略上下文中的视频/音频消息\n可以先用 `{PREFIX.ASR}` 命令转为文字后再使用 `{PREFIX.GPT}`"
+ return res
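Concretely, the new return shape separates the routing signal from the warning, and callers keep going after surfacing it (this mirrors the gpt.py hunk above; values in the comments are illustrative):

# Illustrative return values:
#   text-only conversation          -> {"type": "text"}
#   conversation containing a photo -> {"type": "image"}
#   plus audio/video anywhere       -> same dict with an extra "error" note
context_type = get_context_type(conversations)
if context_type.get("error"):
    logger.warning(context_type["error"])  # non-fatal; audio/video is skipped
config = get_model_config_with_contexts(context_type["type"], contexts, force_model, info)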
def get_model_config_with_contexts(model_type: str, contexts: list[dict], force_model: str = "N/A", message_info: dict | None = None) -> dict:
@@ -39,55 +39,11 @@ def get_model_config_with_contexts(model_type: str, contexts: list[dict], force_
"content": [
{"type": "text", "text": "text"},
{"type": "image_url", "image_url": {"url": "https://server.com/dir/image.jpg"}},
- {"type": "video_url", "video_url": {"url": "https://server.com/dir/video.mp4"}},
]
}
]
"""
- models = {"text": GPT.TEXT_MODEL, "image": GPT.IMAGE_MODEL, "video": GPT.VIDEO_MODEL}
- model_names = {"text": GPT.TEXT_MODEL_NAME, "image": GPT.IMAGE_MODEL_NAME, "video": GPT.VIDEO_MODEL_NAME}
- apis = {"text": GPT.TEXT_API_KEY, "image": GPT.IMAGE_API_KEY, "video": GPT.VIDEO_API_KEY}
- urls = {"text": GPT.TEXT_BASE_URL, "image": GPT.IMAGE_BASE_URL, "video": GPT.VIDEO_BASE_URL}
-
- model = force_model if force_model != "N/A" else models[model_type]
- model_name = model_names[model_type]
- # setup configs
- # params for OpenAI client
- client = {
- "api_key": apis[model_type],
- "base_url": urls[model_type],
- "timeout": round(float(GPT.TIMEOUT)),
- "http_client": DefaultAsyncHttpxClient(proxy=PROXY.GPT),
- }
-
- if force_model == GPT.OPENAI_MODEL:
- client["api_key"] = GPT.OPENAI_API_KEY
- client["base_url"] = GPT.OPENAI_BASE_URL
- model_name = GPT.OPENAI_MODEL_NAME
- elif force_model == GPT.GEMINI_MODEL:
- client["api_key"] = GPT.GEMINI_API_KEY
- client["base_url"] = GPT.GEMINI_BASE_URL
- model_name = GPT.GEMINI_MODEL_NAME
- elif force_model == GPT.DEEPSEEK_MODEL:
- client["api_key"] = GPT.DEEPSEEK_API_KEY
- client["base_url"] = GPT.DEEPSEEK_BASE_URL
- model_name = GPT.DEEPSEEK_MODEL_NAME
- elif force_model == GPT.QWEN_MODEL:
- client["api_key"] = GPT.QWEN_API_KEY
- client["base_url"] = GPT.QWEN_BASE_URL
- model_name = GPT.QWEN_MODEL_NAME
- elif force_model == GPT.DOUBAO_MODEL:
- client["api_key"] = GPT.DOUBAO_API_KEY
- client["base_url"] = GPT.DOUBAO_BASE_URL
- model_name = GPT.DOUBAO_MODEL_NAME
- elif force_model == GPT.SUMMARY_MODEL:
- client["api_key"] = GPT.SUMMARY_API_KEY
- client["base_url"] = GPT.SUMMARY_BASE_URL
- model_name = GPT.SUMMARY_MODEL_NAME
- elif force_model == GPT.LONG_MODEL:
- client["api_key"] = GPT.LONG_API_KEY
- client["base_url"] = GPT.LONG_BASE_URL
- model_name = GPT.LONG_MODEL_NAME
+ client, model, model_name = align_with_force_model(model_type, force_model)
# params for `openai.chat.completions.create()`
completions = {"model": model, "messages": contexts, "temperature": float(GPT.TEMPERATURE)}
@@ -137,3 +93,56 @@ def helicone_hook(client: dict, message_info: dict | None) -> None:
if user_name := message_info.get("full_name"):
headers |= {"Helicone-User-Id": unicode_to_ascii(user_name), "Helicone-Property-User": str(message_info["uid"])}
client |= {"default_headers": headers}
+
+
+def align_with_force_model(model_type: str, force_model: str = "N/A") -> tuple[dict, str, str]:
+ """Align the model with the modalities if force_model is specified.
+
+ For example, user use `/ds` to reply an image, but the model only support text, so we need to use switch to image model.
+ """
+ models = {"text": GPT.TEXT_MODEL, "image": GPT.IMAGE_MODEL, "video": GPT.VIDEO_MODEL}
+ model_names = {"text": GPT.TEXT_MODEL_NAME, "image": GPT.IMAGE_MODEL_NAME, "video": GPT.VIDEO_MODEL_NAME}
+ apis = {"text": GPT.TEXT_API_KEY, "image": GPT.IMAGE_API_KEY, "video": GPT.VIDEO_API_KEY}
+ urls = {"text": GPT.TEXT_BASE_URL, "image": GPT.IMAGE_BASE_URL, "video": GPT.VIDEO_BASE_URL}
+
+ model = models[model_type]
+ model_name = model_names[model_type]
+ if force_model == "N/A":
+ force_model = model
+ # params for OpenAI client
+ client = {
+ "api_key": apis[model_type],
+ "base_url": urls[model_type],
+ "timeout": round(float(GPT.TIMEOUT)),
+ "http_client": DefaultAsyncHttpxClient(proxy=PROXY.GPT),
+ }
+
+ model_factory = {
+ GPT.OPENAI_MODEL: {"api_key": GPT.OPENAI_API_KEY, "base_url": GPT.OPENAI_BASE_URL, "model_name": GPT.OPENAI_MODEL_NAME},
+ GPT.GEMINI_MODEL: {"api_key": GPT.GEMINI_API_KEY, "base_url": GPT.GEMINI_BASE_URL, "model_name": GPT.GEMINI_MODEL_NAME},
+ GPT.DEEPSEEK_MODEL: {"api_key": GPT.DEEPSEEK_API_KEY, "base_url": GPT.DEEPSEEK_BASE_URL, "model_name": GPT.DEEPSEEK_MODEL_NAME},
+ GPT.QWEN_MODEL: {"api_key": GPT.QWEN_API_KEY, "base_url": GPT.QWEN_BASE_URL, "model_name": GPT.QWEN_MODEL_NAME},
+ GPT.DOUBAO_MODEL: {"api_key": GPT.DOUBAO_API_KEY, "base_url": GPT.DOUBAO_BASE_URL, "model_name": GPT.DOUBAO_MODEL_NAME},
+ }
+ model_factory |= {GPT.SUMMARY_MODEL: {"api_key": GPT.SUMMARY_API_KEY, "base_url": GPT.SUMMARY_BASE_URL, "model_name": GPT.SUMMARY_MODEL_NAME}}
+ model_factory |= {GPT.LONG_MODEL: {"api_key": GPT.LONG_API_KEY, "base_url": GPT.LONG_BASE_URL, "model_name": GPT.LONG_MODEL_NAME}}
+ force_model_config = model_factory.get(force_model, {})
+
+ force_model_name = force_model_config.get("model_name", model_name)
+ force_model_config.pop("model_name", None)
+ if model_type == "text": # respect the force model
+ client |= force_model_config
+ return client, force_model, force_model_name
+
+ if model_type == "image" and ( # check capabilities
+ (force_model == GPT.OPENAI_MODEL and GPT.OPENAI_IMAGE_CAPABILITY)
+ or (force_model == GPT.GEMINI_MODEL and GPT.GEMINI_IMAGE_CAPABILITY)
+ or (force_model == GPT.DEEPSEEK_MODEL and GPT.DEEPSEEK_IMAGE_CAPABILITY)
+ or (force_model == GPT.QWEN_MODEL and GPT.QWEN_IMAGE_CAPABILITY)
+ or (force_model == GPT.DOUBAO_MODEL and GPT.DOUBAO_IMAGE_CAPABILITY)
+ or (force_model == GPT.SUMMARY_MODEL and GPT.SUMMARY_IMAGE_CAPABILITY)
+ or (force_model == GPT.LONG_MODEL and GPT.LONG_IMAGE_CAPABILITY)
+ ):
+ client |= force_model_config
+ return client, force_model, force_model_name
+ return client, model, model_name
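A hedged usage sketch of the fallback behavior: a forced model is always honored for text contexts, but for image contexts it only wins when its *_IMAGE_CAPABILITY flag is on; otherwise the default image model configuration is returned unchanged.

# Forced model honored for text contexts:
client, model, name = align_with_force_model("text", GPT.DEEPSEEK_MODEL)
assert model == GPT.DEEPSEEK_MODEL

# For image contexts the forced model only wins if its capability flag is set:
client, model, name = align_with_force_model("image", GPT.DEEPSEEK_MODEL)
assert model == (GPT.DEEPSEEK_MODEL if GPT.DEEPSEEK_IMAGE_CAPABILITY else GPT.IMAGE_MODEL)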
src/config.py
@@ -175,27 +175,31 @@ class GPT: # see `llm/README.md`
GEMINI_MODEL_NAME = os.getenv("GPT_GEMINI_MODEL_NAME", "Gemini-2.0-Flash")
GEMINI_API_KEY = os.getenv("GPT_GEMINI_API_KEY", "")
GEMINI_BASE_URL = os.getenv("GPT_GEMINI_BASE_URL", "https://generativelanguage.googleapis.com/v1beta/openai")
+ GEMINI_IMAGE_CAPABILITY = os.getenv("GPT_GEMINI_IMAGE_CAPABILITY", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
# /gpt command
OPENAI_MODEL = os.getenv("GPT_OPENAI_MODEL", "gpt-4o")
OPENAI_MODEL_NAME = os.getenv("GPT_OPENAI_MODEL_NAME", "GPT-4o")
OPENAI_API_KEY = os.getenv("GPT_OPENAI_API_KEY", "")
OPENAI_BASE_URL = os.getenv("GPT_OPENAI_BASE_URL", "https://api.openai.com/v1")
+ OPENAI_IMAGE_CAPABILITY = os.getenv("GPT_OPENAI_IMAGE_CAPABILITY", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
# /ds command
DEEPSEEK_MODEL = os.getenv("GPT_DEEPSEEK_MODEL", "deepseek-r1")
DEEPSEEK_MODEL_NAME = os.getenv("GPT_DEEPSEEK_MODEL_NAME", "DeepSeek-R1")
DEEPSEEK_API_KEY = os.getenv("GPT_DEEPSEEK_API_KEY", "")
DEEPSEEK_BASE_URL = os.getenv("GPT_DEEPSEEK_BASE_URL", "https://api.deepseek.com/v1")
+ DEEPSEEK_IMAGE_CAPABILITY = os.getenv("GPT_DEEPSEEK_IMAGE_CAPABILITY", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
# /qwen command
QWEN_MODEL = os.getenv("GPT_QWEN_MODEL", "qwen-vl-max")
QWEN_MODEL_NAME = os.getenv("GPT_QWEN_MODEL_NAME", "Qwen-VL-Max")
QWEN_API_KEY = os.getenv("GPT_QWEN_API_KEY", "")
QWEN_BASE_URL = os.getenv("GPT_QWEN_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1")
+ QWEN_IMAGE_CAPABILITY = os.getenv("GPT_QWEN_IMAGE_CAPABILITY", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
# /doubao command
DOUBAO_MODEL = os.getenv("GPT_DOUBAO_MODEL", "doubao-1-5-vision-pro-32k-250115")
DOUBAO_MODEL_NAME = os.getenv("GPT_DOUBAO_MODEL_NAME", "豆包-1.5-Pro")
DOUBAO_API_KEY = os.getenv("GPT_DOUBAO_API_KEY", "")
DOUBAO_BASE_URL = os.getenv("GPT_DOUBAO_BASE_URL", "https://ark.cn-beijing.volces.com/api/v3")
-
+ DOUBAO_IMAGE_CAPABILITY = os.getenv("GPT_DOUBAO_IMAGE_CAPABILITY", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
# /summary command
SUMMARY_MODEL = os.getenv("GPT_SUMMARY_MODEL", "gpt-4o")
SUMMARY_MODEL_NAME = os.getenv("GPT_SUMMARY_MODEL_NAME", "GPT-4o")
@@ -204,6 +208,7 @@ class GPT: # see `llm/README.md`
SUMMARY_API_KEY = os.getenv("GPT_SUMMARY_API_KEY", "")
SUMMARY_BASE_URL = os.getenv("GPT_SUMMARY_BASE_URL", "https://api.openai.com/v1")
SUMMARY_TIMEOUT = os.getenv("GPT_SUMMARY_TIMEOUT", "600") # should be larger than default timeout
+ SUMMARY_IMAGE_CAPABILITY = os.getenv("GPT_SUMMARY_IMAGE_CAPABILITY", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
# long context model
LONG_MODEL = os.getenv("GPT_LONG_MODEL", "gemini-1.5-pro")
LONG_MODEL_NAME = os.getenv("GPT_LONG_MODEL_NAME", "Gemini-1.5-Pro")
@@ -211,6 +216,7 @@ class GPT: # see `llm/README.md`
LONG_MODEL_MAX_OUTPUT_LENGTH = os.getenv("GPT_LONG_MODEL_MAX_OUTPUT_LENGTH", "8192") # 8K
LONG_API_KEY = os.getenv("GPT_LONG_API_KEY", "")
LONG_BASE_URL = os.getenv("GPT_LONG_BASE_URL", "https://generativelanguage.googleapis.com/v1beta/openai")
+ LONG_IMAGE_CAPABILITY = os.getenv("GPT_LONG_IMAGE_CAPABILITY", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
class TID: # see more TID usecase in `src/permission.py`
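The truthy-string test added above now appears seven times in this class. A small helper (not in the current codebase, shown only as a suggestion) would keep the accepted values in one place:

import os

def env_bool(name: str, default: str = "1") -> bool:
    # One definition of "truthy" for all *_IMAGE_CAPABILITY flags.
    return os.getenv(name, default).lower() in ["1", "y", "yes", "t", "true", "on"]

# e.g.  GEMINI_IMAGE_CAPABILITY = env_bool("GPT_GEMINI_IMAGE_CAPABILITY")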