Commit 86bb25f

benny-dou <60535774+benny-dou@users.noreply.github.com>
2026-05-22 04:32:07
feat(ai): support audio input for openai responses
1 parent c6e8275
Changed files (2)
src/ai/texts/contexts.py
@@ -5,7 +5,6 @@ import base64
 import contextlib
 import hashlib
 import mimetypes
-import time
 from pathlib import Path
 from typing import TYPE_CHECKING, Literal
 
@@ -178,32 +177,40 @@ async def single_openai_response_context(client: Client, message: Message, opena
     extra_markdown_extensions = [".html", ".docx", ".pptx", ".xls", ".xlsx"]  # convert to markdown
 
     messages = await client.get_media_group(message.chat.id, message.id) if message.media_group_id else [message]
+    media_send_as = openai_params.get("openai_media_send_as", "base64")
+    allow_image = bool(openai_params.get("allow_image"))
+    allow_video = bool(openai_params.get("allow_video"))
+    allow_audio = bool(openai_params.get("allow_audio"))
+    allow_file = bool(openai_params.get("allow_file"))
     contexts = []
     for msg in messages:
         info = parse_msg(msg, silent=True)
         sender = info["fwd_full_name"] or info["full_name"]
         media_path = DOWNLOAD_DIR + "/" + info["file_name"]
-        media_send_as = openai_params.get("openai_media_send_as", "base64")
         file_id = ""
         try:
-            if info["mtype"] == "photo":
-                if media_send_as == "file_id" and (file_id := await get_openai_file_id(client, msg, openai_params, info["mtype"])):
-                    contexts.append({"type": "input_image", "file_id": file_id, "detail": "high"})
+            if info["mtype"] == "photo" and allow_image:
+                if media_send_as == "file_id" and (file_id := await get_openai_file_id(client, msg, openai_params)):
+                    contexts.append({"type": "input_image", "file_id": file_id})
                 if not file_id:
                     res = await base64_media(client, msg)
-                    contexts.append({"type": "input_image", "image_url": f"data:image/{res['ext']};base64,{res['base64']}", "detail": "high"})
-
-            elif info["mtype"] == "video":
-                if media_send_as == "file_id" and (file_id := await get_openai_file_id(client, msg, openai_params, info["mtype"])):
+                    contexts.append({"type": "input_image", "image_url": f"data:image/{res['ext']};base64,{res['base64']}"})
+            elif info["mtype"] == "video" and allow_video:
+                if media_send_as == "file_id" and (file_id := await get_openai_file_id(client, msg, openai_params)):
                     contexts.append({"type": "input_video", "file_id": file_id})
                 if not file_id:
                     res = await base64_media(client, msg)
                     contexts.append({"type": "input_video", "video_url": f"data:video/{res['ext']};base64,{res['base64']}"})
-
-            elif info["mtype"] == "document":
+            elif info["mtype"] == "audio" and allow_audio:
+                if media_send_as == "file_id" and (file_id := await get_openai_file_id(client, msg, openai_params)):
+                    contexts.append({"type": "input_audio", "file_id": file_id})
+                if not file_id:
+                    res = await base64_media(client, msg)
+                    contexts.append({"type": "input_audio", "audio_url": f"data:audio/{res['ext']};base64,{res['base64']}"})
+            elif info["mtype"] == "document" and allow_file:
                 guessed_mime, _ = mimetypes.guess_type(info["file_name"])
                 if info["mime_type"] == "application/pdf" or guessed_mime == "application/pdf":
-                    if media_send_as == "file_id" and (file_id := await get_openai_file_id(client, msg, openai_params, info["mtype"])):
+                    if media_send_as == "file_id" and (file_id := await get_openai_file_id(client, msg, openai_params)):
                         contexts.append({"type": "input_file", "file_id": file_id})
                     if not file_id:
                         res = await base64_media(client, msg)
@@ -246,7 +253,7 @@ async def single_openai_response_context(client: Client, message: Message, opena
     return {"role": role, "type": "message", "content": contexts, **extra}
 
 
-async def get_openai_file_id(client: Client, message: Message, openai_params: dict, mtype: str) -> str:
+async def get_openai_file_id(client: Client, message: Message, openai_params: dict) -> str:
     def get_real_baseurl() -> str:
         base_url = str(openai_params["base_url"]) or ""
         default_headers = openai_params.get("default_headers", {})
@@ -258,15 +265,9 @@ async def get_openai_file_id(client: Client, message: Message, openai_params: di
             return default_headers.get("x-portkey-custom-host") or ""
         return base_url
 
-    if mtype not in ["photo", "video", "document"]:
-        return ""
-    if not openai_params["allow_image"] and mtype == "photo":
+    if openai_params.get("max_upload_size") and message_bytes(message) > int(openai_params["max_upload_size"]):
+        logger.warning(f"Message-{message.id} size {message_bytes(message)} bytes exceeds max_upload_size {openai_params['max_upload_size']}")
         return ""
-    if not openai_params["allow_video"] and mtype == "video":
-        return ""
-    if not openai_params["allow_file"] and mtype == "document":
-        return ""
-
     cache_day = openai_params.get("cache_day", 30)
     api_key = openai_params["api_key"]
     model_id = openai_params["model_id"]
@@ -282,28 +283,17 @@ async def get_openai_file_id(client: Client, message: Message, openai_params: di
         api_key=api_key,
         http_client=DefaultAsyncHttpxClient(proxy=openai_params["proxy"]) if openai_params.get("proxy") else None,
     )
-    fpath: str = await client.download_media(message)  # type: ignore
-    extra_body = {"expire_at": int(time.time()) + 3600 * 24 * cache_day}
-
-    preprocess_configs = {}
-    if message.video:
-        duration = glom(message, "video.duration", default=1e8)
-        ratio = int(duration // 300)
-        fps = ratio * 0.5
-        if fps < 0.5:
-            fps = 0.5
-        elif fps > 5.0:
-            fps = 5.0
-        preprocess_configs = {"video": {"fps": fps, "model": openai_params["model_id"]}}
-    if preprocess_configs:
-        extra_body["preprocess_configs"] = preprocess_configs
+    fpath: str | Path = await client.download_media(message)  # ty:ignore[invalid-assignment]
     try:
-        resp = await openai.files.create(file=Path(fpath), purpose="user_data", extra_body=extra_body)
-        while resp.status == "processing":
+        # hotfix: convert audio to aac
+        if message.audio and not str(fpath).endswith(".aac"):
+            fpath = await downsampe_audio(fpath, ext="aac", codec="aac")
+        resp = await openai.files.create(file=Path(fpath), purpose="user_data")
+        while resp.status in ["processing", "uploaded"]:
             logger.trace(f"Upload media to OpenAI processing: {resp.model_dump()}")
             await asyncio.sleep(3)
             resp = await openai.files.retrieve(file_id=resp.id)
-        if resp.status == "active":
+        if resp.status in ["active", "processed"]:
             Path(fpath).unlink(missing_ok=True)
             await set_cf_r2(r2_key, data=resp.model_dump(), metadata={"file_id": resp.id})
             return resp.id
@@ -495,7 +485,15 @@ async def context_bytes(client: Client, message: Message) -> int:
     for msg in chains:
         groups = await client.get_media_group(msg.chat.id, msg.id) if msg.media_group_id else [msg]
         messages.extend(groups)
-    size_bytes = 0
-    for m in messages:
-        size_bytes += glom(m, Coalesce("photo.sizes.-1.file_size", "video.file_size", "document.file_size"), default=0)
-    return size_bytes
+    return sum(message_bytes(m) for m in messages)
+
+
+def message_bytes(message: Message) -> int:
+    """Return the size in bytes of the media attached to *message*.
+
+    Looks up, in order, the last photo size, the video, the audio and the document
+    attached to the message, and falls back to 0 when none has a file size.
+    """
+    # Audio must be listed here: get_openai_file_id compares this value against
+    # max_upload_size, and without "audio.file_size" audio messages report 0 bytes.
+    return glom(message, Coalesce("photo.sizes.-1.file_size", "video.file_size", "audio.file_size", "document.file_size"), default=0)
src/ai/texts/openai_response.py
@@ -38,8 +38,9 @@ async def openai_responses_api(
     openai_responses_config: str | dict = "",
     openai_proxy: str | None = PROXY.OPENAI,
     cache_response_ttl: int = 0,
-    openai_allow_image: bool = False,  # whether to allow image in input modalities
+    openai_allow_image: bool = True,  # whether to allow image in input modalities
     openai_allow_video: bool = False,  # whether to allow video in input modalities
+    openai_allow_audio: bool = False,  # whether to allow audio in input modalities
     openai_allow_file: bool = False,  # whether to allow file in input modalities
     openai_media_send_as: Literal["base64", "file_id"] = "base64",
     skills: str = "",
@@ -91,6 +92,7 @@ async def openai_responses_api(
                     "cache_day": cache_day,
                     "allow_image": openai_allow_image,
                     "allow_video": openai_allow_video,
+                    "allow_audio": openai_allow_audio,
                     "allow_file": openai_allow_file,
                     "openai_media_send_as": openai_media_send_as,
                 },