Commit ed65b8f

benny-dou <60535774+benny-dou@users.noreply.github.com>
2025-09-19 17:20:30
chore(audio): prefer `wav` over `opus` format for ASR & TTS processing
1 parent 5dab447
src/asr/ali.py
@@ -34,8 +34,8 @@ async def ali_asr(path: str | Path) -> dict:
     if not path.is_file():
         return {"texts": "", "error": "File not found."}
     supported_ext = [".aac", ".amr", ".avi", ".flac", ".flv", ".m4a", ".mkv", ".mov", ".mp3", ".mp4", ".mpeg", ".oga", ".ogg", ".opus", ".wav", ".webm", ".wma", ".wmv"]
-    audio_path = path if path.suffix.lower() in supported_ext else await downsampe_audio(path, ext="opus", codec="libopus")
-    audio_path = await convert_single_channel(audio_path)
+    audio_path = path if path.suffix.lower() in supported_ext else await downsampe_audio(path, ext="wav", codec="pcm_s16le")
+    audio_path = await convert_single_channel(audio_path, ext="wav", codec="pcm_s16le")
     api_keys = strings_list(ASR.ALI_API_KEY, shuffle=True)
     if not api_keys:
         return {"error": "请配置阿里云语音识别的API Key"}
@@ -50,7 +50,7 @@ async def ali_asr(path: str | Path) -> dict:
                 url = FILE_SERVER.removesuffix("/") + "/" + path.name
             elif ASR.ALI_FS_ENGINE.lower() == "uguu":
                 if audio_path.stat().st_size > 100 * 1024 * 1024:  # 100 MB
-                    audio_path = await downsampe_audio(audio_path)
+                    audio_path = await downsampe_audio(audio_path, ext="wav", codec="pcm_s16le")
                 url = await upload_uguu(audio_path)  # max 100 MB for Uguu
             elif ASR.ALI_FS_ENGINE.lower() == "alist":
                 url = await upload_alist(audio_path)
@@ -120,9 +120,8 @@ async def query_ali_asr(task_id: str, api_key: str, query_times: int = 0) -> dic
 async def ali_realtime_asr(model: str, path: str | Path, api_key: str) -> dict:
     # convert audio file
     sample_rate = 8000 if "8k" in model else 16000
-    ext = "opus"
-    audio_path = await downsampe_audio(path, ext=ext, sample_rate=sample_rate, channel=1)
-    recognition = Recognition(model=model, format=ext, sample_rate=sample_rate, callback=RecognitionCallback(), api_key=api_key)
+    audio_path = await downsampe_audio(path, ext="wav", codec="pcm_s16le", sample_rate=sample_rate, channel=1)
+    recognition = Recognition(model=model, format="wav", sample_rate=sample_rate, callback=RecognitionCallback(), api_key=api_key)
     result = recognition.call(Path(audio_path).as_posix())
     if result.status_code != 200:
         return {"error": f"❌语音识别失败: {result.message}"}
src/asr/cloudflare.py
@@ -37,10 +37,10 @@ async def cloudflare_asr(
     if not path.is_file():
         return {"texts": "", "error": "File not found."}
     supported_ext = [".mp3", ".opus", ".ogg", ".oga", ".wav", ".flac", ".aac"]
-    audio_path = path if path.suffix.lower() in supported_ext else await downsampe_audio(path, ext="opus", codec="libopus")
-    audio_path = await convert_single_channel(audio_path)
+    audio_path = path if path.suffix.lower() in supported_ext else await downsampe_audio(path, ext="wav", codec="pcm_s16le")
+    audio_path = await convert_single_channel(audio_path, ext="wav", codec="pcm_s16le")
     # max allowed file size is 25MB
-    if audio_path.stat().st_size < ASR.CLOUDFLARE_MAX_BYTES:
+    if duration < ASR.CLOUDFLARE_CHUNK_SECONDS:
         return await cloudflare_single_file(audio_path, model=model, prompt=prompt)
     return await cloudflare_file_chunks(audio_path, duration, model=model, prompt=prompt)
 
@@ -123,9 +123,9 @@ async def cloudflare_file_chunks(
     Returns:
         dict: {"texts": str, "raw_texts": str, "segments": list[dict]}
     """
-    # only support opus file
-    ogg_path = path if path.suffix in [".oga", ".ogg", ".opus"] else await downsampe_audio(path, ext="opus", codec="libopus")
-    audio, duration, sr = load_audio(ogg_path)
+    # only supports wav files
+    wav_path = path if path.suffix.lower() == ".wav" else await downsampe_audio(path, ext="wav", codec="pcm_s16le")
+    audio, duration, sr = load_audio(wav_path)
     if sr == 0:
         return {"error": "Failed to load audio."}
     transcription = {}
src/asr/deepgram.py
@@ -22,8 +22,8 @@ async def deepgram_asr(path: str | Path) -> dict:
     if not path.is_file():
         return {"texts": "", "error": "File not found."}
     supported_ext = [".mp3", ".aac", ".flac", ".m4a", ".mp2", ".mp4", ".ogg", ".opus", ".oga", ".pcm", ".wav", ".webm"]
-    audio_path = path if path.suffix.lower() in supported_ext else await downsampe_audio(path, ext="opus", codec="libopus")
-    audio_path = await convert_single_channel(audio_path)
+    audio_path = path if path.suffix.lower() in supported_ext else await downsampe_audio(path, ext="wav", codec="pcm_s16le")
+    audio_path = await convert_single_channel(audio_path, ext="wav", codec="pcm_s16le")
     api_keys = strings_list(ASR.DEEPGRAM_API, shuffle=True)
     if not api_keys:
         return {"error": "请配置DeepGram语音识别的API Key"}
src/asr/gemini.py
@@ -43,8 +43,8 @@ async def gemini_asr(
     path = Path(path).expanduser().resolve()
     if not path.is_file():
         return {"texts": "", "error": "File not found."}
-    audio_path = path if path.suffix.lower() in GEMINI_AUDIO_EXT else await downsampe_audio(path, ext="opus", codec="libopus")
-    audio_path = await convert_single_channel(audio_path)
+    audio_path = path if path.suffix.lower() in GEMINI_AUDIO_EXT else await downsampe_audio(path, ext="wav", codec="pcm_s16le")
+    audio_path = await convert_single_channel(audio_path, ext="wav", codec="pcm_s16le")
     duration = audio_duration(audio_path)
     if duration < ASR.GEMINI_CHUNK_SECONDS:
         return await gemini_single_file(message, audio_path, model_id=model_id, prompt=prompt, delete_gemini_file=delete_gemini_file)
@@ -147,8 +147,8 @@ async def gemini_file_chunks(
         dict: {"texts": str, "raw_texts": str, "segments": list[dict]}
     """
     path = Path(path).expanduser().resolve()
-    ogg_path = path if path.suffix in [".oga", ".ogg", ".opus"] else await downsampe_audio(path, ext="opus", codec="libopus")
-    audio, duration, sr = load_audio(ogg_path)
+    wav_path = path if path.suffix.lower() == ".wav" else await downsampe_audio(path, ext="wav", codec="pcm_s16le")
+    audio, duration, sr = load_audio(wav_path)
     if sr == 0:
         return {"error": "Failed to load audio."}
     transcription = {}
@@ -156,7 +156,7 @@ async def gemini_file_chunks(
         # Calculate # of chunks
         total_chunks = (duration // (chunk_seconds - overlap_seconds)) + 1
         total_chunks = int(total_chunks)
-        chunk_paths = [Path(DOWNLOAD_DIR) / f"{rand_string()}.opus" for _ in range(total_chunks)]
+        chunk_paths = [Path(DOWNLOAD_DIR) / f"{rand_string()}.wav" for _ in range(total_chunks)]
         tasks = []
         offset_list = []
         # Loop through each chunk, extract current chunk from audio
src/asr/groq.py
@@ -27,8 +27,8 @@ async def groq_asr(path: str | Path, model: str = "", prompt: str = "", temperat
     if not path.is_file():
         return {"texts": "", "error": "File not found."}
     supported_ext = [".aac", ".flac", ".m4a", ".mp3", ".mpeg", ".mpga", ".ogg", ".opus", ".wav", ".webm"]
-    audio_path = path if path.suffix.lower() in supported_ext else await downsampe_audio(path, ext="opus", codec="libopus")
-    audio_path = await convert_single_channel(audio_path)
+    audio_path = path if path.suffix.lower() in supported_ext else await downsampe_audio(path, ext="wav", codec="pcm_s16le")
+    audio_path = await convert_single_channel(audio_path, ext="wav", codec="pcm_s16le")
     # max allowed file size is 25MB
     if audio_path.stat().st_size < ASR.GROQ_MAX_BYTES:
         return await groq_single_file(audio_path, model=model, prompt=prompt, temperature=temperature, language=language)
@@ -59,8 +59,8 @@ async def groq_single_file(
         file_name = Path(path_or_bytes).name
         mime = guess_mime(Path(path_or_bytes))
     else:
-        file_name = "chunk.ogg"
-        mime = "audio/ogg"
+        file_name = "chunk.wav"
+        mime = "audio/wav"
     if prompt:
         data["prompt"] = prompt
     if language:
@@ -273,8 +273,8 @@ async def groq_file_chunks(
     path = Path(path).expanduser().resolve()
     if not path.is_file():
         return {"texts": "", "error": "File not found."}
-    if path.suffix.lower() not in [".opus", ".ogg"]:
-        path = await downsampe_audio(path, ext="opus", codec="libopus")
+    if path.suffix.lower() != ".wav":
+        path = await downsampe_audio(path, ext="wav", codec="pcm_s16le")
     audio, duration, sr = load_audio(path)
     if sr == 0:
         return {"error": "Failed to load audio."}
src/asr/tecent.py
@@ -88,10 +88,10 @@ async def tencent_asr(path: str | Path, language: str, duration: float) -> dict:
     if not path.is_file():
         return {"texts": "", "error": "File not found."}
     supported_ext = [".wav", ".pcm", ".ogg", ".opus", ".oga", ".speex", ".silk", ".mp3", ".m4a", ".aac", ".amr"]
-    audio_path = path if path.suffix.lower() in supported_ext else await downsampe_audio(path, ext="opus", codec="libopus")
-    audio_path = await convert_single_channel(audio_path)
+    audio_path = path if path.suffix.lower() in supported_ext else await downsampe_audio(path, ext="wav", codec="pcm_s16le")
+    audio_path = await convert_single_channel(audio_path, ext="wav", codec="pcm_s16le")
     if duration < 1:  # some thing error in detecting duration
-        audio_path = await downsampe_audio(path, ext="opus", codec="libopus")
+        audio_path = await downsampe_audio(path, ext="wav", codec="pcm_s16le")
         duration = audio_duration(audio_path)
 
     # max allowed duration is 60s
@@ -120,13 +120,13 @@ async def tencent_single_asr(path_or_bytes: Path | bytes, language: str, *, offs
     if isinstance(path_or_bytes, Path):
         # max 3 MB
         file_size = path_or_bytes.stat().st_size
-        audio_path = path_or_bytes if file_size < 3 * 1024 * 1024 else await downsampe_audio(path_or_bytes, ext="opus", codec="libopus")
+        audio_path = path_or_bytes if file_size < 3 * 1024 * 1024 else await downsampe_audio(path_or_bytes)
         voice_format = Path(audio_path).suffix.lower().lstrip(".")
         if voice_format in ["ogg", "opus", "oga"]:  # tencnet only supports ogg-opus
             voice_format = "ogg-opus"
         audio_bytes = await get_file_bytes(audio_path)
     elif isinstance(path_or_bytes, bytes):
-        voice_format = "ogg-opus"
+        voice_format = "wav"
         audio_bytes = path_or_bytes
     audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
     payload = f'{{"EngSerViceType":"{language}","SourceType":1,"WordInfo":2,"VoiceFormat":"{voice_format}","Data":"{audio_base64}"}}'
@@ -193,9 +193,9 @@ async def tencent_file_chunks(
     Returns:
         dict: {"texts": str, "raw_texts": str, "segments": list[dict]}
     """
-    # only support opus file
-    ogg_path = path if path.suffix in [".oga", ".ogg", ".opus"] else await downsampe_audio(path, ext="opus", codec="libopus")
-    audio, _, sr = load_audio(ogg_path)
+    # only supports wav files
+    wav_path = path if path.suffix.lower() == ".wav" else await downsampe_audio(path)
+    audio, _, sr = load_audio(wav_path)
     if sr == 0:
         return {"error": "Failed to load audio."}
 
src/asr/utils.py
@@ -37,7 +37,7 @@ def auto_choose_asr_engine(duration: float, engine: str) -> str:
             enabled_engines.append("ali")
         if all([ASR.TENCENT_APPID, ASR.TENCENT_SECRET_ID, ASR.TENCENT_SECRET_KEY, ASR.TENCENT_FS_ENGINE]):
             enabled_engines.append("tencent")
-        if all([ASR.CLOUDFLARE_MODEL, ASR.CLOUDFLARE_KEYS, ASR.CLOUDFLARE_MAX_BYTES, ASR.CLOUDFLARE_CHUNK_SECONDS]):
+        if all([ASR.CLOUDFLARE_MODEL, ASR.CLOUDFLARE_KEYS, ASR.CLOUDFLARE_CHUNK_SECONDS]):
             enabled_engines.append("cloudflare")
         if all([GEMINI.ASR_MODEL, GEMINI.API_KEY, GEMINI.BASE_URL, ASR.GEMINI_CHUNK_SECONDS]):
             enabled_engines.append("gemini")
@@ -74,7 +74,7 @@ def auto_choose_asr_engine(duration: float, engine: str) -> str:
     return random.choice(engines) if engines else fallback_engine
 
 
-async def downsampe_audio(path: str | Path, ext: str = "opus", codec: str = "libopus", sample_rate: int = 16000, channel: int = 1, **kwargs) -> Path:
+async def downsampe_audio(path: str | Path, ext: str = "wav", codec: str = "pcm_s16le", sample_rate: int = 16000, channel: int = 1, **kwargs) -> Path:
     path = Path(path).expanduser().resolve()
     if not path.is_file():
         return path
@@ -96,11 +96,11 @@ async def get_audio_channel(path: str | Path) -> int:
     return -1
 
 
-async def convert_single_channel(path: str | Path) -> Path:
+async def convert_single_channel(path: str | Path, **kwargs) -> Path:
     path = Path(path).expanduser().resolve()
     num_channel = await get_audio_channel(path)
     if num_channel != 1:
-        return await downsampe_audio(path, ext="opus", codec="libopus", channel=1)
+        return await downsampe_audio(path, **kwargs)
     return path
 
 
@@ -142,14 +142,14 @@ def load_audio(path: Path | str) -> tuple[ndarray, float, int]:
     return ndarray([]), 0, 0
 
 
-async def audio_chunk_to_bytes(chunk: ndarray, samplerate: int, fmt: str = "ogg", subtype: str = "OPUS") -> bytes:
+async def audio_chunk_to_bytes(chunk: ndarray, samplerate: int, fmt: str = "WAV", subtype: str = "PCM_16") -> bytes:
     buffer = io.BytesIO()
     await asyncio.to_thread(sf.write, buffer, chunk, samplerate, format=fmt, subtype=subtype)
     buffer.seek(0)  # move cursor to beginning
     return buffer.getvalue()
 
 
-async def audio_chunk_to_path(chunk: ndarray, samplerate: int, path: str | Path, fmt: str = "ogg", subtype: str = "OPUS"):
+async def audio_chunk_to_path(chunk: ndarray, samplerate: int, path: str | Path, fmt: str = "WAV", subtype: str = "PCM_16"):
     out_path = Path(path).expanduser().resolve()
     out_path.parent.mkdir(exist_ok=True, parents=True)
     await asyncio.to_thread(sf.write, out_path.as_posix(), chunk, samplerate, format=fmt, subtype=subtype)
src/tts/gemini.py
@@ -15,7 +15,6 @@ from pyrogram.types import Message
 from config import CAPTION_LENGTH, DOWNLOAD_DIR, GEMINI, TTS
 from llm.hooks import hook_gemini_httpoptions
 from messages.utils import blockquote, smart_split
-from multimedia import convert_to_audio
 from utils import markdown_to_text, rand_string, strings_list
 
 
@@ -36,12 +35,11 @@ async def gemini_tts(message: Message, texts: str, model: str = "", voice_name:
     # split
     text_list = await smart_split(texts, chars_per_string=TTS.GEMINI_SPLIT_LENGTH, mode=ParseMode.DISABLED)
     resp = await asyncio.gather(*[gemini_tts_real(message, text, model, voice_name) for text in text_list])
-    save_path = Path(DOWNLOAD_DIR) / f"{rand_string(16)}.wav"
+    wav_path = Path(DOWNLOAD_DIR) / f"{rand_string(16)}.wav"
     combined_data = b"".join([r["voice"] for r in resp])
-    save_wave_file(save_path, combined_data)
-    ogg = await convert_to_audio(save_path, ext="ogg", codec="libopus")
+    save_wave_file(wav_path, combined_data)
     caption = f"🗣音色: {voice_name}\n🤖引擎: {model}\n{blockquote(texts[: CAPTION_LENGTH - 20])}"
-    return {"voice": ogg, "duration": calculate_duration(combined_data), "caption": caption}
+    return {"voice": wav_path, "duration": calculate_duration(combined_data), "caption": caption}
 
 
 async def gemini_tts_real(message: Message, texts: str, model: str, voice_name: str, *, return_bytes: bool = True) -> dict:
@@ -75,10 +73,9 @@ async def gemini_tts_real(message: Message, texts: str, model: str, voice_name:
                 caption = f"🗣音色: {voice_name}\n🤖引擎: {model}\n{blockquote(texts[: CAPTION_LENGTH - 20])}"
                 if return_bytes:
                     return {"voice": data, "duration": calculate_duration(data), "caption": caption}
-                save_path = Path(DOWNLOAD_DIR) / f"{rand_string(16)}.wav"
-                save_wave_file(save_path, data)
-                ogg = await convert_to_audio(save_path, ext="ogg", codec="libopus")
-                return {"voice": ogg, "duration": calculate_duration(data), "caption": caption}
+                wav_path = Path(DOWNLOAD_DIR) / f"{rand_string(16)}.wav"
+                save_wave_file(wav_path, data)
+                return {"voice": wav_path, "duration": calculate_duration(data), "caption": caption}
         except Exception as e:
             logger.error(e)
     return {}
src/tts/qwen.py
@@ -11,7 +11,6 @@ from pyrogram.enums import ParseMode
 
 from config import CAPTION_LENGTH, DOWNLOAD_DIR, TTS
 from messages.utils import blockquote, smart_split
-from multimedia import convert_to_audio
 from networking import download_file, hx_req
 from utils import markdown_to_text, rand_string, strings_list
 
@@ -29,18 +28,17 @@ async def qwen_tts(texts: str, model: str = "", voice_name: str = "") -> dict:
     raw_texts = markdown_to_text(texts)
     num_token = count_token(raw_texts, model)
     if num_token < TTS.QWEN_INPUT_TOKEN_LIMIT:
-        return await qwen_tts_real(texts, model, voice_name, convert_ogg=True)
+        return await qwen_tts_real(texts, model, voice_name)
     # split
     text_list = await smart_split(texts, chars_per_string=TTS.QWEN_SPLIT_LENGTH, mode=ParseMode.DISABLED)
     resp = await asyncio.gather(*[qwen_tts_real(text, model, voice_name) for text in text_list])
-    save_path = Path(DOWNLOAD_DIR) / f"{rand_string(16)}.wav"
-    merge_wav([r["voice"] for r in resp], save_path)
-    ogg = await convert_to_audio(save_path, ext="ogg", codec="libopus")
+    wav_path = Path(DOWNLOAD_DIR) / f"{rand_string(16)}.wav"
+    merge_wav([r["voice"] for r in resp], wav_path)
     caption = f"🗣音色: {voice_name}\n🤖引擎: {model}\n{blockquote(texts[: CAPTION_LENGTH - 20])}"
-    return {"voice": ogg, "duration": sum([r["duration"] for r in resp]), "caption": caption}
+    return {"voice": wav_path, "duration": sum([r["duration"] for r in resp]), "caption": caption}
 
 
-async def qwen_tts_real(texts: str, model: str, voice_name: str, *, convert_ogg: bool = False) -> dict:
+async def qwen_tts_real(texts: str, model: str, voice_name: str) -> dict:
     """Qwen TTS.
 
     Args:
@@ -67,8 +65,6 @@ async def qwen_tts_real(texts: str, model: str, voice_name: str, *, convert_ogg:
             save_path = await download_file(url, proxy=TTS.ALI_PROXY)
             duration = glom(response, "usage.output_tokens", default=0) / 50  # 1s = 50 tokens
             caption = f"🗣音色: {voice_name}\n🤖引擎: {model}\n{blockquote(texts[: CAPTION_LENGTH - 20])}"
-            if convert_ogg:
-                save_path = await convert_to_audio(save_path, ext="ogg", codec="libopus")
         except Exception as e:
             logger.error(e)
         return {"voice": save_path, "duration": duration, "caption": caption}
src/tts/sambert.py
@@ -13,7 +13,6 @@ from pyrogram.enums import ParseMode
 
 from config import CAPTION_LENGTH, DOWNLOAD_DIR, TTS
 from messages.utils import blockquote, smart_split
-from multimedia import convert_to_audio
 from tts.engines import LIMIT_FOR_MODEL, get_random_one
 from utils import markdown_to_text, rand_string, strings_list
 
@@ -33,18 +32,17 @@ async def sambert_tts(texts: str, model: str = "", voice_name: str = "") -> dict
 
     raw_texts = markdown_to_text(texts)
     if len(raw_texts) < TTS.SAMBERT_LENGTH_LIMIT:
-        return await sambert_tts_real(texts, model, voice_name, convert_ogg=True)
+        return await sambert_tts_real(texts, model, voice_name)
     # split
     text_list = await smart_split(texts, chars_per_string=TTS.SAMBERT_LENGTH_LIMIT, mode=ParseMode.DISABLED)
     resp = await asyncio.gather(*[sambert_tts_real(text, model, voice_name) for text in text_list])
-    save_path = Path(DOWNLOAD_DIR) / f"{rand_string(16)}.wav"
-    merge_wav([r["voice"] for r in resp], save_path)
-    ogg = await convert_to_audio(save_path, ext="ogg", codec="libopus")
+    wav_path = Path(DOWNLOAD_DIR) / f"{rand_string(16)}.wav"
+    merge_wav([r["voice"] for r in resp], wav_path)
     caption = f"🗣音色: {voice_name}\n🤖引擎: {model}\n{blockquote(texts[: CAPTION_LENGTH - 20])}"
-    return {"voice": ogg, "duration": sum([r["duration"] for r in resp]), "caption": caption}
+    return {"voice": wav_path, "duration": sum([r["duration"] for r in resp]), "caption": caption}
 
 
-async def sambert_tts_real(texts: str, model: str, voice_name: str, *, convert_ogg: bool = False) -> dict:
+async def sambert_tts_real(texts: str, model: str, voice_name: str) -> dict:
     """Sambert TTS.
 
     Args:
@@ -68,8 +66,6 @@ async def sambert_tts_real(texts: str, model: str, voice_name: str, *, convert_o
             if timestamps := response.get_timestamps():
                 duration = glom(timestamps, "-1.end_time", default=0) / 1000
             caption = f"🗣音色: {voice_name}\n🤖引擎: {model}\n{blockquote(texts[: CAPTION_LENGTH - 20])}"
-            if convert_ogg:
-                save_path = await convert_to_audio(save_path, ext="ogg", codec="libopus")
         except Exception as e:
             logger.error(e)
         return {"voice": save_path, "duration": duration, "caption": caption}
src/config.py
@@ -287,8 +287,7 @@ class ASR:
     ALI_FS_ENGINE = os.getenv("ASR_ALI_FS_ENGINE", "local")  # local, uguu or alist.
     DEEPGRAM_API = os.getenv("ASR_DEEPGRAM_API", "")  # comma separated keys for load balance. e.g. "key1,key2,key3"
     CLOUDFLARE_MODEL = os.getenv("ASR_CLOUDFLARE_MODEL", "@cf/openai/whisper-large-v3-turbo")
-    CLOUDFLARE_MAX_BYTES = int(os.getenv("ASR_CLOUDFLARE_MAX_BYTES", "26214400"))  # 25MB (max file bytes for single file)
-    CLOUDFLARE_CHUNK_SECONDS = float(os.getenv("ASR_CLOUDFLARE_CHUNK_SECONDS", "600"))  # split long audio file into chunks
+    CLOUDFLARE_CHUNK_SECONDS = float(os.getenv("ASR_CLOUDFLARE_CHUNK_SECONDS", "180"))  # split long audio file into chunks
     CLOUDFLARE_OVERLAP_SECONDS = float(os.getenv("ASR_CLOUDFLARE_OVERLAP_SECONDS", "5"))  # overlap seconds between chunks
     CLOUDFLARE_KEYS = os.getenv("ASR_CLOUDFLARE_KEYS", "")  # comma separated keys for load balance. e.g. "AccountID:API_TOKEN, AccountID:API_TOKEN, ..."
     CLOUDFLARE_PROXY = os.getenv("ASR_CLOUDFLARE_PROXY", None)
@@ -297,7 +296,7 @@ class ASR:
     GEMINI_OVERLAP_SECONDS = float(os.getenv("ASR_GEMINI_OVERLAP_SECONDS", "5"))  # overlap seconds between chunks
     GROQ_PROXY = os.getenv("ASR_GROQ_PROXY", None)  # Ban CN & HK IP
     GROQ_MAX_BYTES = int(os.getenv("ASR_GROQ_MAX_BYTES", "26214400"))  # 25MB (max file bytes for single file)
-    GROQ_CHUNK_SECONDS = float(os.getenv("ASR_GROQ_CHUNK_SECONDS", "600"))  # split long audio file into chunks
+    GROQ_CHUNK_SECONDS = float(os.getenv("ASR_GROQ_CHUNK_SECONDS", "180"))  # split long audio file into chunks
     GROQ_OVERLAP_SECONDS = float(os.getenv("ASR_GROQ_OVERLAP_SECONDS", "5"))  # overlap seconds between chunks
     GROQ_KEYS = os.getenv("ASR_GROQ_KEYS", "")  # comma separated keys for load balance.
     GROQ_MODELS = os.getenv("ASR_GROQ_MODELS", "whisper-large-v3")  # comma separated model names.