Commit 02788ea

benny-dou <60535774+benny-dou@users.noreply.github.com>
2025-05-07 08:52:33
chore(asr): change Gemini ASR system instruction and config
1 parent d383570
src/asr/gemini_asr.py
@@ -1,5 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+import contextlib
+import json
 import random
 from pathlib import Path
 
@@ -27,16 +29,36 @@ async def gemini_stream_asr(client: Client, message: Message, path: str | Path,
     Args:
         slient (bool, optional): If True, do not update the status; return all results at the end.
     """
-    prompt = """请转录这段音频, 要求:
-    1. 以 `[hh:mm:ss] sentence` 格式输出句子内容, 包括标点符号。其中hh:mm:ss为此句话开始时间的小时、分钟、秒
-    2. 如果小时(hh)为00就省略hh, 只输出mm:ss
-    3. 直接输出音频转录内容, 不要输出任何与音频内容无关的寒暄问候
-
-    输出实例:
-    [00:02] 大家好, 我是小明, 欢迎来到我的频道。
-    [00:08] 今天要和大家聊一个一直以来都很有争议的话题。
-    [01:12:32] 谢谢大家收听。
-    """
+    system_instruction = """You are a transcription assistant tasked with converting audio files into text.
+
+Your output must follow these requirements:
+- Format each sentence as `[hh:mm:ss] sentence` with punctuation included, where `hh:mm:ss` is the start time of the sentence in the audio.
+- Omit the hour (`hh`) if it is zero, displaying only `mm:ss`.
+- Directly transcribe the audio content without any greetings or content unrelated to the audio itself.
+
+Steps:
+1. Listen to the audio file carefully and identify the start time of each sentence.
+2. Transcribe the audio content word-for-word, including punctuation, according to the specified format.
+3. Ensure that all time codes (hh:mm:ss or mm:ss) are precise.
+
+Output Format:
+- Each sentence should be formatted in a line as `[hh:mm:ss] sentence`.
+- Exclude any hour segment that equals zero, converting `[00:mm:ss]` to `[mm:ss]`.
+- Do not include any additional commentary or greetings.
+
+Example-1:
+- Input: Audio with content starting at 2 seconds.
+- Output: [00:02] 大家好, 我是小明, 欢迎来到我的频道。
+
+Example-2:
+- Input: Audio with content at 8 seconds and 1 hour, 12 minutes, and 32 seconds.
+- Output: [00:08] 今天要和大家聊一个一直以来都很有争议的话题。
+[01:12:32] 谢谢大家收听。
+
+
+Notes:
+- Focus on accuracy in capturing both the timing and the spoken content.
+- Maintain consistent formatting to ensure clarity and readability."""
 
     path = Path(path)
     api_keys = [x.strip() for x in ASR.GEMINI_API_KEY.split(",") if x.strip()]
@@ -55,11 +77,14 @@ async def gemini_stream_asr(client: Client, message: Message, path: str | Path,
         uploaded_audio = await app.aio.files.upload(file=path, config=UploadFileConfig(mime_type=f"audio/{voice_format}"))
         logger.debug(uploaded_audio)
         genconfig = {}
-        genconfig |= {"response_modalities": ["TEXT"]}
+        with contextlib.suppress(Exception):
+            genconfig = json.loads(ASR.GEMINI_CONFIG)
+        genconfig |= {"response_modalities": ["TEXT"]}  # force text response
+        genconfig |= {"system_instruction": system_instruction}  # pin system instruction
         if ASR.GEMINI_THINKING_BUDGET is not None:
             thinking_budget = min(round(float(ASR.GEMINI_THINKING_BUDGET)), GEMINI.MAX_THINKING_BUDGET)
-            genconfig |= {"thinking_config": ThinkingConfig(thinking_budget=thinking_budget)}
-        params = {"model": ASR.GEMINI_MODEL, "contents": [prompt, uploaded_audio], "config": GenerateContentConfig(**genconfig)}
+            genconfig |= {"thinking_config": ThinkingConfig(include_thoughts=False, thinking_budget=thinking_budget)}
+        params = {"model": ASR.GEMINI_MODEL, "contents": ["请转录这段音频", uploaded_audio], "config": GenerateContentConfig(**genconfig)}
         async for chunk in await app.aio.models.generate_content_stream(**params):
             resp = parse_response(chunk.model_dump())
             sentence = resp.get("texts", "")
src/asr/voice_recognition.py
@@ -67,8 +67,6 @@ ENGINE_MAP = {
     "16k_de": "德语",
 }
 
-BEGINNING = "🗣语音转文字:"
-
 
 async def voice_to_text(
     client: Client,
@@ -272,6 +270,4 @@ def get_trigger_message(
         return None
     if asr_skip_video and trigger_info["mtype"] == "video":
         return None
-    if trigger_info["text"].startswith(BEGINNING):  # already recognized
-        return None
     return trigger_msg
src/config.py
@@ -252,6 +252,7 @@ class ASR:
     GEMINI_MODEL = os.getenv("ASR_GEMINI_MODEL", "gemini-2.0-flash")
     GEMINI_PROXY = os.getenv("ASR_GEMINI_PROXY", None)
     GEMINI_THINKING_BUDGET = os.getenv("ASR_GEMINI_THINKING_BUDGET", None)  # 0 to disable thinking. DO NOT set this if the model is not a thinking model
+    GEMINI_CONFIG = os.getenv("ASR_GEMINI_CONFIG", "{}")  # default config passed to GenerateContentConfig. Should be a json string: '{"key": "value"}'
     TENCENT_APPID = os.getenv("ASR_TENCENT_APPID", "")
     TENCENT_MAX_DURATION = int(os.getenv("ASR_TENCENT_MAX_DURATION", "3600"))  # 1 hour
     TENCENT_PROXY = os.getenv("ASR_TENCENT_PROXY", None)  # Overseas IPs are banned; a proxy that routes back to China is required