Commit 02788ea
Changed files (3)
src/asr/gemini_asr.py
@@ -1,5 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
+import contextlib
+import json
import random
from pathlib import Path
@@ -27,16 +29,36 @@ async def gemini_stream_asr(client: Client, message: Message, path: str | Path,
Args:
slient (bool, optional): If True, do not update the status; return all results at the end.
"""
- prompt = """请转录这段音频, 要求:
- 1. 以 `[hh:mm:ss] sentence` 格式输出句子内容, 包括标点符号。其中hh:mm:ss为此句话开始时间的小时、分钟、秒
- 2. 如果小时(hh)为00就省略hh, 只输出mm:ss
- 3. 直接输出音频转录内容, 不要输出任何与音频内容无关的寒暄问候
-
- 输出实例:
- [00:02] 大家好, 我是小明, 欢迎来到我的频道。
- [00:08] 今天要和大家聊一个一直以来都很有争议的话题。
- [01:12:32] 谢谢大家收听。
- """
+ system_instruction = """You are a transcription assistant tasked with converting audio files into text.
+
+Your output must follow these requirements:
+- Format each sentence as `[hh:mm:ss] sentence` with punctuation included, where `hh:mm:ss` is the start time of the sentence in the audio.
+- Omit the hour (`hh`) if it is zero, displaying only `mm:ss`.
+- Directly transcribe the audio content without any greetings or content unrelated to the audio itself.
+
+Steps:
+1. Listen to the audio file carefully and identify the start time of each sentence.
+2. Transcribe the audio content word-for-word, including punctuation, according to the specified format.
+3. Ensure that all time codes (hh:mm:ss or mm:ss) are precise.
+
+Output Format:
+- Each sentence should be formatted in a line as `[hh:mm:ss] sentence`.
+- Exclude any hour segment that equals zero, converting `[00:mm:ss]` to `[mm:ss]`.
+- Do not include any additional commentary or greetings.
+
+Example-1:
+- Input: Audio with content starting at 2 seconds.
+- Output: [00:02] 大家好, 我是小明, 欢迎来到我的频道。
+
+Example-2:
+- Input: Audio with content at 8 seconds and 1 hour, 12 minutes, and 32 seconds.
+- Output: [00:08] 今天要和大家聊一个一直以来都很有争议的话题。
+[01:12:32] 谢谢大家收听。
+
+
+Notes:
+- Focus on accuracy in capturing both the timing and the spoken content.
+- Maintain consistent formatting to ensure clarity and readability."""
path = Path(path)
api_keys = [x.strip() for x in ASR.GEMINI_API_KEY.split(",") if x.strip()]
@@ -55,11 +77,14 @@ async def gemini_stream_asr(client: Client, message: Message, path: str | Path,
uploaded_audio = await app.aio.files.upload(file=path, config=UploadFileConfig(mime_type=f"audio/{voice_format}"))
logger.debug(uploaded_audio)
genconfig = {}
- genconfig |= {"response_modalities": ["TEXT"]}
+ with contextlib.suppress(Exception):
+ genconfig = json.loads(ASR.GEMINI_CONFIG)
+ genconfig |= {"response_modalities": ["TEXT"]} # force text response
+ genconfig |= {"system_instruction": system_instruction} # pin system instruction
if ASR.GEMINI_THINKING_BUDGET is not None:
thinking_budget = min(round(float(ASR.GEMINI_THINKING_BUDGET)), GEMINI.MAX_THINKING_BUDGET)
- genconfig |= {"thinking_config": ThinkingConfig(thinking_budget=thinking_budget)}
- params = {"model": ASR.GEMINI_MODEL, "contents": [prompt, uploaded_audio], "config": GenerateContentConfig(**genconfig)}
+ genconfig |= {"thinking_config": ThinkingConfig(include_thoughts=False, thinking_budget=thinking_budget)}
+ params = {"model": ASR.GEMINI_MODEL, "contents": ["请转录这段音频", uploaded_audio], "config": GenerateContentConfig(**genconfig)}
async for chunk in await app.aio.models.generate_content_stream(**params):
resp = parse_response(chunk.model_dump())
sentence = resp.get("texts", "")
src/asr/voice_recognition.py
@@ -67,8 +67,6 @@ ENGINE_MAP = {
"16k_de": "德语",
}
-BEGINNING = "🗣语音转文字:"
-
async def voice_to_text(
client: Client,
@@ -272,6 +270,4 @@ def get_trigger_message(
return None
if asr_skip_video and trigger_info["mtype"] == "video":
return None
- if trigger_info["text"].startswith(BEGINNING): # already recognized
- return None
return trigger_msg
src/config.py
@@ -252,6 +252,7 @@ class ASR:
GEMINI_MODEL = os.getenv("ASR_GEMINI_MODEL", "gemini-2.0-flash")
GEMINI_PROXY = os.getenv("ASR_GEMINI_PROXY", None)
GEMINI_THINKING_BUDGET = os.getenv("ASR_GEMINI_THINKING_BUDGET", None) # 0 to disable thinking. DO NOT set this if the model is not a thinking model
+ GEMINI_CONFIG = os.getenv("ASR_GEMINI_CONFIG", "{}") # default config passed to GenerateContentConfig. Should be a json string: '{"key": "value"}'
TENCENT_APPID = os.getenv("ASR_TENCENT_APPID", "")
TENCENT_MAX_DURATION = int(os.getenv("ASR_TENCENT_MAX_DURATION", "3600")) # 1 hour
TENCENT_PROXY = os.getenv("ASR_TENCENT_PROXY", None) # Overseas IPs are banned, so a proxy back into China is needed