Commit 1c3bd7b
Changed files (8)
src
podcast
subtitles
ytdlp
src/ai/chat_summary.py
@@ -26,43 +26,43 @@ from utils import nowdt, rand_number, strings_list
# ruff: noqa: RUF001
HELP = f"""🤖**AI总结历史消息** (最多{MAX_MESSAGE_SUMMARY}条)
-⚠️使用`{PREFIX.AI_SUMMARY}`命令生成聊天记录文件 + 聊天记录AI总结
+⚠️使用`{PREFIX.CHAT_SUMMARY}`命令生成聊天记录文件 + 聊天记录AI总结
⚠️使用`{PREFIX.COMBINATION}`命令只生成聊天记录文件, 不对聊天记录AI总结
-⚠️额外功能: 使用`{PREFIX.AI_SUMMARY} + 油管或B站链接`对视频内容进行AI总结
+⚠️额外功能: 使用`{PREFIX.CHAT_SUMMARY} + 油管或B站链接`对视频内容进行AI总结
-{PREFIX.AI_SUMMARY}使用说明:
+{PREFIX.CHAT_SUMMARY}使用说明:
- # 后跟消息数量或时间范围
- @ 后跟用户名 (可多次使用@)
**1️⃣指定条目数**
-- `{PREFIX.AI_SUMMARY} #N`: 总结最近的N条历史消息
-- `{PREFIX.AI_SUMMARY} #N @User`: 总结最近只属于User的N条消息
-- `{PREFIX.AI_SUMMARY} #N @User @User2`: 总结最近只属于User和User2的N条消息
+- `{PREFIX.CHAT_SUMMARY} #N`: 总结最近的N条历史消息
+- `{PREFIX.CHAT_SUMMARY} #N @User`: 总结最近只属于User的N条消息
+- `{PREFIX.CHAT_SUMMARY} #N @User @User2`: 总结最近只属于User和User2的N条消息
示例:
-- `{PREFIX.AI_SUMMARY} #10`: 总结最近的10条历史消息
-- `{PREFIX.AI_SUMMARY} #20 @123456`: 总结最近UID为123456的20条消息
-- `{PREFIX.AI_SUMMARY} #20 @John`: 总结最近用户John(大小写均可)的20条消息
-- `{PREFIX.AI_SUMMARY} #20 @John @Bob`: 总结最近用户John和Bob的20条消息
+- `{PREFIX.CHAT_SUMMARY} #10`: 总结最近的10条历史消息
+- `{PREFIX.CHAT_SUMMARY} #20 @123456`: 总结最近UID为123456的20条消息
+- `{PREFIX.CHAT_SUMMARY} #20 @John`: 总结最近用户John(大小写均可)的20条消息
+- `{PREFIX.CHAT_SUMMARY} #20 @John @Bob`: 总结最近用户John和Bob的20条消息
**2️⃣指定最近时间段**
-- `{PREFIX.AI_SUMMARY} #interval`: 总结最近interval时段内的消息
-- `{PREFIX.AI_SUMMARY} #interval @User`: 总结最近interval时段内, 且只属于User的消息
+- `{PREFIX.CHAT_SUMMARY} #interval`: 总结最近interval时段内的消息
+- `{PREFIX.CHAT_SUMMARY} #interval @User`: 总结最近interval时段内, 且只属于User的消息
示例:
-- `{PREFIX.AI_SUMMARY} #10m`: 总结最近10分钟内的消息
-- `{PREFIX.AI_SUMMARY} #2h`: 总结最近2小时内的消息
-- `{PREFIX.AI_SUMMARY} #1d`: 总结最近1天的消息
-- `{PREFIX.AI_SUMMARY} #2h @John`: 总结最近2小时内的消息, 且只属于John的消息
+- `{PREFIX.CHAT_SUMMARY} #10m`: 总结最近10分钟内的消息
+- `{PREFIX.CHAT_SUMMARY} #2h`: 总结最近2小时内的消息
+- `{PREFIX.CHAT_SUMMARY} #1d`: 总结最近1天的消息
+- `{PREFIX.CHAT_SUMMARY} #2h @John`: 总结最近2小时内的消息, 且只属于John的消息
**3️⃣ 指定具体时间范围**
-- `{PREFIX.AI_SUMMARY} #20240101123000`: 总结从2024-01-01 12:30:00开始到现在的消息
-- `{PREFIX.AI_SUMMARY} #20240101123000-20240101124000`: 总结从2024-01-01 12:30:00开始到2024-01-01 12:40:00的消息
-- `{PREFIX.AI_SUMMARY} #20240101123000 @User`: 总结从2024-01-01 12:30:00开始到现在的消息, 且只属于User的消息
-- `{PREFIX.AI_SUMMARY} #20240101123000-20240101124000 @User`: 总结从2024-01-01 12:30:00开始到2024-01-01 12:40:00的消息, 且只属于User的消息
+- `{PREFIX.CHAT_SUMMARY} #20240101123000`: 总结从2024-01-01 12:30:00开始到现在的消息
+- `{PREFIX.CHAT_SUMMARY} #20240101123000-20240101124000`: 总结从2024-01-01 12:30:00开始到2024-01-01 12:40:00的消息
+- `{PREFIX.CHAT_SUMMARY} #20240101123000 @User`: 总结从2024-01-01 12:30:00开始到现在的消息, 且只属于User的消息
+- `{PREFIX.CHAT_SUMMARY} #20240101123000-20240101124000 @User`: 总结从2024-01-01 12:30:00开始到2024-01-01 12:40:00的消息, 且只属于User的消息
- 时间格式中没有任何分隔符, 必须为YYYYMMDDHHMMSS (14位纯数字)
注意:
-- 用上述各种`{PREFIX.AI_SUMMARY}`命令回复消息M, 视为将截止时间设为消息M的发送时间
+- 用上述各种`{PREFIX.CHAT_SUMMARY}`命令回复消息M, 视为将截止时间设为消息M的发送时间
- 如果用户名中有空格, 请去除空格。例如: 想指定用户为John Doe请使用 `@JohnDoe`
"""
@@ -123,13 +123,13 @@ async def ai_chat_summary(
summary_model_id (str, optional): The model id to use for AI summary.
"""
# send docs if the message is only the summary/combine command itself (and is not a reply)
- if equal_prefix(message.text, prefix=[PREFIX.AI_SUMMARY, PREFIX.COMBINATION]) and not message.reply_to_message:
+ if equal_prefix(message.text, prefix=[PREFIX.CHAT_SUMMARY, PREFIX.COMBINATION]) and not message.reply_to_message:
await send2tg(client, message, texts=HELP, **kwargs)
return
- if not startswith_prefix(message.content, prefix=[PREFIX.AI_SUMMARY, PREFIX.COMBINATION]):
+ if not startswith_prefix(message.content, prefix=[PREFIX.CHAT_SUMMARY, PREFIX.COMBINATION]):
return
# summarize YouTube & Bilibili videos (skipped when summarizing chat history)
- if startswith_prefix(message.text, prefix=PREFIX.AI_SUMMARY) and not remove_prefix(message.text, prefix=PREFIX.AI_SUMMARY).strip().startswith("#"):
+ if startswith_prefix(message.text, prefix=PREFIX.CHAT_SUMMARY) and not remove_prefix(message.text, prefix=PREFIX.CHAT_SUMMARY).strip().startswith("#"):
# Youtube & Bilibili links in message's content or reply_to_message's content or reply_to_message's entity_urls
links_to_check = [message.content, glom(message, Coalesce("reply_to_message.content"), default="")]
if message.reply_to_message:
@@ -145,9 +145,9 @@ async def ai_chat_summary(
return
info = parse_msg(message, silent=True)
- need_summay = startswith_prefix(info["text"], prefix=PREFIX.AI_SUMMARY)
+ need_summay = startswith_prefix(info["text"], prefix=PREFIX.CHAT_SUMMARY)
# replace /combine with /summary, because we need to use `/summary` to match different patterns
- info["text"] = re.sub(r"^" + PREFIX.COMBINATION, PREFIX.AI_SUMMARY, info["text"], flags=re.IGNORECASE)
+ info["text"] = re.sub(r"^" + PREFIX.COMBINATION, PREFIX.CHAT_SUMMARY, info["text"], flags=re.IGNORECASE)
num_history = MAX_MESSAGE_SUMMARY
filter_users = []
begin_time = datetime.fromtimestamp(0, tz=ZoneInfo(TZ))
@@ -160,12 +160,12 @@ async def ai_chat_summary(
# 3️⃣ /summary #YYYYMMDDHHMMSS @user
# 4️⃣ /summary #YYYYMMDDHHMMSS-YYYYMMDDHHMMSS @user
- if matched := re.match(r"^" + PREFIX.AI_SUMMARY + r"\s+#(\d{14})-?(\d{14})?(\s+)?(@\w+)?", info["text"]):
+ if matched := re.match(r"^" + PREFIX.CHAT_SUMMARY + r"\s+#(\d{14})-?(\d{14})?(\s+)?(@\w+)?", info["text"]):
begin_time = datetime.strptime(matched.group(1), "%Y%m%d%H%M%S").replace(tzinfo=ZoneInfo(TZ))
end_time = datetime.strptime(matched.group(2) or end_time.strftime("%Y%m%d%H%M%S"), "%Y%m%d%H%M%S").replace(tzinfo=ZoneInfo(TZ))
filter_users = re.findall(r"@([^\s]+)", info["text"])
# 2️⃣ /summary #interval @user (/summary #4h @user)
- elif matched := re.match(r"^" + PREFIX.AI_SUMMARY + r"\s+#(\d+)([mMhHdD])(\s+)?(@\w+)?", info["text"]):
+ elif matched := re.match(r"^" + PREFIX.CHAT_SUMMARY + r"\s+#(\d+)([mMhHdD])(\s+)?(@\w+)?", info["text"]):
interval = int(matched.group(1))
unit = matched.group(2).lower()
filter_users = re.findall(r"@([^\s]+)", info["text"])
@@ -176,7 +176,7 @@ async def ai_chat_summary(
elif unit == "d":
begin_time = end_time - timedelta(days=interval)
# 1️⃣ /summary #N @user
- elif matched := re.match(r"^" + PREFIX.AI_SUMMARY + r"\s+#(\d+)(\s+)?(@\w+)?", info["text"]):
+ elif matched := re.match(r"^" + PREFIX.CHAT_SUMMARY + r"\s+#(\d+)(\s+)?(@\w+)?", info["text"]):
num_history = min(int(matched.group(1)), MAX_MESSAGE_SUMMARY)
filter_users = re.findall(r"@([^\s]+)", info["text"])
else:
@@ -362,6 +362,6 @@ async def daily_summary(client: Client):
message = Message(
id=rand_number(),
chat=Chat(id=target_chat_id),
- text=f"{PREFIX.AI_SUMMARY} #{duration}h cid={to_int(source_chat_id)}", # type: ignore
+ text=f"{PREFIX.CHAT_SUMMARY} #{duration}h cid={to_int(source_chat_id)}", # type: ignore
)
await ai_chat_summary(client, message, summary_prefix=DAILY_SUMMARY_PREFIX, target_chat=to_int(target_chat_id), reply_msg_id=-1)
src/ai/transcription_summary.py → src/ai/summary.py
@@ -1,11 +1,12 @@
#!/venv/bin/python
# -*- coding: utf-8 -*-
import base64
+import hashlib
import json
import re
-from contextlib import suppress
from pathlib import Path
+from loguru import logger
from pyrogram.types import Chat, Message
from pyrogram.types.messages_and_media.message import Str
@@ -16,21 +17,25 @@ from networking import download_file
from utils import count_subtitles, rand_number
JSON_SCHEMA = {
- "title": "Transcription Summary",
+ "title": "Article Summary",
+ "description": "提炼出文章的核心内容,生成符合指定JSON格式的全文总结、分片内容和思维导图",
"type": "object",
"properties": {
- "abstract": {"title": "全文概览", "description": "需涵盖节目核心主题、关键观点和主要结论,用连贯的一段话概括,避免过于简略", "type": "string"},
+ "abstract": {"title": "全文总结", "description": "需涵盖文章核心主题、关键观点和主要结论,用连贯的一段话概括文章的主要内容,避免过于简略。如果内容过长,也可考虑分段总结。", "type": "string"},
"sections": {
- "description": "将节目划分为不同片段,每个片段需拟定简洁准确的标题,匹配1个相关emoji,并总结该片段的核心内容",
- "title": "片段内容",
+ "description": "将文章划分为不同的片段,每个片段需拟定简洁准确的标题,匹配1个相关emoji,并总结该片段的核心内容",
+ "title": "分片内容",
"type": "array",
"items": {
"type": "object",
"properties": {
"title": {"type": "string", "description": "该片段的标题"},
"emoji": {"type": "string", "description": "匹配该片段的emoji,例如💡、💰、⚠️等"},
- "summary": {"type": "string", "description": "该片段的总结"},
- "start": {"type": "string", "description": "该片段的开始时间, 格式为(HH:MM:SS或MM:SS)"},
+ "summary": {"type": "string", "description": "概括该片段的核心内容"},
+ "start": {
+ "type": ["string", "null"],
+ "description": "如果文章内容为包含时间戳的文字稿(如播客、视频、音频的转录稿),设置此字段为该片段的开始时间, 格式为(HH:MM:SS或MM:SS)。如果没有时间戳,则无需输出此字段。",
+ },
},
},
},
@@ -38,7 +43,7 @@ JSON_SCHEMA = {
"title": "思维导图",
"type": "string",
"pattern": "^flowchart LR",
- "description": "以Mermaid flowchart格式表示的思维导图,以'flowchart LR'开头",
+ "description": "以Mermaid flowchart格式表示的全文思维导图,以'flowchart LR'开头",
},
},
"required": ["abstract", "sections", "mermaid"],
@@ -46,15 +51,15 @@ JSON_SCHEMA = {
}
-async def summarize_transcription(transcription: str, reference: str | None = None, model: str = "gemini") -> dict:
- if count_subtitles(transcription) < 200: # skip short transcription
+async def summarize(article: str, reference: str | None = None, model: str = "gemini") -> dict:
+ if count_subtitles(article) < 200: # skip short article
return {}
res = await ai_text_generation(
"fake-client", # type: ignore
message=Message(
id=rand_number(),
chat=Chat(id=rand_number()),
- text=Str(f"{PREFIX.AI_TEXT_GENERATION} @{model} {transcription.strip()}"),
+ text=Str(f"{PREFIX.AI_TEXT_GENERATION} @{model} {article.strip()}"),
),
gemini_generate_content_config={
"system_instruction": system_prompt(reference),
@@ -66,9 +71,9 @@ async def summarize_transcription(transcription: str, reference: str | None = No
"text": {
"format": {
"type": "json_schema",
- "name": "TranscriptionSummary",
+ "name": "ArticleSummary",
"strict": True,
- "description": "基于提供的转录文稿,提炼出节目的核心内容,生成符合指定JSON格式的内容总结",
+ "description": "提炼出文章的核心内容,生成符合指定JSON格式的全文总结、分片内容和思维导图",
"schema": JSON_SCHEMA,
}
},
@@ -78,25 +83,34 @@ async def summarize_transcription(transcription: str, reference: str | None = No
openai_append_tool_results=False,
silent=True,
)
- with suppress(Exception):
- if not res.get("texts", ""):
- return {}
- summary = json.loads(res.get("texts", "{}"))
+ if not res.get("texts", ""):
+ return {}
+ res["texts"] = await parse_summary(res["texts"]) or res["texts"]
+ return res
+
+
+async def parse_summary(texts: str) -> str:
+ try:
+ summary = json.loads(texts)
mermaid = beautify_mermaid(summary["mermaid"])
mermaid_img = await save_mermaid_jpg_to_r2(mermaid)
- texts = f"{summary['abstract'].strip()}"
+ parsed = f"{summary['abstract'].strip()}"
if mermaid_img:
- texts += f"\n🧠**[思维导图]({mermaid_img})**\n"
- texts += "\n⚡️**章节速览**"
+ parsed += f"\n🧠**[思维导图]({mermaid_img})**\n"
+ parsed += "\n⚡️**章节速览**"
for section in summary["sections"]:
- texts += f"\n{section['emoji']}**{section['title']}** [{section['start']}]\n{section['summary']}"
- res["texts"] = texts
- return res
- return {}
+ parsed += f"\n{section['emoji']}**{section['title']}**"
+ if section.get("start"):
+ parsed += f" [{section['start']}]"
+ parsed += f"\n{section['summary']}"
+ except Exception as e:
+ logger.error(f"Error parsing summary: {e}")
+ return ""
+ return parsed
def system_prompt(reference: str | None = None) -> str:
- prompt = "你是一位专业的节目总结大师,任务是基于提供的转录文稿,提炼出节目的核心内容,生成符合指定JSON格式的内容总结。"
+ prompt = "你是一位专业的文章总结大师,任务是基于用户提供的文本,提炼出文章的核心内容,生成符合指定JSON格式的全文总结、分片内容和思维导图。"
if reference:
prompt += f"\n{reference}"
return prompt.strip()
@@ -125,7 +139,7 @@ def beautify_mermaid(mermaid: str) -> str:
async def save_mermaid_jpg_to_r2(mermaid: str) -> str:
b64_str = base64.urlsafe_b64encode(mermaid.encode("utf-8")).decode("ascii")
- save_path = Path(DOWNLOAD_DIR) / f"{rand_number()}.jpg"
+ save_path = Path(DOWNLOAD_DIR) / f"{hashlib.sha256(mermaid.encode()).hexdigest()}.jpg"
await download_file(f"https://mermaid.ink/img/{b64_str}?type=jpeg&theme=forest&width=2160", path=save_path, suffix=".jpg")
if save_path.is_file():
r2_key = f"TTL/365d/{save_path.name}"
src/messages/help.py
@@ -55,6 +55,7 @@ def social_media_help(chat_id: int | str, ctype: str, prefix: str):
msg += f"\n📃**提取字幕**: `{PREFIX.SUBTITLE}` + B站或油管链接"
if permission["history"]:
msg += f"\n🗣**查询聊天记录**: 发送 `{PREFIX.HISTORY}` 查看详细教程"
+ msg += f"\n📖**总结聊天记录**: 发送 `{PREFIX.CHAT_SUMMARY}` 查看详细教程"
if permission["wget"]:
msg += f"\n⏬**下载文件**: `{PREFIX.WGET}` + URL"
if permission["tmdb"]:
src/messages/main.py
@@ -193,28 +193,29 @@ async def preview_social_media(
"""
# these commands are handled in `process_message`
ignore_prefix = [
- PREFIX.ASR,
+ PREFIX.AI_IMG_GENERATION,
PREFIX.AI_SUMMARY,
PREFIX.AI_TEXT_GENERATION,
- PREFIX.AI_IMG_GENERATION,
+ PREFIX.ASR,
PREFIX.AUDIO,
+ PREFIX.CHAT_SUMMARY,
PREFIX.COMBINATION,
+ PREFIX.CONVERT_TO_SC,
+ PREFIX.CONVERT_TO_TC,
PREFIX.CONVERT,
PREFIX.CRYPTO,
PREFIX.DANMU,
+ PREFIX.FAYAN,
PREFIX.OCR,
PREFIX.PRICE,
PREFIX.SEARCH_GOOGLE,
PREFIX.SEARCH_YOUTUBE,
PREFIX.STOCK,
PREFIX.SUBTITLE,
+ PREFIX.TMDB,
+ PREFIX.TTS,
PREFIX.VOICE,
PREFIX.WGET,
- PREFIX.FAYAN,
- PREFIX.TTS,
- PREFIX.TMDB,
- PREFIX.CONVERT_TO_SC,
- PREFIX.CONVERT_TO_TC,
FAVORITE.SAVE_PREFIX,
FAVORITE.SEND_PREFIX,
]
src/podcast/main.py
@@ -30,7 +30,7 @@ from loguru import logger
from pyrogram.client import Client
from pyrogram.types import Chat, Message
-from ai.transcription_summary import summarize_transcription
+from ai.summary import summarize
from config import AI, PODCAST, PROXY
from database.github import gh_clean_assets
from database.r2 import get_cf_r2, set_cf_r2
@@ -79,7 +79,7 @@ async def summary_pods(client: Client):
markdown_desc = remove_consecutive_newlines(markdown_desc, newline_level=2)
prompt = f"该转录稿对应于播客栏目《{feed_title}》的一期节目,节目详情如下:\n节目标题: {entry['title']}\n节目播出日期: {pubdate}"
prompt += f"\n节目时长: {duration}\n节目简介: {markdown_desc}"
- ai_res = await summarize_transcription(transcripts, reference=prompt, model=AI.PODCAST_SUMMARY_MODEL_ALIAS)
+ ai_res = await summarize(transcripts, reference=prompt, model=AI.PODCAST_SUMMARY_MODEL_ALIAS)
telegraph_content = ""
if ai_res.get("texts"):
telegraph_content += f"\n🤖**{ai_res['model_name']}总结**:\n{ai_res['texts']}"
src/subtitles/subtitle.py
@@ -10,7 +10,7 @@ from loguru import logger
from pyrogram.client import Client
from pyrogram.types import Message
-from ai.transcription_summary import summarize_transcription
+from ai.summary import summarize
from asr.voice_recognition import asr_file
from config import AI, ASR, DOWNLOAD_DIR, PREFIX, READING_SPEED, TEXT_LENGTH, cache
from messages.parser import parse_msg
@@ -139,7 +139,7 @@ async def get_subtitle(
prompt += f"节目标题: {vinfo['title']}\n发布日期: {vinfo['pubdate']}\n"
if description.strip():
prompt += f"节目简介: {description}"
- res = await summarize_transcription(subtitles, reference=prompt, model=summary_model_id)
+ res = await summarize(subtitles, reference=prompt, model=summary_model_id)
if res.get("texts"):
await send2tg(client, subtitle_msg, texts=res["prefix"] + blockquote(res["texts"]), **kwargs)
with contextlib.suppress(Exception):
src/ytdlp/main.py
@@ -11,7 +11,7 @@ from loguru import logger
from pyrogram.client import Client
from pyrogram.types import Message
-from ai.transcription_summary import summarize_transcription
+from ai.summary import summarize
from config import AI, ASR, CAPTION_LENGTH, DB, MAX_FILE_BYTES, READING_SPEED, YTDLP_RE_ENCODING_MAX_FILE_BYTES
from database.database import get_db
from messages.database import copy_messages_from_db, save_messages
@@ -139,7 +139,7 @@ async def preview_ytdlp(
# get ai summary
summary = ""
if subtitles and true(ytdlp_send_summary):
- aires = await summarize_transcription(sub, reference=generate_prompt(info), model=summary_model_id)
+ aires = await summarize(sub, reference=generate_prompt(info), model=summary_model_id)
if aires.get("texts"):
summary = f"🤖<b>{aires['model_name']}总结:</b>\n{markdown.markdown(aires['texts'])}\n"
src/config.py
@@ -97,6 +97,7 @@ class PREFIX:
PRICE = os.getenv("PREFIX_PRICE", "/price").lower() # unify crypto, stock
CRYPTO = os.getenv("PREFIX_CRYPTO", "/crypto").lower() # crypto only
STOCK = os.getenv("PREFIX_STOCK", "/stock").lower() # stock only
+ CHAT_SUMMARY = os.getenv("PREFIX_CHAT_SUMMARY", "/chatsum").lower()
COMBINATION = os.getenv("PREFIX_COMBINATION", "/combine").lower()
VOICE = os.getenv("PREFIX_VOICE", "/voice").lower()
SEARCH_YOUTUBE = os.getenv("PREFIX_SEARCH_YOUTUBE", "/youtube, /ytb").lower()