main
1#!/venv/bin/python
2# -*- coding: utf-8 -*-
3import base64
4import json
5import zlib
6from pathlib import Path
7
8from config import DB, DOWNLOAD_DIR
9from database.r2 import set_cf_r2
10from networking import download_file, shorten_url
11from schema import MindMap, get_schema
12from utils import digest, read_text
13
14
15def system_prompt(sys: str | None = None) -> str:
16 prompt = """你是一位专业的内容提炼大师,任务是基于用户提供的资料,生成用户无需阅读完整原文档就能清晰理解主要事件、观点、结论的内容,生成符合指定JSON格式的全文总结、分片内容和思维导图。
17
18## 核心规则
191. 语言要求:无论原资料使用何种语言(中文、英文或其他语言),输出的所有内容**需以简体中文为主**,包括JSON结构中的文本、总结、分片内容及思维导图节点;若资料中存在特定领域的专业术语(如技术、学术等领域的外文术语),可保留原外文术语,无需强制译为中文,避免强行翻译导致信息失真。
202. 信息忠实性:提炼内容需完全忠实于原资料,不得添加个人观点、推测或无关信息。
213. 广告过滤规则:若资料中包含与主内容完全无关的广告(如播客/B站视频的植入广告、商业推广等,特征为:内容独立于节目主题、无实质信息关联、去掉后不影响对主内容的理解),需直接忽略该部分内容,不得将广告信息纳入任何提炼结果中。
22"""
23 if sys:
24 prompt += f"\n{sys}"
25 return prompt.strip()
26
27
def parse_summary_sources(sources: list[dict], *, mermaid: bool = False) -> tuple[str, str, dict]:
    r"""Split summary sources into prompt text, transcripts and model request configs.

    Args:
        sources: The sources to summarize (``None`` is treated as empty).
            Each item is a dict with a ``"type"`` key:

            # text
            {"type": "system_prompt", "text": "This is system prompt."}
            {"type": "text", "text": "Hello."}
            {"type": "text", "path": "/path/to/file.txt"}
            {"type": "transcripts", "text": "[00:00] Hello\n[00:01] a sentence"}

            # media
            {"type": "image", "path": "/path/to/image.jpg", "mime_type": "image/jpeg"}
            {"type": "video", "path": "/path/to/video.mp4", "mime_type": "video/mp4"}
            {"type": "audio", "path": "/path/to/audio.mp3", "mime_type": "audio/mpeg"}

            # file
            {"type": "file", "path": "/path/to/file.pdf", "mime_type": "application/pdf"}

            # special
            {"type": "youtube", "url": "https://www.youtube.com/watch?v=videoid"}
        mermaid: When True the ``mindmap`` schema property is kept and marked
            required; when False it is removed from the schema and the
            mind-map wording is removed from the system prompt.

    Returns:
        ``(texts, transcripts, config)`` where ``texts`` is every text and
        transcripts body joined by newlines, ``transcripts`` is the text of
        the first ``transcripts`` source, and ``config`` holds the Gemini /
        OpenAI request settings plus the media sources under
        ``"additional_contexts"``.

    Raises:
        KeyError: If a source has no ``"type"`` key.
    """
    import copy  # local import: only needed for the defensive schema copy below

    sys_items: list[str] = []
    text_parts: list[str] = []
    media_items: list[dict] = []
    transcripts: str | None = None
    media_types = {"image", "video", "audio", "file", "youtube"}

    for source in sources or []:
        kind = source["type"]
        text = source.get("text")
        if kind == "system_prompt":
            sys_items.append(text or "")
        elif kind in ("transcripts", "text"):
            # Only the first transcripts source is reported separately,
            # even when its text is empty.
            if kind == "transcripts" and transcripts is None:
                transcripts = text or ""
            if text:
                text_parts.append(text)
            elif kind == "text" and source.get("path"):
                text_parts.append(read_text(source["path"]))
        elif kind in media_types:
            media_items.append(source)

    sys_prompt = system_prompt("\n".join(sys_items))
    if not mermaid:
        sys_prompt = sys_prompt.replace("和思维导图", "").replace("及思维导图节点", "")

    # get_schema may hand back a shared object (not visible from here); work
    # on a copy so the edits below cannot leak into later calls.
    schema = copy.deepcopy(get_schema("content_extraction"))
    properties = schema["properties"]
    if mermaid:
        if "mindmap" in properties:
            required = schema.setdefault("required", [])
            if "mindmap" not in required:
                required.append("mindmap")
    else:
        # Tolerate a schema without the property and keep ``required``
        # consistent with ``properties``.
        properties.pop("mindmap", None)
        required = schema.get("required")
        if required and "mindmap" in required:
            required.remove("mindmap")

    config = {
        "gemini_generate_content_config": {
            "system_instruction": sys_prompt,
            "responseMimeType": "application/json",
            "responseJsonSchema": schema,
        },
        "openai_responses_config": {
            "instructions": sys_prompt,
            "text": {
                "format": {
                    "type": "json_schema",
                    "name": "ContentExtraction",
                    "strict": True,
                    "description": "精准提炼资料的核心主题、关键观点、主要结论及各片段核心内容,确保输出内容全面覆盖资料的关键信息,用户仅通过总结即可掌握信息全貌。",
                    "schema": schema,
                }
            },
        },
        "openai_system_prompt": sys_prompt,
        "openai_completions_config": {
            "response_format": {
                "type": "json_schema",
                "strict": True,
                "json_schema": {
                    "name": "ContentExtraction",
                    "schema": schema,
                    "strict": True,
                },
            }
        },
        "additional_contexts": media_items,
        "gemini_append_grounding": False,
        "openai_enable_tool_call": False,
        "openai_append_tool_results": False,
        "silent": True,
    }
    return "\n".join(text_parts).strip(), (transcripts or "").strip(), config
103
104
def generate_mermaid(mindmap: MindMap) -> str:
    """Generate Mermaid flowchart code (``graph LR``) from a MindMap.

    Every node gets a unique id so that a topic's own leafs, its sub-topics
    and the sub-topics' leafs can never be merged into one node by Mermaid.

    Args:
        mindmap: Object exposing ``main_title`` and ``topics``; each topic
            exposes ``title``, ``leafs`` and ``sub_tocpics``, and each
            sub-topic exposes ``title`` and ``leafs``. The list attributes
            may be ``None``.

    Returns:
        Mermaid code. With no topics, only the root node is emitted.

    Example:
        graph LR
            A[mindmap.main_title] --> B[topic.title]
            A --> C[topic.title]

            B --> B_L1[topic.leaf]

            B --> B_S1[subtopic.title]
            B --> B_S2[subtopic.title]

            B_S1 --> B_S1_L1[subtopic.leaf]
            B_S1 --> B_S1_L2[subtopic.leaf]
    """

    def topic_id(index: int) -> str:
        # "A" is the root, so topics use B..Z (25 ids). Beyond that
        # chr(index + 66) would yield punctuation such as "[", which breaks
        # the diagram, so switch to numbered ids.
        return chr(index + 66) if index < 25 else f"N{index}"

    def quote(label: str) -> str:
        # Wrap the label in double quotes; embedded double quotes are written
        # with Mermaid's entity code so they do not terminate the label.
        return '"' + label.replace('"', "#quot;") + '"'

    indent = "    "  # four spaces
    lines = ["---\nconfig:\n theme: neo\n look: neo\n---\ngraph LR"]
    root = f"A[{quote(mindmap.main_title)}]"
    topics = mindmap.topics or []
    if not topics:
        # Still emit the root so the main title is not lost.
        lines.append(indent + root)

    for idx_topic, topic in enumerate(topics):
        tid = topic_id(idx_topic)
        # The root label is declared once, on the first edge.
        origin = root if idx_topic == 0 else "A"
        lines.append(f"{indent}{origin} --> {tid}[{quote(topic.title)}]")

        # topic leafs ("_L" suffix keeps them apart from sub-topics)
        for idx_leaf, leaf in enumerate(topic.leafs or [], start=1):
            lines.append(f"{indent}{tid} --> {tid}_L{idx_leaf}[{quote(leaf)}]")

        # sub-topics; the attribute is spelled "sub_tocpics" by the schema
        for idx_sub, sub in enumerate(topic.sub_tocpics or [], start=1):
            sid = f"{tid}_S{idx_sub}"
            lines.append(f"{indent}{tid} --> {sid}[{quote(sub.title)}]")
            # sub-topic leafs
            for idx_subleaf, subleaf in enumerate(sub.leafs or [], start=1):
                lines.append(f"{indent}{sid} --> {sid}_L{idx_subleaf}[{quote(subleaf)}]")

    return "\n".join(lines)
149
150
async def publish_mermaid(mermaid: str) -> tuple[str, str]:
    """Render Mermaid code to a JPEG via mermaid.ink, store it in R2 and build share links.

    Args:
        mermaid: Mermaid source code. When the image download succeeds, a
            ``%% <image url>`` comment is inserted before its ``graph LR``
            line for the mermaid.live link.

    Returns:
        ``(image_url, pako_url)``; ``("", "")`` when no rendered image file
        is available locally after the download attempt.
    """
    b64_str = base64.urlsafe_b64encode(mermaid.encode("utf-8")).decode("ascii")
    save_path = Path(DOWNLOAD_DIR) / f"{digest(mermaid)}.jpg"  # noqa: S324
    r2_key = f"TTL/365d/{save_path.name}"
    img_url = f"{DB.CF_R2_PUBLIC_URL}/{r2_key}"
    render_url = f"https://mermaid.ink/img/{b64_str}?type=jpeg&theme=forest&width=2160"
    downloaded = await download_file(render_url, path=save_path, suffix=".jpg")

    # Without an image there is nothing to publish; bail out before creating
    # short links that would only be thrown away.
    if not save_path.is_file():
        return "", ""

    try:
        if downloaded:
            img_url = await shorten_url(img_url, alias=str(digest(mermaid, 16)))
            mermaid = mermaid.replace("\ngraph LR", f"\n%% {img_url}\ngraph LR")
        # generate pako url for mermaid image: zlib-compressed JSON,
        # url-safe base64 without padding
        json_str = json.dumps({"code": mermaid.strip()}, separators=(",", ":"))
        compressed_bytes = zlib.compress(json_str.encode("utf-8"), level=9)
        pako_b64_str = base64.urlsafe_b64encode(compressed_bytes).decode("utf-8").rstrip("=")
        pako_url = await shorten_url(f"https://mermaid.live/view#pako:{pako_b64_str}", alias=str(digest(pako_b64_str, 16)))
        await set_cf_r2(r2_key, data=save_path.read_bytes(), mime_type="image/jpeg", silent=True)
    finally:
        # Remove the temporary image even when shortening or the upload fails.
        save_path.unlink(missing_ok=True)
    return img_url, pako_url