bennybot/src/summarize/utils.py at main

  1#!/venv/bin/python
  2# -*- coding: utf-8 -*-
  3import base64
  4import json
  5import zlib
  6from pathlib import Path
  7
  8from config import DB, DOWNLOAD_DIR
  9from database.r2 import set_cf_r2
 10from networking import download_file, shorten_url
 11from schema import MindMap, get_schema
 12from utils import digest, read_text
 13
 14
 15def system_prompt(sys: str | None = None) -> str:
 16    prompt = """你是一位专业的内容提炼大师，任务是基于用户提供的资料，生成用户无需阅读完整原文档就能清晰理解主要事件、观点、结论的内容，生成符合指定JSON格式的全文总结、分片内容和思维导图。
 17
 18## 核心规则
 191. 语言要求：无论原资料使用何种语言（中文、英文或其他语言），输出的所有内容**需以简体中文为主**，包括JSON结构中的文本、总结、分片内容及思维导图节点；若资料中存在特定领域的专业术语（如技术、学术等领域的外文术语），可保留原外文术语，无需强制译为中文，避免强行翻译导致信息失真。
 202. 信息忠实性：提炼内容需完全忠实于原资料，不得添加个人观点、推测或无关信息。
 213. 广告过滤规则：若资料中包含与主内容完全无关的广告（如播客/B站视频的植入广告、商业推广等，特征为：内容独立于节目主题、无实质信息关联、去掉后不影响对主内容的理解），需直接忽略该部分内容，不得将广告信息纳入任何提炼结果中。
 22"""
 23    if sys:
 24        prompt += f"\n{sys}"
 25    return prompt.strip()
 26
 27
 28def parse_summary_sources(sources: list[dict], *, mermaid: bool = False) -> tuple[str, str, dict]:
 29    r"""Parse the sources to texts, transcripts, schema.
 30
 31    Args:
 32        sources (list[dict] | None): The sources to summary.
 33        # text
 34        {"type": "system_prompt", "text": "This is system prompt."}
 35        {"type": "text", "text": "Hello."}
 36        {"type": "text", "path": "/path/to/file.txt"}
 37        {"type": "transcripts", "text": "[00:00] Hello\n[00:01] a sentence"}
 38
 39        # media
 40        {"type": "image", "path": "/path/to/image.jpg", "mime_type": "image/jpeg"}
 41        {"type": "video", "path": "/path/to/video.mp4", "mime_type": "video/mp4"}
 42        {"type": "audio", "path": "/path/to/audio.mp3", "mime_type": "audio/mpeg"}
 43
 44        # file
 45        {"type": "file", "path": "/path/to/file.pdf", "mime_type": "application/pdf"}
 46
 47        # special
 48        {"type": "youtube", "url": "https://www.youtube.com/watch?v=videoid"}
 49    """
 50    sys_items = [x.get("text", "") for x in sources if x["type"] == "system_prompt"]
 51    sys_prompt = system_prompt("\n".join(sys_items))
 52    if not mermaid:
 53        sys_prompt = sys_prompt.replace("和思维导图", "").replace("及思维导图节点", "")
 54    transcripts = next((x.get("text", "") for x in sources if x["type"] == "transcripts"), "")
 55    texts = ""
 56    for source in sources:
 57        if source["type"] in ["transcripts", "text"] and source.get("text"):
 58            texts += source["text"] + "\n"
 59        elif source["type"] == "text" and source.get("path"):
 60            texts += read_text(source["path"]) + "\n"
 61    media_items = [x for x in sources if x["type"] in ["image", "video", "audio", "file", "youtube"]]
 62    schema = get_schema("content_extraction")
 63    if not mermaid:
 64        schema["properties"].pop("mindmap")
 65    if "mindmap" in schema["properties"] and "mindmap" not in schema["required"]:
 66        schema["required"].append("mindmap")
 67    return (
 68        texts.strip(),
 69        transcripts.strip(),
 70        {
 71            "gemini_generate_content_config": {"system_instruction": sys_prompt, "responseMimeType": "application/json", "responseJsonSchema": schema},
 72            "openai_responses_config": {
 73                "instructions": sys_prompt,
 74                "text": {
 75                    "format": {
 76                        "type": "json_schema",
 77                        "name": "ContentExtraction",
 78                        "strict": True,
 79                        "description": "精准提炼资料的核心主题、关键观点、主要结论及各片段核心内容，确保输出内容全面覆盖资料的关键信息，用户仅通过总结即可掌握信息全貌。",
 80                        "schema": schema,
 81                    }
 82                },
 83            },
 84            "openai_system_prompt": sys_prompt,
 85            "openai_completions_config": {
 86                "response_format": {
 87                    "type": "json_schema",
 88                    "strict": True,
 89                    "json_schema": {
 90                        "name": "ContentExtraction",
 91                        "schema": schema,
 92                        "strict": True,
 93                    },
 94                }
 95            },
 96            "additional_contexts": media_items,
 97            "gemini_append_grounding": False,
 98            "openai_enable_tool_call": False,
 99            "openai_append_tool_results": False,
100            "silent": True,
101        },
102    )
103
104
def generate_mermaid(mindmap: MindMap) -> str:
    """Generate Mermaid flowchart code from a MindMap.

    Every node gets an id that cannot clash with any other node:

    * root: ``A``
    * topic: ``B`` .. ``Z`` for the first 25 topics, then ``T<index>``
    * leaf of a topic: ``<topic>_L<n>``
    * subtopic: ``<topic>_S<n>``
    * leaf of a subtopic: ``<topic>_S<n>_L<m>``

    Args:
        mindmap: Object exposing ``main_title`` and ``topics``. Each topic
            exposes ``title``, ``leafs`` and ``sub_tocpics`` (spelled exactly
            as it is accessed here); each subtopic exposes ``title`` and
            ``leafs``. Missing (``None``) collections are treated as empty.

    Returns:
        Mermaid code: a fixed config header followed by ``graph LR`` and one
        edge per line. Without topics only the header is returned.

    Example:
    graph LR
        A["mindmap.main_title"] --> B["topic.title"]
        B --> B_L1["topic.leaf"]
        B --> B_S1["subtopic.title"]
        B_S1 --> B_S1_L1["subtopic.leaf"]
        B_S1 --> B_S1_L2["subtopic.leaf"]
        A --> C["topic.title"]
        C --> C_S1["subtopic.title"]
    """
    indent = "    "  # four spaces

    def quote(label: str) -> str:
        # Wrap the label in double quotes and escape embedded ones.
        return '"' + label.replace('"', "&quot;") + '"'

    def topic_id(index: int) -> str:
        # "A" is the root, so topics start at "B". After "Z" the ASCII table
        # continues with "[", which is not a usable id, hence the fallback.
        return chr(ord("B") + index) if index < 25 else f"T{index}"

    lines = ["---", "config:", "  theme: neo", "  look: neo", "---", "graph LR"]
    for idx_topic, topic in enumerate(mindmap.topics or []):
        tid = topic_id(idx_topic)
        # The root label is declared once, on the first edge that mentions it.
        root = f"A[{quote(mindmap.main_title)}]" if idx_topic == 0 else "A"
        lines.append(f"{indent}{root} --> {tid}[{quote(topic.title)}]")

        # Leafs attached directly to the topic.
        for n, topic_leaf in enumerate(topic.leafs or [], start=1):
            lines.append(f"{indent}{tid} --> {tid}_L{n}[{quote(topic_leaf)}]")

        # Subtopics and their leafs.
        for n, sub in enumerate(topic.sub_tocpics or [], start=1):
            sid = f"{tid}_S{n}"
            lines.append(f"{indent}{tid} --> {sid}[{quote(sub.title)}]")
            for m, subleaf in enumerate(sub.leafs or [], start=1):
                lines.append(f"{indent}{sid} --> {sid}_L{m}[{quote(subleaf)}]")

    return "\n".join(lines)
149
150
async def publish_mermaid(mermaid: str) -> tuple[str, str]:
    """Render Mermaid code to a JPEG, store it in R2 and build share links.

    The image is requested from mermaid.ink into a temporary file under
    ``DOWNLOAD_DIR``. A mermaid.live "pako" view link is built from the code
    (annotated with the image URL as a ``%%`` comment) and shortened. If the
    temporary file exists it is uploaded with ``set_cf_r2`` and then removed.

    Args:
        mermaid: Mermaid source code; expected to contain a ``graph LR`` line,
            before which the image-URL comment is inserted.

    Returns:
        ``(image_url, pako_url)`` when a rendered image file was available,
        otherwise ``("", "")``.
    """
    encoded_code = base64.urlsafe_b64encode(mermaid.encode("utf-8")).decode("ascii")
    save_path = Path(DOWNLOAD_DIR) / f"{digest(mermaid)}.jpg"  # noqa: S324
    r2_key = f"TTL/365d/{save_path.name}"
    img_url = f"{DB.CF_R2_PUBLIC_URL}/{r2_key}"

    render_url = f"https://mermaid.ink/img/{encoded_code}?type=jpeg&theme=forest&width=2160"
    # NOTE(review): assumes download_file returns a truthy value on success.
    if await download_file(render_url, path=save_path, suffix=".jpg"):
        img_url = await shorten_url(img_url, alias=str(digest(mermaid, 16)))

    # Embed the image URL as a Mermaid comment so the shared code links back to it.
    annotated = mermaid.replace("\ngraph LR", f"\n%% {img_url}\ngraph LR")

    # generate pako url for mermaid image: compact JSON -> zlib -> url-safe base64
    state_json = json.dumps({"code": annotated.strip()}, separators=(",", ":"))
    compressed = zlib.compress(state_json.encode("utf-8"), level=9)
    pako_b64_str = base64.urlsafe_b64encode(compressed).decode("utf-8").rstrip("=")
    pako_url = await shorten_url(f"https://mermaid.live/view#pako:{pako_b64_str}", alias=str(digest(pako_b64_str, 16)))

    if not save_path.is_file():
        return "", ""
    try:
        await set_cf_r2(r2_key, data=save_path.read_bytes(), mime_type="image/jpeg", silent=True)
    finally:
        # Remove the temporary file even when the upload raises.
        save_path.unlink(missing_ok=True)
    return img_url, pako_url