main
1from datetime import UTC, datetime
2from typing import Literal
3
4import jsonref
5from pydantic import BaseModel, ConfigDict, Field
6
7
class SubTopic(BaseModel):
    """思维导图二级话题."""

    # Second-level topic of the mind map: a title plus a flat list of leaf
    # strings.
    # NOTE: the class docstring is kept verbatim on purpose -- pydantic
    # copies it into the generated JSON schema as the model "description".

    # Keys that are not declared below are dropped instead of raising.
    model_config = ConfigDict(extra="ignore")

    # Required heading of this second-level topic.
    title: str = Field(
        description="二级话题标题",
    )
    # Leaf node texts; an omitted value falls back to an empty list.
    leafs: list[str] = Field(
        default=[],
        description="叶子节点内容列表",
    )
15
16
class Topic(BaseModel):
    """思维导图一级话题."""

    # First-level topic of the mind map. It may carry nested SubTopic
    # entries, direct leaf strings, both, or neither (both default to None).
    # NOTE: the class docstring is kept verbatim on purpose -- pydantic
    # copies it into the generated JSON schema as the model "description".

    # Keys that are not declared below are dropped instead of raising.
    model_config = ConfigDict(extra="ignore")

    # Required heading of this first-level topic.
    title: str = Field(
        description="一级话题标题",
    )
    # NOTE(review): "sub_tocpics" looks like a misspelling of "sub_topics".
    # The name is part of the public schema, so it is left untouched here;
    # confirm with producers/consumers before renaming.
    sub_tocpics: list[SubTopic] | None = Field(
        default=None,
        description="二级话题列表(可为空)",
    )
    # Leaf node texts attached directly to this topic.
    leafs: list[str] | None = Field(
        default=None,
        description="一级话题的叶子节点内容列表(可为空)",
    )
25
26
class MindMap(BaseModel):
    """思维导图根节点."""

    # Root of the mind map: one main title and the list of first-level
    # topics underneath it. Both fields are required.
    # NOTE: the class docstring is kept verbatim on purpose -- pydantic
    # copies it into the generated JSON schema as the model "description".

    # Keys that are not declared below are dropped instead of raising.
    model_config = ConfigDict(extra="ignore")

    # Title shown on the root node.
    main_title: str = Field(
        description="根节点标题",
    )
    # First-level topics hanging off the root.
    topics: list[Topic] = Field(
        description="一级话题列表",
    )
34
35
class Section(BaseModel):
    """分片内容详情."""

    # One logical segment of the source material: title, emoji, detailed
    # content, and an optional start timestamp.
    # NOTE: the class docstring is kept verbatim on purpose -- pydantic
    # copies it into the generated JSON schema as the model "description".

    # String values have surrounding whitespace stripped during validation.
    model_config = ConfigDict(str_strip_whitespace=True)

    # Heading of the segment.
    title: str = Field(
        description="该片段的标题",
    )
    # Emoji chosen to match the segment.
    emoji: str = Field(
        description="匹配该片段的emoji,例如💡、💰、⚠️等",
    )
    # Detailed body text of the segment.
    content: str = Field(
        description="详细说明该片段的核心事件、具体观点或结论,禁止仅用1-2句话泛泛概括,需传递足够细节。",
    )
    # Start timestamp as a string; None when the material has no timestamps.
    start: str | None = Field(
        default=None,
        description="如果资料为含时间戳的文字稿(如播客/视频/音频的转录稿),需补充start字段HH:MM:SS或MM:SS;无时间戳则无需输出start字段。",
    )
44
45
class ContentExtraction(BaseModel):
    # Structured summary of a document: an overall overview, a list of
    # Section entries, and an optional MindMap.
    # A `#` comment is used instead of a docstring so that no extra
    # "description" key appears in the generated JSON schema.

    # Strip whitespace around strings and reject undeclared keys.
    model_config = ConfigDict(
        str_strip_whitespace=True,
        extra="forbid",
    )

    # Whole-document summary text.
    overview: str = Field(
        title="全文总结",
        description="需涵盖资料核心主题、关键观点和主要结论,采用连贯语言表述,若内容复杂可分段,但需逻辑清晰。禁止过于简略(如仅用一句话概括长文档),确保信息密度足够支撑用户理解。",
    )

    # Ordered segments the document was split into.
    sections: list[Section] = Field(
        title="分片内容",
        description="需将文档划分为逻辑连贯的片段(如按章节、主题、时间线划分);每个片段需拟定**简洁准确**的标题(体现片段核心)、匹配1个相关emoji;并说明该片段的核心内容。",
    )

    # Optional mind map; None when not provided.
    mindmap: MindMap | None = Field(
        default=None,
        description="思维导图",
    )
57
58
class Sentence(BaseModel):
    # A single transcript sentence together with its start timestamp.
    # A `#` comment is used instead of a docstring so that no extra
    # "description" key appears in the generated JSON schema.

    # Strip whitespace around strings and reject undeclared keys.
    model_config = ConfigDict(
        str_strip_whitespace=True,
        extra="forbid",
    )

    # Text of the sentence.
    content: str = Field(
        description="句子内容",
    )
    # Start time as a string (HH:MM:SS or MM:SS per the field description).
    start: str = Field(
        description="句子开始时间,格式为HH:MM:SS或MM:SS",
    )
63
64
class AIPage(BaseModel):
    # Page-level record bundling source metadata (title, url, author,
    # description, date) with the AI summary, the transcript, and the
    # mind-map image/code URLs. Every field has a default.
    # A `#` comment is used instead of a docstring so that no extra
    # "description" key appears in the generated JSON schema.

    # Strip whitespace around strings and reject undeclared keys.
    model_config = ConfigDict(
        str_strip_whitespace=True,
        extra="forbid",
    )

    # Page title; falls back to the literal "AI导读".
    title: str = Field(
        default="AI导读",
        description="标题",
    )
    # Link to the original material.
    url: str | None = Field(
        default=None,
        description="原始链接",
    )
    author: str | None = Field(
        default=None,
        description="作者",
    )
    # Original description; accepted either as plain text or as a dict.
    description: str | dict | None = Field(
        default=None,
        description="原始描述",
    )
    # When omitted, the factory fills in the current timezone-aware UTC time;
    # an explicit None is still accepted.
    date: datetime | None = Field(
        default_factory=lambda: datetime.now(tz=UTC),
        description="发布日期",
    )
    # Structured AI summary of the material.
    summary: ContentExtraction | None = Field(
        default=None,
        description="AI总结",
    )
    # Transcript as one raw string or as a list of timestamped sentences.
    transcripts: str | list[Sentence] | None = Field(
        default=None,
        description="转录稿",
    )
    mermaid_img: str | None = Field(
        default=None,
        description="思维导图图片URL",
    )
    mermaid_url: str | None = Field(
        default=None,
        description="思维导图代码URL",
    )
76
77
class Correction(BaseModel):
    """转录错误修正项.

    只包含转录错误项,正确项不显示。
    """

    # One corrected transcript entry: the index of the entry in the original
    # transcript plus the corrected text.
    # NOTE: the class docstring is kept verbatim on purpose -- pydantic
    # copies it into the generated JSON schema as the model "description".

    # Strip whitespace around strings and reject undeclared keys.
    model_config = ConfigDict(
        str_strip_whitespace=True,
        extra="forbid",
    )

    # Index of the corrected item within the original transcript.
    idx: int = Field(
        title="索引",
        description="修正项在原始转录稿中的索引",
    )
    # Replacement text for that item.
    corrected: str = Field(
        description="修正后的文本",
    )
87
88
class TranscriptionCorrection(BaseModel):
    """转录错误修正."""

    # Container for the list of Correction entries.
    # NOTE: the class docstring is kept verbatim on purpose -- pydantic
    # copies it into the generated JSON schema as the model "description".

    # Strip whitespace around strings and reject undeclared keys.
    model_config = ConfigDict(
        str_strip_whitespace=True,
        extra="forbid",
    )

    # All corrections to apply; required.
    corrections: list[Correction] = Field(
        title="转录错误修正",
        description="转录错误修正项列表",
    )
94
95
96def get_schema(name: Literal["content_extraction", "transcription_correction"] = "content_extraction") -> dict:
97 if name == "content_extraction":
98 schema = ContentExtraction.model_json_schema()
99 elif name == "transcription_correction":
100 schema = TranscriptionCorrection.model_json_schema()
101 else:
102 return {}
103 inlined_schema = jsonref.replace_refs(schema, proxies=False)
104 inlined_schema.pop("$defs", None)
105 return inlined_schema
106
107
if __name__ == "__main__":
    # Manual smoke check: print both supported schemas, in this order.
    for schema_name in ("content_extraction", "transcription_correction"):
        print(get_schema(schema_name))