bennybot/src/publish.py at main

  1#!/usr/bin/env python
  2# -*- coding: utf-8 -*-
  3import io
  4import re
  5import tempfile
  6from datetime import UTC
  7from pathlib import Path
  8from typing import Literal
  9from urllib.parse import quote_plus
 10from zoneinfo import ZoneInfo
 11
 12import anyio
 13from glom import glom
 14from httpx import AsyncClient
 15from loguru import logger
 16from telegraph.aio import Telegraph
 17from telegraph.utils import html_to_nodes
 18
 19from asr.utils import split_transcripts
 20from config import DB, DOWNLOAD_DIR, TOKEN, TZ
 21from database.r2 import set_cf_r2
 22from networking import download_file
 23from schema import AIPage, Section
 24from utils import convert2html, convert2md, digest, nowdt, rand_string, remove_consecutive_newlines
 25
 26
 27def adjust_tags(s: str | None) -> str:
 28    # Revise Telegraph Tags
 29    s = str(s).replace("<h1>", "<h3>").replace("</h1>", "</h3>")
 30    return s.replace("<h2>", "<h3>").replace("</h2>", "</h3>")
 31
 32
 33async def publish_telegraph(
 34    title: str | None = None,
 35    html: str | None = None,
 36    nodes: list[dict] | None = None,
 37    author: str | None = None,
 38    url: str | None = None,
 39    aipage: AIPage | None = None,
 40    ttl: str | None = None,  # 12h, 7d, 1M, ...
 41    *,
 42    fallback_r2: bool = True,
 43) -> str:
 44    """Publish to Telegraph.
 45
 46    Available tags:
 47    a, aside, b, blockquote, br, code, em, figcaption, figure, h3, h4, hr, i, iframe, img, li, ol, p, pre, s, strong, u, ul, video.
 48
 49    """
 50    ttl = ttl or "365d"
 51    title = title or "Telegraph"
 52    if aipage is not None:
 53        return await telegraph_aipage(aipage, ttl=ttl)
 54
 55    # limit title, author, url length
 56    title = title[:256]
 57    if author:
 58        author = author[:128]
 59    if url:
 60        url = url[:512]
 61    try:
 62        telegraph = Telegraph(access_token=TOKEN.TELEGRAPH)
 63        page = await telegraph.create_page(title=title, author_name=author, author_url=url, content=nodes, html_content=adjust_tags(html))
 64        logger.info(f"⚡️Telegraph: {page['url']}")
 65        return page["url"]
 66    except Exception as e:
 67        logger.error(f"Telegraph publish error: {e}")
 68        if fallback_r2:
 69            return await publish_cf_r2(title, html=html, author=author, url=url, ttl=ttl)
 70    return ""
 71
 72
 73async def publish_cf_r2(
 74    title: str,
 75    html: str | None = None,
 76    author: str | None = None,
 77    url: str | None = None,
 78    aipage: AIPage | None = None,
 79    ttl: str = "365d",
 80) -> str:
 81    """Publish to CF R2."""
 82    if html is None and aipage is None:
 83        logger.error("`html` or `aipage` parameter is required")
 84        return ""
 85    if html is not None and aipage is not None:
 86        logger.error("`html` and `aipage` parameter cannot be both provided")
 87        return ""
 88    if not html:
 89        return ""
 90    now = nowdt(TZ)
 91    today = f"{now:%Y-%m-%d}"
 92    key = f"InstantView/{today}-{rand_string(8)}.html" if ttl == "forever" else f"TTL/{ttl}/{today}-{rand_string(8)}.html"
 93    if not url:
 94        url = "https://instantview.telegram.org"
 95    if not author:
 96        author = "BennyBot"
 97
 98    html = f'<h1 id="iv-title">{title}</h1><a href="{url}" id="iv-author">{author}</a>{html}'
 99    html = f'<!DOCTYPE html><html><head><meta charset="UTF-8"><meta name="viewport" content="width=device-width, initial-scale=1.0"><title>{title}</title><link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/water.css@2/out/water.css"></head><body><article>{html}</article></body></html>'
100    if await set_cf_r2(key=key, data=html, metadata={"title": title, "author": author, "url": url}, mime_type="text/html") and DB.CF_R2_PUBLIC_URL and TOKEN.R2_IV_HASH:
101        pub_url = f"{DB.CF_R2_PUBLIC_URL.rstrip('/')}/{key}"
102        logger.info(f"⚡️CF R2: {pub_url}")
103        return f"https://t.me/iv?url={quote_plus(pub_url)}&rhash={TOKEN.R2_IV_HASH}"
104    return await publish_neocities(title, html=html, author=author, url=url)
105
106
async def publish_neocities(title: str, html: str | None = None, author: str | None = None, url: str | None = None) -> str:
    """Publish to neocities.org .

    Args:
        title: Page title, used for the ``<title>`` and the heading.
        html: HTML body to wrap into a standalone document.
        author: Author shown under the title; defaults to "BennyBot".
        url: Link target of the author element; defaults to the published URL.

    Returns:
        A ``t.me/iv`` link when ``TOKEN.NEOCITIES_IV_HASH`` is set, otherwise
        the public page URL. An empty string when Neocities is not configured,
        ``html`` is empty, or the upload fails.
    """
    if not TOKEN.NEOCITIES:
        return ""
    if not html:
        return ""
    base_url = "https://neocities.org/api/upload"
    # Credentials are stored as "username,password"; split on the first comma
    # only so a password containing commas does not break unpacking.
    username, _, password = TOKEN.NEOCITIES.partition(",")
    if not username or not password:
        logger.error("Neocities publish error: TOKEN.NEOCITIES must be `username,password`")
        return ""
    now = nowdt(TZ)
    today = f"{now:%Y-%m-%d}"
    server_file = f"{today}/{rand_string(12)}.html"
    pub_url = f"https://{username}.neocities.org/{server_file.removesuffix('.html')}"
    if not url:
        url = pub_url
    if not author:
        author = "BennyBot"

    html = f'<h1 id="iv-title">{title}</h1><a href="{url}" id="iv-author">{author}</a>{html}'
    html = f'<!DOCTYPE html><html><head><meta charset="UTF-8"><meta name="viewport" content="width=device-width, initial-scale=1.0"><title>{title}</title><link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/water.css@2/out/water.css"></head><body><article>{html}</article></body></html>'

    # Encode in memory as UTF-8 (the charset the document declares) instead of
    # round-tripping through a temp file that leaked when the upload failed.
    payload = html.encode("utf-8")
    try:
        # The context manager closes the client's connections on every path.
        async with AsyncClient(http2=True, timeout=20) as client:
            response = await client.post(
                base_url,
                auth=(username, password),
                files={server_file: (server_file, io.BytesIO(payload), "text/html")},
            )
            # A rejected upload must not be reported as a published page.
            response.raise_for_status()
    except Exception as e:
        logger.error(f"Neocities publish error: {e}")
        return ""

    logger.info(f"⚡️Neocities: {pub_url}")
    return f"https://t.me/iv?url={quote_plus(pub_url)}&rhash={TOKEN.NEOCITIES_IV_HASH}" if TOKEN.NEOCITIES_IV_HASH else pub_url
145
146
async def telegraph_aipage(page: AIPage, ttl: str | None = None, *, force_r2: bool = False) -> str:
    """Render an AI page as Telegraph nodes and publish it.

    The page is laid out as: table of contents, AI overview, original
    description, summary sections, full transcript. When Telegraph publishing
    returns no URL, the page is published through ``r2_aipage`` instead.

    Args:
        page: The AI page to publish.
        ttl: Retention label forwarded to the publishers.
        force_r2: Skip Telegraph entirely and publish through ``r2_aipage``.

    Returns:
        The published URL, or an empty string when there is nothing to publish
        or every publisher failed.
    """
    if force_r2:
        return await r2_aipage(page, ttl=ttl, rformat="url")

    def to_anchor(text: str) -> str:
        # In-page anchors use hyphens where the heading has spaces.
        return text.replace(" ", "-")

    def toc_item(href: str, label: str) -> dict:
        # One table-of-contents row: a list item wrapping a single link.
        return {"tag": "li", "children": [{"attrs": {"href": href}, "children": [label], "tag": "a"}]}

    def markdown_to_nodes(markdown: str) -> list:
        # Markdown -> HTML -> Telegraph nodes, with headings adjusted first.
        cleaned = remove_consecutive_newlines(markdown, newline_level=2)
        return html_to_nodes(adjust_tags(convert2html(cleaned)))

    content = []
    summary_text = glom(page, "summary.overview", default="")
    captions = page.transcripts or []
    if isinstance(captions, str):
        captions = split_transcripts(captions)
    chapters: list[Section] = glom(page, "summary.sections", default=[])

    # TOC
    toc_rows = []
    for chapter in chapters:
        toc_rows.append(toc_item(f"#{to_anchor(chapter.emoji + chapter.title)}", chapter.emoji + chapter.title))
    if captions:
        toc_rows.append(toc_item("#🔤完整字幕", "🔤完整字幕"))
    if page.mermaid_url:
        toc_rows.append(toc_item(page.mermaid_url, "🧠思维导图"))
    if toc_rows:
        content.append({"tag": "ul", "children": toc_rows})

    # Overview
    if summary_text:
        content.extend(
            [
                {"tag": "h3", "children": ["🤖AI导读"]},
                {"tag": "p", "children": [summary_text]},
            ]
        )

    # Description: either a plain string or a dict carrying html/emoji/name.
    description = page.description
    if description:
        if isinstance(description, str):
            body_nodes = markdown_to_nodes(convert2md(html=description))
            content.append({"tag": "h4", "children": ["📖原始简介"]})
            content.extend(body_nodes)
        elif isinstance(description, dict) and description.get("html"):
            label_emoji = description.get("emoji", "📖")
            label_name = description.get("name", "原始简介")
            raw = description["html"]
            # Only convert when the value looks like markup.
            markdown = convert2md(html=raw) if raw.startswith("<") else raw
            body_nodes = markdown_to_nodes(markdown)
            content.append({"tag": "h4", "children": [label_emoji + label_name]})
            content.extend(body_nodes)

    # Sections
    for chapter in chapters:
        content.extend(
            [
                {"tag": "h4", "children": [chapter.emoji + chapter.title]},
                {"tag": "p", "children": [chapter.content]},
            ]
        )

    # Transcript
    if captions:
        content.append({"tag": "h4", "children": ["🔤完整字幕"]})
        for line in captions:
            content.append({"children": [f"[{line.start}] {line.content}"], "tag": "p"})

    if not content:
        logger.warning("No Telegraph nodes to publish")
        return ""

    published = await publish_telegraph(title=page.title, nodes=content, author=page.author, url=page.url, ttl=ttl, fallback_r2=False)
    if published:
        return published
    return await r2_aipage(page, ttl=ttl, rformat="url")
207
208
async def r2_aipage(page: AIPage, ttl: str | None = None, *, expand_transcript: bool = False, rformat: Literal["url", "html"] = "url") -> str:
    """Publish AI Page to R2.

    Renders the page (overview, mind map, description, sections, transcript)
    into a single minified HTML document.

    Args:
        page: The AI page to render.
        ttl: Retention label used in the object key; defaults to "365d".
            "forever" stores under ``InstantView/``, anything else under
            ``TTL/<ttl>/``.
        expand_transcript: Emit the transcript block in its opened state.
        rformat: "url" uploads the document and returns its link (wrapped in a
            ``t.me/iv`` link when ``TOKEN.R2_IV_HASH`` is set). "html" returns
            the document itself with the stylesheet and script inlined when
            they can be downloaded; nothing is uploaded.

    Returns:
        The link or the HTML, depending on ``rformat``. An empty string when
        the page has no content or the upload fails.
    """
    from html import escape  # function-scope import: only needed for attribute values

    ttl = ttl or "365d"
    # Strip a trailing slash so joined URLs never contain "//", as publish_cf_r2 does.
    public_url = (DB.CF_R2_PUBLIC_URL or "").rstrip("/")
    today = f"{nowdt(TZ):%Y-%m-%d}"
    r2_prefix = f"InstantView/{today}" if ttl == "forever" else f"TTL/{ttl}/{today}"
    r2_key = f"{r2_prefix}-{digest(page)}.html"  # noqa: S324
    r2_url = f"{public_url}/{r2_key}"
    if TOKEN.R2_IV_HASH:
        r2_url = f"https://t.me/iv?url={quote_plus(r2_url)}&rhash={TOKEN.R2_IV_HASH}"
    date = page.date or nowdt("UTC")
    utc_date = date.astimezone(UTC)
    tz_date = utc_date.astimezone(ZoneInfo(TZ))
    url = page.url or "https://instantview.telegram.org"
    author_tag = f'<div class="header-author"><span class="header-author">{page.author}</span><span class="header-date"> | {tz_date:%Y-%m-%d %H:%M:%S}</span></div>' if page.author else ""
    # Values placed inside content="..." must be escaped, otherwise a double
    # quote in the text terminates the attribute and corrupts the <head>.
    title_attr = escape(str(page.title), quote=True)
    raw_overview = glom(page, "summary.overview", default="")
    overview = raw_overview
    if overview:
        overview = f'<div class="card summary"><div class="card-label" id="summary" >🤖AI导读</div>{convert2html(overview)}</div>'

    sections: list[Section] = glom(page, "summary.sections", default=[])

    sidebars = '<nav class="sidebar" id="sidebar"><ul>'
    if overview:
        sidebars += '<li><a href="#summary" onclick="navClick(event)"><span class="sidebar-icon">🤖</span><span class="sidebar-label">AI导读</span></a></li>'

    desc_tag = ""
    desc_head = ""
    description = page.description
    if description and isinstance(description, str):
        sidebars += '<li><a href="#description" onclick="navClick(event)"><span class="sidebar-icon">📖</span><span class="sidebar-label">原始简介</span></a></li>'
        # Values that already look like markup are embedded as-is.
        desc_html = description if description.startswith("<") else convert2html(description)
        desc_tag = f'<div class="card description"><div class="card-label" id="description">📖原始简介</div>{desc_html}</div>'
    elif description and isinstance(description, dict) and description.get("html"):
        desc_emoji = description.get("emoji", "📖")
        desc_title = description.get("name", "原始简介")
        desc_html = description["html"] if description["html"].startswith("<") else convert2html(description["html"])
        sidebars += f'<li><a href="#description" onclick="navClick(event)"><span class="sidebar-icon">{desc_emoji}</span><span class="sidebar-label">{desc_title}</span></a></li>'
        desc_tag = f'<div class="card description"><div class="card-label" id="description">{desc_emoji}{desc_title}</div>{desc_html}</div>'
    if description and overview:
        desc_head = f'<meta property="og:description" content="{escape(str(raw_overview), quote=True)}">'

    sections_tag = ""

    for idx, section in enumerate(sections):
        sidebars += f'<li><a href="#s{idx + 1}" onclick="navClick(event)"><span class="sidebar-icon">{section.emoji}</span><span class="sidebar-label">{section.title}</span>'
        sections_tag += f'<section class="section" id="s{idx + 1}"><div class="section-header"><span class="section-icon">{section.emoji}</span><h2 class="section-title">{section.title}</h2>'
        if section.start:
            # Drop a leading "00:" hour field from long timestamps.
            start = section.start.removeprefix("00:") if len(section.start) > 5 else section.start
            sidebars += f'<span class="sidebar-time">{start}</span>'
            sections_tag += f'<span class="section-time">{start}</span>'
        sidebars += "</a></li>"
        sections_tag += f"</div>{convert2html(section.content)}</section>"

    transcripts = page.transcripts or []
    if isinstance(transcripts, str):
        transcripts = split_transcripts(transcripts)

    transcriptions = ""
    if transcripts:
        sidebars += """<li class="nav-transcript"><a href="#transcript" onclick="navClick(event)"><span class="sidebar-icon">🔤</span><span class="sidebar-label">完整字幕</span></a></li>"""
        # Join once instead of growing the string per sentence.
        transcript_rows = "".join(f'<p><span class="ts">{sentence.start}</span>{sentence.content}</p>' for sentence in transcripts)
        transcriptions = (
            '<div class="card transcript-card" id="transcript"><button class="transcript-toggle" aria-expanded="false" onclick="toggleTranscript(this)">展开字幕 <span class="arrow">&#9662;</span></button><div class="transcript-content" id="transcriptions">'
            + transcript_rows
            + "</div></div>"
        )

    sidebars += "</ul></nav>"
    sidebar_icon = '<button class="icon-toc" id="icon-toc" onclick="toggleSidebar()" aria-label="菜单"><svg class="icon-open" viewBox="0 0 24 24"><line x1="3" y1="6" x2="21" y2="6" /><line x1="3" y1="12" x2="21" y2="12" /><line x1="3" y1="18" x2="21" y2="18" /></svg><svg class="icon-close" viewBox="0 0 24 24" style="display:none"><polyline points="13,4 5,12 13,20" /><polyline points="20,4 12,12 20,20" /></svg></button>'
    if sidebars == '<nav class="sidebar" id="sidebar"><ul></ul></nav>':  # empty sidebar
        sidebars = ""
        sidebar_icon = ""

    if not any([overview, page.mermaid_img, sections_tag, desc_tag, transcriptions]):
        logger.warning("No AIPage contents to publish")
        return ""

    theme_icon = '<button class="icon-theme" id="icon-theme" onclick="toggleTheme()" aria-label="切换主题"><svg class="icon-sun" viewBox="0 0 24 24"><circle cx="12" cy="12" r="5" /><path d="M12 1v2M12 21v2M4.22 4.22l1.42 1.42M18.36 18.36l1.42 1.42M1 12h2M21 12h2M4.22 19.78l1.42-1.42M18.36 5.64l1.42-1.42" /></svg><svg class="icon-moon" viewBox="0 0 24 24" style="display:none"><path d="M21 12.79A9 9 0 1 1 11.21 3 7 7 0 0 0 21 12.79z" /></svg></button>'
    mermaid_icon = '<button class="icon-mindmap" id="icon-mindmap" onclick="toggleMindmapPanel()" aria-label="思维导图"><svg viewBox="0 0 24 24"><circle cx="4" cy="12" r="2" /><path d="M6 12h6M12 12l8-8M12 12h8M12 12l8 8" /></svg></button>' if page.mermaid_img else ""  # fmt: skip

    mermaid_desktop = f'<div class="mindmap-panel" id="mindmap-panel"><div class="mindmap-panel-content"><img data-src="{page.mermaid_img}" alt="思维导图"><a href="{page.mermaid_url}" target="_blank" class="mindmap-link">查看完整思维导图</a></div></div>' if page.mermaid_img else ""  # fmt: skip

    mermaid_mobile = f'<div class="card mindmap-card mindmap-mobile" id="mindmap-mobile"><button class="transcript-toggle" aria-expanded="false" onclick="toggleMindmap(this)">展开思维导图 <span class="arrow">&#9662;</span></button><div class="mindmap-body" id="mindmap-body"><img data-src="{page.mermaid_img}" alt="思维导图"><a href="{page.mermaid_url}" target="_blank" class="mindmap-link">查看完整思维导图</a></div></div>' if page.mermaid_img else ""  # fmt: skip

    transcript_panel = '<div class="transcript-panel" id="transcript-panel"></div>' if transcriptions else ""

    html_str = f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta property="article:published_time" content="{utc_date:%Y-%m-%dT%H:%M:%SZ}">
    <meta property="og:title" content="{title_attr}">{desc_head}
    <meta property="og:site_name" content="🤖AI导读" />
    <link rel="icon" type="image/png" href="{public_url}/favicon/favicon-96x96.png" sizes="96x96" />
    <link rel="icon" type="image/svg+xml" href="{public_url}/favicon/favicon.svg" />
    <link rel="shortcut icon" href="{public_url}/favicon/favicon.ico" />
    <link rel="apple-touch-icon" sizes="180x180" href="{public_url}/favicon/apple-touch-icon.png" />
    <meta name="apple-mobile-web-app-title" content="R2" />
    <link rel="manifest" href="{public_url}/favicon/site.webmanifest" />
    <title>{page.title}</title>
    <link rel="stylesheet" href="{public_url}/telegraph.css">
    <script src="{public_url}/telegraph.js" defer></script>
</head>
<body>

    <!-- Icon -->
    {sidebar_icon}

    {mermaid_icon}

    {theme_icon}

    {sidebars}
    <div class="resize-handle resize-handle-left" id="resize-handle-left"></div>
    <div class="resize-handle resize-handle-right" id="resize-handle-right"></div>
    <div class="container">
        <header class="header"><h1 class="header-title"><a href="{url}" target="_blank">{page.title}</a></h1>{author_tag}</header>

        <!-- AI Summary -->
        {overview}

        {mermaid_mobile}

        <!-- Description -->
        {desc_tag}

        <!-- Sections -->
        {sections_tag}


        <!-- Transcript -->
        {transcriptions}

    </div>
    {transcript_panel}
    {mermaid_desktop}

</body>

</html>"""
    # simplify html
    # NOTE: `url` always has a fallback value above, so the empty-href
    # replacement below normally never matches; kept for parity.
    html_str = html_str.replace(f'<a href="" target="_blank">{page.title}</a>', page.title)
    html_str = html_str.replace('<footer class="footer header-title"><a href="" target="_blank" ></a></footer>', "")
    # Minify: drop HTML comments, then collapse every whitespace run.
    html_str = re.sub(r"<!--[\s\S]*?-->", "", html_str)
    html_str = re.sub(r"\s+", " ", html_str).strip()
    if expand_transcript:
        html_str = html_str.replace('class="transcript-content"', 'class="transcript-content open"')
        # Target the transcript button only; a blanket replace of
        # aria-expanded="false" also flipped the (still collapsed) mind-map toggle.
        html_str = html_str.replace('aria-expanded="false" onclick="toggleTranscript(this)"', 'aria-expanded="true" onclick="toggleTranscript(this)"')
        html_str = html_str.replace(">展开字幕 <", ">收起字幕 <")
    if rformat == "url":
        uploaded = await set_cf_r2(key=r2_key, data=html_str, metadata={"title": page.title, "author": page.author or "BennyBot", "url": page.url}, mime_type="text/html")
        return r2_url if uploaded else ""

    # return html, embed css and js in head
    css_path = Path(DOWNLOAD_DIR) / "HTML/telegraph.css"
    js_path = Path(DOWNLOAD_DIR) / "HTML/telegraph.js"
    await download_file(public_url + "/telegraph.css", css_path, skip_exist=True)
    await download_file(public_url + "/telegraph.js", js_path, skip_exist=True)
    if css_path.exists():
        # Read as UTF-8 explicitly; the platform default encoding may differ.
        css = css_path.read_text(encoding="utf-8")
        html_str = html_str.replace(f'<link rel="stylesheet" href="{public_url}/telegraph.css">', f"<style>\n{css}\n</style>")
    if js_path.exists():
        js = js_path.read_text(encoding="utf-8")
        html_str = html_str.replace(f'<script src="{public_url}/telegraph.js" defer></script>', f"<script defer>\n{js}\n</script>")
    return html_str