Commit c320e53

benny-dou <60535774+benny-dou@users.noreply.github.com>
2025-04-28 14:32:31
feat(wechat): support WeChat article preview
1 parent d4feaa6
src/messages/database.py
@@ -73,6 +73,10 @@ async def save_messages(messages: list[Message | None], key: str, metadata: dict
             logger.trace(f"Saving text message {msg.id}")
             data.append({"type": "text", "cid": msg.chat.id, "mid": msg.id} | msg_extra)
             continue
+        if info["mtype"] == "document":
+            logger.trace(f"Saving document message {msg.id}")
+            data.append({"type": "document", "cid": msg.chat.id, "mid": msg.id} | msg_extra)
+            continue
         logger.warning(f"Skip save message {msg.id} to {DB.ENGINE} due to unknown type: {msg}")
     if data:
         return await set_db(key, metadata=metadata, data={"data": data})
src/preview/wechat.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import contextlib
+from pathlib import Path
+from urllib.parse import quote_plus
+
+from loguru import logger
+from pyrogram.client import Client
+from pyrogram.parser.markdown import BLOCKQUOTE_EXPANDABLE_DELIM, BLOCKQUOTE_EXPANDABLE_END_DELIM
+from pyrogram.types import Message
+from telegraph.aio import Telegraph
+
+from config import API, CAPTION_LENGTH, DB, DOWNLOAD_DIR, PROXY, TEXT_LENGTH, TOKEN
+from database import get_db
+from messages.database import copy_messages_from_db, save_messages
+from messages.progress import modify_progress
+from messages.sender import send2tg
+from messages.utils import count_without_entities, summay_media
+from networking import download_file, download_media, hx_req
+from utils import nowstr, rand_string
+
+
+async def preview_wechat(client: Client, message: Message, url: str = "", db_key: str = "", **kwargs):
+    """Preview a WeChat article link found in the message.
+
+    Flow: optionally post a progress message, try to re-send a cached copy
+    looked up by ``db_key``, otherwise parse the article with
+    ``get_wechat_info`` and send it in one of four layouts, chosen by whether
+    the article has images and whether its text fits within ``TEXT_LENGTH``
+    (no images) or ``CAPTION_LENGTH`` (with images). Long articles are
+    published to Telegraph when possible and the plain-text dump is attached
+    as a document. Every sent message is finally saved under ``db_key``.
+
+    Args:
+        client (Client): The Pyrogram client.
+        message (Message): The trigger message object.
+        url (str, optional): WeChat article link.
+        db_key (str, optional): The cache key.
+        **kwargs: Forwarded to ``send2tg``, ``modify_progress`` and
+            ``copy_messages_from_db``. ``show_progress`` and ``progress`` are
+            read here; ``progress`` and ``reply_msg_id`` may be set here.
+    """
+    if kwargs.get("show_progress") and "progress" not in kwargs:
+        res = await send2tg(client, message, texts=f"🔗正在解析微信链接\n{url}", **kwargs)
+        kwargs["progress"] = res[0]
+    if kv := await get_db(db_key):
+        logger.debug(f"WeChat preview {DB.ENGINE} cache hit for key={db_key}")
+        if await copy_messages_from_db(client, message, key=db_key, kv=kv, **kwargs):
+            return
+        # Cache entry exists but could not be re-sent: fall through and parse again.
+        await modify_progress(text=f"❌从{DB.ENGINE}缓存中转发失败, 尝试重新解析...", **kwargs)
+    logger.info(f"WeChat link preview for {url}")
+
+    # NOTE(review): kwargs (including "progress") are not forwarded here, so the
+    # progress update inside get_wechat_info runs without them - confirm intended.
+    post_info = await get_wechat_info(url)
+    if error := post_info.get("error"):
+        await modify_progress(text=f"❌微信链接解析失败{url}\n{error}", force_update=True, **kwargs)
+        return
+    sent_messages = []
+    length = await count_without_entities(post_info["header"] + post_info["markdown"])
+    if not post_info.get("media"):  # no images
+        if length < TEXT_LENGTH - 8:  # no images, short article: send the whole text inline
+            texts = f"{post_info['header']}\n{BLOCKQUOTE_EXPANDABLE_DELIM}{post_info['markdown']}\n{BLOCKQUOTE_EXPANDABLE_END_DELIM}"
+            sent_messages.extend(await send2tg(client, message, texts=texts, **kwargs))
+        else:  # no images, long article: header + Telegraph link + text dump as document
+            texts = f"{post_info['header']}"
+            telegraph_url = await publish_telegraph(title=post_info["title"], html=post_info["html"], author=post_info["author"], url=url)
+            if telegraph_url:
+                texts += f"\n[⚡️点击此处即时预览]({telegraph_url})"
+            # get_wechat_info returns the text dump under "path"; it has no
+            # "html_path" key, so the previous lookup always raised KeyError.
+            sent_messages.extend(await send2tg(client, message, texts=texts, media=[{"document": post_info["path"]}], **kwargs))
+    elif length < CAPTION_LENGTH - 8:  # with images, short article: text goes into the caption
+        texts = f"{post_info['header']}\n{BLOCKQUOTE_EXPANDABLE_DELIM}{post_info['markdown']}\n{BLOCKQUOTE_EXPANDABLE_END_DELIM}"
+        sent_messages.extend(await send2tg(client, message, texts=texts, media=post_info["media"], **kwargs))
+    else:  # with images, long article: document first, then the images
+        texts = f"{post_info['header']}"
+        telegraph_url = await publish_telegraph(title=post_info["title"], html=post_info["html"], author=post_info["author"], url=url)
+        if telegraph_url:
+            texts += f"\n**⚡️[点击此处即时预览]({telegraph_url})**"
+        sent_messages.extend(await send2tg(client, message, texts=texts, media=[{"document": post_info["path"]}], **kwargs))
+        kwargs["reply_msg_id"] = -1  # do not send as reply
+        sent_messages.extend(await send2tg(client, message, texts=texts, media=post_info["media"], **kwargs))
+    await modify_progress(del_status=True, **kwargs)
+    await save_messages(messages=sent_messages, key=db_key)
+
+
+async def get_wechat_info(url: str, **kwargs) -> dict:
+    """Fetch a WeChat article through the TikHub API and prepare it for sending.
+
+    Builds three renderings of the article body (HTML, markdown, plain text),
+    downloads its images and writes the plain text to a ``.txt`` file in
+    ``DOWNLOAD_DIR``.
+
+    Args:
+        url (str): WeChat article link.
+        **kwargs: Forwarded to ``download_file``, ``modify_progress`` and
+            ``download_media``.
+
+    Returns:
+        dict: ``{"error": <message>}`` on any failure, otherwise a dict with
+        the keys ``html``, ``path`` (the text file), ``markdown``, ``media``,
+        ``title``, ``author`` and ``header``.
+    """
+    api_url = API.TIKHUB_WECHAT + quote_plus(url)
+    logger.info(f"Preview WeChat TikHub for {api_url}")
+    headers = {"authorization": f"Bearer {TOKEN.TIKHUB}", "accept": "application/json"}
+    resp = await hx_req(api_url, headers=headers, check_keys=["data.content.raw_content", "data.title"], check_kv={"code": 200})
+    if resp.get("hx_error"):
+        return {"error": resp["hx_error"]}
+
+    try:
+        data = resp["data"]
+        title = data["title"]
+        author = data.get("author", "author")
+        dt = nowstr()
+        with contextlib.suppress(Exception):
+            # 2025-04-28T06:12:35.833830 -> 2025-04-28 06:12:35
+            # Done in a single expression so a malformed value can never leave
+            # a half-processed (or non-string) object in `dt`; the current time
+            # is kept as the fallback instead.
+            dt = data["datetime"][:19].replace("T", " ") or dt
+        header = f"🟢[{author}]({url})\n🕒{dt}\n**📝{title}**"
+        media = []
+        htmls = ""
+        texts = ""
+        markdowns = ""
+        for tag in data["content"]["raw_content"]:
+            if text := tag.get("text", ""):
+                # "section" entries are rendered as headings, everything else as paragraphs.
+                is_section = tag.get("type", "") == "section"
+                # NOTE(review): `text` is inserted into the HTML unescaped - confirm
+                # whether the API already delivers HTML-safe text.
+                html = f"<h3>{text}</h3>" if is_section else f"<p>{text}</p>"
+                markdown = f"\n\n**{text}**" if is_section else f"\n{text}"
+                text = f"\n\n{text}" if is_section else f"\n{text}"
+                htmls += f"<br>{html}"
+                markdowns += f"\n{markdown}"
+                texts += f"\n{text}"
+            if images := tag.get("images", []):
+                for img in images:
+                    src = img.get("src", "")
+                    ext = img.get("type", "png")
+                    # download_file is not awaited here; presumably download_media
+                    # below resolves these entries - verify against its implementation.
+                    media.append({"photo": download_file(src, path=f"{DOWNLOAD_DIR}/{rand_string()}.{ext}", proxy=PROXY.WECHAT, **kwargs)})
+                    htmls += f"<br><img src='{PROXY.IMG}{src}' alt='微信图片'/>"
+        await modify_progress(text=f"✅解析成功...\n⏬正在下载:\n{summay_media(media)}", force_update=True, **kwargs)
+        media = await download_media(media, **kwargs)
+        # The title comes from the remote API: replace path separators, reserved
+        # and control characters so it is always a single valid file name inside
+        # DOWNLOAD_DIR, and fall back to a random name when nothing is left.
+        unsafe_chars = '\\/:*?"<>|'
+        safe_title = "".join("_" if ch in unsafe_chars or ord(ch) < 32 else ch for ch in str(title)) or rand_string()
+        txt_path = Path(DOWNLOAD_DIR) / f"{safe_title}.txt"
+        # Explicit UTF-8: the content holds emoji/CJK text and must not depend on
+        # the platform's default locale encoding.
+        txt_path.write_text(f"📝{title}\n👤{author}\n🕒{dt}\n🔗{url}\n\n" + texts.strip(), encoding="utf-8")
+    except Exception as e:
+        logger.error(e)
+        return {"error": str(e)}
+    return {"html": htmls, "path": txt_path.as_posix(), "markdown": markdowns, "media": media, "title": title, "author": author, "header": header}
+
+
+async def publish_telegraph(title: str, html: str, author: str = "", url: str = "") -> str:
+    """Create a Telegraph page from ``html`` and return its URL.
+
+    Args:
+        title (str): Page title.
+        html (str): Page body as HTML.
+        author (str, optional): Passed as the page's author name.
+        url (str, optional): Passed as the page's author URL.
+
+    Returns:
+        str: The created page's ``url``; an empty string when no Telegraph
+        token is configured, ``html`` is empty, or page creation raises.
+    """
+    token = TOKEN.TELEGRAPH
+    if not (token and html):
+        return ""
+    api = Telegraph(access_token=token)
+    try:
+        created = await api.create_page(title=title, author_name=author, author_url=url, html_content=html)
+        page_url = created["url"]
+    except Exception as exc:
+        # Publishing is best-effort: log and let the caller continue without a link.
+        logger.error(f"Telegraph publish error: {exc}")
+        page_url = ""
+    return page_url
src/config.py
@@ -48,6 +48,7 @@ class ENABLE:  # see fine-grained permission in `src/permission.py`
     TIKTOK = os.getenv("ENABLE_TIKTOK", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
     TWITTER = os.getenv("ENABLE_TWITTER", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
     WEIBO = os.getenv("ENABLE_WEIBO", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
+    WECHAT = os.getenv("ENABLE_WECHAT", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
     WGET = os.getenv("ENABLE_WGET", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
     XHS = os.getenv("ENABLE_XHS", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
     YTDLP = os.getenv("ENABLE_YTDLP", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
@@ -88,6 +89,7 @@ class API:
     TIKHUB_INSTAGRAM = os.getenv("TIKHUB_INSTAGRAM_API", "https://api.tikhub.io/api/v1/instagram/web_app/fetch_post_info_by_url?url=")
     TIKHUB_TWITTER = os.getenv("TIKHUB_TWITTER_API", "https://api.tikhub.io/api/v1/twitter/web/fetch_post_comments?tweet_id=")
     TIKHUB_WEIBO_VIDEO = os.getenv("TIKHUB_WEIBO_VIDEO_API", "https://api.tikhub.io/api/v1/weibo/web/fetch_short_video_data?share_text=")
+    TIKHUB_WECHAT = os.getenv("TIKHUB_WECHAT", "https://api.tikhub.io/api/v1/wechat_mp/web/fetch_mp_article_detail_json?url=")
     BINANCE_SPOT = os.getenv("BINANCE_SPOT_API", "https://data-api.binance.vision")
     BINANCE_UM = os.getenv("BINANCE_UM_API", "https://fapi.binance.com")
     OKX = os.getenv("OKX_API", "https://www.okx.com")
@@ -113,13 +115,16 @@ class TOKEN:
     GOOGLE_SEARCH_API_KEY = os.getenv("GOOGLE_SEARCH_API_KEY", "")
     GOOGLE_SEARCH_CX = os.getenv("GOOGLE_SEARCH_CX", "")
     CHART_IMG = os.getenv("CHART_IMG_KEY", "")
+    TELEGRAPH = os.getenv("TELEGRAPH_TOKEN", "")
 
 
 class PROXY:  # format: socks5://127.0.0.1:7890
     TELEGRAM = os.getenv("TELEGRAM_PROXY", None)  # Telegram
     WORKERS = os.getenv("WORKERS_PROXY", "")  # https://github.com/netnr/workers
+    IMG = os.getenv("IMG_PROXY", "")  # https://caravaggio.ramielcreations.com/docs/install
     XHS = os.getenv("XHS_PROXY", None)  # Banned VPS IP, need residential proxy
     GPT = os.getenv("GPT_PROXY", None)
+    WECHAT = os.getenv("WECHAT_PROXY", None)
     DOUYIN = os.getenv("DOUYIN_PROXY", None)
     TIKTOK = os.getenv("TIKTOK_PROXY", None)
     INSTAGRAM = os.getenv("INSTAGRAM_PROXY", None)
src/handler.py
@@ -27,6 +27,7 @@ from preview.bilibili import preview_bilibili
 from preview.douyin import preview_douyin
 from preview.instagram import preview_instagram
 from preview.twitter import preview_twitter
+from preview.wechat import preview_wechat
 from preview.weibo import preview_weibo
 from preview.xiaohongshu import preview_xhs
 from preview.ytdlp import ProxyError, preview_ytdlp
@@ -223,6 +224,8 @@ async def handle_social_media(
             await preview_weibo(client, message, **kwargs)
         if xhs and matched["platform"] == "xiaohongshu":
             await preview_xhs(client, message, **kwargs)
+        if xhs and matched["platform"] == "wechat":
+            await preview_wechat(client, message, **kwargs)
         if matched["platform"].startswith("bilibili-"):  # this is not bilibili video, for videos, use yt-dlp
             await preview_bilibili(client, message, **kwargs)
         try:
@@ -299,6 +302,8 @@ def get_social_media_help(chat_id: int | str, ctype: str, prefixes: list[str] |
         msg += "\n🎶TikTok"
     if permission["instagram"]:
         msg += "\n🏞Instagram"
+    if permission["wechat"]:
+        msg += "\n🟢微信公众号文章"
     if permission["ytdlp"]:
         msg += "\n🔴油管"
         msg += "\n🅱️哔哩哔哩"
src/networking.py
@@ -356,6 +356,11 @@ async def match_social_media_link(text: str, *, flatten_first: bool = False) ->
         vid = matched.group(3)
         return {"url": f"https://www.youtube.com/watch?v={vid}", "db_key": f"www.youtube.com/watch?v={vid}", "vid": vid, "platform": "youtube"}
 
+    # https://mp.weixin.qq.com/s/bd_giuPEyPBu9LTOtC2VHw
+    # https://mp.weixin.qq.com/s?__biz=MzI5Njc4NTYyOQ==&mid=2247494800&idx=1&sn=43a5732bd3a205d4dbdcd523afc0ca4a&sharer_shareinfo=1923203fd24bfa47c5b36b690026f5c8&sharer_shareinfo_first=8814eca80b4a37d10aa9b725e61f9486
+    if matched := re.search(r"(https?://)?mp.weixin.qq.com/s[\/|\?]{1}([_A-Za-z\=\&0-9\#\-]+)", text):
+        return {"url": matched.group(0), "db_key": bare_url(matched.group(0)), "platform": "wechat"}
+
     # if all above pre-defined patterns failed, try to match ytdlp link
     if urls := match_urls(text):
         for url in urls:
@@ -438,7 +443,8 @@ if __name__ == "__main__":
     import asyncio
 
     check_data(json.dumps({"foo": "bar", "baz": {"qux": "quux"}, "lst": ["1", "2", "3"]}), check_keys=["baz.qux"], check_kv={"foo": "bar", "baz.qux": "quux", "lst": ["1", "2", "3"]})
-    asyncio.run(match_social_media_link("https://b23.tv/3MSgT4q/", flatten_first=True))
+    # asyncio.run(match_social_media_link("https://b23.tv/3MSgT4q/", flatten_first=True))
+    print(asyncio.run(match_social_media_link("https://mp.weixin.qq.com/s/bd_giuPEyPBu9LTOtC2VHw", flatten_first=True)))
     # asyncio.run(match_social_media_link("https://www.facebook.com/share/r/19QGGp39T3/", flatten_first=True))
     # asyncio.run(match_social_media_link("https://www.douyin.com/video/7398813386827468041"))
     # asyncio.run(match_social_media_link("https://www.iesdouyin.com/share/note/7454527270925946138/"))
src/permission.py
@@ -112,6 +112,7 @@ def check_service(cid: int | str, ctype: str) -> dict:
         "twitter": True,
         "weibo": True,
         "xhs": True,
+        "wechat": True,
         "ytdlp": True,
     }
 
@@ -132,6 +133,8 @@ def check_service(cid: int | str, ctype: str) -> dict:
         permission["tiktok"] = False
     if not ENABLE.INSTAGRAM:
         permission["instagram"] = False
+    if not ENABLE.WECHAT:
+        permission["wechat"] = False
     if not ENABLE.YTDLP:
         permission["ytdlp"] = False
     if not ENABLE.GPT:
src/utils.py
@@ -26,6 +26,11 @@ def nowdt(tz: str = "UTC") -> datetime:
     return datetime.now(ZoneInfo(tz))
 
 
+def nowstr(tz: str = TZ) -> str:
+    """Return the current time in timezone ``tz`` formatted as ``YYYY-MM-DD HH:MM:SS``."""
+    return nowdt(tz).strftime("%Y-%m-%d %H:%M:%S")
+
+
 def number(n: float | str | Decimal, precision: int = -1, *, sign: bool = False) -> str:
     """Normalize a number to its simplest decimal.
 
pyproject.toml
@@ -18,13 +18,14 @@ dependencies = [
   "pysocks>=1.7.1",
   "pytgcrypto>=1.2.9.2",
   "python-ffmpeg>=2.0.12",
+  "python-magic>=0.4.27",
   "pyyaml>=6.0.2",
   "quickchart-io>=2.0.0",
+  "telegraph[aio]>=2.2.0",
   "tiktoken>=0.8.0",
   "uvloop>=0.21.0",
   "youtube-transcript-api>=0.6.3",
   "yt-dlp>=2025.1.12rc",
-  "python-magic>=0.4.27",
 ]
 name = "bennybot"
 requires-python = ">=3.11"
uv.lock
@@ -234,6 +234,7 @@ dependencies = [
     { name = "python-magic" },
     { name = "pyyaml" },
     { name = "quickchart-io" },
+    { name = "telegraph", extra = ["aio"] },
     { name = "tiktoken" },
     { name = "uvloop" },
     { name = "youtube-transcript-api" },
@@ -268,6 +269,7 @@ requires-dist = [
     { name = "python-magic", specifier = ">=0.4.27" },
     { name = "pyyaml", specifier = ">=6.0.2" },
     { name = "quickchart-io", specifier = ">=2.0.0" },
+    { name = "telegraph", extras = ["aio"], specifier = ">=2.2.0" },
     { name = "tiktoken", specifier = ">=0.8.0" },
     { name = "uvloop", specifier = ">=0.21.0" },
     { name = "youtube-transcript-api", specifier = ">=0.6.3" },
@@ -1627,6 +1629,23 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521 },
 ]
 
+[[package]]
+name = "telegraph"
+version = "2.2.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "requests" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/49/81/1c9f99004e23555fb21b80a2ef6ddbecb3a7a4eefbc4aac75ffb5a9ccf71/telegraph-2.2.0.tar.gz", hash = "sha256:012908f18208c451c7189f4bda7c39a1369241ac436c7543bb6c3fccbe9cfd5d", size = 8011 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d3/75/1bc2f8c4e4a736e2e582a1518eb4621db3c9dcb100f379fab5ab49c8d1ac/telegraph-2.2.0-py3-none-any.whl", hash = "sha256:d20b2a5d7cfdd66890c8c3fd60aa8585cabb7c6b03579d3eb1cd8af056ed9971", size = 10749 },
+]
+
+[package.optional-dependencies]
+aio = [
+    { name = "httpx" },
+]
+
 [[package]]
 name = "tiktoken"
 version = "0.9.0"