Commit d294f98

benny-dou <60535774+benny-dou@users.noreply.github.com>
2025-04-29 03:04:22
feat(reddit): support reddit preview
1 parent c320e53
src/preview/reddit.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import contextlib
+import re
+from datetime import UTC, datetime
+from zoneinfo import ZoneInfo
+
+from glom import glom
+from loguru import logger
+from pyrogram.client import Client
+from pyrogram.parser.markdown import BLOCKQUOTE_EXPANDABLE_DELIM
+from pyrogram.types import Message
+
+from config import DB, PROXY, TZ
+from database import get_db
+from messages.database import copy_messages_from_db, save_messages
+from messages.progress import modify_progress
+from messages.sender import send2tg
+from messages.utils import summay_media
+from networking import download_file, download_media, hx_req
+from preview.utils import has_markdown_img
+from utils import nowstr
+
+
+async def preview_reddit(client: Client, message: Message, url: str = "", db_key: str = "", **kwargs):
+    """Send a preview of a Reddit post in reply to the trigger message.
+
+    The cached messages stored under ``db_key`` are tried first; when there is
+    no cache entry, or copying from the cache fails, the post is parsed again,
+    sent, and the sent messages are saved under ``db_key``.
+
+    Args:
+        client (Client): The Pyrogram client.
+        message (Message): The trigger message object.
+        url (str, optional): Reddit link.
+        db_key (str, optional): The cache key.
+        **kwargs: Extra options forwarded to the send / progress helpers.
+            ``show_progress`` and ``progress`` are the only keys read here.
+    """
+    # Create a status message only when the caller asked for one and has not supplied it.
+    wants_progress = kwargs.get("show_progress")
+    if wants_progress and "progress" not in kwargs:
+        status_msgs = await send2tg(client, message, texts=f"🔗正在解析Reddit链接\n{url}", **kwargs)
+        kwargs["progress"] = status_msgs[0]
+
+    # Cache path: a successful copy ends the request here.
+    cached = await get_db(db_key)
+    if cached:
+        logger.debug(f"Reddit preview {DB.ENGINE} cache hit for key={db_key}")
+        copied = await copy_messages_from_db(client, message, key=db_key, kv=cached, **kwargs)
+        if copied:
+            return
+        await modify_progress(text=f"❌从{DB.ENGINE}缓存中转发失败, 尝试重新解析...", **kwargs)
+
+    # Fresh parse path.
+    logger.info(f"Reddit link preview for {url}")
+    parsed = await get_reddit_info(url)
+    failure = parsed.get("error")
+    if failure:
+        await modify_progress(text=f"❌Reddit链接解析失败{url}\n{failure}", force_update=True, **kwargs)
+        return
+
+    delivered = await send2tg(client, message, **parsed, **kwargs)
+    await modify_progress(del_status=True, **kwargs)
+    await save_messages(messages=delivered, key=db_key)
+
+
+async def get_reddit_info(url: str, **kwargs) -> dict:
+    """Fetch a Reddit post via its ``.json`` endpoint and build the preview payload.
+
+    Args:
+        url (str): Reddit post link; ``.json`` is appended to build the API URL.
+        **kwargs: Forwarded unchanged to ``hx_req``, ``download_file``,
+            ``modify_progress`` and ``download_media``.
+
+    Returns:
+        dict: ``{"texts": ..., "media": ..., "comments": ...}`` on success, or
+        ``{"error": <message>}`` when the request or the parsing fails.
+    """
+    api_url = url + ".json"
+    resp = await hx_req(api_url, proxy=PROXY.REDDIT, check_kv={"0.data.dist": 1, "1.data.children.0.kind": "t1"}, check_keys=["0.data.children.0.data.selftext"], **kwargs)
+    if isinstance(resp, dict) and resp.get("hx_error"):
+        return {"error": resp["hx_error"]}
+    try:
+        data = glom(resp, "0.data.children.0.data")
+        title = data.get("title", "Title")
+        author = data.get("author", "author")
+        author_url = f"https://www.reddit.com/user/{author}"
+        dt = nowstr()  # fallback value, kept when the creation time cannot be formatted
+        with contextlib.suppress(Exception):
+            # Single expression: ``dt`` is either fully formatted or left at the fallback.
+            dt = datetime.fromtimestamp(data["created_utc"], tz=UTC).astimezone(ZoneInfo(TZ)).strftime("%Y-%m-%d %H:%M:%S")
+        desc = remove_preview_links(data.get("selftext", "")).strip()
+        texts = f"🎈[{author}]({author_url})\n🕒{dt}\n**📝[{title}]({url})**\n{desc}"
+        media = []
+        if gallery := glom(data, "media_metadata.*", default=[]):  # multiple images
+            for img in gallery:
+                img_id = img.get("id")
+                mime = img.get("m", "")  # image/png -> png
+                # Entries lacking an id or a MIME type cannot be turned into an
+                # i.redd.it URL; skip them instead of failing the whole post.
+                if not img_id or "/" not in mime:
+                    continue
+                ext = mime.split("/")[-1]
+                img_url = f"https://i.redd.it/{img_id}.{ext}"
+                media.append({"photo": download_file(img_url, proxy=PROXY.REDDIT, **kwargs)})
+        elif data.get("url", "").startswith("https://i.redd.it/"):  # single image
+            media.append({"photo": download_file(data["url"], proxy=PROXY.REDDIT, **kwargs)})
+        if video_url := glom(data, "secure_media.reddit_video.fallback_url", default=""):
+            media.append({"video": download_file(video_url, proxy=PROXY.REDDIT, **kwargs)})
+        comments = []
+        # ``default=[]``: a missing comment listing means "no comments", not a parse failure.
+        for reply in glom(resp, "1.data.children.*.data", default=[]):
+            reply_author = reply.get("author", "author")
+            reply_author_url = f"https://www.reddit.com/user/{reply_author}"
+            comment = reply.get("body", "")
+            if reply_author == "[deleted]":
+                continue
+            # Children without a body would render as an empty comment line
+            # (presumably Reddit's "more" placeholders -- confirm against the API).
+            if not comment:
+                continue
+            if comment == "[removed]" or has_markdown_img(comment):
+                continue
+            comments.append(f"\n💬**[{reply_author}]({reply_author_url})**: {comment}")
+        if comments:
+            comments.insert(0, f"\n{BLOCKQUOTE_EXPANDABLE_DELIM}💬**点此展开评论区**:")
+        await modify_progress(text=f"⏬正在下载:\n{summay_media(media)}", force_update=True, **kwargs)
+        media = await download_media(media, **kwargs)
+    except Exception as e:
+        # Boundary handler: any parsing/download failure is reported to the caller as an error dict.
+        logger.error(e)
+        return {"error": str(e)}
+    return {"texts": texts, "media": media, "comments": comments}
+
+
+def remove_preview_links(text: str) -> str:
+    """Remove the preview.redd.it links in the post contents.
+
+    Each link is removed together with at most one whitespace character that
+    directly follows it. The trailing whitespace is optional so that a link
+    sitting at the very end of the text is removed as well.
+
+    Args:
+        text (str): The post body.
+
+    Returns:
+        str: ``text`` without preview.redd.it links.
+    """
+    pattern = r"https?://preview\.redd\.it/\S+\s?"
+    return re.sub(pattern, "", text)
src/preview/utils.py
@@ -79,3 +79,12 @@ def make_bvid_clickable(texts: str) -> str:
     # match bilibili links or bvid only
     pattern = r"(https?://)?(:?m\.|www\.)?bilibili\.com/video/(BV1[a-zA-Z0-9]{9})\b|\bBV1[a-zA-Z0-9]{9}\b"
     return re.sub(pattern, markdown_url, texts)
+
+
+def has_markdown_img(text: str) -> bool:
+    """Tell whether ``text`` embeds an image written in markdown syntax.
+
+    Such an image looks like ``![alt](https://example.png)``.
+    """
+    # "!" + bracketed alt text + parenthesised target, both matched lazily
+    found = re.search(r"!\[.*?\]\(.*?\)", text)
+    return found is not None
src/config.py
@@ -49,6 +49,7 @@ class ENABLE:  # see fine-grained permission in `src/permission.py`
     TWITTER = os.getenv("ENABLE_TWITTER", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
     WEIBO = os.getenv("ENABLE_WEIBO", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
     WECHAT = os.getenv("ENABLE_WECHAT", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
+    REDDIT = os.getenv("ENABLE_REDDIT", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
     WGET = os.getenv("ENABLE_WGET", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
     XHS = os.getenv("ENABLE_XHS", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
     YTDLP = os.getenv("ENABLE_YTDLP", "1").lower() in ["1", "y", "yes", "t", "true", "on"]
@@ -135,6 +136,7 @@ class PROXY:  # format: socks5://127.0.0.1:7890
     GOOGLE_SEARCH = os.getenv("GOOGLE_SEARCH_PROXY", None)
     DOWNLOAD = os.getenv("DOWNLOAD_PROXY", None)
     WEIBO = os.getenv("WEIBO_PROXY", None)
+    REDDIT = os.getenv("REDDIT_PROXY", None)
     YTDLP = os.getenv("YTDLP_PROXY", None)  # general proxy for ytdlp
     YTDLP_FALLBACK = os.getenv("YTDLP_PROXY_FALLBACK", None)  # fallback proxy for ytdlp
     # for ytdlp proxy of specific sites (Like Bilibili), use this format: YTDLP_PROXY_BILIBILI
src/handler.py
@@ -26,6 +26,7 @@ from permission import check_service
 from preview.bilibili import preview_bilibili
 from preview.douyin import preview_douyin
 from preview.instagram import preview_instagram
+from preview.reddit import preview_reddit
 from preview.twitter import preview_twitter
 from preview.wechat import preview_wechat
 from preview.weibo import preview_weibo
@@ -119,6 +120,7 @@ async def handle_social_media(
     instagram: bool = True,
     twitter: bool = True,
     weibo: bool = True,
+    reddit: bool = True,
     xhs: bool = True,
     ytdlp: bool = True,
     show_progress: bool = True,
@@ -226,6 +228,8 @@ async def handle_social_media(
             await preview_xhs(client, message, **kwargs)
         if xhs and matched["platform"] == "wechat":
             await preview_wechat(client, message, **kwargs)
+        if reddit and matched["platform"] == "reddit":
+            await preview_reddit(client, message, **kwargs)
         if matched["platform"].startswith("bilibili-"):  # this is not bilibili video, for videos, use yt-dlp
             await preview_bilibili(client, message, **kwargs)
         try:
@@ -302,8 +306,10 @@ def get_social_media_help(chat_id: int | str, ctype: str, prefixes: list[str] |
         msg += "\n🎶TikTok"
     if permission["instagram"]:
         msg += "\n🏞Instagram"
+    if permission["reddit"]:
+        msg += "\n🎈Reddit"
     if permission["wechat"]:
-        msg += "\n🟢微信公众号文章"
+        msg += "\n🟢微信文章"
     if permission["ytdlp"]:
         msg += "\n🔴油管"
         msg += "\n🅱️哔哩哔哩"
src/networking.py
@@ -361,6 +361,15 @@ async def match_social_media_link(text: str, *, flatten_first: bool = False) ->
     if matched := re.search(r"(https?://)?mp.weixin.qq.com/s[\/|\?]{1}([_A-Za-z\=\&0-9\#\-]+)", text):
         return {"url": matched.group(0), "db_key": bare_url(matched.group(0)), "platform": "wechat"}
 
+    # https://www.reddit.com/r/DoubanGoosegroup/comments/1jkpgvp/%E8%B5%B5%E8%96%87%E4%BB%80%E4%B9%88%E6%97%B6%E5%80%99%E5%9B%9E%E6%9D%A5/
+    # https://www.reddit.com/r/DoubanGoosegroup/comments/1jkpgvp/赵薇什么时候回来
+    # https://www.reddit.com/r/DoubanGoosegroup/comments/1jkpgvp/comment/mk43l4t/?utm_source=share&utm_medium=web3x&utm_name=web3xcss&utm_term=1&utm_content=share_button
+    if matched := re.search(r"(https?://)?(:?m\.|www\.)?reddit\.com/r/([_A-Za-z0-9]+)/comments/(.*?)/([^,,.。\?\s]+)", text):
+        return {"url": matched.group(0).rstrip("/"), "db_key": bare_url(matched.group(0).rstrip("/")), "platform": "reddit"}
+    # https://reddit.com/comments/1kaazzn
+    if matched := re.search(r"(https?://)?(:?m\.|www\.)?reddit\.com/comments/([_A-Za-z0-9]+)", text):
+        return {"url": matched.group(0).rstrip("/"), "db_key": bare_url(matched.group(0).rstrip("/")), "platform": "reddit"}
+
     # if all above pre-defined patterns failed, try to match ytdlp link
     if urls := match_urls(text):
         for url in urls:
@@ -444,7 +453,8 @@ if __name__ == "__main__":
 
     check_data(json.dumps({"foo": "bar", "baz": {"qux": "quux"}, "lst": ["1", "2", "3"]}), check_keys=["baz.qux"], check_kv={"foo": "bar", "baz.qux": "quux", "lst": ["1", "2", "3"]})
     # asyncio.run(match_social_media_link("https://b23.tv/3MSgT4q/", flatten_first=True))
-    print(asyncio.run(match_social_media_link("https://mp.weixin.qq.com/s/bd_giuPEyPBu9LTOtC2VHw", flatten_first=True)))
+    # print(asyncio.run(match_social_media_link("https://mp.weixin.qq.com/s/bd_giuPEyPBu9LTOtC2VHw", flatten_first=True)))
+    print(asyncio.run(match_social_media_link("https://reddit.com/comments/1kaazzn", flatten_first=True)))
     # asyncio.run(match_social_media_link("https://www.facebook.com/share/r/19QGGp39T3/", flatten_first=True))
     # asyncio.run(match_social_media_link("https://www.douyin.com/video/7398813386827468041"))
     # asyncio.run(match_social_media_link("https://www.iesdouyin.com/share/note/7454527270925946138/"))
src/permission.py
@@ -113,6 +113,7 @@ def check_service(cid: int | str, ctype: str) -> dict:
         "weibo": True,
         "xhs": True,
         "wechat": True,
+        "reddit": True,
         "ytdlp": True,
     }
 
@@ -135,6 +136,8 @@ def check_service(cid: int | str, ctype: str) -> dict:
         permission["instagram"] = False
     if not ENABLE.WECHAT:
         permission["wechat"] = False
+    if not ENABLE.REDDIT:
+        permission["reddit"] = False
     if not ENABLE.YTDLP:
         permission["ytdlp"] = False
     if not ENABLE.GPT: