Commit c7886e5

benny-dou <60535774+benny-dou@users.noreply.github.com>
2025-03-08 03:19:30
chore: parse `html` format for message
1 parent 886a272
Changed files (2)
src/messages/database.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-import contextlib
 import json
 import re
 
@@ -11,6 +10,7 @@ from pyrogram.types import Message, ReplyParameters
 
 from config import DB
 from database import del_db, get_db, set_db
+from messages.parser import parse_msg
 from messages.progress import modify_progress
 from messages.utils import sender_markdown_to_html
 from utils import to_int
@@ -46,16 +46,10 @@ async def save_messages(messages: list[Message | None], key: str, metadata: dict
     data = []
     media_group_ids = set()  # save once
     for msg in valid_messages:
-        text = ""
-        if msg.text:
-            text = msg.text
-        if msg.caption:
-            text = msg.caption
-        if hasattr(text, "html"):  # DO NOT use markdown, because this format has some bugs
-            text = text.html  # type: ignore
+        info = parse_msg(msg, silent=True)
         # Caution: this format should be consistent with `handle_social_media` function in `handler.py`
         # text = re.sub(r"^👤\[@.*?\]\(tg://user\?id=\d+\)//", "", text)  # remove markdown send_from_user
-        text = re.sub(r"^👤\<a.*?tg://user\?id=\d+.*?@.*?</a>//", "", text)  # remove markdown send_from_user
+        text = re.sub(r"^👤\<a.*?tg://user\?id=\d+.*?@.*?</a>//", "", info["html"])  # remove markdown send_from_user
         msg_extra = {"text": text} if text else {}
         if msg.media_group_id:
             if msg.media_group_id not in media_group_ids:
@@ -63,19 +57,19 @@ async def save_messages(messages: list[Message | None], key: str, metadata: dict
                 media_group_ids.add(msg.media_group_id)
                 data.append({"type": "media_group", "cid": msg.chat.id, "mid": msg.id} | msg_extra)
             continue
-        if msg.video:
+        if info["mtype"] == "video":
             logger.trace(f"Saving video message {msg.id}")
             data.append({"type": "video", "cid": msg.chat.id, "mid": msg.id} | msg_extra)
             continue
-        if msg.photo:
+        if info["mtype"] == "photo":
             logger.trace(f"Saving photo message {msg.id}")
             data.append({"type": "photo", "cid": msg.chat.id, "mid": msg.id} | msg_extra)
             continue
-        if msg.audio:
+        if info["mtype"] == "audio":
             logger.trace(f"Saving audio message {msg.id}")
             data.append({"type": "audio", "cid": msg.chat.id, "mid": msg.id} | msg_extra)
             continue
-        if msg.text:
+        if info["mtype"] == "text":
             logger.trace(f"Saving text message {msg.id}")
             data.append({"type": "text", "cid": msg.chat.id, "mid": msg.id} | msg_extra)
             continue
@@ -133,8 +127,7 @@ async def copy_messages_from_db(client: Client, message: Message, key: str, kv:
     results = []
     try:
         for idx, item in enumerate(sorted(data, key=lambda x: x["mid"])):
-            with contextlib.suppress(ValueError):
-                cid = int(item["cid"])
+            cid = to_int(item["cid"])
             if idx != 0:
                 reply_parameters = ReplyParameters()  # only send as reply of the first message
             logger.debug(f"Copying {item['type']} message: ({cid}, {item['mid']}) -> target_chat={target_chat}")
src/messages/parser.py
@@ -31,6 +31,7 @@ def parse_msg(message: Message, *, silent: bool = False, verbose: bool = False)
     media_group_id = message.media_group_id if message.media_group_id else 0
     is_bot = bool(message.from_user and message.from_user.is_bot)
     text = message.text or message.caption or ""
+    html = text.html if hasattr(text, "html") else ""  # type: ignore
     dt = message.date.replace(tzinfo=ZoneInfo(TZ)) if isinstance(message.date, datetime) else nowdt(TZ)
     time = f"{dt:%Y-%m-%d %H:%M:%S}"
 
@@ -96,6 +97,7 @@ def parse_msg(message: Message, *, silent: bool = False, verbose: bool = False)
         "media_group_id": int(media_group_id),
         "is_bot": bool(is_bot),
         "text": str(text),
+        "html": str(html),
         "first_name": str(first_name),
         "last_name": str(last_name),
         "full_name": str(full_name),