Commit 522e0a3

benny-dou <60535774+benny-dou@users.noreply.github.com>
2025-02-19 04:00:31
style: count message length without entities
1 parent 471cf7f
Changed files (5)
src/asr/voice_recognition.py
@@ -168,7 +168,7 @@ async def voice_to_text(
         logger.success(f"{final!r}")
 
         # send results
-        caption = smart_split(final, CAPTION_LENGTH)[0]
+        caption = (await smart_split(final, CAPTION_LENGTH))[0]
         remaining_texts = final.removeprefix(caption)
         reply_parameters = get_reply_to(trigger_info["mid"], kwargs.get("reply_msg_id", 0))
         target_chat = kwargs["target_chat"] if kwargs.get("target_chat") else this_info["cid"]
src/messages/preprocess.py
@@ -6,6 +6,7 @@ from loguru import logger
 from pyrogram.types import InputMediaAudio, InputMediaDocument, InputMediaPhoto, InputMediaVideo
 
 from config import CAPTION_LENGTH
+from messages.utils import count_without_entities, smart_split
 from multimedia import fix_video_rotation, generate_cover, is_valid_video_or_audio, parse_media_info, split_large_video, split_long_img, validate_img
 
 
@@ -112,7 +113,7 @@ def preprocess_media(media: list[dict]) -> list[dict]:
     return done_audios
 
 
-def warp_media_group(media: list[dict], caption: str = "") -> list:
+async def warp_media_group(media: list[dict], caption: str = "") -> list:
     """Warp media files into a list of media group objects.
 
     item in media:
@@ -139,9 +140,9 @@ def warp_media_group(media: list[dict], caption: str = "") -> list:
     if len(media) < 2:
         logger.error(f"Media group requires at least 2 items, number of media: {len(media)}")
         return []
-    if len(caption) > CAPTION_LENGTH:
+    if await count_without_entities(caption) > CAPTION_LENGTH:
         logger.warning(f"Caption too long, length: {len(caption)}, caption: {caption}")
-        caption = caption[:CAPTION_LENGTH]
+        caption = (await smart_split(caption, CAPTION_LENGTH))[0]
     if len(media) > 10:
         logger.warning(f"Too many media files, number of media: {len(media)}")
         media = media[:10]
src/messages/sender.py
@@ -11,7 +11,7 @@ from pyrogram.types import Message, ReplyParameters
 from config import CAPTION_LENGTH
 from messages.preprocess import preprocess_media, warp_media_group
 from messages.progress import modify_progress, telegram_uploading
-from messages.utils import get_reply_to, smart_split, summay_media
+from messages.utils import count_without_entities, get_reply_to, smart_split, summay_media
 from utils import to_int
 
 
@@ -79,11 +79,11 @@ async def send2tg(
         await modify_progress(text=f"⏫正在上传:\n{summay_media(media)}", force_update=True, **kwargs)
 
     # append comments to texts
-    # For len(texts) < 1024 , ensure the combined texts and comments remains below 1024 characters to avoid sending a subsequent message containing only the comments.
+    # When texts is shorter than 1024 characters (entities excluded), only append the comments that keep the combined length below 1024 characters, so that no follow-up message containing only comments is sent.
     # For long texts, keep all comments
-    if len(texts) < CAPTION_LENGTH:
+    if await count_without_entities(texts) < CAPTION_LENGTH:
         for comment in comments:
-            if len(f"{texts}{comment}") < CAPTION_LENGTH:
+            if await count_without_entities(f"{texts}{comment}") < CAPTION_LENGTH:
                 texts += comment
     else:
         texts = texts + "".join(comments)
@@ -95,10 +95,10 @@ async def send2tg(
     if len(media) == 1:
         return await send_single_media(client, target_chat, reply_parameters, media=media[0], texts=texts, cooldown=cooldown, **kwargs)
 
-    caption = smart_split(texts, CAPTION_LENGTH)[0]
+    caption = (await smart_split(texts, CAPTION_LENGTH))[0]
     remaining_texts = texts.removeprefix(caption)
     if 1 < len(media) <= 10:
-        group = warp_media_group(media, caption=caption)
+        group = await warp_media_group(media, caption=caption)
         sent_messages.extend(await client.send_media_group(target_chat, media=group, reply_parameters=reply_parameters))
     else:  # media > 10
         media_chunks = [media[i : i + 10] for i in range(0, len(media), 10)]
@@ -106,10 +106,10 @@ async def send2tg(
         # send pure media first, and append captions at the last chunk
         for idx, batch in enumerate(media_chunks):
             if idx == 0:  # first chunk
-                group = warp_media_group(batch)
+                group = await warp_media_group(batch)
                 sent_messages.extend(await client.send_media_group(target_chat, media=group, reply_parameters=reply_parameters))
             elif idx != num_chunk - 1:  # disbale reply if not the last chunk
-                group = warp_media_group(batch)
+                group = await warp_media_group(batch)
                 sent_messages.extend(await client.send_media_group(target_chat, media=group, reply_parameters=ReplyParameters()))
             else:  # last chunk:  media <= 10, add caption here
                 sent_messages.extend(await send2tg(client, message, target_chat, reply_msg_id=-1, texts=caption, media=batch, cooldown=cooldown, **kwargs))
@@ -135,7 +135,7 @@ async def send_texts(
 ) -> list[Message | None]:
     sent_messages: list[Message | None] = []
     logger.trace(f"Sending {len(texts)} texts only")
-    for idx, msg in enumerate(smart_split(texts.strip())):
+    for idx, msg in enumerate(await smart_split(texts.strip())):
         if not msg:
             continue
         if idx == 0:
@@ -160,7 +160,7 @@ async def send_single_media(
 ) -> list[Message | None]:
     sent_messages: list[Message | None] = []
     logger.trace(f"Sending single media with {len(texts)} texts")
-    caption = smart_split(texts, CAPTION_LENGTH)[0]
+    caption = (await smart_split(texts, CAPTION_LENGTH))[0]
     remaining_texts = texts.removeprefix(caption)
     if photo := media.get("photo"):
         sent_messages.append(await client.send_photo(chat_id=target_chat, photo=photo, caption=caption, reply_parameters=reply_parameters))
src/messages/utils.py
@@ -3,6 +3,8 @@
 
 import re
 
+from pyrogram.enums import ParseMode
+from pyrogram.parser.parser import Parser
 from pyrogram.types import ReplyParameters
 
 from config import TEXT_LENGTH, cache
@@ -79,26 +81,62 @@ def sender_markdown_to_html(sender: str) -> str:
     return re.sub(r"^👤\[@(.*?)\]\(tg://user\?id=(\d+)\)", r'👤<a href="tg://user?id=\2">@\1</a>', sender)
 
 
-def smart_split(text: str, chars_per_string: int = TEXT_LENGTH) -> list[str]:
+async def count_without_entities(strings: str, mode: ParseMode = ParseMode.DEFAULT) -> int:
+    """Return the length of the message text that pyrogram's parser extracts from `strings` in `mode`."""
+    parser = Parser(client=None)
+    parsed = await parser.parse(strings, mode=mode)
+    return len(parsed["message"])
+
+
+async def smart_split(text: str, chars_per_string: int = TEXT_LENGTH, mode: ParseMode = ParseMode.DEFAULT) -> list[str]:
-    """Splits one string into multiple strings, with a maximum amount of `chars_per_string` characters per string."""
+    """Split `text` into consecutive chunks whose entity-free length stays below `chars_per_string`.
+
+    Each chunk is an exact slice of `text` and the chunks concatenate back to the input, so callers
+    can recover the remainder with `str.removeprefix`. At least one chunk is always returned
+    (`[""]` for empty input).
+
+    Chunks are built from pieces that end at a boundary, preferring coarser boundaries: newline,
+    ". ", space, "。", ";", "!", "?". When not even the first piece fits (e.g. one very long word),
+    the text is cut after `chars_per_string - 1` raw characters; such a cut may land inside entity markup.
+    """
 
-    def _text_before_last(substr: str) -> str:
-        return substr.join(part.split(substr)[:-1]) + substr
+    # ruff: noqa: RUF001
+    # Coarsest boundary first. ". " must be tested before " ": every ". " contains a space, so the
+    # opposite order would make the sentence boundary unreachable.
+    separators = ("\n", ". ", " ", "。", ";", "!", "?")
+
+    async def fits(candidate: str) -> bool:
+        """Tell whether `candidate` is short enough to be sent as one chunk."""
+        return await count_without_entities(candidate, mode) < chars_per_string
+
+    def leading_pieces(rest: str):
+        """Yield, per separator found in `rest`, the text up to and including its first occurrence."""
+        for sep in separators:
+            head, found, _ = rest.partition(sep)
+            if found:
+                yield head + found
 
     parts = []
-    while True:
-        if len(text) < chars_per_string:
-            parts.append(text)
-            return parts
-
-        part = text[:chars_per_string]
-
-        if "\n" in part:
-            part = _text_before_last("\n")
-        elif ". " in part:
-            part = _text_before_last(". ")
-        elif " " in part:
-            part = _text_before_last(" ")
-
+    while text and not await fits(text):
+        # Start with the coarsest leading piece that fits on its own.
+        part = ""
+        for piece in leading_pieces(text):
+            if await fits(piece):
+                part = piece
+                break
+        if not part:
+            # Hard cut as a last resort; always take at least one character so the loop advances.
+            # Assumes parsing never makes the text longer than its raw form.
+            part = text[: max(chars_per_string - 1, 1)]
+        # Keep appending the next piece for as long as the chunk stays within the limit.
+        while True:
+            piece = next(leading_pieces(text[len(part) :]), "")
+            if not piece or not await fits(part + piece):
+                break
+            part += piece
         parts.append(part)
         text = text[len(part) :]
+    # Append the remainder; this also guarantees a non-empty result for empty input.
+    if text or not parts:
+        parts.append(text)
+    return parts
src/preview/ytdlp.py
@@ -22,7 +22,7 @@ from messages.database import copy_messages_from_db, save_messages
 from messages.preprocess import preprocess_media
 from messages.progress import modify_progress, telegram_uploading
 from messages.sender import send2tg
-from messages.utils import get_reply_to
+from messages.utils import count_without_entities, get_reply_to, smart_split
 from multimedia import convert_to_h264, generate_cover
 from networking import hx_req
 from others.emoji import emojify
@@ -163,7 +163,7 @@ async def preview_ytdlp(
         comments = await get_youtube_comments(kwargs.get("vid"), youtube_comments_provider)
 
     for comment in comments:
-        if len(f"{texts}{comment}") < CAPTION_LENGTH:
+        if await count_without_entities(f"{texts}{comment}") < CAPTION_LENGTH:
             texts += comment
     texts = texts.strip()
     sent_messages: list[Message | None] = []  # 把发送的消息都记录下来
@@ -186,7 +186,7 @@ async def preview_ytdlp(
             sent_messages.append(
                 await client.send_video(
                     chat_id=to_int(target_chat),
-                    caption=caption[:CAPTION_LENGTH],
+                    caption=(await smart_split(caption, CAPTION_LENGTH))[0],
                     reply_parameters=reply_parameters,
                     progress=telegram_uploading,
                     progress_args=(kwargs.get("progress", False), video["video"], true(kwargs.get("detail_progress"))),  # message, path, detail_progress
@@ -201,7 +201,7 @@ async def preview_ytdlp(
             await client.send_audio(
                 chat_id=target_chat,
                 audio=audio_path.as_posix(),
-                caption=texts[:CAPTION_LENGTH],
+                caption=(await smart_split(texts, CAPTION_LENGTH))[0],
                 performer=info["author"],
                 title=info["title"],
                 duration=duration,