Commit 522e0a3
Changed files (5)
src
src/asr/voice_recognition.py
@@ -168,7 +168,7 @@ async def voice_to_text(
logger.success(f"{final!r}")
# send results
- caption = smart_split(final, CAPTION_LENGTH)[0]
+ caption = (await smart_split(final, CAPTION_LENGTH))[0]
remaining_texts = final.removeprefix(caption)
reply_parameters = get_reply_to(trigger_info["mid"], kwargs.get("reply_msg_id", 0))
target_chat = kwargs["target_chat"] if kwargs.get("target_chat") else this_info["cid"]
src/messages/preprocess.py
@@ -6,6 +6,7 @@ from loguru import logger
from pyrogram.types import InputMediaAudio, InputMediaDocument, InputMediaPhoto, InputMediaVideo
from config import CAPTION_LENGTH
+from messages.utils import count_without_entities, smart_split
from multimedia import fix_video_rotation, generate_cover, is_valid_video_or_audio, parse_media_info, split_large_video, split_long_img, validate_img
@@ -112,7 +113,7 @@ def preprocess_media(media: list[dict]) -> list[dict]:
return done_audios
-def warp_media_group(media: list[dict], caption: str = "") -> list:
+async def warp_media_group(media: list[dict], caption: str = "") -> list:
"""Warp media files into a list of media group objects.
item in media:
@@ -139,9 +140,9 @@ def warp_media_group(media: list[dict], caption: str = "") -> list:
if len(media) < 2:
logger.error(f"Media group requires at least 2 items, number of media: {len(media)}")
return []
- if len(caption) > CAPTION_LENGTH:
+ if await count_without_entities(caption) > CAPTION_LENGTH:
logger.warning(f"Caption too long, length: {len(caption)}, caption: {caption}")
- caption = caption[:CAPTION_LENGTH]
+ caption = (await smart_split(caption, CAPTION_LENGTH))[0]
if len(media) > 10:
logger.warning(f"Too many media files, number of media: {len(media)}")
media = media[:10]
src/messages/sender.py
@@ -11,7 +11,7 @@ from pyrogram.types import Message, ReplyParameters
from config import CAPTION_LENGTH
from messages.preprocess import preprocess_media, warp_media_group
from messages.progress import modify_progress, telegram_uploading
-from messages.utils import get_reply_to, smart_split, summay_media
+from messages.utils import count_without_entities, get_reply_to, smart_split, summay_media
from utils import to_int
@@ -79,11 +79,11 @@ async def send2tg(
await modify_progress(text=f"⏫正在上传:\n{summay_media(media)}", force_update=True, **kwargs)
# append comments to texts
- # For len(texts) < 1024 , ensure the combined texts and comments remains below 1024 characters to avoid sending a subsequent message containing only the comments.
+ # For texts shorter than 1024 characters, ensure the combined texts and comments remain below 1024 characters to avoid sending a subsequent message containing only the comments.
# For long texts, keep all comments
- if len(texts) < CAPTION_LENGTH:
+ if await count_without_entities(texts) < CAPTION_LENGTH:
for comment in comments:
- if len(f"{texts}{comment}") < CAPTION_LENGTH:
+ if await count_without_entities(f"{texts}{comment}") < CAPTION_LENGTH:
texts += comment
else:
texts = texts + "".join(comments)
@@ -95,10 +95,10 @@ async def send2tg(
if len(media) == 1:
return await send_single_media(client, target_chat, reply_parameters, media=media[0], texts=texts, cooldown=cooldown, **kwargs)
- caption = smart_split(texts, CAPTION_LENGTH)[0]
+ caption = (await smart_split(texts, CAPTION_LENGTH))[0]
remaining_texts = texts.removeprefix(caption)
if 1 < len(media) <= 10:
- group = warp_media_group(media, caption=caption)
+ group = await warp_media_group(media, caption=caption)
sent_messages.extend(await client.send_media_group(target_chat, media=group, reply_parameters=reply_parameters))
else: # media > 10
media_chunks = [media[i : i + 10] for i in range(0, len(media), 10)]
@@ -106,10 +106,10 @@ async def send2tg(
# send pure media first, and append captions at the last chunk
for idx, batch in enumerate(media_chunks):
if idx == 0: # first chunk
- group = warp_media_group(batch)
+ group = await warp_media_group(batch)
sent_messages.extend(await client.send_media_group(target_chat, media=group, reply_parameters=reply_parameters))
elif idx != num_chunk - 1: # disbale reply if not the last chunk
- group = warp_media_group(batch)
+ group = await warp_media_group(batch)
sent_messages.extend(await client.send_media_group(target_chat, media=group, reply_parameters=ReplyParameters()))
else: # last chunk: media <= 10, add caption here
sent_messages.extend(await send2tg(client, message, target_chat, reply_msg_id=-1, texts=caption, media=batch, cooldown=cooldown, **kwargs))
@@ -135,7 +135,7 @@ async def send_texts(
) -> list[Message | None]:
sent_messages: list[Message | None] = []
logger.trace(f"Sending {len(texts)} texts only")
- for idx, msg in enumerate(smart_split(texts.strip())):
+ for idx, msg in enumerate(await smart_split(texts.strip())):
if not msg:
continue
if idx == 0:
@@ -160,7 +160,7 @@ async def send_single_media(
) -> list[Message | None]:
sent_messages: list[Message | None] = []
logger.trace(f"Sending single media with {len(texts)} texts")
- caption = smart_split(texts, CAPTION_LENGTH)[0]
+ caption = (await smart_split(texts, CAPTION_LENGTH))[0]
remaining_texts = texts.removeprefix(caption)
if photo := media.get("photo"):
sent_messages.append(await client.send_photo(chat_id=target_chat, photo=photo, caption=caption, reply_parameters=reply_parameters))
src/messages/utils.py
@@ -3,6 +3,8 @@
import re
+from pyrogram.enums import ParseMode
+from pyrogram.parser.parser import Parser
from pyrogram.types import ReplyParameters
from config import TEXT_LENGTH, cache
@@ -79,26 +81,46 @@ def sender_markdown_to_html(sender: str) -> str:
return re.sub(r"^👤\[@(.*?)\]\(tg://user\?id=(\d+)\)", r'👤<a href="tg://user?id=\2">@\1</a>', sender)
-def smart_split(text: str, chars_per_string: int = TEXT_LENGTH) -> list[str]:
async def count_without_entities(strings: str, mode: ParseMode = ParseMode.DEFAULT) -> int:
    """Return the length of ``strings`` as it reads after parsing.

    The text is run through a client-less ``Parser`` in the given parse
    ``mode``; the length of the ``"message"`` value of the parse result is
    what gets reported, not the length of the raw input.

    Args:
        strings: Text to measure.
        mode: Parse mode handed to ``Parser.parse``.

    Returns:
        Number of characters in the parsed message text.
    """
    # No client is attached: the parser is used here only to obtain the message text.
    result = await Parser(client=None).parse(strings, mode=mode)
    message = result["message"]
    return len(message)
+
+
async def smart_split(text: str, chars_per_string: int = TEXT_LENGTH, mode: ParseMode = ParseMode.DEFAULT) -> list[str]:
    """Split ``text`` into consecutive chunks whose parsed length stays below ``chars_per_string``.

    Length is measured with ``count_without_entities`` (the text left after
    parsing in ``mode``), not with ``len`` of the raw input. Chunks are grown
    greedily, one separator-delimited segment at a time, and every chunk is an
    exact consecutive slice of ``text``: ``"".join(result) == text``, so
    callers may safely use ``text.removeprefix(result[0])``.

    A single segment that is too long on its own (for example a long line
    followed by a newline) is force-cut to a fitting prefix, preferring the
    last space inside the allowed window. Without this, such a segment was
    returned as one chunk exceeding the limit, followed by an empty chunk.

    Args:
        text: Text to split.
        chars_per_string: Exclusive upper bound on the parsed length of a chunk.
            If no non-empty chunk can satisfy it (values <= 1), chunks of one
            character are produced so the function still terminates.
        mode: Parse mode forwarded to ``count_without_entities``.

    Returns:
        A non-empty list of chunks; ``[text]`` when ``text`` is empty or
        already fits.
    """
    # ruff: noqa: RUF001
    # Separators in priority order: the first one present anywhere in the
    # remaining text decides where the next segment ends.
    # NOTE(review): if the second entry is a plain ASCII space, the ". " entry
    # can never be reached (any text containing ". " also contains " ") --
    # confirm which character was intended.
    separators = ("\n", " ", ". ", "。", ";", "!", "?")

    def next_sentence(strings: str) -> str:
        """Return the prefix of ``strings`` up to and including the first hit of the highest-priority separator."""
        for sep in separators:
            head, found, _ = strings.partition(sep)
            if found:
                return head + sep
        return strings

    async def fits(candidate: str) -> bool:
        """Tell whether ``candidate`` is short enough to be sent as one chunk."""
        return await count_without_entities(candidate, mode) < chars_per_string

    async def force_cut(segment: str) -> str:
        """Return a fitting, non-empty prefix of an oversized ``segment``."""
        # Start from the raw-length window; always keep at least one character
        # so the outer loop makes progress.
        cut = min(len(segment), max(chars_per_string - 1, 1))
        # Prefer ending on a word boundary inside that window.
        boundary = segment.rfind(" ", 0, cut)
        if boundary > 0:
            cut = boundary + 1
        # The parsed length is what counts, so shrink further if the raw cut
        # still does not fit.
        while cut > 1 and not await fits(segment[:cut]):
            cut -= 1
        return segment[:cut]

    parts: list[str] = []
    remaining = text
    while remaining and not await fits(remaining):
        part = next_sentence(remaining)
        if not await fits(part):
            # The first segment alone is too long: cut inside it.
            part = await force_cut(part)
        else:
            # Greedily append further segments while the chunk still fits.
            left = remaining[len(part) :]
            step = next_sentence(left)
            while await fits(part + step):
                part += step
                left = remaining[len(part) :]
                if not left.strip():
                    break
                step = next_sentence(left)
        parts.append(part)
        remaining = remaining[len(part) :]

    # Keep the tail; an empty tail is only emitted when it is the sole chunk.
    if remaining or not parts:
        parts.append(remaining)
    return parts
src/preview/ytdlp.py
@@ -22,7 +22,7 @@ from messages.database import copy_messages_from_db, save_messages
from messages.preprocess import preprocess_media
from messages.progress import modify_progress, telegram_uploading
from messages.sender import send2tg
-from messages.utils import get_reply_to
+from messages.utils import count_without_entities, get_reply_to, smart_split
from multimedia import convert_to_h264, generate_cover
from networking import hx_req
from others.emoji import emojify
@@ -163,7 +163,7 @@ async def preview_ytdlp(
comments = await get_youtube_comments(kwargs.get("vid"), youtube_comments_provider)
for comment in comments:
- if len(f"{texts}{comment}") < CAPTION_LENGTH:
+ if await count_without_entities(f"{texts}{comment}") < CAPTION_LENGTH:
texts += comment
texts = texts.strip()
sent_messages: list[Message | None] = [] # 把发送的消息都记录下来
@@ -186,7 +186,7 @@ async def preview_ytdlp(
sent_messages.append(
await client.send_video(
chat_id=to_int(target_chat),
- caption=caption[:CAPTION_LENGTH],
+ caption=(await smart_split(caption, CAPTION_LENGTH))[0],
reply_parameters=reply_parameters,
progress=telegram_uploading,
progress_args=(kwargs.get("progress", False), video["video"], true(kwargs.get("detail_progress"))), # message, path, detail_progress
@@ -201,7 +201,7 @@ async def preview_ytdlp(
await client.send_audio(
chat_id=target_chat,
audio=audio_path.as_posix(),
- caption=texts[:CAPTION_LENGTH],
+ caption=(await smart_split(texts, CAPTION_LENGTH))[0],
performer=info["author"],
title=info["title"],
duration=duration,