Commit 31b3b7a

benny-dou <60535774+benny-dou@users.noreply.github.com>
2025-05-17 03:16:51
chore(subtitle): count subtitles more accurately
1 parent a7e450b
Changed files (5)
src/preview/utils.py
@@ -69,7 +69,15 @@ async def get_bilibili_video_info(url_or_vid: int | str) -> dict:
 
 
 async def get_bilibili_subtitle(url_or_vid: int | str) -> dict:
-    """Get Bilibili subtitle."""
+    """Get Bilibili subtitle.
+
+    Returns:
+        dict: {
+            "subtitles": "[minute:second] texts",
+            "num_chars": len(texts),
+            "reading_minutes": 2,
+            }
+    """
     try:
         # url to vid
         info = await get_bilibili_video_info(url_or_vid)
src/preview/ytdlp.py
@@ -45,7 +45,7 @@ from multimedia import convert_to_h264, generate_cover
 from networking import hx_req
 from preview.utils import fetch_youtube_video_info, get_bilibili_comments, make_bvid_clickable
 from subtitles.base import fetch_subtitle
-from utils import nowdt, publish_telegraph, readable_size, readable_time, remove_none_values, soup_to_text, to_int, true, ts_to_dt, unicode_to_ascii
+from utils import count_subtitles, nowdt, publish_telegraph, readable_size, readable_time, remove_none_values, soup_to_text, to_int, true, ts_to_dt, unicode_to_ascii
 
 
 class ProxyError(Exception):
@@ -266,7 +266,7 @@ async def preview_ytdlp(
         if subtitles:
             if len(subtitles) > TEXT_LENGTH or transcription_force_file:
                 caption = f"{emoji}[{info['author']}]({info['author_url']})\n🕒{create_time}"
-                caption += f"\n📝[{info['title']}]({url})\n#️⃣字符数: {len(subtitles)}\n⏳阅读时长: {len(subtitles) / READING_SPEED:.1f}分钟"
+                caption += f"\n📝[{info['title']}]({url})\n#️⃣字符数: {count_subtitles(subtitles)}\n⏳阅读时长: {count_subtitles(subtitles) / READING_SPEED:.1f}分钟"
                 if to_telegraph:
                     html = "\n".join([f"<p>{s}</p>" for s in subtitles.split("\n")])
                     if telegraph_url := await publish_telegraph(title=info["title"], html=html, author=info["author"], url=url):
src/subtitles/base.py
@@ -48,7 +48,15 @@ async def match_url(client: Client, message: Message) -> str:
 
 @cache.memoize(ttl=120)
 async def fetch_subtitle(url: str, provider: str) -> dict:
-    """Fetch subtitles from Bilibili or YouTube."""
+    """Fetch subtitles from Bilibili or YouTube.
+
+    Returns:
+        dict: {
+            "subtitles": "[minute:second] texts",
+            "num_chars": len(texts),
+            "reading_minutes": 2,
+            }
+    """
     succ = False
     error = "❌下载内嵌字幕失败\n🔄尝试使用语音转文字获取字幕"
     subtitles = []
@@ -89,9 +97,9 @@ async def to_transcription(subtitles: list[dict]) -> dict:
 
     Returns:
         dict: {
-            "subtitles": "[minute:second] transcription",
-            "num_chars": 11,
-            "num_tokens": 2,
+            "subtitles": "[minute:second] texts",
+            "num_chars": len(texts),
+            "reading_minutes": 2,
             }
     """
     if not subtitles:
src/subtitles/subtitle.py
@@ -18,7 +18,7 @@ from networking import match_social_media_link
 from preview.utils import fetch_youtube_video_info, get_bilibili_video_info
 from preview.ytdlp import preview_ytdlp
 from subtitles.base import fetch_subtitle, match_url
-from utils import publish_telegraph, to_int
+from utils import count_subtitles, publish_telegraph, to_int
 
 HELP = f"""📃**提取字幕**
 使用说明:
@@ -70,7 +70,7 @@ async def get_subtitle(client: Client, message: Message, youtube_subtitle_provid
             if res.get("error"):
                 await modify_progress(text=res["error"], force_update=True, **kwargs)
                 return
-            res |= {"subtitles": res["texts"], "num_chars": len(res["texts"]), "reading_minutes": len(res["texts"]) / READING_SPEED}
+            res |= {"subtitles": res["texts"], "num_chars": count_subtitles(res["texts"]), "reading_minutes": count_subtitles(res["texts"]) / READING_SPEED}
         else:
             await modify_progress(text=error + "\n正在通过下载音频后ASR识别字幕", force_update=True, **kwargs)
             kwargs |= {
src/utils.py
@@ -146,12 +146,37 @@ def soup_to_text(soup: PageElement) -> str:
     return text
 
 
-def number_to_emoji(num: int | str, default: str = "") -> str:
+def number_to_emoji(num: int | str, default: str | None = None) -> str:
     """Convert a number to an emoji."""
     num = str(num)
+    if default is None:
+        default = num
     return {"0": "0️⃣", "1": "1️⃣", "2": "2️⃣", "3": "3️⃣", "4": "4️⃣", "5": "5️⃣", "6": "6️⃣", "7": "7️⃣", "8": "8️⃣", "9": "9️⃣", "10": "🔟"}.get(num, default)
 
 
+def seconds_to_hms(seconds: float | str) -> str:
+    """Convert seconds to mm:ss, or to hh:mm:ss when the hour part is non-zero."""
+    seconds = int(float(seconds))
+    m, s = divmod(seconds, 60)
+    h, m = divmod(m, 60)
+    if h == 0:
+        return f"{m:02d}:{s:02d}"
+    return f"{h:02d}:{m:02d}:{s:02d}"
+
+
+def count_subtitles(texts: str) -> int:
+    """Count characters in texts after stripping the leading [...] timestamp from each line.
+
+    Args:
+        texts: Input string whose lines may start with a bracketed timestamp such as [mm:ss]
+
+    Returns:
+        int: Character count after timestamp removal
+    """
+    cleaned_text = re.sub(r"^\[.*?\]\s?", "", texts, flags=re.MULTILINE)
+    return len(cleaned_text)
+
+
 def stringfy(d: dict) -> dict:
     """Convert dict values to string.