Commit 31b3b7a
Changed files (5)
src
src/preview/utils.py
@@ -69,7 +69,15 @@ async def get_bilibili_video_info(url_or_vid: int | str) -> dict:
async def get_bilibili_subtitle(url_or_vid: int | str) -> dict:
- """Get Bilibili subtitle."""
+ """Get Bilibili subtitle.
+
+ Returns:
+ dict: {
+ "subtitles": "[minute:second] texts",
+ "num_chars": len(texts),
+ "reading_minutes": 2,
+ }
+ """
try:
# url to vid
info = await get_bilibili_video_info(url_or_vid)
src/preview/ytdlp.py
@@ -45,7 +45,7 @@ from multimedia import convert_to_h264, generate_cover
from networking import hx_req
from preview.utils import fetch_youtube_video_info, get_bilibili_comments, make_bvid_clickable
from subtitles.base import fetch_subtitle
-from utils import nowdt, publish_telegraph, readable_size, readable_time, remove_none_values, soup_to_text, to_int, true, ts_to_dt, unicode_to_ascii
+from utils import count_subtitles, nowdt, publish_telegraph, readable_size, readable_time, remove_none_values, soup_to_text, to_int, true, ts_to_dt, unicode_to_ascii
class ProxyError(Exception):
@@ -266,7 +266,7 @@ async def preview_ytdlp(
if subtitles:
if len(subtitles) > TEXT_LENGTH or transcription_force_file:
caption = f"{emoji}[{info['author']}]({info['author_url']})\n🕒{create_time}"
- caption += f"\n📝[{info['title']}]({url})\n#️⃣字符数: {len(subtitles)}\n⏳阅读时长: {len(subtitles) / READING_SPEED:.1f}分钟"
+ caption += f"\n📝[{info['title']}]({url})\n#️⃣字符数: {count_subtitles(subtitles)}\n⏳阅读时长: {count_subtitles(subtitles) / READING_SPEED:.1f}分钟"
if to_telegraph:
html = "\n".join([f"<p>{s}</p>" for s in subtitles.split("\n")])
if telegraph_url := await publish_telegraph(title=info["title"], html=html, author=info["author"], url=url):
src/subtitles/base.py
@@ -48,7 +48,15 @@ async def match_url(client: Client, message: Message) -> str:
@cache.memoize(ttl=120)
async def fetch_subtitle(url: str, provider: str) -> dict:
- """Fetch subtitles from Bilibili or YouTube."""
+ """Fetch subtitles from Bilibili or YouTube.
+
+ Returns:
+ dict: {
+ "subtitles": "[minute:second] texts",
+ "num_chars": len(texts),
+ "reading_minutes": 2,
+ }
+ """
succ = False
error = "❌下载内嵌字幕失败\n🔄尝试使用语音转文字获取字幕"
subtitles = []
@@ -89,9 +97,9 @@ async def to_transcription(subtitles: list[dict]) -> dict:
Returns:
dict: {
- "subtitles": "[minute:second] transcription",
- "num_chars": 11,
- "num_tokens": 2,
+ "subtitles": "[minute:second] texts",
+ "num_chars": len(texts),
+ "reading_minutes": 2,
}
"""
if not subtitles:
src/subtitles/subtitle.py
@@ -18,7 +18,7 @@ from networking import match_social_media_link
from preview.utils import fetch_youtube_video_info, get_bilibili_video_info
from preview.ytdlp import preview_ytdlp
from subtitles.base import fetch_subtitle, match_url
-from utils import publish_telegraph, to_int
+from utils import count_subtitles, publish_telegraph, to_int
HELP = f"""📃**提取字幕**
使用说明:
@@ -70,7 +70,7 @@ async def get_subtitle(client: Client, message: Message, youtube_subtitle_provid
if res.get("error"):
await modify_progress(text=res["error"], force_update=True, **kwargs)
return
- res |= {"subtitles": res["texts"], "num_chars": len(res["texts"]), "reading_minutes": len(res["texts"]) / READING_SPEED}
+ res |= {"subtitles": res["texts"], "num_chars": count_subtitles(res["texts"]), "reading_minutes": count_subtitles(res["texts"]) / READING_SPEED}
else:
await modify_progress(text=error + "\n正在通过下载音频后ASR识别字幕", force_update=True, **kwargs)
kwargs |= {
src/utils.py
@@ -146,12 +146,37 @@ def soup_to_text(soup: PageElement) -> str:
return text
-def number_to_emoji(num: int | str, default: str = "") -> str:
+def number_to_emoji(num: int | str, default: str | None = None) -> str:
"""Convert a number to an emoji."""
num = str(num)
+ if default is None:
+ default = num
return {"0": "0️⃣", "1": "1️⃣", "2": "2️⃣", "3": "3️⃣", "4": "4️⃣", "5": "5️⃣", "6": "6️⃣", "7": "7️⃣", "8": "8️⃣", "9": "9️⃣", "10": "🔟"}.get(num, default)
+def seconds_to_hms(seconds: float | str) -> str:
+ """Convert seconds to hms format."""
+ seconds = int(float(seconds))
+ m, s = divmod(seconds, 60)
+ h, m = divmod(m, 60)
+ if h == 0:
+ return f"{m:02d}:{s:02d}"
+ return f"{h:02d}:{m:02d}:{s:02d}"
+
+
def count_subtitles(texts: str) -> int:
    """Count the characters in subtitle text, ignoring leading timestamps.

    A bracketed prefix at the start of each line (for example ``[mm:ss]``
    or ``[hh:mm:ss]``) and at most one whitespace character following it
    are removed before counting. Line breaks between cues are counted.

    Args:
        texts: Subtitle text, one cue per line, each optionally prefixed
            with a bracketed timestamp.

    Returns:
        int: Number of characters left after the prefixes are removed;
        0 for empty input.
    """
    if not texts:
        return 0
    # [^\S\n] is "whitespace except newline": a plain \s would swallow the
    # line break after a cue that has a timestamp but no text, making that
    # newline the only one excluded from the count.
    cleaned_text = re.sub(r"^\[.*?\][^\S\n]?", "", texts, flags=re.MULTILINE)
    return len(cleaned_text)
+
+
def stringfy(d: dict) -> dict:
"""Convert dict values to string.