Commit b821511
Changed files (4)
src
src/preview/ytdlp.py
@@ -15,24 +15,39 @@ from glom import glom
from loguru import logger
from pyrogram.client import Client
from pyrogram.parser.markdown import BLOCKQUOTE_EXPANDABLE_DELIM
-from pyrogram.types import Message
+from pyrogram.types import Message, ReplyParameters
from yt_dlp import YoutubeDL
from yt_dlp.utils import DownloadError, ExtractorError, YoutubeDLError
from asr.voice_recognition import asr_file
-from config import API, CAPTION_LENGTH, DB, DOWNLOAD_DIR, MAX_FILE_BYTES, PROVIDER, PROXY, READING_SPEED, TID, TOKEN, YTDLP_DOWNLOAD_MAX_FILE_BYTES, YTDLP_RE_ENCODING_MAX_FILE_BYTES, cache
+from config import (
+ API,
+ CAPTION_LENGTH,
+ DB,
+ DOWNLOAD_DIR,
+ MAX_FILE_BYTES,
+ PROVIDER,
+ PROXY,
+ READING_SPEED,
+ TEXT_LENGTH,
+ TID,
+ TOKEN,
+ YTDLP_DOWNLOAD_MAX_FILE_BYTES,
+ YTDLP_RE_ENCODING_MAX_FILE_BYTES,
+ cache,
+)
from cookies import cookie_cloud_bilibili
from database import get_db
from messages.database import copy_messages_from_db, save_messages
from messages.preprocess import preprocess_media
from messages.progress import modify_progress, telegram_uploading
from messages.sender import send2tg
-from messages.utils import count_without_entities, get_reply_to, smart_split, warp_comments
+from messages.utils import blockquote, count_without_entities, get_reply_to, smart_split, warp_comments
from multimedia import convert_to_h264, generate_cover
from networking import hx_req
from others.emoji import emojify
-from others.subtitle import fetch_subtitle
from preview.utils import bv2av, make_bvid_clickable
+from subtitles.base import fetch_subtitle
from utils import publish_telegraph, readable_size, readable_time, remove_none_values, soup_to_text, to_int, true, ts_to_dt, unicode_to_ascii
@@ -45,13 +60,13 @@ async def preview_ytdlp(
message: Message,
url: str = "",
*,
+ use_db: bool = True,
ytdlp_audio_only: bool = False,
ytdlp_send_video: bool = True,
ytdlp_send_audio: bool = False,
bilibili_comments_provider: str = PROVIDER.BILIBILI_COMMENTS,
youtube_comments_provider: str = PROVIDER.YOUTUBE_COMMENTS,
proxy: str | None = None,
- append_youtube_subtitle: bool = True,
append_transcription: bool = True,
ytdlp_transcription_engine: str = "gemini",
to_telegraph: bool = True,
@@ -63,23 +78,24 @@ async def preview_ytdlp(
client (Client): The Pyrogram client.
message (Message): The trigger message object.
url (str, optional): ytdlp link.
+ use_db (bool, optional): Whether to use database to cache the result. Defaults to True.
ytdlp_audio_only (bool, optional): Download audio only. Defaults to False.
ytdlp_send_video (bool, optional): Send video. Defaults to True.
ytdlp_send_audio (bool, optional): Send audio. Defaults to False.
bilibili_comments_provider (str, optional): The bilibili comments extractor: "free", "tikhub" or "false"
youtube_comments_provider (str, optional): The youtube comments extractor: "free" or "false".
proxy (str, optional): Proxy to use. Defaults to None.
- append_youtube_subtitle (bool, optional): Also send youtube subtitle.
append_transcription (bool, optional): Also append transcription.
- ytdlp_transcription_method (str, optional): Method to get transcription.
+ ytdlp_transcription_engine (str, optional): Method to get transcription.
to_telegraph (bool, optional): Whether to publish the subtitle or transcription to telegraph.
+ delete_files (bool, optional): Whether to delete video & audio after uploading.
"""
logger.trace(f"{url=} {kwargs=}")
if kwargs.get("show_progress") and "progress" not in kwargs:
res = await send2tg(client, message, texts=f"🔗正在解析链接\n{url}", **kwargs)
kwargs["progress"] = res[0]
db_key = url
- if kv := await get_db(db_key):
+ if use_db and (kv := await get_db(db_key)):
logger.debug(f"YT-DLP preview {DB.ENGINE} cache hit for key={db_key}")
if await copy_messages_from_db(client, message, key=db_key, kv=kv, **kwargs):
return
@@ -134,7 +150,7 @@ async def preview_ytdlp(
video_path = info.get("video_path", Path(""))
audio_path = info.get("audio_path", Path(""))
# only save messages when both video and audio are uploaded
- save_to_db = bool(video_path.is_file() and audio_path.is_file())
+ save_to_db = bool(use_db and video_path.is_file() and audio_path.is_file())
msg = f"✅下载成功:\n{info['summary']}"
logger.success(f"{msg!r}")
await modify_progress(text=msg.strip(), **kwargs)
@@ -182,6 +198,7 @@ async def preview_ytdlp(
texts = texts.strip()
sent_messages: list[Message | None] = [] # 把发送的消息都记录下来
target_chat = kwargs["target_chat"] if kwargs.get("target_chat") else message.chat.id
+ target_chat = to_int(target_chat)
reply_msg_id = kwargs.get("reply_msg_id", 0)
reply_parameters = get_reply_to(message.id, reply_msg_id)
thumb = generate_cover(video_path) if video_path.is_file() else generate_cover(audio_path)
@@ -200,7 +217,7 @@ async def preview_ytdlp(
await modify_progress(text=f"🎬视频上传中-P{idx + 1}: {readable_size(path=video['video'])}", force_update=True, **kwargs)
sent_messages.append(
await client.send_video(
- chat_id=to_int(target_chat),
+ chat_id=target_chat,
caption=warp_comments(caption),
reply_parameters=reply_parameters,
progress=telegram_uploading,
@@ -233,29 +250,25 @@ async def preview_ytdlp(
if v := locals().get(k):
metadata[k] = unicode_to_ascii(v)
await save_messages(messages=sent_messages, key=url, metadata=metadata)
- if "youtube" in info["extractor"] and append_youtube_subtitle and (video_path.is_file() or audio_path.is_file()):
- res = await fetch_subtitle(video_id=info["id"], provider="free")
- if subtitles := res.get("subtitle"):
- caption = f"{emoji}[{info['author']}]({info['author_url']})\n🕒{create_time}\n📝[{info['title']}]({url})\n字符数: {res['num_chars']}\n阅读时长: {res['reading_minutes']:.1f}分钟"
- if to_telegraph:
- html = "\n".join([f"<p>{s}</p>" for s in subtitles.split("\n")])
- if telegraph_url := await publish_telegraph(title=info["title"], html=html, author=info["author"], url=url):
- caption += f"\n**⚡️[Telegraph即时预览]({telegraph_url})**"
- with io.BytesIO(subtitles.encode("utf-8")) as f:
- await client.send_document(to_int(target_chat), f, file_name=f"{info['title']}.txt", caption=caption)
- append_transcription = False # disable asr transcription
-
- if any(x in info["extractor"] for x in ["youtube", "bilibili"]) and append_transcription and audio_path.is_file():
- asr_res = await asr_file(audio_path, ytdlp_transcription_engine, duration, client=client, message=message, slient=True)
- if texts := asr_res.get("texts"):
- caption = f"{emoji}[{info['author']}]({info['author_url']})\n🕒{create_time}\n📝[{info['title']}]({url})\n字符数: {len(texts)}\n阅读时长: {len(texts) / READING_SPEED:.1f}分钟"
- if to_telegraph:
- html = "\n".join([f"<p>{s}</p>" for s in texts.split("\n")])
- if telegraph_url := await publish_telegraph(title=info["title"], html=html, author=info["author"], url=url):
- caption += f"\n**⚡️[Telegraph即时预览]({telegraph_url})**"
- with io.BytesIO(texts.encode("utf-8")) as f:
- await client.send_document(to_int(target_chat), f, file_name=f"{info['title']}.txt", caption=caption)
- [await modify_progress(msg, del_status=True) for msg in asr_res.get("sent_messages", [])]
+ if any(x in info["extractor"] for x in ["youtube", "bilibili"]) and append_transcription and (video_path.is_file() or audio_path.is_file()):
+ res = await fetch_subtitle(video_id=info["id"], provider="free") if info["extractor"] == "youtube" else {}
+ subtitles = res.get("subtitle", "")
+ if not subtitles:
+ res = await asr_file(audio_path, ytdlp_transcription_engine, duration, client=client, message=message, slient=True)
+ subtitles = res.get("texts", "")
+ if subtitles:
+ if len(subtitles) > TEXT_LENGTH:
+ caption = f"{emoji}[{info['author']}]({info['author_url']})\n🕒{create_time}\n📝[{info['title']}]({url})\n字符数: {len(subtitles)}\n阅读时长: {len(subtitles) / READING_SPEED:.1f}分钟"
+ if to_telegraph:
+ html = "\n".join([f"<p>{s}</p>" for s in subtitles.split("\n")])
+ if telegraph_url := await publish_telegraph(title=info["title"], html=html, author=info["author"], url=url):
+ caption += f"\n**⚡️[Telegraph即时预览]({telegraph_url})**"
+ with io.BytesIO(subtitles.encode("utf-8")) as f:
+ await client.send_document(to_int(target_chat), f, file_name=f"{info['title']}.txt", caption=caption)
+ else:
+ first_msg: Message = sent_messages[0] if sent_messages else message # type: ignore
+ await client.send_message(first_msg.chat.id, blockquote(subtitles), reply_parameters=ReplyParameters(message_id=first_msg.id))
+ [await modify_progress(msg, del_status=True) for msg in res.get("sent_messages", [])]
Path(json_file).unlink(missing_ok=True)
cleanup_ytdlp(info["id"])
src/others/subtitle.py → src/subtitles/base.py
@@ -1,6 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-import io
from datetime import UTC, datetime, timedelta
from zoneinfo import ZoneInfo
@@ -10,96 +9,14 @@ from pyrogram.client import Client
from pyrogram.types import Message
from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
-from asr.voice_recognition import asr_file
-from config import API, PREFIX, PROVIDER, PROXY, READING_SPEED, TOKEN, TZ
-from database import cache
+from config import API, PREFIX, PROXY, READING_SPEED, TOKEN, TZ
from messages.parser import parse_msg
-from messages.progress import modify_progress
-from messages.sender import send2tg
-from messages.utils import equal_prefix, startswith_prefix
+from messages.utils import startswith_prefix
from networking import hx_req, match_social_media_link
-from utils import publish_telegraph, to_int
-
-HELP = f"""📃**提取字幕**
-使用说明:
-1. `{PREFIX.SUBTITLE} URL` 下载该链接的字幕
-2. 以 `{PREFIX.SUBTITLE}` 回复消息可下载消息中链接的字幕
-
-当前只支持YouTube
-"""
-
-
-async def get_subtitle(client: Client, message: Message, youtube_subtitle_provider: str = PROVIDER.YOUTUBE_SUBTITLE, *, to_telegraph: bool = True, **kwargs):
- """Get YouTube Subtitle."""
- target_chat = kwargs["target_chat"] if kwargs.get("target_chat") else message.chat.id
- # send docs if message == "/subtitle", without reply
- if equal_prefix(message.text, prefix=[PREFIX.SUBTITLE]) and not message.reply_to_message:
- await send2tg(client, message, texts=HELP, **kwargs)
- return
- if not (vid := await find_yt_vid(client, message)):
- return
-
- yt_url = f"https://www.youtube.com/watch?v={vid}"
- msg = f"🔍**正在获取字幕**\n{yt_url}"
- if kwargs.get("show_progress"):
- res = await send2tg(client, message, texts=msg, **kwargs)
- kwargs["progress"] = res[0]
-
- # cache media_group message
- if media_group_id := message.media_group_id:
- if cache.get(f"subtitle-{message.chat.id}-{media_group_id}"):
- return
- cache.set(f"subtitle-{message.chat.id}-{media_group_id}", "1", ttl=120)
-
- this_info = parse_msg(message, silent=True)
- reply_info = parse_msg(message.reply_to_message, silent=True) if message.reply_to_message else {}
-
- res = await fetch_subtitle(vid, youtube_subtitle_provider)
- if error := res.get("error", ""):
- if "Subtitles are disabled for this video" in error:
- error = "❌该视频没有提供字幕选项"
- if this_info["mtype"] in ["audio", "video"] or reply_info.get("mtype", "") in ["audio", "video"]:
- error += "\n🔄尝试使用语音转文字获取字幕"
- await modify_progress(text=error, force_update=True, **kwargs)
- msg = message if this_info["mtype"] in ["audio", "video"] else message.reply_to_message
- fpath: str = await msg.download() # type: ignore
- asr_res = await asr_file(fpath, engine="gemini", client=client, message=message, **kwargs)
- if asr_res.get("error"):
- await modify_progress(text=asr_res["error"], force_update=True, **kwargs)
- return
- res = {"subtitles": asr_res["texts"], "num_chars": len(asr_res["texts"]), "reading_minutes": len(asr_res["texts"]) / READING_SPEED}
- if asr_res.get("telegraph"):
- res["telegraph"] = asr_res["telegraph"]
- else:
- await modify_progress(text=error, force_update=True, **kwargs)
- return
- subtitles = res.get("subtitles", "")
- if not subtitles:
- return
- logger.success(subtitles)
- if vinfo := await fetch_youtube_video_info(vid):
- caption = f"🔴[{vinfo['author']}]({vinfo['channel']})\n🕒{vinfo['date']:%Y-%m-%d %H:%M:%S}\n"
- caption += f"📝[{vinfo['title']}]({yt_url})\n字符数: {res['num_chars']}\n阅读时长: {res['reading_minutes']:.1f}分钟"
- if to_telegraph:
- html = "\n".join([f"<p>{s}</p>" for s in subtitles.split("\n")])
- if telegraph_url := await publish_telegraph(title=vinfo["title"], html=html, author=vinfo["author"], url=yt_url):
- caption += f"\n**⚡️[Telegraph即时预览]({telegraph_url})**"
- with io.BytesIO(subtitles.encode("utf-8")) as f:
- await client.send_document(to_int(target_chat), f, file_name=f"{vinfo['title']}.txt", caption=caption)
- else:
- caption = f"原视频: [{vid}]({yt_url})\n字符数: {res['num_chars']}\n阅读时长: {res['reading_minutes']:.1f}分钟"
- if to_telegraph:
- html = "\n".join([f"<p>{s}</p>" for s in subtitles.split("\n")])
- if telegraph_url := await publish_telegraph(title=f"{vid}字幕", html=html, url=yt_url):
- caption += f"\n**⚡️[Telegraph即时预览]({telegraph_url})**"
- with io.BytesIO(subtitles.encode("utf-8")) as f:
- await client.send_document(to_int(target_chat), f, file_name=f"{vid}字幕.txt", caption=caption)
-
- [await modify_progress(msg, del_status=True) for msg in res.get("sent_messages", [])]
- await modify_progress(del_status=True, **kwargs)
async def find_yt_vid(client: Client, message: Message) -> str:
+ """Find YouTube video ID from message."""
info = parse_msg(message)
if not startswith_prefix(info["text"], prefix=[PREFIX.SUBTITLE]):
return ""
@@ -131,6 +48,7 @@ async def find_yt_vid(client: Client, message: Message) -> str:
async def fetch_subtitle(video_id: str, provider: str) -> dict:
+ """Fetch subtitles from YouTube."""
succ = False
subtitles = []
try:
@@ -236,6 +154,7 @@ def to_webvtt(subtitles: list[dict]) -> dict:
async def fetch_youtube_video_info(video_id: str) -> dict:
+ """Fetch YouTube video info."""
try:
logger.info(f"Fetch Video info for {video_id=}, proxy={PROXY.SUBTITLE}")
api = "https://www.googleapis.com/youtube/v3/videos"
src/subtitles/subtitle.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import io
+
+from loguru import logger
+from pyrogram.client import Client
+from pyrogram.types import Message
+
+from asr.voice_recognition import asr_file
+from config import PREFIX, PROVIDER, READING_SPEED
+from database import cache
+from messages.parser import parse_msg
+from messages.progress import modify_progress
+from messages.sender import send2tg
+from messages.utils import equal_prefix
+from preview.ytdlp import preview_ytdlp
+from subtitles.base import fetch_subtitle, fetch_youtube_video_info, find_yt_vid
+from utils import publish_telegraph, to_int
+
+# Usage text that get_subtitle sends when the message is only the command prefix and is not a reply.
+HELP = f"""📃**提取字幕**
+使用说明:
+1. `{PREFIX.SUBTITLE} URL` 下载该链接的字幕
+2. 以 `{PREFIX.SUBTITLE}` 回复消息可下载消息中链接的字幕
+
+当前只支持YouTube
+"""
+
+
+async def get_subtitle(client: Client, message: Message, youtube_subtitle_provider: str = PROVIDER.YOUTUBE_SUBTITLE, *, to_telegraph: bool = True, **kwargs):
+    """Send the subtitle of a YouTube video as a text document.
+
+    The video ID is resolved from the message with ``find_yt_vid`` and the
+    subtitle is requested through ``fetch_subtitle``. When that reports an
+    error, the audio/video carried by this message (or by the replied-to
+    message) is transcribed with ``asr_file``; if neither message carries
+    such media, the request is delegated to ``preview_ytdlp``.
+
+    Args:
+        client (Client): The Pyrogram client.
+        message (Message): The trigger message object.
+        youtube_subtitle_provider (str, optional): Provider passed to ``fetch_subtitle``.
+        to_telegraph (bool, optional): Whether to also publish the text to Telegraph. Defaults to True.
+        **kwargs: Forwarded to ``send2tg``, ``modify_progress``, ``asr_file`` and
+            ``preview_ytdlp``. ``target_chat`` and ``show_progress`` are read here.
+    """
+    target_chat = kwargs["target_chat"] if kwargs.get("target_chat") else message.chat.id
+    # Bare command without a reply: answer with the usage text only.
+    if equal_prefix(message.text, prefix=[PREFIX.SUBTITLE]) and not message.reply_to_message:
+        await send2tg(client, message, texts=HELP, **kwargs)
+        return
+    if not (vid := await find_yt_vid(client, message)):
+        return
+
+    yt_url = f"https://www.youtube.com/watch?v={vid}"
+    if kwargs.get("show_progress"):
+        res = await send2tg(client, message, texts=f"🔍**正在获取字幕**\n{yt_url}", **kwargs)
+        kwargs["progress"] = res[0]
+
+    # Handle a media group once: later messages of the same group return early
+    # while the marker set below (ttl=120) is still cached.
+    if media_group_id := message.media_group_id:
+        group_key = f"subtitle-{message.chat.id}-{media_group_id}"
+        if cache.get(group_key):
+            return
+        cache.set(group_key, "1", ttl=120)
+
+    this_info = parse_msg(message, silent=True)
+    reply_info = parse_msg(message.reply_to_message, silent=True) if message.reply_to_message else {}
+
+    res = await fetch_subtitle(vid, youtube_subtitle_provider)
+    if error := res.get("error", ""):
+        if "Subtitles are disabled for this video" in error:
+            error = "❌该视频没有提供字幕选项\n🔄尝试使用语音转文字获取字幕"
+        await modify_progress(text=error, force_update=True, **kwargs)
+        media_types = ["audio", "video"]
+        this_has_media = this_info["mtype"] in media_types
+        if not this_has_media and reply_info.get("mtype", "") not in media_types:
+            # No local media to transcribe: let preview_ytdlp download the audio
+            # and produce the transcription itself.
+            kwargs |= {
+                "show_progress": False,
+                "url": yt_url,
+                "append_transcription": True,
+                "ytdlp_audio_only": True,
+                "youtube_comments_provider": False,
+                "proxy": None,
+                "use_db": False,
+                # to_telegraph is a named parameter here, so it is not part of
+                # kwargs; forward it explicitly to honour the caller's choice.
+                "to_telegraph": to_telegraph,
+            }
+            await preview_ytdlp(client=client, message=message, **kwargs)
+            await modify_progress(del_status=True, **kwargs)
+            return
+        media_msg = message if this_has_media else message.reply_to_message
+        fpath: str = await media_msg.download()  # type: ignore
+        asr_res = await asr_file(fpath, engine="gemini", client=client, message=message, **kwargs)
+        if asr_res.get("error"):
+            await modify_progress(text=asr_res["error"], force_update=True, **kwargs)
+            return
+        res = {"subtitles": asr_res["texts"], "num_chars": len(asr_res["texts"]), "reading_minutes": len(asr_res["texts"]) / READING_SPEED}
+        if asr_res.get("telegraph"):
+            res["telegraph"] = asr_res["telegraph"]
+
+    # NOTE(review): preview_ytdlp reads the "subtitle" key from fetch_subtitle's
+    # result while this function reads "subtitles" — confirm which key is returned.
+    subtitles = res.get("subtitles", "")
+    if not subtitles:
+        return
+    logger.success(subtitles)
+
+    # Only the caption header, document title and Telegraph author depend on
+    # whether video metadata is available; everything else is shared.
+    if vinfo := await fetch_youtube_video_info(vid):
+        title = vinfo["title"]
+        header = f"🔴[{vinfo['author']}]({vinfo['channel']})\n🕒{vinfo['date']:%Y-%m-%d %H:%M:%S}\n📝[{title}]({yt_url})"
+        telegraph_args = {"title": title, "author": vinfo["author"]}
+    else:
+        title = f"{vid}字幕"
+        header = f"原视频: [{vid}]({yt_url})"
+        telegraph_args = {"title": title}
+    caption = f"{header}\n字符数: {res['num_chars']}\n阅读时长: {res['reading_minutes']:.1f}分钟"
+    if to_telegraph:
+        html = "\n".join([f"<p>{s}</p>" for s in subtitles.split("\n")])
+        if telegraph_url := await publish_telegraph(html=html, url=yt_url, **telegraph_args):
+            caption += f"\n**⚡️[Telegraph即时预览]({telegraph_url})**"
+    with io.BytesIO(subtitles.encode("utf-8")) as f:
+        await client.send_document(to_int(target_chat), f, file_name=f"{title}.txt", caption=caption)
+
+    # Remove any status messages recorded in the result, then the progress message.
+    for sent_msg in res.get("sent_messages", []):
+        await modify_progress(sent_msg, del_status=True)
+    await modify_progress(del_status=True, **kwargs)
src/handler.py
@@ -21,7 +21,6 @@ from others.extract_audio import extract_audio_file
from others.raw_img_file import convert_raw_img_file
from others.search_google import search_google
from others.search_ytb import search_youtube
-from others.subtitle import get_subtitle
from permission import check_service
from preview.bilibili import preview_bilibili
from preview.douyin import preview_douyin
@@ -33,6 +32,7 @@ from preview.weibo import preview_weibo
from preview.xiaohongshu import preview_xhs
from preview.ytdlp import ProxyError, preview_ytdlp
from price.entrypoint import get_asset_price
+from subtitles.subtitle import get_subtitle
from utils import to_int, true