#!/usr/bin/env python
# -*- coding: utf-8 -*-

import asyncio
import contextlib
import json
import re
from pathlib import Path
from typing import Any, Literal
from urllib.parse import parse_qs, urlparse

import anyio
from curl_cffi.requests.impersonate import BrowserTypeLiteral
from httpx import AsyncClient, AsyncHTTPTransport, HTTPStatusError, Request, RequestError, Response
from httpx._types import RequestContent, RequestData, RequestFiles
from httpx_curl_cffi import AsyncCurlTransport, CurlOpt
from loguru import logger

from config import DOWNLOAD_DIR, PROXY, REQUEST_TIMEOUT, cache, semaphore
from messages.progress import modify_progress
from messages.utils import summay_media
from utils import av2bv, bare_url, check_data, https_url, is_supported_by_ytdlp, match_urls, readable_size


# ruff: noqa: RUF001
async def log_req(request: Request) -> None:
    logger.debug(f"{request.method} {request.url} {request.headers}")


async def log_resp(response: Response) -> None:
    request = response.request
    logger.debug(f"[{response.status_code}] {request.method} {request.url}")


async def hx_req(
    url: str,
    method: str = "GET",
    *,
    transport: AsyncCurlTransport | AsyncHTTPTransport | None = None,
    headers: dict | None = None,
    cookies: dict | None = None,
    params: dict | None = None,
    data: RequestData | None = None,
    json_data: dict | None = None,
    content_data: RequestContent | None = None,
    files: RequestFiles | None = None,
    proxy: str | None = None,
    follow_redirects: bool = True,
    check_keys: list[str] | None = None,
    check_kv: dict | None = None,
    timeout: int = REQUEST_TIMEOUT,  # noqa: ASYNC109
    retry: int = 0,
    max_retry: int = 2,
    verify: bool = True,
    silent: bool = False,
    mobile: bool = False,
    rformat: Literal["json", "text", "content"] = "json",
    last_error: str = "",
    hx_raw: dict | None = None,
) -> dict[str, Any]:
    """Request the given URL with the given method and return the response as a dictionary.

    Args:
        url (str): The URL to request.
        method (str): The method to use for the request.
        transport (AsyncCurlTransport | AsyncHTTPTransport, optional): The transport to use; defaults to a browser-impersonating curl transport.
        headers (dict, optional): The headers to use for the request.
        cookies (dict, optional): The cookies to use for the request.
        params (dict, optional): The query parameters to use for the request.
        data (RequestData, optional): The form data to POST or PUT.
        json_data (dict, optional): The JSON data to POST or PUT.
        content_data (RequestContent, optional): The raw body content to POST or PUT.
        files (RequestFiles, optional): The files to upload.
        proxy (str, optional): The proxy to use for the request.
        follow_redirects (bool, optional): Whether to follow redirects.
        check_keys (list[str], optional): The keys to check in the response.
        check_kv (dict, optional): The key-value pairs to check in the response.
        timeout (int, optional): The timeout for the request.
        retry (int, optional): The current retry count; used internally by the recursive retry.
        max_retry (int, optional): The maximum number of retries.
        verify (bool, optional): Whether to verify the SSL certificate.
        silent (bool, optional): Whether to suppress the logs.
        mobile (bool, optional): Whether to impersonate a mobile browser.
        rformat (str, optional): The format of the response: "json", "text", or "content".
        last_error (str, optional): The last error message; used internally by the recursive retry.
        hx_raw (dict, optional): The raw response body of the last failed attempt; used internally.

    Returns:
        dict: The parsed response merged with {"headers", "status_code"} metadata on success,
            or {"hx_error": ..., "hx_raw": ...} after all retries fail.
    """
    if retry > max_retry:
        logger.error(f"[{method}] Failed after {retry} retries: {url}")
        return {"hx_error": last_error, "hx_raw": hx_raw or {}}
    # validate the method before building the client, so an invalid call does not leave an unclosed AsyncClient behind
    if method not in ("GET", "POST", "PUT", "DELETE"):
        error = f"Invalid method: {method}"
        logger.error(error)
        return {"hx_error": error}
    if transport is None:
        transport = AsyncCurlTransport(proxy=proxy, impersonate="safari_ios" if mobile else "chrome", default_headers=True, curl_options={CurlOpt.FRESH_CONNECT: True})

    if silent:
        client = AsyncClient(http2=True, proxy=proxy, transport=transport, follow_redirects=follow_redirects, timeout=timeout, verify=verify)
    else:
        client = AsyncClient(
            http2=True,
            proxy=proxy,
            transport=transport,
            follow_redirects=follow_redirects,
            timeout=timeout,
            verify=verify,
            event_hooks={"request": [log_req], "response": [log_resp]},
        )

    try:
        async with client:
            if method == "GET":
                response = await client.get(url, cookies=cookies, headers=headers, params=params)
            elif method == "POST":
                response = await client.post(url, cookies=cookies, headers=headers, data=data, json=json_data, files=files, content=content_data, params=params)
            elif method == "PUT":
                response = await client.put(url, cookies=cookies, headers=headers, data=data, json=json_data, content=content_data, files=files, params=params)
            else:
                response = await client.delete(url, cookies=cookies, headers=headers, params=params)
            response.raise_for_status()
            meta = {"headers": response.headers, "status_code": response.status_code}
            if rformat == "content":
                return {"content": response.content} | meta
            resp_data = response.text
            check_data(resp_data, check_keys=check_keys, check_kv=check_kv)
            res = json.loads(resp_data) if rformat == "json" else {rformat: resp_data}
            if not silent:
                logger.trace(res)
            return (res | meta) if isinstance(res, dict) else res
    except Exception as e:
        error = f"{type(e).__name__}[{retry + 1}/{max_retry + 1}]: Failed to request {url}, {e}"
        with contextlib.suppress(Exception):
            hx_raw = response.json()
        if "res" in locals():
            error += f"\n{res}"
        if data:  # `data` is a parameter, so `"data" in locals()` would always be true
            error += f"\n{data}"
        if hx_raw:
            error += f"\n{hx_raw}"
        elif "response" in locals():
            error += f"\n{response.text}"
        logger.error(error)
        # note: transport is not forwarded to the retry because closing the client above also closed it
        return await hx_req(url, method, headers=headers, cookies=cookies, params=params, data=data, json_data=json_data, content_data=content_data, files=files, proxy=proxy, follow_redirects=follow_redirects, check_keys=check_keys, check_kv=check_kv, timeout=timeout, retry=retry + 1, max_retry=max_retry, verify=verify, silent=silent, mobile=mobile, rformat=rformat, last_error=error, hx_raw=hx_raw)  # fmt: skip


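# A minimal usage sketch for hx_req (kept as comments, matching the examples under __main__ below;
# httpbin.org and its response keys are illustrative assumptions):
#   res = await hx_req("https://httpbin.org/get", params={"q": "1"}, check_keys=["url"])
#   if "hx_error" in res:  # failures surface as an "hx_error" key instead of raising
#       logger.warning(res["hx_error"])
#   else:
#       logger.info(f"status={res['status_code']}")  # meta keys are merged into the parsed JSON body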
async def download_file(
    link: str,
    path: str | Path | None = None,
    *,
    suffix: str = "",
    skip_exist: bool = False,
    proxy: str | None = None,
    headers: dict | None = None,
    impersonate: BrowserTypeLiteral | None = "safari_ios",
    stream: bool = False,
    **kwargs,
) -> str:
    """Download a file from the given link and save it to the specified path.

    Args:
        link (str): URL to download the file.
        path (str | Path, optional): The path to save the downloaded file. Defaults to auto detect.
        suffix (str, optional): The suffix to append to the file name. Defaults to auto detect.
        skip_exist (bool, optional): Skip downloading if the file already exists. Defaults to False.
        proxy (str, optional): The proxy to use for the request.
        headers (dict, optional): The headers to use for the request.
        impersonate (BrowserTypeLiteral, optional): The browser fingerprint to impersonate; None disables the curl transport.
        stream (bool, optional): Stream the download. Defaults to False.

    Returns:
        str: The downloaded file path, or "" on failure.
    """
    if not link:
        return ""
    if path is None:
        path = Path(DOWNLOAD_DIR) / Path(urlparse(link).path).name
    path = Path(path).expanduser().resolve()
    if path.suffix != suffix:
        path = path.with_suffix(f"{path.suffix}{suffix}")  # append suffix, not replace

    if path.is_file() and skip_exist:
        logger.info(f"File already exists, skipping download: {path}")
        return path.as_posix()
    path.parent.mkdir(parents=True, exist_ok=True)
    proxy = proxy or PROXY.DOWNLOAD
    logger.trace(f"Downloading {link} to {path} with proxy={proxy}")
    hx = AsyncClient(
        headers=headers,
        transport=AsyncCurlTransport(proxy=proxy, impersonate=impersonate, default_headers=True, curl_options={CurlOpt.FRESH_CONNECT: True}) if isinstance(impersonate, str) else None,
        proxy=proxy,
        timeout=REQUEST_TIMEOUT,
        follow_redirects=True,
        event_hooks={"request": [log_req], "response": [log_resp]},
    )
    try:
        if stream:  # can monitor progress, but the retry mechanism does not work
            async with semaphore, hx, hx.stream("GET", link) as response:  # enter hx too, so the client is closed
                response.raise_for_status()  # fail before writing an error page to disk
                total = int(response.headers.get("Content-Length", 0))
                async with await anyio.open_file(path, "wb") as f:
                    num_bytes_downloaded = response.num_bytes_downloaded
                    async for chunk in response.aiter_bytes():
                        await f.write(chunk)
                        msg = f"⏬下载中: {readable_size(num_bytes_downloaded)} / {readable_size(total)}\n💾{path.name}"
                        msg += f" ({num_bytes_downloaded / total:.2%})" if total else ""
                        await modify_progress(text=msg, **kwargs)
                        num_bytes_downloaded = response.num_bytes_downloaded
        else:
            async with semaphore, hx:
                response = await hx.get(link)
                response.raise_for_status()
                path.write_bytes(response.content)  # Save the file to disk
    except (RequestError, HTTPStatusError) as e:
        error = f"Failed to download: {e}"
        logger.error(error)
        await modify_progress(text=error, **kwargs)
        return ""
    if path.is_file():
        logger.info(f"Downloaded file saved to {path}")
        await modify_progress(text=f"🎉下载成功\n{path.name}", **kwargs)
        return path.as_posix()
    return ""


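# A hedged sketch of the two download modes (URL and suffix are placeholders):
#   path = await download_file("https://httpbin.org/image/jpeg", suffix=".jpg")               # buffered: body kept in memory, written once
#   path = await download_file("https://httpbin.org/image/jpeg", suffix=".jpg", stream=True)  # streamed: chunked to disk with progress updates
# both calls return "" on failure, so callers can simply check `if path:`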
async def download_first_success_urls(links: list[str], **kwargs) -> str:
    """Download the first file that downloads successfully from a list of links.

    Note: At most a single file is downloaded, from the first link that succeeds.
    """
    if not links:
        return ""
    for link in links:
        res = await download_file(link, **kwargs)
        if Path(res).is_file():
            return res
    return ""


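# Sketch: try mirror URLs in order until one succeeds (the URLs are placeholders):
#   path = await download_first_success_urls(["https://cdn-a.example/v.mp4", "https://cdn-b.example/v.mp4"], suffix=".mp4")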
async def download_media(media: list[dict], **kwargs) -> list[dict]:
    if not media:
        return []
    # each item is expected to carry at most one un-awaited download coroutine under one of these keys
    pending = []
    for item in media:
        for key in ("photo", "video", "livephoto", "audio"):
            if coro := item.get(key):
                pending.append((item, key, coro))
                break
    # run all tasks
    results = await asyncio.gather(*(coro for _, _, coro in pending), return_exceptions=True)
    final_media = []
    for (item, key, _), result in zip(pending, results, strict=True):
        if isinstance(result, Exception):
            logger.error(f"Failed to download: {result}")
        elif isinstance(result, str) and not Path(result).is_file():
            logger.error(f"Downloaded file does not exist: {result}")
        else:
            # a livephoto result is stored under the "video" key
            item["video" if key == "livephoto" else key] = result
            final_media.append(item)
            logger.success(f"Downloaded: {result}")
    await modify_progress(text=f"✅下载成功:\n{summay_media(final_media)}", **kwargs)
    return final_media


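# download_media expects each dict in `media` to hold one un-awaited download coroutine;
# an illustrative shape (extra keys such as captions pass through untouched):
#   media = [{"photo": download_file(photo_url)}, {"video": download_file(video_url)}]
#   media = await download_media(media)  # coroutine values are replaced by the saved file paths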
@cache.memoize(ttl=60)
async def match_social_media_link(text: str, *, flatten_first: bool = True) -> dict:
    """Match social media links in the given text and return a dictionary with the matched information.

    Args:
        text (str): The text to search for social media links.
        flatten_first (bool, optional): Whether to resolve short/redirect links before matching. Defaults to True.

    Returns:
        dict: A dictionary containing the matched information. At least a "platform" key is present;
            matched links also carry "url" and "db_key".
            platform: The social media platform name ("" if nothing matched).
            url: The matched URL.
            db_key: The key to store in the cache.
    #! TODO: Handle multiple links in one message.
    """
    text = str(text)
    if flatten_first:
        text = await flatten_rediercts(text)
    # https://www.douyin.com/video/7398813386827468041
    # https://www.douyin.com/note/7458195074434846004
    # https://www.iesdouyin.com/share/video/7454527270925946138/
    # https://www.iesdouyin.com/share/note/7454527270925946138/
    if matched := re.search(r"(https?://)?(www\.)?(ies)?douyin\.com/(share/)?(video|note|)/(\d+)", text):
        return {"url": f"https://www.douyin.com/{matched.group(5)}/{matched.group(6)}", "db_key": f"www.douyin.com/{matched.group(5)}/{matched.group(6)}", "platform": "douyin"}
    # https://www.douyin.com/user/MS4wLjABAAAAXgBuOEcyavDhrRBqnD8x7d4pj7RIL5QFRlLehCnem8couoAg8yXR-MGhUK0i4riF?modal_id=7451543857952492810
    # https://www.douyin.com/discover?modal_id=7472757663609179430
    if matched := re.search(r"(https?://)?(www\.)?douyin\.com/(.*?)\?(.*?)modal_id=(\d+)", text):
        return {"url": f"https://www.douyin.com/video/{matched.group(5)}", "db_key": f"www.douyin.com/video/{matched.group(5)}", "platform": "douyin"}
    # https://www.douyin.com/?previous_page=oversea_share_link&vid=7483851761246031115
    if matched := re.search(r"(https?://)?(www\.)?douyin\.com/(.*?)\?(.*?)vid=(\d+)", text):
        return {"url": f"https://www.douyin.com/video/{matched.group(5)}", "db_key": f"www.douyin.com/video/{matched.group(5)}", "platform": "douyin"}
    # https://www.tiktok.com/@baymermel/video/7460653893941267755?_t=ZS-8t8YbVWqv5k&_r=1
    if matched := re.search(r"(https?://)?(www\.)?tiktok\.com/(.*?)/(\d+)", text):
        return {"url": https_url(matched.group(0)), "db_key": bare_url(matched.group(0)), "platform": "tiktok"}

    # https://www.instagram.com/p/C7P3jN8vmEN
    # https://www.instagram.com/reel/DBBEGXpvwNF
    if matched := re.search(r"(https?://)?(www\.)?instagram\.com/(p|reel)/([^.。,,/\s]+)", text):
        return {"post_type": matched.group(3), "post_id": matched.group(4), "url": https_url(matched.group(0)), "db_key": bare_url(matched.group(0)), "platform": "instagram"}
    # https://www.instagram.com/yifaer_chen/p/DEzv9x-vzOn/
    if matched := re.search(r"(https?://)?(www\.)?instagram\.com/[a-zA-Z0-9_.]+/(p|reel)/([^.。,,/\s]+)", text):
        return {"post_type": matched.group(3), "post_id": matched.group(4), "url": https_url(matched.group(0)), "db_key": bare_url(matched.group(0)), "platform": "instagram"}

    # https://x.com/taylorswift13/status/1794805688696275131
    # https://twitter.com/taylorswift13/status/1794805688696275131
    # https://fixupx.com/taylorswift13/status/1794805688696275131
    # https://fxtwitter.com/taylorswift13/status/1794805688696275131
    if matched := re.search(r"(https?://)?(twitter|x|fxtwitter|fixupx|vxtwitter)\.com/(\w+)/status/(\d+)", text):
        handle = matched.group(3)
        post_id = matched.group(4)
        url = f"https://x.com/{handle}/status/{post_id}"
        return {"platform": "x", "handle": handle, "post_id": post_id, "url": url, "db_key": bare_url(url)}

    # weibo video first, then weibo post
    # https://video.weibo.com/show?fid=1034:5123779299311660
    # https://h5.video.weibo.com/show/1034:5169532881535051
    if matched := re.search(r"(https?://)?(h5\.)?video\.weibo\.(com|cn)/show(\?fid=|/)(\d+):(\d+)", text):
        return {"post_id": f"weibovideo{matched.group(5)}:{matched.group(6)}", "url": https_url(matched.group(0)), "db_key": bare_url(matched.group(0)), "platform": "weibo"}
    # https://weibo.com/tv/show/1034:5123779299311660?from=old_pc_videoshow
    if matched := re.search(r"(https?://)?(www\.)?weibo\.(com|cn)/tv/show/(\d+):(\d+)", text):
        return {"post_id": f"weibovideo{matched.group(4)}:{matched.group(5)}", "url": https_url(matched.group(0)), "db_key": bare_url(matched.group(0)), "platform": "weibo"}
    # https://m.weibo.cn/detail/5113333048938691
    # https://m.weibo.cn/status/5113333048938691
    if matched := re.search(r"(https?://)?m\.weibo\.cn/(detail|status)/(\w+)", text):
        return {"post_id": matched.group(3), "url": https_url(matched.group(0)), "db_key": f"m.weibo.cn/detail/{matched.group(3)}", "platform": "weibo"}
    # https://weibo.com/1736562685/P6lhSjRnI
    if matched := re.search(r"(https?://)?(www\.)?weibo\.com/(.*?)/(\w+)", text):
        return {"post_id": matched.group(4), "url": https_url(matched.group(0)), "db_key": f"m.weibo.cn/detail/{matched.group(4)}", "platform": "weibo"}

    # http://xhslink.com/a/Z3VPXAReU1Y1
    xhs_pattern = r"(https?://)?xhslink\.com/(\w?/?)([^,,.。?\s]+)"
    if matched := re.search(xhs_pattern, text):
        transport = AsyncCurlTransport(proxy=PROXY.XHS, impersonate="safari_ios", default_headers=True, curl_options={CurlOpt.FRESH_CONNECT: True})
        flatten = await flatten_rediercts(https_url(matched.group(0)), transport=transport, pattern=xhs_pattern, proxy=PROXY.XHS, method="GET")
        base_url = flatten.split("?")[0]
        post_id = Path(base_url).stem
        queries = parse_qs(urlparse(flatten).query)
        xsec = queries.get("xsec_token", [""])[0]
        return {"url": https_url(matched.group(0)), "db_key": f"www.xiaohongshu.com/explore/{post_id}", "xsec": xsec, "is_xhs_link": True, "platform": "xiaohongshu"}
    # https://www.xiaohongshu.com/explore/671a3dfe00000000240161db?xsec_token=ABY-b1JKuAlIm2dX1OSdIFHD7cQFHEdThv5aMyccvmbJo=
    if matched := re.search(r"(https?://)?(www\.)?xiaohongshu\.com/([^。,,\s]+)", text):
        base_url = matched.group(0).split("?")[0]
        post_id = Path(base_url).stem
        queries = parse_qs(urlparse(matched.group(0)).query)
        xsec = queries.get("xsec_token", [""])[0]
        return {
            "url": f"https://www.xiaohongshu.com/explore/{post_id}?xsec_token={xsec}",
            "db_key": f"www.xiaohongshu.com/explore/{post_id}",
            "is_xhs_link": False,
            "xsec": xsec,
            "platform": "xiaohongshu",
        }

    # https://www.bilibili.com/video/BV1RSsNzDEQb
    # https://www.bilibili.com/video/av115402113881975
    # https://www.bilibili.com/BV1RSsNzDEQb
    # https://www.bilibili.com/av115402113881975
    # the optional trailing query is captured so that the ?p=<page> lookup below can actually see it
    if matched := re.search(r"(https?://)?(m\.|www\.)?bilibili\.com/(video/)?(?P<prefix>[aAbB][vV])(?P<id>[a-zA-Z0-9]+)(\?[^\s。,,]*)?", text):
        base_url = matched.group(0).split("?")[0]
        bvid = Path(base_url).stem
        queries = parse_qs(urlparse(matched.group(0)).query)
        pid = queries.get("p", ["1"])[0]
        url = f"https://www.bilibili.com/video/{av2bv(bvid)}?p={pid}".removesuffix("?p=1")
        return {"url": url, "db_key": bare_url(url), "bvid": av2bv(bvid), "pid": pid, "platform": "bilibili"}

    # https://m.bilibili.com/opus/1048442220384878593
    if matched := re.search(r"(https?://)?(m\.|www\.)?bilibili\.com/opus/(\d+)", text):
        post_id = matched.group(3)
        url = f"https://www.bilibili.com/opus/{post_id}"
        return {"url": url, "db_key": url, "post_id": post_id, "platform": "bilibili-opus"}

    # https://github.com/user-name/repo
    # https://github.com/user-name/repo/issues/123
    # https://github.com/user-name/repo/issues/123#issuecomment-45678
    # https://github.com/user-name/repo/pull/123
    # https://github.com/user-name/repo/pull/123#issuecomment-45678
    if matched := re.search(r"(https?://)?github\.com/([a-zA-Z0-9]+(-[a-zA-Z0-9]+)*)/([.a-zA-Z0-9_-]+)/?([#/a-zA-Z0-9_-]+)?", text):
        gh_user = matched.group(2)
        gh_repo = matched.group(4)
        query = matched.group(5) or ""
        url = matched.group(0)
        return {"url": https_url(url), "db_key": bare_url(url), "gh_user": gh_user, "gh_repo": gh_repo, "query": query, "platform": "github"}

    # https://www.v2ex.com/t/1153086
    if matched := re.search(r"(https?://)?(www\.)?v2ex\.com/t/(\d+)", text):
        topic_id = matched.group(3)
        url = f"https://www.v2ex.com/t/{topic_id}"
        return {"url": url, "db_key": bare_url(url), "topic_id": topic_id, "platform": "v2ex"}

    # https://open.spotify.com/track/0cOMncRq4cmDLO4tPQnkBF
    if matched := re.search(r"(https?://)?open\.spotify\.com/(track|album|artist|playlist)/([a-zA-Z0-9]+)", text):
        resource = matched.group(2)
        spotify_id = matched.group(3)
        url = matched.group(0)
        return {"url": url, "db_key": bare_url(url), "resource": resource, "spotify_id": spotify_id, "platform": "spotify"}

    # https://music.163.com/song?id=2021343740
    # https://163cn.tv/HYHqZ6R
    # https://163cn.link/HYHqZ6R
    if matched := re.search(r"(https?://)?(music\.163\.com|163cn\.tv|163cn\.link)/([0-9a-zA-Z#./?=_\-%&]+)", text):
        url = matched.group(0)
        return {"url": url, "db_key": bare_url(url), "platform": "music163"}

    # https://www.youtube.com/watch?v=D6aE2E0RHTc
    if matched := re.search(r"(https?://)?(m\.|www\.)?youtube\.com/watch.*?v=([a-zA-Z0-9_-]{11})", text):
        vid = matched.group(3)
        return {"url": f"https://www.youtube.com/watch?v={vid}", "db_key": f"www.youtube.com/watch?v={vid}", "vid": vid, "platform": "youtube"}
    # https://youtube.com/shorts/lFKHbluAlJw
    if matched := re.search(r"(https?://)?(m\.|www\.)?youtube\.com/(shorts|live)/([a-zA-Z0-9_-]{11})", text):
        vid = matched.group(4)
        return {"url": f"https://www.youtube.com/watch?v={vid}", "db_key": f"www.youtube.com/watch?v={vid}", "vid": vid, "platform": "youtube"}
    # https://youtu.be/vOiP3kfFlrE
    if matched := re.search(r"(https?://)?(m\.|www\.)?youtu\.be/([a-zA-Z0-9_-]{11})", text):
        vid = matched.group(3)
        return {"url": f"https://www.youtube.com/watch?v={vid}", "db_key": f"www.youtube.com/watch?v={vid}", "vid": vid, "platform": "youtube"}

    # https://mp.weixin.qq.com/s/bd_giuPEyPBu9LTOtC2VHw
    # https://mp.weixin.qq.com/s?__biz=MzI5Njc4NTYyOQ==&mid=2247494800&idx=1&sn=43a5732bd3a205d4dbdcd523afc0ca4a&sharer_shareinfo=1923203fd24bfa47c5b36b690026f5c8&sharer_shareinfo_first=8814eca80b4a37d10aa9b725e61f9486
    if matched := re.search(r"(https?://)?mp\.weixin\.qq\.com/s[/?]([_A-Za-z=&0-9#\-]+)", text):
        return {"url": matched.group(0), "db_key": bare_url(matched.group(0)), "platform": "wechat"}

    # !Put this before all other reddit rules
    # https://www.reddit.com/r/China_irl/s/bA50WleCBM
    reddit_pattern = r"(https?://)?(m\.|www\.)reddit\.com/r/\w+/s/([^.。,,?&/\s]+)"
    if matched := re.search(reddit_pattern, text):
        text = await flatten_rediercts(https_url(matched.group(0)), pattern=reddit_pattern, proxy=PROXY.REDDIT)
    # https://www.reddit.com/r/DoubanGoosegroup/comments/1jkpgvp/%E8%B5%B5%E8%96%87%E4%BB%80%E4%B9%88%E6%97%B6%E5%80%99%E5%9B%9E%E6%9D%A5/
    # https://www.reddit.com/r/DoubanGoosegroup/comments/1jkpgvp/赵薇什么时候回来
    # https://www.reddit.com/r/DoubanGoosegroup/comments/1jkpgvp/comment/mk43l4t/?utm_source=share&utm_medium=web3x&utm_name=web3xcss&utm_term=1&utm_content=share_button
    if matched := re.search(r"(https?://)?(m\.|www\.)?reddit\.com/r/([_A-Za-z0-9]+)/comments/(.*?)/([^,,.。?\s]+)", text):
        return {"url": matched.group(0).rstrip("/"), "db_key": bare_url(matched.group(0).rstrip("/")), "platform": "reddit"}
    # https://reddit.com/comments/1kaazzn
    if matched := re.search(r"(https?://)?(m\.|www\.)?reddit\.com/comments/([_A-Za-z0-9]+)", text):
        return {"url": matched.group(0).rstrip("/"), "db_key": bare_url(matched.group(0).rstrip("/")), "platform": "reddit"}

    # https://arxiv.org/abs/2301.12345
    # https://arxiv.org/pdf/2301.12345v3
    if matched := re.search(r"(https?://)?arxiv\.org/(abs|pdf)/(\d{4}\.\d{4,5}(?:v\d+)?)", text):
        url = matched.group(0)
        arxiv_id = matched.group(3)
        if "v" not in arxiv_id:
            arxiv_id += "v1"
            url += "v1"
        return {"url": url, "arxiv_id": arxiv_id, "db_key": f"arxiv.org/abs/{arxiv_id}", "platform": "arxiv"}

    # if all of the pre-defined patterns above failed, try to match a yt-dlp supported link
    if urls := match_urls(text):
        for url in urls:
            if any(x in url.lower() for x in ["bilibili", "douyin", "instagram", "tiktok", "twitter", "weibo", "xiaohongshu", "reddit", "youtube"]):
                # handled above
                continue
            if is_supported_by_ytdlp(url):
                return {"url": url, "db_key": bare_url(url), "platform": "ytdlp"}
    return {"platform": ""}


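# An illustrative call (values follow the youtu.be rule above; unmatched text yields {"platform": ""}):
#   info = await match_social_media_link("look at this https://youtu.be/vOiP3kfFlrE")
#   # -> {"url": "https://www.youtube.com/watch?v=vOiP3kfFlrE", "db_key": "www.youtube.com/watch?v=vOiP3kfFlrE", "vid": "vOiP3kfFlrE", "platform": "youtube"}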
@cache.memoize(ttl=60)
async def flatten_rediercts(
    texts: str | None = None,
    pattern: str | None = None,
    headers: dict | None = None,
    proxy: str | None = None,
    method: str = "HEAD",
    transport: AsyncCurlTransport | AsyncHTTPTransport | None = None,
) -> str:
    """Resolve known short/redirect links found in the text and substitute the final URL back into it."""
    if not texts:
        return ""
    url = ""
    # v.douyin.com
    if matched := re.search(r"(https?://)?v\.douyin\.com/([^.。,,?&/\s]+)", texts):
        method = "GET"  # use GET for v.douyin.com
        url = matched.group(0)
    # vt.tiktok.com
    if matched := re.search(r"(https?://)?vt\.tiktok\.com/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # tiktok.com/t/
    if matched := re.search(r"(https?://)?(www\.)?tiktok\.com/t/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # facebook.com
    if matched := re.search(r"(https?://)?(m\.|www\.)facebook\.com/share/(v|r)/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # b23.tv
    if matched := re.search(r"(https?://)?b23\.tv/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # bili2233.cn
    if matched := re.search(r"(https?://)?bili2233\.cn/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # t.co
    if matched := re.search(r"(https?://)?t\.co/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # mapp.api.weibo.cn
    if matched := re.search(r"(https?://)?mapp\.api\.weibo\.cn/fx/([0-9a-zA-Z]+)\.html", texts):
        url = matched.group(0)
    # t.cn
    if matched := re.search(r"(https?://)?t\.cn/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
        method = "GET"
    # bit.ly
    if matched := re.search(r"(https?://)?bit\.ly/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # shorturl.at
    if matched := re.search(r"(https?://)?shorturl\.at/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # vertexaisearch.cloud.google.com
    if matched := re.search(r"(https?://)?vertexaisearch\.cloud\.google\.com/([0-9a-zA-Z\-_=+/]+)", texts):
        url = matched.group(0)
        proxy = PROXY.GOOGLE

    # custom pattern
    if pattern and (matched := re.search(pattern, texts)):
        url = matched.group(0)
    if not url:
        return texts
    # resolve the redirect
    redirected_url = https_url(url)
    with contextlib.suppress(Exception):
        if method == "HEAD":
            async with AsyncClient(http2=True, proxy=proxy, follow_redirects=True, transport=transport, event_hooks={"request": [log_req], "response": [log_resp]}) as hx:
                resp = await hx.head(https_url(url), headers=headers, timeout=3)
                redirected_url = str(resp.url)
        elif method == "GET":
            # follow Location headers manually, capping the hops to avoid redirect loops
            status_code = 302
            hops = 0
            while 300 <= status_code < 400 and hops < 10:
                async with AsyncClient(http2=True, proxy=proxy, follow_redirects=False, transport=transport, event_hooks={"request": [log_req], "response": [log_resp]}) as hx:
                    resp = await hx.get(redirected_url, headers=headers, timeout=3)
                    status_code = resp.status_code
                    redirected_url = resp.headers.get("Location", redirected_url)
                hops += 1
    if url != redirected_url:
        logger.info(f"Flatten redirect: {url} -> {redirected_url}")
    return texts.replace(url, redirected_url)


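# Example: a t.co link resolves via a single HEAD request, while t.cn and v.douyin.com walk
# Location headers manually with GET; text with no known short link is returned unchanged:
#   text = await flatten_rediercts("see https://t.co/Wwo3x69CQz please")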
if __name__ == "__main__":
    check_data(json.dumps({"foo": "bar", "baz": {"qux": "quux"}, "lst": ["1", "2", "3"]}), check_keys=["baz.qux"], check_kv={"foo": "bar", "baz.qux": "quux", "lst": ["1", "2", "3"]})
    # asyncio.run(match_social_media_link("https://b23.tv/3MSgT4q/", flatten_first=True))
    # print(asyncio.run(match_social_media_link("https://mp.weixin.qq.com/s/bd_giuPEyPBu9LTOtC2VHw", flatten_first=True)))
    # print(asyncio.run(match_social_media_link("https://reddit.com/comments/1kaazzn", flatten_first=True)))
    # print(asyncio.run(match_social_media_link("https://www.reddit.com/r/China_irl/s/bA50WleCBM")))
    # asyncio.run(match_social_media_link("https://www.facebook.com/share/r/19QGGp39T3/", flatten_first=True))
    # asyncio.run(match_social_media_link("https://www.douyin.com/video/7398813386827468041"))
    # asyncio.run(match_social_media_link("https://www.iesdouyin.com/share/note/7454527270925946138/"))
    # asyncio.run(match_social_media_link("https://www.instagram.com/yifaer_chen/p/DEzv9x-vzOn/"))
    # asyncio.run(flatten_rediercts("http://t.cn/A6ukIuVn"))
    # asyncio.run(flatten_rediercts("shorturl.at/fuyrt"))
    # asyncio.run(flatten_rediercts("https://b23.tv/3MSgT4q"))
    # asyncio.run(flatten_rediercts("https://v.douyin.com/CeiJfJMQG/"))
    # asyncio.run(flatten_rediercts("https://www.tiktok.com/t/ZT2mcMA7f/"))
    # asyncio.run(flatten_rediercts("https://t.co/Wwo3x69CQz"))
    # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp/issues/14463")))
    # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp/")))
    # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp")))
    # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp/issues/14404#issuecomment-3323873708")))
    # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp/pull/14467")))
    # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp/pull/14417#issuecomment-3327344721")))
    # print(asyncio.run(match_social_media_link("https://mapp.api.weibo.cn/fx/09ab955a1e0d406c9d6a74f5a2242b4a.html")))
    # print(asyncio.run(match_social_media_link("https://www.bilibili.com/video/BV1TC411J7PK")))
    # print(asyncio.run(match_social_media_link("https://www.bilibili.com/BV1TC411J7PK")))
    # print(asyncio.run(match_social_media_link("https://www.instagram.com/miyoshi.aa/p/DN5hFcUE8rS/")))
    # print(asyncio.run(match_social_media_link("https://www.youtube.com/watch?v=D6aE2E0RHTc")))
    # print(asyncio.run(match_social_media_link("https://youtube.com/shorts/lFKHbluAlJw")))
    # print(asyncio.run(match_social_media_link("https://youtu.be/vOiP3kfFlrE?si=zPd-Bt1GO03jxpI_")))
    # res = asyncio.run(hx_req("https://httpbin.org/delay/10"))
    # asyncio.run(hx_req("https://httpbin.org/get", check_kv={"url": "https://httpbin.org/get", "headers.Pragma": "no-cache1"}, max_retry=1))
    # resp = asyncio.run(hx_req("https://httpbin.org/get", check_kv={"headers": {"Accept-Language": "en-US,en;q=0.8"}}))
    # resp = asyncio.run(hx_req("https://httpbin.org/headers", headers={"referer": "https://www.xiaohongshu.com/"}))
    # print(resp)

    # asyncio.run(download_file("https://httpbin.org/image/jpeg", suffix=".jpg"))
    # asyncio.run(match_social_media_link("https://www.instagram.com/p/C7P3jN8vmEN"))