main
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4import asyncio
5import contextlib
6import json
7import re
8from datetime import timedelta
9from pathlib import Path
10from typing import Any, Literal
11from urllib.parse import parse_qs, urlparse
12
13import anyio
14from curl_cffi.requests.impersonate import BrowserTypeLiteral
15from glom import glom
16from httpx import AsyncClient, AsyncHTTPTransport, HTTPStatusError, Request, RequestError, Response
17from httpx._types import RequestContent, RequestData, RequestFiles
18from httpx_curl_cffi import AsyncCurlTransport, CurlOpt
19from loguru import logger
20
21from config import DOWNLOAD_DIR, PROXY, REQUEST_TIMEOUT, TOKEN, cache, semaphore
22from messages.progress import modify_progress
23from messages.utils import summay_media
24from utils import av2bv, bare_url, check_data, https_url, is_supported_by_ytdlp, match_urls, nowdt, readable_size
25
26
27# ruff: noqa: RUF001
async def log_req(request: Request) -> None:
    """httpx request hook: log the outgoing method, URL and headers at DEBUG level."""
    method, target, sent_headers = request.method, request.url, request.headers
    logger.debug(f"{method} {target} {sent_headers}")
30
31
async def log_resp(response: Response) -> None:
    """httpx response hook: log the status code with the originating method and URL at DEBUG level."""
    origin = response.request
    summary = f"[{response.status_code}] {origin.method} {origin.url}"
    logger.debug(summary)
35
36
async def hx_req(
    url,
    method: Literal["GET", "POST", "PUT", "DELETE", "PATCH"] = "GET",
    *,
    transport: AsyncCurlTransport | AsyncHTTPTransport | None = None,
    headers: dict | None = None,
    cookies: dict | None = None,
    params: dict | None = None,
    data: RequestData | None = None,
    json_data: dict | None = None,
    content_data: RequestContent | None = None,
    files: RequestFiles | None = None,
    proxy: str | None = None,
    follow_redirects: bool = True,
    check_keys: list[str] | None = None,
    check_kv: dict | None = None,
    timeout: int = REQUEST_TIMEOUT,  # noqa: ASYNC109
    retry: int = 0,
    max_retry: int = 2,
    verify: bool = True,
    silent: bool = False,
    mobile: bool = False,
    rformat: Literal["json", "text", "content"] = "json",
    last_error: str = "",
    hx_raw: dict | None = None,
) -> dict[str, Any]:
    """Send an HTTP request, validate the response, and retry on any failure.

    Args:
        url (str): The URL to request.
        method (str): HTTP method; one of GET, POST, PUT, DELETE, PATCH.
        transport (optional): Transport for the first attempt. When omitted, an
            impersonating ``AsyncCurlTransport`` is created ("safari_ios" if ``mobile`` else "chrome").
        headers (dict, optional): The headers to use for the request.
        cookies (dict, optional): The cookies to use for the request.
        params (dict, optional): The query parameters to use for the request.
        data (optional): Form data for POST/PUT/PATCH.
        json_data (dict, optional): JSON body for POST/PUT/PATCH.
        content_data (optional): Raw body content for POST/PUT/PATCH.
        files (optional): Multipart files for POST/PUT/PATCH.
        proxy (str, optional): The proxy to use for the request.
        follow_redirects (bool, optional): Whether to follow redirects.
        check_keys (list[str], optional): Keys passed to ``check_data`` to validate the response text.
        check_kv (dict, optional): Key-value pairs passed to ``check_data`` to validate the response text.
        timeout (int, optional): The timeout for the request.
        retry (int, optional): Index of the current attempt (0 for the first one); used internally.
        max_retry (int, optional): Maximum number of retries after the first attempt.
        verify (bool, optional): Whether to verify the SSL certificate.
        silent (bool, optional): Whether to suppress the request/response/trace logs.
        mobile (bool, optional): Whether the default transport impersonates a mobile browser.
        rformat (str, optional): The format of the response: "json", "text" or "content".
        last_error (str, optional): Error message of the previous attempt; used internally.
        hx_raw (dict, optional): Parsed JSON body of a previous failed response; used internally.

    Returns:
        dict: On success, depending on ``rformat``:
            - "json": the parsed JSON. If it is a dict, "headers" and "status_code" are merged
              into it (overwriting same-named keys); other JSON values are returned unchanged.
            - "text": ``{"text": <body>, "headers": ..., "status_code": ...}``.
            - "content": ``{"content": <bytes>, "headers": ..., "status_code": ...}``
              (``check_keys`` / ``check_kv`` are not applied).
        On failure: ``{"hx_error": <message>, "hx_raw": <dict>}`` once retries are exhausted,
        or ``{"hx_error": <message>}`` for an invalid ``method``.
    """
    if retry > max_retry:
        logger.error(f"[{method}] Failed after {retry} retries: {url}")
        return {"hx_error": last_error, "hx_raw": hx_raw or {}}
    # Validate before building the client so an invalid call never leaves an unclosed client behind.
    if method not in ("GET", "POST", "PUT", "DELETE", "PATCH"):
        error = f"Invalid method: {method}"
        logger.error(error)
        return {"hx_error": error}
    if transport is None:
        transport = AsyncCurlTransport(proxy=proxy, impersonate="safari_ios" if mobile else "chrome", default_headers=True, curl_options={CurlOpt.FRESH_CONNECT: True})

    # NOTE(review): httpx also mounts its own proxy transport when `proxy=` is given;
    # confirm requests still go through the custom transport in that case.
    client_options: dict[str, Any] = {
        "http2": True,
        "proxy": proxy,
        "transport": transport,
        "follow_redirects": follow_redirects,
        "timeout": timeout,
        "verify": verify,
    }
    if not silent:
        client_options["event_hooks"] = {"request": [log_req], "response": [log_resp]}

    # Only body-carrying methods receive the payload arguments.
    request_options: dict[str, Any] = {"cookies": cookies, "headers": headers, "params": params}
    if method in ("POST", "PUT", "PATCH"):
        request_options |= {"data": data, "json": json_data, "content": content_data, "files": files}

    response = None
    res = None
    try:
        async with AsyncClient(**client_options) as client:
            response = await client.request(method, url, **request_options)
            response.raise_for_status()
            meta = {"headers": response.headers, "status_code": response.status_code}
            if rformat == "content":
                return {"content": response.content} | meta
            resp_data = response.text
            check_data(resp_data, check_keys=check_keys, check_kv=check_kv)
            if rformat == "json":
                res = json.loads(resp_data)
            else:
                res = {rformat: resp_data} | meta
            if not silent:
                logger.trace(res)
            return res | meta if isinstance(res, dict) else res
    except Exception as e:
        error = f"{type(e).__name__}[{retry + 1}/{max_retry + 1}]: Failed to request {url}, {e}"
        if response is not None:
            # Best effort: the failed response body may legitimately not be JSON.
            with contextlib.suppress(Exception):
                hx_raw = response.json()
        if res is not None:
            error += f"\n{res}"
        if data is not None:
            error += f"\n{data}"
        if hx_raw:
            error += f"\n{hx_raw}"
        elif response is not None:
            error += f"\n{response.text}"
        logger.error(error)
        # Retry with the same request. The transport is intentionally not forwarded: leaving the
        # `async with` above closes it, so the retry builds a fresh default transport instead.
        return await hx_req(
            url,
            method,
            headers=headers,
            cookies=cookies,
            params=params,
            data=data,
            json_data=json_data,
            content_data=content_data,
            files=files,
            proxy=proxy,
            follow_redirects=follow_redirects,
            check_keys=check_keys,
            check_kv=check_kv,
            timeout=timeout,
            retry=retry + 1,
            max_retry=max_retry,
            verify=verify,
            silent=silent,
            mobile=mobile,
            rformat=rformat,
            last_error=error,
            hx_raw=hx_raw,
        )
149
150
async def download_file(
    link: str,
    path: str | Path | None = None,
    *,
    suffix: str = "",
    skip_exist: bool = False,
    proxy: str | None = None,
    headers: dict | None = None,
    impersonate: BrowserTypeLiteral | None = "safari_ios",
    stream: bool = False,
    **kwargs,
) -> str:
    """Download a file from the given link and save it to the specified path.

    Args:
        link (str): URL to download the file.
        path (str | Path, optional): The path to save the downloaded file.
            Defaults to ``DOWNLOAD_DIR`` plus the last path component of ``link``.
        suffix (str, optional): Suffix appended to the file name when the path does not already end with it.
        skip_exist (bool, optional): Skip downloading if the file already exists. Defaults to False.
        proxy (str, optional): The proxy to use for the request. Falls back to ``PROXY.DOWNLOAD``.
        headers (dict, optional): The headers to use for the request.
        impersonate (str, optional): Browser to impersonate through ``AsyncCurlTransport``;
            when it is not a string, httpx's default transport is used.
        stream (bool, optional): Stream the download and report progress. Defaults to False.
        **kwargs: Forwarded to ``modify_progress``.

    Returns:
        str: Path of the downloaded file (POSIX form), or "" when ``link`` is empty or the download failed.
    """
    if not link:
        return ""
    if path is None:
        path = Path(DOWNLOAD_DIR) / Path(urlparse(link).path).name
    path = Path(path).expanduser().resolve()
    if path.suffix != suffix:
        path = path.with_suffix(f"{path.suffix}{suffix}")  # append suffix, not replace

    if path.is_file() and skip_exist:
        logger.info(f"File already exists, skipping download: {path}")
        return path.as_posix()
    path.parent.mkdir(parents=True, exist_ok=True)
    proxy = proxy or PROXY.DOWNLOAD
    logger.trace(f"Downloading {link} to {path} with proxy={proxy}")
    transport = None
    if isinstance(impersonate, str):
        transport = AsyncCurlTransport(proxy=proxy, impersonate=impersonate, default_headers=True, curl_options={CurlOpt.FRESH_CONNECT: True})
    hx = AsyncClient(
        headers=headers,
        transport=transport,
        proxy=proxy,
        timeout=REQUEST_TIMEOUT,
        follow_redirects=True,
        event_hooks={"request": [log_req], "response": [log_resp]},
    )
    file_opened = False  # True once the target has been truncated for a streamed write
    try:
        # Enter the client in both modes so it is always closed, including on errors.
        async with semaphore, hx:
            if stream:  # can monitor progress, but the retry mechanism does not work
                async with hx.stream("GET", link) as response:
                    # Fail before touching the file so an HTTP error page is never saved as the download.
                    response.raise_for_status()
                    total = int(response.headers.get("Content-Length", 0))
                    async with await anyio.open_file(path, "wb") as f:
                        file_opened = True
                        async for chunk in response.aiter_bytes():
                            await f.write(chunk)
                            # Read the counter after the write so the progress includes this chunk.
                            num_bytes_downloaded = response.num_bytes_downloaded
                            msg = f"⏬下载中: {readable_size(num_bytes_downloaded)} / {readable_size(total)}\n💾{path.name}"
                            msg += f" ({num_bytes_downloaded / total:.2%})" if total > 0 else ""
                            await modify_progress(text=msg, **kwargs)
            else:
                response = await hx.get(link)
                response.raise_for_status()
                path.write_bytes(response.content)  # Save the file to disk
    except (RequestError, HTTPStatusError) as e:
        if file_opened:
            # Remove the truncated file so a later `skip_exist` call does not treat it as complete.
            path.unlink(missing_ok=True)
        error = f"Failed to download: {e}"
        logger.error(error)
        await modify_progress(text=error, **kwargs)
        return ""
    if path.is_file():
        logger.info(f"Downloaded file saved to {path}")
        await modify_progress(text=f"🎉下载成功\n{path.name}", **kwargs)
        return path.as_posix()
    return ""
226
227
async def download_first_success_urls(links: list[str], **kwargs) -> str:
    """Try each link in order and return the path of the first download that produced a file.

    Note: at most one file is downloaded; links after the first success are not attempted.
    Extra keyword arguments are forwarded to ``download_file``.

    Returns:
        str: Path of the downloaded file, or "" when ``links`` is empty or every attempt failed.
    """
    candidates = links or []
    for candidate in candidates:
        saved = await download_file(candidate, **kwargs)
        if not Path(saved).is_file():
            continue
        return saved
    return ""
240
241
async def download_media(media: list[dict], **kwargs) -> list[dict]:
    """Await the pending downloads stored in media items and replace them with the resulting paths.

    Each item may hold an awaitable under "photo", "video", "livephoto" and/or "audio".
    All awaitables are run concurrently. An item is returned only when every one of its
    downloads succeeded; items without any of these keys are not returned.

    Args:
        media (list[dict]): Media items; they are updated in place.
        **kwargs: Forwarded to ``modify_progress``.

    Returns:
        list[dict]: The successfully downloaded items, in their original order.
    """
    if not media:
        return []
    media_keys = ("photo", "video", "livephoto", "audio")
    # Remember which (item, key) produced each awaitable so results map back unambiguously,
    # whatever the number of media keys an item carries.
    jobs = [(index, key) for index, item in enumerate(media) for key in media_keys if item.get(key)]
    # run all tasks
    results = await asyncio.gather(*(media[index][key] for index, key in jobs), return_exceptions=True)

    downloaded = {}  # item index -> [(key, result), ...]
    failed = set()  # indexes of items with at least one failed download
    for (index, key), result in zip(jobs, results, strict=True):
        # BaseException: gather() may also hand back a CancelledError, which is not an Exception.
        if isinstance(result, BaseException):
            logger.error(f"Failed to download: {result}")
            failed.add(index)
        elif isinstance(result, str) and not Path(result).is_file():
            logger.error(f"Downloaded file is not exists: {result}")
            failed.add(index)
        else:
            downloaded.setdefault(index, []).append((key, result))

    final_media = []
    for index, item in enumerate(media):
        if index in failed or index not in downloaded:
            continue
        for key, result in downloaded[index]:
            # A livephoto result is stored under "video", as the original code did.
            # NOTE(review): the "livephoto" key itself is left untouched - confirm consumers expect that.
            item["video" if key == "livephoto" else key] = result
            logger.success(f"Downloaded: {result}")
        final_media.append(item)
    await modify_progress(text=f"✅下载成功:\n{summay_media(final_media)}", **kwargs)
    return final_media
279
280
@cache.memoize(ttl=60)
async def match_social_media_link(text: str, *, flatten_first: bool = True) -> dict:
    """Matches social media links in the given text and returns a dictionary with the matched information.

    Patterns are tried in a fixed order and the first match wins, so the order of the
    branches below is significant (e.g. weibo video before weibo post, reddit share links
    before the other reddit rules).

    Args:
        text (str): The text to search for social media links.
        flatten_first (bool): Resolve known short links with ``flatten_rediercts`` before matching.

    Returns:
        dict: A dictionary containing the matched information. On a match, at least "platform", "url", and "db_key" keys are present.
            platform: The social media platform name.
            url: The matched URL.
            db_key: The key to store in the cache.
        When nothing matches, ``{"platform": ""}`` is returned.
    #! TODO: Handle multiple links in one message.
    """
    # NOTE: several patterns contain `(:?...)`. That is a *capturing* group starting with an
    # optional literal colon, not the non-capturing `(?:...)`. The group indices used in each
    # branch count these groups, so do not change the spelling without renumbering.
    text = str(text)
    if flatten_first:
        text = await flatten_rediercts(text)
    # https://www.douyin.com/video/7398813386827468041
    # https://www.douyin.com/note/7458195074434846004
    # https://www.iesdouyin.com/share/video/7454527270925946138/
    # https://www.iesdouyin.com/share/note/7454527270925946138/
    if matched := re.search(r"(https?://)?(www\.)?(ies)?douyin\.com/(share/)?(:?|video|note)/(\d+)", text):
        return {"url": f"https://www.douyin.com/{matched.group(5)}/{matched.group(6)}", "db_key": f"www.douyin.com/{matched.group(5)}/{matched.group(6)}", "platform": "douyin"}
    # https://www.douyin.com/user/MS4wLjABAAAAXgBuOEcyavDhrRBqnD8x7d4pj7RIL5QFRlLehCnem8couoAg8yXR-MGhUK0i4riF?modal_id=7451543857952492810
    # https://www.douyin.com/discover?modal_id=7472757663609179430
    if matched := re.search(r"(https?://)?(www\.)?douyin\.com/(.*?)\?(.*?)modal_id=(\d+)", text):
        return {"url": f"https://www.douyin.com/video/{matched.group(5)}", "db_key": f"www.douyin.com/video/{matched.group(5)}", "platform": "douyin"}
    # https://www.douyin.com/?previous_page=oversea_share_link&vid=7483851761246031115
    if matched := re.search(r"(https?://)?(www\.)?douyin\.com/(.*?)\?(.*?)vid=(\d+)", text):
        return {"url": f"https://www.douyin.com/video/{matched.group(5)}", "db_key": f"www.douyin.com/video/{matched.group(5)}", "platform": "douyin"}
    # https://www.tiktok.com/@baymermel/video/7460653893941267755\?_t\=ZS-8t8YbVWqv5k\&_r\=1
    if matched := re.search(r"(https?://)?(www\.)?tiktok\.com/(.*?)/(\d+)", text):
        return {"url": https_url(matched.group(0)), "db_key": bare_url(matched.group(0)), "platform": "tiktok"}

    # https://www.instagram.com/p/C7P3jN8vmEN
    # https://www.instagram.com/reel/DBBEGXpvwNF
    if matched := re.search(r"(https?://)?(www\.)?instagram\.com/(:?|p|reel)/([^.。,,/\s]+)", text):
        return {"post_type": matched.group(3), "post_id": matched.group(4), "url": https_url(matched.group(0)), "db_key": bare_url(matched.group(0)), "platform": "instagram"}
    # https://www.instagram.com/yifaer_chen/p/DEzv9x-vzOn/
    if matched := re.search(r"(https?://)?(www\.)?instagram\.com/[a-zA-Z0-9_.]+/(:?|p|reel)/([^.。,,/\s]+)", text):
        return {"post_type": matched.group(3), "post_id": matched.group(4), "url": https_url(matched.group(0)), "db_key": bare_url(matched.group(0)), "platform": "instagram"}
    # https://www.instagram.com/stories/laufey/3891120377355460527
    if matched := re.search(r"(https?://)?(www\.)?instagram\.com/stories/([a-zA-Z0-9_.]+)/(\d+)", text):
        return {"post_type": "story", "post_id": matched.group(4), "username": matched.group(3), "url": https_url(matched.group(0)), "db_key": bare_url(matched.group(0)), "platform": "instagram"}

    # https://x.com/taylorswift13/status/1794805688696275131
    # https://twitter.com/taylorswift13/status/1794805688696275131
    # https://fixupx.com/taylorswift13/status/1794805688696275131
    # https://fxtwitter.com/taylorswift13/status/1794805688696275131
    if matched := re.search(r"(https?://)?(:?twitter|x|fxtwitter|fixupx|vxtwitter)\.com\/(\w+)\/status/(\d+)", text):
        handle = matched.group(3)
        post_id = matched.group(4)
        url = f"https://x.com/{handle}/status/{post_id}"
        return {"platform": "x", "handle": handle, "post_id": post_id, "url": url, "db_key": bare_url(url)}

    # weibo video first, then weibo post
    # https://video.weibo.com/show?fid=1034:5123779299311660
    # https://h5.video.weibo.com/show/1034:5169532881535051
    if matched := re.search(r"(https?://)?(h5\.)?video\.weibo\.(:?com|cn)/show(\?fid=|\/)(\d+):(\d+)", text):
        return {"post_id": f"weibovideo{matched.group(5)}:{matched.group(6)}", "url": https_url(matched.group(0)), "db_key": bare_url(matched.group(0)), "platform": "weibo"}
    # https://weibo.com/tv/show/1034:5123779299311660?from=old_pc_videoshow
    if matched := re.search(r"(https?://)?(www\.)?weibo\.(:?com|cn)/tv/show/(\d+):(\d+)", text):
        return {"post_id": f"weibovideo{matched.group(4)}:{matched.group(5)}", "url": https_url(matched.group(0)), "db_key": bare_url(matched.group(0)), "platform": "weibo"}
    # https://m.weibo.cn/detail/5113333048938691
    # https://m.weibo.cn/status/5113333048938691
    if matched := re.search(r"(https?://)?m\.weibo\.cn/(:?detail|status)/(\w+)", text):
        return {"post_id": matched.group(3), "url": https_url(matched.group(0)), "db_key": f"m.weibo.cn/detail/{matched.group(3)}", "platform": "weibo"}
    # https://weibo.com/1736562685/P6lhSjRnI
    if matched := re.search(r"(https?://)?(www\.)?weibo\.com/(.*?)/(\w+)", text):
        return {"post_id": matched.group(4), "url": https_url(matched.group(0)), "db_key": f"m.weibo.cn/detail/{matched.group(4)}", "platform": "weibo"}

    # http://xhslink.com/a/Z3VPXAReU1Y1
    xhs_pattern = r"(https?://)?xhslink\.com/(\w?/?)([^,,.。?\s]+)"
    if matched := re.search(xhs_pattern, text):
        # Resolve the short link only to extract the post id and xsec token; the short URL itself is returned.
        transport = AsyncCurlTransport(proxy=PROXY.XHS, impersonate="safari_ios", default_headers=True, curl_options={CurlOpt.FRESH_CONNECT: True})
        flatten = await flatten_rediercts(https_url(matched.group(0)), transport=transport, pattern=xhs_pattern, proxy=PROXY.XHS, method="GET")
        base_url = flatten.split("?")[0]
        post_id = Path(base_url).stem
        queries = parse_qs(urlparse(flatten).query)
        xsec = queries.get("xsec_token", [""])[0]
        return {"url": https_url(matched.group(0)), "db_key": f"www.xiaohongshu.com/explore/{post_id}", "xsec": xsec, "is_xhs_link": True, "platform": "xiaohongshu"}
    # https://www.xiaohongshu.com/explore/671a3dfe00000000240161db?xsec_token=ABY-b1JKuAlIm2dX1OSdIFHD7cQFHEdThv5aMyccvmbJo=
    if matched := re.search(r"(https?://)?(www\.)?xiaohongshu\.com/([^。,,\s]+)", text):
        base_url = matched.group(0).split("?")[0]
        post_id = Path(base_url).stem
        queries = parse_qs(urlparse(matched.group(0)).query)
        xsec = queries.get("xsec_token", [""])[0]
        return {
            "url": f"https://www.xiaohongshu.com/explore/{post_id}?xsec_token={xsec}",
            "db_key": f"www.xiaohongshu.com/explore/{post_id}",
            "is_xhs_link": False,
            "xsec": xsec,
            "platform": "xiaohongshu",
        }

    # https://www.bilibili.com/video/BV1RSsNzDEQb
    # https://www.bilibili.com/video/av115402113881975
    # https://www.bilibili.com/BV1RSsNzDEQb
    # https://www.bilibili.com/av115402113881975
    # The trailing optional group captures the query string so that `?p=N` (part number) can be read;
    # without it the match never contains a query and the part would always default to "1".
    if matched := re.search(r"(https?://)?(:?m\.|www\.)?bilibili\.com/(video/)?(?P<prefix>[aAbB][vV])(?P<id>[a-zA-Z0-9]+)(?:/?\?(?P<query>[^\s#]*))?", text):
        bvid = matched.group("prefix") + matched.group("id")
        # Take only the leading digits so trailing punctuation after the link is ignored.
        page = re.search(r"(?:^|&)p=([0-9]+)", matched.group("query") or "")
        pid = page.group(1) if page else "1"
        url = f"https://www.bilibili.com/video/{av2bv(bvid)}?p={pid}".removesuffix("?p=1")
        return {"url": url, "db_key": bare_url(url), "bvid": av2bv(bvid), "pid": pid, "platform": "bilibili"}

    # https://www.bilibili.com/list/watchlater/?bvid=BV1wi421f71U&oid=1451459580
    if matched := re.search(r"(https?://)?(:?m\.|www\.)?bilibili\.com/(.*?)bvid=(?P<prefix>[aAbB][vV])(?P<id>[a-zA-Z0-9]+)", text):
        bvid = matched.group(4) + matched.group(5)
        url = f"https://www.bilibili.com/video/{av2bv(bvid)}"
        # NOTE(review): "pid" is an int here while the branch above returns a str.
        return {"url": url, "db_key": bare_url(url), "bvid": av2bv(bvid), "pid": 1, "platform": "bilibili"}

    # https://m.bilibili.com/opus/1048442220384878593
    if matched := re.search(r"(https?://)?(:?m\.|www\.)?bilibili\.com/opus/(\d+)", text):
        post_id = matched.group(3)
        url = f"https://www.bilibili.com/opus/{post_id}"
        return {"url": url, "db_key": url, "post_id": post_id, "platform": "bilibili-opus"}

    # https://github.com/user-name/repo
    # https://github.com/user-name/repo/issues/123
    # https://github.com/user-name/repo/issues/123#issuecomment-45678
    # https://github.com/user-name/repo/pull/123
    # https://github.com/user-name/repo/pull/123#issuecomment-45678
    if matched := re.search(r"(https?://)?github\.com/([a-zA-Z0-9]+(-[a-zA-Z0-9]+)*)/([.a-zA-Z0-9_-]+)/?([#/a-zA-Z0-9_-]+)?", text):
        gh_user = matched.group(2)
        gh_repo = matched.group(4)
        query = matched.group(5) or ""
        url = matched.group(0)
        return {"url": https_url(url), "db_key": bare_url(url), "gh_user": gh_user, "gh_repo": gh_repo, "query": query, "platform": "github"}

    # https://www.v2ex.com/t/1153086
    if matched := re.search(r"(https?://)?(www\.)?v2ex\.com/t/(\d+)", text):
        topic_id = matched.group(3)
        url = f"https://www.v2ex.com/t/{topic_id}"
        return {"url": url, "db_key": bare_url(url), "topic_id": topic_id, "platform": "v2ex"}

    # https://open.spotify.com/track/0cOMncRq4cmDLO4tPQnkBF
    if matched := re.search(r"(https?://)?open\.spotify\.com/(:?track|album|artist|playlist)/([a-zA-Z0-9]+)", text):
        resource = matched.group(2)
        spotify_id = matched.group(3)
        url = matched.group(0)
        return {"url": url, "db_key": bare_url(url), "resource": resource, "spotify_id": spotify_id, "platform": "spotify"}

    # https://music.163.com/song?id=2021343740
    # https://163cn.tv/HYHqZ6R
    # https://163cn.link/HYHqZ6R
    if matched := re.search(r"(https?://)?(:?music\.163\.com|163cn\.tv|163cn\.link)/([0-9a-zA-Z#./?=_\-%&]+)", text):
        url = matched.group(0)
        return {"url": url, "db_key": bare_url(url), "platform": "music163"}

    # https://www.youtube.com/watch?v=D6aE2E0RHTc
    if matched := re.search(r"(https?://)?(:?m\.|www\.)?youtube\.com/watch.*?v=([a-zA-Z0-9_-]{11})", text):
        vid = matched.group(3)
        return {"url": f"https://www.youtube.com/watch?v={vid}", "db_key": f"www.youtube.com/watch?v={vid}", "vid": vid, "platform": "youtube"}
    # https://youtube.com/shorts/lFKHbluAlJw
    if matched := re.search(r"(https?://)?(:?m\.|www\.)?youtube\.com/(:?shorts|live)/([a-zA-Z0-9_-]{11})", text):
        vid = matched.group(4)
        return {"url": f"https://www.youtube.com/watch?v={vid}", "db_key": f"www.youtube.com/watch?v={vid}", "vid": vid, "platform": "youtube"}
    # https://youtu.be/vOiP3kfFlrE
    if matched := re.search(r"(https?://)?(:?m\.|www\.)?youtu\.be/([a-zA-Z0-9_-]{11})", text):
        vid = matched.group(3)
        return {"url": f"https://www.youtube.com/watch?v={vid}", "db_key": f"www.youtube.com/watch?v={vid}", "vid": vid, "platform": "youtube"}

    # https://mp.weixin.qq.com/s/bd_giuPEyPBu9LTOtC2VHw
    # https://mp.weixin.qq.com/s?__biz=MzI5Njc4NTYyOQ==&mid=2247494800&idx=1&sn=43a5732bd3a205d4dbdcd523afc0ca4a&sharer_shareinfo=1923203fd24bfa47c5b36b690026f5c8&sharer_shareinfo_first=8814eca80b4a37d10aa9b725e61f9486
    # Dots are escaped so only the real host matches; the separator after "s" is "/" or "?".
    if matched := re.search(r"(https?://)?mp\.weixin\.qq\.com/s[/?]([_A-Za-z\=\&0-9\#\-]+)", text):
        return {"url": matched.group(0), "db_key": bare_url(matched.group(0)), "platform": "wechat"}

    # !Put this before all reddit rules
    # https://www.reddit.com/r/China_irl/s/bA50WleCBM
    reddit_pattern = r"(https?://)?(:?m\.|www\.)reddit\.com/r/\w+/s/([^.。,,?&/\s]+)"
    if matched := re.search(reddit_pattern, text):
        # Share links are resolved first; the rules below then run against the resolved text.
        text = await flatten_rediercts(https_url(matched.group(0)), pattern=reddit_pattern, proxy=PROXY.REDDIT)
    # https://www.reddit.com/r/DoubanGoosegroup/comments/1jkpgvp/%E8%B5%B5%E8%96%87%E4%BB%80%E4%B9%88%E6%97%B6%E5%80%99%E5%9B%9E%E6%9D%A5/
    # https://www.reddit.com/r/DoubanGoosegroup/comments/1jkpgvp/赵薇什么时候回来
    # https://www.reddit.com/r/DoubanGoosegroup/comments/1jkpgvp/comment/mk43l4t/?utm_source=share&utm_medium=web3x&utm_name=web3xcss&utm_term=1&utm_content=share_button
    if matched := re.search(r"(https?://)?(:?m\.|www\.)?reddit\.com/r/([_A-Za-z0-9]+)/comments/(.*?)/([^,,.。\?\s]+)", text):
        return {"url": matched.group(0).rstrip("/"), "db_key": bare_url(matched.group(0).rstrip("/")), "platform": "reddit"}
    # https://reddit.com/comments/1kaazzn
    if matched := re.search(r"(https?://)?(:?m\.|www\.)?reddit\.com/comments/([_A-Za-z0-9]+)", text):
        return {"url": matched.group(0).rstrip("/"), "db_key": bare_url(matched.group(0).rstrip("/")), "platform": "reddit"}

    # https://arxiv.org/abs/2301.12345
    # https://arxiv.org/pdf/2301.12345v3
    if matched := re.search(r"(https?://)?arxiv\.org/(abs|pdf)/(\d{4}\.\d{4,5}(?:v\d+)?)", text):
        arxiv_id = matched.group(3)
        if "v" not in arxiv_id:
            arxiv_id += "v1"
        return {"url": f"https://arxiv.org/abs/{arxiv_id}", "arxiv_id": arxiv_id, "db_key": f"arxiv.org/abs/{arxiv_id}", "platform": "arxiv"}

    # if all above pre-defined patterns failed, try to match ytdlp link
    if urls := match_urls(text):
        for url in urls:
            if any(x in url.lower() for x in ["bilibili", "douyin", "instagram", "tiktok", "twitter", "weibo", "xiaohongshu", "reddit", "youtube"]):
                # handled above
                continue
            if is_supported_by_ytdlp(url):
                return {"url": url, "db_key": bare_url(url), "platform": "ytdlp"}
    return {"platform": ""}
481
482
@cache.memoize(ttl=60)
async def flatten_rediercts(
    texts: str | None = None,
    pattern: str | None = None,
    headers: dict | None = None,
    proxy: str | None = None,
    method: str = "HEAD",
    transport: AsyncCurlTransport | AsyncHTTPTransport | None = None,
) -> str:
    """Replace a known short link inside ``texts`` with the URL it redirects to.

    The built-in short-link patterns are checked in order and a later match overrides an
    earlier one; ``pattern`` (if given) is checked last. Only one link is resolved per call.
    Resolution is best effort: on any request error the text is returned with whatever
    redirect target was reached so far (possibly just the https form of the link).

    Args:
        texts (str, optional): Text that may contain a short link.
        pattern (str, optional): Extra regex whose whole match is treated as the link to resolve.
        headers (dict, optional): Headers sent with the resolving request.
        proxy (str, optional): Proxy for the resolving request.
        method (str): "HEAD" lets httpx follow redirects; "GET" follows ``Location`` headers manually.
            Some hosts below force "GET".
        transport (optional): Transport passed to the httpx client.

    Returns:
        str: "" when ``texts`` is empty, ``texts`` unchanged when no link matched,
        otherwise ``texts`` with the matched link replaced by its resolved URL.
    """
    if not texts:
        return ""
    url = ""
    # v.douyin.com
    if matched := re.search(r"(https?://)?v\.douyin\.com/([^.。,,?&/\s]+)", texts):
        method = "GET"  # use GET for v.douyin.com
        url = matched.group(0)
    # vt.tiktok.com
    if matched := re.search(r"(https?://)?vt\.tiktok\.com/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # tiktok.com/t/
    if matched := re.search(r"(https?://)?(www\.)?tiktok\.com/t/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # facebook.com/
    if matched := re.search(r"(https?://)?(:?m\.|www\.)facebook\.com/share/(v|r)/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # b23.tv
    if matched := re.search(r"(https?://)?b23\.tv/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # bili2233.cn
    if matched := re.search(r"(https?://)?bili2233\.cn/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # t.co
    if matched := re.search(r"(https?://)?t\.co/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # mapp.api.weibo.cn
    if matched := re.search(r"(https?://)?mapp\.api\.weibo\.cn/fx/([0-9a-zA-Z]+)\.html", texts):
        url = matched.group(0)
    # t.cn
    if matched := re.search(r"(https?://)?t\.cn/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
        method = "GET"
    # bit.ly
    if matched := re.search(r"(https?://)?bit\.ly/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # shorturl.at
    if matched := re.search(r"(https?://)?shorturl\.at/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # vertexaisearch.cloud.google.com
    if matched := re.search(r"(https?://)?vertexaisearch\.cloud\.google\.com/([0-9a-zA-Z\-_=+/]+)", texts):
        url = matched.group(0)
        proxy = PROXY.GOOGLE

    # custom pattern
    if pattern and (matched := re.search(pattern, texts)):
        url = matched.group(0)
    if not url:
        return texts
    # parse redirect
    rediercted_url = https_url(url)
    hooks = {"request": [log_req], "response": [log_resp]}
    max_hops = 10  # upper bound for manual redirect following, guards against redirect cycles
    try:
        if method == "HEAD":
            async with AsyncClient(http2=True, proxy=proxy, follow_redirects=True, transport=transport, event_hooks=hooks) as hx:
                resp = await hx.head(rediercted_url, headers=headers, timeout=3)
                rediercted_url = str(resp.url)
        elif method == "GET":
            # One client for the whole chain: leaving `async with` closes the transport,
            # so a per-hop client could not reuse a caller-supplied transport.
            async with AsyncClient(http2=True, proxy=proxy, follow_redirects=False, transport=transport, event_hooks=hooks) as hx:
                for _ in range(max_hops):
                    resp = await hx.get(rediercted_url, headers=headers, timeout=3)
                    location = resp.headers.get("Location")
                    # Stop on a non-redirect, or on a 3xx without a target (e.g. 304).
                    if not (300 <= resp.status_code < 400 and location):
                        break
                    # Location may be relative; resolve it against the URL just requested.
                    rediercted_url = str(resp.url.join(location))
    except Exception as e:
        # Best effort: keep whatever target was reached, but leave a trace of the failure.
        logger.warning(f"Failed to flatten redirect for {url}: {type(e).__name__}: {e}")
    if url != rediercted_url:
        logger.info(f"Flatten redirect: {url} -> {rediercted_url}")
    return texts.replace(url, rediercted_url)
557
558
559async def shorten_url(url: str, alias: str | None = None, services: list[str] | None = None) -> str:
560 """Shorten URL."""
561 if not url:
562 return url
563 supported = ["spoo.me", "cleanuri.com"]
564 if services is None:
565 services = supported
566 services = [x for x in services if x.lower() in supported]
567 for service in services:
568 if service == "spoo.me":
569 expire_after = nowdt() + timedelta(days=365 * 2)
570 headers = {"Content-Type": "application/json"}
571 payload = {"long_url": url, "expire_after": round(expire_after.timestamp()), "block_bots": False}
572 if TOKEN.SPOOME:
573 headers |= {"Authorization": f"Bearer {TOKEN.SPOOME}"}
574 payload |= {"private_stats": False}
575 if alias:
576 payload |= {"alias": alias}
577 resp = await hx_req("https://spoo.me/api/v1/shorten", "POST", headers=headers, json_data=payload, check_kv={"status": "ACTIVE"}, max_retry=0)
578 if glom(resp, "hx_raw.code", default="") == "conflict":
579 return f"https://spoo.me/{alias}"
580 if short_url := glom(resp, "short_url", default=""):
581 return short_url
582 if service == "cleanuri.com":
583 resp = await hx_req("https://cleanuri.com/api/v1/shorten", "POST", json_data={"url": url}, check_keys=["result_url"])
584 if short_url := glom(resp, "result_url", default=""):
585 return short_url
586 return url
587
588
if __name__ == "__main__":
    # Manual smoke run: shorten a sample URL, then exercise check_data on a nested payload.
    asyncio.run(shorten_url("https://www.google.com", alias="test"))
    sample_payload = {"foo": "bar", "baz": {"qux": "quux"}, "lst": ["1", "2", "3"]}
    expected_pairs = {"foo": "bar", "baz.qux": "quux", "lst": ["1", "2", "3"]}
    check_data(json.dumps(sample_payload), check_keys=["baz.qux"], check_kv=expected_pairs)
594 # asyncio.run(match_social_media_link("https://b23.tv/3MSgT4q/", flatten_first=True))
595 # print(asyncio.run(match_social_media_link("https://mp.weixin.qq.com/s/bd_giuPEyPBu9LTOtC2VHw", flatten_first=True)))
596 # print(asyncio.run(match_social_media_link("https://reddit.com/comments/1kaazzn", flatten_first=True)))
597 # print(asyncio.run(match_social_media_link("https://www.reddit.com/r/China_irl/s/bA50WleCBM")))
598 # asyncio.run(match_social_media_link("https://www.facebook.com/share/r/19QGGp39T3/", flatten_first=True))
599 # asyncio.run(match_social_media_link("https://www.douyin.com/video/7398813386827468041"))
600 # asyncio.run(match_social_media_link("https://www.iesdouyin.com/share/note/7454527270925946138/"))
601 # asyncio.run(match_social_media_link("https://www.instagram.com/yifaer_chen/p/DEzv9x-vzOn/"))
602 # asyncio.run(flatten_rediercts("http://t.cn/A6ukIuVn"))
603 # asyncio.run(flatten_rediercts("shorturl.at/fuyrt"))
604 # asyncio.run(flatten_rediercts("https://b23.tv/3MSgT4q"))
605 # asyncio.run(flatten_rediercts("https://v.douyin.com/CeiJfJMQG/"))
606 # asyncio.run(flatten_rediercts("https://www.tiktok.com/t/ZT2mcMA7f/"))
607 # asyncio.run(flatten_rediercts("https://t.co/Wwo3x69CQz"))
608 # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp/issues/14463")))
609 # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp/")))
610 # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp")))
611 # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp/issues/14404#issuecomment-3323873708")))
612 # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp/pull/14467")))
613 # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp/pull/14417#issuecomment-3327344721")))
614 # print(asyncio.run(match_social_media_link("https://mapp.api.weibo.cn/fx/09ab955a1e0d406c9d6a74f5a2242b4a.html")))
615 # print(asyncio.run(match_social_media_link("https://www.bilibili.com/video/BV1TC411J7PK")))
616 # print(asyncio.run(match_social_media_link("https://www.bilibili.com/BV1TC411J7PK")))
617 # print(asyncio.run(match_social_media_link("https://www.instagram.com/miyoshi.aa/p/DN5hFcUE8rS/")))
618 # print(asyncio.run(match_social_media_link("https://www.youtube.com/watch?v=D6aE2E0RHTc")))
619 # print(asyncio.run(match_social_media_link("https://youtube.com/shorts/lFKHbluAlJw")))
620 # print(asyncio.run(match_social_media_link("https://youtu.be/vOiP3kfFlrE?si=zPd-Bt1GO03jxpI_")))
621 # res = asyncio.run(hx_req("https://httpbin.org/delay/10"))
622 # asyncio.run(hx_req("https://httpbin.org/get", check_kv={"url": "https://httpbin.org/get", "headers.Pragma": "no-cache1"}, max_retry=1))
623 # resp = asyncio.run(hx_req("https://httpbin.org/get", check_kv={"headers": {"Accept-Language": "en-US,en;q=0.8"}}))
624 # resp = asyncio.run(hx_req("https://httpbin.org/headers", headers={"referer": "https://www.xiaohongshu.com/"}))
625 # print(resp)
626
627 # asyncio.run(download_file("https://httpbin.org/image/jpeg", suffix=".jpg"))
628 # asyncio.run(match_social_media_link("https://www.instagram.com/p/C7P3jN8vmEN"))