#!/usr/bin/env python
# -*- coding: utf-8 -*-

import asyncio
import contextlib
import json
import re
from pathlib import Path
from typing import Any, Literal
from urllib.parse import parse_qs, urlparse

import anyio
from curl_cffi.requests.impersonate import BrowserTypeLiteral
from httpx import AsyncClient, AsyncHTTPTransport, HTTPStatusError, Request, RequestError, Response
from httpx._types import RequestContent, RequestData, RequestFiles
from httpx_curl_cffi import AsyncCurlTransport, CurlOpt
from loguru import logger

from config import DOWNLOAD_DIR, PROXY, REQUEST_TIMEOUT, cache, semaphore
from messages.progress import modify_progress
from messages.utils import summay_media
from utils import av2bv, bare_url, check_data, https_url, is_supported_by_ytdlp, match_urls, readable_size


# ruff: noqa: RUF001
async def log_req(request: Request) -> None:
    logger.debug(f"{request.method} {request.url} {request.headers}")


async def log_resp(response: Response) -> None:
    request = response.request
    logger.debug(f"[{response.status_code}] {request.method} {request.url}")


async def hx_req(
    url: str,
    method: str = "GET",
    *,
    transport: AsyncCurlTransport | AsyncHTTPTransport | None = None,
    headers: dict | None = None,
    cookies: dict | None = None,
    params: dict | None = None,
    data: RequestData | None = None,
    json_data: dict | None = None,
    content_data: RequestContent | None = None,
    files: RequestFiles | None = None,
    proxy: str | None = None,
    follow_redirects: bool = True,
    check_keys: list[str] | None = None,
    check_kv: dict | None = None,
    timeout: int = REQUEST_TIMEOUT,  # noqa: ASYNC109
    retry: int = 0,
    max_retry: int = 2,
    verify: bool = True,
    silent: bool = False,
    mobile: bool = False,
    rformat: Literal["json", "text", "content"] = "json",
    last_error: str = "",
    hx_raw: dict | None = None,
) -> dict[str, Any]:
    """Request the given URL with the given method and return the response as a dictionary.

    Args:
        url (str): The URL to request.
        method (str): The method to use for the request.
        transport (AsyncCurlTransport | AsyncHTTPTransport, optional): The transport to use; defaults to a browser-impersonating curl transport.
        headers (dict, optional): The headers to use for the request.
        cookies (dict, optional): The cookies to use for the request.
        params (dict, optional): The query parameters to use for the request.
        data (RequestData, optional): The form data to POST or PUT.
        json_data (dict, optional): The JSON data to POST or PUT.
        content_data (RequestContent, optional): The raw body content to POST or PUT.
        files (RequestFiles, optional): The files to upload.
        proxy (str, optional): The proxy to use for the request.
        follow_redirects (bool, optional): Whether to follow redirects.
        check_keys (list[str], optional): The keys to check in the response.
        check_kv (dict, optional): The key-value pairs to check in the response.
        timeout (int, optional): The timeout for the request.
        retry (int, optional): The current retry count; used internally by the recursive retry.
        max_retry (int, optional): The maximum number of retries.
        verify (bool, optional): Whether to verify the SSL certificate.
        silent (bool, optional): Whether to suppress the logs.
        mobile (bool, optional): Whether to impersonate a mobile browser.
        rformat (str, optional): The format of the response: "json", "text", or "content".
        last_error (str, optional): The last error message; used internally by the recursive retry.
        hx_raw (dict, optional): The raw response body of the last failed attempt; used internally.

    Returns:
        dict: The parsed response merged with {"headers", "status_code"} metadata on success,
            or {"hx_error": ..., "hx_raw": ...} after all retries fail.
    """
    if retry > max_retry:
        logger.error(f"[{method}] Failed after {retry} retries: {url}")
        return {"hx_error": last_error, "hx_raw": hx_raw or {}}
    # validate the method before building the client, so an invalid call does not leave an unclosed AsyncClient behind
    if method not in ("GET", "POST", "PUT", "DELETE"):
        error = f"Invalid method: {method}"
        logger.error(error)
        return {"hx_error": error}
    if transport is None:
        transport = AsyncCurlTransport(proxy=proxy, impersonate="safari_ios" if mobile else "chrome", default_headers=True, curl_options={CurlOpt.FRESH_CONNECT: True})

    if silent:
        client = AsyncClient(http2=True, proxy=proxy, transport=transport, follow_redirects=follow_redirects, timeout=timeout, verify=verify)
    else:
        client = AsyncClient(
            http2=True,
            proxy=proxy,
            transport=transport,
            follow_redirects=follow_redirects,
            timeout=timeout,
            verify=verify,
            event_hooks={"request": [log_req], "response": [log_resp]},
        )

    try:
        async with client:
            if method == "GET":
                response = await client.get(url, cookies=cookies, headers=headers, params=params)
            elif method == "POST":
                response = await client.post(url, cookies=cookies, headers=headers, data=data, json=json_data, files=files, content=content_data, params=params)
            elif method == "PUT":
                response = await client.put(url, cookies=cookies, headers=headers, data=data, json=json_data, content=content_data, files=files, params=params)
            else:
                response = await client.delete(url, cookies=cookies, headers=headers, params=params)
            response.raise_for_status()
            meta = {"headers": response.headers, "status_code": response.status_code}
            if rformat == "content":
                return {"content": response.content} | meta
            resp_data = response.text
            check_data(resp_data, check_keys=check_keys, check_kv=check_kv)
            res = json.loads(resp_data) if rformat == "json" else {rformat: resp_data}
            if not silent:
                logger.trace(res)
            return (res | meta) if isinstance(res, dict) else res
    except Exception as e:
        error = f"{type(e).__name__}[{retry + 1}/{max_retry + 1}]: Failed to request {url}, {e}"
        with contextlib.suppress(Exception):
            hx_raw = response.json()
        if "res" in locals():
            error += f"\n{res}"
        if data:  # `data` is a parameter, so `"data" in locals()` would always be true
            error += f"\n{data}"
        if hx_raw:
            error += f"\n{hx_raw}"
        elif "response" in locals():
            error += f"\n{response.text}"
        logger.error(error)
        # note: transport is not forwarded to the retry because closing the client above also closed it
        return await hx_req(url, method, headers=headers, cookies=cookies, params=params, data=data, json_data=json_data, content_data=content_data, files=files, proxy=proxy, follow_redirects=follow_redirects, check_keys=check_keys, check_kv=check_kv, timeout=timeout, retry=retry + 1, max_retry=max_retry, verify=verify, silent=silent, mobile=mobile, rformat=rformat, last_error=error, hx_raw=hx_raw)  # fmt: skip


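# A minimal usage sketch for hx_req (kept as comments, matching the examples under __main__ below;
# httpbin.org and its response keys are illustrative assumptions):
#   res = await hx_req("https://httpbin.org/get", params={"q": "1"}, check_keys=["url"])
#   if "hx_error" in res:  # failures surface as an "hx_error" key instead of raising
#       logger.warning(res["hx_error"])
#   else:
#       logger.info(f"status={res['status_code']}")  # meta keys are merged into the parsed JSON body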
async def download_file(
    link: str,
    path: str | Path | None = None,
    *,
    suffix: str = "",
    skip_exist: bool = False,
    proxy: str | None = None,
    headers: dict | None = None,
    impersonate: BrowserTypeLiteral | None = "safari_ios",
    stream: bool = False,
    **kwargs,
) -> str:
    """Download a file from the given link and save it to the specified path.

    Args:
        link (str): URL to download the file.
        path (str | Path, optional): The path to save the downloaded file. Defaults to auto detect.
        suffix (str, optional): The suffix to append to the file name. Defaults to auto detect.
        skip_exist (bool, optional): Skip downloading if the file already exists. Defaults to False.
        proxy (str, optional): The proxy to use for the request.
        headers (dict, optional): The headers to use for the request.
        impersonate (BrowserTypeLiteral, optional): The browser fingerprint to impersonate; None disables the curl transport.
        stream (bool, optional): Stream the download. Defaults to False.

    Returns:
        str: The downloaded file path, or "" on failure.
    """
    if not link:
        return ""
    if path is None:
        path = Path(DOWNLOAD_DIR) / Path(urlparse(link).path).name
    path = Path(path).expanduser().resolve()
    if path.suffix != suffix:
        path = path.with_suffix(f"{path.suffix}{suffix}")  # append suffix, not replace

    if path.is_file() and skip_exist:
        logger.info(f"File already exists, skipping download: {path}")
        return path.as_posix()
    path.parent.mkdir(parents=True, exist_ok=True)
    proxy = proxy or PROXY.DOWNLOAD
    logger.trace(f"Downloading {link} to {path} with proxy={proxy}")
    hx = AsyncClient(
        headers=headers,
        transport=AsyncCurlTransport(proxy=proxy, impersonate=impersonate, default_headers=True, curl_options={CurlOpt.FRESH_CONNECT: True}) if isinstance(impersonate, str) else None,
        proxy=proxy,
        timeout=REQUEST_TIMEOUT,
        follow_redirects=True,
        event_hooks={"request": [log_req], "response": [log_resp]},
    )
    try:
        if stream:  # can monitor progress, but the retry mechanism does not work
            async with semaphore, hx, hx.stream("GET", link) as response:  # enter hx too, so the client is closed
                response.raise_for_status()  # fail before writing an error page to disk
                total = int(response.headers.get("Content-Length", 0))
                async with await anyio.open_file(path, "wb") as f:
                    num_bytes_downloaded = response.num_bytes_downloaded
                    async for chunk in response.aiter_bytes():
                        await f.write(chunk)
                        msg = f"⏬下载中: {readable_size(num_bytes_downloaded)} / {readable_size(total)}\n💾{path.name}"
                        msg += f" ({num_bytes_downloaded / total:.2%})" if total else ""
                        await modify_progress(text=msg, **kwargs)
                        num_bytes_downloaded = response.num_bytes_downloaded
        else:
            async with semaphore, hx:
                response = await hx.get(link)
                response.raise_for_status()
                path.write_bytes(response.content)  # Save the file to disk
    except (RequestError, HTTPStatusError) as e:
        error = f"Failed to download: {e}"
        logger.error(error)
        await modify_progress(text=error, **kwargs)
        return ""
    if path.is_file():
        logger.info(f"Downloaded file saved to {path}")
        await modify_progress(text=f"🎉下载成功\n{path.name}", **kwargs)
        return path.as_posix()
    return ""


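# A hedged sketch of the two download modes (URL and suffix are placeholders):
#   path = await download_file("https://httpbin.org/image/jpeg", suffix=".jpg")               # buffered: body kept in memory, written once
#   path = await download_file("https://httpbin.org/image/jpeg", suffix=".jpg", stream=True)  # streamed: chunked to disk with progress updates
# both calls return "" on failure, so callers can simply check `if path:`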
async def download_first_success_urls(links: list[str], **kwargs) -> str:
    """Download the first file that downloads successfully from a list of links.

    Note: At most a single file is downloaded, from the first link that succeeds.
    """
    if not links:
        return ""
    for link in links:
        res = await download_file(link, **kwargs)
        if Path(res).is_file():
            return res
    return ""


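# Sketch: try mirror URLs in order until one succeeds (the URLs are placeholders):
#   path = await download_first_success_urls(["https://cdn-a.example/v.mp4", "https://cdn-b.example/v.mp4"], suffix=".mp4")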
async def download_media(media: list[dict], **kwargs) -> list[dict]:
    if not media:
        return []
    # each item is expected to carry at most one un-awaited download coroutine under one of these keys
    pending = []
    for item in media:
        for key in ("photo", "video", "livephoto", "audio"):
            if coro := item.get(key):
                pending.append((item, key, coro))
                break
    # run all tasks
    results = await asyncio.gather(*(coro for _, _, coro in pending), return_exceptions=True)
    final_media = []
    for (item, key, _), result in zip(pending, results, strict=True):
        if isinstance(result, Exception):
            logger.error(f"Failed to download: {result}")
        elif isinstance(result, str) and not Path(result).is_file():
            logger.error(f"Downloaded file does not exist: {result}")
        else:
            # a livephoto result is stored under the "video" key
            item["video" if key == "livephoto" else key] = result
            final_media.append(item)
            logger.success(f"Downloaded: {result}")
    await modify_progress(text=f"✅下载成功:\n{summay_media(final_media)}", **kwargs)
    return final_media


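# download_media expects each dict in `media` to hold one un-awaited download coroutine;
# an illustrative shape (extra keys such as captions pass through untouched):
#   media = [{"photo": download_file(photo_url)}, {"video": download_file(video_url)}]
#   media = await download_media(media)  # coroutine values are replaced by the saved file paths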
@cache.memoize(ttl=60)
async def match_social_media_link(text: str, *, flatten_first: bool = True) -> dict:
    """Match social media links in the given text and return a dictionary with the matched information.

    Args:
        text (str): The text to search for social media links.
        flatten_first (bool, optional): Whether to resolve short/redirect links before matching. Defaults to True.

    Returns:
        dict: A dictionary containing the matched information. At least a "platform" key is present;
            matched links also carry "url" and "db_key".
            platform: The social media platform name ("" if nothing matched).
            url: The matched URL.
            db_key: The key to store in the cache.
    #! TODO: Handle multiple links in one message.
    """
    text = str(text)
    if flatten_first:
        text = await flatten_rediercts(text)
    # https://www.douyin.com/video/7398813386827468041
    # https://www.douyin.com/note/7458195074434846004
    # https://www.iesdouyin.com/share/video/7454527270925946138/
    # https://www.iesdouyin.com/share/note/7454527270925946138/
    if matched := re.search(r"(https?://)?(www\.)?(ies)?douyin\.com/(share/)?(video|note|)/(\d+)", text):
        return {"url": f"https://www.douyin.com/{matched.group(5)}/{matched.group(6)}", "db_key": f"www.douyin.com/{matched.group(5)}/{matched.group(6)}", "platform": "douyin"}
    # https://www.douyin.com/user/MS4wLjABAAAAXgBuOEcyavDhrRBqnD8x7d4pj7RIL5QFRlLehCnem8couoAg8yXR-MGhUK0i4riF?modal_id=7451543857952492810
    # https://www.douyin.com/discover?modal_id=7472757663609179430
    if matched := re.search(r"(https?://)?(www\.)?douyin\.com/(.*?)\?(.*?)modal_id=(\d+)", text):
        return {"url": f"https://www.douyin.com/video/{matched.group(5)}", "db_key": f"www.douyin.com/video/{matched.group(5)}", "platform": "douyin"}
    # https://www.douyin.com/?previous_page=oversea_share_link&vid=7483851761246031115
    if matched := re.search(r"(https?://)?(www\.)?douyin\.com/(.*?)\?(.*?)vid=(\d+)", text):
        return {"url": f"https://www.douyin.com/video/{matched.group(5)}", "db_key": f"www.douyin.com/video/{matched.group(5)}", "platform": "douyin"}
    # https://www.tiktok.com/@baymermel/video/7460653893941267755?_t=ZS-8t8YbVWqv5k&_r=1
    if matched := re.search(r"(https?://)?(www\.)?tiktok\.com/(.*?)/(\d+)", text):
        return {"url": https_url(matched.group(0)), "db_key": bare_url(matched.group(0)), "platform": "tiktok"}

    # https://www.instagram.com/p/C7P3jN8vmEN
    # https://www.instagram.com/reel/DBBEGXpvwNF
    if matched := re.search(r"(https?://)?(www\.)?instagram\.com/(p|reel)/([^.。,,/\s]+)", text):
        return {"post_type": matched.group(3), "post_id": matched.group(4), "url": https_url(matched.group(0)), "db_key": bare_url(matched.group(0)), "platform": "instagram"}
    # https://www.instagram.com/yifaer_chen/p/DEzv9x-vzOn/
    if matched := re.search(r"(https?://)?(www\.)?instagram\.com/[a-zA-Z0-9_.]+/(p|reel)/([^.。,,/\s]+)", text):
        return {"post_type": matched.group(3), "post_id": matched.group(4), "url": https_url(matched.group(0)), "db_key": bare_url(matched.group(0)), "platform": "instagram"}

    # https://x.com/taylorswift13/status/1794805688696275131
    # https://twitter.com/taylorswift13/status/1794805688696275131
    # https://fixupx.com/taylorswift13/status/1794805688696275131
    # https://fxtwitter.com/taylorswift13/status/1794805688696275131
    if matched := re.search(r"(https?://)?(twitter|x|fxtwitter|fixupx|vxtwitter)\.com/(\w+)/status/(\d+)", text):
        handle = matched.group(3)
        post_id = matched.group(4)
        url = f"https://x.com/{handle}/status/{post_id}"
        return {"platform": "x", "handle": handle, "post_id": post_id, "url": url, "db_key": bare_url(url)}

    # weibo video first, then weibo post
    # https://video.weibo.com/show?fid=1034:5123779299311660
    # https://h5.video.weibo.com/show/1034:5169532881535051
    if matched := re.search(r"(https?://)?(h5\.)?video\.weibo\.(com|cn)/show(\?fid=|/)(\d+):(\d+)", text):
        return {"post_id": f"weibovideo{matched.group(5)}:{matched.group(6)}", "url": https_url(matched.group(0)), "db_key": bare_url(matched.group(0)), "platform": "weibo"}
    # https://weibo.com/tv/show/1034:5123779299311660?from=old_pc_videoshow
    if matched := re.search(r"(https?://)?(www\.)?weibo\.(com|cn)/tv/show/(\d+):(\d+)", text):
        return {"post_id": f"weibovideo{matched.group(4)}:{matched.group(5)}", "url": https_url(matched.group(0)), "db_key": bare_url(matched.group(0)), "platform": "weibo"}
    # https://m.weibo.cn/detail/5113333048938691
    # https://m.weibo.cn/status/5113333048938691
    if matched := re.search(r"(https?://)?m\.weibo\.cn/(detail|status)/(\w+)", text):
        return {"post_id": matched.group(3), "url": https_url(matched.group(0)), "db_key": f"m.weibo.cn/detail/{matched.group(3)}", "platform": "weibo"}
    # https://weibo.com/1736562685/P6lhSjRnI
    if matched := re.search(r"(https?://)?(www\.)?weibo\.com/(.*?)/(\w+)", text):
        return {"post_id": matched.group(4), "url": https_url(matched.group(0)), "db_key": f"m.weibo.cn/detail/{matched.group(4)}", "platform": "weibo"}

    # http://xhslink.com/a/Z3VPXAReU1Y1
    xhs_pattern = r"(https?://)?xhslink\.com/(\w?/?)([^,,.。?\s]+)"
    if matched := re.search(xhs_pattern, text):
        transport = AsyncCurlTransport(proxy=PROXY.XHS, impersonate="safari_ios", default_headers=True, curl_options={CurlOpt.FRESH_CONNECT: True})
        flatten = await flatten_rediercts(https_url(matched.group(0)), transport=transport, pattern=xhs_pattern, proxy=PROXY.XHS, method="GET")
        base_url = flatten.split("?")[0]
        post_id = Path(base_url).stem
        queries = parse_qs(urlparse(flatten).query)
        xsec = queries.get("xsec_token", [""])[0]
        return {"url": https_url(matched.group(0)), "db_key": f"www.xiaohongshu.com/explore/{post_id}", "xsec": xsec, "is_xhs_link": True, "platform": "xiaohongshu"}
    # https://www.xiaohongshu.com/explore/671a3dfe00000000240161db?xsec_token=ABY-b1JKuAlIm2dX1OSdIFHD7cQFHEdThv5aMyccvmbJo=
    if matched := re.search(r"(https?://)?(www\.)?xiaohongshu\.com/([^。,,\s]+)", text):
        base_url = matched.group(0).split("?")[0]
        post_id = Path(base_url).stem
        queries = parse_qs(urlparse(matched.group(0)).query)
        xsec = queries.get("xsec_token", [""])[0]
        return {
            "url": f"https://www.xiaohongshu.com/explore/{post_id}?xsec_token={xsec}",
            "db_key": f"www.xiaohongshu.com/explore/{post_id}",
            "is_xhs_link": False,
            "xsec": xsec,
            "platform": "xiaohongshu",
        }

    # https://www.bilibili.com/video/BV1RSsNzDEQb
    # https://www.bilibili.com/video/av115402113881975
    # https://www.bilibili.com/BV1RSsNzDEQb
    # https://www.bilibili.com/av115402113881975
    # the optional trailing query is captured so that the ?p=<page> lookup below can actually see it
    if matched := re.search(r"(https?://)?(m\.|www\.)?bilibili\.com/(video/)?(?P<prefix>[aAbB][vV])(?P<id>[a-zA-Z0-9]+)(\?[^\s。,,]*)?", text):
        base_url = matched.group(0).split("?")[0]
        bvid = Path(base_url).stem
        queries = parse_qs(urlparse(matched.group(0)).query)
        pid = queries.get("p", ["1"])[0]
        url = f"https://www.bilibili.com/video/{av2bv(bvid)}?p={pid}".removesuffix("?p=1")
        return {"url": url, "db_key": bare_url(url), "bvid": av2bv(bvid), "pid": pid, "platform": "bilibili"}

    # https://m.bilibili.com/opus/1048442220384878593
    if matched := re.search(r"(https?://)?(m\.|www\.)?bilibili\.com/opus/(\d+)", text):
        post_id = matched.group(3)
        url = f"https://www.bilibili.com/opus/{post_id}"
        return {"url": url, "db_key": url, "post_id": post_id, "platform": "bilibili-opus"}

    # https://github.com/user-name/repo
    # https://github.com/user-name/repo/issues/123
    # https://github.com/user-name/repo/issues/123#issuecomment-45678
    # https://github.com/user-name/repo/pull/123
    # https://github.com/user-name/repo/pull/123#issuecomment-45678
    if matched := re.search(r"(https?://)?github\.com/([a-zA-Z0-9]+(-[a-zA-Z0-9]+)*)/([.a-zA-Z0-9_-]+)/?([#/a-zA-Z0-9_-]+)?", text):
        gh_user = matched.group(2)
        gh_repo = matched.group(4)
        query = matched.group(5) or ""
        url = matched.group(0)
        return {"url": https_url(url), "db_key": bare_url(url), "gh_user": gh_user, "gh_repo": gh_repo, "query": query, "platform": "github"}

    # https://www.v2ex.com/t/1153086
    if matched := re.search(r"(https?://)?(www\.)?v2ex\.com/t/(\d+)", text):
        topic_id = matched.group(3)
        url = f"https://www.v2ex.com/t/{topic_id}"
        return {"url": url, "db_key": bare_url(url), "topic_id": topic_id, "platform": "v2ex"}

    # https://open.spotify.com/track/0cOMncRq4cmDLO4tPQnkBF
    if matched := re.search(r"(https?://)?open\.spotify\.com/(track|album|artist|playlist)/([a-zA-Z0-9]+)", text):
        resource = matched.group(2)
        spotify_id = matched.group(3)
        url = matched.group(0)
        return {"url": url, "db_key": bare_url(url), "resource": resource, "spotify_id": spotify_id, "platform": "spotify"}

    # https://music.163.com/song?id=2021343740
    # https://163cn.tv/HYHqZ6R
    # https://163cn.link/HYHqZ6R
    if matched := re.search(r"(https?://)?(music\.163\.com|163cn\.tv|163cn\.link)/([0-9a-zA-Z#./?=_\-%&]+)", text):
        url = matched.group(0)
        return {"url": url, "db_key": bare_url(url), "platform": "music163"}

    # https://www.youtube.com/watch?v=D6aE2E0RHTc
    if matched := re.search(r"(https?://)?(m\.|www\.)?youtube\.com/watch.*?v=([a-zA-Z0-9_-]{11})", text):
        vid = matched.group(3)
        return {"url": f"https://www.youtube.com/watch?v={vid}", "db_key": f"www.youtube.com/watch?v={vid}", "vid": vid, "platform": "youtube"}
    # https://youtube.com/shorts/lFKHbluAlJw
    if matched := re.search(r"(https?://)?(m\.|www\.)?youtube\.com/(shorts|live)/([a-zA-Z0-9_-]{11})", text):
        vid = matched.group(4)
        return {"url": f"https://www.youtube.com/watch?v={vid}", "db_key": f"www.youtube.com/watch?v={vid}", "vid": vid, "platform": "youtube"}
    # https://youtu.be/vOiP3kfFlrE
    if matched := re.search(r"(https?://)?(m\.|www\.)?youtu\.be/([a-zA-Z0-9_-]{11})", text):
        vid = matched.group(3)
        return {"url": f"https://www.youtube.com/watch?v={vid}", "db_key": f"www.youtube.com/watch?v={vid}", "vid": vid, "platform": "youtube"}

    # https://mp.weixin.qq.com/s/bd_giuPEyPBu9LTOtC2VHw
    # https://mp.weixin.qq.com/s?__biz=MzI5Njc4NTYyOQ==&mid=2247494800&idx=1&sn=43a5732bd3a205d4dbdcd523afc0ca4a&sharer_shareinfo=1923203fd24bfa47c5b36b690026f5c8&sharer_shareinfo_first=8814eca80b4a37d10aa9b725e61f9486
    if matched := re.search(r"(https?://)?mp\.weixin\.qq\.com/s[/?]([_A-Za-z=&0-9#\-]+)", text):
        return {"url": matched.group(0), "db_key": bare_url(matched.group(0)), "platform": "wechat"}

    # !Put this before all other reddit rules
    # https://www.reddit.com/r/China_irl/s/bA50WleCBM
    reddit_pattern = r"(https?://)?(m\.|www\.)reddit\.com/r/\w+/s/([^.。,,?&/\s]+)"
    if matched := re.search(reddit_pattern, text):
        text = await flatten_rediercts(https_url(matched.group(0)), pattern=reddit_pattern, proxy=PROXY.REDDIT)
    # https://www.reddit.com/r/DoubanGoosegroup/comments/1jkpgvp/%E8%B5%B5%E8%96%87%E4%BB%80%E4%B9%88%E6%97%B6%E5%80%99%E5%9B%9E%E6%9D%A5/
    # https://www.reddit.com/r/DoubanGoosegroup/comments/1jkpgvp/赵薇什么时候回来
    # https://www.reddit.com/r/DoubanGoosegroup/comments/1jkpgvp/comment/mk43l4t/?utm_source=share&utm_medium=web3x&utm_name=web3xcss&utm_term=1&utm_content=share_button
    if matched := re.search(r"(https?://)?(m\.|www\.)?reddit\.com/r/([_A-Za-z0-9]+)/comments/(.*?)/([^,,.。?\s]+)", text):
        return {"url": matched.group(0).rstrip("/"), "db_key": bare_url(matched.group(0).rstrip("/")), "platform": "reddit"}
    # https://reddit.com/comments/1kaazzn
    if matched := re.search(r"(https?://)?(m\.|www\.)?reddit\.com/comments/([_A-Za-z0-9]+)", text):
        return {"url": matched.group(0).rstrip("/"), "db_key": bare_url(matched.group(0).rstrip("/")), "platform": "reddit"}

    # https://arxiv.org/abs/2301.12345
    # https://arxiv.org/pdf/2301.12345v3
    if matched := re.search(r"(https?://)?arxiv\.org/(abs|pdf)/(\d{4}\.\d{4,5}(?:v\d+)?)", text):
        url = matched.group(0)
        arxiv_id = matched.group(3)
        if "v" not in arxiv_id:
            arxiv_id += "v1"
            url += "v1"
        return {"url": url, "arxiv_id": arxiv_id, "db_key": f"arxiv.org/abs/{arxiv_id}", "platform": "arxiv"}

    # if all of the pre-defined patterns above failed, try to match a yt-dlp supported link
    if urls := match_urls(text):
        for url in urls:
            if any(x in url.lower() for x in ["bilibili", "douyin", "instagram", "tiktok", "twitter", "weibo", "xiaohongshu", "reddit", "youtube"]):
                # handled above
                continue
            if is_supported_by_ytdlp(url):
                return {"url": url, "db_key": bare_url(url), "platform": "ytdlp"}
    return {"platform": ""}


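# An illustrative call (values follow the youtu.be rule above; unmatched text yields {"platform": ""}):
#   info = await match_social_media_link("look at this https://youtu.be/vOiP3kfFlrE")
#   # -> {"url": "https://www.youtube.com/watch?v=vOiP3kfFlrE", "db_key": "www.youtube.com/watch?v=vOiP3kfFlrE", "vid": "vOiP3kfFlrE", "platform": "youtube"}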
@cache.memoize(ttl=60)
async def flatten_rediercts(
    texts: str | None = None,
    pattern: str | None = None,
    headers: dict | None = None,
    proxy: str | None = None,
    method: str = "HEAD",
    transport: AsyncCurlTransport | AsyncHTTPTransport | None = None,
) -> str:
    """Resolve known short/redirect links found in the text and substitute the final URL back into it."""
    if not texts:
        return ""
    url = ""
    # v.douyin.com
    if matched := re.search(r"(https?://)?v\.douyin\.com/([^.。,,?&/\s]+)", texts):
        method = "GET"  # use GET for v.douyin.com
        url = matched.group(0)
    # vt.tiktok.com
    if matched := re.search(r"(https?://)?vt\.tiktok\.com/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # tiktok.com/t/
    if matched := re.search(r"(https?://)?(www\.)?tiktok\.com/t/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # facebook.com
    if matched := re.search(r"(https?://)?(m\.|www\.)facebook\.com/share/(v|r)/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # b23.tv
    if matched := re.search(r"(https?://)?b23\.tv/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # bili2233.cn
    if matched := re.search(r"(https?://)?bili2233\.cn/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # t.co
    if matched := re.search(r"(https?://)?t\.co/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # mapp.api.weibo.cn
    if matched := re.search(r"(https?://)?mapp\.api\.weibo\.cn/fx/([0-9a-zA-Z]+)\.html", texts):
        url = matched.group(0)
    # t.cn
    if matched := re.search(r"(https?://)?t\.cn/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
        method = "GET"
    # bit.ly
    if matched := re.search(r"(https?://)?bit\.ly/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # shorturl.at
    if matched := re.search(r"(https?://)?shorturl\.at/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # vertexaisearch.cloud.google.com
    if matched := re.search(r"(https?://)?vertexaisearch\.cloud\.google\.com/([0-9a-zA-Z\-_=+/]+)", texts):
        url = matched.group(0)
        proxy = PROXY.GOOGLE

    # custom pattern
    if pattern and (matched := re.search(pattern, texts)):
        url = matched.group(0)
    if not url:
        return texts
    # resolve the redirect
    redirected_url = https_url(url)
    with contextlib.suppress(Exception):
        if method == "HEAD":
            async with AsyncClient(http2=True, proxy=proxy, follow_redirects=True, transport=transport, event_hooks={"request": [log_req], "response": [log_resp]}) as hx:
                resp = await hx.head(https_url(url), headers=headers, timeout=3)
                redirected_url = str(resp.url)
        elif method == "GET":
            # follow Location headers manually, capping the hops to avoid redirect loops
            status_code = 302
            hops = 0
            while 300 <= status_code < 400 and hops < 10:
                async with AsyncClient(http2=True, proxy=proxy, follow_redirects=False, transport=transport, event_hooks={"request": [log_req], "response": [log_resp]}) as hx:
                    resp = await hx.get(redirected_url, headers=headers, timeout=3)
                    status_code = resp.status_code
                    redirected_url = resp.headers.get("Location", redirected_url)
                hops += 1
    if url != redirected_url:
        logger.info(f"Flatten redirect: {url} -> {redirected_url}")
    return texts.replace(url, redirected_url)


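# Example: a t.co link resolves via a single HEAD request, while t.cn and v.douyin.com walk
# Location headers manually with GET; text with no known short link is returned unchanged:
#   text = await flatten_rediercts("see https://t.co/Wwo3x69CQz please")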
if __name__ == "__main__":
    check_data(json.dumps({"foo": "bar", "baz": {"qux": "quux"}, "lst": ["1", "2", "3"]}), check_keys=["baz.qux"], check_kv={"foo": "bar", "baz.qux": "quux", "lst": ["1", "2", "3"]})
    # asyncio.run(match_social_media_link("https://b23.tv/3MSgT4q/", flatten_first=True))
    # print(asyncio.run(match_social_media_link("https://mp.weixin.qq.com/s/bd_giuPEyPBu9LTOtC2VHw", flatten_first=True)))
    # print(asyncio.run(match_social_media_link("https://reddit.com/comments/1kaazzn", flatten_first=True)))
    # print(asyncio.run(match_social_media_link("https://www.reddit.com/r/China_irl/s/bA50WleCBM")))
    # asyncio.run(match_social_media_link("https://www.facebook.com/share/r/19QGGp39T3/", flatten_first=True))
    # asyncio.run(match_social_media_link("https://www.douyin.com/video/7398813386827468041"))
    # asyncio.run(match_social_media_link("https://www.iesdouyin.com/share/note/7454527270925946138/"))
    # asyncio.run(match_social_media_link("https://www.instagram.com/yifaer_chen/p/DEzv9x-vzOn/"))
    # asyncio.run(flatten_rediercts("http://t.cn/A6ukIuVn"))
    # asyncio.run(flatten_rediercts("shorturl.at/fuyrt"))
    # asyncio.run(flatten_rediercts("https://b23.tv/3MSgT4q"))
    # asyncio.run(flatten_rediercts("https://v.douyin.com/CeiJfJMQG/"))
    # asyncio.run(flatten_rediercts("https://www.tiktok.com/t/ZT2mcMA7f/"))
    # asyncio.run(flatten_rediercts("https://t.co/Wwo3x69CQz"))
    # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp/issues/14463")))
    # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp/")))
    # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp")))
    # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp/issues/14404#issuecomment-3323873708")))
    # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp/pull/14467")))
    # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp/pull/14417#issuecomment-3327344721")))
    # print(asyncio.run(match_social_media_link("https://mapp.api.weibo.cn/fx/09ab955a1e0d406c9d6a74f5a2242b4a.html")))
    # print(asyncio.run(match_social_media_link("https://www.bilibili.com/video/BV1TC411J7PK")))
    # print(asyncio.run(match_social_media_link("https://www.bilibili.com/BV1TC411J7PK")))
    # print(asyncio.run(match_social_media_link("https://www.instagram.com/miyoshi.aa/p/DN5hFcUE8rS/")))
    # print(asyncio.run(match_social_media_link("https://www.youtube.com/watch?v=D6aE2E0RHTc")))
    # print(asyncio.run(match_social_media_link("https://youtube.com/shorts/lFKHbluAlJw")))
    # print(asyncio.run(match_social_media_link("https://youtu.be/vOiP3kfFlrE?si=zPd-Bt1GO03jxpI_")))
    # res = asyncio.run(hx_req("https://httpbin.org/delay/10"))
    # asyncio.run(hx_req("https://httpbin.org/get", check_kv={"url": "https://httpbin.org/get", "headers.Pragma": "no-cache1"}, max_retry=1))
    # resp = asyncio.run(hx_req("https://httpbin.org/get", check_kv={"headers": {"Accept-Language": "en-US,en;q=0.8"}}))
    # resp = asyncio.run(hx_req("https://httpbin.org/headers", headers={"referer": "https://www.xiaohongshu.com/"}))
    # print(resp)

    # asyncio.run(download_file("https://httpbin.org/image/jpeg", suffix=".jpg"))
    # asyncio.run(match_social_media_link("https://www.instagram.com/p/C7P3jN8vmEN"))