main
  1#!/usr/bin/env python
  2# -*- coding: utf-8 -*-
  3
  4import asyncio
  5import contextlib
  6import json
  7import re
  8from datetime import timedelta
  9from pathlib import Path
 10from typing import Any, Literal
 11from urllib.parse import parse_qs, urlparse
 12
 13import anyio
 14from curl_cffi.requests.impersonate import BrowserTypeLiteral
 15from glom import glom
 16from httpx import AsyncClient, AsyncHTTPTransport, HTTPStatusError, Request, RequestError, Response
 17from httpx._types import RequestContent, RequestData, RequestFiles
 18from httpx_curl_cffi import AsyncCurlTransport, CurlOpt
 19from loguru import logger
 20
 21from config import DOWNLOAD_DIR, PROXY, REQUEST_TIMEOUT, TOKEN, cache, semaphore
 22from messages.progress import modify_progress
 23from messages.utils import summay_media
 24from utils import av2bv, bare_url, check_data, https_url, is_supported_by_ytdlp, match_urls, nowdt, readable_size
 25
 26
 27# ruff: noqa: RUF001
 28async def log_req(request: Request) -> None:
 29    logger.debug(f"{request.method} {request.url} {request.headers}")
 30
 31
 32async def log_resp(response: Response) -> None:
 33    request = response.request
 34    logger.debug(f"[{response.status_code}] {request.method} {request.url}")
 35
 36
 37async def hx_req(
 38    url,
 39    method: Literal["GET", "POST", "PUT", "DELETE", "PATCH"] = "GET",
 40    *,
 41    transport: AsyncCurlTransport | AsyncHTTPTransport | None = None,
 42    headers: dict | None = None,
 43    cookies: dict | None = None,
 44    params: dict | None = None,
 45    data: RequestData | None = None,
 46    json_data: dict | None = None,
 47    content_data: RequestContent | None = None,
 48    files: RequestFiles | None = None,
 49    proxy: str | None = None,
 50    follow_redirects: bool = True,
 51    check_keys: list[str] | None = None,
 52    check_kv: dict | None = None,
 53    timeout: int = REQUEST_TIMEOUT,  # noqa: ASYNC109
 54    retry: int = 0,
 55    max_retry: int = 2,
 56    verify: bool = True,
 57    silent: bool = False,
 58    mobile: bool = False,
 59    rformat: Literal["json", "text", "content"] = "json",
 60    last_error: str = "",
 61    hx_raw: dict | None = None,
 62) -> dict[str, Any]:
 63    """Request the given URL with the given method and return the response as a dictionary.
 64
 65    Args:
 66        url (str): The URL to request.
 67        method (str): The method to use for the request.
 68        headers (dict, optional): The headers to use for the request.
 69        cookies (dict, optional): The cookies to use for the request.
 70        params (dict, optional): The parameters to use for the request.
 71        data (dict, optional): The data to POST or PUT.
 72        json_data (dict, optional): The JSON data to POST or PUT.
 73        content_data (dict, optional): The form data to POST or PUT.
 74        proxy (str, optional): The proxy to use for the request.
 75        follow_redirects (bool, optional): Whether to follow redirects.
 76        check_keys (list[str], optional): The keys to check in the response.
 77        check_kv (dict, optional): The key-value pairs to check in the response.
 78        timeout (int, optional): The timeout for the request.
 79        retry (int, optional): The number of retries for the request.
 80        verify (bool, optional): Whether to verify the SSL certificate.
 81        silent (bool, optional): Whether to suppress the logs.
 82        mobile (bool, optional): Whether to use mobile headers.
 83        rformat (str, optional): The format of the response.
 84        last_error (str, optional): Last error message.
 85        hx_raw (dict, optional): Raw HTTPX response on failure.
 86
 87    Returns:
 88        dict: {"success": bool, "data": response}
 89    """
 90    if retry > max_retry:
 91        logger.error(f"[{method}] Failed after {retry} retries: {url}")
 92        return {"hx_error": last_error, "hx_raw": hx_raw or {}}
 93    if transport is None:
 94        transport = AsyncCurlTransport(proxy=proxy, impersonate="safari_ios" if mobile else "chrome", default_headers=True, curl_options={CurlOpt.FRESH_CONNECT: True})
 95
 96    if silent:
 97        client = AsyncClient(http2=True, proxy=proxy, transport=transport, follow_redirects=follow_redirects, timeout=timeout, verify=verify)
 98    else:
 99        client = AsyncClient(
100            http2=True,
101            proxy=proxy,
102            transport=transport,
103            follow_redirects=follow_redirects,
104            timeout=timeout,
105            verify=verify,
106            event_hooks={"request": [log_req], "response": [log_resp]},
107        )
108
109    if method not in ["GET", "POST", "PUT", "DELETE", "PATCH"]:
110        error = f"Invalid method: {method}"
111        logger.error(error)
112        return {"hx_error": error}
113    try:
114        async with client:
115            if method == "GET":
116                response = await client.get(url, cookies=cookies, headers=headers, params=params)
117            elif method == "POST":
118                response = await client.post(url, cookies=cookies, headers=headers, data=data, json=json_data, files=files, content=content_data, params=params)
119            elif method == "PUT":
120                response = await client.put(url, cookies=cookies, headers=headers, data=data, json=json_data, content=content_data, files=files, params=params)
121            elif method == "PATCH":
122                response = await client.patch(url, cookies=cookies, headers=headers, data=data, json=json_data, content=content_data, files=files, params=params)
123            else:
124                response = await client.delete(url, cookies=cookies, headers=headers, params=params)
125            response.raise_for_status()
126            meta = {"headers": response.headers, "status_code": response.status_code}
127            if rformat == "content":
128                return {"content": response.content} | meta
129            resp_data = response.text
130            check_data(resp_data, check_keys=check_keys, check_kv=check_kv)
131            res = json.loads(resp_data) if rformat == "json" else {rformat: resp_data} | meta
132            if not silent:
133                logger.trace(res)
134            return res | meta if isinstance(res, dict) else res
135    except Exception as e:
136        error = f"{type(e).__name__}[{retry + 1}/{max_retry + 1}]: Failed to request {url}, {e}"
137        with contextlib.suppress(Exception):
138            hx_raw = response.json()
139        if "res" in locals():
140            error += f"\n{res}"
141        if "data" in locals():
142            error += f"\n{data}"
143        if hx_raw:
144            error += f"\n{hx_raw}"
145        elif "response" in locals():
146            error += f"\n{response.text}"
147        logger.error(error)
148        return await hx_req(url, method, headers=headers, cookies=cookies, params=params, data=data, json_data=json_data, proxy=proxy, follow_redirects=follow_redirects, check_keys=check_keys, check_kv=check_kv, timeout=timeout, retry=retry + 1, max_retry=max_retry, silent=silent, rformat=rformat, last_error=error, hx_raw=hx_raw)  # fmt: off
149
150
async def download_file(
    link: str,
    path: str | Path | None = None,
    *,
    suffix: str = "",
    skip_exist: bool = False,
    proxy: str | None = None,
    headers: dict | None = None,
    impersonate: BrowserTypeLiteral | None = "safari_ios",
    stream: bool = False,
    **kwargs,
) -> str:
    """Download a file from the given link and save it to the specified path.

    Args:
        link (str): URL to download the file.
        path (str | Path, optional): The path to save the downloaded file.
            Defaults to ``DOWNLOAD_DIR`` plus the file name taken from the URL path.
        suffix (str, optional): The suffix to append to the file name when the path
            does not already end with it. Defaults to "" (keep the name as is).
        skip_exist (bool, optional): Skip downloading if the file already exists. Defaults to False.
        proxy (str, optional): The proxy to use for the request. Defaults to ``PROXY.DOWNLOAD``.
        headers (dict, optional): The headers to use for the request.
        impersonate (str, optional): Browser to impersonate through the curl transport;
            when it is not a string the default httpx transport is used.
        stream (bool, optional): Stream the download and report progress. Defaults to False.
        **kwargs: Forwarded to ``modify_progress``.

    Returns:
        str: Posix path of the downloaded file, or "" when the link is empty or the download failed.
    """
    if not link:
        return ""
    if path is None:
        path = Path(DOWNLOAD_DIR) / Path(urlparse(link).path).name
    path = Path(path).expanduser().resolve()
    if path.suffix != suffix:
        path = path.with_suffix(f"{path.suffix}{suffix}")  # append suffix, not replace

    if path.is_file() and skip_exist:
        logger.info(f"File already exists, skipping download: {path}")
        return path.as_posix()
    path.parent.mkdir(parents=True, exist_ok=True)
    proxy = proxy or PROXY.DOWNLOAD
    logger.trace(f"Downloading {link} to {path} with proxy={proxy}")
    hx = AsyncClient(
        headers=headers,
        transport=AsyncCurlTransport(proxy=proxy, impersonate=impersonate, default_headers=True, curl_options={CurlOpt.FRESH_CONNECT: True}) if isinstance(impersonate, str) else None,
        proxy=proxy,
        timeout=REQUEST_TIMEOUT,
        follow_redirects=True,
        event_hooks={"request": [log_req], "response": [log_resp]},
    )
    partial = False  # True once the target file has been opened (truncated) for streaming
    try:
        if stream:  # can monitor progress, but the retry mechanism does not work
            # `hx` is entered as a context manager too, so the client is closed afterwards.
            async with semaphore, hx, hx.stream("GET", link) as response:
                # Do not save an HTTP error page as if it were the requested file.
                response.raise_for_status()
                total = int(response.headers.get("Content-Length", 0))
                async with await anyio.open_file(path, "wb") as f:
                    partial = True
                    async for chunk in response.aiter_bytes():
                        await f.write(chunk)
                        # Read the counter after the chunk was consumed so the progress is not one chunk behind.
                        num_bytes_downloaded = response.num_bytes_downloaded
                        msg = f"⏬下载中: {readable_size(num_bytes_downloaded)} / {readable_size(total)}\n💾{path.name}"
                        msg += f" ({num_bytes_downloaded / total:.2%})" if total > 0 else ""
                        await modify_progress(text=msg, **kwargs)
        else:
            async with semaphore, hx:
                response = await hx.get(link)
                response.raise_for_status()
                path.write_bytes(response.content)  # Save the file to disk
    except (RequestError, HTTPStatusError) as e:
        if partial:
            # Remove the truncated file so a later `skip_exist` call does not treat it as complete.
            path.unlink(missing_ok=True)
        error = f"Failed to download: {e}"
        logger.error(error)
        await modify_progress(text=error, **kwargs)
        return ""
    if path.is_file():
        logger.info(f"Downloaded file saved to {path}")
        await modify_progress(text=f"🎉下载成功\n{path.name}", **kwargs)
        return path.as_posix()
    return ""
226
227
async def download_first_success_urls(links: list[str], **kwargs) -> str:
    """Try the links in order and return the path of the first file that downloads successfully.

    Note: At most one file is downloaded; the remaining links are ignored after
    the first success. Extra keyword arguments are forwarded to ``download_file``.

    Returns:
        str: Path of the downloaded file, or "" when there are no links or every link failed.
    """
    for candidate in links or ():
        saved = await download_file(candidate, **kwargs)
        if Path(saved).is_file():
            return saved
    return ""
240
241
async def download_media(media: list[dict], **kwargs) -> list[dict]:
    """Run the pending download tasks stored in the media items and keep the successful items.

    Each item may hold an awaitable under "photo", "video", "livephoto" and/or
    "audio". All awaitables are run concurrently; on success the awaitable is
    replaced by its result (a "livephoto" result is stored under "video").

    Args:
        media (list[dict]): Media items holding download awaitables.
        **kwargs: Forwarded to ``modify_progress``.

    Returns:
        list[dict]: The items (mutated in place) for which at least one download
            succeeded, each listed once, in task order. Keys whose download failed
            are left untouched.
    """
    if not media:
        return []
    # media key holding the awaitable -> key the downloaded result is stored under
    result_keys = {"photo": "photo", "video": "video", "livephoto": "video", "audio": "audio"}
    tasks = []
    owners: list[tuple[dict, str]] = []  # (item, destination key), aligned with `tasks`
    for item in media:
        for source_key, dest_key in result_keys.items():
            if task := item.get(source_key):  # async function
                tasks.append(task)
                owners.append((item, dest_key))
    # run all tasks
    results = await asyncio.gather(*tasks, return_exceptions=True)
    final_media: list[dict] = []
    # Pair every result with the task that produced it (not with `media`): an item
    # may contribute zero or several tasks, so the two lists can differ in length.
    for (item, dest_key), result in zip(owners, results, strict=True):
        if isinstance(result, BaseException):  # also covers a cancelled task
            logger.error(f"Failed to download: {result}")
        elif isinstance(result, str) and not Path(result).is_file():
            logger.error(f"Downloaded file is not exists: {result}")
        else:
            item[dest_key] = result
            if not any(item is kept for kept in final_media):
                final_media.append(item)
            logger.success(f"Downloaded: {result}")
    await modify_progress(text=f"✅下载成功:\n{summay_media(final_media)}", **kwargs)
    return final_media
279
280
@cache.memoize(ttl=60)
async def match_social_media_link(text: str, *, flatten_first: bool = True) -> dict:
    """Matches social media links in the given text and returns a dictionary with the matched information.

    The patterns are tried in a fixed order and the first one that matches wins,
    so the order of the checks below matters (e.g. weibo video before weibo post).

    Args:
        text (str): The text to search for social media links.
        flatten_first (bool, optional): Resolve known short links with
            ``flatten_rediercts`` before matching. Defaults to True.

    Returns:
        dict: A dictionary containing the matched information. At least "platform", "url", and "db_key" keys are present.
                platform: The social media platform name.
                url: The matched URL.
                db_key: The key to store in the cache.
            When nothing matches, ``{"platform": ""}`` is returned.

    Note:
        Several patterns contain ``(:?...)``, which is a *capturing* group (an
        optional colon followed by alternatives), not the non-capturing ``(?:...)``.
        The ``matched.group(N)`` indices below rely on these groups being counted.
    #! TODO: Handle multiple links in one message.
    """
    text = str(text)
    if flatten_first:
        text = await flatten_rediercts(text)
    # https://www.douyin.com/video/7398813386827468041
    # https://www.douyin.com/note/7458195074434846004
    # https://www.iesdouyin.com/share/video/7454527270925946138/
    # https://www.iesdouyin.com/share/note/7454527270925946138/
    if matched := re.search(r"(https?://)?(www\.)?(ies)?douyin\.com/(share/)?(:?|video|note)/(\d+)", text):
        return {"url": f"https://www.douyin.com/{matched.group(5)}/{matched.group(6)}", "db_key": f"www.douyin.com/{matched.group(5)}/{matched.group(6)}", "platform": "douyin"}
    # https://www.douyin.com/user/MS4wLjABAAAAXgBuOEcyavDhrRBqnD8x7d4pj7RIL5QFRlLehCnem8couoAg8yXR-MGhUK0i4riF?modal_id=7451543857952492810
    # https://www.douyin.com/discover?modal_id=7472757663609179430
    if matched := re.search(r"(https?://)?(www\.)?douyin\.com/(.*?)\?(.*?)modal_id=(\d+)", text):
        return {"url": f"https://www.douyin.com/video/{matched.group(5)}", "db_key": f"www.douyin.com/video/{matched.group(5)}", "platform": "douyin"}
    # https://www.douyin.com/?previous_page=oversea_share_link&vid=7483851761246031115
    if matched := re.search(r"(https?://)?(www\.)?douyin\.com/(.*?)\?(.*?)vid=(\d+)", text):
        return {"url": f"https://www.douyin.com/video/{matched.group(5)}", "db_key": f"www.douyin.com/video/{matched.group(5)}", "platform": "douyin"}
    # https://www.tiktok.com/@baymermel/video/7460653893941267755\?_t\=ZS-8t8YbVWqv5k\&_r\=1
    if matched := re.search(r"(https?://)?(www\.)?tiktok\.com/(.*?)/(\d+)", text):
        return {"url": https_url(matched.group(0)), "db_key": bare_url(matched.group(0)), "platform": "tiktok"}

    # https://www.instagram.com/p/C7P3jN8vmEN
    # https://www.instagram.com/reel/DBBEGXpvwNF
    if matched := re.search(r"(https?://)?(www\.)?instagram\.com/(:?|p|reel)/([^.。,,/\s]+)", text):
        return {"post_type": matched.group(3), "post_id": matched.group(4), "url": https_url(matched.group(0)), "db_key": bare_url(matched.group(0)), "platform": "instagram"}
    # https://www.instagram.com/yifaer_chen/p/DEzv9x-vzOn/
    if matched := re.search(r"(https?://)?(www\.)?instagram\.com/[a-zA-Z0-9_.]+/(:?|p|reel)/([^.。,,/\s]+)", text):
        return {"post_type": matched.group(3), "post_id": matched.group(4), "url": https_url(matched.group(0)), "db_key": bare_url(matched.group(0)), "platform": "instagram"}
    # https://www.instagram.com/stories/laufey/3891120377355460527
    if matched := re.search(r"(https?://)?(www\.)?instagram\.com/stories/([a-zA-Z0-9_.]+)/(\d+)", text):
        return {"post_type": "story", "post_id": matched.group(4), "username": matched.group(3), "url": https_url(matched.group(0)), "db_key": bare_url(matched.group(0)), "platform": "instagram"}

    # https://x.com/taylorswift13/status/1794805688696275131
    # https://twitter.com/taylorswift13/status/1794805688696275131
    # https://fixupx.com/taylorswift13/status/1794805688696275131
    # https://fxtwitter.com/taylorswift13/status/1794805688696275131
    if matched := re.search(r"(https?://)?(:?twitter|x|fxtwitter|fixupx|vxtwitter)\.com\/(\w+)\/status/(\d+)", text):
        handle = matched.group(3)
        post_id = matched.group(4)
        url = f"https://x.com/{handle}/status/{post_id}"
        return {"platform": "x", "handle": handle, "post_id": post_id, "url": url, "db_key": bare_url(url)}

    # weibo video first, then weibo post
    # https://video.weibo.com/show?fid=1034:5123779299311660
    # https://h5.video.weibo.com/show/1034:5169532881535051
    if matched := re.search(r"(https?://)?(h5\.)?video\.weibo\.(:?com|cn)/show(\?fid=|\/)(\d+):(\d+)", text):
        return {"post_id": f"weibovideo{matched.group(5)}:{matched.group(6)}", "url": https_url(matched.group(0)), "db_key": bare_url(matched.group(0)), "platform": "weibo"}
    # https://weibo.com/tv/show/1034:5123779299311660?from=old_pc_videoshow
    if matched := re.search(r"(https?://)?(www\.)?weibo\.(:?com|cn)/tv/show/(\d+):(\d+)", text):
        return {"post_id": f"weibovideo{matched.group(4)}:{matched.group(5)}", "url": https_url(matched.group(0)), "db_key": bare_url(matched.group(0)), "platform": "weibo"}
    # https://m.weibo.cn/detail/5113333048938691
    # https://m.weibo.cn/status/5113333048938691
    if matched := re.search(r"(https?://)?m\.weibo\.cn/(:?detail|status)/(\w+)", text):
        return {"post_id": matched.group(3), "url": https_url(matched.group(0)), "db_key": f"m.weibo.cn/detail/{matched.group(3)}", "platform": "weibo"}
    # https://weibo.com/1736562685/P6lhSjRnI
    if matched := re.search(r"(https?://)?(www\.)?weibo\.com/(.*?)/(\w+)", text):
        return {"post_id": matched.group(4), "url": https_url(matched.group(0)), "db_key": f"m.weibo.cn/detail/{matched.group(4)}", "platform": "weibo"}

    # http://xhslink.com/a/Z3VPXAReU1Y1
    xhs_pattern = r"(https?://)?xhslink\.com/(\w?/?)([^,,.。?\s]+)"
    if matched := re.search(xhs_pattern, text):
        # Resolve the short link to read the post id and xsec_token from the final URL.
        transport = AsyncCurlTransport(proxy=PROXY.XHS, impersonate="safari_ios", default_headers=True, curl_options={CurlOpt.FRESH_CONNECT: True})
        flatten = await flatten_rediercts(https_url(matched.group(0)), transport=transport, pattern=xhs_pattern, proxy=PROXY.XHS, method="GET")
        base_url = flatten.split("?")[0]
        post_id = Path(base_url).stem
        queries = parse_qs(urlparse(flatten).query)
        xsec = queries.get("xsec_token", [""])[0]
        return {"url": https_url(matched.group(0)), "db_key": f"www.xiaohongshu.com/explore/{post_id}", "xsec": xsec, "is_xhs_link": True, "platform": "xiaohongshu"}
    # https://www.xiaohongshu.com/explore/671a3dfe00000000240161db?xsec_token=ABY-b1JKuAlIm2dX1OSdIFHD7cQFHEdThv5aMyccvmbJo=
    if matched := re.search(r"(https?://)?(www\.)?xiaohongshu\.com/([^。,,\s]+)", text):
        base_url = matched.group(0).split("?")[0]
        post_id = Path(base_url).stem
        queries = parse_qs(urlparse(matched.group(0)).query)
        xsec = queries.get("xsec_token", [""])[0]
        return {
            "url": f"https://www.xiaohongshu.com/explore/{post_id}?xsec_token={xsec}",
            "db_key": f"www.xiaohongshu.com/explore/{post_id}",
            "is_xhs_link": False,
            "xsec": xsec,
            "platform": "xiaohongshu",
        }

    # https://www.bilibili.com/video/BV1RSsNzDEQb
    # https://www.bilibili.com/video/av115402113881975
    # https://www.bilibili.com/BV1RSsNzDEQb
    # https://www.bilibili.com/av115402113881975
    # https://www.bilibili.com/video/BV1RSsNzDEQb/?p=2
    # The optional trailing group captures the query string; without it the match
    # always ended at the video id and the `p` (part) parameter was never seen.
    if matched := re.search(r"(https?://)?(:?m\.|www\.)?bilibili\.com/(video/)?(?P<prefix>[aAbB][vV])(?P<id>[a-zA-Z0-9]+)(?:/?\?(?P<query>[^\s。,,]*))?", text):
        bvid = matched["prefix"] + matched["id"]
        queries = parse_qs(matched["query"] or "")
        pid = queries.get("p", ["1"])[0]
        if not pid.isdigit():  # ignore a malformed part number instead of putting it into the URL
            pid = "1"
        url = f"https://www.bilibili.com/video/{av2bv(bvid)}?p={pid}".removesuffix("?p=1")
        return {"url": url, "db_key": bare_url(url), "bvid": av2bv(bvid), "pid": pid, "platform": "bilibili"}

    # https://www.bilibili.com/list/watchlater/?bvid=BV1wi421f71U&oid=1451459580
    if matched := re.search(r"(https?://)?(:?m\.|www\.)?bilibili\.com/(.*?)bvid=(?P<prefix>[aAbB][vV])(?P<id>[a-zA-Z0-9]+)", text):
        bvid = matched.group(4) + matched.group(5)
        url = f"https://www.bilibili.com/video/{av2bv(bvid)}"
        # NOTE(review): "pid" is an int here but a str in the branch above.
        return {"url": url, "db_key": bare_url(url), "bvid": av2bv(bvid), "pid": 1, "platform": "bilibili"}

    # https://m.bilibili.com/opus/1048442220384878593
    if matched := re.search(r"(https?://)?(:?m\.|www\.)?bilibili\.com/opus/(\d+)", text):
        post_id = matched.group(3)
        url = f"https://www.bilibili.com/opus/{post_id}"
        return {"url": url, "db_key": url, "post_id": post_id, "platform": "bilibili-opus"}

    # https://github.com/user-name/repo
    # https://github.com/user-name/repo/issues/123
    # https://github.com/user-name/repo/issues/123#issuecomment-45678
    # https://github.com/user-name/repo/pull/123
    # https://github.com/user-name/repo/pull/123#issuecomment-45678
    if matched := re.search(r"(https?://)?github\.com/([a-zA-Z0-9]+(-[a-zA-Z0-9]+)*)/([.a-zA-Z0-9_-]+)/?([#/a-zA-Z0-9_-]+)?", text):
        gh_user = matched.group(2)
        gh_repo = matched.group(4)
        query = matched.group(5) or ""
        url = matched.group(0)
        return {"url": https_url(url), "db_key": bare_url(url), "gh_user": gh_user, "gh_repo": gh_repo, "query": query, "platform": "github"}

    # https://www.v2ex.com/t/1153086
    if matched := re.search(r"(https?://)?(www\.)?v2ex\.com/t/(\d+)", text):
        topic_id = matched.group(3)
        url = f"https://www.v2ex.com/t/{topic_id}"
        return {"url": url, "db_key": bare_url(url), "topic_id": topic_id, "platform": "v2ex"}

    # https://open.spotify.com/track/0cOMncRq4cmDLO4tPQnkBF
    if matched := re.search(r"(https?://)?open\.spotify\.com/(:?track|album|artist|playlist)/([a-zA-Z0-9]+)", text):
        resource = matched.group(2)
        spotify_id = matched.group(3)
        url = matched.group(0)
        return {"url": url, "db_key": bare_url(url), "resource": resource, "spotify_id": spotify_id, "platform": "spotify"}

    # https://music.163.com/song?id=2021343740
    # https://163cn.tv/HYHqZ6R
    # https://163cn.link/HYHqZ6R
    if matched := re.search(r"(https?://)?(:?music\.163\.com|163cn\.tv|163cn\.link)/([0-9a-zA-Z#./?=_\-%&]+)", text):
        url = matched.group(0)
        return {"url": url, "db_key": bare_url(url), "platform": "music163"}

    # https://www.youtube.com/watch?v=D6aE2E0RHTc
    if matched := re.search(r"(https?://)?(:?m\.|www\.)?youtube\.com/watch.*?v=([a-zA-Z0-9_-]{11})", text):
        vid = matched.group(3)
        return {"url": f"https://www.youtube.com/watch?v={vid}", "db_key": f"www.youtube.com/watch?v={vid}", "vid": vid, "platform": "youtube"}
    # https://youtube.com/shorts/lFKHbluAlJw
    if matched := re.search(r"(https?://)?(:?m\.|www\.)?youtube\.com/(:?shorts|live)/([a-zA-Z0-9_-]{11})", text):
        vid = matched.group(4)
        return {"url": f"https://www.youtube.com/watch?v={vid}", "db_key": f"www.youtube.com/watch?v={vid}", "vid": vid, "platform": "youtube"}
    # https://youtu.be/vOiP3kfFlrE
    if matched := re.search(r"(https?://)?(:?m\.|www\.)?youtu\.be/([a-zA-Z0-9_-]{11})", text):
        vid = matched.group(3)
        return {"url": f"https://www.youtube.com/watch?v={vid}", "db_key": f"www.youtube.com/watch?v={vid}", "vid": vid, "platform": "youtube"}

    # https://mp.weixin.qq.com/s/bd_giuPEyPBu9LTOtC2VHw
    # https://mp.weixin.qq.com/s?__biz=MzI5Njc4NTYyOQ==&mid=2247494800&idx=1&sn=43a5732bd3a205d4dbdcd523afc0ca4a&sharer_shareinfo=1923203fd24bfa47c5b36b690026f5c8&sharer_shareinfo_first=8814eca80b4a37d10aa9b725e61f9486
    if matched := re.search(r"(https?://)?mp.weixin.qq.com/s[\/|\?]{1}([_A-Za-z\=\&0-9\#\-]+)", text):
        return {"url": matched.group(0), "db_key": bare_url(matched.group(0)), "platform": "wechat"}

    # !Put this before all reddit rules
    # https://www.reddit.com/r/China_irl/s/bA50WleCBM
    reddit_pattern = r"(https?://)?(:?m\.|www\.)reddit\.com/r/\w+/s/([^.。,,?&/\s]+)"
    if matched := re.search(reddit_pattern, text):
        # Share links are resolved first; the resolved text is then matched by the rules below.
        text = await flatten_rediercts(https_url(matched.group(0)), pattern=reddit_pattern, proxy=PROXY.REDDIT)
    # https://www.reddit.com/r/DoubanGoosegroup/comments/1jkpgvp/%E8%B5%B5%E8%96%87%E4%BB%80%E4%B9%88%E6%97%B6%E5%80%99%E5%9B%9E%E6%9D%A5/
    # https://www.reddit.com/r/DoubanGoosegroup/comments/1jkpgvp/赵薇什么时候回来
    # https://www.reddit.com/r/DoubanGoosegroup/comments/1jkpgvp/comment/mk43l4t/?utm_source=share&utm_medium=web3x&utm_name=web3xcss&utm_term=1&utm_content=share_button
    if matched := re.search(r"(https?://)?(:?m\.|www\.)?reddit\.com/r/([_A-Za-z0-9]+)/comments/(.*?)/([^,,.。\?\s]+)", text):
        return {"url": matched.group(0).rstrip("/"), "db_key": bare_url(matched.group(0).rstrip("/")), "platform": "reddit"}
    # https://reddit.com/comments/1kaazzn
    if matched := re.search(r"(https?://)?(:?m\.|www\.)?reddit\.com/comments/([_A-Za-z0-9]+)", text):
        return {"url": matched.group(0).rstrip("/"), "db_key": bare_url(matched.group(0).rstrip("/")), "platform": "reddit"}

    # https://arxiv.org/abs/2301.12345
    # https://arxiv.org/pdf/2301.12345v3
    if matched := re.search(r"(https?://)?arxiv\.org/(abs|pdf)/(\d{4}\.\d{4,5}(?:v\d+)?)", text):
        arxiv_id = matched.group(3)
        if "v" not in arxiv_id:
            arxiv_id += "v1"
        return {"url": f"https://arxiv.org/abs/{arxiv_id}", "arxiv_id": arxiv_id, "db_key": f"arxiv.org/abs/{arxiv_id}", "platform": "arxiv"}

    # if all above pre-defined patterns failed, try to match ytdlp link
    if urls := match_urls(text):
        for url in urls:
            if any(x in url.lower() for x in ["bilibili", "douyin", "instagram", "tiktok", "twitter", "weibo", "xiaohongshu", "reddit", "youtube"]):
                # handled above
                continue
            if is_supported_by_ytdlp(url):
                return {"url": url, "db_key": bare_url(url), "platform": "ytdlp"}
    return {"platform": ""}
481
482
@cache.memoize(ttl=60)
async def flatten_rediercts(
    texts: str | None = None,
    pattern: str | None = None,
    headers: dict | None = None,
    proxy: str | None = None,
    method: str = "HEAD",
    transport: AsyncCurlTransport | AsyncHTTPTransport | None = None,
) -> str:
    """Resolve a known short link found in the text and substitute its final URL.

    The text is searched for the built-in short-link patterns and then for the
    optional custom ``pattern``. When several patterns match, the last matching
    one wins (the custom pattern is checked last). Resolving is best effort: on
    any error the last URL that was resolved so far is used.

    Args:
        texts (str, optional): The text containing the link.
        pattern (str, optional): Extra regex whose whole match is treated as a short link.
        headers (dict, optional): The headers to use for the requests.
        proxy (str, optional): The proxy to use for the requests
            (replaced by ``PROXY.GOOGLE`` for vertexaisearch links).
        method (str, optional): "HEAD" follows redirects automatically; "GET" follows the
            ``Location`` headers hop by hop. Forced to "GET" for v.douyin.com and t.cn links.
        transport (optional): The httpx transport to use for the requests.

    Returns:
        str: "" for empty input, the unchanged text when no short link is found,
            otherwise the text with the short link replaced by the resolved URL.
    """
    from urllib.parse import urljoin  # local import: only needed to resolve relative Location headers

    if not texts:
        return ""
    url = ""
    # v.douyin.com
    if matched := re.search(r"(https?://)?v\.douyin\.com/([^.。,,?&/\s]+)", texts):
        method = "GET"  # use GET for v.douyin.com
        url = matched.group(0)
    # vt.tiktok.com
    if matched := re.search(r"(https?://)?vt\.tiktok\.com/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # tiktok.com/t/
    if matched := re.search(r"(https?://)?(www\.)?tiktok\.com/t/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # facebook.com/
    if matched := re.search(r"(https?://)?(:?m\.|www\.)facebook\.com/share/(v|r)/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # b23.tv
    if matched := re.search(r"(https?://)?b23\.tv/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # bili2233.cn
    if matched := re.search(r"(https?://)?bili2233\.cn/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # t.co
    if matched := re.search(r"(https?://)?t\.co/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # mapp.api.weibo.cn
    if matched := re.search(r"(https?://)?mapp\.api\.weibo\.cn/fx/([0-9a-zA-Z]+)\.html", texts):
        url = matched.group(0)
    # t.cn
    if matched := re.search(r"(https?://)?t\.cn/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
        method = "GET"
    # bit.ly
    if matched := re.search(r"(https?://)?bit\.ly/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # shorturl.at
    if matched := re.search(r"(https?://)?shorturl\.at/([^.。,,?&/\s]+)", texts):
        url = matched.group(0)
    # vertexaisearch.cloud.google.com
    if matched := re.search(r"(https?://)?vertexaisearch\.cloud\.google\.com/([0-9a-zA-Z\-_=+/]+)", texts):
        url = matched.group(0)
        proxy = PROXY.GOOGLE

    # custom pattern
    if pattern and (matched := re.search(pattern, texts)):
        url = matched.group(0)
    if not url:
        return texts
    # parse redirect
    rediercted_url = https_url(url)
    hooks = {"request": [log_req], "response": [log_resp]}
    max_hops = 10  # upper bound so a redirect cycle cannot loop forever
    try:
        if method == "HEAD":
            async with AsyncClient(http2=True, proxy=proxy, follow_redirects=True, transport=transport, event_hooks=hooks) as hx:
                resp = await hx.head(rediercted_url, headers=headers, timeout=3)
                rediercted_url = str(resp.url)
        elif method == "GET":
            # One client for the whole chain: a caller-supplied transport is closed when its
            # client exits, so it must not be handed to a second client for the next hop.
            async with AsyncClient(http2=True, proxy=proxy, follow_redirects=False, transport=transport, event_hooks=hooks) as hx:
                for _ in range(max_hops):
                    resp = await hx.get(rediercted_url, headers=headers, timeout=3)
                    location = resp.headers.get("Location")
                    if not (300 <= resp.status_code < 400) or not location:
                        break
                    # Location may be relative; resolve it against the URL that was just requested.
                    rediercted_url = urljoin(str(resp.url), location)
    except Exception as e:
        # Best effort: keep whatever was resolved so far, but leave a trace of the failure.
        logger.debug(f"Failed to flatten redirect for {url}: {type(e).__name__}: {e}")
    if url != rediercted_url:
        logger.info(f"Flatten redirect: {url} -> {rediercted_url}")
    return texts.replace(url, rediercted_url)
557
558
559async def shorten_url(url: str, alias: str | None = None, services: list[str] | None = None) -> str:
560    """Shorten URL."""
561    if not url:
562        return url
563    supported = ["spoo.me", "cleanuri.com"]
564    if services is None:
565        services = supported
566    services = [x for x in services if x.lower() in supported]
567    for service in services:
568        if service == "spoo.me":
569            expire_after = nowdt() + timedelta(days=365 * 2)
570            headers = {"Content-Type": "application/json"}
571            payload = {"long_url": url, "expire_after": round(expire_after.timestamp()), "block_bots": False}
572            if TOKEN.SPOOME:
573                headers |= {"Authorization": f"Bearer {TOKEN.SPOOME}"}
574                payload |= {"private_stats": False}
575            if alias:
576                payload |= {"alias": alias}
577            resp = await hx_req("https://spoo.me/api/v1/shorten", "POST", headers=headers, json_data=payload, check_kv={"status": "ACTIVE"}, max_retry=0)
578            if glom(resp, "hx_raw.code", default="") == "conflict":
579                return f"https://spoo.me/{alias}"
580            if short_url := glom(resp, "short_url", default=""):
581                return short_url
582        if service == "cleanuri.com":
583            resp = await hx_req("https://cleanuri.com/api/v1/shorten", "POST", json_data={"url": url}, check_keys=["result_url"])
584            if short_url := glom(resp, "result_url", default=""):
585                return short_url
586    return url
587
588
if __name__ == "__main__":
    # Manual smoke checks; they run only when this file is executed as a script.
    # `asyncio` is already imported at module level, so no local import is needed.
    asyncio.run(shorten_url("https://www.google.com", alias="test"))
    check_data(json.dumps({"foo": "bar", "baz": {"qux": "quux"}, "lst": ["1", "2", "3"]}), check_keys=["baz.qux"], check_kv={"foo": "bar", "baz.qux": "quux", "lst": ["1", "2", "3"]})

    # Further ad-hoc invocations, kept commented out; uncomment one at a time to try it.
    # asyncio.run(match_social_media_link("https://b23.tv/3MSgT4q/", flatten_first=True))
    # print(asyncio.run(match_social_media_link("https://mp.weixin.qq.com/s/bd_giuPEyPBu9LTOtC2VHw", flatten_first=True)))
    # print(asyncio.run(match_social_media_link("https://reddit.com/comments/1kaazzn", flatten_first=True)))
    # print(asyncio.run(match_social_media_link("https://www.reddit.com/r/China_irl/s/bA50WleCBM")))
    # asyncio.run(match_social_media_link("https://www.facebook.com/share/r/19QGGp39T3/", flatten_first=True))
    # asyncio.run(match_social_media_link("https://www.douyin.com/video/7398813386827468041"))
    # asyncio.run(match_social_media_link("https://www.iesdouyin.com/share/note/7454527270925946138/"))
    # asyncio.run(match_social_media_link("https://www.instagram.com/yifaer_chen/p/DEzv9x-vzOn/"))
    # asyncio.run(flatten_rediercts("http://t.cn/A6ukIuVn"))
    # asyncio.run(flatten_rediercts("shorturl.at/fuyrt"))
    # asyncio.run(flatten_rediercts("https://b23.tv/3MSgT4q"))
    # asyncio.run(flatten_rediercts("https://v.douyin.com/CeiJfJMQG/"))
    # asyncio.run(flatten_rediercts("https://www.tiktok.com/t/ZT2mcMA7f/"))
    # asyncio.run(flatten_rediercts("https://t.co/Wwo3x69CQz"))
    # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp/issues/14463")))
    # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp/")))
    # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp")))
    # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp/issues/14404#issuecomment-3323873708")))
    # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp/pull/14467")))
    # print(asyncio.run(match_social_media_link("https://github.com/yt-dlp/yt-dlp/pull/14417#issuecomment-3327344721")))
    # print(asyncio.run(match_social_media_link("https://mapp.api.weibo.cn/fx/09ab955a1e0d406c9d6a74f5a2242b4a.html")))
    # print(asyncio.run(match_social_media_link("https://www.bilibili.com/video/BV1TC411J7PK")))
    # print(asyncio.run(match_social_media_link("https://www.bilibili.com/BV1TC411J7PK")))
    # print(asyncio.run(match_social_media_link("https://www.instagram.com/miyoshi.aa/p/DN5hFcUE8rS/")))
    # print(asyncio.run(match_social_media_link("https://www.youtube.com/watch?v=D6aE2E0RHTc")))
    # print(asyncio.run(match_social_media_link("https://youtube.com/shorts/lFKHbluAlJw")))
    # print(asyncio.run(match_social_media_link("https://youtu.be/vOiP3kfFlrE?si=zPd-Bt1GO03jxpI_")))
    # res = asyncio.run(hx_req("https://httpbin.org/delay/10"))
    # asyncio.run(hx_req("https://httpbin.org/get", check_kv={"url": "https://httpbin.org/get", "headers.Pragma": "no-cache1"}, max_retry=1))
    # resp = asyncio.run(hx_req("https://httpbin.org/get", check_kv={"headers": {"Accept-Language": "en-US,en;q=0.8"}}))
    # resp = asyncio.run(hx_req("https://httpbin.org/headers", headers={"referer": "https://www.xiaohongshu.com/"}))
    # print(resp)

    # asyncio.run(download_file("https://httpbin.org/image/jpeg", suffix=".jpg"))
    # asyncio.run(match_social_media_link("https://www.instagram.com/p/C7P3jN8vmEN"))