Commit 6a17b43

benny-dou <60535774+benny-dou@users.noreply.github.com>
2025-08-28 06:32:42
fix(youtube): correctly extract video IDs from various YouTube URL formats
1 parent d3a986a
Changed files (1)
src/networking.py
@@ -379,12 +379,15 @@ async def match_social_media_link(text: str, *, flatten_first: bool = True) -> d
         return {"url": url, "db_key": bare_url(url), "platform": "music163"}
 
     # https://www.youtube.com/watch?v=D6aE2E0RHTc
-    if matched := re.search(r"(https?://)?(:?m\.|www\.)?youtube\.com/watch([^,,.。\s]+)", text):
-        queries = parse_qs(urlparse(matched.group(0)).query)
-        if vid := queries.get("v", [""])[0]:
-            return {"url": f"https://www.youtube.com/watch?v={vid}", "db_key": f"www.youtube.com/watch?v={vid}", "vid": vid, "platform": "youtube"}
+    if matched := re.search(r"(https?://)?(:?m\.|www\.)?youtube\.com/watch.*?v=([a-zA-Z0-9_-]{11})", text):
+        vid = matched.group(3)
+        return {"url": f"https://www.youtube.com/watch?v={vid}", "db_key": f"www.youtube.com/watch?v={vid}", "vid": vid, "platform": "youtube"}
     # https://youtube.com/shorts/lFKHbluAlJw (the same pattern also accepts /live/<id>)
-    if matched := re.search(r"(https?://)?(:?m\.|www\.)?youtube\.com/shorts/([^,,.。?\s]+)", text):
+    if matched := re.search(r"(https?://)?(:?m\.|www\.)?youtube\.com/(:?shorts|live)/([a-zA-Z0-9_-]{11})", text):
+        vid = matched.group(4)
+        return {"url": f"https://www.youtube.com/watch?v={vid}", "db_key": f"www.youtube.com/watch?v={vid}", "vid": vid, "platform": "youtube"}
+    # https://youtu.be/vOiP3kfFlrE
+    if matched := re.search(r"(https?://)?(:?m\.|www\.)?youtu\.be/([a-zA-Z0-9_-]{11})", text):
         vid = matched.group(3)
         return {"url": f"https://www.youtube.com/watch?v={vid}", "db_key": f"www.youtube.com/watch?v={vid}", "vid": vid, "platform": "youtube"}
 
@@ -424,9 +427,6 @@ async def flatten_rediercts(texts: str | None = None, pattern: str | None = None
         return ""
 
     url = ""
-    # youtu.be
-    if matched := re.search(r"(https?://)?youtu\.be/([^.。,,?&/\s]+)", texts):
-        url = matched.group(0)
     # v.douyin.com
     if matched := re.search(r"(https?://)?v\.douyin\.com/([^.。,,?&/\s]+)", texts):
         method = "GET"  # use GET for v.douyin.com
@@ -507,11 +507,14 @@ if __name__ == "__main__":
     # asyncio.run(flatten_rediercts("https://v.douyin.com/CeiJfJMQG/"))
     # asyncio.run(flatten_rediercts("https://www.tiktok.com/t/ZT2mcMA7f/"))
     # asyncio.run(flatten_rediercts("https://t.co/Wwo3x69CQz"))
+    print(asyncio.run(match_social_media_link("https://www.youtube.com/watch?v=D6aE2E0RHTc")))
+    print(asyncio.run(match_social_media_link("https://youtube.com/shorts/lFKHbluAlJw")))
+    print(asyncio.run(match_social_media_link("https://youtu.be/vOiP3kfFlrE?si=zPd-Bt1GO03jxpI_")))
     # res = asyncio.run(hx_req("https://httpbin.org/delay/10"))
     # asyncio.run(hx_req("https://httpbin.org/get", check_kv={"url": "https://httpbin.org/get", "headers.Pragma": "no-cache1"}, max_retry=1))
     # resp = asyncio.run(hx_req("https://httpbin.org/get", check_kv={"headers": {"Accept-Language": "en-US,en;q=0.8"}}))
-    resp = asyncio.run(hx_req("https://httpbin.org/headers", headers={"referer": "https://www.xiaohongshu.com/"}))
-    print(resp)
+    # resp = asyncio.run(hx_req("https://httpbin.org/headers", headers={"referer": "https://www.xiaohongshu.com/"}))
+    # print(resp)
 
     # asyncio.run(download_file("https://httpbin.org/image/jpeg", suffix=".jpg"))
     # asyncio.run(match_social_media_link("https://www.instagram.com/p/C7P3jN8vmEN"))