Commit c94cc26

benny-dou <60535774+benny-dou@users.noreply.github.com>
2025-01-23 13:31:43
fix(xhs): prefer non-watermark images
1 parent a906056
Changed files (1)
src
src/preview/xiaohongshu.py
@@ -110,13 +110,21 @@ async def preview_xhs(client: Client, message: Message, url: str = "", db_key: s
     await save_messages(messages=sent_messages, key=db_key)
 
 
-async def get_xhs_info(url: str, retry: int = 0) -> dict:
+async def get_xhs_info(url: str, ua: str = UA.CHROME, retry: int = 0) -> dict:
     """Get xiaohongshu post info.
 
     XHS banned VPS IP, so we need to use residential proxy.
+    XHS has two different return formats based on the User-Agent.
+    Some posts can only be accessed with a mobile User-Agent. (I don't know why)
+    But images fetched with the mobile User-Agent have the XHS watermark.
+    So we prefer to use the desktop User-Agent.
     """
-    headers = {"user-agent": UA.IPHONE, "referer": "https://www.xiaohongshu.com/"}
-    if retry > 3:
+
+    def switch_ua(ua: str) -> str:
+        return UA.IPHONE if ua == UA.CHROME else UA.CHROME
+
+    headers = {"user-agent": ua, "referer": "https://www.xiaohongshu.com/"}
+    if retry > 4:
         return {}
     data = {}
     try:
@@ -127,12 +135,12 @@ async def get_xhs_info(url: str, retry: int = 0) -> dict:
         info = yaml.safe_load(script_info)
         if not info:
             retry += 1
-            logger.warning(f"XHS empty response, maybe need to adjust the proxy. Retrying: {retry} / 3")
-            return await get_xhs_info(url, retry=retry)
+            logger.warning(f"XHS empty response, maybe need to adjust the proxy. Retrying: {retry}")
+            return await get_xhs_info(url, ua=switch_ua(ua), retry=retry)
     except Exception as e:
-        logger.error(f"XHS parsing response failed: {e}, Retrying: {retry} / 3")
+        logger.error(f"XHS parsing response failed: {e}, Retrying: {retry}")
         retry += 1
-        return await get_xhs_info(url, retry=retry)
+        return await get_xhs_info(url, ua=switch_ua(ua), retry=retry)
 
     # XHS has two different return formats
     if notes := list(info.get("note", {}).get("noteDetailMap", {}).values()):
@@ -143,8 +151,8 @@ async def get_xhs_info(url: str, retry: int = 0) -> dict:
         data["note"] = note
         return data
     retry += 1
-    logger.error(f"Parsed info has no post, Retrying: {retry} / 3")
-    return await get_xhs_info(url, retry=retry)
+    logger.error(f"Parsed info has no post, Retrying: {retry}")
+    return await get_xhs_info(url, ua=switch_ua(ua), retry=retry)
 
 
 def get_xhs_comments(soup: BeautifulSoup | None) -> list[str]: