# main
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3import asyncio
4import hashlib
5import os
6from datetime import UTC, datetime
7
8from glom import Coalesce, glom
9from loguru import logger
10
11from config import cache
12from database.d1 import create_d1_table, insert_d1, query_d1
13from networking import hx_req
14
15TABLE = "readhub"
16
17
async def readhub():
    """Sync recent ReadHub tech news into the D1 `readhub` table.

    The job is a no-op when the READHUB_DISABLED env var is "1" or when
    the cache throttle key is still warm (one run per 20 minutes).
    """
    disabled = os.getenv("READHUB_DISABLED", "0") == "1"
    if disabled or cache.get("readhub"):
        return
    cache.set("readhub", 1, ttl=1200)  # throttle: at most one run every 20 minutes

    # Idempotent bootstrap of the destination table.
    await create_d1_table(
        table_name=TABLE,
        columns="uid TEXT PRIMARY KEY, title TEXT, timestamp INTEGER, url TEXT, summary TEXT",
        idx_cols=["uid", "timestamp"],
        db_name="dnkt",
        silent=True,
    )

    recent = await query_d1(
        sql=f"SELECT timestamp,uid FROM {TABLE} ORDER BY timestamp DESC LIMIT 100",
        db_name="dnkt",
        silent=True,
    )
    seen_uids = set(glom(recent, "result.**.uid", default=[]))
    if not seen_uids:
        # NOTE(review): presumably a guard against re-ingesting everything
        # when the table looks empty/unreadable — confirm intent.
        logger.error("ReadHub: No finished uids")
        return

    for page in range(1, 6):
        await sync_page(seen_uids, page=page)
36
37
async def sync_page(finished_uids: set[str], page: int = 1):
    """Fetch one page of ReadHub tech news and upsert unseen items into D1.

    Args:
        finished_uids: uids already persisted. Newly queued uids are added to
            this set so later pages in the same run skip duplicates.
        page: 1-based page index of the ReadHub news listing.
    """
    resp = await hx_req(
        f"https://api.readhub.cn/news/list?page={page}&size=20&type=8",
        headers={
            "accept": "*/*",
            "accept-language": "zh-CN,zh;q=0.9",
            "authorization": "bearer",
            "cache-control": "no-cache",
            "content-type": "application/json",
            "dnt": "1",
            "origin": "https://readhub.cn",
            "pragma": "no-cache",
            "priority": "u=1, i",
            "referer": "https://readhub.cn/",
            "sec-ch-ua": '"Google Chrome";v="143", "Chromium";v="143", "Not A(Brand";v="24"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Windows"',
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-site",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36",
        },
        check_kv={"data.pageIndex": page},
        timeout=5,
        silent=True,
    )
    items = glom(resp, "data.items", default=[])
    tasks = []
    # Reversed: process oldest-first — presumably so rows are inserted in
    # roughly chronological order (TODO confirm downstream relies on this).
    for item in items[::-1]:
        title = item.get("title", "")
        summary = item.get("summary", "")
        url = item.get("url", "")
        uid = item.get("uid", 0)
        if not url or not title:
            continue
        if not uid:
            # Derive a stable synthetic uid for items the API serves without one.
            uid = hashlib.sha256(f"{title}{url}{summary}".encode()).hexdigest()
        if uid in finished_uids:
            continue
        # Fix: record the uid immediately so the same item appearing again on a
        # later page (pagination drift between requests) is not queued twice.
        finished_uids.add(uid)
        logger.info(f"ReadHub: [{title}]({url})")
        records = {
            "uid": uid,
            "title": title,
            "timestamp": get_utc_timestamp(item),
            "url": url,
            "summary": summary,
        }
        tasks.append(query_d1(**insert_d1(TABLE, records, update_on_conflict="uid"), db_name="dnkt", silent=True))
    await asyncio.gather(*tasks)
87
88
89def get_utc_timestamp(item: dict) -> int:
90 time_str = glom(item, Coalesce("publishDate", "createdAt"), default="") # 2026-01-20T04:41:35.415Z
91 if not time_str:
92 return round(datetime.now(UTC).timestamp())
93 time_dt = datetime.fromisoformat(time_str)
94 return round(time_dt.timestamp())