main
1#!/venv/bin/python
2# -*- coding: utf-8 -*-
3import base64
4import contextlib
5import uuid
6from pathlib import Path
7from urllib.parse import urlparse
8
9import anyio
10import feedparser
11import xmltodict
12from glom import Coalesce, glom
13
14from asr.utils import audio_duration
15from config import DB, DOWNLOAD_DIR, PODCAST, PROXY, cache
16from database.alist import upload_alist
17from database.github import gh_upload_asset
18from database.r2 import set_cf_r2
19from networking import hx_req
20from podcast.utils import HEADERS, clean_feed_url, get_pubdate
21from preview.youtube import get_youtube_channel_thumb
22from utils import bare_url, convert_html, https_url, nowdt
23
24
@cache.memoize(ttl=600)
async def parse_feed(feed_url: str, *, raw_xml: bool = False) -> dict:
    """Fetch a feed URL and parse it, returning {} on any failure.

    feedparser.parse(feed_url) is deliberately avoided: it performs its own
    download with no timeout support, so we fetch via hx_req first and parse
    the text locally.
    """
    resp = await hx_req(feed_url, rformat="text", headers=HEADERS, timeout=10, silent=True, proxy=PROXY.PODCAST)
    try:
        text = resp["text"]
        if raw_xml:
            # Raw mode: preserve the document structure exactly (for item lookup).
            return xmltodict.parse(text)
        parsed = feedparser.parse(text)
    except Exception:
        # Best-effort: malformed XML / missing body degrades to an empty dict.
        return {}
    return parsed if isinstance(parsed, dict) else {}
38
39
async def get_feed_title(feed_url: str) -> str:
    """Return the feed's title, falling back to the URL's host name."""
    parsed = await parse_feed(feed_url)
    title_spec = Coalesce("feed.title", "feed.title_detail.value", "feed.itunes_title")
    title = glom(parsed, title_spec, default="")
    # Feeds without any title tag are labelled by their domain.
    return title or urlparse(feed_url).netloc
46
47
async def gen_pod_header(feed_url: str) -> dict:
    """Generate the RSS 2.0 channel header for a rebuilt podcast feed.

    Args:
        feed_url (str): original feed url

    Returns:
        dict: xmltodict-shaped document whose ``rss.channel.item`` list is
        empty and is filled later (see update_xml_desc).
    """
    now = nowdt()
    feed = await parse_feed(feed_url)
    pub_date = get_pubdate(feed)
    title = glom(feed, "feed.title", default="")
    # BUG FIX: the old code used ``glom(..., default=await get_cover(feed_url))``
    # in two places; function-call defaults are evaluated eagerly, so the
    # network-bound fallback ran unconditionally — twice — even when the feed
    # already provided an image. Resolve the cover once, lazily.
    cover = glom(feed, "feed.image.href", default="") or await get_cover(feed_url)
    return {
        "rss": {
            "@version": "2.0",
            "@xmlns:itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
            "@xmlns:atom": "http://www.w3.org/2005/Atom",
            "@xmlns:rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
            "@xmlns:podcast": "https://podcastindex.org/namespace/1.0",
            "@xmlns:content": "http://purl.org/rss/1.0/modules/content/",
            "channel": {
                # Required tags
                "atom:link": {
                    "@href": feed_url,
                    "@rel": "self",
                    "@type": "application/rss+xml",
                },
                "title": title,
                "description": glom(feed, Coalesce("feed.summary", "feed.subtitle"), default=""),
                "itunes:image": {"@href": cover},
                "language": "en-us",
                "itunes:category": {"@text": "TV & Film"},
                "itunes:explicit": "no",
                # Recommended tags
                "podcast:locked": "yes",
                "podcast:guid": gen_uuid(feed_url),
                "itunes:author": glom(feed, Coalesce("feed.author", "feed.title"), default=""),
                "link": feed_url,
                # Situational tags
                "itunes:title": title,
                "itunes:type": "Episodic",
                "itunes:block": "yes",
                # Common tags for rss
                "category": "TV & Film",
                "generator": "BennyBot",
                "lastBuildDate": f"{now:%a, %d %b %Y %H:%M:%S %z}",
                "pubDate": f"{pub_date:%a, %d %b %Y %H:%M:%S %z}",
                "image": {
                    "url": cover,
                    "title": title,
                    "link": feed_url,
                },
                "item": [],
            },
        }
    }
97
98
def gen_opml_header():
    """Return a skeleton OPML document with an empty outline list."""
    head = {"title": "Podcast"}
    body = {"outline": []}
    return {"opml": {"@version": "1.0", "head": head, "body": body}}
102
103
async def update_xml_desc(feed_url: str, processed_xml: dict, entry: dict, summary: str, audio_path: str | Path) -> dict:
    """Add AI summary to item description.

    Args:
        feed_url (str): original feed url
        processed_xml (dict): processed feed xml; if empty, a fresh header is
            generated via gen_pod_header()
        entry (dict): feed entry parsed by feedparser
        summary (str): AI summary
        audio_path (str | Path): local audio file; uploaded as the enclosure
            when the source feed item has none (e.g. a video feed)

    Returns:
        dict: processed_xml with the new/updated item inserted at position 0.
    """
    original_desc = glom(entry, Coalesce("content.0.value", "summary"), default="")
    # AI summary first, then a visual separator, then the original description.
    description = convert_html(summary) + "<p>----------------------------------</p>" + original_desc
    # try to find the item in feed_xml
    feed_xml = await parse_feed(feed_url, raw_xml=True)
    new_item = entry
    for item in glom(feed_xml, "rss.channel.item", default=[]):
        # Normalize both sides to https + cleaned form before comparing links.
        item_link = https_url(clean_feed_url(item.get("link", "")))
        if item_link == entry["link"]:
            new_item = item  # Found!
            break
    # NOTE(review): at this point new_item aliases either the caller's ``entry``
    # or an item inside parse_feed()'s memoized result; the pop/assignment below
    # mutate that shared dict in place — confirm this is intended.
    new_item.pop("content:encoded", None)  # redundant
    new_item["description"] = description
    if not glom(new_item, "enclosure.@url", default=""):  # This is a video rss feed. upload the audio to github
        # Release tag derived from the feed url; base64url keeps it filesystem/
        # tag safe, and the slice caps it at 64 chars.
        tag_name = base64.urlsafe_b64encode(feed_url.encode()).decode().rstrip("=")[-64:]
        feed_title = await get_feed_title(feed_url)
        enclosure_url = await gh_upload_asset(audio_path, tag_name=tag_name, release_name=feed_title, gh_repo=PODCAST.GH_REPO, gh_token=PODCAST.GH_TOKEN)
        # Map extension to MIME type; default to mp3 for unknown suffixes.
        mime_type = {".mp3": "audio/mpeg", ".m4a": "audio/x-m4a", ".flac": "audio/flac"}.get(Path(audio_path).suffix, "audio/mpeg")
        new_item = {
            # Required tags
            "title": entry["title"],
            "enclosure": {
                "@url": enclosure_url,
                "@length": Path(audio_path).stat().st_size,
                "@type": mime_type,
            },
            "guid": bare_url(entry["link"]),
            # Recommended tags
            "pubDate": get_pubdate(entry).strftime("%a, %d %b %Y %H:%M:%S %z"),
            "description": description,
            "itunes:duration": int(audio_duration(audio_path)),
            "link": entry["link"],
            "itunes:explicit": "false",
        }
    if not processed_xml:
        processed_xml = await gen_pod_header(feed_url)
    items = processed_xml["rss"]["channel"]["item"]
    if not isinstance(items, list):  # only one item, will be converted to list
        items = [items]
    items.insert(0, new_item)
    processed_xml["rss"]["channel"]["item"] = items
    return processed_xml
154
155
def gen_uuid(url: str):
    """Derive the Podcast Index GUID (UUIDv5) for a feed URL.

    Docs: https://github.com/Podcastindex-org/podcast-namespace/blob/main/docs/1.0.md#guid
    The value is a UUIDv5 generated from the RSS feed url with the protocol
    scheme and trailing slashes stripped off, using the shared "podcast"
    namespace UUID ead4c236-bf58-58c6-a2c6-a6b28d128cb6.

    Args:
        url (str): feed url
    """
    podcast_namespace = uuid.UUID("ead4c236-bf58-58c6-a2c6-a6b28d128cb6")
    bare = url.strip().strip("/")
    for scheme in ("http://", "https://"):
        bare = bare.removeprefix(scheme)
    return str(uuid.uuid5(podcast_namespace, bare))
170
171
async def get_cover(feed_url: str) -> str:
    """Resolve a cover image URL for the given feed."""
    yt_prefix = "https://www.youtube.com/feeds/videos.xml?channel_id="
    if feed_url.startswith(yt_prefix):
        # YouTube feeds carry no usable image tag; use the channel thumbnail.
        return await get_youtube_channel_thumb(feed_url.removeprefix(yt_prefix))
    parsed = await parse_feed(feed_url)
    fallback = "https://upload.wikimedia.org/wikipedia/commons/c/c8/Podcast_iOS.png"
    return glom(parsed, "feed.image.href", default=fallback)
179
180
async def save_xml(feed_xml: dict, save_url: str):
    """Serialize feed_xml and persist it according to PODCAST.FS_ENGINE.

    Args:
        feed_xml (dict): xmltodict-shaped rss document
        save_url (str): public url the feed is served from; used to derive the
            R2 object key or the local file name

    Notes:
        Only the newest PODCAST.KEEP_LATEST_ENTRIES items are kept.
        An unrecognized PODCAST.FS_ENGINE value is a silent no-op.
    """
    items = glom(feed_xml, "rss.channel.item", default=[])
    if items:
        # BUG FIX: xmltodict yields a bare dict (not a list) when the channel
        # has exactly one item; slicing a dict raised TypeError. Normalize to
        # a list before truncating.
        if not isinstance(items, list):
            items = [items]
        feed_xml["rss"]["channel"]["item"] = items[: PODCAST.KEEP_LATEST_ENTRIES]
    xml_str = xmltodict.unparse(feed_xml, pretty=True, full_document=False)
    if PODCAST.FS_ENGINE == "CF-R2":
        r2_key = save_url.removeprefix(DB.CF_R2_PUBLIC_URL).lstrip("/")
        await set_cf_r2(r2_key, data=xml_str, mime_type="application/xml")
        return
    if PODCAST.FS_ENGINE == "alist":
        save_path = Path(DOWNLOAD_DIR) / Path(save_url).name
        save_path.parent.mkdir(parents=True, exist_ok=True)
        async with await anyio.open_file(save_path, "w") as f:
            await f.write(xml_str)
        await upload_alist(save_path)
        # Local copy is only a staging file; remove it after upload.
        save_path.unlink(missing_ok=True)