main
  1#!/venv/bin/python
  2# -*- coding: utf-8 -*-
  3import base64
  4import contextlib
  5import uuid
  6from pathlib import Path
  7from urllib.parse import urlparse
  8
  9import anyio
 10import feedparser
 11import xmltodict
 12from glom import Coalesce, glom
 13
 14from asr.utils import audio_duration
 15from config import DB, DOWNLOAD_DIR, PODCAST, PROXY, cache
 16from database.alist import upload_alist
 17from database.github import gh_upload_asset
 18from database.r2 import set_cf_r2
 19from networking import hx_req
 20from podcast.utils import HEADERS, clean_feed_url, get_pubdate
 21from preview.youtube import get_youtube_channel_thumb
 22from utils import bare_url, convert_html, https_url, nowdt
 23
 24
 25@cache.memoize(ttl=600)
 26async def parse_feed(feed_url: str, *, raw_xml: bool = False) -> dict:
 27    """Get feed content by url.
 28
 29    DO NOT use feedparser.parse(feed_url) because it doesn't support timeout.
 30    """
 31    data = await hx_req(feed_url, rformat="text", headers=HEADERS, timeout=10, silent=True, proxy=PROXY.PODCAST)
 32    with contextlib.suppress(Exception):
 33        if raw_xml:
 34            return xmltodict.parse(data["text"])
 35        feed = feedparser.parse(data["text"])
 36        return feed if isinstance(feed, dict) else {}
 37    return {}
 38
 39
 40async def get_feed_title(feed_url: str) -> str:
 41    """Get feed title by url."""
 42    feed = await parse_feed(feed_url)
 43    if title := glom(feed, Coalesce("feed.title", "feed.title_detail.value", "feed.itunes_title"), default=""):
 44        return title
 45    return urlparse(feed_url).netloc
 46
 47
 48async def gen_pod_header(feed_url: str) -> dict:
 49    """Generate podcast header for RSS feed."""
 50    now = nowdt()
 51    feed = await parse_feed(feed_url)
 52    pub_date = get_pubdate(feed)
 53    return {
 54        "rss": {
 55            "@version": "2.0",
 56            "@xmlns:itunes": "http://www.itunes.com/dtds/podcast-1.0.dtd",
 57            "@xmlns:atom": "http://www.w3.org/2005/Atom",
 58            "@xmlns:rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
 59            "@xmlns:podcast": "https://podcastindex.org/namespace/1.0",
 60            "@xmlns:content": "http://purl.org/rss/1.0/modules/content/",
 61            "channel": {
 62                # Required tags
 63                "atom:link": {
 64                    "@href": feed_url,
 65                    "@rel": "self",
 66                    "@type": "application/rss+xml",
 67                },
 68                "title": glom(feed, "feed.title", default=""),
 69                "description": glom(feed, Coalesce("feed.summary", "feed.subtitle"), default=""),
 70                "itunes:image": {"@href": glom(feed, "feed.image.href", default=await get_cover(feed_url))},
 71                "language": "en-us",
 72                "itunes:category": {"@text": "TV & Film"},
 73                "itunes:explicit": "no",
 74                # Recommended tags
 75                "podcast:locked": "yes",
 76                "podcast:guid": gen_uuid(feed_url),
 77                "itunes:author": glom(feed, Coalesce("feed.author", "feed.title"), default=""),
 78                "link": feed_url,
 79                # Situational tags
 80                "itunes:title": glom(feed, "feed.title", default=""),
 81                "itunes:type": "Episodic",
 82                "itunes:block": "yes",
 83                # Common tags for rss
 84                "category": "TV & Film",
 85                "generator": "BennyBot",
 86                "lastBuildDate": f"{now:%a, %d %b %Y %H:%M:%S %z}",
 87                "pubDate": f"{pub_date:%a, %d %b %Y %H:%M:%S %z}",
 88                "image": {
 89                    "url": glom(feed, "feed.image.href", default=await get_cover(feed_url)),
 90                    "title": glom(feed, "feed.title", default=""),
 91                    "link": feed_url,
 92                },
 93                "item": [],
 94            },
 95        }
 96    }
 97
 98
 99def gen_opml_header():
100    """Generate opml header for OPML feed."""
101    return {"opml": {"@version": "1.0", "head": {"title": "Podcast"}, "body": {"outline": []}}}
102
103
async def update_xml_desc(feed_url: str, processed_xml: dict, entry: dict, summary: str, audio_path: str | Path) -> dict:
    """Add AI summary to item description.

    Args:
        feed_url (str): original feed url
        processed_xml (dict): processed feed xml (may be empty; a fresh header is generated then)
        entry (dict): feed entry parsed by feedparser
        summary (str): AI summary
        audio_path (str | Path): local audio file, uploaded to GitHub when the
            feed item has no audio enclosure (video feeds)

    Returns:
        dict: processed_xml with the new item prepended to the channel item list.
    """
    original_desc = glom(entry, Coalesce("content.0.value", "summary"), default="")
    description = convert_html(summary) + "<p>----------------------------------</p>" + original_desc
    # try to find the item in feed_xml (raw view keeps enclosure/attrs intact)
    feed_xml = await parse_feed(feed_url, raw_xml=True)
    # Fix: work on shallow copies. The original mutated `entry` (the caller's
    # object) or `item` (part of the memoized parse_feed result) in place via
    # .pop() and key assignment, corrupting cached feed data across calls.
    new_item = dict(entry)
    for item in glom(feed_xml, "rss.channel.item", default=[]):
        item_link = https_url(clean_feed_url(item.get("link", "")))
        if item_link == entry["link"]:
            new_item = dict(item)  # Found!
            break
    new_item.pop("content:encoded", None)  # redundant with <description>
    new_item["description"] = description
    if not glom(new_item, "enclosure.@url", default=""):  # This is a video rss feed. upload the audio to github
        # GitHub release tag names are limited; derive a stable, URL-safe tag.
        tag_name = base64.urlsafe_b64encode(feed_url.encode()).decode().rstrip("=")[-64:]
        feed_title = await get_feed_title(feed_url)
        enclosure_url = await gh_upload_asset(audio_path, tag_name=tag_name, release_name=feed_title, gh_repo=PODCAST.GH_REPO, gh_token=PODCAST.GH_TOKEN)
        mime_type = {".mp3": "audio/mpeg", ".m4a": "audio/x-m4a", ".flac": "audio/flac"}.get(Path(audio_path).suffix, "audio/mpeg")
        new_item = {
            # Required tags
            "title": entry["title"],
            "enclosure": {
                "@url": enclosure_url,
                "@length": Path(audio_path).stat().st_size,
                "@type": mime_type,
            },
            "guid": bare_url(entry["link"]),
            # Recommended tags
            "pubDate": get_pubdate(entry).strftime("%a, %d %b %Y %H:%M:%S %z"),
            "description": description,
            "itunes:duration": int(audio_duration(audio_path)),
            "link": entry["link"],
            "itunes:explicit": "false",
        }
    if not processed_xml:
        processed_xml = await gen_pod_header(feed_url)
    items = processed_xml["rss"]["channel"]["item"]
    if not isinstance(items, list):  # only one item, will be converted to list
        items = [items]
    items.insert(0, new_item)  # newest episode first
    processed_xml["rss"]["channel"]["item"] = items
    return processed_xml
154
155
def gen_uuid(url: str):
    """Generate podcast UUID from URL.

    Docs: https://github.com/Podcastindex-org/podcast-namespace/blob/main/docs/1.0.md#guid
    The value is a UUIDv5, and is generated from the RSS feed url,
    with the protocol scheme and trailing slashes stripped off,
    combined with a unique "podcast" namespace which has a UUID of ead4c236-bf58-58c6-a2c6-a6b28d128cb6

    Args:
        url (str): feed url
    """
    normalized = url.strip().strip("/")
    for scheme in ("http://", "https://"):
        normalized = normalized.removeprefix(scheme)
    namespace = uuid.UUID("ead4c236-bf58-58c6-a2c6-a6b28d128cb6")
    return str(uuid.uuid5(namespace, normalized))
170
171
async def get_cover(feed_url: str) -> str:
    """Get podcast cover from feed url."""
    yt_prefix = "https://www.youtube.com/feeds/videos.xml?channel_id="
    if feed_url.startswith(yt_prefix):
        # YouTube feeds carry no image; resolve the channel thumbnail instead.
        return await get_youtube_channel_thumb(feed_url[len(yt_prefix):])
    parsed = await parse_feed(feed_url)
    return glom(parsed, "feed.image.href", default="https://upload.wikimedia.org/wikipedia/commons/c/c8/Podcast_iOS.png")
179
180
async def save_xml(feed_xml: dict, save_url: str):
    """Serialize ``feed_xml`` and persist it via the configured storage engine.

    Trims the channel to the latest ``PODCAST.KEEP_LATEST_ENTRIES`` items, then
    writes to Cloudflare R2 or to a local file uploaded through alist. Any other
    ``PODCAST.FS_ENGINE`` value is a silent no-op (unchanged behavior).

    Args:
        feed_xml (dict): xmltodict-style RSS document
        save_url (str): public URL of the feed; used to derive the storage key/name
    """
    items = glom(feed_xml, "rss.channel.item", default=[])  # hoisted: was evaluated twice
    if items:
        if not isinstance(items, list):
            # Fix: xmltodict yields a single dict (not a list) for a one-item
            # channel; slicing a dict would raise TypeError. Mirror the
            # normalization used in update_xml_desc.
            items = [items]
        feed_xml["rss"]["channel"]["item"] = items[: PODCAST.KEEP_LATEST_ENTRIES]
    xml_str = xmltodict.unparse(feed_xml, pretty=True, full_document=False)
    if PODCAST.FS_ENGINE == "CF-R2":
        r2_key = save_url.removeprefix(DB.CF_R2_PUBLIC_URL).lstrip("/")
        await set_cf_r2(r2_key, data=xml_str, mime_type="application/xml")
        return
    if PODCAST.FS_ENGINE == "alist":
        save_path = Path(DOWNLOAD_DIR) / Path(save_url).name  # removed redundant Path() re-wrap
        save_path.parent.mkdir(parents=True, exist_ok=True)
        async with await anyio.open_file(save_path, "w") as f:
            await f.write(xml_str)
        await upload_alist(save_path)
        save_path.unlink(missing_ok=True)  # drop the local copy once uploaded
197        save_path.unlink(missing_ok=True)