bennybot/src/custom/email2md.py at main

  1#!/usr/bin/env python
  2# -*- coding: utf-8 -*-
  3import email
  4import email.policy
  5import re
  6from email.header import decode_header
  7from email.message import EmailMessage
  8from io import BytesIO
  9
 10from glom import glom
 11from loguru import logger
 12from pyrogram.client import Client
 13from pyrogram.types import InputMediaDocument, Message, ReplyParameters
 14
 15from config import CAPTION_LENGTH, DB, TEXT_LENGTH, TZ
 16from database.r2 import set_cf_r2
 17from messages.utils import count_without_entities
 18from utils import convert2html, convert2md, myself, nowstr, readable_size
 19
 20
 21async def eml2md(client: Client, message: Message):
 22    """EML文件转Markdown."""
 23    if glom(message, "document.mime_type", default="") != "message/rfc822":
 24        return
 25    # 确保是SMTP机器人或自己发送的邮件
 26    if not (glom(message, "from_user.username", default="") == "smtpbot" or glom(message, "from_user.id", default=0) == (await myself(client)).id):
 27        return
 28    data: BytesIO = await client.download_media(message, in_memory=True)  # type: ignore
 29    logger.debug(f"Downloading Email file: {data.name}")
 30    msg = email.message_from_binary_file(data, policy=email.policy.default)
 31    # 提取邮件元信息
 32    subject = decode(msg.get("Subject"))
 33    from_ = decode(msg.get("From"))
 34    to = decode(msg.get("To"))
 35    reply_to = decode(msg.get("Reply-To"))
 36    date = decode(msg.get("Date")) or nowstr(TZ)
 37    cc = decode(msg.get("Cc"))
 38    bcc = decode(msg.get("Bcc"))
 39    html, md = extract_body(msg)
 40    attachments = extract_attachments(msg)
 41    r2_key = f"TTL/90d/{message.chat.id}-{message.id}.html"
 42    await set_cf_r2(key=r2_key, data=html, mime_type="text/html")
 43    url = f"{DB.CF_R2_PUBLIC_URL}/TTL/90d/{message.chat.id}-{message.id}.html"
 44    header = ""
 45    if subject:
 46        header += f"**[{subject}]({url})**\n"
 47    if from_:
 48        header += f"**From**: {refine_email(from_)}\n"
 49    if to:
 50        header += f"**To**: {refine_email(to)}\n"
 51    if reply_to:
 52        header += f"**Reply-To**: {refine_email(reply_to)}\n"
 53    if date:
 54        header += f"**Date**: {date}\n"
 55    if cc:
 56        header += f"**CC**: {refine_email(cc)}\n"
 57    if bcc:
 58        header += f"**BCC**: {refine_email(bcc)}\n"
 59    if attachments:
 60        header += f"{attachments}\n"
 61
 62    async def sent_file(caption: str = ""):
 63        with BytesIO(html.encode("utf-8")) as fhtml, BytesIO(md.encode("utf-8")) as fmd:
 64            await client.send_media_group(
 65                message.chat.id,
 66                message_thread_id=message.message_thread_id,
 67                media=[
 68                    InputMediaDocument(fhtml, file_name=f"{subject}.html"),
 69                    InputMediaDocument(fmd, caption=caption, file_name=f"{subject}.md"),
 70                ],
 71                reply_parameters=ReplyParameters(message_id=message.id),
 72            )
 73
 74    length = await count_without_entities(header + md)
 75    if length <= CAPTION_LENGTH:
 76        await sent_file(header + md)
 77    elif CAPTION_LENGTH < length <= TEXT_LENGTH:
 78        await message.reply_text(header + md, quote=True)
 79        await sent_file(header)
 80    else:
 81        await sent_file(header)
 82    await client.mark_chat_unread(message.chat.id)
 83
 84
 85def decode(texts: str | None) -> str:
 86    # ruff: noqa: PLW2901
 87    if not texts:
 88        return ""
 89    try:
 90        decoded_parts = decode_header(texts)
 91        result_parts = []
 92        for part, encoding in decoded_parts:
 93            if isinstance(part, bytes):
 94                if encoding:
 95                    try:
 96                        part = part.decode(encoding)
 97                    except (UnicodeDecodeError, LookupError):
 98                        part = part.decode("utf-8", errors="replace")
 99                else:
100                    part = part.decode("utf-8", errors="replace")
101            result_parts.append(str(part))
102        return "".join(result_parts)
103    except Exception:
104        return str(texts)
105
106
def refine_email(email: str) -> str:
    """Return *email* with angle brackets swapped for parentheses.

    ``<`` becomes ``(`` and ``>`` becomes ``)``; all other characters are kept.
    """
    bracket_map = str.maketrans("<>", "()")
    return email.translate(bracket_map)
109
110
def extract_body(msg: EmailMessage) -> tuple[str, str]:
    """Extract the email body as an ``(html, markdown)`` pair.

    For a multipart message the first ``text/html`` part and the first
    ``text/plain`` part that are not attachments are considered; for a
    single-part message the message itself is used. HTML wins over plain text:

    * HTML body: returned as is, with Markdown produced by ``convert2md``.
    * Plain-text body: runs of blank lines are collapsed, the text is stripped
      and returned as the Markdown, with HTML produced by ``convert2html``.

    Args:
        msg: Parsed email message.

    Returns:
        ``(html, markdown)``, or ``("", "")`` when no usable text body exists.
    """
    html_part: str | None = None
    text_part: str | None = None

    if msg.is_multipart():
        for part in msg.walk():
            # Compare the parsed, lower-cased disposition value rather than
            # searching the raw header: a substring test misses "ATTACHMENT"
            # and wrongly skips inline parts whose filename merely contains
            # the word "attachment".
            if part.get_content_disposition() == "attachment":
                continue

            content_type = part.get_content_type()
            if content_type == "text/html" and html_part is None:
                html_part = part.get_content()
            elif content_type == "text/plain" and text_part is None:
                text_part = part.get_content()
    else:
        content_type = msg.get_content_type()
        if content_type == "text/html":
            html_part = msg.get_content()
        elif content_type == "text/plain":
            text_part = msg.get_content()

    if html_part:
        md = convert2md(html=html_part)
        return html_part, md
    if text_part:
        # Collapse whitespace-only gaps between paragraphs into one blank line.
        text_part = re.sub(r"\n\s*\n", "\n\n", text_part).strip()
        html = convert2html(text_part)
        return html, text_part

    return "", ""
144
145
def extract_attachments(msg: EmailMessage) -> str:
    """List the attachments of *msg* with their file names and sizes.

    Args:
        msg: Parsed email message.

    Returns:
        A Markdown block starting with ``**Attachments**`` followed by one
        ``- `name` (size)`` line per attachment, or ``""`` when the message is
        not multipart or carries no attachment. Parts without a file name are
        listed as ``unnamed``.
    """
    if not msg.is_multipart():
        return ""

    entries: list[str] = []
    for part in msg.walk():
        # Use the parsed, lower-cased disposition value: a substring search on
        # the raw header also matches inline parts whose filename contains
        # "attachment" and misses differently-cased values.
        if part.get_content_disposition() != "attachment":
            continue

        filename = part.get_filename() or "unnamed"
        # The decoded payload may be None (e.g. for a multipart part); count it as 0 bytes.
        payload = part.get_payload(decode=True)
        size = len(payload) if payload else 0
        entries.append(f"- `{filename}` ({readable_size(size)})\n")

    attachments = "".join(entries)
    return f"**Attachments**\n{attachments}" if attachments else ""