Commit 0059f10

benny-dou <60535774+benny-dou@users.noreply.github.com>
2025-05-12 02:00:52
fix(markdown): fix convert_md to accept HTML input
1 parent 54f5f67
Changed files (1)
src
src/llm/utils.py
@@ -1,8 +1,8 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 import re
+import tempfile
 from pathlib import Path
-from typing import BinaryIO
 
 import tiktoken
 from loguru import logger
@@ -209,19 +209,22 @@ def clean_gemini_sourcemarks(contexts: list[dict]) -> None:
                 part.text = clean_source_marks(part.text)
 
 
-def convert_md(path: str | Path | BinaryIO) -> str:
-    """Convert file to markdown format."""
-    if isinstance(path, (str, Path)):
+def convert_md(path: str | Path | None = None, html: str | None = None) -> str:
+    """Convert to markdown format."""
+    md = MarkItDown()
+    if path is not None:
         path = Path(path).expanduser().resolve()
         if not path.is_file():
             return ""
-    md = MarkItDown()
-    try:
         result = md.convert(path)
-    except Exception as e:
-        logger.error(f"Failed to convert to markdown: {e}")
-        return ""
-    return result.text_content
+        return result.text_content
+    if html is not None:
+        with tempfile.NamedTemporaryFile("w", suffix=".html", delete=False) as f:
+            f.write(html)
+        result = md.convert(f.name)
+        Path(f.name).unlink(missing_ok=True)
+        return result.text_content
+    return ""
 
 
 def split_reasoning(text: str) -> tuple[str, str]: