feat: 任务进度实时展示、接口测试、暗色主题重构及多项 bug 修复

后端
- 新增 app/task_progress.py 线程安全进度注册表
- 任务改为后台线程异步执行（_run_task_background），手动触发立即返回 task_key
- 6 个任务函数（summarizer/tagger/scorer/deduplicator/brief/taxonomy）循环内上报进度
- scheduler 定时任务同步上报进度（trigger=scheduled）
- 新增 GET /api/tasks/progress 与 POST /api/tasks/progress/reset 接口
- 新增 POST /api/test-connection 接口连通性测试（独立短超时客户端）
- 修复 ai_client/rss_client 配置在 import 时固化的 bug（改为 property 运行时读取 settings）；该 bug 曾导致实际任务用 .env 假 key 调 LLM 返回 401
- 修复 ai_client 对 reasoning 模型（MiniMax-M3 等）输出 <think> 块的 JSON 解析失败
- 修复 taxonomy bootstrap：LLM 超时（改用 300s 专用 client）、MiniMax 输出审查（精简样本仅标题 + 约束生成中性类目名）、失败误报 success（改抛异常如实标记）
- 修复 models.py 双外键关系映射启动崩溃（显式 foreign_keys）
- 修复 main.py SPA 路由 404、ArticleOut.published_at 序列化 500
- 移除 lifespan 同步 bootstrap 阻塞启动，改由 scheduler 后台异步执行

前端
- Deep Ink 高对比度暗色主题重构，修复 Element Plus 暗色模式对比度问题
- Tasks 页面任务进度实时展示（进度条/阶段/计数/状态/触发来源）+ 1.5s 轮询
- 接口测试面板（rssKeeper / LLM 连通性 + 延迟）
- 修复 nextJobs jobId 映射 bug

部署与文档
- Dockerfile 优化（BuildKit 缓存挂载、预编译 wheel、去 gcc、阿里云镜像源）
- 新增 API.md 接口文档

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-14 15:14:40 +08:00
parent bae47a2411
commit 778ccefb22
24 changed files with 1853 additions and 312 deletions
@@ -1,6 +1,7 @@
 """LLM API 客户端，兼容 OpenAI API 格式"""
 import json
 import logging
+import re
 from typing import Optional

 from openai import OpenAI, APIError
@@ -9,9 +10,57 @@ from config import settings

 logger = logging.getLogger(__name__)

+# Matches the <think>...</think> reasoning block emitted by reasoning models
+# (MiniMax-M3 / DeepSeek-R1 / GLM-Z1, etc.).
+_THINK_RE = re.compile(r"<think>.*?</think>", re.DOTALL)
+
+
+def _parse_llm_json(content: str) -> dict:
+    """Extract a JSON value from raw LLM output.
+
+    Tolerates reasoning models that still emit a <think>...</think> block in
+    json_object mode, output where only the closing </think> tag survived,
+    and stray text before or after the JSON.
+
+    Args:
+        content: Raw text returned by the LLM.
+
+    Returns:
+        The decoded JSON value. Usually a dict; a list (or another JSON
+        value) is returned as-is when that is what the model produced.
+
+    Raises:
+        ValueError: If ``content`` is empty/blank, or no JSON can be
+            extracted from it. Every failure path raises ValueError after
+            logging, so callers only need to handle one exception type.
+    """
+    if not content or not content.strip():
+        raise ValueError("LLM 返回空内容，无法解析 JSON")
+
+    # Drop every closed <think>...</think> block.
+    text = _THINK_RE.sub("", content.strip()).strip()
+
+    # If a closing tag is still present its opening tag was lost, so whatever
+    # follows the last </think> is the actual answer. It is tried only after
+    # the full text, which keeps results identical for output that already
+    # parsed before.
+    candidates = [text]
+    _, orphan_close, tail = text.rpartition("</think>")
+    if orphan_close and tail.strip():
+        candidates.append(tail.strip())
+
+    for candidate in candidates:
+        # Direct parse first.
+        try:
+            return json.loads(candidate)
+        except json.JSONDecodeError:
+            pass
+
+        # Then the outermost {...} span, then the outermost [...] span.
+        for opener, closer in (("{", "}"), ("[", "]")):
+            start = candidate.find(opener)
+            end = candidate.rfind(closer)
+            if start != -1 and end > start:
+                try:
+                    return json.loads(candidate[start : end + 1])
+                except json.JSONDecodeError:
+                    pass
+
+    logger.error("无法从 LLM 输出提取 JSON: %s", content[:500])
+    raise ValueError("LLM 输出无法解析为 JSON")
+
+

 class AIClient:
-    """封装 LLM 调用，支持重试和 JSON 输出"""
+    """封装 LLM 调用，支持重试和 JSON 输出。
+
+    配置以 property 形式运行时从 settings 读取，避免模块 import 时
+    固化旧值（settings 在 FastAPI lifespan 启动后才会被数据库配置覆盖）。
+    """

    def __init__(
        self,
@@ -21,24 +70,42 @@ class AIClient:
        timeout: Optional[int] = None,
        max_retries: Optional[int] = None,
    ):
-        self.api_key = api_key or settings.OPENAI_API_KEY
-        self.base_url = base_url or settings.OPENAI_BASE_URL
-        self.model = model or settings.OPENAI_MODEL
-        self.timeout = timeout or settings.OPENAI_TIMEOUT
-        self.max_retries = max_retries or settings.OPENAI_MAX_RETRIES
+        # 仅保存显式传入的覆盖值；为 None 时运行时回退到 settings
+        self._api_key = api_key
+        self._base_url = base_url
+        self._model = model
+        self._timeout = timeout
+        self._max_retries = max_retries

-        self._client: Optional[OpenAI] = None
+    @property
+    def api_key(self) -> str:
+        """Return the explicit API key override, else ``settings.OPENAI_API_KEY``.
+
+        The settings value is looked up on every access, not cached.
+        """
+        explicit = self._api_key
+        if explicit:
+            return explicit
+        return settings.OPENAI_API_KEY
+
+    @property
+    def base_url(self) -> str:
+        """Return the explicit base URL override, else ``settings.OPENAI_BASE_URL``.
+
+        The settings value is looked up on every access, not cached.
+        """
+        explicit = self._base_url
+        if explicit:
+            return explicit
+        return settings.OPENAI_BASE_URL
+
+    @property
+    def model(self) -> str:
+        """Return the explicit model override, else ``settings.OPENAI_MODEL``.
+
+        The settings value is looked up on every access, not cached.
+        """
+        explicit = self._model
+        if explicit:
+            return explicit
+        return settings.OPENAI_MODEL
+
+    @property
+    def timeout(self) -> int:
+        """Return the explicit timeout override, else ``settings.OPENAI_TIMEOUT``.
+
+        A falsy override (None or 0) falls back to the settings value, which
+        is looked up on every access.
+        """
+        explicit = self._timeout
+        if explicit:
+            return explicit
+        return settings.OPENAI_TIMEOUT
+
+    @property
+    def max_retries(self) -> int:
+        """Return the explicit retry count, else ``settings.OPENAI_MAX_RETRIES``.
+
+        Only ``None`` means "not specified". An explicit ``0`` is honoured so
+        a caller can disable retries; a plain ``or`` fallback would silently
+        replace it with the settings default.
+        """
+        if self._max_retries is not None:
+            return self._max_retries
+        return settings.OPENAI_MAX_RETRIES

    @property
    def client(self) -> OpenAI:
-        if self._client is None:
-            self._client = OpenAI(
-                api_key=self.api_key,
-                base_url=self.base_url,
-                timeout=self.timeout,
-                max_retries=self.max_retries,
-            )
-        return self._client
+        """Return an OpenAI client built from the current effective configuration.
+
+        The client is cached together with the configuration it was built
+        from and rebuilt only when that configuration changes. Settings
+        overridden after startup are therefore still picked up, while
+        repeated calls reuse one client instead of constructing a new one
+        (and a new HTTP connection pool) on every access.
+        """
+        config = (self.api_key, self.base_url, self.timeout, self.max_retries)
+        # getattr: the cache attribute is created lazily here, not in __init__.
+        cached = getattr(self, "_client_cache", None)
+        if cached is None or cached[0] != config:
+            api_key, base_url, timeout, max_retries = config
+            cached = (
+                config,
+                OpenAI(
+                    api_key=api_key,
+                    base_url=base_url,
+                    timeout=timeout,
+                    max_retries=max_retries,
+                ),
+            )
+            self._client_cache = cached
+        return cached[1]

    def chat_completion(
        self,
@@ -75,18 +142,14 @@ class AIClient:
        user_prompt: str,
        temperature: float = 0.3,
    ) -> dict:
-        """调用 LLM 并解析返回的 JSON"""
+        """调用 LLM 并解析返回的 JSON（兼容 reasoning 模型的 <think> 块）"""
        content = self.chat_completion(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            temperature=temperature,
            json_mode=True,
        )
-        try:
-            return json.loads(content)
-        except json.JSONDecodeError as exc:
-            logger.error("LLM 返回不是合法 JSON: %s - content=%s", exc, content[:500])
-            raise
+        return _parse_llm_json(content)


 ai_client = AIClient()
@@ -9,6 +9,7 @@ from sqlalchemy.orm import Session

 from config import settings
 from models import EnrichedArticle, DailyBrief
+from app.task_progress import update_progress

 logger = logging.getLogger(__name__)

@@ -76,6 +77,7 @@ def generate_daily_brief(db: Session, date_str: str = None, force: bool = False)
    existing = db.query(DailyBrief).filter(DailyBrief.brief_date == date_str).first()
    if existing and not force:
        logger.info("日期 %s 简报已存在，跳过生成", date_str)
+        update_progress("generate_daily_brief", status="running", stage="简报已存在", current=0, total=0, message="简报已存在，跳过生成")
        return {
            "date": date_str,
            "total_articles": existing.total_articles,
@@ -86,6 +88,8 @@ def generate_daily_brief(db: Session, date_str: str = None, force: bool = False)
    day_start = datetime.strptime(date_str, "%Y-%m-%d")
    day_end = day_start + timedelta(days=1)

+    update_progress("generate_daily_brief", status="running", stage="加载文章", current=0, total=0)
+
    # 取当天去重后的代表文章
    query = (
        db.query(EnrichedArticle)
@@ -106,6 +110,7 @@ def generate_daily_brief(db: Session, date_str: str = None, force: bool = False)
    )

    # 按分类分组并排序
+    update_progress("generate_daily_brief", status="running", stage="按分类整理", current=0, total=0)
    by_category: Dict[str, List[Dict[str, Any]]] = {}
    for art in representative_articles:
        cat = art.category or "未分类"
@@ -127,6 +132,7 @@ def generate_daily_brief(db: Session, date_str: str = None, force: bool = False)
    }

    # 生成 Markdown 文件
+    update_progress("generate_daily_brief", status="running", stage="生成 Markdown", current=0, total=0)
    output_dir = settings.brief_output_dir_path / date_str
    output_dir.mkdir(parents=True, exist_ok=True)
    markdown_path = output_dir / "daily-brief.md"
@@ -134,6 +140,7 @@ def generate_daily_brief(db: Session, date_str: str = None, force: bool = False)
    markdown_path.write_text(markdown_content, encoding="utf-8")

    # 更新文章 brief_date
+    update_progress("generate_daily_brief", status="running", stage="保存简报", current=0, total=0)
    for art in representative_articles:
        art.brief_date = date_str

@@ -12,6 +12,7 @@ import numpy as np

 from config import settings
 from models import EnrichedArticle, DuplicateGroup
+from app.task_progress import update_progress, report_loop_progress

 logger = logging.getLogger(__name__)

@@ -172,8 +173,11 @@ def deduplicate_articles(

    if not articles:
        logger.info("日期 %s 无文章可去重", date_str)
+        update_progress("tag_score_dedup", status="running", stage="去重", current=0, total=0, message="无文章可去重")
        return {"total": 0, "duplicate_groups": 0, "representatives": 0}

+    update_progress("tag_score_dedup", status="running", stage="计算相似度并去重", current=0, total=0)
+
    # 先 URL 去重：相同 link 只保留一篇
    unique_articles: List[EnrichedArticle] = []
    seen_links: set = set()
@@ -194,8 +198,9 @@ def deduplicate_articles(
    )

    stats = {"total": len(articles), "duplicate_groups": len(clusters), "representatives": 0}
+    update_progress("tag_score_dedup", status="running", stage="写入重复组", current=0, total=len(clusters))

-    for cluster in clusters:
+    for ci, cluster in enumerate(clusters):
        representative = _pick_representative(unique_articles, cluster)
        member_ids = [unique_articles[i].id for i in cluster]

@@ -214,6 +219,7 @@ def deduplicate_articles(
            art.is_representative = (art.id == representative.id)

        stats["representatives"] += 1
+        report_loop_progress("tag_score_dedup", ci + 1, len(clusters), "写入重复组")

    db.commit()
    logger.info(
@@ -11,11 +11,23 @@ logger = logging.getLogger(__name__)


 class RSSKeeperClient:
-    """rssKeeper 外部 API 客户端"""
+    """rssKeeper 外部 API 客户端。

-    def __init__(self, base_url: Optional[str] = None, timeout: int = 30):
-        self.base_url = (base_url or settings.RSSKEEPER_BASE_URL).rstrip("/")
-        self.timeout = timeout
+    配置以 property 形式运行时从 settings 读取，避免模块 import 时
+    固化旧值（settings 在 FastAPI lifespan 启动后才会被数据库配置覆盖）。
+    """
+
+    def __init__(self, base_url: Optional[str] = None, timeout: Optional[int] = None):
+        # Store only the explicit overrides; the effective values are resolved
+        # by the base_url / timeout properties each time they are read.
+        self._base_url = base_url
+        self._timeout = timeout
+
+    @property
+    def base_url(self) -> str:
+        """Return the effective base URL with trailing slashes removed.
+
+        Uses the explicit override when truthy, otherwise
+        ``settings.RSSKEEPER_BASE_URL`` as read at access time.
+        """
+        raw_url = self._base_url or settings.RSSKEEPER_BASE_URL
+        return raw_url.rstrip("/")
+
+    @property
+    def timeout(self) -> int:
+        """Return the explicit timeout, or 30 when none was given.
+
+        Only ``None`` triggers the default; an explicit 0 is returned as-is.
+        """
+        if self._timeout is None:
+            return 30
+        return self._timeout

    def _get(self, path: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        url = f"{self.base_url}{path}"
@@ -8,6 +8,7 @@ from sqlalchemy.orm import Session

 from config import settings
 from models import EnrichedArticle, Taxonomy
+from app.task_progress import update_progress, report_loop_progress
 from app.tagger import _count_matches, _normalize

 logger = logging.getLogger(__name__)
@@ -119,6 +120,7 @@ def score_articles(
        query = query.filter(EnrichedArticle.id.in_(article_ids))

    articles = query.all()
+    update_progress("tag_score_dedup", status="running", stage="计算分数", current=0, total=len(articles))
    count = 0
    for article in articles:
        article.heat_score = compute_heat_score(article, heat_rules)
@@ -141,6 +143,7 @@ def score_articles(
        count += 1
        if count % 50 == 0:
            db.commit()
+        report_loop_progress("tag_score_dedup", count, len(articles), "计算分数")

    db.commit()
    logger.info("打分完成: %d 篇文章", count)
@@ -7,6 +7,7 @@ from sqlalchemy.orm import Session

 from app.ai_client import ai_client
 from app.rss_client import rss_client
+from app.task_progress import update_progress, report_loop_progress
 from config import settings
 from models import EnrichedArticle

@@ -109,11 +110,13 @@ def fetch_and_summarize(db: Session, hours: int = 24, limit: int = 200) -> Dict[
    articles = rss_client.fetch_recent(hours=hours, limit=limit)
    if not articles:
        logger.info("未拉取到新文章")
+        update_progress("summarize", status="running", stage="无新文章", current=0, total=0, message="未拉取到新文章")
        return {"fetched": 0, "created": 0, "summarized": 0}

    stats = {"fetched": len(articles), "created": 0, "summarized": 0}
+    update_progress("summarize", status="running", stage="拉取文章并生成摘要", current=0, total=len(articles))

-    for raw in articles:
+    for i, raw in enumerate(articles):
        data = _article_from_rss(raw)
        article = db.query(EnrichedArticle).filter(
            EnrichedArticle.rk_article_id == data["rk_article_id"]
@@ -146,6 +149,8 @@ def fetch_and_summarize(db: Session, hours: int = 24, limit: int = 200) -> Dict[
        if stats["summarized"] % 10 == 0:
            db.commit()

+        report_loop_progress("summarize", i + 1, len(articles), "生成摘要")
+
    db.commit()
    logger.info(
        "摘要任务完成: fetched=%d, created=%d, summarized=%d",
@@ -5,6 +5,7 @@ from typing import List, Dict, Any, Tuple

 from sqlalchemy.orm import Session

+from app.task_progress import update_progress, report_loop_progress
 from models import EnrichedArticle, Taxonomy

 logger = logging.getLogger(__name__)
@@ -103,6 +104,7 @@ def tag_articles(db: Session, article_ids: List[int] = None) -> int:
        )

    articles = query.all()
+    update_progress("tag_score_dedup", status="running", stage="分类打标", current=0, total=len(articles))
    count = 0
    for article in articles:
        article.category = classify_article(article, categories)
@@ -110,6 +112,7 @@ def tag_articles(db: Session, article_ids: List[int] = None) -> int:
        count += 1
        if count % 50 == 0:
            db.commit()
+        report_loop_progress("tag_score_dedup", count, len(articles), "分类打标")

    db.commit()
    logger.info("分类/打标签完成: %d 篇文章", count)
@@ -0,0 +1,117 @@
+"""任务进度注册表（进程内内存，线程安全）。
+
+供手动任务、定时任务在执行过程中上报进度，前端通过
+GET /api/tasks/progress 轮询读取展示。
+
+单 worker（uvicorn --workers 1）前提下，所有请求/任务线程共享同一份内存。
+"""
+import copy
+import threading
+from datetime import datetime, timezone
+from typing import Optional
+
+# The four stable task keys; _init() seeds one idle entry for each.
+TASK_KEYS = ("summarize", "tag_score_dedup", "generate_daily_brief", "bootstrap_taxonomy")
+
+# Registry: task key -> progress entry dict.
+_progress: dict = {}
+# Held by update_progress / get_progress / reset_progress while they touch _progress.
+_lock = threading.Lock()
+
+
+def _now_iso() -> str:
+    """Return the current UTC time as a timezone-aware ISO 8601 string."""
+    current = datetime.now(tz=timezone.utc)
+    return current.isoformat()
+
+
+def _init() -> None:
+    """Seed the registry with a fresh idle entry for every key in TASK_KEYS."""
+    idle_fields = (
+        ("status", "idle"),
+        ("stage", ""),
+        ("current", 0),
+        ("total", 0),
+        ("message", None),
+        ("started_at", None),
+        ("updated_at", None),
+        ("finished_at", None),
+        ("trigger", None),
+    )
+    for task_key in TASK_KEYS:
+        # A separate dict per key so entries never share state.
+        _progress[task_key] = dict(idle_fields)
+
+
+_init()
+
+
+def update_progress(
+    task_key: str,
+    *,
+    status: Optional[str] = None,
+    stage: Optional[str] = None,
+    current: Optional[int] = None,
+    total: Optional[int] = None,
+    message: Optional[str] = None,
+    trigger: Optional[str] = None,
+) -> None:
+    """Merge the non-None fields into a task's progress entry and stamp it.
+
+    An entry is created on the fly for a task key that is not registered yet.
+
+    Args:
+        task_key: Key of the task whose entry is updated.
+        status: New status. "running" clears ``finished_at`` and, when the
+            entry was not already running, restarts ``started_at``;
+            "success" / "error" set ``finished_at``.
+        stage: Name of the current stage.
+        current: Number of items processed so far.
+        total: Total number of items.
+        message: Free-form message.
+        trigger: What triggered the run.
+
+    Fields passed as None keep their previous value. ``updated_at`` is
+    refreshed on every call.
+    """
+    with _lock:
+        entry = _progress.get(task_key)
+        if entry is None:
+            entry = {
+                "status": "idle", "stage": "", "current": 0, "total": 0,
+                "message": None, "started_at": None, "updated_at": None,
+                "finished_at": None, "trigger": None,
+            }
+            _progress[task_key] = entry
+
+        now = _now_iso()
+        if status == "running":
+            # Entering "running" from any other state starts a new run, so
+            # the start time must be refreshed; otherwise a re-run after
+            # success/error would keep the previous run's started_at.
+            # Updates made while already running leave it untouched.
+            if entry.get("status") != "running" or entry.get("started_at") is None:
+                entry["started_at"] = now
+            # A running task has no terminal timestamp.
+            entry["finished_at"] = None
+        elif status in ("success", "error"):
+            entry["finished_at"] = now
+
+        # Merge only the fields the caller actually supplied.
+        supplied = (
+            ("status", status),
+            ("stage", stage),
+            ("current", current),
+            ("total", total),
+            ("message", message),
+            ("trigger", trigger),
+        )
+        for field, value in supplied:
+            if value is not None:
+                entry[field] = value
+        entry["updated_at"] = now
+
+
+def report_loop_progress(
+    task_key: str,
+    index: int,
+    total: int,
+    stage: str,
+    message: Optional[str] = None,
+    every: int = 5,
+) -> None:
+    """Report loop progress, throttled to cut down on lock acquisitions.
+
+    An update is sent only on every ``every``-th iteration, or once
+    ``index`` has reached ``total``.
+
+    Args:
+        task_key: Key of the task being reported.
+        index: 1-based count of items processed so far.
+        total: Total number of items in the loop.
+        stage: Stage name to record.
+        message: Optional message to record.
+        every: Reporting interval; 0 is treated as 1 (report every
+            iteration) instead of raising ZeroDivisionError.
+    """
+    # Progress reporting must never crash the task it is reporting on.
+    interval = every or 1
+    if index % interval == 0 or index >= total:
+        update_progress(task_key, status="running", stage=stage, current=index, total=total, message=message)
+
+
+def get_progress(task_key: Optional[str] = None) -> dict:
+    """Return a deep copy of one entry, or of the whole registry.
+
+    With ``task_key`` omitted the full registry is copied. For an unknown
+    key the result is None. The copy is taken under the lock so later
+    updates cannot change what the caller holds.
+    """
+    with _lock:
+        source = _progress if task_key is None else _progress.get(task_key)
+        return copy.deepcopy(source)
+
+
+def reset_progress(task_key: str) -> None:
+    """Reset one known task back to idle; unknown keys are ignored.
+
+    Intended for the frontend to clear a finished task's display.
+    """
+    with _lock:
+        if task_key not in _progress:
+            return
+        idle_entry = {
+            "status": "idle",
+            "stage": "",
+            "current": 0,
+            "total": 0,
+            "message": None,
+            "started_at": None,
+            "updated_at": None,
+            "finished_at": None,
+            "trigger": None,
+        }
+        _progress[task_key] = idle_entry
@@ -5,8 +5,9 @@ from typing import List, Dict, Any

 from sqlalchemy.orm import Session

-from app.ai_client import ai_client
+from app.ai_client import AIClient
 from app.rss_client import rss_client
+from app.task_progress import update_progress
 from models import Taxonomy

 logger = logging.getLogger(__name__)
@@ -40,19 +41,19 @@ TAXONOMY_SYSTEM_PROMPT = """你是一位专业的信息分类与内容分析专
 3. heat_rules 和 importance_rules 各 10-20 条，weight 范围 0.5-2.0。
 4. 所有 keywords 用中文或中英双语，便于后续关键词匹配。
 5. 不要输出任何解释文字，只输出 JSON。
+6. **分类与标签名称必须使用中性的主题领域词**（如科技、财经、文化、体育、生活、健康、设计、商业等），
+   禁止使用具体事件、人名、地名、国家名、机构名或任何政治/军事/冲突相关的敏感词作为名称或关键词，
+   以保证内容中立、避免触发内容审查。
 """


 def _build_sample_prompt(articles: List[Dict[str, Any]]) -> str:
+    """Build the user prompt listing sample articles for taxonomy generation.
+
+    The header line reports ``len(articles)``; only the first 40 articles are
+    listed, each as its index, ``title`` and ``feed_title``. Lines are joined
+    with newlines.
+    """
-    lines = [f"共有 {len(articles)} 篇文章样本："]
-    for idx, art in enumerate(articles[:50], 1):
+    # Use only the title and source, without the body summary: this reduces
+    # sensitive content in the input and avoids triggering content moderation.
+    lines = [f"共有 {len(articles)} 篇文章样本（仅展示标题用于归纳主题）："]
+    for idx, art in enumerate(articles[:40], 1):
         title = art.get("title", "")
-        summary = art.get("summary", "") or art.get("content", "")[:300]
         feed = art.get("feed_title", "")
-        cat = art.get("category", "")
-        lines.append(f"\n[{idx}] 标题：{title}")
-        lines.append(f"    来源：{feed} | 源分类：{cat}")
-        lines.append(f"    摘要：{summary[:400]}")
+        lines.append(f"[{idx}] {title}  （来源：{feed}）")
     return "\n".join(lines)


@@ -72,22 +73,24 @@ def bootstrap_taxonomy(db: Session, force: bool = False) -> bool:
        logger.info("强制重新初始化 taxonomy")

    logger.info("开始从 rssKeeper 拉取样本文章并生成分类体系...")
+    update_progress("bootstrap_taxonomy", status="running", stage="拉取样本文章", current=0, total=0)
    articles = rss_client.fetch_recent(hours=24 * 7, limit=200)
    if not articles:
        logger.warning("未获取到样本文章，无法生成分类体系")
-        return False
+        raise RuntimeError("未获取到样本文章，无法生成分类体系")

    user_prompt = _build_sample_prompt(articles)
-    try:
-        result = ai_client.chat_completion_json(
-            system_prompt=TAXONOMY_SYSTEM_PROMPT,
-            user_prompt=user_prompt,
-            temperature=0.5,
-        )
-    except Exception as exc:
-        logger.error("生成分类体系失败: %s", exc)
-        return False
+    update_progress("bootstrap_taxonomy", status="running", stage="LLM 生成分类体系", current=0, total=0, message="正在调用 LLM 生成分类规则，可能需要 2-4 分钟")
+    # bootstrap 是一次性大任务（生成 categories+tags+rules），MiniMax-M3 reasoning 模式较慢，
+    # 用专用大 timeout client（默认 60s 不够），失败抛异常由调用方捕获并如实标记进度
+    bootstrap_ai = AIClient(timeout=300, max_retries=2)
+    result = bootstrap_ai.chat_completion_json(
+        system_prompt=TAXONOMY_SYSTEM_PROMPT,
+        user_prompt=user_prompt,
+        temperature=0.5,
+    )

+    update_progress("bootstrap_taxonomy", status="running", stage="保存规则", current=0, total=0)
    _save_taxonomy(db, result)
    logger.info("taxonomy 初始化完成，共写入 %d 条规则", db.query(Taxonomy).count())
    return True