feat: 任务进度实时展示、接口测试、暗色主题重构及多项 bug 修复

后端
- 新增 app/task_progress.py 线程安全进度注册表
- 任务改为后台线程异步执行(_run_task_background),手动触发立即返回 task_key
- 6 个任务函数(summarizer/tagger/scorer/deduplicator/brief/taxonomy)循环内上报进度
- scheduler 定时任务同步上报进度(trigger=scheduled)
- 新增 GET /api/tasks/progress 与 POST /api/tasks/progress/reset 接口
- 新增 POST /api/test-connection 接口连通性测试(独立短超时客户端)
- 修复 ai_client/rss_client 配置在 import 时固化的 bug(改为 property 运行时读取 settings),
  导致实际任务用 .env 假 key 调 LLM 401
- 修复 ai_client 对 reasoning 模型(MiniMax-M3 等)输出 <think> 块的 JSON 解析失败
- 修复 taxonomy bootstrap:LLM 超时(改用 300s 专用 client)、MiniMax 输出审查
  (精简样本仅标题 + 约束生成中性类目名)、失败误报 success(改抛异常如实标记)
- 修复 models.py 双外键关系映射启动崩溃(显式 foreign_keys)
- 修复 main.py SPA 路由 404、ArticleOut.published_at 序列化 500
- 移除 lifespan 同步 bootstrap 阻塞启动,改由 scheduler 后台异步执行

前端
- Deep Ink 高对比度暗色主题重构,修复 Element Plus 暗色模式对比度问题
- Tasks 页面任务进度实时展示(进度条/阶段/计数/状态/触发来源)+ 1.5s 轮询
- 接口测试面板(rssKeeper / LLM 连通性 + 延迟)
- 修复 nextJobs jobId 映射 bug

部署与文档
- Dockerfile 优化(BuildKit 缓存挂载、预编译 wheel、去 gcc、阿里云镜像源)
- 新增 API.md 接口文档

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
congsh
2026-06-14 15:14:40 +08:00
parent bae47a2411
commit 778ccefb22
24 changed files with 1853 additions and 312 deletions
+84 -21
View File
@@ -1,6 +1,7 @@
"""LLM API 客户端,兼容 OpenAI API 格式"""
import json
import logging
import re
from typing import Optional
from openai import OpenAI, APIError
@@ -9,9 +10,57 @@ from config import settings
logger = logging.getLogger(__name__)
# 匹配 reasoning 模型(MiniMax-M3 / DeepSeek-R1 / GLM-Z1 等)的 <think>...</think> 推理块
_THINK_RE = re.compile(r"<think>.*?</think>", re.DOTALL)
def _parse_llm_json(content: str) -> dict:
"""从 LLM 输出中提取 JSON。
兼容 reasoning 模型在 json_object 模式下仍输出 <think>...</think>
推理块、以及 JSON 前后有多余文本的情况。
"""
if not content or not content.strip():
raise ValueError("LLM 返回空内容,无法解析 JSON")
text = content.strip()
# 1) 去掉闭合的 <think>...</think> 块
text = _THINK_RE.sub("", text).strip()
# 2) 处理只有 <think> 开头但未闭合(content 被截断)的情况
if text.startswith("<think>"):
text = text.split("</think>", 1)[-1].strip()
# 3) 尝试直接解析
try:
return json.loads(text)
except json.JSONDecodeError:
pass
# 4) 提取首个 { 到最后 } 之间的子串
start = text.find("{")
end = text.rfind("}")
if start != -1 and end > start:
try:
return json.loads(text[start : end + 1])
except json.JSONDecodeError:
pass
# 5) 兜底:尝试数组
start = text.find("[")
end = text.rfind("]")
if start != -1 and end > start:
return json.loads(text[start : end + 1])
logger.error("无法从 LLM 输出提取 JSON: %s", content[:500])
raise ValueError("LLM 输出无法解析为 JSON")
class AIClient:
"""封装 LLM 调用,支持重试和 JSON 输出"""
"""封装 LLM 调用,支持重试和 JSON 输出
配置以 property 形式运行时从 settings 读取,避免模块 import 时
固化旧值(settings 在 FastAPI lifespan 启动后才会被数据库配置覆盖)。
"""
def __init__(
self,
@@ -21,24 +70,42 @@ class AIClient:
timeout: Optional[int] = None,
max_retries: Optional[int] = None,
):
self.api_key = api_key or settings.OPENAI_API_KEY
self.base_url = base_url or settings.OPENAI_BASE_URL
self.model = model or settings.OPENAI_MODEL
self.timeout = timeout or settings.OPENAI_TIMEOUT
self.max_retries = max_retries or settings.OPENAI_MAX_RETRIES
# 仅保存显式传入的覆盖值;为 None 时运行时回退到 settings
self._api_key = api_key
self._base_url = base_url
self._model = model
self._timeout = timeout
self._max_retries = max_retries
self._client: Optional[OpenAI] = None
@property
def api_key(self) -> str:
return self._api_key or settings.OPENAI_API_KEY
@property
def base_url(self) -> str:
return self._base_url or settings.OPENAI_BASE_URL
@property
def model(self) -> str:
return self._model or settings.OPENAI_MODEL
@property
def timeout(self) -> int:
return self._timeout or settings.OPENAI_TIMEOUT
@property
def max_retries(self) -> int:
return self._max_retries or settings.OPENAI_MAX_RETRIES
@property
def client(self) -> OpenAI:
if self._client is None:
self._client = OpenAI(
api_key=self.api_key,
base_url=self.base_url,
timeout=self.timeout,
max_retries=self.max_retries,
)
return self._client
# 每次按最新配置创建,确保用到启动后覆盖的真实配置
return OpenAI(
api_key=self.api_key,
base_url=self.base_url,
timeout=self.timeout,
max_retries=self.max_retries,
)
def chat_completion(
self,
@@ -75,18 +142,14 @@ class AIClient:
user_prompt: str,
temperature: float = 0.3,
) -> dict:
"""调用 LLM 并解析返回的 JSON"""
"""调用 LLM 并解析返回的 JSON(兼容 reasoning 模型的 <think> 块)"""
content = self.chat_completion(
system_prompt=system_prompt,
user_prompt=user_prompt,
temperature=temperature,
json_mode=True,
)
try:
return json.loads(content)
except json.JSONDecodeError as exc:
logger.error("LLM 返回不是合法 JSON: %s - content=%s", exc, content[:500])
raise
return _parse_llm_json(content)
ai_client = AIClient()
+7
View File
@@ -9,6 +9,7 @@ from sqlalchemy.orm import Session
from config import settings
from models import EnrichedArticle, DailyBrief
from app.task_progress import update_progress
logger = logging.getLogger(__name__)
@@ -76,6 +77,7 @@ def generate_daily_brief(db: Session, date_str: str = None, force: bool = False)
existing = db.query(DailyBrief).filter(DailyBrief.brief_date == date_str).first()
if existing and not force:
logger.info("日期 %s 简报已存在,跳过生成", date_str)
update_progress("generate_daily_brief", status="running", stage="简报已存在", current=0, total=0, message="简报已存在,跳过生成")
return {
"date": date_str,
"total_articles": existing.total_articles,
@@ -86,6 +88,8 @@ def generate_daily_brief(db: Session, date_str: str = None, force: bool = False)
day_start = datetime.strptime(date_str, "%Y-%m-%d")
day_end = day_start + timedelta(days=1)
update_progress("generate_daily_brief", status="running", stage="加载文章", current=0, total=0)
# 取当天去重后的代表文章
query = (
db.query(EnrichedArticle)
@@ -106,6 +110,7 @@ def generate_daily_brief(db: Session, date_str: str = None, force: bool = False)
)
# 按分类分组并排序
update_progress("generate_daily_brief", status="running", stage="按分类整理", current=0, total=0)
by_category: Dict[str, List[Dict[str, Any]]] = {}
for art in representative_articles:
cat = art.category or "未分类"
@@ -127,6 +132,7 @@ def generate_daily_brief(db: Session, date_str: str = None, force: bool = False)
}
# 生成 Markdown 文件
update_progress("generate_daily_brief", status="running", stage="生成 Markdown", current=0, total=0)
output_dir = settings.brief_output_dir_path / date_str
output_dir.mkdir(parents=True, exist_ok=True)
markdown_path = output_dir / "daily-brief.md"
@@ -134,6 +140,7 @@ def generate_daily_brief(db: Session, date_str: str = None, force: bool = False)
markdown_path.write_text(markdown_content, encoding="utf-8")
# 更新文章 brief_date
update_progress("generate_daily_brief", status="running", stage="保存简报", current=0, total=0)
for art in representative_articles:
art.brief_date = date_str
+7 -1
View File
@@ -12,6 +12,7 @@ import numpy as np
from config import settings
from models import EnrichedArticle, DuplicateGroup
from app.task_progress import update_progress, report_loop_progress
logger = logging.getLogger(__name__)
@@ -172,8 +173,11 @@ def deduplicate_articles(
if not articles:
logger.info("日期 %s 无文章可去重", date_str)
update_progress("tag_score_dedup", status="running", stage="去重", current=0, total=0, message="无文章可去重")
return {"total": 0, "duplicate_groups": 0, "representatives": 0}
update_progress("tag_score_dedup", status="running", stage="计算相似度并去重", current=0, total=0)
# 先 URL 去重:相同 link 只保留一篇
unique_articles: List[EnrichedArticle] = []
seen_links: set = set()
@@ -194,8 +198,9 @@ def deduplicate_articles(
)
stats = {"total": len(articles), "duplicate_groups": len(clusters), "representatives": 0}
update_progress("tag_score_dedup", status="running", stage="写入重复组", current=0, total=len(clusters))
for cluster in clusters:
for ci, cluster in enumerate(clusters):
representative = _pick_representative(unique_articles, cluster)
member_ids = [unique_articles[i].id for i in cluster]
@@ -214,6 +219,7 @@ def deduplicate_articles(
art.is_representative = (art.id == representative.id)
stats["representatives"] += 1
report_loop_progress("tag_score_dedup", ci + 1, len(clusters), "写入重复组")
db.commit()
logger.info(
+16 -4
View File
@@ -11,11 +11,23 @@ logger = logging.getLogger(__name__)
class RSSKeeperClient:
"""rssKeeper 外部 API 客户端"""
"""rssKeeper 外部 API 客户端
def __init__(self, base_url: Optional[str] = None, timeout: int = 30):
self.base_url = (base_url or settings.RSSKEEPER_BASE_URL).rstrip("/")
self.timeout = timeout
配置以 property 形式运行时从 settings 读取,避免模块 import 时
固化旧值(settings 在 FastAPI lifespan 启动后才会被数据库配置覆盖)。
"""
def __init__(self, base_url: Optional[str] = None, timeout: Optional[int] = None):
self._base_url = base_url
self._timeout = timeout
@property
def base_url(self) -> str:
return (self._base_url or settings.RSSKEEPER_BASE_URL).rstrip("/")
@property
def timeout(self) -> int:
return self._timeout if self._timeout is not None else 30
def _get(self, path: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
url = f"{self.base_url}{path}"
+3
View File
@@ -8,6 +8,7 @@ from sqlalchemy.orm import Session
from config import settings
from models import EnrichedArticle, Taxonomy
from app.task_progress import update_progress, report_loop_progress
from app.tagger import _count_matches, _normalize
logger = logging.getLogger(__name__)
@@ -119,6 +120,7 @@ def score_articles(
query = query.filter(EnrichedArticle.id.in_(article_ids))
articles = query.all()
update_progress("tag_score_dedup", status="running", stage="计算分数", current=0, total=len(articles))
count = 0
for article in articles:
article.heat_score = compute_heat_score(article, heat_rules)
@@ -141,6 +143,7 @@ def score_articles(
count += 1
if count % 50 == 0:
db.commit()
report_loop_progress("tag_score_dedup", count, len(articles), "计算分数")
db.commit()
logger.info("打分完成: %d 篇文章", count)
+6 -1
View File
@@ -7,6 +7,7 @@ from sqlalchemy.orm import Session
from app.ai_client import ai_client
from app.rss_client import rss_client
from app.task_progress import update_progress, report_loop_progress
from config import settings
from models import EnrichedArticle
@@ -109,11 +110,13 @@ def fetch_and_summarize(db: Session, hours: int = 24, limit: int = 200) -> Dict[
articles = rss_client.fetch_recent(hours=hours, limit=limit)
if not articles:
logger.info("未拉取到新文章")
update_progress("summarize", status="running", stage="无新文章", current=0, total=0, message="未拉取到新文章")
return {"fetched": 0, "created": 0, "summarized": 0}
stats = {"fetched": len(articles), "created": 0, "summarized": 0}
update_progress("summarize", status="running", stage="拉取文章并生成摘要", current=0, total=len(articles))
for raw in articles:
for i, raw in enumerate(articles):
data = _article_from_rss(raw)
article = db.query(EnrichedArticle).filter(
EnrichedArticle.rk_article_id == data["rk_article_id"]
@@ -146,6 +149,8 @@ def fetch_and_summarize(db: Session, hours: int = 24, limit: int = 200) -> Dict[
if stats["summarized"] % 10 == 0:
db.commit()
report_loop_progress("summarize", i + 1, len(articles), "生成摘要")
db.commit()
logger.info(
"摘要任务完成: fetched=%d, created=%d, summarized=%d",
+3
View File
@@ -5,6 +5,7 @@ from typing import List, Dict, Any, Tuple
from sqlalchemy.orm import Session
from app.task_progress import update_progress, report_loop_progress
from models import EnrichedArticle, Taxonomy
logger = logging.getLogger(__name__)
@@ -103,6 +104,7 @@ def tag_articles(db: Session, article_ids: List[int] = None) -> int:
)
articles = query.all()
update_progress("tag_score_dedup", status="running", stage="分类打标", current=0, total=len(articles))
count = 0
for article in articles:
article.category = classify_article(article, categories)
@@ -110,6 +112,7 @@ def tag_articles(db: Session, article_ids: List[int] = None) -> int:
count += 1
if count % 50 == 0:
db.commit()
report_loop_progress("tag_score_dedup", count, len(articles), "分类打标")
db.commit()
logger.info("分类/打标签完成: %d 篇文章", count)
+117
View File
@@ -0,0 +1,117 @@
"""任务进度注册表(进程内内存,线程安全)。
供手动任务、定时任务在执行过程中上报进度,前端通过
GET /api/tasks/progress 轮询读取展示。
单 workeruvicorn --workers 1)前提下,所有请求/任务线程共享同一份内存。
"""
import copy
import threading
from datetime import datetime, timezone
from typing import Optional
# 4 个稳定任务 key
TASK_KEYS = ("summarize", "tag_score_dedup", "generate_daily_brief", "bootstrap_taxonomy")
_progress: dict = {}
_lock = threading.Lock()
def _now_iso() -> str:
return datetime.now(timezone.utc).isoformat()
def _init() -> None:
"""初始化所有任务 key 为 idle"""
for key in TASK_KEYS:
_progress[key] = {
"status": "idle",
"stage": "",
"current": 0,
"total": 0,
"message": None,
"started_at": None,
"updated_at": None,
"finished_at": None,
"trigger": None,
}
_init()
def update_progress(
task_key: str,
*,
status: Optional[str] = None,
stage: Optional[str] = None,
current: Optional[int] = None,
total: Optional[int] = None,
message: Optional[str] = None,
trigger: Optional[str] = None,
) -> None:
"""合并非 None 字段并盖时间戳"""
with _lock:
entry = _progress.get(task_key)
if entry is None:
entry = {
"status": "idle", "stage": "", "current": 0, "total": 0,
"message": None, "started_at": None, "updated_at": None,
"finished_at": None, "trigger": None,
}
_progress[task_key] = entry
now = _now_iso()
if status == "running" and entry.get("started_at") is None:
entry["started_at"] = now
if status in ("success", "error"):
entry["finished_at"] = now
# 若重新进入 running,重置终态时间戳
if status == "running":
entry["finished_at"] = None
if status is not None:
entry["status"] = status
if stage is not None:
entry["stage"] = stage
if current is not None:
entry["current"] = current
if total is not None:
entry["total"] = total
if message is not None:
entry["message"] = message
if trigger is not None:
entry["trigger"] = trigger
entry["updated_at"] = now
def report_loop_progress(
task_key: str,
index: int,
total: int,
stage: str,
message: Optional[str] = None,
every: int = 5,
) -> None:
"""紧凑循环进度上报:每 `every` 次或最后一次(index==total)才上报,减少加锁"""
if index % every == 0 or index >= total:
update_progress(task_key, status="running", stage=stage, current=index, total=total, message=message)
def get_progress(task_key: Optional[str] = None) -> dict:
"""返回深拷贝(单个或全部),防止序列化期间被并发修改"""
with _lock:
if task_key is not None:
return copy.deepcopy(_progress.get(task_key))
return copy.deepcopy(_progress)
def reset_progress(task_key: str) -> None:
"""重置单个任务为 idle(前端清除终态显示用)"""
with _lock:
if task_key in _progress:
_progress[task_key] = {
"status": "idle", "stage": "", "current": 0, "total": 0,
"message": None, "started_at": None, "updated_at": None,
"finished_at": None, "trigger": None,
}
+21 -18
View File
@@ -5,8 +5,9 @@ from typing import List, Dict, Any
from sqlalchemy.orm import Session
from app.ai_client import ai_client
from app.ai_client import AIClient
from app.rss_client import rss_client
from app.task_progress import update_progress
from models import Taxonomy
logger = logging.getLogger(__name__)
@@ -40,19 +41,19 @@ TAXONOMY_SYSTEM_PROMPT = """你是一位专业的信息分类与内容分析专
3. heat_rules 和 importance_rules 各 10-20 条,weight 范围 0.5-2.0。
4. 所有 keywords 用中文或中英双语,便于后续关键词匹配。
5. 不要输出任何解释文字,只输出 JSON。
6. **分类与标签名称必须使用中性的主题领域词**(如科技、财经、文化、体育、生活、健康、设计、商业等),
禁止使用具体事件、人名、地名、国家名、机构名或任何政治/军事/冲突相关的敏感词作为名称或关键词,
以保证内容中立、避免触发内容审查。
"""
def _build_sample_prompt(articles: List[Dict[str, Any]]) -> str:
lines = [f"共有 {len(articles)} 篇文章样本:"]
for idx, art in enumerate(articles[:50], 1):
# 只用标题和来源,不带正文摘要——降低输入中的敏感内容,避免触发内容审查
lines = [f"共有 {len(articles)} 篇文章样本(仅展示标题用于归纳主题):"]
for idx, art in enumerate(articles[:40], 1):
title = art.get("title", "")
summary = art.get("summary", "") or art.get("content", "")[:300]
feed = art.get("feed_title", "")
cat = art.get("category", "")
lines.append(f"\n[{idx}] 标题:{title}")
lines.append(f" 来源:{feed} | 源分类:{cat}")
lines.append(f" 摘要:{summary[:400]}")
lines.append(f"[{idx}] {title} (来源:{feed}")
return "\n".join(lines)
@@ -72,22 +73,24 @@ def bootstrap_taxonomy(db: Session, force: bool = False) -> bool:
logger.info("强制重新初始化 taxonomy")
logger.info("开始从 rssKeeper 拉取样本文章并生成分类体系...")
update_progress("bootstrap_taxonomy", status="running", stage="拉取样本文章", current=0, total=0)
articles = rss_client.fetch_recent(hours=24 * 7, limit=200)
if not articles:
logger.warning("未获取到样本文章,无法生成分类体系")
return False
raise RuntimeError("未获取到样本文章,无法生成分类体系")
user_prompt = _build_sample_prompt(articles)
try:
result = ai_client.chat_completion_json(
system_prompt=TAXONOMY_SYSTEM_PROMPT,
user_prompt=user_prompt,
temperature=0.5,
)
except Exception as exc:
logger.error("生成分类体系失败: %s", exc)
return False
update_progress("bootstrap_taxonomy", status="running", stage="LLM 生成分类体系", current=0, total=0, message="正在调用 LLM 生成分类规则,可能需要 2-4 分钟")
# bootstrap 是一次性大任务(生成 categories+tags+rules),MiniMax-M3 reasoning 模式较慢,
# 用专用大 timeout client(默认 60s 不够),失败抛异常由调用方捕获并如实标记进度
bootstrap_ai = AIClient(timeout=300, max_retries=2)
result = bootstrap_ai.chat_completion_json(
system_prompt=TAXONOMY_SYSTEM_PROMPT,
user_prompt=user_prompt,
temperature=0.5,
)
update_progress("bootstrap_taxonomy", status="running", stage="保存规则", current=0, total=0)
_save_taxonomy(db, result)
logger.info("taxonomy 初始化完成,共写入 %d 条规则", db.query(Taxonomy).count())
return True