Files
congsh 778ccefb22 feat: 任务进度实时展示、接口测试、暗色主题重构及多项 bug 修复
后端
- 新增 app/task_progress.py 线程安全进度注册表
- 任务改为后台线程异步执行(_run_task_background),手动触发立即返回 task_key
- 6 个任务函数(summarizer/tagger/scorer/deduplicator/brief/taxonomy)循环内上报进度
- scheduler 定时任务同步上报进度(trigger=scheduled)
- 新增 GET /api/tasks/progress 与 POST /api/tasks/progress/reset 接口
- 新增 POST /api/test-connection 接口连通性测试(独立短超时客户端)
- 修复 ai_client/rss_client 配置在 import 时固化的 bug:该问题导致实际任务使用 .env 中的假 key 调用 LLM 并返回 401
  (已改为通过 property 在运行时读取 settings)
- 修复 ai_client 对 reasoning 模型(MiniMax-M3 等)输出 <think> 块的 JSON 解析失败
- 修复 taxonomy bootstrap:LLM 超时(改用 300s 专用 client)、MiniMax 输出审查
  (精简样本仅标题 + 约束生成中性类目名)、失败误报 success(改抛异常如实标记)
- 修复 models.py 双外键关系映射启动崩溃(显式 foreign_keys)
- 修复 main.py SPA 路由 404、ArticleOut.published_at 序列化 500
- 移除 lifespan 同步 bootstrap 阻塞启动,改由 scheduler 后台异步执行

前端
- Deep Ink 高对比度暗色主题重构,修复 Element Plus 暗色模式对比度问题
- Tasks 页面任务进度实时展示(进度条/阶段/计数/状态/触发来源)+ 1.5s 轮询
- 接口测试面板(rssKeeper / LLM 连通性 + 延迟)
- 修复 nextJobs jobId 映射 bug

部署与文档
- Dockerfile 优化(BuildKit 缓存挂载、预编译 wheel、去 gcc、阿里云镜像源)
- 新增 API.md 接口文档

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-14 15:14:40 +08:00

160 lines
5.7 KiB
Python

"""文章摘要生成器:对无摘要或短摘要文章调用 LLM 生成 AI 摘要"""
import logging
from datetime import datetime, timezone
from typing import List, Dict, Any
from sqlalchemy.orm import Session
from app.ai_client import ai_client
from app.rss_client import rss_client
from app.task_progress import update_progress, report_loop_progress
from config import settings
from models import EnrichedArticle
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# System prompt (in Chinese) sent with every summary request. It asks for a
# concise Chinese summary that stays within `{max_length}` Chinese characters,
# covers the 1-3 most important points, adds no personal opinion, does not
# repeat the title, and is written in Chinese even for English sources.
# `{max_length}` is a str.format placeholder filled in by _generate_summary.
SUMMARY_SYSTEM_PROMPT = """你是一位擅长阅读 RSS 新闻并提炼摘要的助手。
请用简洁流畅的中文总结文章核心内容,要求:
1. 长度控制在 {max_length} 个汉字以内。
2. 包含文章最重要的 1-3 个要点。
3. 不要添加个人评价,不要复述原文标题。
4. 若原文是英文,请用中文输出摘要。
"""
# User prompt template (in Chinese) carrying the article itself. The
# str.format placeholders are `{title}`, `{author}`, `{feed_title}` (source)
# and `{content}` (body text); they are filled in by _generate_summary.
SUMMARY_USER_PROMPT_TEMPLATE = """请为以下文章生成摘要。
标题:{title}
作者:{author}
来源:{feed_title}
正文:
{content}
"""
def _needs_summary(article: EnrichedArticle) -> bool:
    """Decide whether an AI summary should be generated for ``article``.

    Returns ``True`` when the article has no AI summary yet, or when its
    original (feed-provided) summary, after stripping whitespace, is shorter
    than ``settings.MIN_ORIGINAL_SUMMARY_LENGTH``.

    NOTE(review): an article that already has an AI summary is still reported
    as needing one whenever its original summary is short, so it looks like
    such articles get re-summarized each time they are fetched again --
    confirm this is intended.
    """
    if article.ai_summary:
        # An AI summary exists; the only remaining trigger is a short
        # original summary.
        feed_summary = (article.original_summary or "").strip()
        return len(feed_summary) < settings.MIN_ORIGINAL_SUMMARY_LENGTH
    return True
def _prepare_content(raw_content: str, max_chars: int = 8000) -> str:
"""清洗并截断正文,避免超过 LLM 上下文"""
text = raw_content or ""
# 简单去除多余空白
text = " ".join(text.split())
return text[:max_chars]
def _generate_summary(article: EnrichedArticle) -> str:
    """Ask the LLM for a summary of a single article.

    The prompt body is taken from the article content, falling back to the
    original feed summary and finally to the title when nothing else is
    available. Failures are logged and reported as an empty string so the
    caller can simply skip the article.

    Args:
        article: Article whose title, author, feed title and content are
            interpolated into the user prompt.

    Returns:
        The generated summary with surrounding whitespace removed and
        truncated to ``settings.MAX_AI_SUMMARY_LENGTH`` characters, or ``""``
        when the LLM call fails or its reply is empty/whitespace-only.
    """
    max_length = settings.MAX_AI_SUMMARY_LENGTH

    content = _prepare_content(article.content or article.original_summary or "")
    if not content.strip():
        # Neither body nor feed summary is available: summarize the title.
        content = article.title or ""

    system_prompt = SUMMARY_SYSTEM_PROMPT.format(max_length=max_length)
    user_prompt = SUMMARY_USER_PROMPT_TEMPLATE.format(
        title=article.title or "",
        author=article.author or "",
        feed_title=article.feed_title or "",
        content=content,
    )

    try:
        summary = ai_client.chat_completion(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            temperature=0.3,
        )
        # Normalize inside the try block so an unexpected reply type is
        # handled the same way as a failed request.
        cleaned = (summary or "").strip()
    except Exception as exc:
        # Deliberately broad: one failing article must not abort the batch.
        # %s (not %d) keeps the log line intact even if the id is not an int.
        logger.error("生成 article_id=%s 摘要失败: %s", article.rk_article_id, exc)
        return ""

    if not cleaned:
        # A blank reply would otherwise be stored as a truthy "summary".
        logger.warning("article_id=%s 的 LLM 返回内容为空,跳过摘要", article.rk_article_id)
        return ""
    return cleaned[:max_length]
def _article_from_rss(raw: Dict[str, Any]) -> Dict[str, Any]:
"""把 rssKeeper 返回的文章转换为可写入 enriched 表的字典"""
published_at = raw.get("published_at")
if isinstance(published_at, str):
try:
published_at = datetime.fromisoformat(published_at.replace("Z", "+00:00"))
except Exception:
published_at = None
return {
"rk_article_id": raw["id"],
"title": raw.get("title", "") or "",
"link": raw.get("link", "") or "",
"feed_id": raw.get("feed_id", 0),
"feed_title": raw.get("feed_title", "") or "",
"feed_category": raw.get("category", "") or "",
"author": raw.get("author", "") or "",
"published_at": published_at,
"original_summary": raw.get("summary", "") or "",
"content": raw.get("content", "") or "",
}
def _refresh_existing(article: EnrichedArticle, data: Dict[str, Any]) -> None:
    """Copy non-empty incoming base fields onto a stored article and stamp fetched_at."""
    refreshable = (
        "title",
        "link",
        "feed_title",
        "feed_category",
        "author",
        "published_at",
        "original_summary",
        "content",
    )
    for field in refreshable:
        incoming = data[field]
        # Empty incoming values never overwrite what is already stored.
        if incoming:
            setattr(article, field, incoming)
    article.fetched_at = datetime.now(timezone.utc)


def fetch_and_summarize(db: Session, hours: int = 24, limit: int = 200) -> Dict[str, int]:
    """Fetch recent articles, upsert them, and add AI summaries where needed.

    Each article returned by ``rss_client.fetch_recent`` is looked up by
    ``rk_article_id``: unknown articles are inserted, known ones have their
    base fields refreshed. Articles selected by ``_needs_summary`` are sent to
    the LLM, and a non-empty result is stored as ``ai_summary``. Progress is
    reported under the ``"summarize"`` task key after every article.

    Args:
        db: SQLAlchemy session used for queries and commits.
        hours: Passed to ``rss_client.fetch_recent`` as ``hours``.
        limit: Passed to ``rss_client.fetch_recent`` as ``limit``.

    Returns:
        Counters ``{"fetched": x, "created": y, "summarized": z}``.
    """
    articles = rss_client.fetch_recent(hours=hours, limit=limit)
    if not articles:
        logger.info("未拉取到新文章")
        # NOTE(review): status stays "running" here; presumably the caller
        # marks the task as finished -- confirm.
        update_progress("summarize", status="running", stage="无新文章", current=0, total=0, message="未拉取到新文章")
        return {"fetched": 0, "created": 0, "summarized": 0}

    total = len(articles)
    commit_every = 10
    stats = {"fetched": total, "created": 0, "summarized": 0}
    update_progress("summarize", status="running", stage="拉取文章并生成摘要", current=0, total=total)

    for processed, raw in enumerate(articles, start=1):
        data = _article_from_rss(raw)
        article = (
            db.query(EnrichedArticle)
            .filter(EnrichedArticle.rk_article_id == data["rk_article_id"])
            .first()
        )
        if article is None:
            article = EnrichedArticle(**data)
            db.add(article)
            # Flush so a duplicate id later in this batch finds the new row.
            db.flush()
            stats["created"] += 1
        else:
            _refresh_existing(article, data)

        if _needs_summary(article):
            ai_summary = _generate_summary(article)
            if ai_summary:
                article.ai_summary = ai_summary
                stats["summarized"] += 1

        # Commit every `commit_every` processed articles to keep transactions
        # short. Keying on the loop position (rather than the summarized
        # counter) keeps the cadence steady even when no summary is produced.
        if processed % commit_every == 0:
            db.commit()

        report_loop_progress("summarize", processed, total, "生成摘要")

    # Persist whatever is left from the last partial batch.
    db.commit()
    logger.info(
        "摘要任务完成: fetched=%d, created=%d, summarized=%d",
        stats["fetched"], stats["created"], stats["summarized"]
    )
    return stats