feat: 任务进度实时展示、接口测试、暗色主题重构及多项 bug 修复
后端
- 新增 app/task_progress.py 线程安全进度注册表
- 任务改为后台线程异步执行(_run_task_background),手动触发立即返回 task_key
- 6 个任务函数(summarizer/tagger/scorer/deduplicator/brief/taxonomy)循环内上报进度
- scheduler 定时任务同步上报进度(trigger=scheduled)
- 新增 GET /api/tasks/progress 与 POST /api/tasks/progress/reset 接口
- 新增 POST /api/test-connection 接口连通性测试(独立短超时客户端)
- 修复 ai_client/rss_client 配置在 import 时固化的 bug:该 bug 导致实际任务用 .env 假 key 调 LLM 返回 401(改为 property 运行时读取 settings)
- 修复 ai_client 对 reasoning 模型(MiniMax-M3 等)输出 <think> 块的 JSON 解析失败
- 修复 taxonomy bootstrap:LLM 超时(改用 300s 专用 client)、MiniMax 输出审查(精简样本仅标题 + 约束生成中性类目名)、失败误报 success(改抛异常如实标记)
- 修复 models.py 双外键关系映射启动崩溃(显式 foreign_keys)
- 修复 main.py SPA 路由 404、ArticleOut.published_at 序列化 500
- 移除 lifespan 同步 bootstrap 阻塞启动,改由 scheduler 后台异步执行

前端
- Deep Ink 高对比度暗色主题重构,修复 Element Plus 暗色模式对比度问题
- Tasks 页面任务进度实时展示(进度条/阶段/计数/状态/触发来源)+ 1.5s 轮询
- 接口测试面板(rssKeeper / LLM 连通性 + 延迟)
- 修复 nextJobs jobId 映射 bug

部署与文档
- Dockerfile 优化(BuildKit 缓存挂载、预编译 wheel、去 gcc、阿里云镜像源)
- 新增 API.md 接口文档

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
+84
-21
@@ -1,6 +1,7 @@
|
||||
"""LLM API 客户端,兼容 OpenAI API 格式"""
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
from openai import OpenAI, APIError
|
||||
@@ -9,9 +10,57 @@ from config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# 匹配 reasoning 模型(MiniMax-M3 / DeepSeek-R1 / GLM-Z1 等)的 <think>...</think> 推理块
|
||||
_THINK_RE = re.compile(r"<think>.*?</think>", re.DOTALL)
|
||||
|
||||
|
||||
def _parse_llm_json(content: str) -> dict:
|
||||
"""从 LLM 输出中提取 JSON。
|
||||
|
||||
兼容 reasoning 模型在 json_object 模式下仍输出 <think>...</think>
|
||||
推理块、以及 JSON 前后有多余文本的情况。
|
||||
"""
|
||||
if not content or not content.strip():
|
||||
raise ValueError("LLM 返回空内容,无法解析 JSON")
|
||||
|
||||
text = content.strip()
|
||||
# 1) 去掉闭合的 <think>...</think> 块
|
||||
text = _THINK_RE.sub("", text).strip()
|
||||
# 2) 处理只有 <think> 开头但未闭合(content 被截断)的情况
|
||||
if text.startswith("<think>"):
|
||||
text = text.split("</think>", 1)[-1].strip()
|
||||
|
||||
# 3) 尝试直接解析
|
||||
try:
|
||||
return json.loads(text)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# 4) 提取首个 { 到最后 } 之间的子串
|
||||
start = text.find("{")
|
||||
end = text.rfind("}")
|
||||
if start != -1 and end > start:
|
||||
try:
|
||||
return json.loads(text[start : end + 1])
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# 5) 兜底:尝试数组
|
||||
start = text.find("[")
|
||||
end = text.rfind("]")
|
||||
if start != -1 and end > start:
|
||||
return json.loads(text[start : end + 1])
|
||||
|
||||
logger.error("无法从 LLM 输出提取 JSON: %s", content[:500])
|
||||
raise ValueError("LLM 输出无法解析为 JSON")
|
||||
|
||||
|
||||
class AIClient:
|
||||
"""封装 LLM 调用,支持重试和 JSON 输出"""
|
||||
"""封装 LLM 调用,支持重试和 JSON 输出。
|
||||
|
||||
配置以 property 形式运行时从 settings 读取,避免模块 import 时
|
||||
固化旧值(settings 在 FastAPI lifespan 启动后才会被数据库配置覆盖)。
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -21,24 +70,42 @@ class AIClient:
|
||||
timeout: Optional[int] = None,
|
||||
max_retries: Optional[int] = None,
|
||||
):
|
||||
self.api_key = api_key or settings.OPENAI_API_KEY
|
||||
self.base_url = base_url or settings.OPENAI_BASE_URL
|
||||
self.model = model or settings.OPENAI_MODEL
|
||||
self.timeout = timeout or settings.OPENAI_TIMEOUT
|
||||
self.max_retries = max_retries or settings.OPENAI_MAX_RETRIES
|
||||
# 仅保存显式传入的覆盖值;为 None 时运行时回退到 settings
|
||||
self._api_key = api_key
|
||||
self._base_url = base_url
|
||||
self._model = model
|
||||
self._timeout = timeout
|
||||
self._max_retries = max_retries
|
||||
|
||||
self._client: Optional[OpenAI] = None
|
||||
@property
|
||||
def api_key(self) -> str:
|
||||
return self._api_key or settings.OPENAI_API_KEY
|
||||
|
||||
@property
|
||||
def base_url(self) -> str:
|
||||
return self._base_url or settings.OPENAI_BASE_URL
|
||||
|
||||
@property
|
||||
def model(self) -> str:
|
||||
return self._model or settings.OPENAI_MODEL
|
||||
|
||||
@property
|
||||
def timeout(self) -> int:
|
||||
return self._timeout or settings.OPENAI_TIMEOUT
|
||||
|
||||
@property
|
||||
def max_retries(self) -> int:
|
||||
return self._max_retries or settings.OPENAI_MAX_RETRIES
|
||||
|
||||
@property
|
||||
def client(self) -> OpenAI:
|
||||
if self._client is None:
|
||||
self._client = OpenAI(
|
||||
api_key=self.api_key,
|
||||
base_url=self.base_url,
|
||||
timeout=self.timeout,
|
||||
max_retries=self.max_retries,
|
||||
)
|
||||
return self._client
|
||||
# 每次按最新配置创建,确保用到启动后覆盖的真实配置
|
||||
return OpenAI(
|
||||
api_key=self.api_key,
|
||||
base_url=self.base_url,
|
||||
timeout=self.timeout,
|
||||
max_retries=self.max_retries,
|
||||
)
|
||||
|
||||
def chat_completion(
|
||||
self,
|
||||
@@ -75,18 +142,14 @@ class AIClient:
|
||||
user_prompt: str,
|
||||
temperature: float = 0.3,
|
||||
) -> dict:
|
||||
"""调用 LLM 并解析返回的 JSON"""
|
||||
"""调用 LLM 并解析返回的 JSON(兼容 reasoning 模型的 <think> 块)"""
|
||||
content = self.chat_completion(
|
||||
system_prompt=system_prompt,
|
||||
user_prompt=user_prompt,
|
||||
temperature=temperature,
|
||||
json_mode=True,
|
||||
)
|
||||
try:
|
||||
return json.loads(content)
|
||||
except json.JSONDecodeError as exc:
|
||||
logger.error("LLM 返回不是合法 JSON: %s - content=%s", exc, content[:500])
|
||||
raise
|
||||
return _parse_llm_json(content)
|
||||
|
||||
|
||||
ai_client = AIClient()
|
||||
|
||||
@@ -9,6 +9,7 @@ from sqlalchemy.orm import Session
|
||||
|
||||
from config import settings
|
||||
from models import EnrichedArticle, DailyBrief
|
||||
from app.task_progress import update_progress
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -76,6 +77,7 @@ def generate_daily_brief(db: Session, date_str: str = None, force: bool = False)
|
||||
existing = db.query(DailyBrief).filter(DailyBrief.brief_date == date_str).first()
|
||||
if existing and not force:
|
||||
logger.info("日期 %s 简报已存在,跳过生成", date_str)
|
||||
update_progress("generate_daily_brief", status="running", stage="简报已存在", current=0, total=0, message="简报已存在,跳过生成")
|
||||
return {
|
||||
"date": date_str,
|
||||
"total_articles": existing.total_articles,
|
||||
@@ -86,6 +88,8 @@ def generate_daily_brief(db: Session, date_str: str = None, force: bool = False)
|
||||
day_start = datetime.strptime(date_str, "%Y-%m-%d")
|
||||
day_end = day_start + timedelta(days=1)
|
||||
|
||||
update_progress("generate_daily_brief", status="running", stage="加载文章", current=0, total=0)
|
||||
|
||||
# 取当天去重后的代表文章
|
||||
query = (
|
||||
db.query(EnrichedArticle)
|
||||
@@ -106,6 +110,7 @@ def generate_daily_brief(db: Session, date_str: str = None, force: bool = False)
|
||||
)
|
||||
|
||||
# 按分类分组并排序
|
||||
update_progress("generate_daily_brief", status="running", stage="按分类整理", current=0, total=0)
|
||||
by_category: Dict[str, List[Dict[str, Any]]] = {}
|
||||
for art in representative_articles:
|
||||
cat = art.category or "未分类"
|
||||
@@ -127,6 +132,7 @@ def generate_daily_brief(db: Session, date_str: str = None, force: bool = False)
|
||||
}
|
||||
|
||||
# 生成 Markdown 文件
|
||||
update_progress("generate_daily_brief", status="running", stage="生成 Markdown", current=0, total=0)
|
||||
output_dir = settings.brief_output_dir_path / date_str
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
markdown_path = output_dir / "daily-brief.md"
|
||||
@@ -134,6 +140,7 @@ def generate_daily_brief(db: Session, date_str: str = None, force: bool = False)
|
||||
markdown_path.write_text(markdown_content, encoding="utf-8")
|
||||
|
||||
# 更新文章 brief_date
|
||||
update_progress("generate_daily_brief", status="running", stage="保存简报", current=0, total=0)
|
||||
for art in representative_articles:
|
||||
art.brief_date = date_str
|
||||
|
||||
|
||||
+7
-1
@@ -12,6 +12,7 @@ import numpy as np
|
||||
|
||||
from config import settings
|
||||
from models import EnrichedArticle, DuplicateGroup
|
||||
from app.task_progress import update_progress, report_loop_progress
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -172,8 +173,11 @@ def deduplicate_articles(
|
||||
|
||||
if not articles:
|
||||
logger.info("日期 %s 无文章可去重", date_str)
|
||||
update_progress("tag_score_dedup", status="running", stage="去重", current=0, total=0, message="无文章可去重")
|
||||
return {"total": 0, "duplicate_groups": 0, "representatives": 0}
|
||||
|
||||
update_progress("tag_score_dedup", status="running", stage="计算相似度并去重", current=0, total=0)
|
||||
|
||||
# 先 URL 去重:相同 link 只保留一篇
|
||||
unique_articles: List[EnrichedArticle] = []
|
||||
seen_links: set = set()
|
||||
@@ -194,8 +198,9 @@ def deduplicate_articles(
|
||||
)
|
||||
|
||||
stats = {"total": len(articles), "duplicate_groups": len(clusters), "representatives": 0}
|
||||
update_progress("tag_score_dedup", status="running", stage="写入重复组", current=0, total=len(clusters))
|
||||
|
||||
for cluster in clusters:
|
||||
for ci, cluster in enumerate(clusters):
|
||||
representative = _pick_representative(unique_articles, cluster)
|
||||
member_ids = [unique_articles[i].id for i in cluster]
|
||||
|
||||
@@ -214,6 +219,7 @@ def deduplicate_articles(
|
||||
art.is_representative = (art.id == representative.id)
|
||||
|
||||
stats["representatives"] += 1
|
||||
report_loop_progress("tag_score_dedup", ci + 1, len(clusters), "写入重复组")
|
||||
|
||||
db.commit()
|
||||
logger.info(
|
||||
|
||||
+16
-4
@@ -11,11 +11,23 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RSSKeeperClient:
|
||||
"""rssKeeper 外部 API 客户端"""
|
||||
"""rssKeeper 外部 API 客户端。
|
||||
|
||||
def __init__(self, base_url: Optional[str] = None, timeout: int = 30):
|
||||
self.base_url = (base_url or settings.RSSKEEPER_BASE_URL).rstrip("/")
|
||||
self.timeout = timeout
|
||||
配置以 property 形式运行时从 settings 读取,避免模块 import 时
|
||||
固化旧值(settings 在 FastAPI lifespan 启动后才会被数据库配置覆盖)。
|
||||
"""
|
||||
|
||||
def __init__(self, base_url: Optional[str] = None, timeout: Optional[int] = None):
|
||||
self._base_url = base_url
|
||||
self._timeout = timeout
|
||||
|
||||
@property
|
||||
def base_url(self) -> str:
|
||||
return (self._base_url or settings.RSSKEEPER_BASE_URL).rstrip("/")
|
||||
|
||||
@property
|
||||
def timeout(self) -> int:
|
||||
return self._timeout if self._timeout is not None else 30
|
||||
|
||||
def _get(self, path: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
||||
url = f"{self.base_url}{path}"
|
||||
|
||||
@@ -8,6 +8,7 @@ from sqlalchemy.orm import Session
|
||||
|
||||
from config import settings
|
||||
from models import EnrichedArticle, Taxonomy
|
||||
from app.task_progress import update_progress, report_loop_progress
|
||||
from app.tagger import _count_matches, _normalize
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -119,6 +120,7 @@ def score_articles(
|
||||
query = query.filter(EnrichedArticle.id.in_(article_ids))
|
||||
|
||||
articles = query.all()
|
||||
update_progress("tag_score_dedup", status="running", stage="计算分数", current=0, total=len(articles))
|
||||
count = 0
|
||||
for article in articles:
|
||||
article.heat_score = compute_heat_score(article, heat_rules)
|
||||
@@ -141,6 +143,7 @@ def score_articles(
|
||||
count += 1
|
||||
if count % 50 == 0:
|
||||
db.commit()
|
||||
report_loop_progress("tag_score_dedup", count, len(articles), "计算分数")
|
||||
|
||||
db.commit()
|
||||
logger.info("打分完成: %d 篇文章", count)
|
||||
|
||||
+6
-1
@@ -7,6 +7,7 @@ from sqlalchemy.orm import Session
|
||||
|
||||
from app.ai_client import ai_client
|
||||
from app.rss_client import rss_client
|
||||
from app.task_progress import update_progress, report_loop_progress
|
||||
from config import settings
|
||||
from models import EnrichedArticle
|
||||
|
||||
@@ -109,11 +110,13 @@ def fetch_and_summarize(db: Session, hours: int = 24, limit: int = 200) -> Dict[
|
||||
articles = rss_client.fetch_recent(hours=hours, limit=limit)
|
||||
if not articles:
|
||||
logger.info("未拉取到新文章")
|
||||
update_progress("summarize", status="running", stage="无新文章", current=0, total=0, message="未拉取到新文章")
|
||||
return {"fetched": 0, "created": 0, "summarized": 0}
|
||||
|
||||
stats = {"fetched": len(articles), "created": 0, "summarized": 0}
|
||||
update_progress("summarize", status="running", stage="拉取文章并生成摘要", current=0, total=len(articles))
|
||||
|
||||
for raw in articles:
|
||||
for i, raw in enumerate(articles):
|
||||
data = _article_from_rss(raw)
|
||||
article = db.query(EnrichedArticle).filter(
|
||||
EnrichedArticle.rk_article_id == data["rk_article_id"]
|
||||
@@ -146,6 +149,8 @@ def fetch_and_summarize(db: Session, hours: int = 24, limit: int = 200) -> Dict[
|
||||
if stats["summarized"] % 10 == 0:
|
||||
db.commit()
|
||||
|
||||
report_loop_progress("summarize", i + 1, len(articles), "生成摘要")
|
||||
|
||||
db.commit()
|
||||
logger.info(
|
||||
"摘要任务完成: fetched=%d, created=%d, summarized=%d",
|
||||
|
||||
@@ -5,6 +5,7 @@ from typing import List, Dict, Any, Tuple
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.task_progress import update_progress, report_loop_progress
|
||||
from models import EnrichedArticle, Taxonomy
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -103,6 +104,7 @@ def tag_articles(db: Session, article_ids: List[int] = None) -> int:
|
||||
)
|
||||
|
||||
articles = query.all()
|
||||
update_progress("tag_score_dedup", status="running", stage="分类打标", current=0, total=len(articles))
|
||||
count = 0
|
||||
for article in articles:
|
||||
article.category = classify_article(article, categories)
|
||||
@@ -110,6 +112,7 @@ def tag_articles(db: Session, article_ids: List[int] = None) -> int:
|
||||
count += 1
|
||||
if count % 50 == 0:
|
||||
db.commit()
|
||||
report_loop_progress("tag_score_dedup", count, len(articles), "分类打标")
|
||||
|
||||
db.commit()
|
||||
logger.info("分类/打标签完成: %d 篇文章", count)
|
||||
|
||||
@@ -0,0 +1,117 @@
|
||||
"""任务进度注册表(进程内内存,线程安全)。
|
||||
|
||||
供手动任务、定时任务在执行过程中上报进度,前端通过
|
||||
GET /api/tasks/progress 轮询读取展示。
|
||||
|
||||
单 worker(uvicorn --workers 1)前提下,所有请求/任务线程共享同一份内存。
|
||||
"""
|
||||
import copy
|
||||
import threading
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
# The four stable task keys; each always has a progress entry.
TASK_KEYS = ("summarize", "tag_score_dedup", "generate_daily_brief", "bootstrap_taxonomy")

# task_key -> progress entry; every read/write after import goes through _lock.
_progress: dict = {}
_lock = threading.Lock()


def _now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


def _new_entry() -> dict:
    """Return a fresh progress entry in the idle state."""
    return {
        "status": "idle",
        "stage": "",
        "current": 0,
        "total": 0,
        "message": None,
        "started_at": None,
        "updated_at": None,
        "finished_at": None,
        "trigger": None,
    }


def _init() -> None:
    """Register an idle entry for every key in TASK_KEYS."""
    for key in TASK_KEYS:
        _progress[key] = _new_entry()


_init()


def update_progress(
    task_key: str,
    *,
    status: Optional[str] = None,
    stage: Optional[str] = None,
    current: Optional[int] = None,
    total: Optional[int] = None,
    message: Optional[str] = None,
    trigger: Optional[str] = None,
) -> None:
    """Merge the non-None fields into a task's entry and stamp timestamps.

    An entry is created on the fly for a ``task_key`` that is not registered.

    Args:
        task_key: Key of the task being reported.
        status: New status. "running" marks a (re)start; "success" and
            "error" mark the run as finished.
        stage: Label of the current stage.
        current: Number of items processed so far.
        total: Total number of items.
        message: Free-form message. ``None`` keeps the previous message.
        trigger: What started the task. ``None`` keeps the previous value.
    """
    with _lock:
        entry = _progress.get(task_key)
        if entry is None:
            entry = _progress[task_key] = _new_entry()

        now = _now_iso()
        if status == "running":
            # Stamp the start time when a run begins. Entering "running" from
            # any other status is a new run, so a stale start time from a
            # previous run must not be kept.
            if entry.get("started_at") is None or entry.get("status") != "running":
                entry["started_at"] = now
            # Re-entering "running" clears the terminal timestamp.
            entry["finished_at"] = None
        elif status in ("success", "error"):
            entry["finished_at"] = now

        updates = {
            "status": status,
            "stage": stage,
            "current": current,
            "total": total,
            "message": message,
            "trigger": trigger,
        }
        entry.update({field: value for field, value in updates.items() if value is not None})
        entry["updated_at"] = now


def report_loop_progress(
    task_key: str,
    index: int,
    total: int,
    stage: str,
    message: Optional[str] = None,
    every: int = 5,
) -> None:
    """Report loop progress, throttled to reduce lock traffic.

    An update is written only on every ``every``-th item or once ``index``
    has reached ``total``. A non-positive ``every`` reports every item
    instead of raising ``ZeroDivisionError``.

    Args:
        task_key: Key of the task being reported.
        index: 1-based count of items processed so far.
        total: Total number of items in the loop.
        stage: Label of the current stage.
        message: Optional free-form message.
        every: Reporting interval in items.
    """
    if every <= 0 or index % every == 0 or index >= total:
        update_progress(task_key, status="running", stage=stage, current=index, total=total, message=message)


def get_progress(task_key: Optional[str] = None) -> Optional[dict]:
    """Return a deep copy of one entry, or of the whole registry.

    The copy keeps the caller's snapshot from being mutated by concurrent
    updates while it is being serialized.

    Args:
        task_key: Key to look up; ``None`` returns all entries keyed by task.

    Returns:
        The copied entry (``None`` for an unknown key), or a dict of all
        entries when ``task_key`` is ``None``.
    """
    with _lock:
        if task_key is not None:
            return copy.deepcopy(_progress.get(task_key))
        return copy.deepcopy(_progress)


def reset_progress(task_key: str) -> None:
    """Reset one task's entry to idle; unknown keys are ignored."""
    with _lock:
        if task_key in _progress:
            _progress[task_key] = _new_entry()
|
||||
+21
-18
@@ -5,8 +5,9 @@ from typing import List, Dict, Any
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.ai_client import ai_client
|
||||
from app.ai_client import AIClient
|
||||
from app.rss_client import rss_client
|
||||
from app.task_progress import update_progress
|
||||
from models import Taxonomy
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -40,19 +41,19 @@ TAXONOMY_SYSTEM_PROMPT = """你是一位专业的信息分类与内容分析专
|
||||
3. heat_rules 和 importance_rules 各 10-20 条,weight 范围 0.5-2.0。
|
||||
4. 所有 keywords 用中文或中英双语,便于后续关键词匹配。
|
||||
5. 不要输出任何解释文字,只输出 JSON。
|
||||
6. **分类与标签名称必须使用中性的主题领域词**(如科技、财经、文化、体育、生活、健康、设计、商业等),
|
||||
禁止使用具体事件、人名、地名、国家名、机构名或任何政治/军事/冲突相关的敏感词作为名称或关键词,
|
||||
以保证内容中立、避免触发内容审查。
|
||||
"""
|
||||
|
||||
|
||||
def _build_sample_prompt(articles: List[Dict[str, Any]]) -> str:
|
||||
lines = [f"共有 {len(articles)} 篇文章样本:"]
|
||||
for idx, art in enumerate(articles[:50], 1):
|
||||
# 只用标题和来源,不带正文摘要——降低输入中的敏感内容,避免触发内容审查
|
||||
lines = [f"共有 {len(articles)} 篇文章样本(仅展示标题用于归纳主题):"]
|
||||
for idx, art in enumerate(articles[:40], 1):
|
||||
title = art.get("title", "")
|
||||
summary = art.get("summary", "") or art.get("content", "")[:300]
|
||||
feed = art.get("feed_title", "")
|
||||
cat = art.get("category", "")
|
||||
lines.append(f"\n[{idx}] 标题:{title}")
|
||||
lines.append(f" 来源:{feed} | 源分类:{cat}")
|
||||
lines.append(f" 摘要:{summary[:400]}")
|
||||
lines.append(f"[{idx}] {title} (来源:{feed})")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
@@ -72,22 +73,24 @@ def bootstrap_taxonomy(db: Session, force: bool = False) -> bool:
|
||||
logger.info("强制重新初始化 taxonomy")
|
||||
|
||||
logger.info("开始从 rssKeeper 拉取样本文章并生成分类体系...")
|
||||
update_progress("bootstrap_taxonomy", status="running", stage="拉取样本文章", current=0, total=0)
|
||||
articles = rss_client.fetch_recent(hours=24 * 7, limit=200)
|
||||
if not articles:
|
||||
logger.warning("未获取到样本文章,无法生成分类体系")
|
||||
return False
|
||||
raise RuntimeError("未获取到样本文章,无法生成分类体系")
|
||||
|
||||
user_prompt = _build_sample_prompt(articles)
|
||||
try:
|
||||
result = ai_client.chat_completion_json(
|
||||
system_prompt=TAXONOMY_SYSTEM_PROMPT,
|
||||
user_prompt=user_prompt,
|
||||
temperature=0.5,
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.error("生成分类体系失败: %s", exc)
|
||||
return False
|
||||
update_progress("bootstrap_taxonomy", status="running", stage="LLM 生成分类体系", current=0, total=0, message="正在调用 LLM 生成分类规则,可能需要 2-4 分钟")
|
||||
# bootstrap 是一次性大任务(生成 categories+tags+rules),MiniMax-M3 reasoning 模式较慢,
|
||||
# 用专用大 timeout client(默认 60s 不够),失败抛异常由调用方捕获并如实标记进度
|
||||
bootstrap_ai = AIClient(timeout=300, max_retries=2)
|
||||
result = bootstrap_ai.chat_completion_json(
|
||||
system_prompt=TAXONOMY_SYSTEM_PROMPT,
|
||||
user_prompt=user_prompt,
|
||||
temperature=0.5,
|
||||
)
|
||||
|
||||
update_progress("bootstrap_taxonomy", status="running", stage="保存规则", current=0, total=0)
|
||||
_save_taxonomy(db, result)
|
||||
logger.info("taxonomy 初始化完成,共写入 %d 条规则", db.query(Taxonomy).count())
|
||||
return True
|
||||
|
||||
Reference in New Issue
Block a user