Files
congsh 778ccefb22 feat: 任务进度实时展示、接口测试、暗色主题重构及多项 bug 修复
后端
- 新增 app/task_progress.py 线程安全进度注册表
- 任务改为后台线程异步执行(_run_task_background),手动触发立即返回 task_key
- 6 个任务函数(summarizer/tagger/scorer/deduplicator/brief/taxonomy)循环内上报进度
- scheduler 定时任务同步上报进度(trigger=scheduled)
- 新增 GET /api/tasks/progress 与 POST /api/tasks/progress/reset 接口
- 新增 POST /api/test-connection 接口连通性测试(独立短超时客户端)
- 修复 ai_client/rss_client 配置在 import 时固化的 bug(该 bug 导致实际任务使用 .env 中的假 key 调用 LLM 并返回 401;
  现改为 property,在运行时读取 settings)
- 修复 ai_client 对 reasoning 模型(MiniMax-M3 等)输出 <think> 块的 JSON 解析失败
- 修复 taxonomy bootstrap:LLM 超时(改用 300s 专用 client)、MiniMax 输出审查
  (精简样本仅标题 + 约束生成中性类目名)、失败误报 success(改抛异常如实标记)
- 修复 models.py 双外键关系映射启动崩溃(显式 foreign_keys)
- 修复 main.py SPA 路由 404、ArticleOut.published_at 序列化 500
- 移除 lifespan 同步 bootstrap 阻塞启动,改由 scheduler 后台异步执行

前端
- Deep Ink 高对比度暗色主题重构,修复 Element Plus 暗色模式对比度问题
- Tasks 页面任务进度实时展示(进度条/阶段/计数/状态/触发来源)+ 1.5s 轮询
- 接口测试面板(rssKeeper / LLM 连通性 + 延迟)
- 修复 nextJobs jobId 映射 bug

部署与文档
- Dockerfile 优化(BuildKit 缓存挂载、预编译 wheel、去 gcc、阿里云镜像源)
- 新增 API.md 接口文档

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-14 15:14:40 +08:00

151 lines
4.8 KiB
Python

"""基于规则计算文章热度、重要性、重复性分数"""
import logging
import math
from datetime import datetime, timedelta, timezone
from typing import List, Optional

from sqlalchemy.orm import Session

from config import settings
from models import EnrichedArticle, Taxonomy
from app.task_progress import update_progress, report_loop_progress
from app.tagger import _count_matches, _normalize
logger = logging.getLogger(__name__)
# Composite score weights: heat 30%, importance 50%, duplication 20%.
COMPOSITE_WEIGHT_HEAT = 0.3
COMPOSITE_WEIGHT_IMPORTANCE = 0.5
COMPOSITE_WEIGHT_DUPLICATION = 0.2
def _build_text(article: EnrichedArticle) -> str:
    """Assemble the text that scoring rules are matched against.

    Joins the title, a summary and the content with single spaces. The
    summary is ``ai_summary`` when truthy, otherwise ``original_summary``.
    Falsy fields contribute an empty string, so both separators are always
    present in the result.
    """
    title = article.title or ""
    summary = article.ai_summary or article.original_summary or ""
    body = article.content or ""
    return " ".join((title, summary, body))
def _score_by_rules(article: EnrichedArticle, rules: List[Taxonomy]) -> float:
    """Score an article by keyword hits against a list of rules.

    Every rule with at least one hit adds ``min(hits, 5) * rule.weight * 10``,
    so a rule's hits are capped at five and its weight scales its
    contribution. The accumulated total is capped at 100.

    Returns 0.0 when the article text is blank or ``rules`` is empty.
    """
    text = _build_text(article)
    if not (text.strip() and rules):
        return 0.0

    total = 0.0
    for rule in rules:
        hit_count = _count_matches(text, rule.keywords or [])
        if hit_count <= 0:
            continue
        total += min(hit_count, 5) * rule.weight * 10
    return min(total, 100.0)
def _freshness_score(article: EnrichedArticle) -> float:
    """Return a freshness bonus between 0 and 20 based on publication time.

    Articles at most 24 hours old (future-dated ones included) receive the
    full 20 points. The bonus then decreases linearly and reaches 0 at
    72 hours. An article without ``published_at`` receives 0.
    """
    current_time = datetime.now(timezone.utc)
    published_at = article.published_at
    if not published_at:
        return 0.0

    # published_at read from the database may be naive; treat it as UTC.
    if published_at.tzinfo is None:
        published_at = published_at.replace(tzinfo=timezone.utc)

    # A publication time in the future is clamped to an age of zero.
    age_hours = max((current_time - published_at).total_seconds() / 3600, 0)

    if age_hours <= 24:
        return 20.0
    if age_hours >= 72:
        return 0.0
    # Linear decay across the 48 hours between the 24h and 72h marks.
    return 20.0 * (1 - (age_hours - 24) / 48)
def compute_heat_score(article: EnrichedArticle, heat_rules: List[Taxonomy]) -> float:
    """Heat score: keyword-rule score plus freshness bonus, capped at 100."""
    combined = _score_by_rules(article, heat_rules) + _freshness_score(article)
    return min(combined, 100.0)
def compute_importance_score(article: EnrichedArticle, importance_rules: List[Taxonomy]) -> float:
    """Importance score: the keyword-rule score for the importance rules."""
    importance = _score_by_rules(article, importance_rules)
    return importance
def compute_duplication_score(duplicate_count: int, max_count: int = 5) -> float:
    """Map how often a topic occurs to a duplication score in [0, 100].

    The score rises linearly from 0 at one occurrence to 100 at
    ``max_count`` occurrences and stays at 100 beyond that.

    Args:
        duplicate_count: Number of occurrences of the topic.
        max_count: Occurrence count at which the score saturates at 100.

    Returns:
        0.0 when ``duplicate_count`` is 1 or less, otherwise the linearly
        scaled score capped at 100.0.
    """
    if duplicate_count <= 1:
        return 0.0
    # With max_count <= 1 the linear ramp has zero or negative width, which
    # would divide by zero or produce a negative score. Any count above 1 is
    # already at or past the saturation point, so return the maximum.
    if max_count <= 1:
        return 100.0
    return min((duplicate_count - 1) / (max_count - 1) * 100.0, 100.0)
def compute_composite_score(heat: float, importance: float, duplication: float) -> float:
    """Return the weighted sum of the three scores, rounded to 2 decimals.

    The weights are the module-level ``COMPOSITE_WEIGHT_*`` constants.
    """
    weighted_heat = heat * COMPOSITE_WEIGHT_HEAT
    weighted_importance = importance * COMPOSITE_WEIGHT_IMPORTANCE
    weighted_duplication = duplication * COMPOSITE_WEIGHT_DUPLICATION
    return round(weighted_heat + weighted_importance + weighted_duplication, 2)
def score_articles(
    db: Session,
    article_ids: Optional[List[int]] = None,
    update_duplication: bool = False,
) -> int:
    """Compute heat, importance and composite scores for articles.

    Progress is reported under the ``"tag_score_dedup"`` task key, and the
    session is committed every 50 articles and once more at the end.

    Args:
        db: SQLAlchemy session used to load rules and articles and to commit
            the updated scores.
        article_ids: Restrict scoring to these article ids. When falsy
            (``None`` or an empty list) every ``EnrichedArticle`` is scored.
        update_duplication: When true, also recompute each article's
            duplication score from the size of its duplicate group.

    Returns:
        The number of articles processed.
    """
    heat_rules = db.query(Taxonomy).filter(Taxonomy.kind == "heat_rule").all()
    importance_rules = db.query(Taxonomy).filter(Taxonomy.kind == "importance_rule").all()

    query = db.query(EnrichedArticle)
    if article_ids:
        query = query.filter(EnrichedArticle.id.in_(article_ids))
    articles = query.all()
    total = len(articles)

    update_progress("tag_score_dedup", status="running", stage="计算分数", current=0, total=total)

    count = 0
    for article in articles:
        article.heat_score = compute_heat_score(article, heat_rules)
        article.importance_score = compute_importance_score(article, importance_rules)

        if update_duplication:
            dup_count = 0
            if article.duplicate_group_id:
                group = article.duplicate_group
                if group and group.member_article_ids:
                    # Only the non-representative members count as duplicates.
                    # NOTE(review): compute_duplication_score also returns 0
                    # for a count of 1, so a two-member group scores 0 here —
                    # confirm this double "minus one" is intended.
                    dup_count = max(len(group.member_article_ids) - 1, 0)
            article.duplication_score = compute_duplication_score(dup_count)

        # duplication_score can still be unset (None) when it was never
        # computed for this article; treat that as 0 so the composite
        # calculation does not fail on None.
        article.composite_score = compute_composite_score(
            article.heat_score,
            article.importance_score,
            article.duplication_score or 0.0,
        )

        count += 1
        # Commit in batches of 50 to keep each transaction small.
        if count % 50 == 0:
            db.commit()
        # Reported on every iteration; report_loop_progress is assumed to do
        # its own throttling — TODO confirm against app.task_progress.
        report_loop_progress("tag_score_dedup", count, total, "计算分数")

    db.commit()
    logger.info("打分完成: %d 篇文章", count)
    return count