778ccefb22
后端 - 新增 app/task_progress.py 线程安全进度注册表 - 任务改为后台线程异步执行(_run_task_background),手动触发立即返回 task_key - 6 个任务函数(summarizer/tagger/scorer/deduplicator/brief/taxonomy)循环内上报进度 - scheduler 定时任务同步上报进度(trigger=scheduled) - 新增 GET /api/tasks/progress 与 POST /api/tasks/progress/reset 接口 - 新增 POST /api/test-connection 接口连通性测试(独立短超时客户端) - 修复 ai_client/rss_client 配置在 import 时固化的 bug(改为 property 运行时读取 settings), 导致实际任务用 .env 假 key 调 LLM 401 - 修复 ai_client 对 reasoning 模型(MiniMax-M3 等)输出 <think> 块的 JSON 解析失败 - 修复 taxonomy bootstrap:LLM 超时(改用 300s 专用 client)、MiniMax 输出审查 (精简样本仅标题 + 约束生成中性类目名)、失败误报 success(改抛异常如实标记) - 修复 models.py 双外键关系映射启动崩溃(显式 foreign_keys) - 修复 main.py SPA 路由 404、ArticleOut.published_at 序列化 500 - 移除 lifespan 同步 bootstrap 阻塞启动,改由 scheduler 后台异步执行 前端 - Deep Ink 高对比度暗色主题重构,修复 Element Plus 暗色模式对比度问题 - Tasks 页面任务进度实时展示(进度条/阶段/计数/状态/触发来源)+ 1.5s 轮询 - 接口测试面板(rssKeeper / LLM 连通性 + 延迟) - 修复 nextJobs jobId 映射 bug 部署与文档 - Dockerfile 优化(BuildKit 缓存挂载、预编译 wheel、去 gcc、阿里云镜像源) - 新增 API.md 接口文档 Co-Authored-By: Claude <noreply@anthropic.com>
151 lines
4.8 KiB
Python
151 lines
4.8 KiB
Python
"""基于规则计算文章热度、重要性、重复性分数"""
|
|
import logging
|
|
import math
|
|
from datetime import datetime, timedelta, timezone
|
|
from typing import List
|
|
|
|
from sqlalchemy.orm import Session
|
|
|
|
from config import settings
|
|
from models import EnrichedArticle, Taxonomy
|
|
from app.task_progress import update_progress, report_loop_progress
|
|
from app.tagger import _count_matches, _normalize
|
|
|
|
logger = logging.getLogger(__name__)


# Weights used to blend the three partial scores into the composite score:
# heat 30%, importance 50%, duplication 20%.
COMPOSITE_WEIGHT_HEAT: float = 0.3
COMPOSITE_WEIGHT_IMPORTANCE: float = 0.5
COMPOSITE_WEIGHT_DUPLICATION: float = 0.2
|
|
|
|
|
|
def _build_text(article: EnrichedArticle) -> str:
    """Assemble the text that the scoring rules are matched against.

    The title, one summary (``ai_summary`` when truthy, otherwise
    ``original_summary``) and the content are joined with single spaces.
    A missing or empty field contributes an empty string, so the two
    separators are always present in the result.
    """
    title = article.title or ""
    summary = article.ai_summary or article.original_summary or ""
    body = article.content or ""
    return " ".join((title, summary, body))
|
|
|
|
|
|
def _score_by_rules(article: EnrichedArticle, rules: List[Taxonomy]) -> float:
    """Score an article from keyword hits against a list of taxonomy rules.

    For every rule, the hit count returned by ``_count_matches`` for the
    rule's keywords is capped at 5 and multiplied by ``rule.weight * 10``.
    The contributions are summed and the total is capped at 100.

    Returns 0.0 when the article text is blank or ``rules`` is empty.
    """
    text = _build_text(article)
    if not rules or not text.strip():
        return 0.0

    total = 0.0
    for rule in rules:
        hits = _count_matches(text, rule.keywords or [])
        if hits > 0:
            # At most five hits per rule count towards the score.
            capped_hits = min(hits, 5)
            total += capped_hits * rule.weight * 10

    return min(total, 100.0)
|
|
|
|
|
|
def _freshness_score(article: EnrichedArticle) -> float:
    """Return a recency bonus in the range 0-20 based on ``published_at``.

    Articles without a publish time get 0. Articles up to 24 hours old get
    the full 20 points, the bonus then falls linearly and reaches 0 at 72
    hours. Publish times in the future are treated as "just published".
    """
    published_at = article.published_at
    if not published_at:
        return 0.0

    # A naive timestamp (no tzinfo) is interpreted as UTC so that it can be
    # subtracted from the timezone-aware "now".
    if published_at.tzinfo is None:
        published_at = published_at.replace(tzinfo=timezone.utc)

    age = datetime.now(timezone.utc) - published_at
    age_hours = max(age.total_seconds() / 3600, 0)

    if age_hours <= 24:
        return 20.0
    if age_hours >= 72:
        return 0.0
    # Linear decay across the 48 hours between the 24h and 72h marks.
    return 20.0 * (1 - (age_hours - 24) / 48)
|
|
|
|
|
|
def compute_heat_score(article: EnrichedArticle, heat_rules: List[Taxonomy]) -> float:
    """Return the heat score: rule keyword score plus freshness bonus.

    The sum of ``_score_by_rules`` and ``_freshness_score`` is capped at 100.
    """
    keyword_part = _score_by_rules(article, heat_rules)
    freshness_part = _freshness_score(article)
    combined = keyword_part + freshness_part
    return combined if combined < 100.0 else 100.0
|
|
|
|
|
|
def compute_importance_score(article: EnrichedArticle, importance_rules: List[Taxonomy]) -> float:
    """Return the importance score, which is purely the rule keyword score."""
    importance = _score_by_rules(article, importance_rules)
    return importance
|
|
|
|
|
|
def compute_duplication_score(duplicate_count: int, max_count: int = 5) -> float:
    """Map a duplicate count onto a 0-100 duplication score.

    The score rises linearly from 0 at a count of 1 to 100 at ``max_count``
    and stays at 100 for larger counts.

    Args:
        duplicate_count: How many times the topic was seen.
        max_count: Count at which the score saturates at 100.

    Returns:
        0.0 when ``duplicate_count <= 1``, otherwise a value in (0, 100].
    """
    if duplicate_count <= 1:
        return 0.0
    # With max_count <= 1 there is no linear range to interpolate over, and
    # the formula below would divide by zero (max_count == 1) or go negative
    # (max_count < 1). Any count above 1 already meets the threshold.
    if max_count <= 1:
        return 100.0
    score = (duplicate_count - 1) / (max_count - 1) * 100.0
    return min(score, 100.0)
|
|
|
|
|
|
def compute_composite_score(heat: float, importance: float, duplication: float) -> float:
    """Blend the three partial scores into one composite score.

    Each input is multiplied by its ``COMPOSITE_WEIGHT_*`` constant, the
    products are added, and the result is rounded to two decimal places.
    """
    heat_part = heat * COMPOSITE_WEIGHT_HEAT
    importance_part = importance * COMPOSITE_WEIGHT_IMPORTANCE
    duplication_part = duplication * COMPOSITE_WEIGHT_DUPLICATION
    return round(heat_part + importance_part + duplication_part, 2)
|
|
|
|
|
|
def score_articles(
    db: Session,
    article_ids: List[int] = None,
    update_duplication: bool = False,
) -> int:
    """Compute heat, importance and composite scores for stored articles.

    Args:
        db: Session used to load the rules and articles and to commit the
            updated scores.
        article_ids: IDs of the articles to score. ``None`` or an empty list
            means every ``EnrichedArticle`` row is scored.
        update_duplication: When true, ``duplication_score`` is recomputed
            from each article's duplicate group before the composite score
            is calculated.

    Returns:
        The number of articles processed.
    """
    heat_rules = db.query(Taxonomy).filter(Taxonomy.kind == "heat_rule").all()
    importance_rules = db.query(Taxonomy).filter(Taxonomy.kind == "importance_rule").all()

    query = db.query(EnrichedArticle)
    # Truthiness check: an empty list behaves like None (no ID filter).
    if article_ids:
        query = query.filter(EnrichedArticle.id.in_(article_ids))

    articles = query.all()
    total = len(articles)
    update_progress("tag_score_dedup", status="running", stage="计算分数", current=0, total=total)

    count = 0
    for article in articles:
        article.heat_score = compute_heat_score(article, heat_rules)
        article.importance_score = compute_importance_score(article, importance_rules)

        if update_duplication:
            dup_count = 0
            if article.duplicate_group_id:
                group = article.duplicate_group
                if group and group.member_article_ids:
                    # Only the non-representative members count as duplicates.
                    # NOTE(review): compute_duplication_score() returns 0 for
                    # counts <= 1, so a group of two members scores 0 here.
                    # Confirm that is intended.
                    dup_count = max(len(group.member_article_ids) - 1, 0)
            article.duplication_score = compute_duplication_score(dup_count)

        article.composite_score = compute_composite_score(
            article.heat_score,
            article.importance_score,
            # The duplication score may be None if it was never populated
            # (e.g. update_duplication is False); count it as 0 instead of
            # failing on None * float.
            article.duplication_score or 0.0,
        )
        count += 1
        # Commit in batches of 50 to keep the pending transaction small.
        if count % 50 == 0:
            db.commit()
        # Called on every iteration so the last report carries count == total.
        report_loop_progress("tag_score_dedup", count, total, "计算分数")

    # Flush whatever is left from the last, possibly partial, batch.
    db.commit()
    logger.info("打分完成: %d 篇文章", count)
    return count
|