# File: dataClean/app/scorer.py
# 151 lines, 4.8 KiB, Python
# 2026-06-12 16:04:03 +08:00
"""基于规则计算文章热度、重要性、重复性分数"""
import logging
import math
from datetime import datetime, timedelta, timezone
from typing import List
from sqlalchemy.orm import Session
from config import settings
from models import EnrichedArticle, Taxonomy
from app.task_progress import update_progress, report_loop_progress
# 2026-06-12 16:04:03 +08:00
from app.tagger import _count_matches, _normalize
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Composite-score weights: heat 30%, importance 50%, duplication 20%.
COMPOSITE_WEIGHT_HEAT = 0.3
COMPOSITE_WEIGHT_IMPORTANCE = 0.5
COMPOSITE_WEIGHT_DUPLICATION = 0.2
def _build_text(article: EnrichedArticle) -> str:
    """Assemble the text that scoring rules are matched against.

    The title, one summary and the content are joined with single spaces.
    The AI summary is preferred; the original summary is used only when the
    AI summary is missing or empty. A missing field contributes an empty
    string, so both separators are always present.
    """
    title = article.title or ""
    summary = article.ai_summary or article.original_summary or ""
    body = article.content or ""
    return " ".join((title, summary, body))
def _score_by_rules(article: EnrichedArticle, rules: List[Taxonomy]) -> float:
    """Score an article by keyword hits against a list of rules.

    Each rule with at least one hit adds ``min(hits, 5) * rule.weight * 10``
    points, so a larger rule weight yields a larger contribution and hits
    beyond the fifth are ignored. The accumulated total is capped at 100.

    Returns 0.0 when ``rules`` is empty or the article text is blank.
    """
    text = _build_text(article)
    if not (text.strip() and rules):
        return 0.0

    total = 0.0
    for rule in rules:
        matched = _count_matches(text, rule.keywords or [])
        if matched <= 0:
            continue
        total += min(matched, 5) * rule.weight * 10
    return min(total, 100.0)
def _freshness_score(article: EnrichedArticle) -> float:
    """Return the freshness bonus (0 to 20) derived from the publish time.

    Articles at most 24 hours old, including ones dated in the future, earn
    the full 20 points. Between 24 and 72 hours the bonus falls linearly,
    and from 72 hours on it is 0. Articles without a publish time earn 0.
    """
    now = datetime.now(timezone.utc)
    published = article.published_at
    if not published:
        return 0.0

    # A naive timestamp (no tzinfo) is interpreted as UTC.
    if published.tzinfo is None:
        published = published.replace(tzinfo=timezone.utc)

    # Clamp future publish times to an age of zero.
    age_hours = max((now - published).total_seconds() / 3600, 0)

    if age_hours >= 72:
        return 0.0
    if age_hours > 24:
        # Linear decay from 20 at 24h down to 0 at 72h.
        return 20.0 * (1 - (age_hours - 24) / 48)
    return 20.0
def compute_heat_score(article: EnrichedArticle, heat_rules: List[Taxonomy]) -> float:
    """Heat score: keyword-rule score plus freshness bonus, capped at 100."""
    total = _score_by_rules(article, heat_rules) + _freshness_score(article)
    return 100.0 if total > 100.0 else total
def compute_importance_score(article: EnrichedArticle, importance_rules: List[Taxonomy]) -> float:
    """Importance score: the keyword-rule score for the importance rules."""
    importance = _score_by_rules(article, importance_rules)
    return importance
def compute_duplication_score(duplicate_count: int, max_count: int = 5) -> float:
    """Map an occurrence count onto a 0-100 duplication score.

    The score rises linearly from 0 at one occurrence to 100 at
    ``max_count`` occurrences and stays at 100 beyond that.

    Args:
        duplicate_count: Number of times the topic appeared.
        max_count: Occurrence count at which the score saturates at 100.

    Returns:
        A score in the range [0.0, 100.0].
    """
    if duplicate_count <= 1:
        return 0.0
    # With max_count <= 1 the linear ramp has no width: the plain formula
    # would divide by zero (max_count == 1) or go negative (max_count < 1).
    # Any repeated occurrence is already at or past the saturation point.
    if max_count <= 1:
        return 100.0
    return min((duplicate_count - 1) / (max_count - 1) * 100.0, 100.0)
def compute_composite_score(heat: float, importance: float, duplication: float) -> float:
    """Blend the three sub-scores into one composite, rounded to 2 decimals.

    Each sub-score is multiplied by its module-level ``COMPOSITE_WEIGHT_*``
    constant and the weighted values are summed.
    """
    weighted_heat = heat * COMPOSITE_WEIGHT_HEAT
    weighted_importance = importance * COMPOSITE_WEIGHT_IMPORTANCE
    weighted_duplication = duplication * COMPOSITE_WEIGHT_DUPLICATION
    return round(weighted_heat + weighted_importance + weighted_duplication, 2)
def score_articles(
    db: Session,
    article_ids: List[int] = None,
    update_duplication: bool = False,
) -> int:
    """Compute heat, importance and composite scores for articles.

    Args:
        db: Database session used to load rules and articles and to commit
            the updated scores.
        article_ids: Restrict scoring to these article ids. When ``None`` or
            empty, every ``EnrichedArticle`` is scored.
        update_duplication: When true, also recompute each article's
            duplication score from its duplicate group.

    Returns:
        The number of articles processed.
    """
    heat_rules = db.query(Taxonomy).filter(Taxonomy.kind == "heat_rule").all()
    importance_rules = db.query(Taxonomy).filter(Taxonomy.kind == "importance_rule").all()

    query = db.query(EnrichedArticle)
    if article_ids:
        query = query.filter(EnrichedArticle.id.in_(article_ids))
    articles = query.all()
    total = len(articles)

    update_progress("tag_score_dedup", status="running", stage="计算分数", current=0, total=total)

    count = 0
    for count, article in enumerate(articles, start=1):
        article.heat_score = compute_heat_score(article, heat_rules)
        article.importance_score = compute_importance_score(article, importance_rules)

        if update_duplication:
            dup_count = 0
            if article.duplicate_group_id:
                group = article.duplicate_group
                if group and group.member_article_ids:
                    # Only the non-representative members count as duplicates.
                    dup_count = max(len(group.member_article_ids) - 1, 0)
            article.duplication_score = compute_duplication_score(dup_count)

        article.composite_score = compute_composite_score(
            article.heat_score,
            article.importance_score,
            # The duplication score may never have been set when
            # update_duplication is false; treat a missing value as 0 rather
            # than failing on None arithmetic.
            article.duplication_score or 0.0,
        )

        # Commit every 50 articles; the remainder is committed after the loop.
        if count % 50 == 0:
            db.commit()
        # NOTE(review): progress is reported once per article here — confirm
        # the intended call site (per article vs. per committed batch).
        report_loop_progress("tag_score_dedup", count, total, "计算分数")

    db.commit()
    logger.info("打分完成: %d 篇文章", count)
    return count