"""基于规则计算文章热度、重要性、重复性分数""" import logging import math from datetime import datetime, timedelta, timezone from typing import List from sqlalchemy.orm import Session from config import settings from models import EnrichedArticle, Taxonomy from app.task_progress import update_progress, report_loop_progress from app.tagger import _count_matches, _normalize logger = logging.getLogger(__name__) # 综合分权重:热度 30%,重要性 50%,重复性 20% COMPOSITE_WEIGHT_HEAT = 0.3 COMPOSITE_WEIGHT_IMPORTANCE = 0.5 COMPOSITE_WEIGHT_DUPLICATION = 0.2 def _build_text(article: EnrichedArticle) -> str: """构建用于打分的文本""" return " ".join([ article.title or "", article.ai_summary or article.original_summary or "", article.content or "", ]) def _score_by_rules(article: EnrichedArticle, rules: List[Taxonomy]) -> float: """基于规则关键词匹配计算分数,规则权重越大得分越高""" text = _build_text(article) if not text.strip() or not rules: return 0.0 score = 0.0 for rule in rules: keywords = rule.keywords or [] hits = _count_matches(text, keywords) if hits > 0: score += min(hits, 5) * rule.weight * 10 return min(score, 100.0) def _freshness_score(article: EnrichedArticle) -> float: """根据发布时间计算新鲜度加成""" now = datetime.now(timezone.utc) published = article.published_at if not published: return 0.0 # 数据库中读出的 published_at 可能为 naive,默认按 UTC 处理 if published.tzinfo is None: published = published.replace(tzinfo=timezone.utc) hours_old = (now - published).total_seconds() / 3600 if hours_old < 0: hours_old = 0 # 24 小时内满分 20 分,超过 72 小时降至 0 if hours_old <= 24: return 20.0 elif hours_old >= 72: return 0.0 else: return 20.0 * (1 - (hours_old - 24) / 48) def compute_heat_score(article: EnrichedArticle, heat_rules: List[Taxonomy]) -> float: """热度分:关键词命中 + 新鲜度""" base = _score_by_rules(article, heat_rules) fresh = _freshness_score(article) return min(base + fresh, 100.0) def compute_importance_score(article: EnrichedArticle, importance_rules: List[Taxonomy]) -> float: """重要性分:关键词命中""" return _score_by_rules(article, importance_rules) def compute_duplication_score(duplicate_count: int, max_count: int = 5) -> float: """ 重复性分:同一主题在多个源出现次数越多,重复性分越高。 出现 1 次为 0 分,>= max_count 为 100 分。 """ if duplicate_count <= 1: return 0.0 score = (duplicate_count - 1) / (max_count - 1) * 100.0 return min(score, 100.0) def compute_composite_score(heat: float, importance: float, duplication: float) -> float: """计算综合分""" return round( heat * COMPOSITE_WEIGHT_HEAT + importance * COMPOSITE_WEIGHT_IMPORTANCE + duplication * COMPOSITE_WEIGHT_DUPLICATION, 2, ) def score_articles( db: Session, article_ids: List[int] = None, update_duplication: bool = False, ) -> int: """ 对文章计算热度/重要性/综合分。 若 update_duplication=True,则同时根据重复组更新重复性分数。 返回处理数量。 """ heat_rules = db.query(Taxonomy).filter(Taxonomy.kind == "heat_rule").all() importance_rules = db.query(Taxonomy).filter(Taxonomy.kind == "importance_rule").all() query = db.query(EnrichedArticle) if article_ids: query = query.filter(EnrichedArticle.id.in_(article_ids)) articles = query.all() update_progress("tag_score_dedup", status="running", stage="计算分数", current=0, total=len(articles)) count = 0 for article in articles: article.heat_score = compute_heat_score(article, heat_rules) article.importance_score = compute_importance_score(article, importance_rules) if update_duplication: dup_count = 0 if article.duplicate_group_id: group = article.duplicate_group if group and group.member_article_ids: # 非代表成员数量才是真正的重复次数 dup_count = max(len(group.member_article_ids) - 1, 0) article.duplication_score = compute_duplication_score(dup_count) article.composite_score = compute_composite_score( article.heat_score, article.importance_score, article.duplication_score, ) count += 1 if count % 50 == 0: db.commit() report_loop_progress("tag_score_dedup", count, len(articles), "计算分数") db.commit() logger.info("打分完成: %d 篇文章", count) return count