148 lines
4.6 KiB
Python
148 lines
4.6 KiB
Python
"""基于规则计算文章热度、重要性、重复性分数"""
|
|
import logging
|
|
import math
|
|
from datetime import datetime, timedelta, timezone
|
|
from typing import List
|
|
|
|
from sqlalchemy.orm import Session
|
|
|
|
from config import settings
|
|
from models import EnrichedArticle, Taxonomy
|
|
from app.tagger import _count_matches, _normalize
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Weights used when folding the three sub-scores into the composite score.
# They add up to 1.0, so the composite stays on the same scale as its inputs.
COMPOSITE_WEIGHT_HEAT = 0.3  # heat: 30%
COMPOSITE_WEIGHT_IMPORTANCE = 0.5  # importance: 50%
COMPOSITE_WEIGHT_DUPLICATION = 0.2  # duplication: 20%
|
|
|
|
|
|
def _build_text(article: EnrichedArticle) -> str:
    """Assemble the text that scoring rules are matched against.

    The result is the title, one summary and the content joined by single
    spaces. The AI summary is used when it is truthy, otherwise the original
    summary. Any field that is missing or falsy contributes an empty string,
    so the two separating spaces are always present.
    """
    title = article.title or ""
    summary = article.ai_summary or article.original_summary or ""
    body = article.content or ""
    return " ".join((title, summary, body))
|
|
|
|
|
|
def _score_by_rules(article: EnrichedArticle, rules: List[Taxonomy]) -> float:
    """Score an article by keyword hits against a list of rules.

    Every rule with at least one hit adds ``min(hits, 5) * rule.weight * 10``
    to the running total, so a larger rule weight yields a larger
    contribution and a single rule's hit count is capped at 5. The total is
    capped at 100.0.

    Hits are counted by ``_count_matches`` from ``app.tagger``; how it
    matches keywords is defined there, not here.

    Returns:
        0.0 when the article text is blank or ``rules`` is empty, otherwise
        the capped total.
    """
    text = _build_text(article)
    if not (text.strip() and rules):
        return 0.0

    total = 0.0
    for rule in rules:
        # A rule without keywords is handed an empty list.
        hits = _count_matches(text, rule.keywords or [])
        if hits <= 0:
            continue
        total += min(hits, 5) * rule.weight * 10

    return min(total, 100.0)
|
|
|
|
|
|
def _freshness_score(article: EnrichedArticle) -> float:
    """Return a freshness bonus between 0.0 and 20.0 based on article age.

    * no ``published_at``: 0.0
    * at most 24 hours old (or dated in the future): 20.0
    * 72 hours old or more: 0.0
    * in between: linear decay from 20.0 at 24 h down to 0.0 at 72 h
    """
    now = datetime.now(timezone.utc)
    published = article.published_at
    if not published:
        return 0.0

    # Values read back from the database may be naive; treat them as UTC.
    if published.tzinfo is None:
        published = published.replace(tzinfo=timezone.utc)

    # Clamp future timestamps to an age of zero.
    age_hours = max((now - published).total_seconds() / 3600, 0)

    if age_hours <= 24:
        return 20.0
    if age_hours >= 72:
        return 0.0
    # 48 hours is the width of the decay window (72 - 24).
    return 20.0 * (1 - (age_hours - 24) / 48)
|
|
|
|
|
|
def compute_heat_score(article: EnrichedArticle, heat_rules: List[Taxonomy]) -> float:
    """Return the heat score: keyword-rule score plus freshness bonus.

    The keyword part comes from ``_score_by_rules`` and the bonus from
    ``_freshness_score``; their sum is capped at 100.0.
    """
    return min(
        _score_by_rules(article, heat_rules) + _freshness_score(article),
        100.0,
    )
|
|
|
|
|
|
def compute_importance_score(article: EnrichedArticle, importance_rules: List[Taxonomy]) -> float:
    """Return the importance score, which is the keyword-rule score alone.

    Unlike the heat score, no freshness bonus is added.
    """
    importance = _score_by_rules(article, importance_rules)
    return importance
|
|
|
|
|
|
def compute_duplication_score(duplicate_count: int, max_count: int = 5) -> float:
    """Map an occurrence count onto a 0-100 duplication score.

    The more often the same topic shows up, the higher the score.

    Args:
        duplicate_count: Number of occurrences of the topic.
        max_count: Occurrence count at which the score saturates.

    Returns:
        0.0 when ``duplicate_count`` is 1 or less, 100.0 when it is at least
        ``max_count``, and a linear interpolation between the two otherwise.
    """
    if duplicate_count <= 1:
        return 0.0
    # Check saturation before interpolating. Besides capping the score, this
    # keeps max_count <= 1 away from the formula below, where it would divide
    # by zero (max_count == 1) or produce a negative score (max_count < 1).
    if duplicate_count >= max_count:
        return 100.0
    return (duplicate_count - 1) / (max_count - 1) * 100.0
|
|
|
|
|
|
def compute_composite_score(heat: float, importance: float, duplication: float) -> float:
    """Combine the three sub-scores into one weighted composite score.

    Each input is multiplied by its module-level ``COMPOSITE_WEIGHT_*``
    constant, the products are added, and the total is rounded to two
    decimal places.
    """
    weighted_heat = heat * COMPOSITE_WEIGHT_HEAT
    weighted_importance = importance * COMPOSITE_WEIGHT_IMPORTANCE
    weighted_duplication = duplication * COMPOSITE_WEIGHT_DUPLICATION
    return round(weighted_heat + weighted_importance + weighted_duplication, 2)
|
|
|
|
|
|
def score_articles(
    db: Session,
    article_ids: List[int] = None,
    update_duplication: bool = False,
) -> int:
    """Compute and persist heat, importance and composite scores.

    Args:
        db: SQLAlchemy session used to load the rules and articles and to
            commit the updated scores.
        article_ids: IDs of the articles to score. ``None`` or an empty list
            applies no filter, so every ``EnrichedArticle`` is scored.
        update_duplication: When true, ``duplication_score`` is recomputed
            from each article's duplicate group before the composite score
            is calculated. When false, the stored value is reused.

    Returns:
        The number of articles processed.
    """
    heat_rules = db.query(Taxonomy).filter(Taxonomy.kind == "heat_rule").all()
    importance_rules = db.query(Taxonomy).filter(Taxonomy.kind == "importance_rule").all()

    query = db.query(EnrichedArticle)
    if article_ids:
        query = query.filter(EnrichedArticle.id.in_(article_ids))

    count = 0
    for article in query.all():
        article.heat_score = compute_heat_score(article, heat_rules)
        article.importance_score = compute_importance_score(article, importance_rules)

        if update_duplication:
            dup_count = 0
            group = article.duplicate_group if article.duplicate_group_id else None
            if group and group.member_article_ids:
                # Only the non-representative members count as repeats.
                # NOTE(review): compute_duplication_score gives a count of 1
                # a score of 0, so a two-member group scores 0 here - confirm
                # that this is intended.
                dup_count = max(len(group.member_article_ids) - 1, 0)
            article.duplication_score = compute_duplication_score(dup_count)

        # The stored duplication score may be None for an article that has
        # never had one computed (column defaults are not visible here -
        # TODO confirm). Treat that as 0.0 so the weighted sum does not raise
        # TypeError; the stored value itself is left untouched.
        article.composite_score = compute_composite_score(
            article.heat_score,
            article.importance_score,
            article.duplication_score or 0.0,
        )

        count += 1
        # Commit in batches of 50 so progress is persisted incrementally.
        if count % 50 == 0:
            db.commit()

    # Persist the final partial batch.
    db.commit()
    logger.info("打分完成: %d 篇文章", count)
    return count
|