feat: 修复代码审核报告问题
This commit is contained in:
@@ -0,0 +1,223 @@
|
||||
"""文章去重:URL 精确去重 + 标题/内容相似度去重"""
|
||||
import logging
|
||||
import re
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from difflib import SequenceMatcher
|
||||
from typing import List, Dict, Tuple, Set
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
import numpy as np
|
||||
|
||||
from config import settings
|
||||
from models import EnrichedArticle, DuplicateGroup
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _normalize_title(title: str) -> str:
|
||||
"""标题规范化:去除标点和多余空格,小写,保留中英文数字"""
|
||||
if not title:
|
||||
return ""
|
||||
# 保留:单词字符、CJK 统一表意符号(含扩展 A/B/C/D/E)
|
||||
title = re.sub(
|
||||
r"[^\w一-鿿㐀-䶿\U00020000-\U0002a6df\U0002a700-\U0002b73f\U0002b740-\U0002b81f]",
|
||||
" ",
|
||||
title,
|
||||
)
|
||||
title = " ".join(title.split())
|
||||
return title.lower()
|
||||
|
||||
|
||||
def _title_similarity(a: str, b: str) -> float:
    """Return the similarity ratio of two titles after normalisation.

    Both titles are passed through ``_normalize_title`` and then compared
    with ``difflib.SequenceMatcher``.

    Args:
        a: First title.
        b: Second title.

    Returns:
        ``SequenceMatcher.ratio()`` of the normalised titles (a float in
        ``[0, 1]``), or ``0.0`` when either title normalises to an empty
        string.
    """
    left, right = _normalize_title(a), _normalize_title(b)
    if left and right:
        return SequenceMatcher(None, left, right).ratio()
    # An empty title carries no information, so never treat it as a match.
    return 0.0
|
||||
|
||||
|
||||
def _content_similarity_matrix(contents: List[str]) -> np.ndarray:
|
||||
"""使用 TF-IDF + 余弦相似度计算内容相似度矩阵"""
|
||||
if len(contents) < 2:
|
||||
return np.zeros((len(contents), len(contents)))
|
||||
|
||||
# 过滤空内容
|
||||
valid_contents = [c or "" for c in contents]
|
||||
try:
|
||||
vectorizer = TfidfVectorizer(
|
||||
max_features=5000,
|
||||
stop_words="english",
|
||||
ngram_range=(1, 2),
|
||||
min_df=1,
|
||||
)
|
||||
tfidf = vectorizer.fit_transform(valid_contents)
|
||||
return cosine_similarity(tfidf)
|
||||
except Exception as exc:
|
||||
logger.warning("TF-IDF 相似度计算失败: %s", exc)
|
||||
return np.zeros((len(contents), len(contents)))
|
||||
|
||||
|
||||
def _find_duplicate_clusters(
|
||||
articles: List[EnrichedArticle],
|
||||
title_threshold: float = None,
|
||||
content_threshold: float = None,
|
||||
) -> List[Set[int]]:
|
||||
"""
|
||||
基于标题相似度和内容相似度找出重复簇。
|
||||
返回索引簇列表,每个簇是一组 articles 的索引集合。
|
||||
"""
|
||||
title_threshold = title_threshold or settings.TITLE_SIMILARITY_THRESHOLD
|
||||
content_threshold = content_threshold or settings.CONTENT_SIMILARITY_THRESHOLD
|
||||
|
||||
n = len(articles)
|
||||
if n < 2:
|
||||
return []
|
||||
|
||||
contents = []
|
||||
for art in articles:
|
||||
text = " ".join([
|
||||
art.title or "",
|
||||
art.ai_summary or art.original_summary or "",
|
||||
art.content or "",
|
||||
])
|
||||
contents.append(text[:2000]) # 限制长度加速计算
|
||||
|
||||
content_sim = _content_similarity_matrix(contents)
|
||||
|
||||
visited = [False] * n
|
||||
clusters: List[Set[int]] = []
|
||||
|
||||
for i in range(n):
|
||||
if visited[i]:
|
||||
continue
|
||||
cluster = {i}
|
||||
queue = [i]
|
||||
visited[i] = True
|
||||
|
||||
while queue:
|
||||
cur = queue.pop(0)
|
||||
for j in range(n):
|
||||
if visited[j] or cur == j:
|
||||
continue
|
||||
|
||||
title_sim = _title_similarity(articles[cur].title or "", articles[j].title or "")
|
||||
c_sim = content_sim[cur][j] if cur < n and j < n else 0.0
|
||||
|
||||
# 标题高度相似 或 内容高度相似均视为重复
|
||||
if title_sim >= title_threshold or c_sim >= content_threshold:
|
||||
cluster.add(j)
|
||||
queue.append(j)
|
||||
visited[j] = True
|
||||
|
||||
if len(cluster) > 1:
|
||||
clusters.append(cluster)
|
||||
|
||||
return clusters
|
||||
|
||||
|
||||
def _pick_representative(articles: List[EnrichedArticle], indices: Set[int]) -> EnrichedArticle:
|
||||
"""从重复组中选择代表文章:优先选有 AI 摘要、来源 Feed 分类明确、发布时间最早的"""
|
||||
candidates = [articles[i] for i in indices]
|
||||
# 排序:有 AI 摘要优先,然后有 Feed 分类,然后发布时间早
|
||||
candidates.sort(
|
||||
key=lambda a: (
|
||||
bool(a.ai_summary),
|
||||
bool(a.feed_category),
|
||||
a.published_at or datetime.min,
|
||||
),
|
||||
reverse=True,
|
||||
)
|
||||
return candidates[0]
|
||||
|
||||
|
||||
def _clear_duplicate_groups(db: Session, date_str: str) -> None:
    """Delete the duplicate groups stored for ``date_str`` and detach their articles."""
    stale_groups = (
        db.query(DuplicateGroup)
        .filter(DuplicateGroup.brief_date == date_str)
        .all()
    )
    for stale in stale_groups:
        for member in stale.articles:
            member.duplicate_group_id = None
            member.is_representative = False
        db.delete(stale)


def _drop_url_duplicates(
    articles: List[EnrichedArticle],
) -> Tuple[List[EnrichedArticle], int]:
    """Keep the first article per non-empty stripped link; return (kept, dropped count)."""
    kept: List[EnrichedArticle] = []
    seen_links: Set[str] = set()
    dropped = 0
    for art in articles:
        link = (art.link or "").strip()
        if link:
            if link in seen_links:
                dropped += 1
                continue
            seen_links.add(link)
        # Articles without a link can never be URL duplicates; keep them all.
        kept.append(art)
    return kept, dropped


def deduplicate_articles(
    db: Session,
    date_str: str = None,
    title_threshold: float = None,
    content_threshold: float = None,
) -> Dict[str, int]:
    """De-duplicate the articles fetched on one day.

    Steps: remove the duplicate groups previously stored for the day, load
    the articles whose ``fetched_at`` falls inside the day, drop exact URL
    duplicates, cluster the rest by title/content similarity and persist one
    ``DuplicateGroup`` per cluster with a representative article flagged.

    Args:
        db: SQLAlchemy session used for all reads and writes. It is committed
            on success and rolled back if any step raises.
        date_str: Day to process as ``YYYY-MM-DD``; ``None`` means today's
            UTC date.
        title_threshold: Forwarded to ``_find_duplicate_clusters``.
        content_threshold: Forwarded to ``_find_duplicate_clusters``.

    Returns:
        ``{"total": articles loaded for the day (URL duplicates included),
        "duplicate_groups": clusters found, "representatives": groups
        written}``.

    Raises:
        ValueError: If ``date_str`` is not in ``YYYY-MM-DD`` format.
    """
    if date_str is None:
        date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    # Parsed before touching the database so a bad date changes nothing.
    day_start = datetime.strptime(date_str, "%Y-%m-%d")
    day_end = day_start + timedelta(days=1)

    try:
        # Only this date's groups are cleared, so historical data is untouched.
        _clear_duplicate_groups(db, date_str)
        db.commit()

        articles = (
            db.query(EnrichedArticle)
            .filter(
                EnrichedArticle.fetched_at >= day_start,
                EnrichedArticle.fetched_at < day_end,
            )
            .order_by(EnrichedArticle.published_at)
            .all()
        )

        if not articles:
            logger.info("日期 %s 无文章可去重", date_str)
            return {"total": 0, "duplicate_groups": 0, "representatives": 0}

        # Exact URL de-duplication first: one article per identical link.
        # NOTE(review): dropped URL duplicates are not attached to any group
        # or otherwise flagged here - confirm downstream code expects that.
        unique_articles, url_dup_count = _drop_url_duplicates(articles)

        clusters = _find_duplicate_clusters(
            unique_articles,
            title_threshold=title_threshold,
            content_threshold=content_threshold,
        )

        stats = {
            "total": len(articles),
            "duplicate_groups": len(clusters),
            "representatives": 0,
        }

        for cluster in clusters:
            representative = _pick_representative(unique_articles, cluster)
            # Sorted so the stored member order does not depend on set ordering.
            members = [unique_articles[i] for i in sorted(cluster)]

            group = DuplicateGroup(
                representative_article_id=representative.id,
                member_article_ids=[art.id for art in members],
                similarity_matrix={},  # may be filled in later
                brief_date=date_str,
            )
            db.add(group)
            # Flush so group.id is assigned before it is copied to members.
            db.flush()

            for art in members:
                art.duplicate_group_id = group.id
                art.is_representative = (art.id == representative.id)

            stats["representatives"] += 1

        db.commit()
    except Exception:
        # A failed flush/commit leaves the session unusable until rolled back.
        db.rollback()
        raise

    logger.info(
        "去重完成: 日期=%s, 总文章=%d, 重复组=%d, URL 重复=%d",
        date_str, stats["total"], stats["duplicate_groups"], url_dup_count
    )
    return stats
|
||||
Reference in New Issue
Block a user