2026-06-12 16:04:03 +08:00
|
|
|
"""文章去重:URL 精确去重 + 标题/内容相似度去重"""
|
|
|
|
|
import logging
|
|
|
|
|
import re
|
|
|
|
|
from datetime import datetime, timedelta, timezone
|
|
|
|
|
from difflib import SequenceMatcher
|
|
|
|
|
from typing import List, Dict, Tuple, Set
|
|
|
|
|
|
|
|
|
|
from sqlalchemy.orm import Session
|
|
|
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
|
import numpy as np
|
|
|
|
|
|
|
|
|
|
from config import settings
|
|
|
|
|
from models import EnrichedArticle, DuplicateGroup
|
2026-06-14 15:14:40 +08:00
|
|
|
from app.task_progress import update_progress, report_loop_progress
|
2026-06-12 16:04:03 +08:00
|
|
|
|
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _normalize_title(title: str) -> str:
|
|
|
|
|
"""标题规范化:去除标点和多余空格,小写,保留中英文数字"""
|
|
|
|
|
if not title:
|
|
|
|
|
return ""
|
|
|
|
|
# 保留:单词字符、CJK 统一表意符号(含扩展 A/B/C/D/E)
|
|
|
|
|
title = re.sub(
|
|
|
|
|
r"[^\w一-鿿㐀-䶿\U00020000-\U0002a6df\U0002a700-\U0002b73f\U0002b740-\U0002b81f]",
|
|
|
|
|
" ",
|
|
|
|
|
title,
|
|
|
|
|
)
|
|
|
|
|
title = " ".join(title.split())
|
|
|
|
|
return title.lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _title_similarity(a: str, b: str) -> float:
    """Return the similarity ratio of two titles in the range [0.0, 1.0].

    Both titles are normalised with ``_normalize_title`` first. If either
    normalised title is empty the result is 0.0; otherwise it is
    ``difflib.SequenceMatcher(...).ratio()`` of the two normalised strings.
    """
    left, right = _normalize_title(a), _normalize_title(b)
    if left and right:
        matcher = SequenceMatcher(None, left, right)
        return matcher.ratio()
    # An empty title carries no signal, so it never counts as a match.
    return 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _content_similarity_matrix(contents: List[str]) -> np.ndarray:
|
|
|
|
|
"""使用 TF-IDF + 余弦相似度计算内容相似度矩阵"""
|
|
|
|
|
if len(contents) < 2:
|
|
|
|
|
return np.zeros((len(contents), len(contents)))
|
|
|
|
|
|
|
|
|
|
# 过滤空内容
|
|
|
|
|
valid_contents = [c or "" for c in contents]
|
|
|
|
|
try:
|
|
|
|
|
vectorizer = TfidfVectorizer(
|
|
|
|
|
max_features=5000,
|
|
|
|
|
stop_words="english",
|
|
|
|
|
ngram_range=(1, 2),
|
|
|
|
|
min_df=1,
|
|
|
|
|
)
|
|
|
|
|
tfidf = vectorizer.fit_transform(valid_contents)
|
|
|
|
|
return cosine_similarity(tfidf)
|
|
|
|
|
except Exception as exc:
|
|
|
|
|
logger.warning("TF-IDF 相似度计算失败: %s", exc)
|
|
|
|
|
return np.zeros((len(contents), len(contents)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _find_duplicate_clusters(
    articles: List[EnrichedArticle],
    title_threshold: float = None,
    content_threshold: float = None,
) -> List[Set[int]]:
    """Group articles into duplicate clusters by title or content similarity.

    Two articles are linked when their title similarity reaches
    ``title_threshold`` OR their content similarity reaches
    ``content_threshold``. Clusters are the connected components of that
    relation (found by breadth-first search), so membership is transitive:
    if A matches B and B matches C, all three share a cluster even when A and
    C do not match directly.

    Args:
        articles: Articles to compare; ``title``, ``ai_summary``,
            ``original_summary`` and ``content`` are read from each.
        title_threshold: Minimum title ratio; ``None`` falls back to
            ``settings.TITLE_SIMILARITY_THRESHOLD``.
        content_threshold: Minimum cosine similarity; ``None`` falls back to
            ``settings.CONTENT_SIMILARITY_THRESHOLD``.

    Returns:
        A list of index sets into ``articles``. Only clusters with more than
        one member are returned; fewer than two articles yields ``[]``.
    """
    # Local import keeps this block self-contained (deque gives O(1) pops).
    from collections import deque

    # Use explicit None checks so that a caller-supplied 0.0 is honoured
    # instead of being silently replaced by the configured default.
    if title_threshold is None:
        title_threshold = settings.TITLE_SIMILARITY_THRESHOLD
    if content_threshold is None:
        content_threshold = settings.CONTENT_SIMILARITY_THRESHOLD

    n = len(articles)
    if n < 2:
        return []

    # Title + summary + body, truncated to 2000 characters to bound the cost
    # of vectorisation.
    contents = [
        " ".join([
            art.title or "",
            art.ai_summary or art.original_summary or "",
            art.content or "",
        ])[:2000]
        for art in articles
    ]
    content_sim = _content_similarity_matrix(contents)
    titles = [art.title or "" for art in articles]

    visited = [False] * n
    clusters: List[Set[int]] = []

    for i in range(n):
        if visited[i]:
            continue
        cluster = {i}
        queue = deque([i])
        visited[i] = True

        while queue:
            cur = queue.popleft()
            for j in range(n):
                # visited[cur] is always True here, so this also skips j == cur.
                if visited[j]:
                    continue
                # Check the precomputed matrix first; the comparatively
                # expensive title comparison only runs when content alone
                # does not already establish a match.
                if (
                    content_sim[cur][j] >= content_threshold
                    or _title_similarity(titles[cur], titles[j]) >= title_threshold
                ):
                    cluster.add(j)
                    queue.append(j)
                    visited[j] = True

        if len(cluster) > 1:
            clusters.append(cluster)

    return clusters
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _pick_representative(articles: List[EnrichedArticle], indices: Set[int]) -> EnrichedArticle:
    """Choose the representative article of one duplicate cluster.

    Preference order:
      1. articles that have an AI summary,
      2. articles whose feed category is set,
      3. the earliest ``published_at`` (articles without a publish time rank
         after all dated ones).
    Remaining ties go to the lowest index, so the choice is deterministic.

    Args:
        articles: The list that ``indices`` refers to.
        indices: Positions in ``articles`` forming one cluster.

    Returns:
        The preferred article among ``articles[i] for i in indices``.

    Raises:
        IndexError: If ``indices`` is empty.
    """

    def _preference(article):
        published = article.published_at
        # Ascending sort: False < True, so "has summary" / "has category" /
        # "has a date" sort first. The boolean before the timestamp means a
        # missing date is never compared against a real one.
        return (
            not article.ai_summary,
            not article.feed_category,
            published is None,
            published if published is not None else datetime.min,
        )

    # sorted(indices) pins the tie-break order; list.sort is stable.
    candidates = [articles[i] for i in sorted(indices)]
    candidates.sort(key=_preference)
    return candidates[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def deduplicate_articles(
    db: Session,
    date_str: str = None,
    title_threshold: float = None,
    content_threshold: float = None,
) -> Dict[str, int]:
    """Rebuild the duplicate groups for the articles fetched on one day.

    Steps:
      1. Delete the ``DuplicateGroup`` rows whose ``brief_date`` equals
         ``date_str``, clearing the group link and representative flag on
         their articles, and commit.
      2. Load articles whose ``fetched_at`` lies in
         ``[date_str 00:00, next day 00:00)``, ordered by ``published_at``.
      3. Drop later articles that repeat an already-seen non-empty ``link``.
         These are only counted for the log line; this function does not
         assign them to any group.
      4. Cluster the remaining articles, store one ``DuplicateGroup`` per
         cluster and flag its representative, then commit.

    Args:
        db: SQLAlchemy session used for all reads and writes.
        date_str: Day in ``YYYY-MM-DD`` form; ``None`` means today's UTC date.
        title_threshold: Passed through to ``_find_duplicate_clusters``.
        content_threshold: Passed through to ``_find_duplicate_clusters``.

    Returns:
        ``{"total": ..., "duplicate_groups": ..., "representatives": ...}``,
        where ``total`` counts all loaded articles and ``representatives`` is
        incremented once per stored cluster.

    Raises:
        ValueError: If ``date_str`` does not match ``%Y-%m-%d`` (raised by
            ``datetime.strptime``).
    """
    if date_str is None:
        date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    # Naive datetimes bounding the requested day.
    # NOTE(review): presumably fetched_at is stored as naive UTC - confirm.
    window_start = datetime.strptime(date_str, "%Y-%m-%d")
    window_end = window_start + timedelta(days=1)

    # Remove only this day's existing groups so other days stay untouched.
    stale_groups = (
        db.query(DuplicateGroup)
        .filter(DuplicateGroup.brief_date == date_str)
        .all()
    )
    for stale in stale_groups:
        for member in stale.articles:
            member.duplicate_group_id = None
            member.is_representative = False
        db.delete(stale)
    db.commit()

    # Load the day's articles.
    day_query = db.query(EnrichedArticle).filter(
        EnrichedArticle.fetched_at >= window_start,
        EnrichedArticle.fetched_at < window_end,
    )
    articles = day_query.order_by(EnrichedArticle.published_at).all()

    if not articles:
        logger.info("日期 %s 无文章可去重", date_str)
        update_progress("tag_score_dedup", status="running", stage="去重", current=0, total=0, message="无文章可去重")
        return {"total": 0, "duplicate_groups": 0, "representatives": 0}

    update_progress("tag_score_dedup", status="running", stage="计算相似度并去重", current=0, total=0)

    # Exact URL de-duplication: the first article per non-empty link is kept.
    # Articles with an empty link are always kept.
    unique_articles: List[EnrichedArticle] = []
    seen_links: set = set()
    url_dup_count = 0
    for article in articles:
        link = (article.link or "").strip()
        if not link:
            unique_articles.append(article)
            continue
        if link in seen_links:
            url_dup_count += 1
            continue
        seen_links.add(link)
        unique_articles.append(article)

    clusters = _find_duplicate_clusters(
        unique_articles,
        title_threshold=title_threshold,
        content_threshold=content_threshold,
    )
    cluster_total = len(clusters)

    stats = {"total": len(articles), "duplicate_groups": cluster_total, "representatives": 0}
    update_progress("tag_score_dedup", status="running", stage="写入重复组", current=0, total=cluster_total)

    for position, cluster in enumerate(clusters, start=1):
        representative = _pick_representative(unique_articles, cluster)
        members = [unique_articles[i] for i in cluster]

        group = DuplicateGroup(
            representative_article_id=representative.id,
            member_article_ids=[member.id for member in members],
            similarity_matrix={},  # left empty here; may be filled in later
            brief_date=date_str,
        )
        db.add(group)
        # Flush before group.id is read when linking the members below.
        db.flush()

        for member in members:
            member.duplicate_group_id = group.id
            member.is_representative = (member.id == representative.id)

        stats["representatives"] += 1
        report_loop_progress("tag_score_dedup", position, cluster_total, "写入重复组")

    db.commit()
    logger.info(
        "去重完成: 日期=%s, 总文章=%d, 重复组=%d, URL 重复=%d",
        date_str, stats["total"], stats["duplicate_groups"], url_dup_count
    )
    return stats
|