"""Article de-duplication: exact URL matching plus title/content similarity."""

import logging
import re
from collections import deque
from datetime import datetime, timedelta, timezone
from difflib import SequenceMatcher
from typing import List, Dict, Optional, Tuple, Set

from sqlalchemy.orm import Session
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

from config import settings
from models import EnrichedArticle, DuplicateGroup
from app.task_progress import update_progress, report_loop_progress

logger = logging.getLogger(__name__)

# Everything that is neither a word character nor a CJK unified ideograph
# (including extensions A/B/C/D/E) is treated as a separator.
_NON_TITLE_CHARS = re.compile(
    r"[^\w一-鿿㐀-䶿\U00020000-\U0002a6df\U0002a700-\U0002b73f\U0002b740-\U0002b81f]"
)


def _normalize_title(title: str) -> str:
    """Normalize a title for comparison.

    Punctuation is replaced by spaces, runs of whitespace are collapsed to a
    single space, and the result is lower-cased. Word characters (letters,
    digits, underscore) and CJK ideographs are kept.

    Returns an empty string for an empty or ``None`` title.
    """
    if not title:
        return ""
    collapsed = " ".join(_NON_TITLE_CHARS.sub(" ", title).split())
    return collapsed.lower()


def _title_similarity(a: str, b: str) -> float:
    """Return the ``SequenceMatcher`` ratio of two normalized titles.

    If either title normalizes to an empty string the similarity is 0.0, so
    two untitled articles are never considered duplicates by title alone.
    """
    na = _normalize_title(a)
    nb = _normalize_title(b)
    if not na or not nb:
        return 0.0
    return SequenceMatcher(None, na, nb).ratio()


def _content_similarity_matrix(contents: List[str]) -> np.ndarray:
    """Return the pairwise TF-IDF cosine-similarity matrix of ``contents``.

    The result is an ``n x n`` array where ``n == len(contents)``. An all-zero
    matrix is returned when there are fewer than two documents or when
    vectorization fails (for example, when every document is empty and the
    vocabulary ends up empty); the failure is logged as a warning.
    """
    size = len(contents)
    if size < 2:
        return np.zeros((size, size))

    # Replace missing (None/empty) entries with "" so the vectorizer always
    # receives strings; positions are preserved so indices still line up.
    documents = [c or "" for c in contents]
    try:
        vectorizer = TfidfVectorizer(
            max_features=5000,
            stop_words="english",
            ngram_range=(1, 2),
            min_df=1,
        )
        tfidf = vectorizer.fit_transform(documents)
        return cosine_similarity(tfidf)
    except Exception as exc:
        # Deliberately best-effort: fall back to "no content similarity" so
        # that title-based matching can still run.
        logger.warning("TF-IDF 相似度计算失败: %s", exc)
        return np.zeros((size, size))


def _find_duplicate_clusters(
    articles: List[EnrichedArticle],
    title_threshold: Optional[float] = None,
    content_threshold: Optional[float] = None,
) -> List[Set[int]]:
    """Group articles into duplicate clusters by title or content similarity.

    Two articles are linked when their title similarity reaches
    ``title_threshold`` or their content similarity reaches
    ``content_threshold``. Clusters are the connected components of that
    relation, found with a breadth-first search, so membership is transitive.

    Args:
        articles: Articles to compare.
        title_threshold: Minimum title similarity. A falsy value (``None`` or
            ``0``) falls back to ``settings.TITLE_SIMILARITY_THRESHOLD``.
        content_threshold: Minimum content similarity. A falsy value falls
            back to ``settings.CONTENT_SIMILARITY_THRESHOLD``.

    Returns:
        A list of index sets into ``articles``. Only clusters with at least
        two members are returned.
    """
    title_threshold = title_threshold or settings.TITLE_SIMILARITY_THRESHOLD
    content_threshold = content_threshold or settings.CONTENT_SIMILARITY_THRESHOLD

    n = len(articles)
    if n < 2:
        return []

    contents = []
    for art in articles:
        text = " ".join([
            art.title or "",
            art.ai_summary or art.original_summary or "",
            art.content or "",
        ])
        contents.append(text[:2000])  # cap the length to keep TF-IDF fast
    content_sim = _content_similarity_matrix(contents)

    # Normalize every title once instead of once per compared pair.
    normalized_titles = [_normalize_title(art.title or "") for art in articles]

    visited = [False] * n
    clusters: List[Set[int]] = []
    for start in range(n):
        if visited[start]:
            continue
        cluster = {start}
        queue = deque([start])
        visited[start] = True
        while queue:
            cur = queue.popleft()
            cur_title = normalized_titles[cur]
            for j in range(n):
                if visited[j] or cur == j:
                    continue
                # Check the precomputed content matrix first; the title
                # comparison is far more expensive and is only needed when
                # the content check does not already link the pair.
                is_duplicate = content_sim[cur][j] >= content_threshold
                if not is_duplicate:
                    other_title = normalized_titles[j]
                    if cur_title and other_title:
                        title_sim = SequenceMatcher(None, cur_title, other_title).ratio()
                    else:
                        title_sim = 0.0
                    is_duplicate = title_sim >= title_threshold
                if is_duplicate:
                    cluster.add(j)
                    queue.append(j)
                    visited[j] = True
        if len(cluster) > 1:
            clusters.append(cluster)
    return clusters


def _published_sort_key(published_at: Optional[datetime]) -> datetime:
    """Return a naive datetime usable for ordering publication times.

    Missing values sort last. Timezone-aware values are converted to naive
    UTC so they can be compared with naive ones without raising TypeError.
    NOTE(review): naive values are assumed to already be in UTC - confirm.
    """
    if published_at is None:
        return datetime.max
    if published_at.tzinfo is not None:
        return published_at.astimezone(timezone.utc).replace(tzinfo=None)
    return published_at


def _pick_representative(articles: List[EnrichedArticle], indices: Set[int]) -> EnrichedArticle:
    """Choose the representative article of a duplicate cluster.

    Preference order: articles with an AI summary, then articles with a feed
    category, then the earliest publication time. Articles without a
    publication time rank last within their tier; remaining ties go to the
    lowest index so the choice is deterministic.

    Args:
        articles: The list the cluster indices refer to.
        indices: Non-empty set of indices into ``articles``.
    """
    candidates = [articles[i] for i in sorted(indices)]
    # ``not x`` is False for preferred candidates, and False sorts first.
    return min(
        candidates,
        key=lambda a: (
            not a.ai_summary,
            not a.feed_category,
            _published_sort_key(a.published_at),
        ),
    )


def deduplicate_articles(
    db: Session,
    date_str: Optional[str] = None,
    title_threshold: Optional[float] = None,
    content_threshold: Optional[float] = None,
) -> Dict[str, int]:
    """De-duplicate the articles fetched on a given date.

    Existing duplicate groups for that date are removed first, then articles
    are de-duplicated by exact URL and clustered by title/content similarity.
    One ``DuplicateGroup`` row is written per cluster.

    Args:
        db: Database session; this function commits on it.
        date_str: Date in ``YYYY-MM-DD`` form. Defaults to today's UTC date.
        title_threshold: Optional override passed to the clustering step.
        content_threshold: Optional override passed to the clustering step.

    Returns:
        ``{"total": x, "duplicate_groups": y, "representatives": z}`` where
        ``total`` counts all articles of the day (including URL duplicates).

    Raises:
        ValueError: If ``date_str`` is not in ``YYYY-MM-DD`` format.
    """
    if date_str is None:
        date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    day_start = datetime.strptime(date_str, "%Y-%m-%d")
    day_end = day_start + timedelta(days=1)

    # Only drop the groups that belong to this date so historical groups
    # stay intact; detach their articles before deleting each group.
    old_groups = db.query(DuplicateGroup).filter(DuplicateGroup.brief_date == date_str).all()
    for old_group in old_groups:
        for art in old_group.articles:
            art.duplicate_group_id = None
            art.is_representative = False
        db.delete(old_group)
    db.commit()

    # Load the articles fetched on that date, oldest publication first.
    articles = (
        db.query(EnrichedArticle)
        .filter(
            EnrichedArticle.fetched_at >= day_start,
            EnrichedArticle.fetched_at < day_end,
        )
        .order_by(EnrichedArticle.published_at)
        .all()
    )

    if not articles:
        logger.info("日期 %s 无文章可去重", date_str)
        update_progress("tag_score_dedup", status="running", stage="去重", current=0, total=0, message="无文章可去重")
        return {"total": 0, "duplicate_groups": 0, "representatives": 0}

    update_progress("tag_score_dedup", status="running", stage="计算相似度并去重", current=0, total=0)

    # URL de-duplication first: keep only the first article for each link.
    # Articles without a link are always kept.
    # NOTE(review): skipped URL duplicates are only counted; they are not
    # attached to any DuplicateGroup - confirm this is intended.
    unique_articles: List[EnrichedArticle] = []
    seen_links: Set[str] = set()
    url_dup_count = 0
    for art in articles:
        link = (art.link or "").strip()
        if link and link in seen_links:
            url_dup_count += 1
            continue
        if link:
            seen_links.add(link)
        unique_articles.append(art)

    clusters = _find_duplicate_clusters(
        unique_articles,
        title_threshold=title_threshold,
        content_threshold=content_threshold,
    )

    stats = {"total": len(articles), "duplicate_groups": len(clusters), "representatives": 0}
    update_progress("tag_score_dedup", status="running", stage="写入重复组", current=0, total=len(clusters))

    for ci, cluster in enumerate(clusters):
        # Iterate members in index order so the stored id list is stable.
        member_indices = sorted(cluster)
        representative = _pick_representative(unique_articles, cluster)
        member_ids = [unique_articles[i].id for i in member_indices]

        group = DuplicateGroup(
            representative_article_id=representative.id,
            member_article_ids=member_ids,
            similarity_matrix={},  # may be filled in later
            brief_date=date_str,
        )
        db.add(group)
        db.flush()  # assigns group.id so members can reference it

        for idx in member_indices:
            art = unique_articles[idx]
            art.duplicate_group_id = group.id
            art.is_representative = (art.id == representative.id)

        stats["representatives"] += 1
        report_loop_progress("tag_score_dedup", ci + 1, len(clusters), "写入重复组")

    db.commit()
    logger.info(
        "去重完成: 日期=%s, 总文章=%d, 重复组=%d, URL 重复=%d",
        date_str, stats["total"], stats["duplicate_groups"], url_dup_count
    )
    return stats