"""基于规则给文章分类、打标签""" import logging import re from typing import List, Dict, Any, Tuple from sqlalchemy.orm import Session from app.task_progress import update_progress, report_loop_progress from models import EnrichedArticle, Taxonomy logger = logging.getLogger(__name__) def _normalize(text: str) -> str: """规范化文本用于关键词匹配""" if not text: return "" # 去除多余空白,统一小写 text = " ".join(text.split()) return text.lower() def _count_matches(text: str, keywords: List[str]) -> int: """统计关键词在文本中的命中次数(不区分大小写)""" if not text or not keywords: return 0 text_norm = _normalize(text) count = 0 for kw in keywords: if not kw: continue kw_norm = _normalize(kw) # 简单子串匹配;中文关键词也适用 count += text_norm.count(kw_norm) return count def classify_article(article: EnrichedArticle, categories: List[Taxonomy]) -> str: """为文章选择最匹配的分类""" text = " ".join([ article.title or "", article.ai_summary or article.original_summary or "", article.content or "", ]) best_category = "" best_score = 0 for cat in categories: score = _count_matches(text, cat.keywords or []) # 如果文章来自某个 Feed 分类,给予少量加成 if article.feed_category and article.feed_category == cat.name: score += 2 if score > best_score: best_score = score best_category = cat.name # 若完全没有命中,回退到源分类 if not best_category and article.feed_category: best_category = article.feed_category if not best_category: best_category = "未分类" return best_category def tag_article(article: EnrichedArticle, tags: List[Taxonomy]) -> List[str]: """为文章打上命中的标签""" text = " ".join([ article.title or "", article.ai_summary or article.original_summary or "", article.content or "", ]) matched = [] for tag in tags: if _count_matches(text, tag.keywords or []) > 0: matched.append(tag.name) # 去重并保持顺序 return list(dict.fromkeys(matched)) def tag_articles(db: Session, article_ids: List[int] = None) -> int: """ 对文章进行分类和打标签。 若指定 article_ids 则只处理这些文章;否则处理所有未分类或没有标签的文章。 返回处理数量。 """ categories = db.query(Taxonomy).filter(Taxonomy.kind == "category").all() tags = db.query(Taxonomy).filter(Taxonomy.kind == "tag").all() if not categories: logger.warning("taxonomy 中无 category 数据,跳过分类") return 0 query = db.query(EnrichedArticle) if article_ids: query = query.filter(EnrichedArticle.id.in_(article_ids)) else: query = query.filter( (EnrichedArticle.category == "") | (EnrichedArticle.category == None) ) articles = query.all() update_progress("tag_score_dedup", status="running", stage="分类打标", current=0, total=len(articles)) count = 0 for article in articles: article.category = classify_article(article, categories) article.tags = tag_article(article, tags) count += 1 if count % 50 == 0: db.commit() report_loop_progress("tag_score_dedup", count, len(articles), "分类打标") db.commit() logger.info("分类/打标签完成: %d 篇文章", count) return count