# app/tagger.py

"""基于规则给文章分类、打标签"""
import logging
import re
from typing import List, Dict, Any, Tuple

from sqlalchemy.orm import Session

from app.task_progress import update_progress, report_loop_progress
from models import EnrichedArticle, Taxonomy

logger = logging.getLogger(__name__)


def _normalize(text: str) -> str:
    """Return *text* prepared for keyword matching.

    Runs of whitespace are collapsed to single spaces, leading/trailing
    whitespace is dropped, and the result is lower-cased. Falsy input
    (``None`` or an empty string) yields ``""``.
    """
    return " ".join(text.split()).lower() if text else ""


def _count_matches(text: str, keywords: List[str]) -> int:
    """Count case-insensitive keyword hits in *text*.

    Both the text and every keyword are passed through ``_normalize`` and
    matched as plain substrings (which also works for Chinese keywords,
    where there are no word boundaries). Occurrences are counted with
    ``str.count``, i.e. non-overlapping.

    Args:
        text: The text to search; falsy values yield 0.
        keywords: Candidate keywords; empty and whitespace-only entries
            are ignored.

    Returns:
        The summed number of occurrences of all usable keywords.
    """
    if not text or not keywords:
        return 0

    # A whitespace-only keyword normalizes to "", and "abc".count("") is
    # len("abc") + 1 -- that would wildly inflate the score, so such
    # keywords are dropped up front together with empty/None ones.
    usable = [kw for kw in keywords if kw and kw.strip()]
    if not usable:
        return 0

    # Normalize the (potentially long) text once, not once per keyword.
    text_norm = _normalize(text)
    return sum(text_norm.count(_normalize(kw)) for kw in usable)


def classify_article(article: EnrichedArticle, categories: List[Taxonomy]) -> str:
    """Pick the best-matching category name for *article*.

    The title, the summary (AI summary if present, otherwise the original
    summary) and the content are joined into one text, and each category is
    scored by how often its keywords occur in it. A category whose name
    equals the article's ``feed_category`` gets a +2 bonus. The first
    category with the strictly highest positive score wins.

    Returns:
        The winning category name; if nothing scored above zero, the
        article's ``feed_category``; if that is empty too, ``"未分类"``.
    """
    title = article.title or ""
    summary = article.ai_summary or article.original_summary or ""
    body = article.content or ""
    haystack = " ".join((title, summary, body))

    source_category = article.feed_category

    winner, top_score = "", 0
    for candidate in categories:
        hits = _count_matches(haystack, candidate.keywords or [])
        # Small boost when the candidate is the category of the source feed.
        if source_category and source_category == candidate.name:
            hits += 2
        # Strict comparison keeps the earliest category on ties.
        if hits > top_score:
            winner, top_score = candidate.name, hits

    # Fallback chain: keyword winner -> feed category -> "uncategorized".
    return winner or source_category or "未分类"


def tag_article(article: EnrichedArticle, tags: List[Taxonomy]) -> List[str]:
    """Return the names of all tags whose keywords occur in *article*.

    The searched text is the title, the summary (AI summary if present,
    otherwise the original summary) and the content joined by spaces. A tag
    matches as soon as any of its keywords is found at least once.

    Returns:
        Matching tag names in the order of *tags*, without duplicates.
    """
    title = article.title or ""
    summary = article.ai_summary or article.original_summary or ""
    body = article.content or ""
    haystack = " ".join((title, summary, body))

    hit_names = (
        candidate.name
        for candidate in tags
        if _count_matches(haystack, candidate.keywords or []) > 0
    )
    # dict keys keep insertion order, so this de-duplicates stably.
    return list(dict.fromkeys(hit_names))


def tag_articles(db: Session, article_ids: List[int] = None) -> int:
    """Classify and tag articles, persisting the results through *db*.

    Args:
        db: SQLAlchemy session used to load the taxonomy rows and the
            articles, and to commit the updated ``category`` / ``tags``.
        article_ids: When non-empty, only the articles with these ids are
            processed. When ``None`` or empty, every article whose
            ``category`` is an empty string or NULL is processed instead
            (articles are not selected by missing tags).

    Returns:
        The number of articles processed. Returns 0 without touching any
        article when the taxonomy holds no ``category`` rows.
    """
    categories = db.query(Taxonomy).filter(Taxonomy.kind == "category").all()
    tags = db.query(Taxonomy).filter(Taxonomy.kind == "tag").all()

    if not categories:
        logger.warning("taxonomy 中无 category 数据，跳过分类")
        return 0

    query = db.query(EnrichedArticle)
    if article_ids:
        query = query.filter(EnrichedArticle.id.in_(article_ids))
    else:
        # `.is_(None)` is the explicit SQLAlchemy spelling of IS NULL and
        # avoids the `== None` comparison that linters flag (E711).
        query = query.filter(
            (EnrichedArticle.category == "") | EnrichedArticle.category.is_(None)
        )

    articles = query.all()
    total = len(articles)
    update_progress("tag_score_dedup", status="running", stage="分类打标", current=0, total=total)

    count = 0  # stays 0 when there is nothing to process
    for count, article in enumerate(articles, start=1):
        article.category = classify_article(article, categories)
        article.tags = tag_article(article, tags)
        # Commit in batches of 50 so a long run persists work incrementally.
        if count % 50 == 0:
            db.commit()
        report_loop_progress("tag_score_dedup", count, total, "分类打标")

    # Flush the final partial batch.
    db.commit()
    logger.info("分类/打标签完成: %d 篇文章", count)
    return count
feat: 修复代码审核报告问题 2026-06-12 16:04:03 +08:00			`"""基于规则给文章分类、打标签"""`
			`import logging`
			`import re`
			`from typing import List, Dict, Any, Tuple`

			`from sqlalchemy.orm import Session`

feat: 任务进度实时展示、接口测试、暗色主题重构及多项 bug 修复 2026-06-14 15:14:40 +08:00			`from app.task_progress import update_progress, report_loop_progress`
feat: 修复代码审核报告问题 2026-06-12 16:04:03 +08:00			`from models import EnrichedArticle, Taxonomy`

			`logger = logging.getLogger(__name__)`


			`def _normalize(text: str) -> str:`
			`"""规范化文本用于关键词匹配"""`
			`if not text:`
			`return ""`
			`# 去除多余空白，统一小写`
			`text = " ".join(text.split())`
			`return text.lower()`


			`def _count_matches(text: str, keywords: List[str]) -> int:`
			`"""统计关键词在文本中的命中次数（不区分大小写）"""`
			`if not text or not keywords:`
			`return 0`
			`text_norm = _normalize(text)`
			`count = 0`
			`for kw in keywords:`
			`if not kw:`
			`continue`
			`kw_norm = _normalize(kw)`
			`# 简单子串匹配；中文关键词也适用`
			`count += text_norm.count(kw_norm)`
			`return count`


			`def classify_article(article: EnrichedArticle, categories: List[Taxonomy]) -> str:`
			`"""为文章选择最匹配的分类"""`
			`text = " ".join([`
			`article.title or "",`
			`article.ai_summary or article.original_summary or "",`
			`article.content or "",`
			`])`

			`best_category = ""`
			`best_score = 0`

			`for cat in categories:`
			`score = _count_matches(text, cat.keywords or [])`
			`# 如果文章来自某个 Feed 分类，给予少量加成`
			`if article.feed_category and article.feed_category == cat.name:`
			`score += 2`
			`if score > best_score:`
			`best_score = score`
			`best_category = cat.name`

			`# 若完全没有命中，回退到源分类`
			`if not best_category and article.feed_category:`
			`best_category = article.feed_category`

			`if not best_category:`
			`best_category = "未分类"`

			`return best_category`


			`def tag_article(article: EnrichedArticle, tags: List[Taxonomy]) -> List[str]:`
			`"""为文章打上命中的标签"""`
			`text = " ".join([`
			`article.title or "",`
			`article.ai_summary or article.original_summary or "",`
			`article.content or "",`
			`])`

			`matched = []`
			`for tag in tags:`
			`if _count_matches(text, tag.keywords or []) > 0:`
			`matched.append(tag.name)`

			`# 去重并保持顺序`
			`return list(dict.fromkeys(matched))`


			`def tag_articles(db: Session, article_ids: List[int] = None) -> int:`
			`"""`
			`对文章进行分类和打标签。`
			`若指定 article_ids 则只处理这些文章；否则处理所有未分类或没有标签的文章。`
			`返回处理数量。`
			`"""`
			`categories = db.query(Taxonomy).filter(Taxonomy.kind == "category").all()`
			`tags = db.query(Taxonomy).filter(Taxonomy.kind == "tag").all()`

			`if not categories:`
			`logger.warning("taxonomy 中无 category 数据，跳过分类")`
			`return 0`

			`query = db.query(EnrichedArticle)`
			`if article_ids:`
			`query = query.filter(EnrichedArticle.id.in_(article_ids))`
			`else:`
			`query = query.filter(`
			`(EnrichedArticle.category == "") \| (EnrichedArticle.category == None)`
			`)`

			`articles = query.all()`
feat: 任务进度实时展示、接口测试、暗色主题重构及多项 bug 修复 2026-06-14 15:14:40 +08:00			`update_progress("tag_score_dedup", status="running", stage="分类打标", current=0, total=len(articles))`
feat: 修复代码审核报告问题 2026-06-12 16:04:03 +08:00			`count = 0`
			`for article in articles:`
			`article.category = classify_article(article, categories)`
			`article.tags = tag_article(article, tags)`
			`count += 1`
			`if count % 50 == 0:`
			`db.commit()`
feat: 任务进度实时展示、接口测试、暗色主题重构及多项 bug 修复 2026-06-14 15:14:40 +08:00			`report_loop_progress("tag_score_dedup", count, len(articles), "分类打标")`
feat: 修复代码审核报告问题 2026-06-12 16:04:03 +08:00
			`db.commit()`
			`logger.info("分类/打标签完成: %d 篇文章", count)`
			`return count`