feat: 修复代码审核报告问题
This commit is contained in:
+116
@@ -0,0 +1,116 @@
|
||||
"""基于规则给文章分类、打标签"""
|
||||
import logging
import re
from typing import Any, Dict, List, Optional, Tuple

from sqlalchemy.orm import Session

from models import EnrichedArticle, Taxonomy
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _normalize(text: str) -> str:
|
||||
"""规范化文本用于关键词匹配"""
|
||||
if not text:
|
||||
return ""
|
||||
# 去除多余空白,统一小写
|
||||
text = " ".join(text.split())
|
||||
return text.lower()
|
||||
|
||||
|
||||
def _count_matches(text: str, keywords: List[str]) -> int:
    """Count case-insensitive keyword occurrences in *text*.

    Both the text and each keyword are passed through ``_normalize`` before
    matching, and matching is plain non-overlapping substring counting
    (``str.count``), so it works for keywords without word boundaries such
    as Chinese terms.

    Args:
        text: The text to search. Falsy values produce a count of 0.
        keywords: Keywords to look for. Falsy entries, and entries that are
            empty after normalization (e.g. whitespace-only), are ignored.

    Returns:
        The sum of the occurrence counts of every usable keyword.
    """
    if not text or not keywords:
        return 0

    haystack = _normalize(text)
    total = 0
    for keyword in keywords:
        if not keyword:
            continue
        needle = _normalize(keyword)
        if not needle:
            # A whitespace-only keyword normalizes to "". str.count("")
            # returns len(haystack) + 1, which would wildly inflate the
            # score, so such keywords must be skipped.
            continue
        total += haystack.count(needle)
    return total
|
||||
|
||||
|
||||
def classify_article(article: EnrichedArticle, categories: List[Taxonomy]) -> str:
    """Pick the best-matching category name for *article*.

    The searched text is the title, the summary (``ai_summary`` if set,
    otherwise ``original_summary``) and the content, joined by spaces. Each
    category is scored with ``_count_matches`` against its ``keywords``; a
    category whose name equals the article's ``feed_category`` gets 2 extra
    points. The highest positive score wins, and ties go to the category
    that appears first in *categories*.

    When nothing scores above zero, the article's ``feed_category`` is
    returned if set, otherwise the literal ``"未分类"``.
    """
    summary = article.ai_summary or article.original_summary or ""
    haystack = " ".join((article.title or "", summary, article.content or ""))

    def _score(candidate) -> int:
        points = _count_matches(haystack, candidate.keywords or [])
        # Small bonus when the source feed was already filed under this category.
        if article.feed_category and article.feed_category == candidate.name:
            points += 2
        return points

    scored = [(_score(candidate), candidate) for candidate in categories]
    # max() returns the first maximal item, matching "first wins on ties".
    top_points, top_candidate = max(
        scored, key=lambda pair: pair[0], default=(0, None)
    )

    chosen = top_candidate.name if top_points > 0 else ""
    # Fallback chain: matched category -> feed category -> placeholder.
    return chosen or article.feed_category or "未分类"
|
||||
|
||||
|
||||
def tag_article(article: EnrichedArticle, tags: List[Taxonomy]) -> List[str]:
    """Return the names of all tags whose keywords occur in *article*.

    The searched text is the title, the summary (``ai_summary`` if set,
    otherwise ``original_summary``) and the content, joined by spaces. A tag
    matches when ``_count_matches`` finds at least one of its ``keywords``.
    Names are returned in the order of *tags*, without duplicates.
    """
    summary = article.ai_summary or article.original_summary or ""
    haystack = " ".join((article.title or "", summary, article.content or ""))

    hits = (
        candidate.name
        for candidate in tags
        if _count_matches(haystack, candidate.keywords or []) > 0
    )
    # dict keys keep insertion order, so this de-duplicates while preserving order.
    return list(dict.fromkeys(hits))
|
||||
|
||||
|
||||
def tag_articles(db: Session, article_ids: Optional[List[int]] = None) -> int:
    """Classify and tag articles, committing the results through *db*.

    Args:
        db: SQLAlchemy session used to load the taxonomy rows and the
            articles, and to commit the updates.
        article_ids: When non-empty, only the articles with these ids are
            processed. When ``None`` or empty, every article whose
            ``category`` is NULL or the empty string is processed instead.
            Articles are not selected based on their tags.

    Returns:
        The number of articles processed. Returns 0 without touching any
        article when there are no taxonomy rows of kind ``"category"``.

    Raises:
        Any exception raised while loading, classifying or committing is
        re-raised after the session has been rolled back. Batches that were
        already committed stay committed.
    """
    categories = db.query(Taxonomy).filter(Taxonomy.kind == "category").all()
    tags = db.query(Taxonomy).filter(Taxonomy.kind == "tag").all()

    if not categories:
        logger.warning("taxonomy 中无 category 数据,跳过分类")
        return 0

    query = db.query(EnrichedArticle)
    if article_ids:
        query = query.filter(EnrichedArticle.id.in_(article_ids))
    else:
        # Not yet classified: category is either the empty string or NULL.
        query = query.filter(
            (EnrichedArticle.category == "") | EnrichedArticle.category.is_(None)
        )

    count = 0
    try:
        for article in query.all():
            article.category = classify_article(article, categories)
            article.tags = tag_article(article, tags)
            count += 1
            # Commit in batches of 50 to keep each transaction small.
            if count % 50 == 0:
                db.commit()
        # Commit the final, possibly partial, batch.
        db.commit()
    except Exception:
        # Leave the session usable for the caller instead of holding a
        # failed or half-applied transaction, then propagate the error.
        db.rollback()
        raise

    logger.info("分类/打标签完成: %d 篇文章", count)
    return count
|
||||
Reference in New Issue
Block a user