778ccefb22
后端 - 新增 app/task_progress.py 线程安全进度注册表 - 任务改为后台线程异步执行(_run_task_background),手动触发立即返回 task_key - 6 个任务函数(summarizer/tagger/scorer/deduplicator/brief/taxonomy)循环内上报进度 - scheduler 定时任务同步上报进度(trigger=scheduled) - 新增 GET /api/tasks/progress 与 POST /api/tasks/progress/reset 接口 - 新增 POST /api/test-connection 接口连通性测试(独立短超时客户端) - 修复 ai_client/rss_client 配置在 import 时固化的 bug(改为 property 运行时读取 settings), 导致实际任务用 .env 假 key 调 LLM 401 - 修复 ai_client 对 reasoning 模型(MiniMax-M3 等)输出 <think> 块的 JSON 解析失败 - 修复 taxonomy bootstrap:LLM 超时(改用 300s 专用 client)、MiniMax 输出审查 (精简样本仅标题 + 约束生成中性类目名)、失败误报 success(改抛异常如实标记) - 修复 models.py 双外键关系映射启动崩溃(显式 foreign_keys) - 修复 main.py SPA 路由 404、ArticleOut.published_at 序列化 500 - 移除 lifespan 同步 bootstrap 阻塞启动,改由 scheduler 后台异步执行 前端 - Deep Ink 高对比度暗色主题重构,修复 Element Plus 暗色模式对比度问题 - Tasks 页面任务进度实时展示(进度条/阶段/计数/状态/触发来源)+ 1.5s 轮询 - 接口测试面板(rssKeeper / LLM 连通性 + 延迟) - 修复 nextJobs jobId 映射 bug 部署与文档 - Dockerfile 优化(BuildKit 缓存挂载、预编译 wheel、去 gcc、阿里云镜像源) - 新增 API.md 接口文档 Co-Authored-By: Claude <noreply@anthropic.com>
120 lines
3.6 KiB
Python
120 lines
3.6 KiB
Python
"""基于规则给文章分类、打标签"""
|
|
import logging
|
|
import re
|
|
from typing import List, Dict, Any, Tuple
|
|
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.task_progress import update_progress, report_loop_progress
|
|
from models import EnrichedArticle, Taxonomy
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _normalize(text: str) -> str:
|
|
"""规范化文本用于关键词匹配"""
|
|
if not text:
|
|
return ""
|
|
# 去除多余空白,统一小写
|
|
text = " ".join(text.split())
|
|
return text.lower()
|
|
|
|
|
|
def _count_matches(text: str, keywords: List[str]) -> int:
    """Count case-insensitive keyword occurrences in *text*.

    Both the text and each keyword are passed through ``_normalize`` and the
    non-overlapping substring occurrences of every keyword are summed. Plain
    substring matching is used, so it also works for languages without word
    separators such as Chinese.

    Args:
        text: The text to search; falsy values give a count of 0.
        keywords: Keywords to look for; falsy entries are ignored.

    Returns:
        The total number of occurrences across all keywords.
    """
    if not text or not keywords:
        return 0

    haystack = _normalize(text)
    total = 0
    for keyword in keywords:
        if not keyword:
            continue
        needle = _normalize(keyword)
        if not needle:
            # A whitespace-only keyword normalizes to "", and
            # ``str.count("")`` returns ``len(haystack) + 1`` — that would
            # make the keyword "match" every text and inflate the score.
            continue
        total += haystack.count(needle)
    return total
|
|
|
|
|
|
def classify_article(article: EnrichedArticle, categories: List[Taxonomy]) -> str:
    """Pick the best-matching category name for *article*.

    The article's title, summary (AI summary if present, otherwise the
    original summary) and content are joined into one text, and each category
    is scored by how often its keywords occur in it. A category whose name
    equals the article's feed category gets 2 extra points. The first
    category with the strictly highest positive score wins.

    Returns:
        The winning category name; if nothing scored above zero, the
        article's feed category; if that is empty too, ``"未分类"``.
    """
    parts = [
        article.title or "",
        article.ai_summary or article.original_summary or "",
        article.content or "",
    ]
    haystack = " ".join(parts)
    source_category = article.feed_category

    winner = ""
    top_score = 0
    for candidate in categories:
        hits = _count_matches(haystack, candidate.keywords or [])
        # Small bonus when the category agrees with the feed's own category.
        if source_category and source_category == candidate.name:
            hits += 2
        if hits > top_score:
            top_score, winner = hits, candidate.name

    if winner:
        return winner
    # No category scored: fall back to the feed category, then the default.
    return source_category if source_category else "未分类"
|
|
|
|
|
|
def tag_article(article: EnrichedArticle, tags: List[Taxonomy]) -> List[str]:
    """Return the names of all tags whose keywords occur in *article*.

    The searched text is the article's title, summary (AI summary if present,
    otherwise the original summary) and content joined by spaces. A tag
    matches when at least one of its keywords is found. Names are returned in
    the order of *tags*, with duplicates removed.
    """
    title = article.title or ""
    summary = article.ai_summary or article.original_summary or ""
    body = article.content or ""
    haystack = " ".join((title, summary, body))

    # dict.fromkeys keeps first-seen order while dropping repeated names.
    hit_names = dict.fromkeys(
        candidate.name
        for candidate in tags
        if _count_matches(haystack, candidate.keywords or []) > 0
    )
    return list(hit_names)
|
|
|
|
|
|
def tag_articles(db: Session, article_ids: List[int] = None) -> int:
    """Classify and tag articles, reporting progress under ``tag_score_dedup``.

    Args:
        db: SQLAlchemy session used for all queries and commits.
        article_ids: When a non-empty list, only these articles are
            processed. When ``None`` or empty, every article whose
            ``category`` is an empty string or NULL is processed. Note that
            the selection looks at ``category`` only; articles that already
            have a category but no tags are not picked up.

    Returns:
        The number of articles processed, or 0 when the taxonomy holds no
        ``category`` rows (in which case nothing is touched).
    """
    categories = db.query(Taxonomy).filter(Taxonomy.kind == "category").all()
    tags = db.query(Taxonomy).filter(Taxonomy.kind == "tag").all()

    if not categories:
        logger.warning("taxonomy 中无 category 数据,跳过分类")
        return 0

    query = db.query(EnrichedArticle)
    if article_ids:
        query = query.filter(EnrichedArticle.id.in_(article_ids))
    else:
        # ``.is_(None)`` renders the same ``IS NULL`` as ``== None`` without
        # tripping the "comparison to None" lint.
        query = query.filter(
            (EnrichedArticle.category == "") | EnrichedArticle.category.is_(None)
        )

    articles = query.all()
    total = len(articles)
    update_progress("tag_score_dedup", status="running", stage="分类打标", current=0, total=total)

    count = 0
    for article in articles:
        article.category = classify_article(article, categories)
        article.tags = tag_article(article, tags)
        count += 1
        # Commit in batches of 50 so a long run does not hold one huge
        # pending flush.
        if count % 50 == 0:
            db.commit()
        # NOTE(review): progress is reported on every iteration here; confirm
        # that the call was not meant to sit under the batch-commit branch.
        report_loop_progress("tag_score_dedup", count, total, "分类打标")

    # Flush whatever is left after the last full batch.
    db.commit()
    logger.info("分类/打标签完成: %d 篇文章", count)
    return count
|