Files
dataClean/scheduler.py
T
congsh 778ccefb22 feat: 任务进度实时展示、接口测试、暗色主题重构及多项 bug 修复
后端
- 新增 app/task_progress.py 线程安全进度注册表
- 任务改为后台线程异步执行(_run_task_background),手动触发立即返回 task_key
- 6 个任务函数(summarizer/tagger/scorer/deduplicator/brief/taxonomy)循环内上报进度
- scheduler 定时任务同步上报进度(trigger=scheduled)
- 新增 GET /api/tasks/progress 与 POST /api/tasks/progress/reset 接口
- 新增 POST /api/test-connection 接口连通性测试(独立短超时客户端)
- 修复 ai_client/rss_client 配置在 import 时固化、导致实际任务用 .env 假 key 调用 LLM 返回 401 的 bug
  (改为 property 运行时读取 settings)
- 修复 ai_client 对 reasoning 模型(MiniMax-M3 等)输出 <think> 块的 JSON 解析失败
- 修复 taxonomy bootstrap:LLM 超时(改用 300s 专用 client)、MiniMax 输出审查
  (精简样本仅标题 + 约束生成中性类目名)、失败误报 success(改抛异常如实标记)
- 修复 models.py 双外键关系映射启动崩溃(显式 foreign_keys)
- 修复 main.py SPA 路由 404、ArticleOut.published_at 序列化 500
- 移除 lifespan 同步 bootstrap 阻塞启动,改由 scheduler 后台异步执行

前端
- Deep Ink 高对比度暗色主题重构,修复 Element Plus 暗色模式对比度问题
- Tasks 页面任务进度实时展示(进度条/阶段/计数/状态/触发来源)+ 1.5s 轮询
- 接口测试面板(rssKeeper / LLM 连通性 + 延迟)
- 修复 nextJobs jobId 映射 bug

部署与文档
- Dockerfile 优化(BuildKit 缓存挂载、预编译 wheel、去 gcc、阿里云镜像源)
- 新增 API.md 接口文档

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-14 15:14:40 +08:00

190 lines
5.9 KiB
Python

"""APScheduler 定时任务"""
import functools
import logging
import threading
from datetime import datetime, timezone
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.date import DateTrigger
from apscheduler.triggers.interval import IntervalTrigger
from apscheduler.triggers.cron import CronTrigger
from sqlalchemy.orm import Session
from config import settings
from database import SessionLocal
from app.taxonomy import ensure_taxonomy, bootstrap_taxonomy
from app.summarizer import fetch_and_summarize
from app.tagger import tag_articles
from app.deduplicator import deduplicate_articles
from app.scorer import score_articles
from app.brief import generate_daily_brief
from app.settings_manager import get_setting_value
from app import task_progress
logger = logging.getLogger(__name__)

# Process-wide scheduler singleton; None until get_scheduler() creates it.
_scheduler: BackgroundScheduler | None = None

# Task mutex: prevents manually triggered tasks and scheduled jobs from
# running at the same time.
_task_lock = threading.Lock()

# Scheduled job function name -> key used when reporting task progress.
_JOB_TASK_KEYS = dict(
    job_fetch_and_summarize="summarize",
    job_tag_score_deduplicate="tag_score_dedup",
    job_generate_daily_brief="generate_daily_brief",
    job_bootstrap_taxonomy="bootstrap_taxonomy",
)
def get_scheduler() -> BackgroundScheduler:
    """Return the shared BackgroundScheduler, building it on first call.

    The instance is cached in the module-level ``_scheduler`` so every caller
    gets the same scheduler. It is only constructed here, not started.
    """
    global _scheduler
    if _scheduler is not None:
        return _scheduler

    job_defaults = {
        # Collapse a backlog of missed runs of one job into a single run.
        "coalesce": True,
        # Never run two instances of the same job concurrently.
        "max_instances": 1,
        # A job may still start up to 300 seconds after its scheduled time.
        "misfire_grace_time": 300,
    }
    _scheduler = BackgroundScheduler(
        job_defaults=job_defaults,
        timezone="Asia/Shanghai",
    )
    return _scheduler
def get_task_lock():
    """Return the module-wide task mutex for use by the manual-task endpoints."""
    return _task_lock
def _with_db(func):
    """Decorate a scheduled job so it runs with a DB session under the task lock.

    The returned zero-argument wrapper:

    * tries to take the global task lock without blocking and skips the run
      (logging a warning) when another task already holds it;
    * opens a ``SessionLocal`` session and passes it to ``func``;
    * reports ``running`` / ``success`` / ``error`` progress when the job's
      function name has an entry in ``_JOB_TASK_KEYS``;
    * logs any exception raised by the job instead of propagating it;
    * always closes the session and releases the lock.

    Args:
        func: Job function taking a single database session argument.

    Returns:
        A callable taking no arguments, suitable for registering as a job.
    """
    @functools.wraps(func)
    def wrapper():
        if not _task_lock.acquire(blocking=False):
            logger.warning("定时任务 %s 跳过:已有其他任务正在执行", func.__name__)
            return

        task_key = _JOB_TASK_KEYS.get(func.__name__)
        db = None
        try:
            # Session creation and the first progress report live inside the
            # try block: if either one raises, the lock must still be released,
            # otherwise every later task would be skipped forever.
            db = SessionLocal()
            if task_key:
                task_progress.update_progress(
                    task_key, status="running", trigger="scheduled",
                    stage="初始化", current=0, total=0, message=None,
                )
            func(db)
            if task_key:
                task_progress.update_progress(
                    task_key, status="success", stage="完成", message="定时任务执行成功"
                )
        except Exception as exc:
            logger.error("定时任务 %s 执行失败: %s", func.__name__, exc, exc_info=True)
            if task_key:
                task_progress.update_progress(
                    task_key, status="error", stage="失败", message=str(exc)[:500]
                )
        finally:
            try:
                if db is not None:
                    db.close()
            finally:
                # Release even when closing the session fails.
                _task_lock.release()

    return wrapper
@_with_db
def job_bootstrap_taxonomy(db: Session):
    """Run the taxonomy initialisation check via ``ensure_taxonomy``.

    Per the original note, the taxonomy is only built when its table is empty;
    that decision is delegated to ``ensure_taxonomy``.
    """
    logger.info("执行 taxonomy 初始化检查")
    ensure_taxonomy(db)
@_with_db
def job_fetch_and_summarize(db: Session):
    """Fetch articles and generate their summaries.

    Calls ``fetch_and_summarize`` with ``hours=24`` and ``limit=200``.
    """
    logger.info("执行摘要生成任务")
    fetch_and_summarize(db, limit=200, hours=24)
@_with_db
def job_tag_score_deduplicate(db: Session):
    """Tag, de-duplicate and then re-score articles, in that order."""
    logger.info("执行分类/打分/去重任务")

    # NOTE(review): the date is taken in UTC while the scheduler itself runs
    # in Asia/Shanghai - confirm this matches how article dates are stored.
    utc_day = datetime.now(timezone.utc).date().isoformat()

    # Step 1: tag articles that have not been classified yet.
    tag_articles(db)

    # Step 2: de-duplicate the articles of the current (UTC) day.
    deduplicate_articles(db, date_str=utc_day)

    # Step 3: recompute scores, including the duplication component.
    score_articles(db, update_duplication=True)
@_with_db
def job_generate_daily_brief(db: Session):
    """Generate the daily brief for the current UTC date with ``force=True``."""
    logger.info("执行每日简报生成任务")

    # NOTE(review): the date is taken in UTC while the cron trigger fires on
    # the scheduler's Asia/Shanghai clock - confirm the intended day.
    utc_day = datetime.now(timezone.utc).date().isoformat()
    generate_daily_brief(db, force=True, date_str=utc_day)
def init_scheduler():
    """Register every scheduled job and start the scheduler.

    Intervals and the daily-brief time are read through ``get_setting_value``,
    falling back to the defaults on ``settings``, and coerced to ``int``.

    Jobs registered, in order:
        * ``bootstrap_taxonomy`` - one-shot, due immediately;
        * ``fetch_and_summarize`` - fixed interval in minutes;
        * ``tag_score_deduplicate`` - fixed interval in minutes;
        * ``generate_daily_brief`` - daily at the configured hour and minute.
    """
    scheduler = get_scheduler()

    def _int_setting(key, fallback):
        # Read one schedule setting and coerce it to an integer.
        return int(get_setting_value(key, fallback))

    summarize_interval = _int_setting(
        "SUMMARIZE_INTERVAL_MINUTES", settings.SUMMARIZE_INTERVAL_MINUTES
    )
    tag_score_interval = _int_setting(
        "TAG_SCORE_INTERVAL_MINUTES", settings.TAG_SCORE_INTERVAL_MINUTES
    )
    brief_hour = _int_setting("DAILY_BRIEF_HOUR", settings.DAILY_BRIEF_HOUR)
    brief_minute = _int_setting("DAILY_BRIEF_MINUTE", settings.DAILY_BRIEF_MINUTE)

    # Taxonomy bootstrap: one-shot job whose run date is "now", so it fires
    # as soon as the scheduler starts.
    run_once_now = DateTrigger(run_date=datetime.now())
    scheduler.add_job(
        job_bootstrap_taxonomy,
        trigger=run_once_now,
        id="bootstrap_taxonomy",
        replace_existing=True,
        max_instances=1,
    )

    # Interval jobs: summarisation first, then tagging/scoring/de-duplication.
    interval_jobs = (
        (job_fetch_and_summarize, "fetch_and_summarize", summarize_interval),
        (job_tag_score_deduplicate, "tag_score_deduplicate", tag_score_interval),
    )
    for job_func, job_id, every_minutes in interval_jobs:
        scheduler.add_job(
            job_func,
            trigger=IntervalTrigger(minutes=every_minutes),
            id=job_id,
            replace_existing=True,
        )

    # Daily brief at the configured wall-clock time.
    daily_trigger = CronTrigger(hour=brief_hour, minute=brief_minute)
    scheduler.add_job(
        job_generate_daily_brief,
        trigger=daily_trigger,
        id="generate_daily_brief",
        replace_existing=True,
    )

    scheduler.start()
    logger.info(
        "调度器已启动: summarize=%d分钟, tag_score=%d分钟, brief=%02d:%02d",
        summarize_interval,
        tag_score_interval,
        brief_hour,
        brief_minute,
    )
def stop_scheduler():
    """Stop the scheduler, if one exists, and drop the cached instance.

    Does nothing when no scheduler has been created. A scheduler that was
    created but never started (for example because job registration failed
    before ``start()``) is discarded without calling ``shutdown()``, which
    would otherwise raise ``SchedulerNotRunningError`` and leave the stale
    instance cached. Shutdown does not wait for running jobs to finish.
    """
    global _scheduler
    if _scheduler is None:
        return

    if _scheduler.running:
        _scheduler.shutdown(wait=False)
    _scheduler = None
    logger.info("调度器已停止")