427 lines
13 KiB
Python
427 lines
13 KiB
Python
|
|
"""dataClean FastAPI 入口"""
|
||
|
|
import logging
|
||
|
|
import os
|
||
|
|
from contextlib import asynccontextmanager
|
||
|
|
from datetime import datetime, timedelta, timezone
|
||
|
|
from typing import Optional, List
|
||
|
|
|
||
|
|
from fastapi import FastAPI, Depends, HTTPException, Query, Body, Security, status
|
||
|
|
from fastapi.middleware.cors import CORSMiddleware
|
||
|
|
from fastapi.staticfiles import StaticFiles
|
||
|
|
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
||
|
|
from pydantic import BaseModel, ConfigDict
|
||
|
|
from sqlalchemy.orm import Session
|
||
|
|
|
||
|
|
from config import settings
|
||
|
|
from database import init_db, get_db, SessionLocal
|
||
|
|
from scheduler import init_scheduler, stop_scheduler, get_scheduler, get_task_lock
|
||
|
|
from app.taxonomy import bootstrap_taxonomy, list_taxonomy, ensure_taxonomy
|
||
|
|
from app.summarizer import fetch_and_summarize
|
||
|
|
from app.tagger import tag_articles
|
||
|
|
from app.deduplicator import deduplicate_articles
|
||
|
|
from app.scorer import score_articles
|
||
|
|
from app.brief import generate_daily_brief
|
||
|
|
from app.settings_manager import (
|
||
|
|
init_default_settings,
|
||
|
|
list_settings,
|
||
|
|
get_setting,
|
||
|
|
set_setting,
|
||
|
|
reset_settings,
|
||
|
|
apply_db_settings_to_config,
|
||
|
|
)
|
||
|
|
from models import EnrichedArticle, DailyBrief, Taxonomy, DuplicateGroup, AppSetting
|
||
|
|
|
||
|
|
# Configure logging once at import time. The level name is read from
# settings.LOG_LEVEL; a name the logging module does not define falls back
# to INFO.
logging.basicConfig(
    level=getattr(logging, settings.LOG_LEVEL.upper(), logging.INFO),
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# API token authentication (enforced only when a token is configured).
# auto_error=False is passed so that verify_token receives the credentials
# (or a falsy value) and chooses the error response itself.
security_scheme = HTTPBearer(auto_error=False)
|
||
|
|
|
||
|
|
|
||
|
|
def _get_allowed_origins() -> List[str]:
    """Parse the configured CORS origins into a list.

    Reads ``settings.CORS_ALLOWED_ORIGINS`` as a comma-separated string,
    trims the whitespace around every entry and drops entries that end up
    empty.

    Returns:
        The cleaned origins in their configured order, or an empty list when
        the setting is empty or unset.
    """
    configured = settings.CORS_ALLOWED_ORIGINS
    if not configured:
        # Nothing configured: the caller decides what an empty list means.
        return []

    origins = []
    for entry in configured.split(","):
        cleaned = entry.strip()
        if cleaned:
            origins.append(cleaned)
    return origins
|
||
|
|
|
||
|
|
|
||
|
|
def verify_token(credentials: Optional[HTTPAuthorizationCredentials] = Security(security_scheme)):
    """Validate the API bearer token; authentication is skipped when none is configured.

    Args:
        credentials: Parsed ``Authorization`` header supplied by
            ``security_scheme``; falsy when the request carried none.

    Returns:
        ``None`` when ``settings.API_TOKEN`` is empty (auth disabled),
        otherwise the presented token string.

    Raises:
        HTTPException: 401 when a token is configured but no credentials were
            sent; 403 when the scheme is not Bearer or the token differs.
    """
    # Function-scope import keeps the new dependency local to this block.
    import hmac

    token = settings.API_TOKEN
    if not token:
        return None
    if not credentials:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="缺少 Authorization 请求头",
            headers={"WWW-Authenticate": "Bearer"},
        )

    # HTTP auth scheme names are case-insensitive, so "bearer" must be
    # accepted just like "Bearer".
    scheme_ok = credentials.scheme.lower() == "bearer"
    # Constant-time comparison avoids leaking how much of the token matched.
    # Both sides are encoded because compare_digest rejects non-ASCII str.
    # Assumes settings.API_TOKEN is a plain str.
    token_ok = hmac.compare_digest(
        credentials.credentials.encode("utf-8"),
        token.encode("utf-8"),
    )
    if not (scheme_ok and token_ok):
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="无效的 API Token",
        )
    return credentials.credentials
|
||
|
|
|
||
|
|
|
||
|
|
def _run_task_locked(func, db: Session):
    """Run ``func(db)`` while holding the shared task lock.

    The lock is requested without blocking, so a call made while another
    task holds it is rejected immediately instead of queued.

    Args:
        func: Callable invoked with ``db`` as its only argument.
        db: Database session handed through to ``func``.

    Returns:
        Whatever ``func(db)`` returns.

    Raises:
        HTTPException: 409 when the lock is already held.
    """
    if not get_task_lock().acquire(blocking=False):
        raise HTTPException(status_code=409, detail="已有任务正在执行,请稍后再试")

    try:
        outcome = func(db)
    finally:
        # Released whether func succeeded or raised.
        get_task_lock().release()
    return outcome
|
||
|
|
|
||
|
|
|
||
|
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage application startup and shutdown.

    Startup calls ``init_db()``, then uses a short-lived session to run
    ``init_default_settings``, ``apply_db_settings_to_config`` and
    ``ensure_taxonomy``, and finally starts the scheduler. Shutdown stops
    the scheduler.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).
    """
    logger.info("启动 dataClean 服务")
    init_db()

    db = SessionLocal()
    try:
        # Initialise the default settings.
        init_default_settings(db)
        # Override the global settings object with values stored in the database.
        apply_db_settings_to_config(db)
        # Make sure the taxonomy is in place on first start.
        ensure_taxonomy(db)
    except Exception as exc:
        # Deliberately best-effort: the service still starts, but the full
        # traceback is logged so the failure can be diagnosed.
        logger.exception("启动初始化失败: %s", exc)
    finally:
        db.close()

    init_scheduler()
    try:
        yield
    finally:
        # Runs even when an exception is thrown into the generator at the
        # yield, so the scheduler is always stopped.
        stop_scheduler()
|
||
|
|
|
||
|
|
|
||
|
|
# The application object. Startup and shutdown work is delegated to the
# `lifespan` context manager passed below.
app = FastAPI(
    title="dataClean",
    description="RSS 数据清洗、摘要、分类、打分与简报生成服务",
    version="1.0.0",
    lifespan=lifespan,
)
|
||
|
|
|
||
|
|
# CORS configuration: production should narrow this to concrete domains, and
# a wildcard origin must never be combined with allow_credentials=True.
# With no origins configured the middleware falls back to "*" without
# credentials. Credentials are also disabled when "*" appears in the
# configured list, so the wildcard rule holds in both cases.
_allowed_origins = _get_allowed_origins()
app.add_middleware(
    CORSMiddleware,
    allow_origins=_allowed_origins or ["*"],
    allow_credentials=bool(_allowed_origins) and "*" not in _allowed_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
||
|
|
|
||
|
|
|
||
|
|
# ---------- Pydantic 模型 ----------
|
||
|
|
|
||
|
|
# Response schema for one enriched article (used by the /api/articles
# endpoints). from_attributes=True lets pydantic read the fields from object
# attributes, which is how the ORM rows returned by the handlers are
# converted.
class ArticleOut(BaseModel):
    id: int
    rk_article_id: int
    title: str
    link: str
    feed_title: str
    category: str
    tags: List[str]
    heat_score: float
    importance_score: float
    duplication_score: float
    composite_score: float
    ai_summary: str
    is_representative: bool
    # No default is given, so the field is required but may be None.
    # NOTE(review): declared as str; if the ORM attribute is a datetime,
    # validation may reject it. Confirm against the model definition.
    published_at: Optional[str]

    model_config = ConfigDict(from_attributes=True)
|
||
|
|
|
||
|
|
|
||
|
|
# Paged article listing returned by GET /api/articles.
class ArticleListOut(BaseModel):
    # The requested page of articles.
    items: List[ArticleOut]
    # Number of rows matching the filters before offset/limit are applied.
    total: int
|
||
|
|
|
||
|
|
|
||
|
|
# Response schema for one daily brief (used by the /api/briefs endpoints).
# from_attributes=True lets pydantic read the fields from ORM row attributes.
class BriefOut(BaseModel):
    id: int
    # Compared against the `date` path parameter as a plain string.
    brief_date: str
    total_articles: int
    unique_articles: int
    # Untyped mapping; NOTE(review): presumably per-category data, confirm
    # against the brief generator.
    by_category: dict
    markdown_path: str

    model_config = ConfigDict(from_attributes=True)
|
||
|
|
|
||
|
|
|
||
|
|
# Response schema for one taxonomy entry (used by GET /api/taxonomy).
# from_attributes=True lets pydantic read the fields from object attributes.
class TaxonomyOut(BaseModel):
    id: int
    name: str
    # The stats endpoint counts entries whose kind is "category" or "tag".
    kind: str
    description: str
    keywords: List[str]
    weight: float
    created_by_ai: bool

    model_config = ConfigDict(from_attributes=True)
|
||
|
|
|
||
|
|
|
||
|
|
# Response schema for one configuration entry (used by GET /api/settings).
# Unlike the other output models this one does not enable from_attributes.
class SettingOut(BaseModel):
    key: str
    value: str
    description: str
    is_sensitive: bool
    # NOTE(review): presumably true when `value` has been masked for display;
    # confirm against list_settings.
    is_masked: bool
    # No default is given, so the field is required but may be None.
    updated_at: Optional[str]
|
||
|
|
|
||
|
|
|
||
|
|
# Request body for PUT /api/settings/{key}: the new value for that key.
class SettingUpdate(BaseModel):
    value: str
|
||
|
|
|
||
|
|
|
||
|
|
# Request body for PUT /api/settings: a mapping of setting key to new value.
class BatchSettingsUpdate(BaseModel):
    # Untyped dict: neither keys nor values are validated at this layer.
    settings: dict
|
||
|
|
|
||
|
|
|
||
|
|
# Response schema for the dashboard statistics (GET /api/stats).
class StatsOut(BaseModel):
    total_articles: int
    today_articles: int
    ai_summarized: int
    categories: int
    tags: int
    duplicate_groups: int
    briefs: int
    # Maps scheduler job id to its next run time as an ISO-8601 string, or
    # None when the job has no next run time.
    next_jobs: dict
|
||
|
|
|
||
|
|
|
||
|
|
# ---------- 健康检查 ----------
|
||
|
|
|
||
|
|
# Health probe: returns a fixed payload and touches no other component.
@app.get("/health")
def health():
    payload = {"status": "ok", "service": "dataClean"}
    return payload
|
||
|
|
|
||
|
|
|
||
|
|
# ---------- 文章接口 ----------
|
||
|
|
|
||
|
|
# List enriched articles, best composite score first, with optional filters.
#
# Parameters:
#   date                 YYYY-MM-DD; keeps rows whose fetched_at falls on that day.
#   category             exact match on the category column.
#   tag                  keeps rows whose tags contain this tag.
#   representative_only  keeps rows flagged as representative or belonging to
#                        no duplicate group.
#   limit / offset       paging window (limit 1..200, offset >= 0).
#
# Returns a dict with the page under "items" and the unpaged match count
# under "total". Raises HTTP 400 when `date` is not a valid YYYY-MM-DD date.
@app.get("/api/articles", response_model=ArticleListOut)
def list_articles(
    date: Optional[str] = Query(None, description="日期 YYYY-MM-DD"),
    category: Optional[str] = Query(None),
    tag: Optional[str] = Query(None),
    representative_only: bool = Query(False, description="仅返回重复组代表文章"),
    limit: int = Query(50, ge=1, le=200),
    offset: int = Query(0, ge=0),
    db: Session = Depends(get_db),
):
    query = db.query(EnrichedArticle)

    if date:
        try:
            day = datetime.strptime(date, "%Y-%m-%d")
        except ValueError as exc:
            # A malformed date is a client error, not a server failure.
            raise HTTPException(
                status_code=400,
                detail="日期格式无效,应为 YYYY-MM-DD",
            ) from exc
        next_day = day + timedelta(days=1)
        # Half-open interval [day, next_day) covers exactly one calendar day.
        query = query.filter(
            EnrichedArticle.fetched_at >= day,
            EnrichedArticle.fetched_at < next_day,
        )
    if category:
        query = query.filter(EnrichedArticle.category == category)
    if tag:
        # NOTE(review): the original comment said SQLite json_each is used for
        # exact matching, but this is the generic `.contains([tag])` operator.
        # Confirm it produces the intended JSON membership test on the target
        # database.
        query = query.filter(EnrichedArticle.tags.contains([tag]))
    if representative_only:
        # `== True` / `== None` are intentional: SQLAlchemy overloads them to
        # build the SQL comparison.
        query = query.filter(
            (EnrichedArticle.is_representative == True)  # noqa: E712
            | (EnrichedArticle.duplicate_group_id == None)  # noqa: E711
        )

    total = query.count()
    # The id tie-breaker keeps paging stable when several rows share the same
    # composite score.
    items = (
        query.order_by(EnrichedArticle.composite_score.desc(), EnrichedArticle.id.desc())
        .offset(offset)
        .limit(limit)
        .all()
    )
    return {"items": items, "total": total}
|
||
|
|
|
||
|
|
|
||
|
|
# Fetch one enriched article by its id; responds with HTTP 404 when no row
# matches.
@app.get("/api/articles/{article_id}", response_model=ArticleOut)
def get_article(article_id: int, db: Session = Depends(get_db)):
    by_id = db.query(EnrichedArticle).filter(EnrichedArticle.id == article_id)
    found = by_id.first()
    if found:
        return found
    raise HTTPException(status_code=404, detail="文章不存在")
|
||
|
|
|
||
|
|
|
||
|
|
# ---------- 简报接口 ----------
|
||
|
|
|
||
|
|
# List daily briefs ordered by brief_date descending, capped at `limit`
# entries (1..100, default 30).
@app.get("/api/briefs", response_model=List[BriefOut])
def list_briefs(
    limit: int = Query(30, ge=1, le=100),
    db: Session = Depends(get_db),
):
    newest_first = db.query(DailyBrief).order_by(DailyBrief.brief_date.desc())
    return newest_first.limit(limit).all()
|
||
|
|
|
||
|
|
|
||
|
|
# Fetch the brief whose brief_date equals the `date` path segment (compared
# as a string); responds with HTTP 404 when there is none.
@app.get("/api/briefs/{date}", response_model=BriefOut)
def get_brief(date: str, db: Session = Depends(get_db)):
    for_date = db.query(DailyBrief).filter(DailyBrief.brief_date == date)
    found = for_date.first()
    if found:
        return found
    raise HTTPException(status_code=404, detail="简报不存在")
|
||
|
|
|
||
|
|
|
||
|
|
# Force regeneration of the brief for `date` (token-protected).
#
# Returns a confirmation message plus whatever generate_daily_brief returns.
# Raises HTTP 500 carrying the error text when generation fails; an
# HTTPException raised by the generator is passed through unchanged.
#
# NOTE(review): unlike the /api/tasks/* endpoints this call does not take the
# shared task lock. Confirm whether concurrent generation is acceptable.
@app.post("/api/briefs/{date}/regenerate")
def regenerate_brief(date: str, db: Session = Depends(get_db), _=Depends(verify_token)):
    try:
        data = generate_daily_brief(db, date_str=date, force=True)
    except HTTPException:
        # Keep deliberate HTTP errors (status and detail) intact instead of
        # rewriting them into a generic 500.
        raise
    except Exception as exc:
        # logger.exception records the traceback alongside the message.
        logger.exception("重新生成简报失败: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    return {"message": "简报已重新生成", "data": data}
|
||
|
|
|
||
|
|
|
||
|
|
# ---------- 分类体系接口 ----------
|
||
|
|
|
||
|
|
# List taxonomy entries; the optional `kind` query parameter is handed to
# list_taxonomy unchanged.
@app.get("/api/taxonomy", response_model=List[TaxonomyOut])
def get_taxonomy(kind: Optional[str] = Query(None), db: Session = Depends(get_db)):
    entries = list_taxonomy(db, kind=kind)
    return entries
|
||
|
|
|
||
|
|
|
||
|
|
# Trigger taxonomy bootstrapping (token-protected). `force` is forwarded to
# bootstrap_taxonomy. Both outcomes answer HTTP 200; only the message tells
# them apart.
@app.post("/api/taxonomy/bootstrap")
def trigger_taxonomy_bootstrap(
    force: bool = False,
    db: Session = Depends(get_db),
    _=Depends(verify_token),
):
    if bootstrap_taxonomy(db, force=force):
        return {"message": "taxonomy 初始化成功"}
    return {"message": "taxonomy 已存在或初始化失败,请检查日志"}
|
||
|
|
|
||
|
|
|
||
|
|
# ---------- 手动触发任务接口 ----------
|
||
|
|
|
||
|
|
# Manually run the summarisation task (token-protected) under the shared
# task lock, calling fetch_and_summarize with hours=24 and limit=200.
@app.post("/api/tasks/summarize")
def task_summarize(db: Session = Depends(get_db), _=Depends(verify_token)):
    def _summarize(session):
        return fetch_and_summarize(session, hours=24, limit=200)

    stats = _run_task_locked(_summarize, db)
    return {"message": "摘要任务完成", "stats": stats}
|
||
|
|
|
||
|
|
|
||
|
|
# Manually run tagging, de-duplication and scoring, in that order, under the
# shared task lock (token-protected). De-duplication targets the current UTC
# date.
@app.post("/api/tasks/tag-score-dedup")
def task_tag_score_dedup(db: Session = Depends(get_db), _=Depends(verify_token)):
    def _pipeline(session):
        tag_articles(session)
        # The date is taken after tagging has finished, as in the original.
        utc_today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
        deduplicate_articles(session, date_str=utc_today)
        score_articles(session, update_duplication=True)

    _run_task_locked(_pipeline, db)
    return {"message": "分类/去重/打分任务完成"}
|
||
|
|
|
||
|
|
|
||
|
|
# Manually (re)generate the brief for the current UTC date under the shared
# task lock (token-protected).
@app.post("/api/tasks/brief")
def task_brief(db: Session = Depends(get_db), _=Depends(verify_token)):
    def _build_today(session):
        # date.isoformat() yields the same YYYY-MM-DD text as the strftime
        # pattern used elsewhere in this file.
        utc_today = datetime.now(timezone.utc).date().isoformat()
        return generate_daily_brief(session, date_str=utc_today, force=True)

    data = _run_task_locked(_build_today, db)
    return {"message": "简报生成任务完成", "data": data}
|
||
|
|
|
||
|
|
|
||
|
|
# ---------- 配置管理接口 ----------
|
||
|
|
|
||
|
|
# List all settings (token-protected), asking list_settings to mask
# sensitive values.
@app.get("/api/settings", response_model=List[SettingOut])
def get_settings(db: Session = Depends(get_db), _=Depends(verify_token)):
    entries = list_settings(db, mask_sensitive=True)
    return entries
|
||
|
|
|
||
|
|
|
||
|
|
# Store a new value for one setting (token-protected). Responds with HTTP 400
# when set_setting rejects the key/value.
@app.put("/api/settings/{key}")
def update_setting(
    key: str,
    body: SettingUpdate,
    db: Session = Depends(get_db),
    _=Depends(verify_token),
):
    if set_setting(db, key, body.value):
        return {"message": "配置已保存,重启服务后生效"}
    raise HTTPException(status_code=400, detail="无效的配置项")
|
||
|
|
|
||
|
|
|
||
|
|
# Store several settings in one request (token-protected). Every entry is
# attempted; the keys set_setting rejected are reported together in one
# HTTP 400.
@app.put("/api/settings")
def update_settings_batch(
    body: BatchSettingsUpdate,
    db: Session = Depends(get_db),
    _=Depends(verify_token),
):
    rejected = [
        name
        for name, new_value in body.settings.items()
        if not set_setting(db, name, new_value)
    ]
    if not rejected:
        return {"message": "配置已保存,重启服务后生效"}
    raise HTTPException(status_code=400, detail=f"以下配置项无效: {', '.join(rejected)}")
|
||
|
|
|
||
|
|
|
||
|
|
# Reset every setting by delegating to reset_settings (token-protected).
@app.post("/api/settings/reset")
def reset_all_settings(db: Session = Depends(get_db), _=Depends(verify_token)):
    reset_settings(db)
    confirmation = {"message": "配置已重置为环境变量默认值,重启服务后生效"}
    return confirmation
|
||
|
|
|
||
|
|
|
||
|
|
# ---------- 仪表盘统计接口 ----------
|
||
|
|
|
||
|
|
# Dashboard statistics: row counts for articles, taxonomy entries, duplicate
# groups and briefs, plus the next run time of every scheduler job.
@app.get("/api/stats", response_model=StatsOut)
def get_stats(db: Session = Depends(get_db)):
    # Naive midnight of the current UTC date, the same value the original
    # obtained by formatting and re-parsing the date string.
    day_start = datetime.now(timezone.utc).replace(
        hour=0, minute=0, second=0, microsecond=0, tzinfo=None
    )
    day_end = day_start + timedelta(days=1)
    fetched_today = (
        EnrichedArticle.fetched_at >= day_start,
        EnrichedArticle.fetched_at < day_end,
    )

    # Dict entries are evaluated top to bottom, so the queries run in the
    # same order as before.
    stats = {
        "total_articles": db.query(EnrichedArticle).count(),
        "today_articles": db.query(EnrichedArticle).filter(*fetched_today).count(),
        "ai_summarized": db.query(EnrichedArticle).filter(EnrichedArticle.ai_summary != "").count(),
        "categories": db.query(Taxonomy).filter(Taxonomy.kind == "category").count(),
        "tags": db.query(Taxonomy).filter(Taxonomy.kind == "tag").count(),
        "duplicate_groups": db.query(DuplicateGroup).count(),
        "briefs": db.query(DailyBrief).count(),
    }

    # Jobs without a next run time are reported as None.
    stats["next_jobs"] = {
        job.id: (job.next_run_time.isoformat() if job.next_run_time else None)
        for job in get_scheduler().get_jobs()
    }
    return stats
|
||
|
|
|
||
|
|
|
||
|
|
# ---------- 静态文件托管(生产环境) ----------
|
||
|
|
|
||
|
|
# Serve the built frontend from ./static next to this file; when that
# directory is missing, a local ./frontend/dist build is used instead.
# Nothing is mounted when neither directory exists.
_module_dir = os.path.dirname(__file__)
static_dir = os.path.join(_module_dir, "static")
if not os.path.isdir(static_dir):
    # A local build's frontend/dist can also act as the static file source.
    frontend_dist = os.path.join(_module_dir, "frontend", "dist")
    static_dir = frontend_dist if os.path.isdir(frontend_dist) else static_dir

if os.path.isdir(static_dir):
    # html=True is the StaticFiles option for serving index.html.
    app.mount("/", StaticFiles(directory=static_dir, html=True), name="static")
|
||
|
|
|
||
|
|
|
||
|
|
# Script entry point: serve the app with uvicorn on all interfaces, port 7331.
if __name__ == "__main__":
    # Imported here so uvicorn is only needed when run as a script.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7331)
|