feat: 修复代码审核报告问题
This commit is contained in:
@@ -0,0 +1,426 @@
|
||||
"""dataClean FastAPI 入口"""
|
||||
import logging
|
||||
import os
|
||||
from contextlib import asynccontextmanager
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Optional, List
|
||||
|
||||
from fastapi import FastAPI, Depends, HTTPException, Query, Body, Security, status
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from config import settings
|
||||
from database import init_db, get_db, SessionLocal
|
||||
from scheduler import init_scheduler, stop_scheduler, get_scheduler, get_task_lock
|
||||
from app.taxonomy import bootstrap_taxonomy, list_taxonomy, ensure_taxonomy
|
||||
from app.summarizer import fetch_and_summarize
|
||||
from app.tagger import tag_articles
|
||||
from app.deduplicator import deduplicate_articles
|
||||
from app.scorer import score_articles
|
||||
from app.brief import generate_daily_brief
|
||||
from app.settings_manager import (
|
||||
init_default_settings,
|
||||
list_settings,
|
||||
get_setting,
|
||||
set_setting,
|
||||
reset_settings,
|
||||
apply_db_settings_to_config,
|
||||
)
|
||||
from models import EnrichedArticle, DailyBrief, Taxonomy, DuplicateGroup, AppSetting
|
||||
|
||||
# Configure root logging once at import time. The level name comes from
# settings.LOG_LEVEL; a name the logging module does not define falls back
# to INFO.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=getattr(logging, settings.LOG_LEVEL.upper(), logging.INFO),
)
logger = logging.getLogger(__name__)

# API token authentication (only enforced when a token is configured).
# auto_error=False leaves the decision about missing credentials to
# verify_token instead of the security scheme itself.
security_scheme = HTTPBearer(auto_error=False)
|
||||
|
||||
|
||||
def _get_allowed_origins() -> List[str]:
    """Parse the configured CORS origins into a list.

    Reads ``settings.CORS_ALLOWED_ORIGINS`` as a comma-separated string,
    trims whitespace around each entry and drops empty entries.

    Returns:
        The cleaned origins in their configured order, or an empty list when
        the setting is falsy.
    """
    configured = settings.CORS_ALLOWED_ORIGINS
    if not configured:
        # Original intent: default to same-origin only (Docker/production is
        # expected to be reached through a reverse proxy or same-origin).
        # NOTE(review): the middleware setup in this file passes
        # `_allowed_origins or ["*"]`, so an empty list here ends up as a
        # wildcard rather than same-origin -- confirm which is intended.
        return []

    origins = []
    for piece in configured.split(","):
        origin = piece.strip()
        if origin:
            origins.append(origin)
    return origins
|
||||
|
||||
|
||||
def verify_token(credentials: Optional[HTTPAuthorizationCredentials] = Security(security_scheme)):
    """Validate the bearer token presented to a protected endpoint.

    Authentication is skipped entirely when ``settings.API_TOKEN`` is falsy.

    Args:
        credentials: Parsed ``Authorization`` header, or ``None`` when the
            request carried none.

    Returns:
        ``None`` when authentication is disabled, otherwise the presented
        token string.

    Raises:
        HTTPException: 401 when a token is required but no credentials were
            sent; 403 when the scheme is not Bearer or the token does not
            match.
    """
    # Function-scope import so this block does not depend on a file-level
    # import being added.
    import secrets

    expected = settings.API_TOKEN
    if not expected:
        return None

    if not credentials:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="缺少 Authorization 请求头",
            headers={"WWW-Authenticate": "Bearer"},
        )

    # HTTP auth scheme names are case-insensitive, so "bearer" must be
    # accepted just like "Bearer".
    scheme_ok = credentials.scheme.lower() == "bearer"
    # Constant-time comparison avoids leaking how many leading characters of
    # the token matched. Encoding to bytes keeps compare_digest usable for
    # non-ASCII tokens.
    token_ok = secrets.compare_digest(
        credentials.credentials.encode("utf-8"),
        str(expected).encode("utf-8"),
    )
    if not (scheme_ok and token_ok):
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="无效的 API Token",
        )
    return credentials.credentials
|
||||
|
||||
|
||||
def _run_task_locked(func, db: Session):
    """Run ``func(db)`` while holding the shared task lock.

    The lock is tried without blocking, so a request arriving while another
    task holds it is rejected immediately instead of queueing.

    Args:
        func: Callable invoked with ``db`` as its only argument.
        db: Database session handed through to ``func``.

    Returns:
        Whatever ``func`` returns.

    Raises:
        HTTPException: 409 when the lock is already held.
    """
    # Fetch the lock once so the object released is the one that was acquired.
    lock = get_task_lock()
    if not lock.acquire(blocking=False):
        raise HTTPException(status_code=409, detail="已有任务正在执行,请稍后再试")
    try:
        return func(db)
    finally:
        lock.release()
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage application startup and shutdown.

    Startup initialises the database, seeds default settings, applies the
    stored settings onto the global ``settings`` object, calls
    ``ensure_taxonomy`` and starts the scheduler. Shutdown stops the
    scheduler.

    Args:
        app: The FastAPI application (not used by the body).
    """
    logger.info("启动 dataClean 服务")
    init_db()

    db = SessionLocal()
    try:
        # Seed default settings.
        init_default_settings(db)
        # Override the global settings with values stored in the database.
        apply_db_settings_to_config(db)
        # Make sure the taxonomy is present on first start.
        ensure_taxonomy(db)
    except Exception as exc:
        # Deliberately best-effort: the service still starts when this
        # bootstrap fails. logger.exception keeps the traceback so the
        # failure can be diagnosed.
        logger.exception("启动初始化失败: %s", exc)
    finally:
        db.close()

    init_scheduler()
    try:
        yield
    finally:
        # Stop the scheduler even when an exception or cancellation is
        # thrown into the context manager at the yield point.
        stop_scheduler()
|
||||
|
||||
|
||||
app = FastAPI(
    lifespan=lifespan,
    title="dataClean",
    version="1.0.0",
    description="RSS 数据清洗、摘要、分类、打分与简报生成服务",
)

# CORS: production should be narrowed to concrete origins, and credentials
# are never enabled together with the wildcard origin. With no configured
# origins the middleware falls back to "*" with credentials disabled.
_allowed_origins = _get_allowed_origins()
app.add_middleware(
    CORSMiddleware,
    allow_credentials=bool(_allowed_origins),
    allow_origins=_allowed_origins or ["*"],
    allow_headers=["*"],
    allow_methods=["*"],
)
|
||||
|
||||
|
||||
# ---------- Pydantic 模型 ----------
|
||||
|
||||
class ArticleOut(BaseModel):
    # Response schema for one enriched article.
    # from_attributes=True lets the schema be populated from objects that
    # expose these fields as attributes (endpoints in this file return
    # EnrichedArticle rows).
    # NOTE(review): published_at is declared as Optional[str]; confirm the
    # attribute it is read from is already a string.
    model_config = ConfigDict(from_attributes=True)

    id: int
    rk_article_id: int
    title: str
    link: str
    feed_title: str
    category: str
    tags: List[str]
    heat_score: float
    importance_score: float
    duplication_score: float
    composite_score: float
    ai_summary: str
    is_representative: bool
    published_at: Optional[str]
|
||||
|
||||
|
||||
class ArticleListOut(BaseModel):
    # Paged article listing: `items` is the current page, `total` is the
    # number of rows matching the filters before paging was applied.
    items: List[ArticleOut]
    total: int
|
||||
|
||||
|
||||
class BriefOut(BaseModel):
    # Response schema for a daily brief; populated from attribute-bearing
    # objects (endpoints in this file return DailyBrief rows).
    model_config = ConfigDict(from_attributes=True)

    id: int
    brief_date: str
    total_articles: int
    unique_articles: int
    by_category: dict
    markdown_path: str
|
||||
|
||||
|
||||
class TaxonomyOut(BaseModel):
    # Response schema for one taxonomy entry. Elsewhere in this file `kind`
    # is compared against "category" and "tag".
    model_config = ConfigDict(from_attributes=True)

    id: int
    name: str
    kind: str
    description: str
    keywords: List[str]
    weight: float
    created_by_ai: bool
|
||||
|
||||
|
||||
class SettingOut(BaseModel):
    # Response schema for one configuration entry as returned by the
    # settings listing endpoint (which requests masked sensitive values).
    key: str
    value: str
    description: str
    is_sensitive: bool
    is_masked: bool
    updated_at: Optional[str]
|
||||
|
||||
|
||||
class SettingUpdate(BaseModel):
    # Request body for updating a single setting: the new value as a string.
    value: str
|
||||
|
||||
|
||||
class BatchSettingsUpdate(BaseModel):
    # Request body for a batch update: a mapping of setting key -> new value.
    # NOTE(review): the mapping is untyped, so values are not validated as
    # strings the way SettingUpdate.value is -- confirm set_setting copes.
    settings: dict
|
||||
|
||||
|
||||
class StatsOut(BaseModel):
    # Response schema for the dashboard statistics endpoint.
    # Row counts for articles, taxonomy entries, duplicate groups and briefs.
    total_articles: int
    today_articles: int
    ai_summarized: int
    categories: int
    tags: int
    duplicate_groups: int
    briefs: int
    # Scheduler job id -> ISO-8601 next run time, or None when the job has
    # no next run time.
    next_jobs: dict
|
||||
|
||||
|
||||
# ---------- 健康检查 ----------
|
||||
|
||||
@app.get("/health")
def health():
    # Health probe: returns a fixed payload and touches no other component.
    payload = {"status": "ok", "service": "dataClean"}
    return payload
|
||||
|
||||
|
||||
# ---------- 文章接口 ----------
|
||||
|
||||
@app.get("/api/articles", response_model=ArticleListOut)
def list_articles(
    date: Optional[str] = Query(None, description="日期 YYYY-MM-DD"),
    category: Optional[str] = Query(None),
    tag: Optional[str] = Query(None),
    representative_only: bool = Query(False, description="仅返回重复组代表文章"),
    limit: int = Query(50, ge=1, le=200),
    offset: int = Query(0, ge=0),
    db: Session = Depends(get_db),
):
    # List enriched articles, highest composite score first.
    #
    # Optional filters: fetch date (YYYY-MM-DD, matched against fetched_at),
    # category, tag, and "representatives only". Returns the requested page
    # in `items` plus `total`, the match count before paging.
    # Raises HTTP 400 when `date` is not a usable YYYY-MM-DD value.
    query = db.query(EnrichedArticle)

    if date:
        try:
            day = datetime.strptime(date, "%Y-%m-%d")
            # Half-open window [day, next_day). The addition overflows for
            # 9999-12-31, which is rejected like any other bad date.
            next_day = day + timedelta(days=1)
        except (ValueError, OverflowError) as exc:
            # A malformed date is a client error, not a server failure.
            raise HTTPException(
                status_code=400, detail="日期格式无效,应为 YYYY-MM-DD"
            ) from exc
        query = query.filter(
            EnrichedArticle.fetched_at >= day,
            EnrichedArticle.fetched_at < next_day,
        )

    if category:
        query = query.filter(EnrichedArticle.category == category)

    if tag:
        # NOTE(review): the original comment said this uses SQLite json_each
        # for exact matching, but the code calls .contains([tag]); confirm
        # this matches a single tag inside multi-tag rows on the deployed
        # database.
        query = query.filter(EnrichedArticle.tags.contains([tag]))

    if representative_only:
        # Keep group representatives and articles that are in no duplicate
        # group at all.
        query = query.filter(
            EnrichedArticle.is_representative.is_(True)
            | EnrichedArticle.duplicate_group_id.is_(None)
        )

    total = query.count()
    # The id tiebreaker keeps offset/limit paging stable when several
    # articles share the same composite score.
    page = (
        query.order_by(EnrichedArticle.composite_score.desc(), EnrichedArticle.id)
        .offset(offset)
        .limit(limit)
        .all()
    )
    return {"items": page, "total": total}
|
||||
|
||||
|
||||
@app.get("/api/articles/{article_id}", response_model=ArticleOut)
def get_article(article_id: int, db: Session = Depends(get_db)):
    # Fetch one enriched article by primary key; respond 404 when absent.
    by_id = db.query(EnrichedArticle).filter(EnrichedArticle.id == article_id)
    found = by_id.first()
    if found:
        return found
    raise HTTPException(status_code=404, detail="文章不存在")
|
||||
|
||||
|
||||
# ---------- 简报接口 ----------
|
||||
|
||||
@app.get("/api/briefs", response_model=List[BriefOut])
def list_briefs(
    limit: int = Query(30, ge=1, le=100),
    db: Session = Depends(get_db),
):
    # Return up to `limit` daily briefs, newest brief_date first.
    newest_first = db.query(DailyBrief).order_by(DailyBrief.brief_date.desc())
    return newest_first.limit(limit).all()
|
||||
|
||||
|
||||
@app.get("/api/briefs/{date}", response_model=BriefOut)
def get_brief(date: str, db: Session = Depends(get_db)):
    # Look up the brief whose brief_date equals `date`; respond 404 when
    # there is none.
    matching = db.query(DailyBrief).filter(DailyBrief.brief_date == date)
    found = matching.first()
    if found:
        return found
    raise HTTPException(status_code=404, detail="简报不存在")
|
||||
|
||||
|
||||
@app.post("/api/briefs/{date}/regenerate")
def regenerate_brief(date: str, db: Session = Depends(get_db), _=Depends(verify_token)):
    # Force regeneration of the brief for `date` (token-protected).
    #
    # Runs under the shared task lock, like the /api/tasks/brief endpoint
    # that performs the same generation, so it cannot overlap another task.
    # Responds 409 when a task is already running and 500 (with the error
    # text) when generation fails.
    def _regenerate(session):
        return generate_daily_brief(session, date_str=date, force=True)

    try:
        data = _run_task_locked(_regenerate, db)
    except HTTPException:
        # Let the 409 from the lock helper through instead of masking it as
        # a 500.
        raise
    except Exception as exc:
        # logger.exception records the traceback along with the message.
        logger.exception("重新生成简报失败: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    return {"message": "简报已重新生成", "data": data}
|
||||
|
||||
|
||||
# ---------- 分类体系接口 ----------
|
||||
|
||||
@app.get("/api/taxonomy", response_model=List[TaxonomyOut])
def get_taxonomy(kind: Optional[str] = Query(None), db: Session = Depends(get_db)):
    # Return taxonomy entries, passing the optional `kind` filter through to
    # list_taxonomy.
    entries = list_taxonomy(db, kind=kind)
    return entries
|
||||
|
||||
|
||||
@app.post("/api/taxonomy/bootstrap")
def trigger_taxonomy_bootstrap(
    force: bool = False,
    db: Session = Depends(get_db),
    _=Depends(verify_token),
):
    # Trigger taxonomy bootstrapping (token-protected). The response is a
    # message either way; a falsy result from bootstrap_taxonomy is reported
    # as "already exists or failed" rather than as an HTTP error.
    if bootstrap_taxonomy(db, force=force):
        return {"message": "taxonomy 初始化成功"}
    return {"message": "taxonomy 已存在或初始化失败,请检查日志"}
|
||||
|
||||
|
||||
# ---------- 手动触发任务接口 ----------
|
||||
|
||||
@app.post("/api/tasks/summarize")
def task_summarize(db: Session = Depends(get_db), _=Depends(verify_token)):
    # Manually run the fetch-and-summarize task under the task lock
    # (token-protected) and return the statistics it reports.
    def _summarize(session):
        return fetch_and_summarize(session, hours=24, limit=200)

    stats = _run_task_locked(_summarize, db)
    return {"message": "摘要任务完成", "stats": stats}
|
||||
|
||||
|
||||
@app.post("/api/tasks/tag-score-dedup")
def task_tag_score_dedup(db: Session = Depends(get_db), _=Depends(verify_token)):
    # Manually run tagging, de-duplication and scoring, in that order, under
    # the task lock (token-protected).
    def _pipeline(session):
        tag_articles(session)
        # The UTC date is taken after tagging finishes, as in the original
        # statement order.
        utc_today = datetime.now(timezone.utc).date().isoformat()
        deduplicate_articles(session, date_str=utc_today)
        score_articles(session, update_duplication=True)

    _run_task_locked(_pipeline, db)
    return {"message": "分类/去重/打分任务完成"}
|
||||
|
||||
|
||||
@app.post("/api/tasks/brief")
def task_brief(db: Session = Depends(get_db), _=Depends(verify_token)):
    # Manually (re)generate the brief for the current UTC date under the
    # task lock (token-protected) and return what the generator reports.
    def _generate(session):
        utc_today = datetime.now(timezone.utc).date().isoformat()
        return generate_daily_brief(session, date_str=utc_today, force=True)

    result = _run_task_locked(_generate, db)
    return {"message": "简报生成任务完成", "data": result}
|
||||
|
||||
|
||||
# ---------- 配置管理接口 ----------
|
||||
|
||||
@app.get("/api/settings", response_model=List[SettingOut])
def get_settings(db: Session = Depends(get_db), _=Depends(verify_token)):
    # List configuration entries (token-protected), asking list_settings to
    # mask sensitive values.
    entries = list_settings(db, mask_sensitive=True)
    return entries
|
||||
|
||||
|
||||
@app.put("/api/settings/{key}")
def update_setting(
    key: str,
    body: SettingUpdate,
    db: Session = Depends(get_db),
    _=Depends(verify_token),
):
    # Store a new value for one setting (token-protected). A falsy result
    # from set_setting is reported as HTTP 400.
    if set_setting(db, key, body.value):
        return {"message": "配置已保存,重启服务后生效"}
    raise HTTPException(status_code=400, detail="无效的配置项")
|
||||
|
||||
|
||||
@app.put("/api/settings")
def update_settings_batch(
    body: BatchSettingsUpdate,
    db: Session = Depends(get_db),
    _=Depends(verify_token),
):
    # Store several settings in one request (token-protected).
    # Every entry is attempted, in mapping order; the keys set_setting
    # rejected are collected and reported together as HTTP 400.
    # NOTE(review): entries accepted before a rejected one are not undone
    # here -- confirm whether partial application is acceptable.
    rejected = [
        name
        for name, new_value in body.settings.items()
        if not set_setting(db, name, new_value)
    ]
    if not rejected:
        return {"message": "配置已保存,重启服务后生效"}
    raise HTTPException(status_code=400, detail=f"以下配置项无效: {', '.join(rejected)}")
|
||||
|
||||
|
||||
@app.post("/api/settings/reset")
def reset_all_settings(db: Session = Depends(get_db), _=Depends(verify_token)):
    # Reset all settings via reset_settings (token-protected) and confirm
    # with a fixed message.
    reset_settings(db)
    confirmation = {"message": "配置已重置为环境变量默认值,重启服务后生效"}
    return confirmation
|
||||
|
||||
|
||||
# ---------- 仪表盘统计接口 ----------
|
||||
|
||||
@app.get("/api/stats", response_model=StatsOut)
def get_stats(db: Session = Depends(get_db)):
    # Collect dashboard counters plus the next run time of each scheduler job.
    #
    # "Today" is the current UTC calendar day, expressed as a naive datetime
    # window [midnight, next midnight) for comparison with fetched_at.
    window_start = datetime.now(timezone.utc).replace(
        hour=0, minute=0, second=0, microsecond=0, tzinfo=None
    )
    window_end = window_start + timedelta(days=1)

    articles = db.query(EnrichedArticle)
    total_articles = articles.count()
    today_articles = (
        db.query(EnrichedArticle)
        .filter(
            EnrichedArticle.fetched_at >= window_start,
            EnrichedArticle.fetched_at < window_end,
        )
        .count()
    )
    # Articles whose AI summary is not the empty string.
    ai_summarized = db.query(EnrichedArticle).filter(EnrichedArticle.ai_summary != "").count()
    categories = db.query(Taxonomy).filter(Taxonomy.kind == "category").count()
    tags = db.query(Taxonomy).filter(Taxonomy.kind == "tag").count()
    duplicate_groups = db.query(DuplicateGroup).count()
    briefs = db.query(DailyBrief).count()

    # Job id -> ISO timestamp of the next run, or None when the job has no
    # next run time.
    next_jobs = {
        job.id: (job.next_run_time.isoformat() if job.next_run_time else None)
        for job in get_scheduler().get_jobs()
    }

    return {
        "total_articles": total_articles,
        "today_articles": today_articles,
        "ai_summarized": ai_summarized,
        "categories": categories,
        "tags": tags,
        "duplicate_groups": duplicate_groups,
        "briefs": briefs,
        "next_jobs": next_jobs,
    }
|
||||
|
||||
|
||||
# ---------- 静态文件托管(生产环境) ----------
|
||||
|
||||
# Static file hosting (production): prefer ./static next to this module.
static_dir = os.path.join(os.path.dirname(__file__), "static")
if not os.path.isdir(static_dir):
    # A local build's frontend/dist can serve as the static source instead.
    frontend_dist = os.path.join(os.path.dirname(__file__), "frontend", "dist")
    if os.path.isdir(frontend_dist):
        static_dir = frontend_dist

# Only mount when one of the directories actually exists.
# NOTE(review): the mount at "/" is added after the API routes above are
# registered -- presumably so those routes are matched first; confirm.
if os.path.isdir(static_dir):
    app.mount("/", StaticFiles(directory=static_dir, html=True), name="static")


if __name__ == "__main__":
    # Direct execution: serve the app with uvicorn on all interfaces.
    import uvicorn

    uvicorn.run(app, port=7331, host="0.0.0.0")
|
||||
Reference in New Issue
Block a user