feat: init rssKeeper - RSS 抓取、管理与检索系统

完整功能包括: - FastAPI 后端 + SQLite + FTS5 全文搜索 - RSS 源管理、自动发现、OPML 导入导出 - 文章抓取、去重、分类、全文检索 - RSS 源健康度监控 - Vue 3 + Element Plus 暗色主题 Web UI - 对外 REST API 供 AI 分析调用 - Docker + docker-compose 部署
2026-06-11 14:03:36 +08:00
commit 54e7db0ef0
28 changed files with 2915 additions and 0 deletions
@@ -0,0 +1,26 @@
+"""配置管理 - 环境变量 + 默认值"""
+import os
+from pathlib import Path
+
+# 项目根目录
+BASE_DIR = Path(__file__).parent
+DATA_DIR = Path(os.getenv("DATA_DIR", "/app/data"))
+DATA_DIR.mkdir(parents=True, exist_ok=True)
+
+# 数据库
+DATABASE_URL = os.getenv("DATABASE_URL", str(DATA_DIR / "rsskeeper.db"))
+
+# RSS 抓取配置
+FETCH_CONCURRENCY = int(os.getenv("FETCH_CONCURRENCY", "10"))
+FETCH_TIMEOUT = int(os.getenv("FETCH_TIMEOUT", "30"))
+DEFAULT_FETCH_INTERVAL = int(os.getenv("DEFAULT_FETCH_INTERVAL", "60"))  # 分钟
+MIN_FETCH_INTERVAL = int(os.getenv("MIN_FETCH_INTERVAL", "15"))  # 最小间隔15分钟
+
+# 内容处理
+MAX_ARTICLE_CONTENT_LENGTH = int(os.getenv("MAX_ARTICLE_CONTENT_LENGTH", "50000"))
+MAX_SUMMARY_LENGTH = int(os.getenv("MAX_SUMMARY_LENGTH", "500"))
+ARTICLE_RETENTION_DAYS = int(os.getenv("ARTICLE_RETENTION_DAYS", "0"))  # 0 = 永久保留
+
+# API 配置
+API_PREFIX = "/api"
+EXTERNAL_API_PREFIX = "/api/v1/external"
@@ -0,0 +1,89 @@
+"""数据库连接与初始化"""
+from sqlalchemy import create_engine, event
+from sqlalchemy.orm import sessionmaker, declarative_base
+from config import DATABASE_URL
+
+# SQLite 连接
+engine = create_engine(
+    f"sqlite:///{DATABASE_URL}",
+    connect_args={"check_same_thread": False},
+    echo=False,
+)
+
+# 启用 SQLite 外键约束
+@event.listens_for(engine, "connect")
+def set_sqlite_pragma(dbapi_conn, connection_record):
+    cursor = dbapi_conn.cursor()
+    cursor.execute("PRAGMA foreign_keys=ON")
+    cursor.close()
+
+SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+Base = declarative_base()
+
+
+def get_db():
+    """FastAPI 依赖注入用"""
+    db = SessionLocal()
+    try:
+        yield db
+    finally:
+        db.close()
+
+
+def init_db():
+    """初始化数据库表"""
+    from models import Feed, Article, FetchLog  # noqa
+
+    Base.metadata.create_all(bind=engine)
+    init_fts5()
+
+
+def init_fts5():
+    """初始化 FTS5 全文搜索虚拟表"""
+    conn = engine.raw_connection()
+    cursor = conn.cursor()
+
+    # 检查 FTS5 扩展是否可用
+    try:
+        cursor.execute("SELECT sqlite_compileoption_used('ENABLE_FTS5')")
+        has_fts5 = cursor.fetchone()[0]
+        if not has_fts5:
+            print("警告: SQLite 未启用 FTS5 扩展，全文搜索将不可用")
+            return
+    except Exception:
+        pass
+
+    # 创建 FTS5 虚拟表
+    cursor.execute("""
+        CREATE VIRTUAL TABLE IF NOT EXISTS articles_fts USING fts5(
+            title, content,
+            content='articles',
+            content_rowid='id'
+        )
+    """)
+
+    # 创建触发器，自动同步 articles 表到 FTS5
+    cursor.execute("""
+        CREATE TRIGGER IF NOT EXISTS articles_fts_insert AFTER INSERT ON articles BEGIN
+            INSERT INTO articles_fts(rowid, title, content)
+            VALUES (new.id, new.title, new.content);
+        END
+    """)
+    cursor.execute("""
+        CREATE TRIGGER IF NOT EXISTS articles_fts_delete AFTER DELETE ON articles BEGIN
+            INSERT INTO articles_fts(articles_fts, rowid, title, content)
+            VALUES ('delete', old.id, old.title, old.content);
+        END
+    """)
+    cursor.execute("""
+        CREATE TRIGGER IF NOT EXISTS articles_fts_update AFTER UPDATE ON articles BEGIN
+            INSERT INTO articles_fts(articles_fts, rowid, title, content)
+            VALUES ('delete', old.id, old.title, old.content);
+            INSERT INTO articles_fts(rowid, title, content)
+            VALUES (new.id, new.title, new.content);
+        END
+    """)
+
+    conn.commit()
+    cursor.close()
+    conn.close()
@@ -0,0 +1,81 @@
+"""SQLite FTS5 全文搜索封装"""
+from sqlalchemy import text
+from database import engine
+
+
+def search_articles(query: str, limit: int = 50, offset: int = 0):
+    """全文搜索文章
+    返回 [(article_id, title, content_snippet, rank), ...]
+    """
+    if not query or not query.strip():
+        return [], 0
+
+    # 转义 FTS5 特殊字符
+    query = query.replace('"', '""').strip()
+
+    conn = engine.raw_connection()
+    cursor = conn.cursor()
+
+    try:
+        # 使用 FTS5 查询
+        sql = """
+            SELECT a.id, a.title, a.summary, a.link, a.published_at, a.created_at,
+                   f.id as feed_id, f.title as feed_title, f.category,
+                   rank
+            FROM articles_fts
+            JOIN articles a ON articles_fts.rowid = a.id
+            JOIN feeds f ON a.feed_id = f.id
+            WHERE articles_fts MATCH ?
+            ORDER BY rank
+            LIMIT ? OFFSET ?
+        """
+        cursor.execute(sql, (query, limit, offset))
+        rows = cursor.fetchall()
+
+        # 获取总数
+        count_sql = """
+            SELECT COUNT(*) FROM articles_fts WHERE articles_fts MATCH ?
+        """
+        cursor.execute(count_sql, (query,))
+        total = cursor.fetchone()[0]
+
+        results = []
+        for row in rows:
+            results.append({
+                "id": row[0],
+                "title": row[1],
+                "summary": row[2],
+                "link": row[3],
+                "published_at": row[4],
+                "created_at": row[5],
+                "feed_id": row[6],
+                "feed_title": row[7],
+                "category": row[8],
+            })
+
+        return results, total
+    except Exception as e:
+        # FTS5 查询失败时返回空结果
+        return [], 0
+    finally:
+        cursor.close()
+        conn.close()
+
+
+def rebuild_fts_index():
+    """重建 FTS5 索引（数据不一致时使用）"""
+    conn = engine.raw_connection()
+    cursor = conn.cursor()
+    try:
+        cursor.execute("DELETE FROM articles_fts")
+        cursor.execute("""
+            INSERT INTO articles_fts(rowid, title, content)
+            SELECT id, title, content FROM articles
+        """)
+        conn.commit()
+        return True
+    except Exception:
+        return False
+    finally:
+        cursor.close()
+        conn.close()
@@ -0,0 +1,112 @@
+"""RSS 源健康度检测"""
+from datetime import datetime, timedelta
+from typing import List, Dict
+from sqlalchemy.orm import Session
+from models import Feed, FetchLog
+
+
+def get_feed_health(db: Session, feed_id: int = None) -> List[Dict]:
+    """获取 RSS 源健康度信息
+    返回每个源的健康状态详情
+    """
+    query = db.query(Feed)
+    if feed_id:
+        query = query.filter(Feed.id == feed_id)
+
+    feeds = query.all()
+    results = []
+
+    for feed in feeds:
+        total = feed.success_count + feed.fail_count
+        success_rate = round(feed.success_count / total * 100, 1) if total > 0 else 0
+
+        days_since_fetch = None
+        if feed.last_fetch_at:
+            days_since_fetch = (datetime.utcnow() - feed.last_fetch_at).days
+
+        # 获取最近 7 天抓取记录
+        recent_logs = db.query(FetchLog).filter(
+            FetchLog.feed_id == feed.id,
+            FetchLog.created_at >= datetime.utcnow() - timedelta(days=7)
+        ).order_by(FetchLog.created_at.desc()).limit(10).all()
+
+        health = feed.health_status()
+
+        results.append({
+            "id": feed.id,
+            "title": feed.title or feed.url,
+            "url": feed.url,
+            "is_active": feed.is_active,
+            "health_status": health,
+            "health_label": _health_label(health),
+            "success_rate": success_rate,
+            "success_count": feed.success_count,
+            "fail_count": feed.fail_count,
+            "total_fetches": total,
+            "last_fetch_at": feed.last_fetch_at.isoformat() if feed.last_fetch_at else None,
+            "days_since_fetch": days_since_fetch,
+            "article_count": feed.article_count,
+            "last_error": feed.last_error,
+            "recent_logs": [
+                {
+                    "status": log.status,
+                    "articles_fetched": log.articles_fetched,
+                    "response_time_ms": log.response_time_ms,
+                    "created_at": log.created_at.isoformat(),
+                    "error_message": log.error_message if log.status == "fail" else None,
+                }
+                for log in recent_logs
+            ],
+        })
+
+    return results
+
+
+def _health_label(status: str) -> str:
+    labels = {
+        "healthy": "健康",
+        "warning": "警告",
+        "unhealthy": "异常",
+        "unknown": "未知",
+    }
+    return labels.get(status, "未知")
+
+
+def get_overall_stats(db: Session) -> Dict:
+    """获取整体统计信息"""
+    total_feeds = db.query(Feed).count()
+    active_feeds = db.query(Feed).filter(Feed.is_active == True).count()
+    total_articles = db.query(Feed).with_entities(Feed.article_count).all()
+    total_articles_count = sum(a[0] for a in total_articles) if total_articles else 0
+
+    # 健康源统计
+    feeds = db.query(Feed).all()
+    healthy = warning = unhealthy = 0
+    for feed in feeds:
+        status = feed.health_status()
+        if status == "healthy":
+            healthy += 1
+        elif status == "warning":
+            warning += 1
+        elif status == "unhealthy":
+            unhealthy += 1
+
+    # 今日抓取
+    today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
+    from models import FetchLog
+    today_fetches = db.query(FetchLog).filter(FetchLog.created_at >= today).count()
+    today_success = db.query(FetchLog).filter(
+        FetchLog.created_at >= today, FetchLog.status == "success"
+    ).count()
+
+    return {
+        "total_feeds": total_feeds,
+        "active_feeds": active_feeds,
+        "total_articles": total_articles_count,
+        "healthy_feeds": healthy,
+        "warning_feeds": warning,
+        "unhealthy_feeds": unhealthy,
+        "today_fetches": today_fetches,
+        "today_success": today_success,
+        "today_success_rate": round(today_success / today_fetches * 100, 1) if today_fetches > 0 else 0,
+    }
@@ -0,0 +1,75 @@
+"""rssKeeper - FastAPI 入口"""
+import os
+from contextlib import asynccontextmanager
+from fastapi import FastAPI
+from fastapi.staticfiles import StaticFiles
+from fastapi.responses import FileResponse
+from starlette.middleware.cors import CORSMiddleware
+from database import init_db, SessionLocal
+from scheduler import init_feed_jobs, stop_scheduler
+from routers import feeds, articles, dashboard, external_api
+import config
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """应用生命周期管理"""
+    # 启动时：初始化数据库 + 注册定时任务
+    init_db()
+    db = SessionLocal()
+    try:
+        init_feed_jobs(db)
+    finally:
+        db.close()
+
+    yield
+
+    # 关闭时：停止调度器
+    stop_scheduler()
+
+
+app = FastAPI(
+    title="rssKeeper",
+    description="RSS 抓取、管理与检索系统",
+    version="1.0.0",
+    lifespan=lifespan,
+)
+
+# CORS
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# API 路由
+app.include_router(feeds.router, prefix=config.API_PREFIX)
+app.include_router(articles.router, prefix=config.API_PREFIX)
+app.include_router(dashboard.router, prefix=config.API_PREFIX)
+app.include_router(external_api.router, prefix=config.EXTERNAL_API_PREFIX)
+
+
+@app.get("/api/health")
+def health_check():
+    """健康检查"""
+    return {"status": "ok", "service": "rssKeeper"}
+
+
+# 静态文件服务（前端构建产物）
+static_dir = os.path.join(config.BASE_DIR, "static")
+if os.path.exists(static_dir):
+    app.mount("/static", StaticFiles(directory=static_dir), name="static")
+
+    @app.get("/{full_path:path}")
+    async def serve_spa(full_path: str):
+        """Vue SPA 路由回退"""
+        # API 路由不走这里
+        if full_path.startswith("api/") or full_path.startswith("docs") or full_path.startswith("openapi.json"):
+            return {"detail": "Not found"}
+
+        index_path = os.path.join(static_dir, "index.html")
+        if os.path.exists(index_path):
+            return FileResponse(index_path)
+        return {"detail": "Frontend not built"}
@@ -0,0 +1,90 @@
+"""SQLAlchemy 数据模型"""
+from datetime import datetime
+from sqlalchemy import Column, Integer, String, Text, Boolean, DateTime, ForeignKey
+from sqlalchemy.orm import relationship
+from database import Base
+
+
+class Feed(Base):
+    """RSS 源"""
+    __tablename__ = "feeds"
+
+    id = Column(Integer, primary_key=True, index=True)
+    url = Column(String(2048), unique=True, nullable=False, index=True)
+    title = Column(String(512), default="")
+    description = Column(Text, default="")
+    category = Column(String(128), default="")
+    is_active = Column(Boolean, default=True, index=True)
+    fetch_interval_minutes = Column(Integer, default=60)
+
+    # 抓取统计
+    last_fetch_at = Column(DateTime, nullable=True)
+    last_fetch_status = Column(String(20), default="")
+    last_error = Column(Text, default="")
+    success_count = Column(Integer, default=0)
+    fail_count = Column(Integer, default=0)
+    article_count = Column(Integer, default=0)
+
+    created_at = Column(DateTime, default=datetime.utcnow)
+
+    # 关联
+    articles = relationship("Article", back_populates="feed", cascade="all, delete-orphan")
+    fetch_logs = relationship("FetchLog", back_populates="feed", cascade="all, delete-orphan")
+
+    def health_status(self):
+        """计算健康度
+        🟢 健康: 成功率 >= 90%, 最近7天有更新
+        🟡 警告: 成功率 50%-90%, 或超过3天未更新
+        🔴 异常: 成功率 < 50%, 或超过7天未更新
+        """
+        total = self.success_count + self.fail_count
+        if total == 0:
+            return "unknown"
+
+        success_rate = self.success_count / total
+
+        days_since_last_fetch = None
+        if self.last_fetch_at:
+            days_since_last_fetch = (datetime.utcnow() - self.last_fetch_at).days
+
+        if success_rate >= 0.9 and (days_since_last_fetch is None or days_since_last_fetch <= 7):
+            return "healthy"
+        elif success_rate >= 0.5 and (days_since_last_fetch is None or days_since_last_fetch <= 7):
+            return "warning"
+        else:
+            return "unhealthy"
+
+
+class Article(Base):
+    """RSS 文章"""
+    __tablename__ = "articles"
+
+    id = Column(Integer, primary_key=True, index=True)
+    feed_id = Column(Integer, ForeignKey("feeds.id", ondelete="CASCADE"), nullable=False, index=True)
+    title = Column(String(1024), default="", index=True)
+    link = Column(String(2048), unique=True, nullable=False, index=True)
+    author = Column(String(256), default="")
+    published_at = Column(DateTime, nullable=True, index=True)
+    content = Column(Text, default="")
+    summary = Column(Text, default="")
+    is_read = Column(Boolean, default=False)
+    created_at = Column(DateTime, default=datetime.utcnow, index=True)
+
+    # 关联
+    feed = relationship("Feed", back_populates="articles")
+
+
+class FetchLog(Base):
+    """抓取日志"""
+    __tablename__ = "fetch_logs"
+
+    id = Column(Integer, primary_key=True, index=True)
+    feed_id = Column(Integer, ForeignKey("feeds.id", ondelete="CASCADE"), nullable=False, index=True)
+    status = Column(String(20), nullable=False)  # success / fail
+    articles_fetched = Column(Integer, default=0)
+    error_message = Column(Text, default="")
+    response_time_ms = Column(Integer, nullable=True)
+    created_at = Column(DateTime, default=datetime.utcnow, index=True)
+
+    # 关联
+    feed = relationship("Feed", back_populates="fetch_logs")
@@ -0,0 +1,9 @@
+fastapi>=0.110.0
+uvicorn[standard]>=0.29.0
+sqlalchemy>=2.0.0
+pydantic>=2.6.0
+feedparser>=6.0.11
+requests>=2.31.0
+beautifulsoup4>=4.12.0
+apscheduler>=3.10.4
+lxml>=5.1.0
@@ -0,0 +1,133 @@
+"""文章管理 API"""
+from typing import Optional
+from fastapi import APIRouter, Depends
+from pydantic import BaseModel
+from sqlalchemy.orm import Session
+from sqlalchemy import desc
+from database import get_db
+from models import Article, Feed
+from fulltext_search import search_articles
+
+router = APIRouter(prefix="/articles", tags=["articles"])
+
+
+class ArticleOut(BaseModel):
+    id: int
+    feed_id: int
+    title: str
+    link: str
+    author: str
+    published_at: Optional[str]
+    summary: str
+    is_read: bool
+    created_at: str
+    feed_title: str
+    category: str
+
+    class Config:
+        from_attributes = True
+
+
+@router.get("")
+def list_articles(
+    skip: int = 0,
+    limit: int = 50,
+    feed_id: Optional[int] = None,
+    category: Optional[str] = None,
+    search: Optional[str] = None,
+    since: Optional[str] = None,
+    until: Optional[str] = None,
+    is_read: Optional[bool] = None,
+    db: Session = Depends(get_db),
+):
+    """获取文章列表，支持多种筛选条件"""
+
+    # 如果有搜索关键词，使用 FTS5 全文搜索
+    if search and search.strip():
+        results, total = search_articles(search.strip(), limit=limit, offset=skip)
+        return {"total": total, "items": results}
+
+    query = db.query(Article, Feed.title.label("feed_title"), Feed.category.label("category")).join(Feed)
+
+    if feed_id:
+        query = query.filter(Article.feed_id == feed_id)
+    if category:
+        query = query.filter(Feed.category == category)
+    if is_read is not None:
+        query = query.filter(Article.is_read == is_read)
+    if since:
+        query = query.filter(Article.published_at >= since)
+    if until:
+        query = query.filter(Article.published_at <= until)
+
+    total = query.count()
+    rows = query.order_by(desc(Article.published_at)).offset(skip).limit(limit).all()
+
+    items = []
+    for article, feed_title, category in rows:
+        items.append({
+            "id": article.id,
+            "feed_id": article.feed_id,
+            "title": article.title or "",
+            "link": article.link,
+            "author": article.author or "",
+            "published_at": article.published_at.isoformat() if article.published_at else None,
+            "summary": article.summary or "",
+            "is_read": article.is_read,
+            "created_at": article.created_at.isoformat(),
+            "feed_title": feed_title or "",
+            "category": category or "",
+        })
+
+    return {"total": total, "items": items}
+
+
+@router.get("/{article_id}")
+def get_article(article_id: int, db: Session = Depends(get_db)):
+    """获取文章详情"""
+    article = db.query(Article).filter(Article.id == article_id).first()
+    if not article:
+        raise HTTPException(status_code=404, detail="文章不存在")
+
+    feed = db.query(Feed).filter(Feed.id == article.feed_id).first()
+
+    return {
+        "id": article.id,
+        "feed_id": article.feed_id,
+        "title": article.title or "",
+        "link": article.link,
+        "author": article.author or "",
+        "published_at": article.published_at.isoformat() if article.published_at else None,
+        "content": article.content or "",
+        "summary": article.summary or "",
+        "is_read": article.is_read,
+        "created_at": article.created_at.isoformat(),
+        "feed_title": feed.title if feed else "",
+        "category": feed.category if feed else "",
+    }
+
+
+@router.put("/{article_id}/read")
+def mark_read(article_id: int, db: Session = Depends(get_db)):
+    """标记文章为已读"""
+    article = db.query(Article).filter(Article.id == article_id).first()
+    if not article:
+        raise HTTPException(status_code=404, detail="文章不存在")
+
+    article.is_read = True
+    db.commit()
+    return {"message": "已标记为已读"}
+
+
+@router.get("/search/fulltext")
+def fulltext_search(
+    q: str,
+    skip: int = 0,
+    limit: int = 50,
+):
+    """全文搜索文章"""
+    results, total = search_articles(q, limit=limit, offset=skip)
+    return {"total": total, "items": results}
+
+
+from fastapi import HTTPException
@@ -0,0 +1,58 @@
+"""仪表盘统计 API"""
+from fastapi import APIRouter, Depends
+from sqlalchemy.orm import Session
+from database import get_db
+from health_checker import get_overall_stats, get_feed_health
+
+router = APIRouter(prefix="/dashboard", tags=["dashboard"])
+
+
+@router.get("/stats")
+def dashboard_stats(db: Session = Depends(get_db)):
+    """仪表盘统计数据"""
+    return get_overall_stats(db)
+
+
+@router.get("/health")
+def dashboard_health(
+    skip: int = 0,
+    limit: int = 100,
+    db: Session = Depends(get_db),
+):
+    """RSS 源健康度列表"""
+    all_health = get_feed_health(db)
+    total = len(all_health)
+
+    # 按健康状态排序：异常在前
+    status_order = {"unhealthy": 0, "warning": 1, "unknown": 2, "healthy": 3}
+    all_health.sort(key=lambda x: status_order.get(x["health_status"], 2))
+
+    items = all_health[skip:skip + limit]
+    return {"total": total, "items": items}
+
+
+@router.get("/recent-activity")
+def recent_activity(limit: int = 20, db: Session = Depends(get_db)):
+    """最近的抓取活动"""
+    from models import FetchLog, Feed
+    from sqlalchemy import desc
+
+    logs = db.query(FetchLog, Feed.title.label("feed_title")).join(Feed).order_by(
+        desc(FetchLog.created_at)
+    ).limit(limit).all()
+
+    return {
+        "items": [
+            {
+                "id": log.id,
+                "feed_id": log.feed_id,
+                "feed_title": feed_title or "",
+                "status": log.status,
+                "articles_fetched": log.articles_fetched,
+                "response_time_ms": log.response_time_ms,
+                "error_message": log.error_message,
+                "created_at": log.created_at.isoformat(),
+            }
+            for log, feed_title in logs
+        ]
+    }
@@ -0,0 +1,163 @@
+"""对外 API（供 AI/外部系统调用）"""
+from typing import Optional
+from datetime import datetime, timedelta
+from fastapi import APIRouter, Depends
+from sqlalchemy.orm import Session
+from sqlalchemy import desc
+from database import get_db
+from models import Article, Feed
+
+router = APIRouter(prefix="/external", tags=["external"])
+
+
+@router.get("/recent")
+def get_recent_articles(
+    hours: int = 24,
+    limit: int = 50,
+    feed_id: Optional[int] = None,
+    category: Optional[str] = None,
+    db: Session = Depends(get_db),
+):
+    """获取最近 N 小时的文章
+    这是对外提供给 AI 分析的主要接口
+    """
+    since = datetime.utcnow() - timedelta(hours=hours)
+
+    query = db.query(Article, Feed.title.label("feed_title"), Feed.category.label("category")).join(Feed)
+
+    query = query.filter(Article.created_at >= since)
+
+    if feed_id:
+        query = query.filter(Article.feed_id == feed_id)
+    if category:
+        query = query.filter(Feed.category == category)
+
+    rows = query.order_by(desc(Article.published_at)).limit(limit).all()
+
+    return {
+        "query": {
+            "hours": hours,
+            "limit": limit,
+            "feed_id": feed_id,
+            "category": category,
+        },
+        "count": len(rows),
+        "articles": [
+            {
+                "id": article.id,
+                "title": article.title or "",
+                "link": article.link,
+                "author": article.author or "",
+                "summary": article.summary or "",
+                "content": article.content or "" if len(article.content or "") < 10000 else article.summary or "",
+                "published_at": article.published_at.isoformat() if article.published_at else None,
+                "created_at": article.created_at.isoformat(),
+                "feed_title": feed_title or "",
+                "category": category or "",
+            }
+            for article, feed_title, category in rows
+        ],
+    }
+
+
+@router.get("/feeds")
+def get_active_feeds(db: Session = Depends(get_db)):
+    """获取所有活跃的 RSS 源列表"""
+    feeds = db.query(Feed).filter(Feed.is_active == True).all()
+
+    return {
+        "count": len(feeds),
+        "feeds": [
+            {
+                "id": feed.id,
+                "title": feed.title or feed.url,
+                "url": feed.url,
+                "category": feed.category or "",
+                "article_count": feed.article_count,
+                "last_fetch_at": feed.last_fetch_at.isoformat() if feed.last_fetch_at else None,
+            }
+            for feed in feeds
+        ],
+    }
+
+
+@router.get("/feeds/{feed_id}/articles")
+def get_feed_articles(
+    feed_id: int,
+    limit: int = 100,
+    since: Optional[str] = None,
+    db: Session = Depends(get_db),
+):
+    """获取指定 RSS 源的文章"""
+    feed = db.query(Feed).filter(Feed.id == feed_id).first()
+    if not feed:
+        return {"error": "Feed not found"}
+
+    query = db.query(Article).filter(Article.feed_id == feed_id)
+
+    if since:
+        query = query.filter(Article.published_at >= since)
+
+    articles = query.order_by(desc(Article.published_at)).limit(limit).all()
+
+    return {
+        "feed": {
+            "id": feed.id,
+            "title": feed.title or feed.url,
+            "url": feed.url,
+        },
+        "count": len(articles),
+        "articles": [
+            {
+                "id": article.id,
+                "title": article.title or "",
+                "link": article.link,
+                "author": article.author or "",
+                "summary": article.summary or "",
+                "published_at": article.published_at.isoformat() if article.published_at else None,
+            }
+            for article in articles
+        ],
+    }
+
+
+@router.get("/summary")
+def get_daily_summary(
+    date: Optional[str] = None,
+    db: Session = Depends(get_db),
+):
+    """获取指定日期的文章摘要统计
+    供 AI 快速了解某天的 RSS 内容概况
+    """
+    if date:
+        try:
+            day = datetime.strptime(date, "%Y-%m-%d")
+            next_day = day + timedelta(days=1)
+        except ValueError:
+            return {"error": "Invalid date format, use YYYY-MM-DD"}
+    else:
+        day = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
+        next_day = day + timedelta(days=1)
+
+    query = db.query(Article, Feed.title.label("feed_title"), Feed.category.label("category")).join(Feed)
+    query = query.filter(Article.created_at >= day, Article.created_at < next_day)
+    rows = query.order_by(desc(Article.published_at)).all()
+
+    # 按分类统计
+    by_category = {}
+    for article, feed_title, category in rows:
+        cat = category or "未分类"
+        if cat not in by_category:
+            by_category[cat] = []
+        by_category[cat].append({
+            "title": article.title or "",
+            "link": article.link,
+            "feed": feed_title or "",
+            "summary": article.summary or "",
+        })
+
+    return {
+        "date": day.strftime("%Y-%m-%d"),
+        "total_articles": len(rows),
+        "by_category": by_category,
+    }
@@ -0,0 +1,273 @@
+"""RSS 源管理 API"""
+from typing import List, Optional
+from fastapi import APIRouter, Depends, HTTPException
+from pydantic import BaseModel, HttpUrl
+from sqlalchemy.orm import Session
+from database import get_db
+from models import Feed
+from rss_fetcher import discover_feed_url, fetch_and_store_feed
+from scheduler import add_feed_job, remove_feed_job
+
+router = APIRouter(prefix="/feeds", tags=["feeds"])
+
+
+class FeedCreate(BaseModel):
+    url: str
+    title: Optional[str] = ""
+    description: Optional[str] = ""
+    category: Optional[str] = ""
+    is_active: Optional[bool] = True
+    fetch_interval_minutes: Optional[int] = 60
+
+
+class FeedUpdate(BaseModel):
+    title: Optional[str] = None
+    description: Optional[str] = None
+    category: Optional[str] = None
+    is_active: Optional[bool] = None
+    fetch_interval_minutes: Optional[int] = None
+
+
+class FeedOut(BaseModel):
+    id: int
+    url: str
+    title: str
+    description: str
+    category: str
+    is_active: bool
+    fetch_interval_minutes: int
+    last_fetch_at: Optional[str] = None
+    last_fetch_status: str
+    success_count: int
+    fail_count: int
+    article_count: int
+    health_status: str
+    created_at: str
+
+    class Config:
+        from_attributes = True
+
+
+@router.get("", response_model=dict)
+def list_feeds(
+    skip: int = 0,
+    limit: int = 100,
+    category: Optional[str] = None,
+    search: Optional[str] = None,
+    is_active: Optional[bool] = None,
+    db: Session = Depends(get_db),
+):
+    """获取 RSS 源列表，支持分页、分类筛选、搜索"""
+    query = db.query(Feed)
+
+    if category:
+        query = query.filter(Feed.category == category)
+    if is_active is not None:
+        query = query.filter(Feed.is_active == is_active)
+    if search:
+        query = query.filter(
+            Feed.title.contains(search) | Feed.url.contains(search) | Feed.description.contains(search)
+        )
+
+    total = query.count()
+    feeds = query.order_by(Feed.created_at.desc()).offset(skip).limit(limit).all()
+
+    results = []
+    for feed in feeds:
+        data = {
+            "id": feed.id,
+            "url": feed.url,
+            "title": feed.title or feed.url,
+            "description": feed.description or "",
+            "category": feed.category or "",
+            "is_active": feed.is_active,
+            "fetch_interval_minutes": feed.fetch_interval_minutes,
+            "last_fetch_at": feed.last_fetch_at.isoformat() if feed.last_fetch_at else None,
+            "last_fetch_status": feed.last_fetch_status,
+            "success_count": feed.success_count,
+            "fail_count": feed.fail_count,
+            "article_count": feed.article_count,
+            "health_status": feed.health_status(),
+            "created_at": feed.created_at.isoformat(),
+        }
+        results.append(data)
+
+    return {"total": total, "items": results}
+
+
+@router.get("/categories")
+def list_categories(db: Session = Depends(get_db)):
+    """获取所有分类列表"""
+    categories = db.query(Feed.category).filter(Feed.category != "").distinct().all()
+    return [c[0] for c in categories if c[0]]
+
+
+@router.post("", response_model=dict)
+def create_feed(data: FeedCreate, db: Session = Depends(get_db)):
+    """添加 RSS 源"""
+    # 检查是否已存在
+    existing = db.query(Feed).filter(Feed.url == data.url).first()
+    if existing:
+        raise HTTPException(status_code=409, detail="该 RSS 源已存在")
+
+    feed = Feed(
+        url=data.url,
+        title=data.title or "",
+        description=data.description or "",
+        category=data.category or "",
+        is_active=data.is_active,
+        fetch_interval_minutes=data.fetch_interval_minutes or 60,
+    )
+    db.add(feed)
+    db.commit()
+    db.refresh(feed)
+
+    # 注册定时任务
+    if feed.is_active:
+        add_feed_job(feed.id, feed.fetch_interval_minutes)
+
+    # 立即抓取一次
+    fetch_and_store_feed(feed.id)
+
+    return {"id": feed.id, "message": "RSS 源添加成功", "url": feed.url}
+
+
+@router.post("/discover")
+def discover_feed(url: str, db: Session = Depends(get_db)):
+    """从网页自动发现 RSS feed URL"""
+    feed_urls = discover_feed_url(url)
+    return {"source_url": url, "found_feeds": feed_urls}
+
+
+@router.get("/{feed_id}", response_model=dict)
+def get_feed(feed_id: int, db: Session = Depends(get_db)):
+    """获取 RSS 源详情"""
+    feed = db.query(Feed).filter(Feed.id == feed_id).first()
+    if not feed:
+        raise HTTPException(status_code=404, detail="RSS 源不存在")
+
+    return {
+        "id": feed.id,
+        "url": feed.url,
+        "title": feed.title or feed.url,
+        "description": feed.description or "",
+        "category": feed.category or "",
+        "is_active": feed.is_active,
+        "fetch_interval_minutes": feed.fetch_interval_minutes,
+        "last_fetch_at": feed.last_fetch_at.isoformat() if feed.last_fetch_at else None,
+        "last_fetch_status": feed.last_fetch_status,
+        "last_error": feed.last_error,
+        "success_count": feed.success_count,
+        "fail_count": feed.fail_count,
+        "article_count": feed.article_count,
+        "health_status": feed.health_status(),
+        "created_at": feed.created_at.isoformat(),
+    }
+
+
+@router.put("/{feed_id}", response_model=dict)
+def update_feed(feed_id: int, data: FeedUpdate, db: Session = Depends(get_db)):
+    """更新 RSS 源"""
+    feed = db.query(Feed).filter(Feed.id == feed_id).first()
+    if not feed:
+        raise HTTPException(status_code=404, detail="RSS 源不存在")
+
+    if data.title is not None:
+        feed.title = data.title
+    if data.description is not None:
+        feed.description = data.description
+    if data.category is not None:
+        feed.category = data.category
+    if data.is_active is not None:
+        feed.is_active = data.is_active
+        if feed.is_active:
+            add_feed_job(feed.id, feed.fetch_interval_minutes)
+        else:
+            remove_feed_job(feed.id)
+    if data.fetch_interval_minutes is not None:
+        feed.fetch_interval_minutes = data.fetch_interval_minutes
+        if feed.is_active:
+            add_feed_job(feed.id, feed.fetch_interval_minutes)
+
+    db.commit()
+    return {"message": "RSS 源更新成功"}
+
+
+@router.delete("/{feed_id}")
+def delete_feed(feed_id: int, db: Session = Depends(get_db)):
+    """删除 RSS 源（级联删除文章和日志）"""
+    feed = db.query(Feed).filter(Feed.id == feed_id).first()
+    if not feed:
+        raise HTTPException(status_code=404, detail="RSS 源不存在")
+
+    remove_feed_job(feed_id)
+    db.delete(feed)
+    db.commit()
+    return {"message": "RSS 源已删除"}
+
+
+@router.post("/{feed_id}/fetch")
+def trigger_fetch(feed_id: int, db: Session = Depends(get_db)):
+    """手动触发抓取"""
+    feed = db.query(Feed).filter(Feed.id == feed_id).first()
+    if not feed:
+        raise HTTPException(status_code=404, detail="RSS 源不存在")
+
+    result = fetch_and_store_feed(feed_id)
+    return result
+
+
+@router.post("/import-opml")
+def import_opml(opml_content: str, db: Session = Depends(get_db)):
+    """导入 OPML 文件内容"""
+    import xml.etree.ElementTree as ET
+
+    try:
+        root = ET.fromstring(opml_content)
+    except ET.ParseError:
+        raise HTTPException(status_code=400, detail="无效的 OPML 文件")
+
+    added = 0
+    skipped = 0
+
+    for outline in root.iter("outline"):
+        url = outline.get("xmlUrl") or outline.get("xmlurl")
+        if not url:
+            continue
+
+        existing = db.query(Feed).filter(Feed.url == url).first()
+        if existing:
+            skipped += 1
+            continue
+
+        feed = Feed(
+            url=url,
+            title=outline.get("title", "") or outline.get("text", ""),
+            description=outline.get("description", ""),
+            category=outline.get("category", ""),
+            is_active=True,
+            fetch_interval_minutes=60,
+        )
+        db.add(feed)
+        db.commit()
+        db.refresh(feed)
+
+        add_feed_job(feed.id, feed.fetch_interval_minutes)
+        added += 1
+
+    return {"added": added, "skipped": skipped, "message": f"成功导入 {added} 个 RSS 源"}
+
+
+@router.get("/export-opml")
+def export_opml(db: Session = Depends(get_db)):
+    """导出 OPML 文件内容"""
+    feeds = db.query(Feed).all()
+
+    lines = ['<?xml version="1.0" encoding="UTF-8"?>', '<opml version="2.0">', '<head><title>rssKeeper Feeds</title></head>', '<body>']
+    for feed in feeds:
+        title = (feed.title or feed.url).replace('"', '&quot;')
+        lines.append(f'  <outline type="rss" text="{title}" xmlUrl="{feed.url}" />')
+    lines.append('</body>')
+    lines.append('</opml>')
+
+    return {"opml": "\n".join(lines)}
@@ -0,0 +1,298 @@
+"""RSS 抓取核心逻辑"""
+import time
+import re
+import html
+import hashlib
+from datetime import datetime, timezone
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from urllib.parse import urljoin
+import requests
+import feedparser
+from bs4 import BeautifulSoup
+from sqlalchemy.orm import Session
+from models import Feed, Article, FetchLog
+from database import SessionLocal
+import config
+
+
+def fetch_feed(url: str, timeout: int = config.FETCH_TIMEOUT) -> dict:
+    """抓取单个 RSS 源
+    返回 {"success": bool, "feed_data": parsed, "error": str, "response_time_ms": int}
+    """
+    start_time = time.time()
+    try:
+        headers = {
+            "User-Agent": "rssKeeper/1.0 (+https://github.com/rssKeeper)",
+            "Accept": "application/rss+xml, application/atom+xml, application/xml, text/xml, */*",
+        }
+        response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
+        response.raise_for_status()
+
+        # 解析 RSS
+        parsed = feedparser.parse(response.content)
+
+        response_time_ms = int((time.time() - start_time) * 1000)
+
+        if parsed.bozo and hasattr(parsed, 'bozo_exception'):
+            # 有解析警告但可能仍然可用
+            pass
+
+        return {
+            "success": True,
+            "feed_data": parsed,
+            "error": None,
+            "response_time_ms": response_time_ms,
+        }
+    except requests.exceptions.RequestException as e:
+        return {"success": False, "feed_data": None, "error": str(e), "response_time_ms": None}
+    except Exception as e:
+        return {"success": False, "feed_data": None, "error": str(e), "response_time_ms": None}
+
+
+def discover_feed_url(url: str, timeout: int = 15) -> list:
+    """从任意网页自动发现 RSS/Atom feed URL
+    返回找到的 feed URL 列表
+    """
+    try:
+        headers = {
+            "User-Agent": "rssKeeper/1.0 (+https://github.com/rssKeeper)",
+        }
+        response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.content, "html.parser")
+        feed_urls = []
+
+        # 查找 <link rel="alternate"> 标签
+        for link in soup.find_all("link", rel="alternate"):
+            link_type = link.get("type", "").lower()
+            href = link.get("href", "")
+            if href and any(t in link_type for t in ["rss", "atom", "xml"]):
+                full_url = urljoin(response.url, href)
+                feed_urls.append(full_url)
+
+        # 也查找常见的 RSS 链接
+        common_patterns = [
+            "/rss", "/feed", "/feeds", "/atom.xml", "/rss.xml",
+            "/index.xml", "/feed.xml", "/?feed=rss2",
+        ]
+        for pattern in common_patterns:
+            candidate = urljoin(response.url, pattern)
+            if candidate not in feed_urls:
+                # 验证是否是有效的 feed
+                try:
+                    resp = requests.head(candidate, headers=headers, timeout=5, allow_redirects=True)
+                    content_type = resp.headers.get("Content-Type", "").lower()
+                    if any(t in content_type for t in ["rss", "atom", "xml"]):
+                        feed_urls.append(candidate)
+                except Exception:
+                    pass
+
+        return list(dict.fromkeys(feed_urls))  # 去重保持顺序
+    except Exception:
+        return []
+
+
+def parse_article(entry, feed_id: int) -> dict:
+    """从 feedparser entry 解析文章数据"""
+    title = entry.get("title", "")
+    link = entry.get("link", "")
+    author = entry.get("author", "")
+
+    # 发布时间
+    published_at = None
+    if hasattr(entry, "published_parsed") and entry.published_parsed:
+        try:
+            published_at = datetime(*entry.published_parsed[:6], tzinfo=timezone.utc).replace(tzinfo=None)
+        except (ValueError, TypeError):
+            pass
+    if not published_at and hasattr(entry, "updated_parsed") and entry.updated_parsed:
+        try:
+            published_at = datetime(*entry.updated_parsed[:6], tzinfo=timezone.utc).replace(tzinfo=None)
+        except (ValueError, TypeError):
+            pass
+
+    # 内容：优先 summary，其次 content
+    content = ""
+    if hasattr(entry, "content") and entry.content:
+        content = entry.content[0].value
+    elif hasattr(entry, "summary"):
+        content = entry.summary
+
+    # 清洗 HTML
+    content = clean_html(content)
+
+    # 生成摘要
+    summary = generate_summary(content)
+
+    return {
+        "feed_id": feed_id,
+        "title": title[:1024],
+        "link": link[:2048],
+        "author": author[:256],
+        "published_at": published_at,
+        "content": content[:config.MAX_ARTICLE_CONTENT_LENGTH],
+        "summary": summary[:config.MAX_SUMMARY_LENGTH],
+    }
+
+
+def clean_html(html_text: str) -> str:
+    """清洗 HTML，去除 script/style 标签，转为安全文本"""
+    if not html_text:
+        return ""
+
+    # 先解码 HTML 实体
+    text = html.unescape(html_text)
+
+    # 用 BeautifulSoup 清理
+    soup = BeautifulSoup(text, "html.parser")
+
+    # 移除 script 和 style
+    for tag in soup(["script", "style", "iframe", "object", "embed"]):
+        tag.decompose()
+
+    # 获取纯文本
+    cleaned = soup.get_text(separator="\n")
+
+    # 压缩空白行
+    cleaned = re.sub(r"\n\s*\n+", "\n\n", cleaned)
+    cleaned = cleaned.strip()
+
+    return cleaned
+
+
+def generate_summary(content: str, max_length: int = 300) -> str:
+    """从内容生成摘要"""
+    if not content:
+        return ""
+
+    # 去掉多余空白
+    text = re.sub(r"\s+", " ", content).strip()
+
+    if len(text) <= max_length:
+        return text
+
+    # 在句子边界截断
+    truncated = text[:max_length]
+    last_period = max(truncated.rfind("。"), truncated.rfind(". "), truncated.rfind("! "), truncated.rfind("? "))
+    if last_period > max_length * 0.5:
+        return truncated[:last_period + 1]
+
+    return truncated + "..."
+
+
+def fetch_and_store_feed(feed_id: int) -> dict:
+    """抓取指定 RSS 源并存储文章
+    返回抓取结果统计
+    """
+    db = SessionLocal()
+    try:
+        feed = db.query(Feed).filter(Feed.id == feed_id).first()
+        if not feed:
+            return {"success": False, "error": "Feed not found", "articles_count": 0}
+
+        result = fetch_feed(feed.url)
+
+        if not result["success"]:
+            # 记录失败
+            feed.last_fetch_at = datetime.utcnow()
+            feed.last_fetch_status = "fail"
+            feed.last_error = result["error"]
+            feed.fail_count += 1
+
+            log = FetchLog(
+                feed_id=feed_id,
+                status="fail",
+                error_message=result["error"],
+                response_time_ms=result.get("response_time_ms"),
+            )
+            db.add(log)
+            db.commit()
+            return {"success": False, "error": result["error"], "articles_count": 0}
+
+        parsed = result["feed_data"]
+
+        # 更新 feed 元信息
+        if hasattr(parsed.feed, "title"):
+            feed.title = parsed.feed.title[:512]
+        if hasattr(parsed.feed, "description"):
+            feed.description = parsed.feed.description[:1000]
+
+        # 存储文章
+        new_count = 0
+        for entry in parsed.entries:
+            article_data = parse_article(entry, feed_id)
+            if not article_data["link"]:
+                continue
+
+            # 检查是否已存在（基于 link）
+            existing = db.query(Article).filter(Article.link == article_data["link"]).first()
+            if existing:
+                # 更新已有文章
+                existing.title = article_data["title"] or existing.title
+                existing.content = article_data["content"] or existing.content
+                existing.summary = article_data["summary"] or existing.summary
+                existing.author = article_data["author"] or existing.author
+                if article_data["published_at"]:
+                    existing.published_at = article_data["published_at"]
+            else:
+                article = Article(**article_data)
+                db.add(article)
+                new_count += 1
+
+        # 更新 feed 统计
+        feed.last_fetch_at = datetime.utcnow()
+        feed.last_fetch_status = "success"
+        feed.last_error = ""
+        feed.success_count += 1
+        feed.article_count = db.query(Article).filter(Article.feed_id == feed_id).count()
+
+        log = FetchLog(
+            feed_id=feed_id,
+            status="success",
+            articles_fetched=new_count,
+            response_time_ms=result.get("response_time_ms"),
+        )
+        db.add(log)
+        db.commit()
+
+        return {
+            "success": True,
+            "articles_count": new_count,
+            "feed_title": feed.title,
+        }
+    except Exception as e:
+        db.rollback()
+        return {"success": False, "error": str(e), "articles_count": 0}
+    finally:
+        db.close()
+
+
+def fetch_all_feeds(feed_ids: list = None) -> list:
+    """并发抓取多个 RSS 源
+    返回每个源的抓取结果列表
+    """
+    db = SessionLocal()
+    try:
+        query = db.query(Feed).filter(Feed.is_active == True)
+        if feed_ids:
+            query = query.filter(Feed.id.in_(feed_ids))
+        feeds = query.all()
+    finally:
+        db.close()
+
+    results = []
+    with ThreadPoolExecutor(max_workers=config.FETCH_CONCURRENCY) as executor:
+        future_to_feed = {
+            executor.submit(fetch_and_store_feed, feed.id): feed
+            for feed in feeds
+        }
+        for future in as_completed(future_to_feed):
+            feed = future_to_feed[future]
+            try:
+                result = future.result()
+                results.append({"feed_id": feed.id, **result})
+            except Exception as e:
+                results.append({"feed_id": feed.id, "success": False, "error": str(e), "articles_count": 0})
+
+    return results
@@ -0,0 +1,74 @@
+"""APScheduler 定时任务管理"""
+from apscheduler.schedulers.background import BackgroundScheduler
+from apscheduler.triggers.interval import IntervalTrigger
+from rss_fetcher import fetch_and_store_feed
+import config
+
+_scheduler = None
+
+
+def get_scheduler():
+    """获取或创建调度器实例"""
+    global _scheduler
+    if _scheduler is None:
+        _scheduler = BackgroundScheduler()
+    return _scheduler
+
+
+def add_feed_job(feed_id: int, interval_minutes: int):
+    """为指定 RSS 源添加定时抓取任务"""
+    scheduler = get_scheduler()
+    job_id = f"fetch_feed_{feed_id}"
+
+    # 确保间隔不低于最小值
+    interval = max(interval_minutes, config.MIN_FETCH_INTERVAL)
+
+    # 如果任务已存在则更新
+    existing = scheduler.get_job(job_id)
+    if existing:
+        existing.reschedule(trigger=IntervalTrigger(minutes=interval))
+        return
+
+    scheduler.add_job(
+        fetch_and_store_feed,
+        trigger=IntervalTrigger(minutes=interval),
+        id=job_id,
+        args=[feed_id],
+        replace_existing=True,
+        misfire_grace_time=300,  # 5分钟容错
+        coalesce=True,  # 合并错过的任务
+    )
+
+
+def remove_feed_job(feed_id: int):
+    """移除指定 RSS 源的定时任务"""
+    scheduler = get_scheduler()
+    job_id = f"fetch_feed_{feed_id}"
+    try:
+        scheduler.remove_job(job_id)
+    except Exception:
+        pass
+
+
+def start_scheduler():
+    """启动调度器"""
+    scheduler = get_scheduler()
+    if not scheduler.running:
+        scheduler.start()
+
+
+def stop_scheduler():
+    """停止调度器"""
+    global _scheduler
+    if _scheduler and _scheduler.running:
+        _scheduler.shutdown(wait=False)
+        _scheduler = None
+
+
+def init_feed_jobs(db):
+    """从数据库加载所有活跃 RSS 源并注册定时任务"""
+    from models import Feed
+    feeds = db.query(Feed).filter(Feed.is_active == True).all()
+    for feed in feeds:
+        add_feed_job(feed.id, feed.fetch_interval_minutes or config.DEFAULT_FETCH_INTERVAL)
+    start_scheduler()