feat: init rssKeeper - RSS 抓取、管理与检索系统

完整功能包括: - FastAPI 后端 + SQLite + FTS5 全文搜索 - RSS 源管理、自动发现、OPML 导入导出 - 文章抓取、去重、分类、全文检索 - RSS 源健康度监控 - Vue 3 + Element Plus 暗色主题 Web UI - 对外 REST API 供 AI 分析调用 - Docker + docker-compose 部署
2026-06-11 14:03:36 +08:00
commit 54e7db0ef0
28 changed files with 2915 additions and 0 deletions
@@ -0,0 +1,133 @@
+"""文章管理 API"""
+from typing import Optional
+from fastapi import APIRouter, Depends
+from pydantic import BaseModel
+from sqlalchemy.orm import Session
+from sqlalchemy import desc
+from database import get_db
+from models import Article, Feed
+from fulltext_search import search_articles
+
+# Router for the article endpoints; every route below is mounted under the "/articles" prefix.
+router = APIRouter(prefix="/articles", tags=["articles"])
+
+
+class ArticleOut(BaseModel):
+    """Schema describing one article together with the metadata of its feed.
+
+    NOTE(review): none of the handlers visible in this module use this model
+    as a ``response_model``; they return plain dicts carrying the same keys.
+    """
+    id: int
+    feed_id: int
+    title: str
+    link: str
+    author: str
+    # Handlers in this module fill this from ``datetime.isoformat()`` or None.
+    published_at: Optional[str]
+    summary: str
+    is_read: bool
+    created_at: str
+    # Title and category are taken from the owning Feed row.
+    feed_title: str
+    category: str
+
+    class Config:
+        # Pydantic option allowing the model to be populated from object attributes.
+        from_attributes = True
+
+
+@router.get("")
+def list_articles(
+    skip: int = 0,
+    limit: int = 50,
+    feed_id: Optional[int] = None,
+    category: Optional[str] = None,
+    search: Optional[str] = None,
+    since: Optional[str] = None,
+    until: Optional[str] = None,
+    is_read: Optional[bool] = None,
+    db: Session = Depends(get_db),
+):
+    """获取文章列表，支持多种筛选条件"""
+
+    # 如果有搜索关键词，使用 FTS5 全文搜索
+    if search and search.strip():
+        results, total = search_articles(search.strip(), limit=limit, offset=skip)
+        return {"total": total, "items": results}
+
+    query = db.query(Article, Feed.title.label("feed_title"), Feed.category.label("category")).join(Feed)
+
+    if feed_id:
+        query = query.filter(Article.feed_id == feed_id)
+    if category:
+        query = query.filter(Feed.category == category)
+    if is_read is not None:
+        query = query.filter(Article.is_read == is_read)
+    if since:
+        query = query.filter(Article.published_at >= since)
+    if until:
+        query = query.filter(Article.published_at <= until)
+
+    total = query.count()
+    rows = query.order_by(desc(Article.published_at)).offset(skip).limit(limit).all()
+
+    items = []
+    for article, feed_title, category in rows:
+        items.append({
+            "id": article.id,
+            "feed_id": article.feed_id,
+            "title": article.title or "",
+            "link": article.link,
+            "author": article.author or "",
+            "published_at": article.published_at.isoformat() if article.published_at else None,
+            "summary": article.summary or "",
+            "is_read": article.is_read,
+            "created_at": article.created_at.isoformat(),
+            "feed_title": feed_title or "",
+            "category": category or "",
+        })
+
+    return {"total": total, "items": items}
+
+
+@router.get("/{article_id}")
+def get_article(article_id: int, db: Session = Depends(get_db)):
+    """获取文章详情"""
+    article = db.query(Article).filter(Article.id == article_id).first()
+    if not article:
+        raise HTTPException(status_code=404, detail="文章不存在")
+
+    feed = db.query(Feed).filter(Feed.id == article.feed_id).first()
+
+    return {
+        "id": article.id,
+        "feed_id": article.feed_id,
+        "title": article.title or "",
+        "link": article.link,
+        "author": article.author or "",
+        "published_at": article.published_at.isoformat() if article.published_at else None,
+        "content": article.content or "",
+        "summary": article.summary or "",
+        "is_read": article.is_read,
+        "created_at": article.created_at.isoformat(),
+        "feed_title": feed.title if feed else "",
+        "category": feed.category if feed else "",
+    }
+
+
+@router.put("/{article_id}/read")
+def mark_read(article_id: int, db: Session = Depends(get_db)):
+    """标记文章为已读"""
+    article = db.query(Article).filter(Article.id == article_id).first()
+    if not article:
+        raise HTTPException(status_code=404, detail="文章不存在")
+
+    article.is_read = True
+    db.commit()
+    return {"message": "已标记为已读"}
+
+
+@router.get("/search/fulltext")
+def fulltext_search(
+    q: str,
+    skip: int = 0,
+    limit: int = 50,
+):
+    """全文搜索文章"""
+    results, total = search_articles(q, limit=limit, offset=skip)
+    return {"total": total, "items": results}
+
+
+from fastapi import HTTPException
@@ -0,0 +1,58 @@
+"""仪表盘统计 API"""
+from fastapi import APIRouter, Depends
+from sqlalchemy.orm import Session
+from database import get_db
+from health_checker import get_overall_stats, get_feed_health
+
+# Router for the dashboard endpoints; every route below is mounted under the "/dashboard" prefix.
+router = APIRouter(prefix="/dashboard", tags=["dashboard"])
+
+
+@router.get("/stats")
+def dashboard_stats(db: Session = Depends(get_db)):
+    """仪表盘统计数据"""
+    return get_overall_stats(db)
+
+
+@router.get("/health")
+def dashboard_health(
+    skip: int = 0,
+    limit: int = 100,
+    db: Session = Depends(get_db),
+):
+    """RSS 源健康度列表"""
+    all_health = get_feed_health(db)
+    total = len(all_health)
+
+    # 按健康状态排序：异常在前
+    status_order = {"unhealthy": 0, "warning": 1, "unknown": 2, "healthy": 3}
+    all_health.sort(key=lambda x: status_order.get(x["health_status"], 2))
+
+    items = all_health[skip:skip + limit]
+    return {"total": total, "items": items}
+
+
+@router.get("/recent-activity")
+def recent_activity(limit: int = 20, db: Session = Depends(get_db)):
+    """最近的抓取活动"""
+    from models import FetchLog, Feed
+    from sqlalchemy import desc
+
+    logs = db.query(FetchLog, Feed.title.label("feed_title")).join(Feed).order_by(
+        desc(FetchLog.created_at)
+    ).limit(limit).all()
+
+    return {
+        "items": [
+            {
+                "id": log.id,
+                "feed_id": log.feed_id,
+                "feed_title": feed_title or "",
+                "status": log.status,
+                "articles_fetched": log.articles_fetched,
+                "response_time_ms": log.response_time_ms,
+                "error_message": log.error_message,
+                "created_at": log.created_at.isoformat(),
+            }
+            for log, feed_title in logs
+        ]
+    }
@@ -0,0 +1,163 @@
+"""对外 API（供 AI/外部系统调用）"""
+from typing import Optional
+from datetime import datetime, timedelta
+from fastapi import APIRouter, Depends
+from sqlalchemy.orm import Session
+from sqlalchemy import desc
+from database import get_db
+from models import Article, Feed
+
+# Router for the externally consumed endpoints; every route below is mounted under "/external".
+router = APIRouter(prefix="/external", tags=["external"])
+
+
+@router.get("/recent")
+def get_recent_articles(
+    hours: int = 24,
+    limit: int = 50,
+    feed_id: Optional[int] = None,
+    category: Optional[str] = None,
+    db: Session = Depends(get_db),
+):
+    """获取最近 N 小时的文章
+    这是对外提供给 AI 分析的主要接口
+    """
+    since = datetime.utcnow() - timedelta(hours=hours)
+
+    query = db.query(Article, Feed.title.label("feed_title"), Feed.category.label("category")).join(Feed)
+
+    query = query.filter(Article.created_at >= since)
+
+    if feed_id:
+        query = query.filter(Article.feed_id == feed_id)
+    if category:
+        query = query.filter(Feed.category == category)
+
+    rows = query.order_by(desc(Article.published_at)).limit(limit).all()
+
+    return {
+        "query": {
+            "hours": hours,
+            "limit": limit,
+            "feed_id": feed_id,
+            "category": category,
+        },
+        "count": len(rows),
+        "articles": [
+            {
+                "id": article.id,
+                "title": article.title or "",
+                "link": article.link,
+                "author": article.author or "",
+                "summary": article.summary or "",
+                "content": article.content or "" if len(article.content or "") < 10000 else article.summary or "",
+                "published_at": article.published_at.isoformat() if article.published_at else None,
+                "created_at": article.created_at.isoformat(),
+                "feed_title": feed_title or "",
+                "category": category or "",
+            }
+            for article, feed_title, category in rows
+        ],
+    }
+
+
+@router.get("/feeds")
+def get_active_feeds(db: Session = Depends(get_db)):
+    """获取所有活跃的 RSS 源列表"""
+    feeds = db.query(Feed).filter(Feed.is_active == True).all()
+
+    return {
+        "count": len(feeds),
+        "feeds": [
+            {
+                "id": feed.id,
+                "title": feed.title or feed.url,
+                "url": feed.url,
+                "category": feed.category or "",
+                "article_count": feed.article_count,
+                "last_fetch_at": feed.last_fetch_at.isoformat() if feed.last_fetch_at else None,
+            }
+            for feed in feeds
+        ],
+    }
+
+
+@router.get("/feeds/{feed_id}/articles")
+def get_feed_articles(
+    feed_id: int,
+    limit: int = 100,
+    since: Optional[str] = None,
+    db: Session = Depends(get_db),
+):
+    """获取指定 RSS 源的文章"""
+    feed = db.query(Feed).filter(Feed.id == feed_id).first()
+    if not feed:
+        return {"error": "Feed not found"}
+
+    query = db.query(Article).filter(Article.feed_id == feed_id)
+
+    if since:
+        query = query.filter(Article.published_at >= since)
+
+    articles = query.order_by(desc(Article.published_at)).limit(limit).all()
+
+    return {
+        "feed": {
+            "id": feed.id,
+            "title": feed.title or feed.url,
+            "url": feed.url,
+        },
+        "count": len(articles),
+        "articles": [
+            {
+                "id": article.id,
+                "title": article.title or "",
+                "link": article.link,
+                "author": article.author or "",
+                "summary": article.summary or "",
+                "published_at": article.published_at.isoformat() if article.published_at else None,
+            }
+            for article in articles
+        ],
+    }
+
+
+@router.get("/summary")
+def get_daily_summary(
+    date: Optional[str] = None,
+    db: Session = Depends(get_db),
+):
+    """获取指定日期的文章摘要统计
+    供 AI 快速了解某天的 RSS 内容概况
+    """
+    if date:
+        try:
+            day = datetime.strptime(date, "%Y-%m-%d")
+            next_day = day + timedelta(days=1)
+        except ValueError:
+            return {"error": "Invalid date format, use YYYY-MM-DD"}
+    else:
+        day = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
+        next_day = day + timedelta(days=1)
+
+    query = db.query(Article, Feed.title.label("feed_title"), Feed.category.label("category")).join(Feed)
+    query = query.filter(Article.created_at >= day, Article.created_at < next_day)
+    rows = query.order_by(desc(Article.published_at)).all()
+
+    # 按分类统计
+    by_category = {}
+    for article, feed_title, category in rows:
+        cat = category or "未分类"
+        if cat not in by_category:
+            by_category[cat] = []
+        by_category[cat].append({
+            "title": article.title or "",
+            "link": article.link,
+            "feed": feed_title or "",
+            "summary": article.summary or "",
+        })
+
+    return {
+        "date": day.strftime("%Y-%m-%d"),
+        "total_articles": len(rows),
+        "by_category": by_category,
+    }
@@ -0,0 +1,273 @@
+"""RSS 源管理 API"""
+from typing import List, Optional
+from fastapi import APIRouter, Depends, HTTPException
+from pydantic import BaseModel, HttpUrl
+from sqlalchemy.orm import Session
+from database import get_db
+from models import Feed
+from rss_fetcher import discover_feed_url, fetch_and_store_feed
+from scheduler import add_feed_job, remove_feed_job
+
+# Router for the feed-management endpoints; every route below is mounted under the "/feeds" prefix.
+router = APIRouter(prefix="/feeds", tags=["feeds"])
+
+
+class FeedCreate(BaseModel):
+    """Request body accepted when adding a new RSS feed; only ``url`` is required."""
+    url: str
+    title: Optional[str] = ""
+    description: Optional[str] = ""
+    category: Optional[str] = ""
+    is_active: Optional[bool] = True
+    # Interval between scheduled fetches, in minutes.
+    fetch_interval_minutes: Optional[int] = 60
+
+
+class FeedUpdate(BaseModel):
+    """Request body for a partial feed update.
+
+    Every field defaults to None; the update handler leaves None fields untouched.
+    """
+    title: Optional[str] = None
+    description: Optional[str] = None
+    category: Optional[str] = None
+    is_active: Optional[bool] = None
+    # Interval between scheduled fetches, in minutes.
+    fetch_interval_minutes: Optional[int] = None
+
+
+class FeedOut(BaseModel):
+    """Schema describing a feed together with its fetch statistics.
+
+    NOTE(review): the handlers visible in this module declare
+    ``response_model=dict`` and build plain dicts with these keys instead of
+    using this model.
+    """
+    id: int
+    url: str
+    title: str
+    description: str
+    category: str
+    is_active: bool
+    fetch_interval_minutes: int
+    # Handlers in this module fill this from ``datetime.isoformat()`` or None.
+    last_fetch_at: Optional[str] = None
+    last_fetch_status: str
+    success_count: int
+    fail_count: int
+    article_count: int
+    # Handlers in this module obtain this value by calling ``Feed.health_status()``.
+    health_status: str
+    created_at: str
+
+    class Config:
+        # Pydantic option allowing the model to be populated from object attributes.
+        from_attributes = True
+
+
+@router.get("", response_model=dict)
+def list_feeds(
+    skip: int = 0,
+    limit: int = 100,
+    category: Optional[str] = None,
+    search: Optional[str] = None,
+    is_active: Optional[bool] = None,
+    db: Session = Depends(get_db),
+):
+    """获取 RSS 源列表，支持分页、分类筛选、搜索"""
+    query = db.query(Feed)
+
+    if category:
+        query = query.filter(Feed.category == category)
+    if is_active is not None:
+        query = query.filter(Feed.is_active == is_active)
+    if search:
+        query = query.filter(
+            Feed.title.contains(search) | Feed.url.contains(search) | Feed.description.contains(search)
+        )
+
+    total = query.count()
+    feeds = query.order_by(Feed.created_at.desc()).offset(skip).limit(limit).all()
+
+    results = []
+    for feed in feeds:
+        data = {
+            "id": feed.id,
+            "url": feed.url,
+            "title": feed.title or feed.url,
+            "description": feed.description or "",
+            "category": feed.category or "",
+            "is_active": feed.is_active,
+            "fetch_interval_minutes": feed.fetch_interval_minutes,
+            "last_fetch_at": feed.last_fetch_at.isoformat() if feed.last_fetch_at else None,
+            "last_fetch_status": feed.last_fetch_status,
+            "success_count": feed.success_count,
+            "fail_count": feed.fail_count,
+            "article_count": feed.article_count,
+            "health_status": feed.health_status(),
+            "created_at": feed.created_at.isoformat(),
+        }
+        results.append(data)
+
+    return {"total": total, "items": results}
+
+
+@router.get("/categories")
+def list_categories(db: Session = Depends(get_db)):
+    """获取所有分类列表"""
+    categories = db.query(Feed.category).filter(Feed.category != "").distinct().all()
+    return [c[0] for c in categories if c[0]]
+
+
+@router.post("", response_model=dict)
+def create_feed(data: FeedCreate, db: Session = Depends(get_db)):
+    """添加 RSS 源"""
+    # 检查是否已存在
+    existing = db.query(Feed).filter(Feed.url == data.url).first()
+    if existing:
+        raise HTTPException(status_code=409, detail="该 RSS 源已存在")
+
+    feed = Feed(
+        url=data.url,
+        title=data.title or "",
+        description=data.description or "",
+        category=data.category or "",
+        is_active=data.is_active,
+        fetch_interval_minutes=data.fetch_interval_minutes or 60,
+    )
+    db.add(feed)
+    db.commit()
+    db.refresh(feed)
+
+    # 注册定时任务
+    if feed.is_active:
+        add_feed_job(feed.id, feed.fetch_interval_minutes)
+
+    # 立即抓取一次
+    fetch_and_store_feed(feed.id)
+
+    return {"id": feed.id, "message": "RSS 源添加成功", "url": feed.url}
+
+
+@router.post("/discover")
+def discover_feed(url: str, db: Session = Depends(get_db)):
+    """从网页自动发现 RSS feed URL"""
+    feed_urls = discover_feed_url(url)
+    return {"source_url": url, "found_feeds": feed_urls}
+
+
+@router.get("/{feed_id}", response_model=dict)
+def get_feed(feed_id: int, db: Session = Depends(get_db)):
+    """获取 RSS 源详情"""
+    feed = db.query(Feed).filter(Feed.id == feed_id).first()
+    if not feed:
+        raise HTTPException(status_code=404, detail="RSS 源不存在")
+
+    return {
+        "id": feed.id,
+        "url": feed.url,
+        "title": feed.title or feed.url,
+        "description": feed.description or "",
+        "category": feed.category or "",
+        "is_active": feed.is_active,
+        "fetch_interval_minutes": feed.fetch_interval_minutes,
+        "last_fetch_at": feed.last_fetch_at.isoformat() if feed.last_fetch_at else None,
+        "last_fetch_status": feed.last_fetch_status,
+        "last_error": feed.last_error,
+        "success_count": feed.success_count,
+        "fail_count": feed.fail_count,
+        "article_count": feed.article_count,
+        "health_status": feed.health_status(),
+        "created_at": feed.created_at.isoformat(),
+    }
+
+
+@router.put("/{feed_id}", response_model=dict)
+def update_feed(feed_id: int, data: FeedUpdate, db: Session = Depends(get_db)):
+    """更新 RSS 源"""
+    feed = db.query(Feed).filter(Feed.id == feed_id).first()
+    if not feed:
+        raise HTTPException(status_code=404, detail="RSS 源不存在")
+
+    if data.title is not None:
+        feed.title = data.title
+    if data.description is not None:
+        feed.description = data.description
+    if data.category is not None:
+        feed.category = data.category
+    if data.is_active is not None:
+        feed.is_active = data.is_active
+        if feed.is_active:
+            add_feed_job(feed.id, feed.fetch_interval_minutes)
+        else:
+            remove_feed_job(feed.id)
+    if data.fetch_interval_minutes is not None:
+        feed.fetch_interval_minutes = data.fetch_interval_minutes
+        if feed.is_active:
+            add_feed_job(feed.id, feed.fetch_interval_minutes)
+
+    db.commit()
+    return {"message": "RSS 源更新成功"}
+
+
+@router.delete("/{feed_id}")
+def delete_feed(feed_id: int, db: Session = Depends(get_db)):
+    """删除 RSS 源（级联删除文章和日志）"""
+    feed = db.query(Feed).filter(Feed.id == feed_id).first()
+    if not feed:
+        raise HTTPException(status_code=404, detail="RSS 源不存在")
+
+    remove_feed_job(feed_id)
+    db.delete(feed)
+    db.commit()
+    return {"message": "RSS 源已删除"}
+
+
+@router.post("/{feed_id}/fetch")
+def trigger_fetch(feed_id: int, db: Session = Depends(get_db)):
+    """手动触发抓取"""
+    feed = db.query(Feed).filter(Feed.id == feed_id).first()
+    if not feed:
+        raise HTTPException(status_code=404, detail="RSS 源不存在")
+
+    result = fetch_and_store_feed(feed_id)
+    return result
+
+
+@router.post("/import-opml")
+def import_opml(opml_content: str, db: Session = Depends(get_db)):
+    """导入 OPML 文件内容"""
+    import xml.etree.ElementTree as ET
+
+    try:
+        root = ET.fromstring(opml_content)
+    except ET.ParseError:
+        raise HTTPException(status_code=400, detail="无效的 OPML 文件")
+
+    added = 0
+    skipped = 0
+
+    for outline in root.iter("outline"):
+        url = outline.get("xmlUrl") or outline.get("xmlurl")
+        if not url:
+            continue
+
+        existing = db.query(Feed).filter(Feed.url == url).first()
+        if existing:
+            skipped += 1
+            continue
+
+        feed = Feed(
+            url=url,
+            title=outline.get("title", "") or outline.get("text", ""),
+            description=outline.get("description", ""),
+            category=outline.get("category", ""),
+            is_active=True,
+            fetch_interval_minutes=60,
+        )
+        db.add(feed)
+        db.commit()
+        db.refresh(feed)
+
+        add_feed_job(feed.id, feed.fetch_interval_minutes)
+        added += 1
+
+    return {"added": added, "skipped": skipped, "message": f"成功导入 {added} 个 RSS 源"}
+
+
+@router.get("/export-opml")
+def export_opml(db: Session = Depends(get_db)):
+    """导出 OPML 文件内容"""
+    feeds = db.query(Feed).all()
+
+    lines = ['<?xml version="1.0" encoding="UTF-8"?>', '<opml version="2.0">', '<head><title>rssKeeper Feeds</title></head>', '<body>']
+    for feed in feeds:
+        title = (feed.title or feed.url).replace('"', '&quot;')
+        lines.append(f'  <outline type="rss" text="{title}" xmlUrl="{feed.url}" />')
+    lines.append('</body>')
+    lines.append('</opml>')
+
+    return {"opml": "\n".join(lines)}