feat: 代理支持、外部API增强、调度器修复、每日文章看板

- 添加 HTTP 代理支持（国内直连、外网走代理）
- 外部 API 新增全文搜索、源健康度/错误筛选、未读筛选
- 修复 APScheduler 线程静默崩溃（_safe_fetch 异常保护）
- 健康检查暴露调度器状态
- Dashboard 新增每日文章数柱状图（按 published_at）
- 文章列表 API 补上 content 字段，日期筛选修复时间范围
- 修复外部 API 双重 external 前缀
- User-Agent 改为 Chrome 标识缓解 403
- 添加完整 API 接口文档

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-12 09:58:32 +08:00
parent 68bba3d9e0
commit 4286731348
12 changed files with 1057 additions and 44 deletions
@@ -16,6 +16,10 @@ FETCH_TIMEOUT = int(os.getenv("FETCH_TIMEOUT", "30"))
 DEFAULT_FETCH_INTERVAL = int(os.getenv("DEFAULT_FETCH_INTERVAL", "60"))  # 分钟
 MIN_FETCH_INTERVAL = int(os.getenv("MIN_FETCH_INTERVAL", "15"))  # 最小间隔15分钟

+# Proxy settings used when fetching feeds hosted outside the domestic network.
+# Both default to an empty string when the variable is not set.
+# NOTE(review): HTTP_PROXY / HTTPS_PROXY are also the conventional proxy
+# environment variable names, so HTTP client libraries that consult the
+# environment may apply them on their own — confirm that is intended.
+HTTP_PROXY = os.getenv("HTTP_PROXY", "")
+HTTPS_PROXY = os.getenv("HTTPS_PROXY", "")
+
 # 内容处理
 MAX_ARTICLE_CONTENT_LENGTH = int(os.getenv("MAX_ARTICLE_CONTENT_LENGTH", "50000"))
 MAX_SUMMARY_LENGTH = int(os.getenv("MAX_SUMMARY_LENGTH", "500"))
@@ -5,7 +5,7 @@ from fastapi import FastAPI
 from fastapi.staticfiles import StaticFiles
 from starlette.middleware.cors import CORSMiddleware
 from database import init_db, SessionLocal
-from scheduler import init_feed_jobs, stop_scheduler
+from scheduler import init_feed_jobs, stop_scheduler, scheduler_status
 from routers import feeds, articles, dashboard, external_api
 import config

@@ -57,7 +57,8 @@ app.include_router(external_api.router, prefix=config.EXTERNAL_API_PREFIX)
@app.get("/api/health")
 def health_check():
    """健康检查"""
-    return {"status": "ok", "service": "rssKeeper"}
+    sched = scheduler_status()
+    return {"status": "ok", "service": "rssKeeper", "scheduler": sched}


 # 静态文件服务（前端构建产物）— 必须放在最后，API 路由优先匹配
@@ -72,6 +72,7 @@ def list_articles(
            "link": article.link,
            "author": article.author or "",
            "published_at": article.published_at.isoformat() if article.published_at else None,
+            "content": article.content or "",
            "summary": article.summary or "",
            "is_read": article.is_read,
            "created_at": article.created_at.isoformat(),
@@ -1,6 +1,7 @@
 """仪表盘统计 API"""
 from fastapi import APIRouter, Depends
 from sqlalchemy.orm import Session
+from sqlalchemy import func, text
 from database import get_db
 from health_checker import get_overall_stats, get_feed_health

@@ -31,6 +32,25 @@ def dashboard_health(
    return {"total": total, "items": items}


+@router.get("/articles-daily")
+def articles_daily(days: int = 30, db: Session = Depends(get_db)):
+    """Count articles per publication date over a trailing window of days.
+
+    Args:
+        days: Size of the look-back window in days; bound into the SQL as
+            the ``:days`` parameter. No lower or upper limit is enforced.
+        db: Database session supplied through ``Depends(get_db)``.
+
+    Returns:
+        A dict with the requested ``days`` and ``data``, a list of
+        ``{"date", "count"}`` entries ordered newest date first. Articles
+        without ``published_at`` are excluded, and dates with no articles
+        do not appear in the list.
+    """
+    # NOTE(review): Article is not referenced below; the query is raw SQL
+    # against the "articles" table. Presumably safe to drop — confirm.
+    from models import Article
+    # NOTE(review): DATE('now', ...) with a concatenated modifier looks like
+    # SQLite date syntax — confirm before pointing this at another database.
+    sql = text("""
+        SELECT DATE(published_at) as date, COUNT(*) as count
+        FROM articles
+        WHERE published_at IS NOT NULL
+          AND published_at >= DATE('now', '-' || :days || ' days')
+        GROUP BY DATE(published_at)
+        ORDER BY date DESC
+    """)
+    rows = db.execute(sql, {"days": days}).fetchall()
+    return {
+        "days": days,
+        # r[0] is the grouped date, r[1] the article count for that date.
+        "data": [{"date": str(r[0]), "count": r[1]} for r in rows],
+    }
+
+
@router.get("/recent-activity")
 def recent_activity(limit: int = 20, db: Session = Depends(get_db)):
    """最近的抓取活动"""
@@ -1,13 +1,14 @@
 """对外 API（供 AI/外部系统调用）"""
-from typing import Optional
+from typing import Optional, List
 from datetime import datetime, timedelta
-from fastapi import APIRouter, Depends
+from fastapi import APIRouter, Depends, Query
 from sqlalchemy.orm import Session
-from sqlalchemy import desc
+from sqlalchemy import desc, or_
 from database import get_db
 from models import Article, Feed
+from fulltext_search import search_articles

-router = APIRouter(prefix="/external", tags=["external"])
+router = APIRouter(tags=["external"])


@router.get("/recent")
@@ -16,21 +17,28 @@ def get_recent_articles(
    limit: int = 50,
    feed_id: Optional[int] = None,
    category: Optional[str] = None,
+    search: Optional[str] = None,
+    unread_only: bool = False,
    db: Session = Depends(get_db),
 ):
    """获取最近 N 小时的文章
-    这是对外提供给 AI 分析的主要接口
+    供 AI 分析的主要接口，支持多条件组合筛选
    """
    since = datetime.utcnow() - timedelta(hours=hours)

    query = db.query(Article, Feed.title.label("feed_title"), Feed.category.label("category")).join(Feed)
-
    query = query.filter(Article.created_at >= since)

    if feed_id:
        query = query.filter(Article.feed_id == feed_id)
    if category:
        query = query.filter(Feed.category == category)
+    if search:
+        query = query.filter(
+            Article.title.contains(search) | Article.summary.contains(search)
+        )
+    if unread_only:
+        query = query.filter(Article.is_read == False)

    rows = query.order_by(desc(Article.published_at)).limit(limit).all()

@@ -40,6 +48,8 @@ def get_recent_articles(
            "limit": limit,
            "feed_id": feed_id,
            "category": category,
+            "search": search,
+            "unread_only": unread_only,
        },
        "count": len(rows),
        "articles": [
@@ -60,24 +70,86 @@ def get_recent_articles(
    }


-@router.get("/feeds")
-def get_active_feeds(db: Session = Depends(get_db)):
-    """获取所有活跃的 RSS 源列表"""
-    feeds = db.query(Feed).filter(Feed.is_active == True).all()
+@router.get("/search")
+def fulltext_search(
+    q: str = Query(..., description="搜索关键词"),
+    limit: int = Query(50, ge=1, le=200),
+    offset: int = Query(0, ge=0),
+    category: Optional[str] = Query(None, description="按分类筛选"),
+    feed_id: Optional[int] = Query(None, description="按源筛选"),
+    db: Session = Depends(get_db),
+):
+    """全文搜索文章（FTS5）
+    供 AI 按关键词检索文章内容
+    """
+    results, total = search_articles(q, limit, offset)
+
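+    # Second-pass filter by category and feed, applied to the results fetched above.
+    # NOTE(review): search_articles() is given limit/offset, so presumably it has
+    # already paginated; filtering afterwards can leave a page short, and `total`
+    # below becomes the size of the filtered page rather than the overall match
+    # count. Consider pushing category/feed_id into the search query itself.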
+    if category or feed_id:
+        filtered = []
+        for r in results:
+            if category and r["category"] != category:
+                continue
+            if feed_id and r["feed_id"] != feed_id:
+                continue
+            filtered.append(r)
+        results = filtered
+        total = len(filtered)

    return {
-        "count": len(feeds),
-        "feeds": [
-            {
-                "id": feed.id,
-                "title": feed.title or feed.url,
-                "url": feed.url,
-                "category": feed.category or "",
-                "article_count": feed.article_count,
-                "last_fetch_at": feed.last_fetch_at.isoformat() if feed.last_fetch_at else None,
-            }
-            for feed in feeds
-        ],
+        "query": q,
+        "total": total,
+        "offset": offset,
+        "limit": limit,
+        "articles": results,
+    }
+
+
+@router.get("/feeds")
+def get_active_feeds(
+    health_status: Optional[str] = Query(None, description="按健康度筛选: healthy/warning/unhealthy/unknown"),
+    category: Optional[str] = Query(None, description="按分类筛选"),
+    error_type: Optional[str] = Query(None, description="按错误类型筛选"),
+    is_active: Optional[bool] = Query(None, description="按启用状态筛选"),
+    db: Session = Depends(get_db),
+):
+    """List RSS feeds, optionally narrowed by several combinable filters.
+
+    Args:
+        health_status: Keep only feeds whose ``feed.health_status()`` equals
+            this value.
+        category: Keep only feeds in this category.
+        error_type: Keep only feeds whose ``error_type`` equals this value.
+        is_active: Filter by enabled state; when omitted, only active feeds
+            are returned.
+        db: Database session supplied through ``Depends(get_db)``.
+
+    Returns:
+        A dict with ``count`` (number of feeds after all filters) and
+        ``feeds`` (one dict per feed).
+    """
+    query = db.query(Feed)
+
+    # Omitting is_active falls back to returning active feeds only.
+    if is_active is not None:
+        query = query.filter(Feed.is_active == is_active)
+    else:
+        query = query.filter(Feed.is_active == True)
+
+    if category:
+        query = query.filter(Feed.category == category)
+
+    feeds = query.all()
+
+    results = []
+    for feed in feeds:
+        # health_status and error_type are matched per loaded row here,
+        # not inside the SQL query.
+        status = feed.health_status()
+        if health_status and status != health_status:
+            continue
+        if error_type and feed.error_type != error_type:
+            continue
+
+        results.append({
+            "id": feed.id,
+            # Fall back to the URL when the feed has no title.
+            "title": feed.title or feed.url,
+            "url": feed.url,
+            "category": feed.category or "",
+            "is_active": feed.is_active,
+            "health_status": status,
+            "error_type": feed.error_type,
+            "article_count": feed.article_count,
+            "last_fetch_at": feed.last_fetch_at.isoformat() if feed.last_fetch_at else None,
+            "last_error": feed.last_error or "",
+        })
+
+    return {
+        "count": len(results),
+        "feeds": results,
    }


@@ -86,6 +158,8 @@ def get_feed_articles(
    feed_id: int,
    limit: int = 100,
    since: Optional[str] = None,
+    search: Optional[str] = None,
+    unread_only: bool = False,
    db: Session = Depends(get_db),
 ):
    """获取指定 RSS 源的文章"""
@@ -97,6 +171,12 @@ def get_feed_articles(

    if since:
        query = query.filter(Article.published_at >= since)
+    if search:
+        query = query.filter(
+            Article.title.contains(search) | Article.summary.contains(search)
+        )
+    if unread_only:
+        query = query.filter(Article.is_read == False)

    articles = query.order_by(desc(Article.published_at)).limit(limit).all()

@@ -124,6 +204,7 @@ def get_feed_articles(
@router.get("/summary")
 def get_daily_summary(
    date: Optional[str] = None,
+    category: Optional[str] = None,
    db: Session = Depends(get_db),
 ):
    """获取指定日期的文章摘要统计
@@ -141,15 +222,19 @@ def get_daily_summary(

    query = db.query(Article, Feed.title.label("feed_title"), Feed.category.label("category")).join(Feed)
    query = query.filter(Article.created_at >= day, Article.created_at < next_day)
+    if category:
+        query = query.filter(Feed.category == category)
+
    rows = query.order_by(desc(Article.published_at)).all()

-    # 按分类统计
    by_category = {}
-    for article, feed_title, category in rows:
-        cat = category or "未分类"
-        if cat not in by_category:
-            by_category[cat] = []
-        by_category[cat].append({
+    for article, feed_title, cat in rows:
+        c = cat or "未分类"
+        if category and c != category:
+            continue
+        if c not in by_category:
+            by_category[c] = []
+        by_category[c].append({
            "title": article.title or "",
            "link": article.link,
            "feed": feed_title or "",
@@ -13,6 +13,35 @@ from models import Feed, Article, FetchLog
 from database import SessionLocal
 import config

+# 国内域名后缀/关键字 — 这些直连，其余走代理
+CN_DOMAINS = (
+    ".cn", ".com.cn", ".org.cn", ".net.cn",
+    "36kr.com", "zhihu.com", "weibo.com", "douban.com", "bilibili.com",
+    "tmtpost.com", "ifanr.com", "geekpark.net", "pingwest.com",
+    "juejin.cn", "segmentfault.com", "cnblogs.com", "csdn.net",
+    "qq.com", "163.com", "sohu.com", "sina.com.cn", "baidu.com",
+    "taobao.com", "jd.com", "aliyun.com",
+    "xinhuanet.com", "people.com.cn", "sciencenet.cn",
+    "localhost", "127.0.0.1", "192.168.",
+)
+
+
+def _get_proxies(url: str) -> dict:
+    """根据 URL 判断是否需要代理，返回 proxies dict"""
+    if not config.HTTPS_PROXY:
+        return {}
+    from urllib.parse import urlparse
+    host = urlparse(url).hostname or ""
+    # 国内域名直连
+    for d in CN_DOMAINS:
+        if host.endswith(d) or host == d:
+            return {}
+    # 外网走代理
+    return {
+        "http": config.HTTP_PROXY or config.HTTPS_PROXY,
+        "https": config.HTTPS_PROXY,
+    }
+

 def classify_error(error: str) -> str:
    """根据错误信息分类错误类型"""
@@ -54,10 +83,10 @@ def fetch_feed(url: str, timeout: int = config.FETCH_TIMEOUT) -> dict:
    start_time = time.time()
    try:
        headers = {
-            "User-Agent": "rssKeeper/1.0 (+https://github.com/rssKeeper)",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0 Safari/537.36",
            "Accept": "application/rss+xml, application/atom+xml, application/xml, text/xml, */*",
        }
-        response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
+        response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True, proxies=_get_proxies(url))
        response.raise_for_status()

        # 解析 RSS
@@ -87,9 +116,9 @@ def discover_feed_url(url: str, timeout: int = 15) -> list:
    """
    try:
        headers = {
-            "User-Agent": "rssKeeper/1.0 (+https://github.com/rssKeeper)",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0 Safari/537.36",
        }
-        response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
+        response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True, proxies=_get_proxies(url))
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")
@@ -1,9 +1,12 @@
 """APScheduler 定时任务管理"""
+import logging
 from apscheduler.schedulers.background import BackgroundScheduler
 from apscheduler.triggers.interval import IntervalTrigger
 from rss_fetcher import fetch_and_store_feed
 import config

+logger = logging.getLogger(__name__)
+
 _scheduler = None


@@ -11,32 +14,42 @@ def get_scheduler():
    """获取或创建调度器实例"""
    global _scheduler
    if _scheduler is None:
-        _scheduler = BackgroundScheduler()
+        _scheduler = BackgroundScheduler(
+            job_defaults={
+                "coalesce": True,
+                "max_instances": 1,
+                "misfire_grace_time": 300,
+            },
+            logger=logger,
+        )
    return _scheduler


+def _safe_fetch(feed_id: int):
+    """安全包装：防止单个 job 异常导致调度器线程崩溃"""
+    try:
+        fetch_and_store_feed(feed_id)
+    except Exception as e:
+        logger.error(f"调度抓取失败 feed_id={feed_id}: {e}")
+
+
 def add_feed_job(feed_id: int, interval_minutes: int):
    """为指定 RSS 源添加定时抓取任务"""
    scheduler = get_scheduler()
    job_id = f"fetch_feed_{feed_id}"
-
-    # 确保间隔不低于最小值
    interval = max(interval_minutes, config.MIN_FETCH_INTERVAL)

-    # 如果任务已存在则更新
    existing = scheduler.get_job(job_id)
    if existing:
        existing.reschedule(trigger=IntervalTrigger(minutes=interval))
        return

    scheduler.add_job(
-        fetch_and_store_feed,
+        _safe_fetch,
        trigger=IntervalTrigger(minutes=interval),
        id=job_id,
        args=[feed_id],
        replace_existing=True,
-        misfire_grace_time=300,  # 5分钟容错
-        coalesce=True,  # 合并错过的任务
    )


@@ -55,6 +68,7 @@ def start_scheduler():
    scheduler = get_scheduler()
    if not scheduler.running:
        scheduler.start()
+        logger.info("调度器已启动")


 def stop_scheduler():
@@ -65,6 +79,15 @@ def stop_scheduler():
        _scheduler = None


+def scheduler_status():
+    """获取调度器状态（供健康检查使用）"""
+    scheduler = get_scheduler()
+    if not scheduler.running:
+        return {"running": False, "jobs": 0}
+    jobs = scheduler.get_jobs()
+    return {"running": True, "jobs": len(jobs)}
+
+
 def get_feed_next_run(feed_id: int):
    """获取指定 RSS 源的下一次抓取时间"""
    scheduler = get_scheduler()
@@ -81,3 +104,4 @@ def init_feed_jobs(db):
    for feed in feeds:
        add_feed_job(feed.id, feed.fetch_interval_minutes or config.DEFAULT_FETCH_INTERVAL)
    start_scheduler()
+    logger.info(f"已注册 {len(feeds)} 个定时抓取任务")