feat: 代理支持、外部API增强、调度器修复、每日文章看板

- 添加 HTTP 代理支持(国内直连、外网走代理)
- 外部 API 新增全文搜索、源健康度/错误筛选、未读筛选
- 修复 APScheduler 线程静默崩溃(_safe_fetch 异常保护)
- 健康检查暴露调度器状态
- Dashboard 新增每日文章数柱状图(按 published_at)
- 文章列表 API 补上 content 字段,日期筛选修复时间范围
- 修复外部 API 双重 external 前缀
- User-Agent 改为 Chrome 标识缓解 403
- 添加完整 API 接口文档

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
congsh
2026-06-12 09:58:32 +08:00
parent 68bba3d9e0
commit 4286731348
12 changed files with 1057 additions and 44 deletions
+4
View File
@@ -16,6 +16,10 @@ FETCH_TIMEOUT = int(os.getenv("FETCH_TIMEOUT", "30"))
DEFAULT_FETCH_INTERVAL = int(os.getenv("DEFAULT_FETCH_INTERVAL", "60")) # 分钟
MIN_FETCH_INTERVAL = int(os.getenv("MIN_FETCH_INTERVAL", "15")) # 最小间隔15分钟
# 代理配置(用于访问外网源)
HTTP_PROXY = os.getenv("HTTP_PROXY", "")
HTTPS_PROXY = os.getenv("HTTPS_PROXY", "")
# 内容处理
MAX_ARTICLE_CONTENT_LENGTH = int(os.getenv("MAX_ARTICLE_CONTENT_LENGTH", "50000"))
MAX_SUMMARY_LENGTH = int(os.getenv("MAX_SUMMARY_LENGTH", "500"))
+3 -2
View File
@@ -5,7 +5,7 @@ from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
from starlette.middleware.cors import CORSMiddleware
from database import init_db, SessionLocal
from scheduler import init_feed_jobs, stop_scheduler
from scheduler import init_feed_jobs, stop_scheduler, scheduler_status
from routers import feeds, articles, dashboard, external_api
import config
@@ -57,7 +57,8 @@ app.include_router(external_api.router, prefix=config.EXTERNAL_API_PREFIX)
@app.get("/api/health")
def health_check():
"""健康检查"""
return {"status": "ok", "service": "rssKeeper"}
sched = scheduler_status()
return {"status": "ok", "service": "rssKeeper", "scheduler": sched}
# 静态文件服务(前端构建产物)— 必须放在最后,API 路由优先匹配
+1
View File
@@ -72,6 +72,7 @@ def list_articles(
"link": article.link,
"author": article.author or "",
"published_at": article.published_at.isoformat() if article.published_at else None,
"content": article.content or "",
"summary": article.summary or "",
"is_read": article.is_read,
"created_at": article.created_at.isoformat(),
+20
View File
@@ -1,6 +1,7 @@
"""仪表盘统计 API"""
from fastapi import APIRouter, Depends
from sqlalchemy.orm import Session
from sqlalchemy import func, text
from database import get_db
from health_checker import get_overall_stats, get_feed_health
@@ -31,6 +32,25 @@ def dashboard_health(
return {"total": total, "items": items}
@router.get("/articles-daily")
def articles_daily(days: int = 30, db: Session = Depends(get_db)):
    """Count articles per publication day over a recent window.

    Args:
        days: Size of the look-back window in days, counted back from the
            current date as computed by the database (``DATE('now', ...)``).
        db: Database session injected by FastAPI.

    Returns:
        A dict with the requested ``days`` and ``data``, a list of
        ``{"date", "count"}`` entries ordered newest day first. Articles
        without a ``published_at`` value are excluded.
    """
    # The window size travels as a bound parameter and is concatenated inside
    # SQL, so no caller-supplied text is interpolated into the statement.
    sql = text("""
        SELECT DATE(published_at) as date, COUNT(*) as count
        FROM articles
        WHERE published_at IS NOT NULL
          AND published_at >= DATE('now', '-' || :days || ' days')
        GROUP BY DATE(published_at)
        ORDER BY date DESC
    """)
    rows = db.execute(sql, {"days": days}).fetchall()
    return {
        "days": days,
        "data": [{"date": str(day), "count": count} for day, count in rows],
    }
@router.get("/recent-activity")
def recent_activity(limit: int = 20, db: Session = Depends(get_db)):
"""最近的抓取活动"""
+113 -28
View File
@@ -1,13 +1,14 @@
"""对外 API(供 AI/外部系统调用)"""
from typing import Optional
from typing import Optional, List
from datetime import datetime, timedelta
from fastapi import APIRouter, Depends
from fastapi import APIRouter, Depends, Query
from sqlalchemy.orm import Session
from sqlalchemy import desc
from sqlalchemy import desc, or_
from database import get_db
from models import Article, Feed
from fulltext_search import search_articles
router = APIRouter(prefix="/external", tags=["external"])
router = APIRouter(tags=["external"])
@router.get("/recent")
@@ -16,21 +17,28 @@ def get_recent_articles(
limit: int = 50,
feed_id: Optional[int] = None,
category: Optional[str] = None,
search: Optional[str] = None,
unread_only: bool = False,
db: Session = Depends(get_db),
):
"""获取最近 N 小时的文章
这是对外提供给 AI 分析的主要接口
AI 分析的主要接口,支持多条件组合筛选
"""
since = datetime.utcnow() - timedelta(hours=hours)
query = db.query(Article, Feed.title.label("feed_title"), Feed.category.label("category")).join(Feed)
query = query.filter(Article.created_at >= since)
if feed_id:
query = query.filter(Article.feed_id == feed_id)
if category:
query = query.filter(Feed.category == category)
if search:
query = query.filter(
Article.title.contains(search) | Article.summary.contains(search)
)
if unread_only:
query = query.filter(Article.is_read == False)
rows = query.order_by(desc(Article.published_at)).limit(limit).all()
@@ -40,6 +48,8 @@ def get_recent_articles(
"limit": limit,
"feed_id": feed_id,
"category": category,
"search": search,
"unread_only": unread_only,
},
"count": len(rows),
"articles": [
@@ -60,24 +70,86 @@ def get_recent_articles(
}
@router.get("/feeds")
def get_active_feeds(db: Session = Depends(get_db)):
"""获取所有活跃的 RSS 源列表"""
feeds = db.query(Feed).filter(Feed.is_active == True).all()
@router.get("/search")
def fulltext_search(
q: str = Query(..., description="搜索关键词"),
limit: int = Query(50, ge=1, le=200),
offset: int = Query(0, ge=0),
category: Optional[str] = Query(None, description="按分类筛选"),
feed_id: Optional[int] = Query(None, description="按源筛选"),
db: Session = Depends(get_db),
):
"""全文搜索文章(FTS5
供 AI 按关键词检索文章内容
"""
results, total = search_articles(q, limit, offset)
# 二次过滤分类和源
if category or feed_id:
filtered = []
for r in results:
if category and r["category"] != category:
continue
if feed_id and r["feed_id"] != feed_id:
continue
filtered.append(r)
results = filtered
total = len(filtered)
return {
"count": len(feeds),
"feeds": [
{
"id": feed.id,
"title": feed.title or feed.url,
"url": feed.url,
"category": feed.category or "",
"article_count": feed.article_count,
"last_fetch_at": feed.last_fetch_at.isoformat() if feed.last_fetch_at else None,
}
for feed in feeds
],
"query": q,
"total": total,
"offset": offset,
"limit": limit,
"articles": results,
}
@router.get("/feeds")
def get_active_feeds(
    health_status: Optional[str] = Query(None, description="按健康度筛选: healthy/warning/unhealthy/unknown"),
    category: Optional[str] = Query(None, description="按分类筛选"),
    error_type: Optional[str] = Query(None, description="按错误类型筛选"),
    is_active: Optional[bool] = Query(None, description="按启用状态筛选"),
    db: Session = Depends(get_db),
):
    """List RSS feeds, optionally narrowed by several combined filters.

    Args:
        health_status: Keep only feeds whose ``health_status()`` equals this.
        category: Keep only feeds in this category.
        error_type: Keep only feeds whose ``error_type`` equals this.
        is_active: Keep only feeds with this enabled state; when omitted,
            only enabled feeds are listed.
        db: Database session injected by FastAPI.

    Returns:
        A dict with ``count`` and ``feeds``, one summary dict per feed.
    """
    # Omitting is_active means "enabled feeds only", not "all feeds".
    wanted_active = True if is_active is None else is_active
    feed_query = db.query(Feed).filter(Feed.is_active == wanted_active)
    if category:
        feed_query = feed_query.filter(Feed.category == category)

    def _passes(feed, status):
        # Health and error type are checked here rather than in the query;
        # the health status comes from a method call on each feed.
        if health_status and status != health_status:
            return False
        return not (error_type and feed.error_type != error_type)

    payload = []
    for feed in feed_query.all():
        status = feed.health_status()
        if not _passes(feed, status):
            continue
        fetched_at = feed.last_fetch_at.isoformat() if feed.last_fetch_at else None
        payload.append({
            "id": feed.id,
            "title": feed.title or feed.url,
            "url": feed.url,
            "category": feed.category or "",
            "is_active": feed.is_active,
            "health_status": status,
            "error_type": feed.error_type,
            "article_count": feed.article_count,
            "last_fetch_at": fetched_at,
            "last_error": feed.last_error or "",
        })

    return {
        "count": len(payload),
        "feeds": payload,
    }
@@ -86,6 +158,8 @@ def get_feed_articles(
feed_id: int,
limit: int = 100,
since: Optional[str] = None,
search: Optional[str] = None,
unread_only: bool = False,
db: Session = Depends(get_db),
):
"""获取指定 RSS 源的文章"""
@@ -97,6 +171,12 @@ def get_feed_articles(
if since:
query = query.filter(Article.published_at >= since)
if search:
query = query.filter(
Article.title.contains(search) | Article.summary.contains(search)
)
if unread_only:
query = query.filter(Article.is_read == False)
articles = query.order_by(desc(Article.published_at)).limit(limit).all()
@@ -124,6 +204,7 @@ def get_feed_articles(
@router.get("/summary")
def get_daily_summary(
date: Optional[str] = None,
category: Optional[str] = None,
db: Session = Depends(get_db),
):
"""获取指定日期的文章摘要统计
@@ -141,15 +222,19 @@ def get_daily_summary(
query = db.query(Article, Feed.title.label("feed_title"), Feed.category.label("category")).join(Feed)
query = query.filter(Article.created_at >= day, Article.created_at < next_day)
if category:
query = query.filter(Feed.category == category)
rows = query.order_by(desc(Article.published_at)).all()
# 按分类统计
by_category = {}
for article, feed_title, category in rows:
cat = category or "未分类"
if cat not in by_category:
by_category[cat] = []
by_category[cat].append({
for article, feed_title, cat in rows:
c = cat or "未分类"
if category and c != category:
continue
if c not in by_category:
by_category[c] = []
by_category[c].append({
"title": article.title or "",
"link": article.link,
"feed": feed_title or "",
+33 -4
View File
@@ -13,6 +13,35 @@ from models import Feed, Article, FetchLog
from database import SessionLocal
import config
# Domestic (mainland China) domain suffixes/keywords, plus local addresses.
# Hosts matching an entry are fetched directly; every other host goes
# through the configured proxy.
CN_DOMAINS = (
".cn", ".com.cn", ".org.cn", ".net.cn",
"36kr.com", "zhihu.com", "weibo.com", "douban.com", "bilibili.com",
"tmtpost.com", "ifanr.com", "geekpark.net", "pingwest.com",
"juejin.cn", "segmentfault.com", "cnblogs.com", "csdn.net",
"qq.com", "163.com", "sohu.com", "sina.com.cn", "baidu.com",
"taobao.com", "jd.com", "aliyun.com",
"xinhuanet.com", "people.com.cn", "sciencenet.cn",
# Loopback / private-network targets.
"localhost", "127.0.0.1", "192.168.",
)
def _get_proxies(url: str) -> dict:
    """Choose the ``proxies`` mapping for a request to *url*.

    Args:
        url: The URL about to be fetched.

    Returns:
        * ``{}`` when no HTTPS proxy is configured (requests keeps its
          default behaviour).
        * ``{"http": None, "https": None}`` when the host matches an entry
          in ``CN_DOMAINS`` and must be reached directly.
        * Otherwise a mapping that routes both schemes through the
          configured proxy.
    """
    if not config.HTTPS_PROXY:
        return {}
    from urllib.parse import urlparse

    host = urlparse(url).hostname or ""
    for entry in CN_DOMAINS:
        if entry.endswith("."):
            # Address prefix such as "192.168." — a suffix test can never
            # match these, so compare against the start of the host.
            direct = host.startswith(entry)
        elif entry.startswith("."):
            # Dotted suffix such as ".cn" already carries its label boundary.
            direct = host.endswith(entry)
        else:
            # Bare domain: match the domain itself or a true subdomain, so
            # "qq.com" does not accidentally match "notqq.com".
            direct = host == entry or host.endswith("." + entry)
        if direct:
            # An empty dict would let requests fall back to the
            # HTTP_PROXY/HTTPS_PROXY environment variables (the same ones
            # config reads), so the bypass has to be explicit.
            # NOTE(review): an ALL_PROXY environment variable, if set, is
            # not overridden by this mapping — confirm it is unused.
            return {"http": None, "https": None}
    # Everything else goes through the proxy.
    return {
        "http": config.HTTP_PROXY or config.HTTPS_PROXY,
        "https": config.HTTPS_PROXY,
    }
def classify_error(error: str) -> str:
"""根据错误信息分类错误类型"""
@@ -54,10 +83,10 @@ def fetch_feed(url: str, timeout: int = config.FETCH_TIMEOUT) -> dict:
start_time = time.time()
try:
headers = {
"User-Agent": "rssKeeper/1.0 (+https://github.com/rssKeeper)",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0 Safari/537.36",
"Accept": "application/rss+xml, application/atom+xml, application/xml, text/xml, */*",
}
response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True, proxies=_get_proxies(url))
response.raise_for_status()
# 解析 RSS
@@ -87,9 +116,9 @@ def discover_feed_url(url: str, timeout: int = 15) -> list:
"""
try:
headers = {
"User-Agent": "rssKeeper/1.0 (+https://github.com/rssKeeper)",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0 Safari/537.36",
}
response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True, proxies=_get_proxies(url))
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
+31 -7
View File
@@ -1,9 +1,12 @@
"""APScheduler 定时任务管理"""
import logging
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.interval import IntervalTrigger
from rss_fetcher import fetch_and_store_feed
import config
logger = logging.getLogger(__name__)
_scheduler = None
@@ -11,32 +14,42 @@ def get_scheduler():
"""获取或创建调度器实例"""
global _scheduler
if _scheduler is None:
_scheduler = BackgroundScheduler()
_scheduler = BackgroundScheduler(
job_defaults={
"coalesce": True,
"max_instances": 1,
"misfire_grace_time": 300,
},
logger=logger,
)
return _scheduler
def _safe_fetch(feed_id: int):
    """Run one scheduled fetch, logging any failure instead of raising.

    Args:
        feed_id: Identifier passed through to ``fetch_and_store_feed``.

    Returns:
        None. Exceptions raised by the fetch are logged and swallowed so a
        single failing job cannot propagate into the scheduler.
    """
    try:
        fetch_and_store_feed(feed_id)
    except Exception as e:
        # Deliberate catch-all at the job boundary. logger.exception keeps
        # the same ERROR-level message but also records the traceback,
        # which the plain error() call discarded.
        logger.exception("调度抓取失败 feed_id=%s: %s", feed_id, e)
def add_feed_job(feed_id: int, interval_minutes: int):
"""为指定 RSS 源添加定时抓取任务"""
scheduler = get_scheduler()
job_id = f"fetch_feed_{feed_id}"
# 确保间隔不低于最小值
interval = max(interval_minutes, config.MIN_FETCH_INTERVAL)
# 如果任务已存在则更新
existing = scheduler.get_job(job_id)
if existing:
existing.reschedule(trigger=IntervalTrigger(minutes=interval))
return
scheduler.add_job(
fetch_and_store_feed,
_safe_fetch,
trigger=IntervalTrigger(minutes=interval),
id=job_id,
args=[feed_id],
replace_existing=True,
misfire_grace_time=300, # 5分钟容错
coalesce=True, # 合并错过的任务
)
@@ -55,6 +68,7 @@ def start_scheduler():
scheduler = get_scheduler()
if not scheduler.running:
scheduler.start()
logger.info("调度器已启动")
def stop_scheduler():
@@ -65,6 +79,15 @@ def stop_scheduler():
_scheduler = None
def scheduler_status():
    """Summarise the scheduler state for the health-check endpoint.

    Returns:
        ``{"running": True, "jobs": <job count>}`` while the scheduler is
        running, otherwise ``{"running": False, "jobs": 0}``.
    """
    sched = get_scheduler()
    if sched.running:
        return {"running": True, "jobs": len(sched.get_jobs())}
    # Jobs are only counted while the scheduler is running.
    return {"running": False, "jobs": 0}
def get_feed_next_run(feed_id: int):
"""获取指定 RSS 源的下一次抓取时间"""
scheduler = get_scheduler()
@@ -81,3 +104,4 @@ def init_feed_jobs(db):
for feed in feeds:
add_feed_job(feed.id, feed.fetch_interval_minutes or config.DEFAULT_FETCH_INTERVAL)
start_scheduler()
logger.info(f"已注册 {len(feeds)} 个定时抓取任务")