feat: 代理支持、外部API增强、调度器修复、每日文章看板
- 添加 HTTP 代理支持(国内直连、外网走代理) - 外部 API 新增全文搜索、源健康度/错误筛选、未读筛选 - 修复 APScheduler 线程静默崩溃(_safe_fetch 异常保护) - 健康检查暴露调度器状态 - Dashboard 新增每日文章数柱状图(按 published_at) - 文章列表 API 补上 content 字段,日期筛选修复时间范围 - 修复外部 API 双重 external 前缀 - User-Agent 改为 Chrome 标识缓解 403 - 添加完整 API 接口文档 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -16,6 +16,10 @@ FETCH_TIMEOUT = int(os.getenv("FETCH_TIMEOUT", "30"))
|
||||
DEFAULT_FETCH_INTERVAL = int(os.getenv("DEFAULT_FETCH_INTERVAL", "60")) # minutes
|
||||
MIN_FETCH_INTERVAL = int(os.getenv("MIN_FETCH_INTERVAL", "15")) # lower bound for a feed's interval, in minutes (default 15)
|
||||
|
||||
# Proxy settings (used to reach feeds outside the domestic network)
|
||||
HTTP_PROXY = os.getenv("HTTP_PROXY", "")
|
||||
HTTPS_PROXY = os.getenv("HTTPS_PROXY", "")
|
||||
|
||||
# Content processing
|
||||
MAX_ARTICLE_CONTENT_LENGTH = int(os.getenv("MAX_ARTICLE_CONTENT_LENGTH", "50000"))
|
||||
MAX_SUMMARY_LENGTH = int(os.getenv("MAX_SUMMARY_LENGTH", "500"))
|
||||
|
||||
+3
-2
@@ -5,7 +5,7 @@ from fastapi import FastAPI
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from starlette.middleware.cors import CORSMiddleware
|
||||
from database import init_db, SessionLocal
|
||||
from scheduler import init_feed_jobs, stop_scheduler
|
||||
from scheduler import init_feed_jobs, stop_scheduler, scheduler_status
|
||||
from routers import feeds, articles, dashboard, external_api
|
||||
import config
|
||||
|
||||
@@ -57,7 +57,8 @@ app.include_router(external_api.router, prefix=config.EXTERNAL_API_PREFIX)
|
||||
@app.get("/api/health")
def health_check():
    """Health check endpoint.

    Returns:
        A dict with the fixed ``status`` / ``service`` identity fields plus
        ``scheduler``, the value reported by ``scheduler_status()``.
    """
    # A single return: an unconditional early return here would hide the
    # scheduler state from monitoring.
    return {
        "status": "ok",
        "service": "rssKeeper",
        "scheduler": scheduler_status(),
    }
|
||||
|
||||
|
||||
# 静态文件服务(前端构建产物)— 必须放在最后,API 路由优先匹配
|
||||
|
||||
@@ -72,6 +72,7 @@ def list_articles(
|
||||
"link": article.link,
|
||||
"author": article.author or "",
|
||||
"published_at": article.published_at.isoformat() if article.published_at else None,
|
||||
"content": article.content or "",
|
||||
"summary": article.summary or "",
|
||||
"is_read": article.is_read,
|
||||
"created_at": article.created_at.isoformat(),
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
"""仪表盘统计 API"""
|
||||
from fastapi import APIRouter, Depends
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy import func, text
|
||||
from database import get_db
|
||||
from health_checker import get_overall_stats, get_feed_health
|
||||
|
||||
@@ -31,6 +32,25 @@ def dashboard_health(
|
||||
return {"total": total, "items": items}
|
||||
|
||||
|
||||
@router.get("/articles-daily")
def articles_daily(days: int = 30, db: Session = Depends(get_db)):
    """Count articles per publication day.

    Args:
        days: Size of the look-back window in days, counted back from the
            database's ``DATE('now')``.
        db: Database session injected by FastAPI.

    Returns:
        A dict with the requested ``days`` and ``data``: a list of
        ``{"date": str, "count": int}`` entries, newest day first. Articles
        whose ``published_at`` is NULL are excluded.
    """
    # The '-N days' modifier is assembled inside SQL from the bound integer,
    # so nothing from the request is interpolated into the statement text.
    sql = text("""
        SELECT DATE(published_at) as date, COUNT(*) as count
        FROM articles
        WHERE published_at IS NOT NULL
          AND published_at >= DATE('now', '-' || :days || ' days')
        GROUP BY DATE(published_at)
        ORDER BY date DESC
    """)
    rows = db.execute(sql, {"days": days}).fetchall()
    return {
        "days": days,
        "data": [{"date": str(day), "count": count} for day, count in rows],
    }
|
||||
|
||||
|
||||
@router.get("/recent-activity")
|
||||
def recent_activity(limit: int = 20, db: Session = Depends(get_db)):
|
||||
"""最近的抓取活动"""
|
||||
|
||||
+113
-28
@@ -1,13 +1,14 @@
|
||||
"""对外 API(供 AI/外部系统调用)"""
|
||||
from typing import Optional
|
||||
from typing import Optional, List
|
||||
from datetime import datetime, timedelta
|
||||
from fastapi import APIRouter, Depends
|
||||
from fastapi import APIRouter, Depends, Query
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy import desc
|
||||
from sqlalchemy import desc, or_
|
||||
from database import get_db
|
||||
from models import Article, Feed
|
||||
from fulltext_search import search_articles
|
||||
|
||||
router = APIRouter(prefix="/external", tags=["external"])
|
||||
router = APIRouter(tags=["external"])
|
||||
|
||||
|
||||
@router.get("/recent")
|
||||
@@ -16,21 +17,28 @@ def get_recent_articles(
|
||||
limit: int = 50,
|
||||
feed_id: Optional[int] = None,
|
||||
category: Optional[str] = None,
|
||||
search: Optional[str] = None,
|
||||
unread_only: bool = False,
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
"""获取最近 N 小时的文章
|
||||
这是对外提供给 AI 分析的主要接口
|
||||
供 AI 分析的主要接口,支持多条件组合筛选
|
||||
"""
|
||||
since = datetime.utcnow() - timedelta(hours=hours)
|
||||
|
||||
query = db.query(Article, Feed.title.label("feed_title"), Feed.category.label("category")).join(Feed)
|
||||
|
||||
query = query.filter(Article.created_at >= since)
|
||||
|
||||
if feed_id:
|
||||
query = query.filter(Article.feed_id == feed_id)
|
||||
if category:
|
||||
query = query.filter(Feed.category == category)
|
||||
if search:
|
||||
query = query.filter(
|
||||
Article.title.contains(search) | Article.summary.contains(search)
|
||||
)
|
||||
if unread_only:
|
||||
query = query.filter(Article.is_read == False)
|
||||
|
||||
rows = query.order_by(desc(Article.published_at)).limit(limit).all()
|
||||
|
||||
@@ -40,6 +48,8 @@ def get_recent_articles(
|
||||
"limit": limit,
|
||||
"feed_id": feed_id,
|
||||
"category": category,
|
||||
"search": search,
|
||||
"unread_only": unread_only,
|
||||
},
|
||||
"count": len(rows),
|
||||
"articles": [
|
||||
@@ -60,24 +70,86 @@ def get_recent_articles(
|
||||
}
|
||||
|
||||
|
||||
@router.get("/search")
def fulltext_search(
    q: str = Query(..., description="搜索关键词"),
    limit: int = Query(50, ge=1, le=200),
    offset: int = Query(0, ge=0),
    category: Optional[str] = Query(None, description="按分类筛选"),
    feed_id: Optional[int] = Query(None, description="按源筛选"),
    db: Session = Depends(get_db),
):
    """Full-text article search for external / AI callers.

    Delegates the keyword lookup to ``search_articles`` and optionally
    narrows the returned page by category and feed.

    Args:
        q: Search keywords (required).
        limit: Page size, 1-200.
        offset: Number of hits to skip.
        category: Keep only hits whose ``category`` equals this value.
        feed_id: Keep only hits whose ``feed_id`` equals this value.
        db: Database session injected by FastAPI (not used directly here).

    Returns:
        A dict with ``query``, ``total``, ``offset``, ``limit`` and
        ``articles`` (the list of hits).
    """
    results, total = search_articles(q, limit, offset)

    # NOTE(review): this filter runs on the page that search_articles already
    # cut with limit/offset, so when a filter is active `total` is the number
    # of matches on this page only and a page may hold fewer than `limit`
    # hits. Pushing the filters into search_articles would fix that.
    if category or feed_id:
        results = [
            hit
            for hit in results
            if not (category and hit["category"] != category)
            and not (feed_id and hit["feed_id"] != feed_id)
        ]
        total = len(results)

    return {
        "query": q,
        "total": total,
        "offset": offset,
        "limit": limit,
        "articles": results,
    }
|
||||
|
||||
|
||||
@router.get("/feeds")
def get_active_feeds(
    health_status: Optional[str] = Query(None, description="按健康度筛选: healthy/warning/unhealthy/unknown"),
    category: Optional[str] = Query(None, description="按分类筛选"),
    error_type: Optional[str] = Query(None, description="按错误类型筛选"),
    is_active: Optional[bool] = Query(None, description="按启用状态筛选"),
    db: Session = Depends(get_db),
):
    """List RSS feeds, optionally narrowed by several filters.

    Args:
        health_status: Keep only feeds whose ``health_status()`` equals this.
        category: Keep only feeds in this category.
        error_type: Keep only feeds whose ``error_type`` equals this.
        is_active: Enabled-state filter; when omitted only active feeds are
            returned.
        db: Database session injected by FastAPI.

    Returns:
        A dict with ``count`` and ``feeds`` (one summary dict per feed).
    """
    # No explicit flag means "active feeds only".
    wanted_active = True if is_active is None else is_active
    feed_query = db.query(Feed).filter(Feed.is_active == wanted_active)
    if category:
        feed_query = feed_query.filter(Feed.category == category)

    payload = []
    for feed in feed_query.all():
        # health_status() is a Python-side call on the model, so this filter
        # is applied after the rows are loaded rather than in SQL.
        status = feed.health_status()
        health_ok = not health_status or status == health_status
        error_ok = not error_type or feed.error_type == error_type
        if health_ok and error_ok:
            fetched = feed.last_fetch_at
            payload.append({
                "id": feed.id,
                "title": feed.title or feed.url,
                "url": feed.url,
                "category": feed.category or "",
                "is_active": feed.is_active,
                "health_status": status,
                "error_type": feed.error_type,
                "article_count": feed.article_count,
                "last_fetch_at": fetched.isoformat() if fetched else None,
                "last_error": feed.last_error or "",
            })

    return {"count": len(payload), "feeds": payload}
|
||||
|
||||
|
||||
@@ -86,6 +158,8 @@ def get_feed_articles(
|
||||
feed_id: int,
|
||||
limit: int = 100,
|
||||
since: Optional[str] = None,
|
||||
search: Optional[str] = None,
|
||||
unread_only: bool = False,
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
"""获取指定 RSS 源的文章"""
|
||||
@@ -97,6 +171,12 @@ def get_feed_articles(
|
||||
|
||||
if since:
|
||||
query = query.filter(Article.published_at >= since)
|
||||
if search:
|
||||
query = query.filter(
|
||||
Article.title.contains(search) | Article.summary.contains(search)
|
||||
)
|
||||
if unread_only:
|
||||
query = query.filter(Article.is_read == False)
|
||||
|
||||
articles = query.order_by(desc(Article.published_at)).limit(limit).all()
|
||||
|
||||
@@ -124,6 +204,7 @@ def get_feed_articles(
|
||||
@router.get("/summary")
|
||||
def get_daily_summary(
|
||||
date: Optional[str] = None,
|
||||
category: Optional[str] = None,
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
"""获取指定日期的文章摘要统计
|
||||
@@ -141,15 +222,19 @@ def get_daily_summary(
|
||||
|
||||
query = db.query(Article, Feed.title.label("feed_title"), Feed.category.label("category")).join(Feed)
|
||||
query = query.filter(Article.created_at >= day, Article.created_at < next_day)
|
||||
if category:
|
||||
query = query.filter(Feed.category == category)
|
||||
|
||||
rows = query.order_by(desc(Article.published_at)).all()
|
||||
|
||||
# 按分类统计
|
||||
by_category = {}
|
||||
for article, feed_title, category in rows:
|
||||
cat = category or "未分类"
|
||||
if cat not in by_category:
|
||||
by_category[cat] = []
|
||||
by_category[cat].append({
|
||||
for article, feed_title, cat in rows:
|
||||
c = cat or "未分类"
|
||||
if category and c != category:
|
||||
continue
|
||||
if c not in by_category:
|
||||
by_category[c] = []
|
||||
by_category[c].append({
|
||||
"title": article.title or "",
|
||||
"link": article.link,
|
||||
"feed": feed_title or "",
|
||||
|
||||
+33
-4
@@ -13,6 +13,35 @@ from models import Feed, Article, FetchLog
|
||||
from database import SessionLocal
|
||||
import config
|
||||
|
||||
# Hosts that are fetched directly; every other host goes through the proxy.
# Entry forms understood by _is_direct_host():
#   ".cn"        leading dot  -> any host ending with that suffix
#   "zhihu.com"  bare domain  -> the domain itself and its subdomains
#   "192.168."   trailing dot -> any host starting with that prefix
CN_DOMAINS = (
    ".cn", ".com.cn", ".org.cn", ".net.cn",
    "36kr.com", "zhihu.com", "weibo.com", "douban.com", "bilibili.com",
    "tmtpost.com", "ifanr.com", "geekpark.net", "pingwest.com",
    "juejin.cn", "segmentfault.com", "cnblogs.com", "csdn.net",
    "qq.com", "163.com", "sohu.com", "sina.com.cn", "baidu.com",
    "taobao.com", "jd.com", "aliyun.com",
    "xinhuanet.com", "people.com.cn", "sciencenet.cn",
    "localhost", "127.0.0.1", "192.168.",
)


def _is_direct_host(host: str) -> bool:
    """Return True when *host* matches an entry of ``CN_DOMAINS``."""
    host = host.lower()
    for entry in CN_DOMAINS:
        if entry.endswith("."):
            # Address prefix such as "192.168." - must match the START.
            if host.startswith(entry):
                return True
        elif entry.startswith("."):
            if host.endswith(entry):
                return True
        # Bare domain: require a label boundary so "notqq.com" is not
        # mistaken for "qq.com".
        elif host == entry or host.endswith("." + entry):
            return True
    return False


def _get_proxies(url: str) -> dict:
    """Choose the ``proxies`` mapping to pass to ``requests`` for *url*.

    Args:
        url: The URL about to be fetched.

    Returns:
        ``{}`` when no proxy is configured; a mapping that disables proxying
        for hosts listed in ``CN_DOMAINS``; otherwise the configured
        ``http`` / ``https`` proxy URLs (each falling back to the other when
        only one is set).
    """
    if not (config.HTTP_PROXY or config.HTTPS_PROXY):
        return {}

    from urllib.parse import urlparse

    host = urlparse(url).hostname or ""
    if _is_direct_host(host):
        # config reads the same HTTP_PROXY / HTTPS_PROXY environment variables
        # that requests honours by default, so an empty dict would still be
        # filled in from the environment. Explicit None values opt out.
        # NOTE(review): confirm this holds across redirects with the
        # installed requests version.
        return {"http": None, "https": None}

    return {
        "http": config.HTTP_PROXY or config.HTTPS_PROXY,
        "https": config.HTTPS_PROXY or config.HTTP_PROXY,
    }
|
||||
|
||||
|
||||
def classify_error(error: str) -> str:
|
||||
"""根据错误信息分类错误类型"""
|
||||
@@ -54,10 +83,10 @@ def fetch_feed(url: str, timeout: int = config.FETCH_TIMEOUT) -> dict:
|
||||
start_time = time.time()
|
||||
try:
|
||||
headers = {
|
||||
"User-Agent": "rssKeeper/1.0 (+https://github.com/rssKeeper)",
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0 Safari/537.36",
|
||||
"Accept": "application/rss+xml, application/atom+xml, application/xml, text/xml, */*",
|
||||
}
|
||||
response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
|
||||
response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True, proxies=_get_proxies(url))
|
||||
response.raise_for_status()
|
||||
|
||||
# 解析 RSS
|
||||
@@ -87,9 +116,9 @@ def discover_feed_url(url: str, timeout: int = 15) -> list:
|
||||
"""
|
||||
try:
|
||||
headers = {
|
||||
"User-Agent": "rssKeeper/1.0 (+https://github.com/rssKeeper)",
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0 Safari/537.36",
|
||||
}
|
||||
response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
|
||||
response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True, proxies=_get_proxies(url))
|
||||
response.raise_for_status()
|
||||
|
||||
soup = BeautifulSoup(response.content, "html.parser")
|
||||
|
||||
+31
-7
@@ -1,9 +1,12 @@
|
||||
"""APScheduler 定时任务管理"""
|
||||
import logging
|
||||
from apscheduler.schedulers.background import BackgroundScheduler
|
||||
from apscheduler.triggers.interval import IntervalTrigger
|
||||
from rss_fetcher import fetch_and_store_feed
|
||||
import config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_scheduler = None
|
||||
|
||||
|
||||
@@ -11,32 +14,42 @@ def get_scheduler():
|
||||
"""获取或创建调度器实例"""
|
||||
global _scheduler
|
||||
if _scheduler is None:
|
||||
_scheduler = BackgroundScheduler()
|
||||
_scheduler = BackgroundScheduler(
|
||||
job_defaults={
|
||||
"coalesce": True,
|
||||
"max_instances": 1,
|
||||
"misfire_grace_time": 300,
|
||||
},
|
||||
logger=logger,
|
||||
)
|
||||
return _scheduler
|
||||
|
||||
|
||||
def _safe_fetch(feed_id: int):
    """Run one scheduled fetch without letting any exception escape.

    Wraps ``fetch_and_store_feed`` so that a failure for a single feed is
    logged instead of propagating out of the scheduled job callable.

    Args:
        feed_id: Primary key of the feed to fetch.
    """
    try:
        fetch_and_store_feed(feed_id)
    except Exception as e:
        # logger.exception logs at ERROR level AND attaches the traceback;
        # the message alone is rarely enough to diagnose a failed fetch.
        logger.exception(f"调度抓取失败 feed_id={feed_id}: {e}")
|
||||
|
||||
|
||||
def add_feed_job(feed_id: int, interval_minutes: int):
    """Register, or reschedule, the periodic fetch job for one feed.

    Args:
        feed_id: Primary key of the feed; the job id is derived from it.
        interval_minutes: Requested polling interval. Values below
            ``config.MIN_FETCH_INTERVAL`` are raised to that minimum.
    """
    scheduler = get_scheduler()
    job_id = f"fetch_feed_{feed_id}"

    # Never poll more often than the configured minimum.
    interval = max(interval_minutes, config.MIN_FETCH_INTERVAL)

    # An existing job only needs a new trigger.
    existing = scheduler.get_job(job_id)
    if existing:
        existing.reschedule(trigger=IntervalTrigger(minutes=interval))
        return

    # Exactly one callable is passed: the _safe_fetch wrapper, so a failing
    # fetch is logged rather than raised out of the job.
    scheduler.add_job(
        _safe_fetch,
        trigger=IntervalTrigger(minutes=interval),
        id=job_id,
        args=[feed_id],
        replace_existing=True,
        misfire_grace_time=300,  # tolerate up to 5 minutes of lateness
        coalesce=True,  # collapse missed runs into one
    )
|
||||
|
||||
|
||||
@@ -55,6 +68,7 @@ def start_scheduler():
|
||||
scheduler = get_scheduler()
|
||||
if not scheduler.running:
|
||||
scheduler.start()
|
||||
logger.info("调度器已启动")
|
||||
|
||||
|
||||
def stop_scheduler():
|
||||
@@ -65,6 +79,15 @@ def stop_scheduler():
|
||||
_scheduler = None
|
||||
|
||||
|
||||
def scheduler_status():
    """Report the scheduler state for the health check.

    Returns:
        ``{"running": bool, "jobs": int}``. ``jobs`` is the number of
        registered jobs, or 0 when no scheduler is running.
    """
    # Read the module-level instance directly: get_scheduler() lazily builds
    # a new BackgroundScheduler when none exists, and a read-only status
    # probe should not create one as a side effect.
    scheduler = _scheduler
    if scheduler is None or not scheduler.running:
        return {"running": False, "jobs": 0}
    return {"running": True, "jobs": len(scheduler.get_jobs())}
|
||||
|
||||
|
||||
def get_feed_next_run(feed_id: int):
|
||||
"""获取指定 RSS 源的下一次抓取时间"""
|
||||
scheduler = get_scheduler()
|
||||
@@ -81,3 +104,4 @@ def init_feed_jobs(db):
|
||||
for feed in feeds:
|
||||
add_feed_job(feed.id, feed.fetch_interval_minutes or config.DEFAULT_FETCH_INTERVAL)
|
||||
start_scheduler()
|
||||
logger.info(f"已注册 {len(feeds)} 个定时抓取任务")
|
||||
|
||||
Reference in New Issue
Block a user