feat: init rssKeeper - RSS 抓取、管理与检索系统

完整功能包括:
- FastAPI 后端 + SQLite + FTS5 全文搜索
- RSS 源管理、自动发现、OPML 导入导出
- 文章抓取、去重、分类、全文检索
- RSS 源健康度监控
- Vue 3 + Element Plus 暗色主题 Web UI
- 对外 REST API 供 AI 分析调用
- Docker + docker-compose 部署
This commit is contained in:
congsh
2026-06-11 14:03:36 +08:00
commit 54e7db0ef0
28 changed files with 2915 additions and 0 deletions
+26
View File
@@ -0,0 +1,26 @@
"""配置管理 - 环境变量 + 默认值"""
import os
from pathlib import Path
# 项目根目录
BASE_DIR = Path(__file__).parent
DATA_DIR = Path(os.getenv("DATA_DIR", "/app/data"))
DATA_DIR.mkdir(parents=True, exist_ok=True)
# 数据库
DATABASE_URL = os.getenv("DATABASE_URL", str(DATA_DIR / "rsskeeper.db"))
# RSS 抓取配置
FETCH_CONCURRENCY = int(os.getenv("FETCH_CONCURRENCY", "10"))
FETCH_TIMEOUT = int(os.getenv("FETCH_TIMEOUT", "30"))
DEFAULT_FETCH_INTERVAL = int(os.getenv("DEFAULT_FETCH_INTERVAL", "60")) # 分钟
MIN_FETCH_INTERVAL = int(os.getenv("MIN_FETCH_INTERVAL", "15")) # 最小间隔15分钟
# 内容处理
MAX_ARTICLE_CONTENT_LENGTH = int(os.getenv("MAX_ARTICLE_CONTENT_LENGTH", "50000"))
MAX_SUMMARY_LENGTH = int(os.getenv("MAX_SUMMARY_LENGTH", "500"))
ARTICLE_RETENTION_DAYS = int(os.getenv("ARTICLE_RETENTION_DAYS", "0")) # 0 = 永久保留
# API 配置
API_PREFIX = "/api"
EXTERNAL_API_PREFIX = "/api/v1/external"
+89
View File
@@ -0,0 +1,89 @@
"""数据库连接与初始化"""
from sqlalchemy import create_engine, event
from sqlalchemy.orm import sessionmaker, declarative_base
from config import DATABASE_URL
# SQLite 连接
engine = create_engine(
f"sqlite:///{DATABASE_URL}",
connect_args={"check_same_thread": False},
echo=False,
)
# 启用 SQLite 外键约束
@event.listens_for(engine, "connect")
def set_sqlite_pragma(dbapi_conn, connection_record):
cursor = dbapi_conn.cursor()
cursor.execute("PRAGMA foreign_keys=ON")
cursor.close()
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
Base = declarative_base()
def get_db():
"""FastAPI 依赖注入用"""
db = SessionLocal()
try:
yield db
finally:
db.close()
def init_db():
"""初始化数据库表"""
from models import Feed, Article, FetchLog # noqa
Base.metadata.create_all(bind=engine)
init_fts5()
def init_fts5():
"""初始化 FTS5 全文搜索虚拟表"""
conn = engine.raw_connection()
cursor = conn.cursor()
# 检查 FTS5 扩展是否可用
try:
cursor.execute("SELECT sqlite_compileoption_used('ENABLE_FTS5')")
has_fts5 = cursor.fetchone()[0]
if not has_fts5:
print("警告: SQLite 未启用 FTS5 扩展,全文搜索将不可用")
return
except Exception:
pass
# 创建 FTS5 虚拟表
cursor.execute("""
CREATE VIRTUAL TABLE IF NOT EXISTS articles_fts USING fts5(
title, content,
content='articles',
content_rowid='id'
)
""")
# 创建触发器,自动同步 articles 表到 FTS5
cursor.execute("""
CREATE TRIGGER IF NOT EXISTS articles_fts_insert AFTER INSERT ON articles BEGIN
INSERT INTO articles_fts(rowid, title, content)
VALUES (new.id, new.title, new.content);
END
""")
cursor.execute("""
CREATE TRIGGER IF NOT EXISTS articles_fts_delete AFTER DELETE ON articles BEGIN
INSERT INTO articles_fts(articles_fts, rowid, title, content)
VALUES ('delete', old.id, old.title, old.content);
END
""")
cursor.execute("""
CREATE TRIGGER IF NOT EXISTS articles_fts_update AFTER UPDATE ON articles BEGIN
INSERT INTO articles_fts(articles_fts, rowid, title, content)
VALUES ('delete', old.id, old.title, old.content);
INSERT INTO articles_fts(rowid, title, content)
VALUES (new.id, new.title, new.content);
END
""")
conn.commit()
cursor.close()
conn.close()
+81
View File
@@ -0,0 +1,81 @@
"""SQLite FTS5 全文搜索封装"""
from sqlalchemy import text
from database import engine
def search_articles(query: str, limit: int = 50, offset: int = 0):
"""全文搜索文章
返回 [(article_id, title, content_snippet, rank), ...]
"""
if not query or not query.strip():
return [], 0
# 转义 FTS5 特殊字符
query = query.replace('"', '""').strip()
conn = engine.raw_connection()
cursor = conn.cursor()
try:
# 使用 FTS5 查询
sql = """
SELECT a.id, a.title, a.summary, a.link, a.published_at, a.created_at,
f.id as feed_id, f.title as feed_title, f.category,
rank
FROM articles_fts
JOIN articles a ON articles_fts.rowid = a.id
JOIN feeds f ON a.feed_id = f.id
WHERE articles_fts MATCH ?
ORDER BY rank
LIMIT ? OFFSET ?
"""
cursor.execute(sql, (query, limit, offset))
rows = cursor.fetchall()
# 获取总数
count_sql = """
SELECT COUNT(*) FROM articles_fts WHERE articles_fts MATCH ?
"""
cursor.execute(count_sql, (query,))
total = cursor.fetchone()[0]
results = []
for row in rows:
results.append({
"id": row[0],
"title": row[1],
"summary": row[2],
"link": row[3],
"published_at": row[4],
"created_at": row[5],
"feed_id": row[6],
"feed_title": row[7],
"category": row[8],
})
return results, total
except Exception as e:
# FTS5 查询失败时返回空结果
return [], 0
finally:
cursor.close()
conn.close()
def rebuild_fts_index():
"""重建 FTS5 索引(数据不一致时使用)"""
conn = engine.raw_connection()
cursor = conn.cursor()
try:
cursor.execute("DELETE FROM articles_fts")
cursor.execute("""
INSERT INTO articles_fts(rowid, title, content)
SELECT id, title, content FROM articles
""")
conn.commit()
return True
except Exception:
return False
finally:
cursor.close()
conn.close()
+112
View File
@@ -0,0 +1,112 @@
"""RSS 源健康度检测"""
from datetime import datetime, timedelta
from typing import List, Dict
from sqlalchemy.orm import Session
from models import Feed, FetchLog
def get_feed_health(db: Session, feed_id: int = None) -> List[Dict]:
"""获取 RSS 源健康度信息
返回每个源的健康状态详情
"""
query = db.query(Feed)
if feed_id:
query = query.filter(Feed.id == feed_id)
feeds = query.all()
results = []
for feed in feeds:
total = feed.success_count + feed.fail_count
success_rate = round(feed.success_count / total * 100, 1) if total > 0 else 0
days_since_fetch = None
if feed.last_fetch_at:
days_since_fetch = (datetime.utcnow() - feed.last_fetch_at).days
# 获取最近 7 天抓取记录
recent_logs = db.query(FetchLog).filter(
FetchLog.feed_id == feed.id,
FetchLog.created_at >= datetime.utcnow() - timedelta(days=7)
).order_by(FetchLog.created_at.desc()).limit(10).all()
health = feed.health_status()
results.append({
"id": feed.id,
"title": feed.title or feed.url,
"url": feed.url,
"is_active": feed.is_active,
"health_status": health,
"health_label": _health_label(health),
"success_rate": success_rate,
"success_count": feed.success_count,
"fail_count": feed.fail_count,
"total_fetches": total,
"last_fetch_at": feed.last_fetch_at.isoformat() if feed.last_fetch_at else None,
"days_since_fetch": days_since_fetch,
"article_count": feed.article_count,
"last_error": feed.last_error,
"recent_logs": [
{
"status": log.status,
"articles_fetched": log.articles_fetched,
"response_time_ms": log.response_time_ms,
"created_at": log.created_at.isoformat(),
"error_message": log.error_message if log.status == "fail" else None,
}
for log in recent_logs
],
})
return results
def _health_label(status: str) -> str:
labels = {
"healthy": "健康",
"warning": "警告",
"unhealthy": "异常",
"unknown": "未知",
}
return labels.get(status, "未知")
def get_overall_stats(db: Session) -> Dict:
"""获取整体统计信息"""
total_feeds = db.query(Feed).count()
active_feeds = db.query(Feed).filter(Feed.is_active == True).count()
total_articles = db.query(Feed).with_entities(Feed.article_count).all()
total_articles_count = sum(a[0] for a in total_articles) if total_articles else 0
# 健康源统计
feeds = db.query(Feed).all()
healthy = warning = unhealthy = 0
for feed in feeds:
status = feed.health_status()
if status == "healthy":
healthy += 1
elif status == "warning":
warning += 1
elif status == "unhealthy":
unhealthy += 1
# 今日抓取
today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
from models import FetchLog
today_fetches = db.query(FetchLog).filter(FetchLog.created_at >= today).count()
today_success = db.query(FetchLog).filter(
FetchLog.created_at >= today, FetchLog.status == "success"
).count()
return {
"total_feeds": total_feeds,
"active_feeds": active_feeds,
"total_articles": total_articles_count,
"healthy_feeds": healthy,
"warning_feeds": warning,
"unhealthy_feeds": unhealthy,
"today_fetches": today_fetches,
"today_success": today_success,
"today_success_rate": round(today_success / today_fetches * 100, 1) if today_fetches > 0 else 0,
}
+75
View File
@@ -0,0 +1,75 @@
"""rssKeeper - FastAPI 入口"""
import os
from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from starlette.middleware.cors import CORSMiddleware
from database import init_db, SessionLocal
from scheduler import init_feed_jobs, stop_scheduler
from routers import feeds, articles, dashboard, external_api
import config
@asynccontextmanager
async def lifespan(app: FastAPI):
"""应用生命周期管理"""
# 启动时:初始化数据库 + 注册定时任务
init_db()
db = SessionLocal()
try:
init_feed_jobs(db)
finally:
db.close()
yield
# 关闭时:停止调度器
stop_scheduler()
app = FastAPI(
title="rssKeeper",
description="RSS 抓取、管理与检索系统",
version="1.0.0",
lifespan=lifespan,
)
# CORS
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# API 路由
app.include_router(feeds.router, prefix=config.API_PREFIX)
app.include_router(articles.router, prefix=config.API_PREFIX)
app.include_router(dashboard.router, prefix=config.API_PREFIX)
app.include_router(external_api.router, prefix=config.EXTERNAL_API_PREFIX)
@app.get("/api/health")
def health_check():
"""健康检查"""
return {"status": "ok", "service": "rssKeeper"}
# 静态文件服务(前端构建产物)
static_dir = os.path.join(config.BASE_DIR, "static")
if os.path.exists(static_dir):
app.mount("/static", StaticFiles(directory=static_dir), name="static")
@app.get("/{full_path:path}")
async def serve_spa(full_path: str):
"""Vue SPA 路由回退"""
# API 路由不走这里
if full_path.startswith("api/") or full_path.startswith("docs") or full_path.startswith("openapi.json"):
return {"detail": "Not found"}
index_path = os.path.join(static_dir, "index.html")
if os.path.exists(index_path):
return FileResponse(index_path)
return {"detail": "Frontend not built"}
+90
View File
@@ -0,0 +1,90 @@
"""SQLAlchemy 数据模型"""
from datetime import datetime
from sqlalchemy import Column, Integer, String, Text, Boolean, DateTime, ForeignKey
from sqlalchemy.orm import relationship
from database import Base
class Feed(Base):
"""RSS 源"""
__tablename__ = "feeds"
id = Column(Integer, primary_key=True, index=True)
url = Column(String(2048), unique=True, nullable=False, index=True)
title = Column(String(512), default="")
description = Column(Text, default="")
category = Column(String(128), default="")
is_active = Column(Boolean, default=True, index=True)
fetch_interval_minutes = Column(Integer, default=60)
# 抓取统计
last_fetch_at = Column(DateTime, nullable=True)
last_fetch_status = Column(String(20), default="")
last_error = Column(Text, default="")
success_count = Column(Integer, default=0)
fail_count = Column(Integer, default=0)
article_count = Column(Integer, default=0)
created_at = Column(DateTime, default=datetime.utcnow)
# 关联
articles = relationship("Article", back_populates="feed", cascade="all, delete-orphan")
fetch_logs = relationship("FetchLog", back_populates="feed", cascade="all, delete-orphan")
def health_status(self):
"""计算健康度
🟢 健康: 成功率 >= 90%, 最近7天有更新
🟡 警告: 成功率 50%-90%, 或超过3天未更新
🔴 异常: 成功率 < 50%, 或超过7天未更新
"""
total = self.success_count + self.fail_count
if total == 0:
return "unknown"
success_rate = self.success_count / total
days_since_last_fetch = None
if self.last_fetch_at:
days_since_last_fetch = (datetime.utcnow() - self.last_fetch_at).days
if success_rate >= 0.9 and (days_since_last_fetch is None or days_since_last_fetch <= 7):
return "healthy"
elif success_rate >= 0.5 and (days_since_last_fetch is None or days_since_last_fetch <= 7):
return "warning"
else:
return "unhealthy"
class Article(Base):
"""RSS 文章"""
__tablename__ = "articles"
id = Column(Integer, primary_key=True, index=True)
feed_id = Column(Integer, ForeignKey("feeds.id", ondelete="CASCADE"), nullable=False, index=True)
title = Column(String(1024), default="", index=True)
link = Column(String(2048), unique=True, nullable=False, index=True)
author = Column(String(256), default="")
published_at = Column(DateTime, nullable=True, index=True)
content = Column(Text, default="")
summary = Column(Text, default="")
is_read = Column(Boolean, default=False)
created_at = Column(DateTime, default=datetime.utcnow, index=True)
# 关联
feed = relationship("Feed", back_populates="articles")
class FetchLog(Base):
"""抓取日志"""
__tablename__ = "fetch_logs"
id = Column(Integer, primary_key=True, index=True)
feed_id = Column(Integer, ForeignKey("feeds.id", ondelete="CASCADE"), nullable=False, index=True)
status = Column(String(20), nullable=False) # success / fail
articles_fetched = Column(Integer, default=0)
error_message = Column(Text, default="")
response_time_ms = Column(Integer, nullable=True)
created_at = Column(DateTime, default=datetime.utcnow, index=True)
# 关联
feed = relationship("Feed", back_populates="fetch_logs")
+9
View File
@@ -0,0 +1,9 @@
fastapi>=0.110.0
uvicorn[standard]>=0.29.0
sqlalchemy>=2.0.0
pydantic>=2.6.0
feedparser>=6.0.11
requests>=2.31.0
beautifulsoup4>=4.12.0
apscheduler>=3.10.4
lxml>=5.1.0
+133
View File
@@ -0,0 +1,133 @@
"""文章管理 API"""
from typing import Optional
from fastapi import APIRouter, Depends
from pydantic import BaseModel
from sqlalchemy.orm import Session
from sqlalchemy import desc
from database import get_db
from models import Article, Feed
from fulltext_search import search_articles
router = APIRouter(prefix="/articles", tags=["articles"])
class ArticleOut(BaseModel):
id: int
feed_id: int
title: str
link: str
author: str
published_at: Optional[str]
summary: str
is_read: bool
created_at: str
feed_title: str
category: str
class Config:
from_attributes = True
@router.get("")
def list_articles(
skip: int = 0,
limit: int = 50,
feed_id: Optional[int] = None,
category: Optional[str] = None,
search: Optional[str] = None,
since: Optional[str] = None,
until: Optional[str] = None,
is_read: Optional[bool] = None,
db: Session = Depends(get_db),
):
"""获取文章列表,支持多种筛选条件"""
# 如果有搜索关键词,使用 FTS5 全文搜索
if search and search.strip():
results, total = search_articles(search.strip(), limit=limit, offset=skip)
return {"total": total, "items": results}
query = db.query(Article, Feed.title.label("feed_title"), Feed.category.label("category")).join(Feed)
if feed_id:
query = query.filter(Article.feed_id == feed_id)
if category:
query = query.filter(Feed.category == category)
if is_read is not None:
query = query.filter(Article.is_read == is_read)
if since:
query = query.filter(Article.published_at >= since)
if until:
query = query.filter(Article.published_at <= until)
total = query.count()
rows = query.order_by(desc(Article.published_at)).offset(skip).limit(limit).all()
items = []
for article, feed_title, category in rows:
items.append({
"id": article.id,
"feed_id": article.feed_id,
"title": article.title or "",
"link": article.link,
"author": article.author or "",
"published_at": article.published_at.isoformat() if article.published_at else None,
"summary": article.summary or "",
"is_read": article.is_read,
"created_at": article.created_at.isoformat(),
"feed_title": feed_title or "",
"category": category or "",
})
return {"total": total, "items": items}
@router.get("/{article_id}")
def get_article(article_id: int, db: Session = Depends(get_db)):
"""获取文章详情"""
article = db.query(Article).filter(Article.id == article_id).first()
if not article:
raise HTTPException(status_code=404, detail="文章不存在")
feed = db.query(Feed).filter(Feed.id == article.feed_id).first()
return {
"id": article.id,
"feed_id": article.feed_id,
"title": article.title or "",
"link": article.link,
"author": article.author or "",
"published_at": article.published_at.isoformat() if article.published_at else None,
"content": article.content or "",
"summary": article.summary or "",
"is_read": article.is_read,
"created_at": article.created_at.isoformat(),
"feed_title": feed.title if feed else "",
"category": feed.category if feed else "",
}
@router.put("/{article_id}/read")
def mark_read(article_id: int, db: Session = Depends(get_db)):
"""标记文章为已读"""
article = db.query(Article).filter(Article.id == article_id).first()
if not article:
raise HTTPException(status_code=404, detail="文章不存在")
article.is_read = True
db.commit()
return {"message": "已标记为已读"}
@router.get("/search/fulltext")
def fulltext_search(
q: str,
skip: int = 0,
limit: int = 50,
):
"""全文搜索文章"""
results, total = search_articles(q, limit=limit, offset=skip)
return {"total": total, "items": results}
from fastapi import HTTPException
+58
View File
@@ -0,0 +1,58 @@
"""仪表盘统计 API"""
from fastapi import APIRouter, Depends
from sqlalchemy.orm import Session
from database import get_db
from health_checker import get_overall_stats, get_feed_health
router = APIRouter(prefix="/dashboard", tags=["dashboard"])
@router.get("/stats")
def dashboard_stats(db: Session = Depends(get_db)):
"""仪表盘统计数据"""
return get_overall_stats(db)
@router.get("/health")
def dashboard_health(
skip: int = 0,
limit: int = 100,
db: Session = Depends(get_db),
):
"""RSS 源健康度列表"""
all_health = get_feed_health(db)
total = len(all_health)
# 按健康状态排序:异常在前
status_order = {"unhealthy": 0, "warning": 1, "unknown": 2, "healthy": 3}
all_health.sort(key=lambda x: status_order.get(x["health_status"], 2))
items = all_health[skip:skip + limit]
return {"total": total, "items": items}
@router.get("/recent-activity")
def recent_activity(limit: int = 20, db: Session = Depends(get_db)):
"""最近的抓取活动"""
from models import FetchLog, Feed
from sqlalchemy import desc
logs = db.query(FetchLog, Feed.title.label("feed_title")).join(Feed).order_by(
desc(FetchLog.created_at)
).limit(limit).all()
return {
"items": [
{
"id": log.id,
"feed_id": log.feed_id,
"feed_title": feed_title or "",
"status": log.status,
"articles_fetched": log.articles_fetched,
"response_time_ms": log.response_time_ms,
"error_message": log.error_message,
"created_at": log.created_at.isoformat(),
}
for log, feed_title in logs
]
}
+163
View File
@@ -0,0 +1,163 @@
"""对外 API(供 AI/外部系统调用)"""
from typing import Optional
from datetime import datetime, timedelta
from fastapi import APIRouter, Depends
from sqlalchemy.orm import Session
from sqlalchemy import desc
from database import get_db
from models import Article, Feed
router = APIRouter(prefix="/external", tags=["external"])
@router.get("/recent")
def get_recent_articles(
hours: int = 24,
limit: int = 50,
feed_id: Optional[int] = None,
category: Optional[str] = None,
db: Session = Depends(get_db),
):
"""获取最近 N 小时的文章
这是对外提供给 AI 分析的主要接口
"""
since = datetime.utcnow() - timedelta(hours=hours)
query = db.query(Article, Feed.title.label("feed_title"), Feed.category.label("category")).join(Feed)
query = query.filter(Article.created_at >= since)
if feed_id:
query = query.filter(Article.feed_id == feed_id)
if category:
query = query.filter(Feed.category == category)
rows = query.order_by(desc(Article.published_at)).limit(limit).all()
return {
"query": {
"hours": hours,
"limit": limit,
"feed_id": feed_id,
"category": category,
},
"count": len(rows),
"articles": [
{
"id": article.id,
"title": article.title or "",
"link": article.link,
"author": article.author or "",
"summary": article.summary or "",
"content": article.content or "" if len(article.content or "") < 10000 else article.summary or "",
"published_at": article.published_at.isoformat() if article.published_at else None,
"created_at": article.created_at.isoformat(),
"feed_title": feed_title or "",
"category": category or "",
}
for article, feed_title, category in rows
],
}
@router.get("/feeds")
def get_active_feeds(db: Session = Depends(get_db)):
"""获取所有活跃的 RSS 源列表"""
feeds = db.query(Feed).filter(Feed.is_active == True).all()
return {
"count": len(feeds),
"feeds": [
{
"id": feed.id,
"title": feed.title or feed.url,
"url": feed.url,
"category": feed.category or "",
"article_count": feed.article_count,
"last_fetch_at": feed.last_fetch_at.isoformat() if feed.last_fetch_at else None,
}
for feed in feeds
],
}
@router.get("/feeds/{feed_id}/articles")
def get_feed_articles(
feed_id: int,
limit: int = 100,
since: Optional[str] = None,
db: Session = Depends(get_db),
):
"""获取指定 RSS 源的文章"""
feed = db.query(Feed).filter(Feed.id == feed_id).first()
if not feed:
return {"error": "Feed not found"}
query = db.query(Article).filter(Article.feed_id == feed_id)
if since:
query = query.filter(Article.published_at >= since)
articles = query.order_by(desc(Article.published_at)).limit(limit).all()
return {
"feed": {
"id": feed.id,
"title": feed.title or feed.url,
"url": feed.url,
},
"count": len(articles),
"articles": [
{
"id": article.id,
"title": article.title or "",
"link": article.link,
"author": article.author or "",
"summary": article.summary or "",
"published_at": article.published_at.isoformat() if article.published_at else None,
}
for article in articles
],
}
@router.get("/summary")
def get_daily_summary(
date: Optional[str] = None,
db: Session = Depends(get_db),
):
"""获取指定日期的文章摘要统计
供 AI 快速了解某天的 RSS 内容概况
"""
if date:
try:
day = datetime.strptime(date, "%Y-%m-%d")
next_day = day + timedelta(days=1)
except ValueError:
return {"error": "Invalid date format, use YYYY-MM-DD"}
else:
day = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
next_day = day + timedelta(days=1)
query = db.query(Article, Feed.title.label("feed_title"), Feed.category.label("category")).join(Feed)
query = query.filter(Article.created_at >= day, Article.created_at < next_day)
rows = query.order_by(desc(Article.published_at)).all()
# 按分类统计
by_category = {}
for article, feed_title, category in rows:
cat = category or "未分类"
if cat not in by_category:
by_category[cat] = []
by_category[cat].append({
"title": article.title or "",
"link": article.link,
"feed": feed_title or "",
"summary": article.summary or "",
})
return {
"date": day.strftime("%Y-%m-%d"),
"total_articles": len(rows),
"by_category": by_category,
}
+273
View File
@@ -0,0 +1,273 @@
"""RSS 源管理 API"""
from typing import List, Optional
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel, HttpUrl
from sqlalchemy.orm import Session
from database import get_db
from models import Feed
from rss_fetcher import discover_feed_url, fetch_and_store_feed
from scheduler import add_feed_job, remove_feed_job
router = APIRouter(prefix="/feeds", tags=["feeds"])
class FeedCreate(BaseModel):
url: str
title: Optional[str] = ""
description: Optional[str] = ""
category: Optional[str] = ""
is_active: Optional[bool] = True
fetch_interval_minutes: Optional[int] = 60
class FeedUpdate(BaseModel):
title: Optional[str] = None
description: Optional[str] = None
category: Optional[str] = None
is_active: Optional[bool] = None
fetch_interval_minutes: Optional[int] = None
class FeedOut(BaseModel):
id: int
url: str
title: str
description: str
category: str
is_active: bool
fetch_interval_minutes: int
last_fetch_at: Optional[str] = None
last_fetch_status: str
success_count: int
fail_count: int
article_count: int
health_status: str
created_at: str
class Config:
from_attributes = True
@router.get("", response_model=dict)
def list_feeds(
skip: int = 0,
limit: int = 100,
category: Optional[str] = None,
search: Optional[str] = None,
is_active: Optional[bool] = None,
db: Session = Depends(get_db),
):
"""获取 RSS 源列表,支持分页、分类筛选、搜索"""
query = db.query(Feed)
if category:
query = query.filter(Feed.category == category)
if is_active is not None:
query = query.filter(Feed.is_active == is_active)
if search:
query = query.filter(
Feed.title.contains(search) | Feed.url.contains(search) | Feed.description.contains(search)
)
total = query.count()
feeds = query.order_by(Feed.created_at.desc()).offset(skip).limit(limit).all()
results = []
for feed in feeds:
data = {
"id": feed.id,
"url": feed.url,
"title": feed.title or feed.url,
"description": feed.description or "",
"category": feed.category or "",
"is_active": feed.is_active,
"fetch_interval_minutes": feed.fetch_interval_minutes,
"last_fetch_at": feed.last_fetch_at.isoformat() if feed.last_fetch_at else None,
"last_fetch_status": feed.last_fetch_status,
"success_count": feed.success_count,
"fail_count": feed.fail_count,
"article_count": feed.article_count,
"health_status": feed.health_status(),
"created_at": feed.created_at.isoformat(),
}
results.append(data)
return {"total": total, "items": results}
@router.get("/categories")
def list_categories(db: Session = Depends(get_db)):
"""获取所有分类列表"""
categories = db.query(Feed.category).filter(Feed.category != "").distinct().all()
return [c[0] for c in categories if c[0]]
@router.post("", response_model=dict)
def create_feed(data: FeedCreate, db: Session = Depends(get_db)):
"""添加 RSS 源"""
# 检查是否已存在
existing = db.query(Feed).filter(Feed.url == data.url).first()
if existing:
raise HTTPException(status_code=409, detail="该 RSS 源已存在")
feed = Feed(
url=data.url,
title=data.title or "",
description=data.description or "",
category=data.category or "",
is_active=data.is_active,
fetch_interval_minutes=data.fetch_interval_minutes or 60,
)
db.add(feed)
db.commit()
db.refresh(feed)
# 注册定时任务
if feed.is_active:
add_feed_job(feed.id, feed.fetch_interval_minutes)
# 立即抓取一次
fetch_and_store_feed(feed.id)
return {"id": feed.id, "message": "RSS 源添加成功", "url": feed.url}
@router.post("/discover")
def discover_feed(url: str, db: Session = Depends(get_db)):
"""从网页自动发现 RSS feed URL"""
feed_urls = discover_feed_url(url)
return {"source_url": url, "found_feeds": feed_urls}
@router.get("/{feed_id}", response_model=dict)
def get_feed(feed_id: int, db: Session = Depends(get_db)):
"""获取 RSS 源详情"""
feed = db.query(Feed).filter(Feed.id == feed_id).first()
if not feed:
raise HTTPException(status_code=404, detail="RSS 源不存在")
return {
"id": feed.id,
"url": feed.url,
"title": feed.title or feed.url,
"description": feed.description or "",
"category": feed.category or "",
"is_active": feed.is_active,
"fetch_interval_minutes": feed.fetch_interval_minutes,
"last_fetch_at": feed.last_fetch_at.isoformat() if feed.last_fetch_at else None,
"last_fetch_status": feed.last_fetch_status,
"last_error": feed.last_error,
"success_count": feed.success_count,
"fail_count": feed.fail_count,
"article_count": feed.article_count,
"health_status": feed.health_status(),
"created_at": feed.created_at.isoformat(),
}
@router.put("/{feed_id}", response_model=dict)
def update_feed(feed_id: int, data: FeedUpdate, db: Session = Depends(get_db)):
"""更新 RSS 源"""
feed = db.query(Feed).filter(Feed.id == feed_id).first()
if not feed:
raise HTTPException(status_code=404, detail="RSS 源不存在")
if data.title is not None:
feed.title = data.title
if data.description is not None:
feed.description = data.description
if data.category is not None:
feed.category = data.category
if data.is_active is not None:
feed.is_active = data.is_active
if feed.is_active:
add_feed_job(feed.id, feed.fetch_interval_minutes)
else:
remove_feed_job(feed.id)
if data.fetch_interval_minutes is not None:
feed.fetch_interval_minutes = data.fetch_interval_minutes
if feed.is_active:
add_feed_job(feed.id, feed.fetch_interval_minutes)
db.commit()
return {"message": "RSS 源更新成功"}
@router.delete("/{feed_id}")
def delete_feed(feed_id: int, db: Session = Depends(get_db)):
"""删除 RSS 源(级联删除文章和日志)"""
feed = db.query(Feed).filter(Feed.id == feed_id).first()
if not feed:
raise HTTPException(status_code=404, detail="RSS 源不存在")
remove_feed_job(feed_id)
db.delete(feed)
db.commit()
return {"message": "RSS 源已删除"}
@router.post("/{feed_id}/fetch")
def trigger_fetch(feed_id: int, db: Session = Depends(get_db)):
"""手动触发抓取"""
feed = db.query(Feed).filter(Feed.id == feed_id).first()
if not feed:
raise HTTPException(status_code=404, detail="RSS 源不存在")
result = fetch_and_store_feed(feed_id)
return result
@router.post("/import-opml")
def import_opml(opml_content: str, db: Session = Depends(get_db)):
"""导入 OPML 文件内容"""
import xml.etree.ElementTree as ET
try:
root = ET.fromstring(opml_content)
except ET.ParseError:
raise HTTPException(status_code=400, detail="无效的 OPML 文件")
added = 0
skipped = 0
for outline in root.iter("outline"):
url = outline.get("xmlUrl") or outline.get("xmlurl")
if not url:
continue
existing = db.query(Feed).filter(Feed.url == url).first()
if existing:
skipped += 1
continue
feed = Feed(
url=url,
title=outline.get("title", "") or outline.get("text", ""),
description=outline.get("description", ""),
category=outline.get("category", ""),
is_active=True,
fetch_interval_minutes=60,
)
db.add(feed)
db.commit()
db.refresh(feed)
add_feed_job(feed.id, feed.fetch_interval_minutes)
added += 1
return {"added": added, "skipped": skipped, "message": f"成功导入 {added} 个 RSS 源"}
@router.get("/export-opml")
def export_opml(db: Session = Depends(get_db)):
"""导出 OPML 文件内容"""
feeds = db.query(Feed).all()
lines = ['<?xml version="1.0" encoding="UTF-8"?>', '<opml version="2.0">', '<head><title>rssKeeper Feeds</title></head>', '<body>']
for feed in feeds:
title = (feed.title or feed.url).replace('"', '&quot;')
lines.append(f' <outline type="rss" text="{title}" xmlUrl="{feed.url}" />')
lines.append('</body>')
lines.append('</opml>')
return {"opml": "\n".join(lines)}
+298
View File
@@ -0,0 +1,298 @@
"""RSS 抓取核心逻辑"""
import time
import re
import html
import hashlib
from datetime import datetime, timezone
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin
import requests
import feedparser
from bs4 import BeautifulSoup
from sqlalchemy.orm import Session
from models import Feed, Article, FetchLog
from database import SessionLocal
import config
def fetch_feed(url: str, timeout: int = config.FETCH_TIMEOUT) -> dict:
"""抓取单个 RSS 源
返回 {"success": bool, "feed_data": parsed, "error": str, "response_time_ms": int}
"""
start_time = time.time()
try:
headers = {
"User-Agent": "rssKeeper/1.0 (+https://github.com/rssKeeper)",
"Accept": "application/rss+xml, application/atom+xml, application/xml, text/xml, */*",
}
response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
response.raise_for_status()
# 解析 RSS
parsed = feedparser.parse(response.content)
response_time_ms = int((time.time() - start_time) * 1000)
if parsed.bozo and hasattr(parsed, 'bozo_exception'):
# 有解析警告但可能仍然可用
pass
return {
"success": True,
"feed_data": parsed,
"error": None,
"response_time_ms": response_time_ms,
}
except requests.exceptions.RequestException as e:
return {"success": False, "feed_data": None, "error": str(e), "response_time_ms": None}
except Exception as e:
return {"success": False, "feed_data": None, "error": str(e), "response_time_ms": None}
def discover_feed_url(url: str, timeout: int = 15) -> list:
"""从任意网页自动发现 RSS/Atom feed URL
返回找到的 feed URL 列表
"""
try:
headers = {
"User-Agent": "rssKeeper/1.0 (+https://github.com/rssKeeper)",
}
response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
feed_urls = []
# 查找 <link rel="alternate"> 标签
for link in soup.find_all("link", rel="alternate"):
link_type = link.get("type", "").lower()
href = link.get("href", "")
if href and any(t in link_type for t in ["rss", "atom", "xml"]):
full_url = urljoin(response.url, href)
feed_urls.append(full_url)
# 也查找常见的 RSS 链接
common_patterns = [
"/rss", "/feed", "/feeds", "/atom.xml", "/rss.xml",
"/index.xml", "/feed.xml", "/?feed=rss2",
]
for pattern in common_patterns:
candidate = urljoin(response.url, pattern)
if candidate not in feed_urls:
# 验证是否是有效的 feed
try:
resp = requests.head(candidate, headers=headers, timeout=5, allow_redirects=True)
content_type = resp.headers.get("Content-Type", "").lower()
if any(t in content_type for t in ["rss", "atom", "xml"]):
feed_urls.append(candidate)
except Exception:
pass
return list(dict.fromkeys(feed_urls)) # 去重保持顺序
except Exception:
return []
def parse_article(entry, feed_id: int) -> dict:
"""从 feedparser entry 解析文章数据"""
title = entry.get("title", "")
link = entry.get("link", "")
author = entry.get("author", "")
# 发布时间
published_at = None
if hasattr(entry, "published_parsed") and entry.published_parsed:
try:
published_at = datetime(*entry.published_parsed[:6], tzinfo=timezone.utc).replace(tzinfo=None)
except (ValueError, TypeError):
pass
if not published_at and hasattr(entry, "updated_parsed") and entry.updated_parsed:
try:
published_at = datetime(*entry.updated_parsed[:6], tzinfo=timezone.utc).replace(tzinfo=None)
except (ValueError, TypeError):
pass
# 内容:优先 summary,其次 content
content = ""
if hasattr(entry, "content") and entry.content:
content = entry.content[0].value
elif hasattr(entry, "summary"):
content = entry.summary
# 清洗 HTML
content = clean_html(content)
# 生成摘要
summary = generate_summary(content)
return {
"feed_id": feed_id,
"title": title[:1024],
"link": link[:2048],
"author": author[:256],
"published_at": published_at,
"content": content[:config.MAX_ARTICLE_CONTENT_LENGTH],
"summary": summary[:config.MAX_SUMMARY_LENGTH],
}
def clean_html(html_text: str) -> str:
"""清洗 HTML,去除 script/style 标签,转为安全文本"""
if not html_text:
return ""
# 先解码 HTML 实体
text = html.unescape(html_text)
# 用 BeautifulSoup 清理
soup = BeautifulSoup(text, "html.parser")
# 移除 script 和 style
for tag in soup(["script", "style", "iframe", "object", "embed"]):
tag.decompose()
# 获取纯文本
cleaned = soup.get_text(separator="\n")
# 压缩空白行
cleaned = re.sub(r"\n\s*\n+", "\n\n", cleaned)
cleaned = cleaned.strip()
return cleaned
def generate_summary(content: str, max_length: int = 300) -> str:
"""从内容生成摘要"""
if not content:
return ""
# 去掉多余空白
text = re.sub(r"\s+", " ", content).strip()
if len(text) <= max_length:
return text
# 在句子边界截断
truncated = text[:max_length]
last_period = max(truncated.rfind(""), truncated.rfind(". "), truncated.rfind("! "), truncated.rfind("? "))
if last_period > max_length * 0.5:
return truncated[:last_period + 1]
return truncated + "..."
def fetch_and_store_feed(feed_id: int) -> dict:
"""抓取指定 RSS 源并存储文章
返回抓取结果统计
"""
db = SessionLocal()
try:
feed = db.query(Feed).filter(Feed.id == feed_id).first()
if not feed:
return {"success": False, "error": "Feed not found", "articles_count": 0}
result = fetch_feed(feed.url)
if not result["success"]:
# 记录失败
feed.last_fetch_at = datetime.utcnow()
feed.last_fetch_status = "fail"
feed.last_error = result["error"]
feed.fail_count += 1
log = FetchLog(
feed_id=feed_id,
status="fail",
error_message=result["error"],
response_time_ms=result.get("response_time_ms"),
)
db.add(log)
db.commit()
return {"success": False, "error": result["error"], "articles_count": 0}
parsed = result["feed_data"]
# 更新 feed 元信息
if hasattr(parsed.feed, "title"):
feed.title = parsed.feed.title[:512]
if hasattr(parsed.feed, "description"):
feed.description = parsed.feed.description[:1000]
# 存储文章
new_count = 0
for entry in parsed.entries:
article_data = parse_article(entry, feed_id)
if not article_data["link"]:
continue
# 检查是否已存在(基于 link
existing = db.query(Article).filter(Article.link == article_data["link"]).first()
if existing:
# 更新已有文章
existing.title = article_data["title"] or existing.title
existing.content = article_data["content"] or existing.content
existing.summary = article_data["summary"] or existing.summary
existing.author = article_data["author"] or existing.author
if article_data["published_at"]:
existing.published_at = article_data["published_at"]
else:
article = Article(**article_data)
db.add(article)
new_count += 1
# 更新 feed 统计
feed.last_fetch_at = datetime.utcnow()
feed.last_fetch_status = "success"
feed.last_error = ""
feed.success_count += 1
feed.article_count = db.query(Article).filter(Article.feed_id == feed_id).count()
log = FetchLog(
feed_id=feed_id,
status="success",
articles_fetched=new_count,
response_time_ms=result.get("response_time_ms"),
)
db.add(log)
db.commit()
return {
"success": True,
"articles_count": new_count,
"feed_title": feed.title,
}
except Exception as e:
db.rollback()
return {"success": False, "error": str(e), "articles_count": 0}
finally:
db.close()
def fetch_all_feeds(feed_ids: list = None) -> list:
"""并发抓取多个 RSS 源
返回每个源的抓取结果列表
"""
db = SessionLocal()
try:
query = db.query(Feed).filter(Feed.is_active == True)
if feed_ids:
query = query.filter(Feed.id.in_(feed_ids))
feeds = query.all()
finally:
db.close()
results = []
with ThreadPoolExecutor(max_workers=config.FETCH_CONCURRENCY) as executor:
future_to_feed = {
executor.submit(fetch_and_store_feed, feed.id): feed
for feed in feeds
}
for future in as_completed(future_to_feed):
feed = future_to_feed[future]
try:
result = future.result()
results.append({"feed_id": feed.id, **result})
except Exception as e:
results.append({"feed_id": feed.id, "success": False, "error": str(e), "articles_count": 0})
return results
+74
View File
@@ -0,0 +1,74 @@
"""APScheduler 定时任务管理"""
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.interval import IntervalTrigger
from rss_fetcher import fetch_and_store_feed
import config
_scheduler = None
def get_scheduler():
"""获取或创建调度器实例"""
global _scheduler
if _scheduler is None:
_scheduler = BackgroundScheduler()
return _scheduler
def add_feed_job(feed_id: int, interval_minutes: int):
"""为指定 RSS 源添加定时抓取任务"""
scheduler = get_scheduler()
job_id = f"fetch_feed_{feed_id}"
# 确保间隔不低于最小值
interval = max(interval_minutes, config.MIN_FETCH_INTERVAL)
# 如果任务已存在则更新
existing = scheduler.get_job(job_id)
if existing:
existing.reschedule(trigger=IntervalTrigger(minutes=interval))
return
scheduler.add_job(
fetch_and_store_feed,
trigger=IntervalTrigger(minutes=interval),
id=job_id,
args=[feed_id],
replace_existing=True,
misfire_grace_time=300, # 5分钟容错
coalesce=True, # 合并错过的任务
)
def remove_feed_job(feed_id: int):
"""移除指定 RSS 源的定时任务"""
scheduler = get_scheduler()
job_id = f"fetch_feed_{feed_id}"
try:
scheduler.remove_job(job_id)
except Exception:
pass
def start_scheduler():
"""启动调度器"""
scheduler = get_scheduler()
if not scheduler.running:
scheduler.start()
def stop_scheduler():
"""停止调度器"""
global _scheduler
if _scheduler and _scheduler.running:
_scheduler.shutdown(wait=False)
_scheduler = None
def init_feed_jobs(db):
"""从数据库加载所有活跃 RSS 源并注册定时任务"""
from models import Feed
feeds = db.query(Feed).filter(Feed.is_active == True).all()
for feed in feeds:
add_feed_job(feed.id, feed.fetch_interval_minutes or config.DEFAULT_FETCH_INTERVAL)
start_scheduler()