feat: init rssKeeper - RSS 抓取、管理与检索系统
完整功能包括: - FastAPI 后端 + SQLite + FTS5 全文搜索 - RSS 源管理、自动发现、OPML 导入导出 - 文章抓取、去重、分类、全文检索 - RSS 源健康度监控 - Vue 3 + Element Plus 暗色主题 Web UI - 对外 REST API 供 AI 分析调用 - Docker + docker-compose 部署
This commit is contained in:
@@ -0,0 +1,26 @@
|
||||
"""配置管理 - 环境变量 + 默认值"""
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
# 项目根目录
|
||||
BASE_DIR = Path(__file__).parent
|
||||
DATA_DIR = Path(os.getenv("DATA_DIR", "/app/data"))
|
||||
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# 数据库
|
||||
DATABASE_URL = os.getenv("DATABASE_URL", str(DATA_DIR / "rsskeeper.db"))
|
||||
|
||||
# RSS 抓取配置
|
||||
FETCH_CONCURRENCY = int(os.getenv("FETCH_CONCURRENCY", "10"))
|
||||
FETCH_TIMEOUT = int(os.getenv("FETCH_TIMEOUT", "30"))
|
||||
DEFAULT_FETCH_INTERVAL = int(os.getenv("DEFAULT_FETCH_INTERVAL", "60")) # 分钟
|
||||
MIN_FETCH_INTERVAL = int(os.getenv("MIN_FETCH_INTERVAL", "15")) # 最小间隔15分钟
|
||||
|
||||
# 内容处理
|
||||
MAX_ARTICLE_CONTENT_LENGTH = int(os.getenv("MAX_ARTICLE_CONTENT_LENGTH", "50000"))
|
||||
MAX_SUMMARY_LENGTH = int(os.getenv("MAX_SUMMARY_LENGTH", "500"))
|
||||
ARTICLE_RETENTION_DAYS = int(os.getenv("ARTICLE_RETENTION_DAYS", "0")) # 0 = 永久保留
|
||||
|
||||
# API 配置
|
||||
API_PREFIX = "/api"
|
||||
EXTERNAL_API_PREFIX = "/api/v1/external"
|
||||
@@ -0,0 +1,89 @@
|
||||
"""数据库连接与初始化"""
|
||||
from sqlalchemy import create_engine, event
|
||||
from sqlalchemy.orm import sessionmaker, declarative_base
|
||||
from config import DATABASE_URL
|
||||
|
||||
# SQLite 连接
|
||||
engine = create_engine(
|
||||
f"sqlite:///{DATABASE_URL}",
|
||||
connect_args={"check_same_thread": False},
|
||||
echo=False,
|
||||
)
|
||||
|
||||
# 启用 SQLite 外键约束
|
||||
@event.listens_for(engine, "connect")
|
||||
def set_sqlite_pragma(dbapi_conn, connection_record):
|
||||
cursor = dbapi_conn.cursor()
|
||||
cursor.execute("PRAGMA foreign_keys=ON")
|
||||
cursor.close()
|
||||
|
||||
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
||||
Base = declarative_base()
|
||||
|
||||
|
||||
def get_db():
|
||||
"""FastAPI 依赖注入用"""
|
||||
db = SessionLocal()
|
||||
try:
|
||||
yield db
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
def init_db():
|
||||
"""初始化数据库表"""
|
||||
from models import Feed, Article, FetchLog # noqa
|
||||
|
||||
Base.metadata.create_all(bind=engine)
|
||||
init_fts5()
|
||||
|
||||
|
||||
def init_fts5():
|
||||
"""初始化 FTS5 全文搜索虚拟表"""
|
||||
conn = engine.raw_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
# 检查 FTS5 扩展是否可用
|
||||
try:
|
||||
cursor.execute("SELECT sqlite_compileoption_used('ENABLE_FTS5')")
|
||||
has_fts5 = cursor.fetchone()[0]
|
||||
if not has_fts5:
|
||||
print("警告: SQLite 未启用 FTS5 扩展,全文搜索将不可用")
|
||||
return
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# 创建 FTS5 虚拟表
|
||||
cursor.execute("""
|
||||
CREATE VIRTUAL TABLE IF NOT EXISTS articles_fts USING fts5(
|
||||
title, content,
|
||||
content='articles',
|
||||
content_rowid='id'
|
||||
)
|
||||
""")
|
||||
|
||||
# 创建触发器,自动同步 articles 表到 FTS5
|
||||
cursor.execute("""
|
||||
CREATE TRIGGER IF NOT EXISTS articles_fts_insert AFTER INSERT ON articles BEGIN
|
||||
INSERT INTO articles_fts(rowid, title, content)
|
||||
VALUES (new.id, new.title, new.content);
|
||||
END
|
||||
""")
|
||||
cursor.execute("""
|
||||
CREATE TRIGGER IF NOT EXISTS articles_fts_delete AFTER DELETE ON articles BEGIN
|
||||
INSERT INTO articles_fts(articles_fts, rowid, title, content)
|
||||
VALUES ('delete', old.id, old.title, old.content);
|
||||
END
|
||||
""")
|
||||
cursor.execute("""
|
||||
CREATE TRIGGER IF NOT EXISTS articles_fts_update AFTER UPDATE ON articles BEGIN
|
||||
INSERT INTO articles_fts(articles_fts, rowid, title, content)
|
||||
VALUES ('delete', old.id, old.title, old.content);
|
||||
INSERT INTO articles_fts(rowid, title, content)
|
||||
VALUES (new.id, new.title, new.content);
|
||||
END
|
||||
""")
|
||||
|
||||
conn.commit()
|
||||
cursor.close()
|
||||
conn.close()
|
||||
@@ -0,0 +1,81 @@
|
||||
"""SQLite FTS5 全文搜索封装"""
|
||||
from sqlalchemy import text
|
||||
from database import engine
|
||||
|
||||
|
||||
def search_articles(query: str, limit: int = 50, offset: int = 0):
|
||||
"""全文搜索文章
|
||||
返回 [(article_id, title, content_snippet, rank), ...]
|
||||
"""
|
||||
if not query or not query.strip():
|
||||
return [], 0
|
||||
|
||||
# 转义 FTS5 特殊字符
|
||||
query = query.replace('"', '""').strip()
|
||||
|
||||
conn = engine.raw_connection()
|
||||
cursor = conn.cursor()
|
||||
|
||||
try:
|
||||
# 使用 FTS5 查询
|
||||
sql = """
|
||||
SELECT a.id, a.title, a.summary, a.link, a.published_at, a.created_at,
|
||||
f.id as feed_id, f.title as feed_title, f.category,
|
||||
rank
|
||||
FROM articles_fts
|
||||
JOIN articles a ON articles_fts.rowid = a.id
|
||||
JOIN feeds f ON a.feed_id = f.id
|
||||
WHERE articles_fts MATCH ?
|
||||
ORDER BY rank
|
||||
LIMIT ? OFFSET ?
|
||||
"""
|
||||
cursor.execute(sql, (query, limit, offset))
|
||||
rows = cursor.fetchall()
|
||||
|
||||
# 获取总数
|
||||
count_sql = """
|
||||
SELECT COUNT(*) FROM articles_fts WHERE articles_fts MATCH ?
|
||||
"""
|
||||
cursor.execute(count_sql, (query,))
|
||||
total = cursor.fetchone()[0]
|
||||
|
||||
results = []
|
||||
for row in rows:
|
||||
results.append({
|
||||
"id": row[0],
|
||||
"title": row[1],
|
||||
"summary": row[2],
|
||||
"link": row[3],
|
||||
"published_at": row[4],
|
||||
"created_at": row[5],
|
||||
"feed_id": row[6],
|
||||
"feed_title": row[7],
|
||||
"category": row[8],
|
||||
})
|
||||
|
||||
return results, total
|
||||
except Exception as e:
|
||||
# FTS5 查询失败时返回空结果
|
||||
return [], 0
|
||||
finally:
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
|
||||
def rebuild_fts_index():
|
||||
"""重建 FTS5 索引(数据不一致时使用)"""
|
||||
conn = engine.raw_connection()
|
||||
cursor = conn.cursor()
|
||||
try:
|
||||
cursor.execute("DELETE FROM articles_fts")
|
||||
cursor.execute("""
|
||||
INSERT INTO articles_fts(rowid, title, content)
|
||||
SELECT id, title, content FROM articles
|
||||
""")
|
||||
conn.commit()
|
||||
return True
|
||||
except Exception:
|
||||
return False
|
||||
finally:
|
||||
cursor.close()
|
||||
conn.close()
|
||||
@@ -0,0 +1,112 @@
|
||||
"""RSS 源健康度检测"""
|
||||
from datetime import datetime, timedelta
|
||||
from typing import List, Dict
|
||||
from sqlalchemy.orm import Session
|
||||
from models import Feed, FetchLog
|
||||
|
||||
|
||||
def get_feed_health(db: Session, feed_id: int = None) -> List[Dict]:
|
||||
"""获取 RSS 源健康度信息
|
||||
返回每个源的健康状态详情
|
||||
"""
|
||||
query = db.query(Feed)
|
||||
if feed_id:
|
||||
query = query.filter(Feed.id == feed_id)
|
||||
|
||||
feeds = query.all()
|
||||
results = []
|
||||
|
||||
for feed in feeds:
|
||||
total = feed.success_count + feed.fail_count
|
||||
success_rate = round(feed.success_count / total * 100, 1) if total > 0 else 0
|
||||
|
||||
days_since_fetch = None
|
||||
if feed.last_fetch_at:
|
||||
days_since_fetch = (datetime.utcnow() - feed.last_fetch_at).days
|
||||
|
||||
# 获取最近 7 天抓取记录
|
||||
recent_logs = db.query(FetchLog).filter(
|
||||
FetchLog.feed_id == feed.id,
|
||||
FetchLog.created_at >= datetime.utcnow() - timedelta(days=7)
|
||||
).order_by(FetchLog.created_at.desc()).limit(10).all()
|
||||
|
||||
health = feed.health_status()
|
||||
|
||||
results.append({
|
||||
"id": feed.id,
|
||||
"title": feed.title or feed.url,
|
||||
"url": feed.url,
|
||||
"is_active": feed.is_active,
|
||||
"health_status": health,
|
||||
"health_label": _health_label(health),
|
||||
"success_rate": success_rate,
|
||||
"success_count": feed.success_count,
|
||||
"fail_count": feed.fail_count,
|
||||
"total_fetches": total,
|
||||
"last_fetch_at": feed.last_fetch_at.isoformat() if feed.last_fetch_at else None,
|
||||
"days_since_fetch": days_since_fetch,
|
||||
"article_count": feed.article_count,
|
||||
"last_error": feed.last_error,
|
||||
"recent_logs": [
|
||||
{
|
||||
"status": log.status,
|
||||
"articles_fetched": log.articles_fetched,
|
||||
"response_time_ms": log.response_time_ms,
|
||||
"created_at": log.created_at.isoformat(),
|
||||
"error_message": log.error_message if log.status == "fail" else None,
|
||||
}
|
||||
for log in recent_logs
|
||||
],
|
||||
})
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def _health_label(status: str) -> str:
|
||||
labels = {
|
||||
"healthy": "健康",
|
||||
"warning": "警告",
|
||||
"unhealthy": "异常",
|
||||
"unknown": "未知",
|
||||
}
|
||||
return labels.get(status, "未知")
|
||||
|
||||
|
||||
def get_overall_stats(db: Session) -> Dict:
|
||||
"""获取整体统计信息"""
|
||||
total_feeds = db.query(Feed).count()
|
||||
active_feeds = db.query(Feed).filter(Feed.is_active == True).count()
|
||||
total_articles = db.query(Feed).with_entities(Feed.article_count).all()
|
||||
total_articles_count = sum(a[0] for a in total_articles) if total_articles else 0
|
||||
|
||||
# 健康源统计
|
||||
feeds = db.query(Feed).all()
|
||||
healthy = warning = unhealthy = 0
|
||||
for feed in feeds:
|
||||
status = feed.health_status()
|
||||
if status == "healthy":
|
||||
healthy += 1
|
||||
elif status == "warning":
|
||||
warning += 1
|
||||
elif status == "unhealthy":
|
||||
unhealthy += 1
|
||||
|
||||
# 今日抓取
|
||||
today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
|
||||
from models import FetchLog
|
||||
today_fetches = db.query(FetchLog).filter(FetchLog.created_at >= today).count()
|
||||
today_success = db.query(FetchLog).filter(
|
||||
FetchLog.created_at >= today, FetchLog.status == "success"
|
||||
).count()
|
||||
|
||||
return {
|
||||
"total_feeds": total_feeds,
|
||||
"active_feeds": active_feeds,
|
||||
"total_articles": total_articles_count,
|
||||
"healthy_feeds": healthy,
|
||||
"warning_feeds": warning,
|
||||
"unhealthy_feeds": unhealthy,
|
||||
"today_fetches": today_fetches,
|
||||
"today_success": today_success,
|
||||
"today_success_rate": round(today_success / today_fetches * 100, 1) if today_fetches > 0 else 0,
|
||||
}
|
||||
@@ -0,0 +1,75 @@
|
||||
"""rssKeeper - FastAPI 入口"""
|
||||
import os
|
||||
from contextlib import asynccontextmanager
|
||||
from fastapi import FastAPI
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.responses import FileResponse
|
||||
from starlette.middleware.cors import CORSMiddleware
|
||||
from database import init_db, SessionLocal
|
||||
from scheduler import init_feed_jobs, stop_scheduler
|
||||
from routers import feeds, articles, dashboard, external_api
|
||||
import config
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
"""应用生命周期管理"""
|
||||
# 启动时:初始化数据库 + 注册定时任务
|
||||
init_db()
|
||||
db = SessionLocal()
|
||||
try:
|
||||
init_feed_jobs(db)
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
yield
|
||||
|
||||
# 关闭时:停止调度器
|
||||
stop_scheduler()
|
||||
|
||||
|
||||
app = FastAPI(
|
||||
title="rssKeeper",
|
||||
description="RSS 抓取、管理与检索系统",
|
||||
version="1.0.0",
|
||||
lifespan=lifespan,
|
||||
)
|
||||
|
||||
# CORS
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
# API 路由
|
||||
app.include_router(feeds.router, prefix=config.API_PREFIX)
|
||||
app.include_router(articles.router, prefix=config.API_PREFIX)
|
||||
app.include_router(dashboard.router, prefix=config.API_PREFIX)
|
||||
app.include_router(external_api.router, prefix=config.EXTERNAL_API_PREFIX)
|
||||
|
||||
|
||||
@app.get("/api/health")
|
||||
def health_check():
|
||||
"""健康检查"""
|
||||
return {"status": "ok", "service": "rssKeeper"}
|
||||
|
||||
|
||||
# 静态文件服务(前端构建产物)
|
||||
static_dir = os.path.join(config.BASE_DIR, "static")
|
||||
if os.path.exists(static_dir):
|
||||
app.mount("/static", StaticFiles(directory=static_dir), name="static")
|
||||
|
||||
@app.get("/{full_path:path}")
|
||||
async def serve_spa(full_path: str):
|
||||
"""Vue SPA 路由回退"""
|
||||
# API 路由不走这里
|
||||
if full_path.startswith("api/") or full_path.startswith("docs") or full_path.startswith("openapi.json"):
|
||||
return {"detail": "Not found"}
|
||||
|
||||
index_path = os.path.join(static_dir, "index.html")
|
||||
if os.path.exists(index_path):
|
||||
return FileResponse(index_path)
|
||||
return {"detail": "Frontend not built"}
|
||||
@@ -0,0 +1,90 @@
|
||||
"""SQLAlchemy 数据模型"""
|
||||
from datetime import datetime
|
||||
from sqlalchemy import Column, Integer, String, Text, Boolean, DateTime, ForeignKey
|
||||
from sqlalchemy.orm import relationship
|
||||
from database import Base
|
||||
|
||||
|
||||
class Feed(Base):
|
||||
"""RSS 源"""
|
||||
__tablename__ = "feeds"
|
||||
|
||||
id = Column(Integer, primary_key=True, index=True)
|
||||
url = Column(String(2048), unique=True, nullable=False, index=True)
|
||||
title = Column(String(512), default="")
|
||||
description = Column(Text, default="")
|
||||
category = Column(String(128), default="")
|
||||
is_active = Column(Boolean, default=True, index=True)
|
||||
fetch_interval_minutes = Column(Integer, default=60)
|
||||
|
||||
# 抓取统计
|
||||
last_fetch_at = Column(DateTime, nullable=True)
|
||||
last_fetch_status = Column(String(20), default="")
|
||||
last_error = Column(Text, default="")
|
||||
success_count = Column(Integer, default=0)
|
||||
fail_count = Column(Integer, default=0)
|
||||
article_count = Column(Integer, default=0)
|
||||
|
||||
created_at = Column(DateTime, default=datetime.utcnow)
|
||||
|
||||
# 关联
|
||||
articles = relationship("Article", back_populates="feed", cascade="all, delete-orphan")
|
||||
fetch_logs = relationship("FetchLog", back_populates="feed", cascade="all, delete-orphan")
|
||||
|
||||
def health_status(self):
|
||||
"""计算健康度
|
||||
🟢 健康: 成功率 >= 90%, 最近7天有更新
|
||||
🟡 警告: 成功率 50%-90%, 或超过3天未更新
|
||||
🔴 异常: 成功率 < 50%, 或超过7天未更新
|
||||
"""
|
||||
total = self.success_count + self.fail_count
|
||||
if total == 0:
|
||||
return "unknown"
|
||||
|
||||
success_rate = self.success_count / total
|
||||
|
||||
days_since_last_fetch = None
|
||||
if self.last_fetch_at:
|
||||
days_since_last_fetch = (datetime.utcnow() - self.last_fetch_at).days
|
||||
|
||||
if success_rate >= 0.9 and (days_since_last_fetch is None or days_since_last_fetch <= 7):
|
||||
return "healthy"
|
||||
elif success_rate >= 0.5 and (days_since_last_fetch is None or days_since_last_fetch <= 7):
|
||||
return "warning"
|
||||
else:
|
||||
return "unhealthy"
|
||||
|
||||
|
||||
class Article(Base):
|
||||
"""RSS 文章"""
|
||||
__tablename__ = "articles"
|
||||
|
||||
id = Column(Integer, primary_key=True, index=True)
|
||||
feed_id = Column(Integer, ForeignKey("feeds.id", ondelete="CASCADE"), nullable=False, index=True)
|
||||
title = Column(String(1024), default="", index=True)
|
||||
link = Column(String(2048), unique=True, nullable=False, index=True)
|
||||
author = Column(String(256), default="")
|
||||
published_at = Column(DateTime, nullable=True, index=True)
|
||||
content = Column(Text, default="")
|
||||
summary = Column(Text, default="")
|
||||
is_read = Column(Boolean, default=False)
|
||||
created_at = Column(DateTime, default=datetime.utcnow, index=True)
|
||||
|
||||
# 关联
|
||||
feed = relationship("Feed", back_populates="articles")
|
||||
|
||||
|
||||
class FetchLog(Base):
|
||||
"""抓取日志"""
|
||||
__tablename__ = "fetch_logs"
|
||||
|
||||
id = Column(Integer, primary_key=True, index=True)
|
||||
feed_id = Column(Integer, ForeignKey("feeds.id", ondelete="CASCADE"), nullable=False, index=True)
|
||||
status = Column(String(20), nullable=False) # success / fail
|
||||
articles_fetched = Column(Integer, default=0)
|
||||
error_message = Column(Text, default="")
|
||||
response_time_ms = Column(Integer, nullable=True)
|
||||
created_at = Column(DateTime, default=datetime.utcnow, index=True)
|
||||
|
||||
# 关联
|
||||
feed = relationship("Feed", back_populates="fetch_logs")
|
||||
@@ -0,0 +1,9 @@
|
||||
fastapi>=0.110.0
|
||||
uvicorn[standard]>=0.29.0
|
||||
sqlalchemy>=2.0.0
|
||||
pydantic>=2.6.0
|
||||
feedparser>=6.0.11
|
||||
requests>=2.31.0
|
||||
beautifulsoup4>=4.12.0
|
||||
apscheduler>=3.10.4
|
||||
lxml>=5.1.0
|
||||
@@ -0,0 +1,133 @@
|
||||
"""文章管理 API"""
|
||||
from typing import Optional
|
||||
from fastapi import APIRouter, Depends
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy import desc
|
||||
from database import get_db
|
||||
from models import Article, Feed
|
||||
from fulltext_search import search_articles
|
||||
|
||||
router = APIRouter(prefix="/articles", tags=["articles"])
|
||||
|
||||
|
||||
class ArticleOut(BaseModel):
|
||||
id: int
|
||||
feed_id: int
|
||||
title: str
|
||||
link: str
|
||||
author: str
|
||||
published_at: Optional[str]
|
||||
summary: str
|
||||
is_read: bool
|
||||
created_at: str
|
||||
feed_title: str
|
||||
category: str
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
|
||||
@router.get("")
|
||||
def list_articles(
|
||||
skip: int = 0,
|
||||
limit: int = 50,
|
||||
feed_id: Optional[int] = None,
|
||||
category: Optional[str] = None,
|
||||
search: Optional[str] = None,
|
||||
since: Optional[str] = None,
|
||||
until: Optional[str] = None,
|
||||
is_read: Optional[bool] = None,
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
"""获取文章列表,支持多种筛选条件"""
|
||||
|
||||
# 如果有搜索关键词,使用 FTS5 全文搜索
|
||||
if search and search.strip():
|
||||
results, total = search_articles(search.strip(), limit=limit, offset=skip)
|
||||
return {"total": total, "items": results}
|
||||
|
||||
query = db.query(Article, Feed.title.label("feed_title"), Feed.category.label("category")).join(Feed)
|
||||
|
||||
if feed_id:
|
||||
query = query.filter(Article.feed_id == feed_id)
|
||||
if category:
|
||||
query = query.filter(Feed.category == category)
|
||||
if is_read is not None:
|
||||
query = query.filter(Article.is_read == is_read)
|
||||
if since:
|
||||
query = query.filter(Article.published_at >= since)
|
||||
if until:
|
||||
query = query.filter(Article.published_at <= until)
|
||||
|
||||
total = query.count()
|
||||
rows = query.order_by(desc(Article.published_at)).offset(skip).limit(limit).all()
|
||||
|
||||
items = []
|
||||
for article, feed_title, category in rows:
|
||||
items.append({
|
||||
"id": article.id,
|
||||
"feed_id": article.feed_id,
|
||||
"title": article.title or "",
|
||||
"link": article.link,
|
||||
"author": article.author or "",
|
||||
"published_at": article.published_at.isoformat() if article.published_at else None,
|
||||
"summary": article.summary or "",
|
||||
"is_read": article.is_read,
|
||||
"created_at": article.created_at.isoformat(),
|
||||
"feed_title": feed_title or "",
|
||||
"category": category or "",
|
||||
})
|
||||
|
||||
return {"total": total, "items": items}
|
||||
|
||||
|
||||
@router.get("/{article_id}")
|
||||
def get_article(article_id: int, db: Session = Depends(get_db)):
|
||||
"""获取文章详情"""
|
||||
article = db.query(Article).filter(Article.id == article_id).first()
|
||||
if not article:
|
||||
raise HTTPException(status_code=404, detail="文章不存在")
|
||||
|
||||
feed = db.query(Feed).filter(Feed.id == article.feed_id).first()
|
||||
|
||||
return {
|
||||
"id": article.id,
|
||||
"feed_id": article.feed_id,
|
||||
"title": article.title or "",
|
||||
"link": article.link,
|
||||
"author": article.author or "",
|
||||
"published_at": article.published_at.isoformat() if article.published_at else None,
|
||||
"content": article.content or "",
|
||||
"summary": article.summary or "",
|
||||
"is_read": article.is_read,
|
||||
"created_at": article.created_at.isoformat(),
|
||||
"feed_title": feed.title if feed else "",
|
||||
"category": feed.category if feed else "",
|
||||
}
|
||||
|
||||
|
||||
@router.put("/{article_id}/read")
|
||||
def mark_read(article_id: int, db: Session = Depends(get_db)):
|
||||
"""标记文章为已读"""
|
||||
article = db.query(Article).filter(Article.id == article_id).first()
|
||||
if not article:
|
||||
raise HTTPException(status_code=404, detail="文章不存在")
|
||||
|
||||
article.is_read = True
|
||||
db.commit()
|
||||
return {"message": "已标记为已读"}
|
||||
|
||||
|
||||
@router.get("/search/fulltext")
|
||||
def fulltext_search(
|
||||
q: str,
|
||||
skip: int = 0,
|
||||
limit: int = 50,
|
||||
):
|
||||
"""全文搜索文章"""
|
||||
results, total = search_articles(q, limit=limit, offset=skip)
|
||||
return {"total": total, "items": results}
|
||||
|
||||
|
||||
from fastapi import HTTPException
|
||||
@@ -0,0 +1,58 @@
|
||||
"""仪表盘统计 API"""
|
||||
from fastapi import APIRouter, Depends
|
||||
from sqlalchemy.orm import Session
|
||||
from database import get_db
|
||||
from health_checker import get_overall_stats, get_feed_health
|
||||
|
||||
router = APIRouter(prefix="/dashboard", tags=["dashboard"])
|
||||
|
||||
|
||||
@router.get("/stats")
|
||||
def dashboard_stats(db: Session = Depends(get_db)):
|
||||
"""仪表盘统计数据"""
|
||||
return get_overall_stats(db)
|
||||
|
||||
|
||||
@router.get("/health")
|
||||
def dashboard_health(
|
||||
skip: int = 0,
|
||||
limit: int = 100,
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
"""RSS 源健康度列表"""
|
||||
all_health = get_feed_health(db)
|
||||
total = len(all_health)
|
||||
|
||||
# 按健康状态排序:异常在前
|
||||
status_order = {"unhealthy": 0, "warning": 1, "unknown": 2, "healthy": 3}
|
||||
all_health.sort(key=lambda x: status_order.get(x["health_status"], 2))
|
||||
|
||||
items = all_health[skip:skip + limit]
|
||||
return {"total": total, "items": items}
|
||||
|
||||
|
||||
@router.get("/recent-activity")
|
||||
def recent_activity(limit: int = 20, db: Session = Depends(get_db)):
|
||||
"""最近的抓取活动"""
|
||||
from models import FetchLog, Feed
|
||||
from sqlalchemy import desc
|
||||
|
||||
logs = db.query(FetchLog, Feed.title.label("feed_title")).join(Feed).order_by(
|
||||
desc(FetchLog.created_at)
|
||||
).limit(limit).all()
|
||||
|
||||
return {
|
||||
"items": [
|
||||
{
|
||||
"id": log.id,
|
||||
"feed_id": log.feed_id,
|
||||
"feed_title": feed_title or "",
|
||||
"status": log.status,
|
||||
"articles_fetched": log.articles_fetched,
|
||||
"response_time_ms": log.response_time_ms,
|
||||
"error_message": log.error_message,
|
||||
"created_at": log.created_at.isoformat(),
|
||||
}
|
||||
for log, feed_title in logs
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,163 @@
|
||||
"""对外 API(供 AI/外部系统调用)"""
|
||||
from typing import Optional
|
||||
from datetime import datetime, timedelta
|
||||
from fastapi import APIRouter, Depends
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy import desc
|
||||
from database import get_db
|
||||
from models import Article, Feed
|
||||
|
||||
router = APIRouter(prefix="/external", tags=["external"])
|
||||
|
||||
|
||||
@router.get("/recent")
|
||||
def get_recent_articles(
|
||||
hours: int = 24,
|
||||
limit: int = 50,
|
||||
feed_id: Optional[int] = None,
|
||||
category: Optional[str] = None,
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
"""获取最近 N 小时的文章
|
||||
这是对外提供给 AI 分析的主要接口
|
||||
"""
|
||||
since = datetime.utcnow() - timedelta(hours=hours)
|
||||
|
||||
query = db.query(Article, Feed.title.label("feed_title"), Feed.category.label("category")).join(Feed)
|
||||
|
||||
query = query.filter(Article.created_at >= since)
|
||||
|
||||
if feed_id:
|
||||
query = query.filter(Article.feed_id == feed_id)
|
||||
if category:
|
||||
query = query.filter(Feed.category == category)
|
||||
|
||||
rows = query.order_by(desc(Article.published_at)).limit(limit).all()
|
||||
|
||||
return {
|
||||
"query": {
|
||||
"hours": hours,
|
||||
"limit": limit,
|
||||
"feed_id": feed_id,
|
||||
"category": category,
|
||||
},
|
||||
"count": len(rows),
|
||||
"articles": [
|
||||
{
|
||||
"id": article.id,
|
||||
"title": article.title or "",
|
||||
"link": article.link,
|
||||
"author": article.author or "",
|
||||
"summary": article.summary or "",
|
||||
"content": article.content or "" if len(article.content or "") < 10000 else article.summary or "",
|
||||
"published_at": article.published_at.isoformat() if article.published_at else None,
|
||||
"created_at": article.created_at.isoformat(),
|
||||
"feed_title": feed_title or "",
|
||||
"category": category or "",
|
||||
}
|
||||
for article, feed_title, category in rows
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
@router.get("/feeds")
|
||||
def get_active_feeds(db: Session = Depends(get_db)):
|
||||
"""获取所有活跃的 RSS 源列表"""
|
||||
feeds = db.query(Feed).filter(Feed.is_active == True).all()
|
||||
|
||||
return {
|
||||
"count": len(feeds),
|
||||
"feeds": [
|
||||
{
|
||||
"id": feed.id,
|
||||
"title": feed.title or feed.url,
|
||||
"url": feed.url,
|
||||
"category": feed.category or "",
|
||||
"article_count": feed.article_count,
|
||||
"last_fetch_at": feed.last_fetch_at.isoformat() if feed.last_fetch_at else None,
|
||||
}
|
||||
for feed in feeds
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
@router.get("/feeds/{feed_id}/articles")
|
||||
def get_feed_articles(
|
||||
feed_id: int,
|
||||
limit: int = 100,
|
||||
since: Optional[str] = None,
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
"""获取指定 RSS 源的文章"""
|
||||
feed = db.query(Feed).filter(Feed.id == feed_id).first()
|
||||
if not feed:
|
||||
return {"error": "Feed not found"}
|
||||
|
||||
query = db.query(Article).filter(Article.feed_id == feed_id)
|
||||
|
||||
if since:
|
||||
query = query.filter(Article.published_at >= since)
|
||||
|
||||
articles = query.order_by(desc(Article.published_at)).limit(limit).all()
|
||||
|
||||
return {
|
||||
"feed": {
|
||||
"id": feed.id,
|
||||
"title": feed.title or feed.url,
|
||||
"url": feed.url,
|
||||
},
|
||||
"count": len(articles),
|
||||
"articles": [
|
||||
{
|
||||
"id": article.id,
|
||||
"title": article.title or "",
|
||||
"link": article.link,
|
||||
"author": article.author or "",
|
||||
"summary": article.summary or "",
|
||||
"published_at": article.published_at.isoformat() if article.published_at else None,
|
||||
}
|
||||
for article in articles
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
@router.get("/summary")
|
||||
def get_daily_summary(
|
||||
date: Optional[str] = None,
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
"""获取指定日期的文章摘要统计
|
||||
供 AI 快速了解某天的 RSS 内容概况
|
||||
"""
|
||||
if date:
|
||||
try:
|
||||
day = datetime.strptime(date, "%Y-%m-%d")
|
||||
next_day = day + timedelta(days=1)
|
||||
except ValueError:
|
||||
return {"error": "Invalid date format, use YYYY-MM-DD"}
|
||||
else:
|
||||
day = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
|
||||
next_day = day + timedelta(days=1)
|
||||
|
||||
query = db.query(Article, Feed.title.label("feed_title"), Feed.category.label("category")).join(Feed)
|
||||
query = query.filter(Article.created_at >= day, Article.created_at < next_day)
|
||||
rows = query.order_by(desc(Article.published_at)).all()
|
||||
|
||||
# 按分类统计
|
||||
by_category = {}
|
||||
for article, feed_title, category in rows:
|
||||
cat = category or "未分类"
|
||||
if cat not in by_category:
|
||||
by_category[cat] = []
|
||||
by_category[cat].append({
|
||||
"title": article.title or "",
|
||||
"link": article.link,
|
||||
"feed": feed_title or "",
|
||||
"summary": article.summary or "",
|
||||
})
|
||||
|
||||
return {
|
||||
"date": day.strftime("%Y-%m-%d"),
|
||||
"total_articles": len(rows),
|
||||
"by_category": by_category,
|
||||
}
|
||||
@@ -0,0 +1,273 @@
|
||||
"""RSS 源管理 API"""
|
||||
from typing import List, Optional
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from pydantic import BaseModel, HttpUrl
|
||||
from sqlalchemy.orm import Session
|
||||
from database import get_db
|
||||
from models import Feed
|
||||
from rss_fetcher import discover_feed_url, fetch_and_store_feed
|
||||
from scheduler import add_feed_job, remove_feed_job
|
||||
|
||||
router = APIRouter(prefix="/feeds", tags=["feeds"])
|
||||
|
||||
|
||||
class FeedCreate(BaseModel):
|
||||
url: str
|
||||
title: Optional[str] = ""
|
||||
description: Optional[str] = ""
|
||||
category: Optional[str] = ""
|
||||
is_active: Optional[bool] = True
|
||||
fetch_interval_minutes: Optional[int] = 60
|
||||
|
||||
|
||||
class FeedUpdate(BaseModel):
|
||||
title: Optional[str] = None
|
||||
description: Optional[str] = None
|
||||
category: Optional[str] = None
|
||||
is_active: Optional[bool] = None
|
||||
fetch_interval_minutes: Optional[int] = None
|
||||
|
||||
|
||||
class FeedOut(BaseModel):
|
||||
id: int
|
||||
url: str
|
||||
title: str
|
||||
description: str
|
||||
category: str
|
||||
is_active: bool
|
||||
fetch_interval_minutes: int
|
||||
last_fetch_at: Optional[str] = None
|
||||
last_fetch_status: str
|
||||
success_count: int
|
||||
fail_count: int
|
||||
article_count: int
|
||||
health_status: str
|
||||
created_at: str
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
|
||||
@router.get("", response_model=dict)
|
||||
def list_feeds(
|
||||
skip: int = 0,
|
||||
limit: int = 100,
|
||||
category: Optional[str] = None,
|
||||
search: Optional[str] = None,
|
||||
is_active: Optional[bool] = None,
|
||||
db: Session = Depends(get_db),
|
||||
):
|
||||
"""获取 RSS 源列表,支持分页、分类筛选、搜索"""
|
||||
query = db.query(Feed)
|
||||
|
||||
if category:
|
||||
query = query.filter(Feed.category == category)
|
||||
if is_active is not None:
|
||||
query = query.filter(Feed.is_active == is_active)
|
||||
if search:
|
||||
query = query.filter(
|
||||
Feed.title.contains(search) | Feed.url.contains(search) | Feed.description.contains(search)
|
||||
)
|
||||
|
||||
total = query.count()
|
||||
feeds = query.order_by(Feed.created_at.desc()).offset(skip).limit(limit).all()
|
||||
|
||||
results = []
|
||||
for feed in feeds:
|
||||
data = {
|
||||
"id": feed.id,
|
||||
"url": feed.url,
|
||||
"title": feed.title or feed.url,
|
||||
"description": feed.description or "",
|
||||
"category": feed.category or "",
|
||||
"is_active": feed.is_active,
|
||||
"fetch_interval_minutes": feed.fetch_interval_minutes,
|
||||
"last_fetch_at": feed.last_fetch_at.isoformat() if feed.last_fetch_at else None,
|
||||
"last_fetch_status": feed.last_fetch_status,
|
||||
"success_count": feed.success_count,
|
||||
"fail_count": feed.fail_count,
|
||||
"article_count": feed.article_count,
|
||||
"health_status": feed.health_status(),
|
||||
"created_at": feed.created_at.isoformat(),
|
||||
}
|
||||
results.append(data)
|
||||
|
||||
return {"total": total, "items": results}
|
||||
|
||||
|
||||
@router.get("/categories")
|
||||
def list_categories(db: Session = Depends(get_db)):
|
||||
"""获取所有分类列表"""
|
||||
categories = db.query(Feed.category).filter(Feed.category != "").distinct().all()
|
||||
return [c[0] for c in categories if c[0]]
|
||||
|
||||
|
||||
@router.post("", response_model=dict)
|
||||
def create_feed(data: FeedCreate, db: Session = Depends(get_db)):
|
||||
"""添加 RSS 源"""
|
||||
# 检查是否已存在
|
||||
existing = db.query(Feed).filter(Feed.url == data.url).first()
|
||||
if existing:
|
||||
raise HTTPException(status_code=409, detail="该 RSS 源已存在")
|
||||
|
||||
feed = Feed(
|
||||
url=data.url,
|
||||
title=data.title or "",
|
||||
description=data.description or "",
|
||||
category=data.category or "",
|
||||
is_active=data.is_active,
|
||||
fetch_interval_minutes=data.fetch_interval_minutes or 60,
|
||||
)
|
||||
db.add(feed)
|
||||
db.commit()
|
||||
db.refresh(feed)
|
||||
|
||||
# 注册定时任务
|
||||
if feed.is_active:
|
||||
add_feed_job(feed.id, feed.fetch_interval_minutes)
|
||||
|
||||
# 立即抓取一次
|
||||
fetch_and_store_feed(feed.id)
|
||||
|
||||
return {"id": feed.id, "message": "RSS 源添加成功", "url": feed.url}
|
||||
|
||||
|
||||
@router.post("/discover")
|
||||
def discover_feed(url: str, db: Session = Depends(get_db)):
|
||||
"""从网页自动发现 RSS feed URL"""
|
||||
feed_urls = discover_feed_url(url)
|
||||
return {"source_url": url, "found_feeds": feed_urls}
|
||||
|
||||
|
||||
@router.get("/{feed_id}", response_model=dict)
|
||||
def get_feed(feed_id: int, db: Session = Depends(get_db)):
|
||||
"""获取 RSS 源详情"""
|
||||
feed = db.query(Feed).filter(Feed.id == feed_id).first()
|
||||
if not feed:
|
||||
raise HTTPException(status_code=404, detail="RSS 源不存在")
|
||||
|
||||
return {
|
||||
"id": feed.id,
|
||||
"url": feed.url,
|
||||
"title": feed.title or feed.url,
|
||||
"description": feed.description or "",
|
||||
"category": feed.category or "",
|
||||
"is_active": feed.is_active,
|
||||
"fetch_interval_minutes": feed.fetch_interval_minutes,
|
||||
"last_fetch_at": feed.last_fetch_at.isoformat() if feed.last_fetch_at else None,
|
||||
"last_fetch_status": feed.last_fetch_status,
|
||||
"last_error": feed.last_error,
|
||||
"success_count": feed.success_count,
|
||||
"fail_count": feed.fail_count,
|
||||
"article_count": feed.article_count,
|
||||
"health_status": feed.health_status(),
|
||||
"created_at": feed.created_at.isoformat(),
|
||||
}
|
||||
|
||||
|
||||
@router.put("/{feed_id}", response_model=dict)
|
||||
def update_feed(feed_id: int, data: FeedUpdate, db: Session = Depends(get_db)):
|
||||
"""更新 RSS 源"""
|
||||
feed = db.query(Feed).filter(Feed.id == feed_id).first()
|
||||
if not feed:
|
||||
raise HTTPException(status_code=404, detail="RSS 源不存在")
|
||||
|
||||
if data.title is not None:
|
||||
feed.title = data.title
|
||||
if data.description is not None:
|
||||
feed.description = data.description
|
||||
if data.category is not None:
|
||||
feed.category = data.category
|
||||
if data.is_active is not None:
|
||||
feed.is_active = data.is_active
|
||||
if feed.is_active:
|
||||
add_feed_job(feed.id, feed.fetch_interval_minutes)
|
||||
else:
|
||||
remove_feed_job(feed.id)
|
||||
if data.fetch_interval_minutes is not None:
|
||||
feed.fetch_interval_minutes = data.fetch_interval_minutes
|
||||
if feed.is_active:
|
||||
add_feed_job(feed.id, feed.fetch_interval_minutes)
|
||||
|
||||
db.commit()
|
||||
return {"message": "RSS 源更新成功"}
|
||||
|
||||
|
||||
@router.delete("/{feed_id}")
|
||||
def delete_feed(feed_id: int, db: Session = Depends(get_db)):
|
||||
"""删除 RSS 源(级联删除文章和日志)"""
|
||||
feed = db.query(Feed).filter(Feed.id == feed_id).first()
|
||||
if not feed:
|
||||
raise HTTPException(status_code=404, detail="RSS 源不存在")
|
||||
|
||||
remove_feed_job(feed_id)
|
||||
db.delete(feed)
|
||||
db.commit()
|
||||
return {"message": "RSS 源已删除"}
|
||||
|
||||
|
||||
@router.post("/{feed_id}/fetch")
|
||||
def trigger_fetch(feed_id: int, db: Session = Depends(get_db)):
|
||||
"""手动触发抓取"""
|
||||
feed = db.query(Feed).filter(Feed.id == feed_id).first()
|
||||
if not feed:
|
||||
raise HTTPException(status_code=404, detail="RSS 源不存在")
|
||||
|
||||
result = fetch_and_store_feed(feed_id)
|
||||
return result
|
||||
|
||||
|
||||
@router.post("/import-opml")
|
||||
def import_opml(opml_content: str, db: Session = Depends(get_db)):
|
||||
"""导入 OPML 文件内容"""
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
try:
|
||||
root = ET.fromstring(opml_content)
|
||||
except ET.ParseError:
|
||||
raise HTTPException(status_code=400, detail="无效的 OPML 文件")
|
||||
|
||||
added = 0
|
||||
skipped = 0
|
||||
|
||||
for outline in root.iter("outline"):
|
||||
url = outline.get("xmlUrl") or outline.get("xmlurl")
|
||||
if not url:
|
||||
continue
|
||||
|
||||
existing = db.query(Feed).filter(Feed.url == url).first()
|
||||
if existing:
|
||||
skipped += 1
|
||||
continue
|
||||
|
||||
feed = Feed(
|
||||
url=url,
|
||||
title=outline.get("title", "") or outline.get("text", ""),
|
||||
description=outline.get("description", ""),
|
||||
category=outline.get("category", ""),
|
||||
is_active=True,
|
||||
fetch_interval_minutes=60,
|
||||
)
|
||||
db.add(feed)
|
||||
db.commit()
|
||||
db.refresh(feed)
|
||||
|
||||
add_feed_job(feed.id, feed.fetch_interval_minutes)
|
||||
added += 1
|
||||
|
||||
return {"added": added, "skipped": skipped, "message": f"成功导入 {added} 个 RSS 源"}
|
||||
|
||||
|
||||
@router.get("/export-opml")
|
||||
def export_opml(db: Session = Depends(get_db)):
|
||||
"""导出 OPML 文件内容"""
|
||||
feeds = db.query(Feed).all()
|
||||
|
||||
lines = ['<?xml version="1.0" encoding="UTF-8"?>', '<opml version="2.0">', '<head><title>rssKeeper Feeds</title></head>', '<body>']
|
||||
for feed in feeds:
|
||||
title = (feed.title or feed.url).replace('"', '"')
|
||||
lines.append(f' <outline type="rss" text="{title}" xmlUrl="{feed.url}" />')
|
||||
lines.append('</body>')
|
||||
lines.append('</opml>')
|
||||
|
||||
return {"opml": "\n".join(lines)}
|
||||
@@ -0,0 +1,298 @@
|
||||
"""RSS 抓取核心逻辑"""
|
||||
import time
|
||||
import re
|
||||
import html
|
||||
import hashlib
|
||||
from datetime import datetime, timezone
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from urllib.parse import urljoin
|
||||
import requests
|
||||
import feedparser
|
||||
from bs4 import BeautifulSoup
|
||||
from sqlalchemy.orm import Session
|
||||
from models import Feed, Article, FetchLog
|
||||
from database import SessionLocal
|
||||
import config
|
||||
|
||||
|
||||
def fetch_feed(url: str, timeout: int = config.FETCH_TIMEOUT) -> dict:
|
||||
"""抓取单个 RSS 源
|
||||
返回 {"success": bool, "feed_data": parsed, "error": str, "response_time_ms": int}
|
||||
"""
|
||||
start_time = time.time()
|
||||
try:
|
||||
headers = {
|
||||
"User-Agent": "rssKeeper/1.0 (+https://github.com/rssKeeper)",
|
||||
"Accept": "application/rss+xml, application/atom+xml, application/xml, text/xml, */*",
|
||||
}
|
||||
response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
|
||||
response.raise_for_status()
|
||||
|
||||
# 解析 RSS
|
||||
parsed = feedparser.parse(response.content)
|
||||
|
||||
response_time_ms = int((time.time() - start_time) * 1000)
|
||||
|
||||
if parsed.bozo and hasattr(parsed, 'bozo_exception'):
|
||||
# 有解析警告但可能仍然可用
|
||||
pass
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"feed_data": parsed,
|
||||
"error": None,
|
||||
"response_time_ms": response_time_ms,
|
||||
}
|
||||
except requests.exceptions.RequestException as e:
|
||||
return {"success": False, "feed_data": None, "error": str(e), "response_time_ms": None}
|
||||
except Exception as e:
|
||||
return {"success": False, "feed_data": None, "error": str(e), "response_time_ms": None}
|
||||
|
||||
|
||||
def discover_feed_url(url: str, timeout: int = 15) -> list:
|
||||
"""从任意网页自动发现 RSS/Atom feed URL
|
||||
返回找到的 feed URL 列表
|
||||
"""
|
||||
try:
|
||||
headers = {
|
||||
"User-Agent": "rssKeeper/1.0 (+https://github.com/rssKeeper)",
|
||||
}
|
||||
response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
|
||||
response.raise_for_status()
|
||||
|
||||
soup = BeautifulSoup(response.content, "html.parser")
|
||||
feed_urls = []
|
||||
|
||||
# 查找 <link rel="alternate"> 标签
|
||||
for link in soup.find_all("link", rel="alternate"):
|
||||
link_type = link.get("type", "").lower()
|
||||
href = link.get("href", "")
|
||||
if href and any(t in link_type for t in ["rss", "atom", "xml"]):
|
||||
full_url = urljoin(response.url, href)
|
||||
feed_urls.append(full_url)
|
||||
|
||||
# 也查找常见的 RSS 链接
|
||||
common_patterns = [
|
||||
"/rss", "/feed", "/feeds", "/atom.xml", "/rss.xml",
|
||||
"/index.xml", "/feed.xml", "/?feed=rss2",
|
||||
]
|
||||
for pattern in common_patterns:
|
||||
candidate = urljoin(response.url, pattern)
|
||||
if candidate not in feed_urls:
|
||||
# 验证是否是有效的 feed
|
||||
try:
|
||||
resp = requests.head(candidate, headers=headers, timeout=5, allow_redirects=True)
|
||||
content_type = resp.headers.get("Content-Type", "").lower()
|
||||
if any(t in content_type for t in ["rss", "atom", "xml"]):
|
||||
feed_urls.append(candidate)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return list(dict.fromkeys(feed_urls)) # 去重保持顺序
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
|
||||
def parse_article(entry, feed_id: int) -> dict:
|
||||
"""从 feedparser entry 解析文章数据"""
|
||||
title = entry.get("title", "")
|
||||
link = entry.get("link", "")
|
||||
author = entry.get("author", "")
|
||||
|
||||
# 发布时间
|
||||
published_at = None
|
||||
if hasattr(entry, "published_parsed") and entry.published_parsed:
|
||||
try:
|
||||
published_at = datetime(*entry.published_parsed[:6], tzinfo=timezone.utc).replace(tzinfo=None)
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
if not published_at and hasattr(entry, "updated_parsed") and entry.updated_parsed:
|
||||
try:
|
||||
published_at = datetime(*entry.updated_parsed[:6], tzinfo=timezone.utc).replace(tzinfo=None)
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
||||
# 内容:优先 summary,其次 content
|
||||
content = ""
|
||||
if hasattr(entry, "content") and entry.content:
|
||||
content = entry.content[0].value
|
||||
elif hasattr(entry, "summary"):
|
||||
content = entry.summary
|
||||
|
||||
# 清洗 HTML
|
||||
content = clean_html(content)
|
||||
|
||||
# 生成摘要
|
||||
summary = generate_summary(content)
|
||||
|
||||
return {
|
||||
"feed_id": feed_id,
|
||||
"title": title[:1024],
|
||||
"link": link[:2048],
|
||||
"author": author[:256],
|
||||
"published_at": published_at,
|
||||
"content": content[:config.MAX_ARTICLE_CONTENT_LENGTH],
|
||||
"summary": summary[:config.MAX_SUMMARY_LENGTH],
|
||||
}
|
||||
|
||||
|
||||
def clean_html(html_text: str) -> str:
|
||||
"""清洗 HTML,去除 script/style 标签,转为安全文本"""
|
||||
if not html_text:
|
||||
return ""
|
||||
|
||||
# 先解码 HTML 实体
|
||||
text = html.unescape(html_text)
|
||||
|
||||
# 用 BeautifulSoup 清理
|
||||
soup = BeautifulSoup(text, "html.parser")
|
||||
|
||||
# 移除 script 和 style
|
||||
for tag in soup(["script", "style", "iframe", "object", "embed"]):
|
||||
tag.decompose()
|
||||
|
||||
# 获取纯文本
|
||||
cleaned = soup.get_text(separator="\n")
|
||||
|
||||
# 压缩空白行
|
||||
cleaned = re.sub(r"\n\s*\n+", "\n\n", cleaned)
|
||||
cleaned = cleaned.strip()
|
||||
|
||||
return cleaned
|
||||
|
||||
|
||||
def generate_summary(content: str, max_length: int = 300) -> str:
|
||||
"""从内容生成摘要"""
|
||||
if not content:
|
||||
return ""
|
||||
|
||||
# 去掉多余空白
|
||||
text = re.sub(r"\s+", " ", content).strip()
|
||||
|
||||
if len(text) <= max_length:
|
||||
return text
|
||||
|
||||
# 在句子边界截断
|
||||
truncated = text[:max_length]
|
||||
last_period = max(truncated.rfind("。"), truncated.rfind(". "), truncated.rfind("! "), truncated.rfind("? "))
|
||||
if last_period > max_length * 0.5:
|
||||
return truncated[:last_period + 1]
|
||||
|
||||
return truncated + "..."
|
||||
|
||||
|
||||
def fetch_and_store_feed(feed_id: int) -> dict:
|
||||
"""抓取指定 RSS 源并存储文章
|
||||
返回抓取结果统计
|
||||
"""
|
||||
db = SessionLocal()
|
||||
try:
|
||||
feed = db.query(Feed).filter(Feed.id == feed_id).first()
|
||||
if not feed:
|
||||
return {"success": False, "error": "Feed not found", "articles_count": 0}
|
||||
|
||||
result = fetch_feed(feed.url)
|
||||
|
||||
if not result["success"]:
|
||||
# 记录失败
|
||||
feed.last_fetch_at = datetime.utcnow()
|
||||
feed.last_fetch_status = "fail"
|
||||
feed.last_error = result["error"]
|
||||
feed.fail_count += 1
|
||||
|
||||
log = FetchLog(
|
||||
feed_id=feed_id,
|
||||
status="fail",
|
||||
error_message=result["error"],
|
||||
response_time_ms=result.get("response_time_ms"),
|
||||
)
|
||||
db.add(log)
|
||||
db.commit()
|
||||
return {"success": False, "error": result["error"], "articles_count": 0}
|
||||
|
||||
parsed = result["feed_data"]
|
||||
|
||||
# 更新 feed 元信息
|
||||
if hasattr(parsed.feed, "title"):
|
||||
feed.title = parsed.feed.title[:512]
|
||||
if hasattr(parsed.feed, "description"):
|
||||
feed.description = parsed.feed.description[:1000]
|
||||
|
||||
# 存储文章
|
||||
new_count = 0
|
||||
for entry in parsed.entries:
|
||||
article_data = parse_article(entry, feed_id)
|
||||
if not article_data["link"]:
|
||||
continue
|
||||
|
||||
# 检查是否已存在(基于 link)
|
||||
existing = db.query(Article).filter(Article.link == article_data["link"]).first()
|
||||
if existing:
|
||||
# 更新已有文章
|
||||
existing.title = article_data["title"] or existing.title
|
||||
existing.content = article_data["content"] or existing.content
|
||||
existing.summary = article_data["summary"] or existing.summary
|
||||
existing.author = article_data["author"] or existing.author
|
||||
if article_data["published_at"]:
|
||||
existing.published_at = article_data["published_at"]
|
||||
else:
|
||||
article = Article(**article_data)
|
||||
db.add(article)
|
||||
new_count += 1
|
||||
|
||||
# 更新 feed 统计
|
||||
feed.last_fetch_at = datetime.utcnow()
|
||||
feed.last_fetch_status = "success"
|
||||
feed.last_error = ""
|
||||
feed.success_count += 1
|
||||
feed.article_count = db.query(Article).filter(Article.feed_id == feed_id).count()
|
||||
|
||||
log = FetchLog(
|
||||
feed_id=feed_id,
|
||||
status="success",
|
||||
articles_fetched=new_count,
|
||||
response_time_ms=result.get("response_time_ms"),
|
||||
)
|
||||
db.add(log)
|
||||
db.commit()
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"articles_count": new_count,
|
||||
"feed_title": feed.title,
|
||||
}
|
||||
except Exception as e:
|
||||
db.rollback()
|
||||
return {"success": False, "error": str(e), "articles_count": 0}
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
def fetch_all_feeds(feed_ids: list = None) -> list:
|
||||
"""并发抓取多个 RSS 源
|
||||
返回每个源的抓取结果列表
|
||||
"""
|
||||
db = SessionLocal()
|
||||
try:
|
||||
query = db.query(Feed).filter(Feed.is_active == True)
|
||||
if feed_ids:
|
||||
query = query.filter(Feed.id.in_(feed_ids))
|
||||
feeds = query.all()
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
results = []
|
||||
with ThreadPoolExecutor(max_workers=config.FETCH_CONCURRENCY) as executor:
|
||||
future_to_feed = {
|
||||
executor.submit(fetch_and_store_feed, feed.id): feed
|
||||
for feed in feeds
|
||||
}
|
||||
for future in as_completed(future_to_feed):
|
||||
feed = future_to_feed[future]
|
||||
try:
|
||||
result = future.result()
|
||||
results.append({"feed_id": feed.id, **result})
|
||||
except Exception as e:
|
||||
results.append({"feed_id": feed.id, "success": False, "error": str(e), "articles_count": 0})
|
||||
|
||||
return results
|
||||
@@ -0,0 +1,74 @@
|
||||
"""APScheduler 定时任务管理"""
|
||||
from apscheduler.schedulers.background import BackgroundScheduler
|
||||
from apscheduler.triggers.interval import IntervalTrigger
|
||||
from rss_fetcher import fetch_and_store_feed
|
||||
import config
|
||||
|
||||
_scheduler = None
|
||||
|
||||
|
||||
def get_scheduler():
|
||||
"""获取或创建调度器实例"""
|
||||
global _scheduler
|
||||
if _scheduler is None:
|
||||
_scheduler = BackgroundScheduler()
|
||||
return _scheduler
|
||||
|
||||
|
||||
def add_feed_job(feed_id: int, interval_minutes: int):
|
||||
"""为指定 RSS 源添加定时抓取任务"""
|
||||
scheduler = get_scheduler()
|
||||
job_id = f"fetch_feed_{feed_id}"
|
||||
|
||||
# 确保间隔不低于最小值
|
||||
interval = max(interval_minutes, config.MIN_FETCH_INTERVAL)
|
||||
|
||||
# 如果任务已存在则更新
|
||||
existing = scheduler.get_job(job_id)
|
||||
if existing:
|
||||
existing.reschedule(trigger=IntervalTrigger(minutes=interval))
|
||||
return
|
||||
|
||||
scheduler.add_job(
|
||||
fetch_and_store_feed,
|
||||
trigger=IntervalTrigger(minutes=interval),
|
||||
id=job_id,
|
||||
args=[feed_id],
|
||||
replace_existing=True,
|
||||
misfire_grace_time=300, # 5分钟容错
|
||||
coalesce=True, # 合并错过的任务
|
||||
)
|
||||
|
||||
|
||||
def remove_feed_job(feed_id: int):
|
||||
"""移除指定 RSS 源的定时任务"""
|
||||
scheduler = get_scheduler()
|
||||
job_id = f"fetch_feed_{feed_id}"
|
||||
try:
|
||||
scheduler.remove_job(job_id)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def start_scheduler():
|
||||
"""启动调度器"""
|
||||
scheduler = get_scheduler()
|
||||
if not scheduler.running:
|
||||
scheduler.start()
|
||||
|
||||
|
||||
def stop_scheduler():
|
||||
"""停止调度器"""
|
||||
global _scheduler
|
||||
if _scheduler and _scheduler.running:
|
||||
_scheduler.shutdown(wait=False)
|
||||
_scheduler = None
|
||||
|
||||
|
||||
def init_feed_jobs(db):
|
||||
"""从数据库加载所有活跃 RSS 源并注册定时任务"""
|
||||
from models import Feed
|
||||
feeds = db.query(Feed).filter(Feed.is_active == True).all()
|
||||
for feed in feeds:
|
||||
add_feed_job(feed.id, feed.fetch_interval_minutes or config.DEFAULT_FETCH_INTERVAL)
|
||||
start_scheduler()
|
||||
Reference in New Issue
Block a user