def _migrate(engine):
    """Apply incremental schema migrations (add new columns) and backfill them.

    Steps:
      1. For every ``(table, column, ddl)`` entry, add the column when the
         table does not have it yet (checked via ``PRAGMA table_info``).
      2. Classify historical ``feeds.last_error`` values into
         ``feeds.error_type`` for rows that have an error but no type yet.

    Args:
        engine: object exposing ``raw_connection()`` that returns a DB-API
            connection (``cursor()``, ``commit()``, ``close()``).

    The cursor and the connection are always closed, even when a statement
    raises, so a failed migration does not leak the connection.
    """
    import logging

    logger = logging.getLogger(__name__)

    # (table, column, column DDL) -- append new entries for future columns.
    migrations = [
        ("feeds", "error_type", "VARCHAR(32) DEFAULT ''"),
    ]

    conn = engine.raw_connection()
    try:
        cursor = conn.cursor()
        try:
            # Existing columns are looked up per table (and cached), so an
            # entry for a table other than ``feeds`` is checked against the
            # right schema.
            columns_by_table = {}
            for table, column, col_type in migrations:
                if table not in columns_by_table:
                    cursor.execute(f"PRAGMA table_info({table})")
                    # Index 1 of each PRAGMA table_info row is the column name.
                    columns_by_table[table] = {row[1] for row in cursor.fetchall()}
                if column in columns_by_table[table]:
                    continue
                logger.info(f"迁移: ALTER TABLE {table} ADD COLUMN {column} {col_type}")
                cursor.execute(f"ALTER TABLE {table} ADD COLUMN {column} {col_type}")
                columns_by_table[table].add(column)

            conn.commit()

            # Backfill: classify errors recorded before error_type existed.
            # Imported here (not at module top) as in the original code.
            from rss_fetcher import classify_error

            cursor.execute(
                "SELECT id, last_error FROM feeds "
                "WHERE last_error != '' AND (error_type IS NULL OR error_type = '')"
            )
            rows = cursor.fetchall()

            updates = []
            for feed_id, error in rows:
                etype = classify_error(error)
                if etype:
                    updates.append((etype, feed_id))

            if rows:
                # One batched statement instead of one UPDATE round-trip per row.
                cursor.executemany(
                    "UPDATE feeds SET error_type = ? WHERE id = ?", updates
                )
                conn.commit()
                logger.info(f"迁移: 已分类 {len(rows)} 条历史错误")
        finally:
            cursor.close()
    finally:
        conn.close()
路由优先匹配 static_dir = os.path.join(config.BASE_DIR, "static") if os.path.exists(static_dir): - app.mount("/static", StaticFiles(directory=static_dir), name="static") - - # API 路径白名单 — 这些路径不应被 SPA 兜底 - _API_PATHS = { - "api", "docs", "openapi.json", "redoc", - } - - @app.get("/{full_path:path}") - async def serve_spa(full_path: str): - """Vue SPA 路由回退""" - # API/文档路由不走 SPA 兜底 - first_seg = full_path.split("/")[0] if full_path else "" - if first_seg in _API_PATHS: - return {"detail": "Not found"} - - index_path = os.path.join(static_dir, "index.html") - if os.path.exists(index_path): - return FileResponse(index_path) - return {"detail": "Frontend not built"} + app.mount("/", StaticFiles(directory=static_dir, html=True), name="static") diff --git a/backend/models.py b/backend/models.py index 42ca57a..23daba7 100644 --- a/backend/models.py +++ b/backend/models.py @@ -1,5 +1,5 @@ """SQLAlchemy 数据模型""" -from datetime import datetime, timezone +from datetime import datetime from sqlalchemy import Column, Integer, String, Text, Boolean, DateTime, ForeignKey from sqlalchemy.orm import relationship from database import Base @@ -21,11 +21,12 @@ class Feed(Base): last_fetch_at = Column(DateTime, nullable=True) last_fetch_status = Column(String(20), default="") last_error = Column(Text, default="") + error_type = Column(String(32), default="") success_count = Column(Integer, default=0) fail_count = Column(Integer, default=0) article_count = Column(Integer, default=0) - created_at = Column(DateTime, default=lambda: datetime.now(timezone.utc)) + created_at = Column(DateTime, default=datetime.utcnow) # 关联 articles = relationship("Article", back_populates="feed", cascade="all, delete-orphan") @@ -36,6 +37,7 @@ class Feed(Base): 🟢 健康: 成功率 >= 90%, 最近7天有更新 🟡 警告: 成功率 50%-90%, 或超过3天未更新 🔴 异常: 成功率 < 50%, 或超过7天未更新 + ⚪ 未知: 尚未进行过任何抓取 """ total = self.success_count + self.fail_count if total == 0: @@ -44,7 +46,7 @@ class Feed(Base): success_rate = self.success_count / total if now is None: - 
now = datetime.now(timezone.utc) + now = datetime.utcnow() days_since_last_fetch = None if self.last_fetch_at: @@ -71,7 +73,7 @@ class Article(Base): content = Column(Text, default="") summary = Column(Text, default="") is_read = Column(Boolean, default=False) - created_at = Column(DateTime, default=lambda: datetime.now(timezone.utc), index=True) + created_at = Column(DateTime, default=datetime.utcnow, index=True) # 关联 feed = relationship("Feed", back_populates="articles") @@ -87,7 +89,7 @@ class FetchLog(Base): articles_fetched = Column(Integer, default=0) error_message = Column(Text, default="") response_time_ms = Column(Integer, nullable=True) - created_at = Column(DateTime, default=lambda: datetime.now(timezone.utc), index=True) + created_at = Column(DateTime, default=datetime.utcnow, index=True) # 关联 feed = relationship("Feed", back_populates="fetch_logs") diff --git a/backend/routers/external_api.py b/backend/routers/external_api.py index fe83a03..b849619 100644 --- a/backend/routers/external_api.py +++ b/backend/routers/external_api.py @@ -1,6 +1,6 @@ """对外 API(供 AI/外部系统调用)""" from typing import Optional -from datetime import datetime, timedelta, timezone +from datetime import datetime, timedelta from fastapi import APIRouter, Depends from sqlalchemy.orm import Session from sqlalchemy import desc @@ -21,7 +21,7 @@ def get_recent_articles( """获取最近 N 小时的文章 这是对外提供给 AI 分析的主要接口 """ - since = datetime.now(timezone.utc) - timedelta(hours=hours) + since = datetime.utcnow() - timedelta(hours=hours) query = db.query(Article, Feed.title.label("feed_title"), Feed.category.label("category")).join(Feed) @@ -136,7 +136,7 @@ def get_daily_summary( except ValueError: return {"error": "Invalid date format, use YYYY-MM-DD"} else: - day = datetime.now(timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0) + day = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0) next_day = day + timedelta(days=1) query = db.query(Article, 
Feed.title.label("feed_title"), Feed.category.label("category")).join(Feed) diff --git a/backend/routers/feeds.py b/backend/routers/feeds.py index f3e20b5..e50e3d0 100644 --- a/backend/routers/feeds.py +++ b/backend/routers/feeds.py @@ -5,8 +5,8 @@ from pydantic import BaseModel, HttpUrl from sqlalchemy.orm import Session from database import get_db from models import Feed -from rss_fetcher import discover_feed_url, fetch_and_store_feed -from scheduler import add_feed_job, remove_feed_job +from rss_fetcher import discover_feed_url, fetch_and_store_feed, fetch_all_feeds +from scheduler import add_feed_job, remove_feed_job, get_feed_next_run router = APIRouter(prefix="/feeds", tags=["feeds"]) @@ -55,9 +55,10 @@ def list_feeds( category: Optional[str] = None, search: Optional[str] = None, is_active: Optional[bool] = None, + health_status: Optional[str] = None, db: Session = Depends(get_db), ): - """获取 RSS 源列表,支持分页、分类筛选、搜索""" + """获取 RSS 源列表,支持分页、分类筛选、搜索、健康度筛选""" query = db.query(Feed) if category: @@ -70,10 +71,22 @@ def list_feeds( ) total = query.count() - feeds = query.order_by(Feed.created_at.desc()).offset(skip).limit(limit).all() + + # 健康度是计算字段,需要在 Python 中过滤 + if health_status: + all_feeds = query.order_by(Feed.created_at.desc()).all() + matched = [] + for feed in all_feeds: + if feed.health_status() == health_status: + matched.append(feed) + total = len(matched) + feeds = matched[skip:skip + limit] + else: + feeds = query.order_by(Feed.created_at.desc()).offset(skip).limit(limit).all() results = [] for feed in feeds: + next_run = get_feed_next_run(feed.id) data = { "id": feed.id, "url": feed.url, @@ -84,10 +97,13 @@ def list_feeds( "fetch_interval_minutes": feed.fetch_interval_minutes, "last_fetch_at": feed.last_fetch_at.isoformat() if feed.last_fetch_at else None, "last_fetch_status": feed.last_fetch_status, + "last_error": feed.last_error, + "error_type": feed.error_type, "success_count": feed.success_count, "fail_count": feed.fail_count, "article_count": 
# Ordered classification rules: (error_type, HTTP status code or None, phrases).
# The first rule that matches wins, so the order is significant.
_ERROR_RULES = (
    ("url_invalid", "404", ("not found",)),
    ("forbidden", "403", ("forbidden",)),
    ("rate_limited", "429", ("too many request",)),
    # "timeout" also covers ConnectTimeout / ConnectionTimeout class names.
    ("timeout", None, ("timeout", "timed out")),
    (
        "dns_failure",
        None,
        (
            "could not resolve",
            "name or service not known",
            "nodename nor servname",
            "temporary failure in name resolution",
            "getaddrinfo failed",
        ),
    ),
    ("connection_refused", None, ("connection refused",)),
    (
        "connection_reset",
        None,
        ("connection aborted", "remotedisconnected", "remote end closed"),
    ),
    ("ssl_error", None, ("ssl", "certificate", "certifi")),
    ("unreachable", None, ("max retries", "newconnectionerror")),
    ("url_malformed", None, ("invalid url", "no host", "missing scheme")),
)


def classify_error(error: str) -> str:
    """Map a raw fetch error message to a coarse error-type label.

    Args:
        error: the error text recorded for a failed fetch (may be empty).

    Returns:
        ``""`` when ``error`` is empty; otherwise one of ``url_invalid``,
        ``forbidden``, ``rate_limited``, ``timeout``, ``dns_failure``,
        ``connection_refused``, ``connection_reset``, ``ssl_error``,
        ``unreachable``, ``url_malformed``, ``server_error`` or ``unknown``.

    Status codes are matched only as standalone numbers (not surrounded by
    other digits), so digits that are part of a longer number in a URL or
    port -- e.g. ``id=14043`` -- are not mistaken for HTTP 404/403.
    Phrases are matched case-insensitively.
    """
    if not error:
        return ""
    err = error.lower()

    for error_type, status_code, phrases in _ERROR_RULES:
        if status_code is not None and re.search(
            r"(?<!\d)" + status_code + r"(?!\d)", error
        ):
            return error_type
        if any(phrase in err for phrase in phrases):
            return error_type

    # e.g. "500 Server Error: Internal Server Error for url: ..."
    if "5" in error and "server error" in err:
        return "server_error"
    return "unknown"
def get_feed_next_run(feed_id: int):
    """Return the next scheduled fetch time for the given RSS feed.

    Looks up the job with id ``fetch_feed_<feed_id>`` on the scheduler
    returned by ``get_scheduler()``.

    Returns:
        The job's ``next_run_time``, or ``None`` when the scheduler is not
        running or no such job is registered.
    """
    sched = get_scheduler()
    if sched.running:
        scheduled_job = sched.get_job(f"fetch_feed_{feed_id}")
        if scheduled_job:
            return scheduled_job.next_run_time
    return None