fix: 端口更换 & 代码审核修复

端口:
- 服务端口 8000 → 7329
- 前端开发端口 5173 → 7330

安全:
- CORS 收紧为白名单，关闭 credentials
- SPA 路由白名单完善
- 前端 XSS 转义

可靠性:
- 时区统一为 datetime.now(timezone.utc)
- 文章入库改为内存去重 + 增量计数
- OPML 导入改为 body 参数接收
- OPML 导出 URL XML 转义
- 首次抓取改为 BackgroundTasks 异步
- articles.py HTTPException 移到顶部 import
- FTS5 异常显式日志
- FTS5 查询加引号包裹防布尔注入
- 中文摘要支持中文标点
- 去掉未使用的 hashlib import

部署:
- Dockerfile 锁 python:3.12.7-slim
- requirements 锁定具体版本
- healthcheck 不用 curl（镜像里没有）
- docker-compose 使用 .env 文件
- 新增 .env 配置文件
2026-06-11 14:31:29 +08:00
parent 54e7db0ef0
commit c59dd304f7
17 changed files with 701 additions and 106 deletions
@@ -43,15 +43,19 @@ def init_fts5():
    conn = engine.raw_connection()
    cursor = conn.cursor()

+    import logging
+    logger = logging.getLogger(__name__)
+
    # 检查 FTS5 扩展是否可用
    try:
        cursor.execute("SELECT sqlite_compileoption_used('ENABLE_FTS5')")
        has_fts5 = cursor.fetchone()[0]
        if not has_fts5:
-            print("警告: SQLite 未启用 FTS5 扩展，全文搜索将不可用")
+            logger.warning("SQLite 未启用 FTS5 扩展，全文搜索将不可用")
            return
-    except Exception:
-        pass
+    except Exception as e:
+        logger.error(f"FTS5 检测失败: {e}")
+        return

    # 创建 FTS5 虚拟表
    cursor.execute("""
@@ -10,8 +10,13 @@ def search_articles(query: str, limit: int = 50, offset: int = 0):
    if not query or not query.strip():
        return [], 0

-    # 转义 FTS5 特殊字符
+    # 转义 FTS5 特殊字符（双引号、* 等）
+    # 简单策略：将用户查询视为一个整体短语，加引号包裹
    query = query.replace('"', '""').strip()
+    if not query:
+        return [], 0
+    # 用双引号包裹，避免 FTS5 布尔操作符被误解析
+    query = f'"{query}"'

    conn = engine.raw_connection()
    cursor = conn.cursor()
@@ -1,6 +1,7 @@
 """RSS 源健康度检测"""
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 from typing import List, Dict
+from sqlalchemy import func
 from sqlalchemy.orm import Session
 from models import Feed, FetchLog

@@ -9,6 +10,7 @@ def get_feed_health(db: Session, feed_id: int = None) -> List[Dict]:
    """获取 RSS 源健康度信息
    返回每个源的健康状态详情
    """
+    now = datetime.now(timezone.utc)
    query = db.query(Feed)
    if feed_id:
        query = query.filter(Feed.id == feed_id)
@@ -22,15 +24,16 @@ def get_feed_health(db: Session, feed_id: int = None) -> List[Dict]:

        days_since_fetch = None
        if feed.last_fetch_at:
-            days_since_fetch = (datetime.utcnow() - feed.last_fetch_at).days
+            days_since_fetch = (now - feed.last_fetch_at).days

        # 获取最近 7 天抓取记录
+        week_ago = now - timedelta(days=7)
        recent_logs = db.query(FetchLog).filter(
            FetchLog.feed_id == feed.id,
-            FetchLog.created_at >= datetime.utcnow() - timedelta(days=7)
+            FetchLog.created_at >= week_ago
        ).order_by(FetchLog.created_at.desc()).limit(10).all()

-        health = feed.health_status()
+        health = feed.health_status(now=now)

        results.append({
            "id": feed.id,
@@ -76,14 +79,14 @@ def get_overall_stats(db: Session) -> Dict:
    """获取整体统计信息"""
    total_feeds = db.query(Feed).count()
    active_feeds = db.query(Feed).filter(Feed.is_active == True).count()
-    total_articles = db.query(Feed).with_entities(Feed.article_count).all()
-    total_articles_count = sum(a[0] for a in total_articles) if total_articles else 0
+    total_articles_count = db.query(func.sum(Feed.article_count)).scalar() or 0

    # 健康源统计
    feeds = db.query(Feed).all()
    healthy = warning = unhealthy = 0
+    now = datetime.now(timezone.utc)
    for feed in feeds:
-        status = feed.health_status()
+        status = feed.health_status(now=now)
        if status == "healthy":
            healthy += 1
        elif status == "warning":
@@ -92,8 +95,7 @@ def get_overall_stats(db: Session) -> Dict:
            unhealthy += 1

    # 今日抓取
-    today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
-    from models import FetchLog
+    today = now.replace(hour=0, minute=0, second=0, microsecond=0)
    today_fetches = db.query(FetchLog).filter(FetchLog.created_at >= today).count()
    today_success = db.query(FetchLog).filter(
        FetchLog.created_at >= today, FetchLog.status == "success"
@@ -35,13 +35,17 @@ app = FastAPI(
    lifespan=lifespan,
 )

-# CORS
+# CORS — 仅允许同源和开发环境
 app.add_middleware(
    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
+    allow_origins=[
+        "http://localhost:7329",
+        "http://localhost:7330",
+        "http://127.0.0.1:7329",
+    ],
+    allow_credentials=False,
+    allow_methods=["GET", "POST", "PUT", "DELETE"],
+    allow_headers=["Content-Type", "Authorization", "X-API-Key"],
 )

 # API 路由
@@ -62,11 +66,17 @@ static_dir = os.path.join(config.BASE_DIR, "static")
 if os.path.exists(static_dir):
    app.mount("/static", StaticFiles(directory=static_dir), name="static")

+    # API 路径白名单 — 这些路径不应被 SPA 兜底
+    _API_PATHS = {
+        "api", "docs", "openapi.json", "redoc",
+    }
+
    @app.get("/{full_path:path}")
    async def serve_spa(full_path: str):
        """Vue SPA 路由回退"""
-        # API 路由不走这里
-        if full_path.startswith("api/") or full_path.startswith("docs") or full_path.startswith("openapi.json"):
+        # API/文档路由不走 SPA 兜底
+        first_seg = full_path.split("/")[0] if full_path else ""
+        if first_seg in _API_PATHS:
            return {"detail": "Not found"}

        index_path = os.path.join(static_dir, "index.html")
@@ -1,5 +1,5 @@
 """SQLAlchemy 数据模型"""
-from datetime import datetime
+from datetime import datetime, timezone
 from sqlalchemy import Column, Integer, String, Text, Boolean, DateTime, ForeignKey
 from sqlalchemy.orm import relationship
 from database import Base
@@ -25,13 +25,13 @@ class Feed(Base):
    fail_count = Column(Integer, default=0)
    article_count = Column(Integer, default=0)

-    created_at = Column(DateTime, default=datetime.utcnow)
+    created_at = Column(DateTime, default=lambda: datetime.now(timezone.utc))

    # 关联
    articles = relationship("Article", back_populates="feed", cascade="all, delete-orphan")
    fetch_logs = relationship("FetchLog", back_populates="feed", cascade="all, delete-orphan")

-    def health_status(self):
+    def health_status(self, now: datetime = None):
        """计算健康度
        🟢 健康: 成功率 >= 90%, 最近7天有更新
        🟡 警告: 成功率 50%-90%, 或超过3天未更新
@@ -43,9 +43,12 @@ class Feed(Base):

        success_rate = self.success_count / total

+        if now is None:
+            now = datetime.now(timezone.utc)
+
        days_since_last_fetch = None
        if self.last_fetch_at:
-            days_since_last_fetch = (datetime.utcnow() - self.last_fetch_at).days
+            days_since_last_fetch = (now - self.last_fetch_at).days

        if success_rate >= 0.9 and (days_since_last_fetch is None or days_since_last_fetch <= 7):
            return "healthy"
@@ -68,7 +71,7 @@ class Article(Base):
    content = Column(Text, default="")
    summary = Column(Text, default="")
    is_read = Column(Boolean, default=False)
-    created_at = Column(DateTime, default=datetime.utcnow, index=True)
+    created_at = Column(DateTime, default=lambda: datetime.now(timezone.utc), index=True)

    # 关联
    feed = relationship("Feed", back_populates="articles")
@@ -84,7 +87,7 @@ class FetchLog(Base):
    articles_fetched = Column(Integer, default=0)
    error_message = Column(Text, default="")
    response_time_ms = Column(Integer, nullable=True)
-    created_at = Column(DateTime, default=datetime.utcnow, index=True)
+    created_at = Column(DateTime, default=lambda: datetime.now(timezone.utc), index=True)

    # 关联
    feed = relationship("Feed", back_populates="fetch_logs")
@@ -1,9 +1,9 @@
-fastapi>=0.110.0
-uvicorn[standard]>=0.29.0
-sqlalchemy>=2.0.0
-pydantic>=2.6.0
-feedparser>=6.0.11
-requests>=2.31.0
-beautifulsoup4>=4.12.0
-apscheduler>=3.10.4
-lxml>=5.1.0
+fastapi==0.115.0
+uvicorn[standard]==0.32.0
+sqlalchemy==2.0.36
+pydantic==2.9.2
+feedparser==6.0.11
+requests==2.32.3
+beautifulsoup4==4.12.3
+apscheduler==3.10.4
+lxml==5.3.0
@@ -1,6 +1,6 @@
 """文章管理 API"""
 from typing import Optional
-from fastapi import APIRouter, Depends
+from fastapi import APIRouter, Depends, HTTPException
 from pydantic import BaseModel
 from sqlalchemy.orm import Session
 from sqlalchemy import desc
@@ -130,4 +130,3 @@ def fulltext_search(
    return {"total": total, "items": results}


-from fastapi import HTTPException
@@ -1,6 +1,6 @@
 """对外 API（供 AI/外部系统调用）"""
 from typing import Optional
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 from fastapi import APIRouter, Depends
 from sqlalchemy.orm import Session
 from sqlalchemy import desc
@@ -21,7 +21,7 @@ def get_recent_articles(
    """获取最近 N 小时的文章
    这是对外提供给 AI 分析的主要接口
    """
-    since = datetime.utcnow() - timedelta(hours=hours)
+    since = datetime.now(timezone.utc) - timedelta(hours=hours)

    query = db.query(Article, Feed.title.label("feed_title"), Feed.category.label("category")).join(Feed)

@@ -136,7 +136,7 @@ def get_daily_summary(
        except ValueError:
            return {"error": "Invalid date format, use YYYY-MM-DD"}
    else:
-        day = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
+        day = datetime.now(timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)
        next_day = day + timedelta(days=1)

    query = db.query(Article, Feed.title.label("feed_title"), Feed.category.label("category")).join(Feed)
@@ -1,6 +1,6 @@
 """RSS 源管理 API"""
 from typing import List, Optional
-from fastapi import APIRouter, Depends, HTTPException
+from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
 from pydantic import BaseModel, HttpUrl
 from sqlalchemy.orm import Session
 from database import get_db
@@ -103,7 +103,11 @@ def list_categories(db: Session = Depends(get_db)):


@router.post("", response_model=dict)
-def create_feed(data: FeedCreate, db: Session = Depends(get_db)):
+def create_feed(
+    data: FeedCreate,
+    background_tasks: BackgroundTasks,
+    db: Session = Depends(get_db),
+):
    """添加 RSS 源"""
    # 检查是否已存在
    existing = db.query(Feed).filter(Feed.url == data.url).first()
@@ -126,10 +130,10 @@ def create_feed(data: FeedCreate, db: Session = Depends(get_db)):
    if feed.is_active:
        add_feed_job(feed.id, feed.fetch_interval_minutes)

-    # 立即抓取一次
-    fetch_and_store_feed(feed.id)
+    # 后台异步首次抓取，不阻塞 HTTP 响应
+    background_tasks.add_task(fetch_and_store_feed, feed.id)

-    return {"id": feed.id, "message": "RSS 源添加成功", "url": feed.url}
+    return {"id": feed.id, "message": "RSS 源添加成功，正在后台抓取", "url": feed.url}


@router.post("/discover")
@@ -217,13 +221,25 @@ def trigger_fetch(feed_id: int, db: Session = Depends(get_db)):
    return result


+class OpmlImport(BaseModel):
+    opml_content: str
+
+
@router.post("/import-opml")
-def import_opml(opml_content: str, db: Session = Depends(get_db)):
+def import_opml(data: OpmlImport, db: Session = Depends(get_db)):
    """导入 OPML 文件内容"""
    import xml.etree.ElementTree as ET

+    content = data.opml_content.strip()
+    if not content:
+        raise HTTPException(status_code=400, detail="OPML 内容不能为空")
+
+    # 限制大小（防止滥用）
+    if len(content) > 5_000_000:  # 5MB
+        raise HTTPException(status_code=413, detail="OPML 文件过大")
+
    try:
-        root = ET.fromstring(opml_content)
+        root = ET.fromstring(content)
    except ET.ParseError:
        raise HTTPException(status_code=400, detail="无效的 OPML 文件")

@@ -261,12 +277,14 @@ def import_opml(opml_content: str, db: Session = Depends(get_db)):
@router.get("/export-opml")
 def export_opml(db: Session = Depends(get_db)):
    """导出 OPML 文件内容"""
+    from xml.sax.saxutils import escape
    feeds = db.query(Feed).all()

    lines = ['<?xml version="1.0" encoding="UTF-8"?>', '<opml version="2.0">', '<head><title>rssKeeper Feeds</title></head>', '<body>']
    for feed in feeds:
-        title = (feed.title or feed.url).replace('"', '&quot;')
-        lines.append(f'  <outline type="rss" text="{title}" xmlUrl="{feed.url}" />')
+        title = escape(feed.title or feed.url, {'"': '&quot;'})
+        url = escape(feed.url)
+        lines.append(f'  <outline type="rss" text="{title}" xmlUrl="{url}" />')
    lines.append('</body>')
    lines.append('</opml>')

@@ -2,7 +2,6 @@
 import time
 import re
 import html
-import hashlib
 from datetime import datetime, timezone
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from urllib.parse import urljoin
@@ -99,16 +98,16 @@ def parse_article(entry, feed_id: int) -> dict:
    link = entry.get("link", "")
    author = entry.get("author", "")

-    # 发布时间
+    # 发布时间 — 统一存为 UTC aware datetime
    published_at = None
    if hasattr(entry, "published_parsed") and entry.published_parsed:
        try:
-            published_at = datetime(*entry.published_parsed[:6], tzinfo=timezone.utc).replace(tzinfo=None)
+            published_at = datetime(*entry.published_parsed[:6], tzinfo=timezone.utc)
        except (ValueError, TypeError):
            pass
    if not published_at and hasattr(entry, "updated_parsed") and entry.updated_parsed:
        try:
-            published_at = datetime(*entry.updated_parsed[:6], tzinfo=timezone.utc).replace(tzinfo=None)
+            published_at = datetime(*entry.updated_parsed[:6], tzinfo=timezone.utc)
        except (ValueError, TypeError):
            pass

@@ -172,9 +171,14 @@ def generate_summary(content: str, max_length: int = 300) -> str:
    if len(text) <= max_length:
        return text

-    # 在句子边界截断
+    # 在句子边界截断（支持中英文标点）
    truncated = text[:max_length]
-    last_period = max(truncated.rfind("。"), truncated.rfind(". "), truncated.rfind("! "), truncated.rfind("? "))
+    last_period = max(
+        truncated.rfind("。"), truncated.rfind(". "),
+        truncated.rfind("! "), truncated.rfind("? "),
+        truncated.rfind("？"), truncated.rfind("！"),
+        truncated.rfind("；"),
+    )
    if last_period > max_length * 0.5:
        return truncated[:last_period + 1]

@@ -195,7 +199,7 @@ def fetch_and_store_feed(feed_id: int) -> dict:

        if not result["success"]:
            # 记录失败
-            feed.last_fetch_at = datetime.utcnow()
+            feed.last_fetch_at = datetime.now(timezone.utc)
            feed.last_fetch_status = "fail"
            feed.last_error = result["error"]
            feed.fail_count += 1
@@ -218,34 +222,53 @@ def fetch_and_store_feed(feed_id: int) -> dict:
        if hasattr(parsed.feed, "description"):
            feed.description = parsed.feed.description[:1000]

-        # 存储文章
-        new_count = 0
+        # 存储文章 — 先收集所有文章，内存去重后批量入库
+        seen_links = set()
+        articles_to_add = []
+        articles_to_update = []
+
        for entry in parsed.entries:
            article_data = parse_article(entry, feed_id)
-            if not article_data["link"]:
+            link = article_data.get("link", "")
+            if not link or link in seen_links:
                continue
+            seen_links.add(link)
+            articles_to_add.append(article_data)

-            # 检查是否已存在（基于 link）
-            existing = db.query(Article).filter(Article.link == article_data["link"]).first()
-            if existing:
-                # 更新已有文章
-                existing.title = article_data["title"] or existing.title
-                existing.content = article_data["content"] or existing.content
-                existing.summary = article_data["summary"] or existing.summary
-                existing.author = article_data["author"] or existing.author
-                if article_data["published_at"]:
-                    existing.published_at = article_data["published_at"]
-            else:
-                article = Article(**article_data)
-                db.add(article)
-                new_count += 1
+        # 批量查询已有文章
+        if articles_to_add:
+            existing_links = {
+                row[0] for row in db.query(Article.link).filter(
+                    Article.link.in_([a["link"] for a in articles_to_add])
+                ).all()
+            }
+
+            new_count = 0
+            for article_data in articles_to_add:
+                if article_data["link"] in existing_links:
+                    articles_to_update.append(article_data)
+                else:
+                    article = Article(**article_data)
+                    db.add(article)
+                    new_count += 1
+
+            # 更新已有文章
+            for article_data in articles_to_update:
+                existing = db.query(Article).filter(Article.link == article_data["link"]).first()
+                if existing:
+                    existing.title = article_data["title"] or existing.title
+                    existing.content = article_data["content"] or existing.content
+                    existing.summary = article_data["summary"] or existing.summary
+                    existing.author = article_data["author"] or existing.author
+                    if article_data["published_at"]:
+                        existing.published_at = article_data["published_at"]

        # 更新 feed 统计
-        feed.last_fetch_at = datetime.utcnow()
+        feed.last_fetch_at = datetime.now(timezone.utc)
        feed.last_fetch_status = "success"
        feed.last_error = ""
        feed.success_count += 1
-        feed.article_count = db.query(Article).filter(Article.feed_id == feed_id).count()
+        feed.article_count += new_count

        log = FetchLog(
            feed_id=feed_id,