fix: 端口更换 & 代码审核修复

端口:
- 服务端口 8000 → 7329
- 前端开发端口 5173 → 7330

安全:
- CORS 收紧为白名单，关闭 credentials
- SPA 路由白名单完善
- 前端 XSS 转义

可靠性:
- 时区统一为 datetime.now(timezone.utc)
- 文章入库改为内存去重 + 增量计数
- OPML 导入改为 body 参数接收
- OPML 导出 URL XML 转义
- 首次抓取改为 BackgroundTasks 异步
- articles.py HTTPException 移到顶部 import
- FTS5 异常显式日志
- FTS5 查询加引号包裹防布尔注入
- 中文摘要支持中文标点
- 去掉未使用的 hashlib import

部署:
- Dockerfile 锁 python:3.12.7-slim
- requirements 锁定具体版本
- healthcheck 不用 curl（镜像里没有）
- docker-compose 使用 .env 文件
- 新增 .env 配置文件
2026-06-11 14:31:29 +08:00
parent 54e7db0ef0
commit c59dd304f7
17 changed files with 701 additions and 106 deletions
@@ -2,7 +2,6 @@
 import time
 import re
 import html
-import hashlib
 from datetime import datetime, timezone
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from urllib.parse import urljoin
@@ -99,16 +98,16 @@ def parse_article(entry, feed_id: int) -> dict:
    link = entry.get("link", "")
    author = entry.get("author", "")

-    # 发布时间
+    # 发布时间 — 统一存为 UTC aware datetime
    published_at = None
    if hasattr(entry, "published_parsed") and entry.published_parsed:
        try:
-            published_at = datetime(*entry.published_parsed[:6], tzinfo=timezone.utc).replace(tzinfo=None)
+            published_at = datetime(*entry.published_parsed[:6], tzinfo=timezone.utc)
        except (ValueError, TypeError):
            pass
    if not published_at and hasattr(entry, "updated_parsed") and entry.updated_parsed:
        try:
-            published_at = datetime(*entry.updated_parsed[:6], tzinfo=timezone.utc).replace(tzinfo=None)
+            published_at = datetime(*entry.updated_parsed[:6], tzinfo=timezone.utc)
        except (ValueError, TypeError):
            pass

@@ -172,9 +171,14 @@ def generate_summary(content: str, max_length: int = 300) -> str:
    if len(text) <= max_length:
        return text

-    # 在句子边界截断
+    # 在句子边界截断（支持中英文标点）
    truncated = text[:max_length]
-    last_period = max(truncated.rfind("。"), truncated.rfind(". "), truncated.rfind("! "), truncated.rfind("? "))
+    last_period = max(
+        truncated.rfind("。"), truncated.rfind(". "),
+        truncated.rfind("! "), truncated.rfind("? "),
+        truncated.rfind("？"), truncated.rfind("！"),
+        truncated.rfind("；"),
+    )
    if last_period > max_length * 0.5:
        return truncated[:last_period + 1]

@@ -195,7 +199,7 @@ def fetch_and_store_feed(feed_id: int) -> dict:

        if not result["success"]:
            # 记录失败
-            feed.last_fetch_at = datetime.utcnow()
+            feed.last_fetch_at = datetime.now(timezone.utc)
            feed.last_fetch_status = "fail"
            feed.last_error = result["error"]
            feed.fail_count += 1
@@ -218,34 +222,53 @@ def fetch_and_store_feed(feed_id: int) -> dict:
        if hasattr(parsed.feed, "description"):
            feed.description = parsed.feed.description[:1000]

-        # 存储文章
-        new_count = 0
+        # 存储文章 — 先收集所有文章，内存去重后批量入库
+        seen_links = set()
+        articles_to_add = []
+        articles_to_update = []
+
        for entry in parsed.entries:
            article_data = parse_article(entry, feed_id)
-            if not article_data["link"]:
+            link = article_data.get("link", "")
+            if not link or link in seen_links:
                continue
+            seen_links.add(link)
+            articles_to_add.append(article_data)

-            # 检查是否已存在（基于 link）
-            existing = db.query(Article).filter(Article.link == article_data["link"]).first()
-            if existing:
-                # 更新已有文章
-                existing.title = article_data["title"] or existing.title
-                existing.content = article_data["content"] or existing.content
-                existing.summary = article_data["summary"] or existing.summary
-                existing.author = article_data["author"] or existing.author
-                if article_data["published_at"]:
-                    existing.published_at = article_data["published_at"]
-            else:
-                article = Article(**article_data)
-                db.add(article)
-                new_count += 1
+        # 批量查询已有文章
+        if articles_to_add:
+            existing_links = {
+                row[0] for row in db.query(Article.link).filter(
+                    Article.link.in_([a["link"] for a in articles_to_add])
+                ).all()
+            }
+
+            new_count = 0
+            for article_data in articles_to_add:
+                if article_data["link"] in existing_links:
+                    articles_to_update.append(article_data)
+                else:
+                    article = Article(**article_data)
+                    db.add(article)
+                    new_count += 1
+
+            # 更新已有文章
+            for article_data in articles_to_update:
+                existing = db.query(Article).filter(Article.link == article_data["link"]).first()
+                if existing:
+                    existing.title = article_data["title"] or existing.title
+                    existing.content = article_data["content"] or existing.content
+                    existing.summary = article_data["summary"] or existing.summary
+                    existing.author = article_data["author"] or existing.author
+                    if article_data["published_at"]:
+                        existing.published_at = article_data["published_at"]

        # 更新 feed 统计
-        feed.last_fetch_at = datetime.utcnow()
+        feed.last_fetch_at = datetime.now(timezone.utc)
        feed.last_fetch_status = "success"
        feed.last_error = ""
        feed.success_count += 1
-        feed.article_count = db.query(Article).filter(Article.feed_id == feed_id).count()
+        feed.article_count += new_count

        log = FetchLog(
            feed_id=feed_id,