feat: 修复代码审核报告问题

2026-06-12 16:04:03 +08:00
commit bae47a2411
46 changed files with 6231 additions and 0 deletions
@@ -0,0 +1,92 @@
+"""LLM API 客户端，兼容 OpenAI API 格式"""
+import json
+import logging
+from typing import Optional
+
+from openai import OpenAI, APIError
+
+from config import settings
+
+logger = logging.getLogger(__name__)
+
+
+class AIClient:
+    """Wrapper around an OpenAI-compatible chat-completions API.
+
+    Connection parameters that are not passed explicitly fall back to the
+    matching ``settings.OPENAI_*`` values. The underlying ``OpenAI`` client
+    is created lazily on first access of :attr:`client`.
+    """
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        base_url: Optional[str] = None,
+        model: Optional[str] = None,
+        timeout: Optional[int] = None,
+        max_retries: Optional[int] = None,
+    ):
+        """Record connection parameters; no client is constructed here.
+
+        Args:
+            api_key: API key; falsy values fall back to settings.
+            base_url: API base URL; falsy values fall back to settings.
+            model: Model name; falsy values fall back to settings.
+            timeout: Timeout handed to the ``OpenAI`` client; falsy values
+                fall back to settings.
+            max_retries: Retry count handed to the ``OpenAI`` client. An
+                explicit ``0`` is honoured; only ``None`` falls back to
+                settings.
+        """
+        self.api_key = api_key or settings.OPENAI_API_KEY
+        self.base_url = base_url or settings.OPENAI_BASE_URL
+        self.model = model or settings.OPENAI_MODEL
+        self.timeout = timeout or settings.OPENAI_TIMEOUT
+        # Compare against None instead of truthiness: an explicit 0 is a
+        # deliberate choice and must not be replaced by the configured default.
+        self.max_retries = (
+            max_retries if max_retries is not None else settings.OPENAI_MAX_RETRIES
+        )
+
+        self._client: Optional[OpenAI] = None
+
+    @property
+    def client(self) -> OpenAI:
+        """Return the cached ``OpenAI`` client, creating it on first use."""
+        if self._client is None:
+            self._client = OpenAI(
+                api_key=self.api_key,
+                base_url=self.base_url,
+                timeout=self.timeout,
+                max_retries=self.max_retries,
+            )
+        return self._client
+
+    def chat_completion(
+        self,
+        system_prompt: str,
+        user_prompt: str,
+        temperature: float = 0.3,
+        json_mode: bool = False,
+    ) -> str:
+        """Send one system + user message pair and return the reply text.
+
+        Args:
+            system_prompt: Content of the ``system`` message.
+            user_prompt: Content of the ``user`` message.
+            temperature: Sampling temperature forwarded to the API.
+            json_mode: When true, request ``response_format=json_object``.
+
+        Returns:
+            The first choice's message content, stripped; ``""`` when the
+            content is empty or missing.
+
+        Raises:
+            APIError: Logged and re-raised when the API call fails.
+        """
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ]
+
+        kwargs = {
+            "model": self.model,
+            "messages": messages,
+            "temperature": temperature,
+        }
+        if json_mode:
+            kwargs["response_format"] = {"type": "json_object"}
+
+        try:
+            resp = self.client.chat.completions.create(**kwargs)
+            content = resp.choices[0].message.content or ""
+            return content.strip()
+        except APIError as exc:
+            logger.error("LLM API 调用失败: %s", exc)
+            raise
+
+    def chat_completion_json(
+        self,
+        system_prompt: str,
+        user_prompt: str,
+        temperature: float = 0.3,
+    ) -> dict:
+        """Call :meth:`chat_completion` in JSON mode and parse the reply.
+
+        Returns:
+            The value produced by ``json.loads`` on the reply text.
+
+        Raises:
+            json.JSONDecodeError: Logged (with the first 500 characters of
+                the reply) and re-raised when the reply is not valid JSON.
+        """
+        content = self.chat_completion(
+            system_prompt=system_prompt,
+            user_prompt=user_prompt,
+            temperature=temperature,
+            json_mode=True,
+        )
+        try:
+            return json.loads(content)
+        except json.JSONDecodeError as exc:
+            logger.error("LLM 返回不是合法 JSON: %s - content=%s", exc, content[:500])
+            raise
+
+
+# Shared module-level client; built with no arguments, so every parameter
+# falls back to the values in ``settings``.
+ai_client = AIClient()
@@ -0,0 +1,168 @@
+"""每日简报生成"""
+import json
+import logging
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+from typing import Dict, Any, List
+
+from sqlalchemy.orm import Session
+
+from config import settings
+from models import EnrichedArticle, DailyBrief
+
+logger = logging.getLogger(__name__)
+
+
+def _format_article(article: EnrichedArticle) -> Dict[str, Any]:
+    """Turn an article row into the plain dict used as a brief entry.
+
+    Missing text fields become ``""``, missing tags become ``[]``, the
+    summary prefers the AI summary over the original one, and
+    ``published_at`` is rendered with ``isoformat()`` (or ``None``).
+    Score fields are copied unchanged.
+    """
+    entry: Dict[str, Any] = {
+        "id": article.id,
+        "rk_article_id": article.rk_article_id,
+    }
+    for text_field in ("title", "link", "author", "feed_title"):
+        entry[text_field] = getattr(article, text_field) or ""
+
+    entry["summary"] = article.ai_summary or article.original_summary or ""
+    entry["tags"] = article.tags or []
+
+    for score_field in (
+        "heat_score",
+        "importance_score",
+        "duplication_score",
+        "composite_score",
+    ):
+        entry[score_field] = getattr(article, score_field)
+
+    published = article.published_at
+    entry["published_at"] = published.isoformat() if published else None
+    return entry
+
+
+def _build_markdown(date_str: str, by_category: Dict[str, List[Dict[str, Any]]], stats: Dict[str, int]) -> str:
+    """Render the daily brief as a Markdown document.
+
+    Args:
+        date_str: Date shown in the title.
+        by_category: Brief entries grouped by category name; categories are
+            emitted in sorted name order, entries in the given order.
+        stats: Must contain ``total_articles`` and ``unique_articles``.
+
+    Returns:
+        The Markdown text, lines joined with ``"\\n"``.
+    """
+    lines = [
+        f"# RSS 每日简报 ({date_str})",
+        "",
+        f"- 去重前文章数: {stats['total_articles']}",
+        f"- 去重后文章数: {stats['unique_articles']}",
+        f"- 生成分类数: {len(by_category)}",
+        "",
+        "---",
+        "",
+    ]
+
+    for category in sorted(by_category):
+        lines.append(f"## {category}")
+        lines.append("")
+        for item in by_category[category]:
+            tags = " ".join(f"`{t}`" for t in item["tags"] or [])
+            # Scores are copied straight from the article and may be None
+            # when it has not been scored yet; render those as 0.0 rather
+            # than failing in the ``.1f`` format spec.
+            heat, importance, duplication, composite = (
+                item[key] if item.get(key) is not None else 0.0
+                for key in (
+                    "heat_score",
+                    "importance_score",
+                    "duplication_score",
+                    "composite_score",
+                )
+            )
+            lines.append(f"### {item['title']}")
+            lines.append(f"- 来源: {item['feed_title']} | 作者: {item.get('author') or '未知'}")
+            lines.append(f"- 标签: {tags}")
+            lines.append(f"- 热度: {heat:.1f} | 重要性: {importance:.1f} | 重复度: {duplication:.1f} | 综合: {composite:.1f}")
+            if item["summary"]:
+                lines.append(f"- 摘要: {item['summary']}")
+            if item["link"]:
+                lines.append(f"- [阅读原文]({item['link']})")
+            lines.append("")
+
+    return "\n".join(lines)
+
+
+def generate_daily_brief(db: Session, date_str: str = None, force: bool = False) -> Dict[str, Any]:
+    """Build, persist and return the daily brief for one date.
+
+    Args:
+        db: SQLAlchemy session used for all reads and writes; committed once
+            at the end of a generation run.
+        date_str: Day in ``YYYY-MM-DD`` form; defaults to today's UTC date.
+        force: Regenerate even when a brief row for the date already exists.
+
+    Returns:
+        A dict with ``date``, ``total_articles``, ``unique_articles`` and
+        ``markdown_path``; a freshly generated brief also carries
+        ``by_category``.
+    """
+    if date_str is None:
+        date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+
+    # Reuse the stored brief unless regeneration was requested.
+    existing = db.query(DailyBrief).filter(DailyBrief.brief_date == date_str).first()
+    if existing and not force:
+        logger.info("日期 %s 简报已存在，跳过生成", date_str)
+        return {
+            "date": date_str,
+            "total_articles": existing.total_articles,
+            "unique_articles": existing.unique_articles,
+            "markdown_path": existing.markdown_path,
+        }
+
+    window_start = datetime.strptime(date_str, "%Y-%m-%d")
+    window_end = window_start + timedelta(days=1)
+
+    # Every article fetched inside the one-day window.
+    day_query = db.query(EnrichedArticle).filter(
+        EnrichedArticle.fetched_at >= window_start,
+        EnrichedArticle.fetched_at < window_end,
+    )
+
+    # Keep group representatives plus articles outside any duplicate group,
+    # best composite score first.
+    keep_condition = (EnrichedArticle.is_representative == True) | (
+        EnrichedArticle.duplicate_group_id == None
+    )
+    representative_articles = (
+        day_query.filter(keep_condition)
+        .order_by(EnrichedArticle.composite_score.desc())
+        .all()
+    )
+
+    # Group by category; query order is preserved inside each group.
+    grouped: Dict[str, List[Dict[str, Any]]] = {}
+    for art in representative_articles:
+        grouped.setdefault(art.category or "未分类", []).append(_format_article(art))
+
+    # Trim each category to its first N entries.
+    top_n = settings.BRIEF_TOP_N_PER_CATEGORY
+    by_category = {cat: entries[:top_n] for cat, entries in grouped.items()}
+
+    stats = {
+        "total_articles": day_query.count(),
+        "unique_articles": sum(len(entries) for entries in by_category.values()),
+    }
+
+    # Write the Markdown file under <output dir>/<date>/daily-brief.md.
+    output_dir = settings.brief_output_dir_path / date_str
+    output_dir.mkdir(parents=True, exist_ok=True)
+    markdown_path = output_dir / "daily-brief.md"
+    markdown_path.write_text(
+        _build_markdown(date_str, by_category, stats), encoding="utf-8"
+    )
+    markdown_path_str = str(markdown_path)
+
+    # Stamp every selected article (not only the trimmed top N) with the date.
+    for art in representative_articles:
+        art.brief_date = date_str
+
+    brief_data = {
+        "date": date_str,
+        "total_articles": stats["total_articles"],
+        "unique_articles": stats["unique_articles"],
+        "by_category": by_category,
+        "markdown_path": markdown_path_str,
+    }
+
+    # Insert a new brief row, or overwrite the existing one on a forced run.
+    if not existing:
+        db.add(
+            DailyBrief(
+                brief_date=date_str,
+                total_articles=stats["total_articles"],
+                unique_articles=stats["unique_articles"],
+                by_category=by_category,
+                markdown_path=markdown_path_str,
+            )
+        )
+    else:
+        existing.total_articles = stats["total_articles"]
+        existing.unique_articles = stats["unique_articles"]
+        existing.by_category = by_category
+        existing.markdown_path = markdown_path_str
+        existing.updated_at = datetime.now(timezone.utc)
+
+    db.commit()
+    logger.info("简报生成完成: 日期=%s, 去重前=%d, 去重后=%d", date_str, stats["total_articles"], stats["unique_articles"])
+    return brief_data
@@ -0,0 +1,223 @@
+"""文章去重：URL 精确去重 + 标题/内容相似度去重"""
+import logging
+import re
+from datetime import datetime, timedelta, timezone
+from difflib import SequenceMatcher
+from typing import List, Dict, Tuple, Set
+
+from sqlalchemy.orm import Session
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+
+from config import settings
+from models import EnrichedArticle, DuplicateGroup
+
+logger = logging.getLogger(__name__)
+
+
+def _normalize_title(title: str) -> str:
+    """Normalise a title for comparison.
+
+    Every character that is neither a word character nor in the listed CJK
+    ideograph ranges is replaced by a space, runs of whitespace collapse to
+    one space, and the result is lower-cased. Falsy input yields ``""``.
+    """
+    if not title:
+        return ""
+    # Characters outside this class are treated as separators.
+    separator_pattern = r"[^\w一-鿿㐀-䶿\U00020000-\U0002a6df\U0002a700-\U0002b73f\U0002b740-\U0002b81f]"
+    spaced = re.sub(separator_pattern, " ", title)
+    collapsed = " ".join(spaced.split())
+    return collapsed.lower()
+
+
+def _title_similarity(a: str, b: str) -> float:
+    """Return the ``SequenceMatcher`` ratio of the two normalised titles.
+
+    Yields ``0.0`` when either title is empty after normalisation.
+    """
+    left, right = _normalize_title(a), _normalize_title(b)
+    if left and right:
+        return SequenceMatcher(None, left, right).ratio()
+    return 0.0
+
+
+def _content_similarity_matrix(contents: List[str]) -> np.ndarray:
+    """Return the pairwise TF-IDF cosine-similarity matrix of ``contents``.
+
+    Fewer than two documents, or any failure while vectorising, yields an
+    all-zero square matrix of matching size (failures are logged as
+    warnings).
+    """
+    size = len(contents)
+    if size < 2:
+        return np.zeros((size, size))
+
+    # Falsy entries (None, "") are replaced by empty strings, not removed,
+    # so matrix indices keep lining up with the input list.
+    documents = [text if text else "" for text in contents]
+    try:
+        tfidf = TfidfVectorizer(
+            max_features=5000,
+            stop_words="english",
+            ngram_range=(1, 2),
+            min_df=1,
+        ).fit_transform(documents)
+        return cosine_similarity(tfidf)
+    except Exception as exc:
+        logger.warning("TF-IDF 相似度计算失败: %s", exc)
+        return np.zeros((size, size))
+
+
+def _find_duplicate_clusters(
+    articles: List[EnrichedArticle],
+    title_threshold: float = None,
+    content_threshold: float = None,
+) -> List[Set[int]]:
+    """Group articles into duplicate clusters.
+
+    Two articles are linked when their content similarity reaches
+    ``content_threshold`` or their title similarity reaches
+    ``title_threshold``; clusters are the connected components of that
+    relation.
+
+    Args:
+        articles: Articles to compare.
+        title_threshold: Minimum title similarity; ``None`` falls back to
+            ``settings.TITLE_SIMILARITY_THRESHOLD``.
+        content_threshold: Minimum content similarity; ``None`` falls back
+            to ``settings.CONTENT_SIMILARITY_THRESHOLD``.
+
+    Returns:
+        Clusters with more than one member, each a set of indices into
+        ``articles``, ordered by their lowest index.
+    """
+    from collections import deque  # local import keeps this block self-contained
+
+    # Test for None rather than truthiness so an explicit 0.0 is respected.
+    if title_threshold is None:
+        title_threshold = settings.TITLE_SIMILARITY_THRESHOLD
+    if content_threshold is None:
+        content_threshold = settings.CONTENT_SIMILARITY_THRESHOLD
+
+    n = len(articles)
+    if n < 2:
+        return []
+
+    # Title + summary + body, truncated to 2000 characters to bound the
+    # cost of vectorising.
+    contents = [
+        " ".join([
+            art.title or "",
+            art.ai_summary or art.original_summary or "",
+            art.content or "",
+        ])[:2000]
+        for art in articles
+    ]
+    content_sim = _content_similarity_matrix(contents)
+
+    visited = [False] * n
+    clusters: List[Set[int]] = []
+
+    for start in range(n):
+        if visited[start]:
+            continue
+        visited[start] = True
+        cluster = {start}
+        queue = deque([start])
+
+        # Breadth-first expansion over the "is duplicate of" relation.
+        while queue:
+            cur = queue.popleft()
+            for j in range(n):
+                # ``cur`` itself is always marked visited, so this also
+                # skips the self-comparison.
+                if visited[j]:
+                    continue
+
+                # Look at the precomputed content similarity first; the
+                # title comparison is only run when it is still needed.
+                is_duplicate = (
+                    content_sim[cur][j] >= content_threshold
+                    or _title_similarity(articles[cur].title or "", articles[j].title or "")
+                    >= title_threshold
+                )
+                if is_duplicate:
+                    visited[j] = True
+                    cluster.add(j)
+                    queue.append(j)
+
+        if len(cluster) > 1:
+            clusters.append(cluster)
+
+    return clusters
+
+
+def _pick_representative(articles: List[EnrichedArticle], indices: Set[int]) -> EnrichedArticle:
+    """Choose the representative article of a duplicate cluster.
+
+    Preference order: has an AI summary, then has a feed category, then the
+    earliest ``published_at``. Articles without a publication time rank
+    after dated ones.
+
+    Args:
+        articles: The list the cluster indices refer to.
+        indices: Indices of the cluster members.
+
+    Returns:
+        The preferred member.
+
+    Raises:
+        IndexError: If ``indices`` is empty.
+    """
+    # Placeholder timestamp for undated articles; their relative order is
+    # already decided by the ``published is None`` flag in the key.
+    undated = datetime.max.replace(tzinfo=timezone.utc)
+
+    def _rank(article: EnrichedArticle):
+        published = article.published_at
+        if published is None:
+            comparable = undated
+        elif published.tzinfo is None:
+            # Treat naive timestamps as UTC so naive and aware values can
+            # be compared with each other.
+            comparable = published.replace(tzinfo=timezone.utc)
+        else:
+            comparable = published
+        # Ascending sort: False sorts before True, so "has summary" and
+        # "has category" come first, followed by the earliest timestamp.
+        return (
+            not article.ai_summary,
+            not article.feed_category,
+            published is None,
+            comparable,
+        )
+
+    candidates = sorted((articles[i] for i in indices), key=_rank)
+    return candidates[0]
+
+
+def deduplicate_articles(
+    db: Session,
+    date_str: str = None,
+    title_threshold: float = None,
+    content_threshold: float = None,
+) -> Dict[str, int]:
+    """
+    Rebuild the duplicate groups for the articles fetched on one day.
+
+    Existing groups whose ``brief_date`` equals the date are removed first,
+    then articles are de-duplicated by exact link and clustered by
+    title/content similarity; one ``DuplicateGroup`` row is created per
+    cluster.
+
+    Args:
+        db: SQLAlchemy session; committed after the cleanup and again at the end.
+        date_str: Day in ``YYYY-MM-DD`` form; defaults to today's UTC date.
+        title_threshold: Forwarded to ``_find_duplicate_clusters``.
+        content_threshold: Forwarded to ``_find_duplicate_clusters``.
+
+    Returns:
+        ``{"total": x, "duplicate_groups": y, "representatives": z}`` where
+        ``total`` counts all articles of the day (including URL duplicates)
+        and ``representatives`` is incremented once per cluster.
+    """
+    if date_str is None:
+        date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+
+    # Only groups recorded for this date are cleared, so groups of other
+    # dates are left untouched.
+    day_start = datetime.strptime(date_str, "%Y-%m-%d")
+    day_end = day_start + timedelta(days=1)
+
+    old_groups = db.query(DuplicateGroup).filter(DuplicateGroup.brief_date == date_str).all()
+    for og in old_groups:
+        # Detach every member before the group row is deleted.
+        for art in og.articles:
+            art.duplicate_group_id = None
+            art.is_representative = False
+        db.delete(og)
+    db.commit()
+
+    # Load the day's articles (selected by fetched_at), ordered by
+    # published_at. No flags are reset here beyond the group cleanup above.
+    articles = (
+        db.query(EnrichedArticle)
+        .filter(
+            EnrichedArticle.fetched_at >= day_start,
+            EnrichedArticle.fetched_at < day_end,
+        )
+        .order_by(EnrichedArticle.published_at)
+        .all()
+    )
+
+    if not articles:
+        logger.info("日期 %s 无文章可去重", date_str)
+        return {"total": 0, "duplicate_groups": 0, "representatives": 0}
+
+    # URL pass: for each non-empty link only the first article (in query
+    # order) takes part in clustering; articles without a link always do.
+    # NOTE(review): articles skipped here are only counted, never attached
+    # to a group, so their duplicate_group_id stays None — confirm this is
+    # intended, since generate_daily_brief selects articles with no group.
+    unique_articles: List[EnrichedArticle] = []
+    seen_links: set = set()
+    url_dup_count = 0
+    for art in articles:
+        link = (art.link or "").strip()
+        if link and link in seen_links:
+            url_dup_count += 1
+            continue
+        if link:
+            seen_links.add(link)
+        unique_articles.append(art)
+
+    clusters = _find_duplicate_clusters(
+        unique_articles,
+        title_threshold=title_threshold,
+        content_threshold=content_threshold,
+    )
+
+    stats = {"total": len(articles), "duplicate_groups": len(clusters), "representatives": 0}
+
+    for cluster in clusters:
+        representative = _pick_representative(unique_articles, cluster)
+        member_ids = [unique_articles[i].id for i in cluster]
+
+        group = DuplicateGroup(
+            representative_article_id=representative.id,
+            member_article_ids=member_ids,
+            similarity_matrix={},  # left empty; may be filled in later
+            brief_date=date_str,
+        )
+        db.add(group)
+        # Flush before group.id is read below (presumably so the primary
+        # key is populated — confirm against the model definition).
+        db.flush()
+
+        for idx in cluster:
+            art = unique_articles[idx]
+            art.duplicate_group_id = group.id
+            art.is_representative = (art.id == representative.id)
+
+        stats["representatives"] += 1
+
+    db.commit()
+    logger.info(
+        "去重完成: 日期=%s, 总文章=%d, 重复组=%d, URL 重复=%d",
+        date_str, stats["total"], stats["duplicate_groups"], url_dup_count
+    )
+    return stats
@@ -0,0 +1,104 @@
+"""调用 rssKeeper 外部 API"""
+from datetime import datetime, timedelta
+from typing import List, Optional, Dict, Any
+import logging
+
+import requests
+
+from config import settings
+
+logger = logging.getLogger(__name__)
+
+
+class RSSKeeperClient:
+    """HTTP client for the rssKeeper external API (``/api/v1/external/*``)."""
+
+    def __init__(self, base_url: Optional[str] = None, timeout: int = 30):
+        """Store the API root (trailing slashes removed) and request timeout.
+
+        A falsy ``base_url`` falls back to ``settings.RSSKEEPER_BASE_URL``.
+        """
+        root = base_url or settings.RSSKEEPER_BASE_URL
+        self.base_url = root.rstrip("/")
+        self.timeout = timeout
+
+    @staticmethod
+    def _present(**optional: Any) -> Dict[str, Any]:
+        """Return the keyword arguments whose value is not None, in call order."""
+        return {name: value for name, value in optional.items() if value is not None}
+
+    def _get(self, path: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+        """GET ``base_url + path`` and return the decoded JSON body.
+
+        Any ``requests.RequestException`` is logged and re-raised.
+        """
+        url = f"{self.base_url}{path}"
+        try:
+            response = requests.get(url, params=params, timeout=self.timeout)
+            response.raise_for_status()
+            payload = response.json()
+        except requests.RequestException as exc:
+            logger.error("请求 rssKeeper 失败: %s - %s", url, exc)
+            raise
+        return payload
+
+    def fetch_recent(
+        self,
+        hours: int = 24,
+        limit: int = 200,
+        feed_id: Optional[int] = None,
+        category: Optional[str] = None,
+        search: Optional[str] = None,
+        unread_only: bool = False,
+    ) -> List[Dict[str, Any]]:
+        """Return the ``articles`` list for the last ``hours`` hours.
+
+        Optional filters are only sent when they are not None; a response
+        without an ``articles`` key yields ``[]``.
+        """
+        params: Dict[str, Any] = {
+            "hours": hours,
+            "limit": limit,
+            "unread_only": unread_only,
+        }
+        params.update(self._present(feed_id=feed_id, category=category, search=search))
+        payload = self._get("/api/v1/external/recent", params=params)
+        return payload.get("articles", [])
+
+    def fetch_by_date(self, date: str, category: Optional[str] = None) -> Dict[str, Any]:
+        """Return the summary payload for ``date``, optionally filtered by category."""
+        params: Dict[str, Any] = {"date": date}
+        params.update(self._present(category=category))
+        return self._get("/api/v1/external/summary", params=params)
+
+    def fetch_feeds(
+        self,
+        health_status: Optional[str] = None,
+        category: Optional[str] = None,
+        error_type: Optional[str] = None,
+        is_active: Optional[bool] = True,
+    ) -> List[Dict[str, Any]]:
+        """Return the ``feeds`` list; only non-None filters are sent.
+
+        A response without a ``feeds`` key yields ``[]``.
+        """
+        params = self._present(
+            health_status=health_status,
+            category=category,
+            error_type=error_type,
+            is_active=is_active,
+        )
+        payload = self._get("/api/v1/external/feeds", params=params)
+        return payload.get("feeds", [])
+
+    def fulltext_search(
+        self,
+        q: str,
+        limit: int = 50,
+        offset: int = 0,
+        category: Optional[str] = None,
+        feed_id: Optional[int] = None,
+    ) -> Dict[str, Any]:
+        """Return the search payload for query ``q`` with paging and optional filters."""
+        params: Dict[str, Any] = {
+            "q": q,
+            "limit": limit,
+            "offset": offset,
+        }
+        params.update(self._present(category=category, feed_id=feed_id))
+        return self._get("/api/v1/external/search", params=params)
+
+
+# Shared module-level client; built with no arguments, so the base URL comes
+# from ``settings.RSSKEEPER_BASE_URL`` and the timeout is the default 30.
+rss_client = RSSKeeperClient()
@@ -0,0 +1,147 @@
+"""基于规则计算文章热度、重要性、重复性分数"""
+import logging
+import math
+from datetime import datetime, timedelta, timezone
+from typing import List
+
+from sqlalchemy.orm import Session
+
+from config import settings
+from models import EnrichedArticle, Taxonomy
+from app.tagger import _count_matches, _normalize
+
+logger = logging.getLogger(__name__)
+
+
+# Weights of the composite score: heat 30%, importance 50%, duplication 20%.
+# Used by compute_composite_score as multipliers of the three sub-scores.
+COMPOSITE_WEIGHT_HEAT = 0.3
+COMPOSITE_WEIGHT_IMPORTANCE = 0.5
+COMPOSITE_WEIGHT_DUPLICATION = 0.2
+
+
+def _build_text(article: EnrichedArticle) -> str:
+    """Join title, summary (AI summary preferred) and body with single spaces.
+
+    Missing parts contribute an empty string.
+    """
+    summary = article.ai_summary or article.original_summary or ""
+    parts = (article.title or "", summary, article.content or "")
+    return " ".join(parts)
+
+
+def _score_by_rules(article: EnrichedArticle, rules: List[Taxonomy]) -> float:
+    """Score an article by keyword hits against weighted rules.
+
+    Each rule with at least one hit adds ``min(hits, 5) * rule.weight * 10``;
+    the total is capped at 100. Blank text or an empty rule list gives 0.
+    """
+    text = _build_text(article)
+    if not rules or not text.strip():
+        return 0.0
+
+    total = 0.0
+    for rule in rules:
+        hit_count = _count_matches(text, rule.keywords or [])
+        if hit_count <= 0:
+            continue
+        # At most five hits per rule are counted.
+        total += min(hit_count, 5) * rule.weight * 10
+
+    return min(total, 100.0)
+
+
+def _freshness_score(article: EnrichedArticle) -> float:
+    """Return a freshness bonus between 0 and 20 based on ``published_at``.
+
+    Up to 24 hours old scores 20, 72 hours or older scores 0, with a linear
+    decay in between. A missing publication time scores 0; naive timestamps
+    are interpreted as UTC and future timestamps count as age 0.
+    """
+    published = article.published_at
+    if not published:
+        return 0.0
+
+    # Naive values are assumed to be UTC.
+    if published.tzinfo is None:
+        published = published.replace(tzinfo=timezone.utc)
+
+    elapsed = datetime.now(timezone.utc) - published
+    age_hours = max(elapsed.total_seconds() / 3600, 0)
+
+    if age_hours <= 24:
+        return 20.0
+    if age_hours >= 72:
+        return 0.0
+    # Linear fall-off across the 48 hours between the two bounds.
+    return 20.0 * (1 - (age_hours - 24) / 48)
+
+
+def compute_heat_score(article: EnrichedArticle, heat_rules: List[Taxonomy]) -> float:
+    """Return the heat score: rule-based keyword score plus freshness bonus, capped at 100."""
+    combined = _score_by_rules(article, heat_rules) + _freshness_score(article)
+    return min(combined, 100.0)
+
+
+def compute_importance_score(article: EnrichedArticle, importance_rules: List[Taxonomy]) -> float:
+    """Return the importance score, i.e. the rule-based keyword score alone."""
+    importance = _score_by_rules(article, importance_rules)
+    return importance
+
+
+def compute_duplication_score(duplicate_count: int, max_count: int = 5) -> float:
+    """
+    Map an occurrence count to a duplication score between 0 and 100.
+
+    Args:
+        duplicate_count: How many times the topic occurred.
+        max_count: Count at which the score saturates.
+
+    Returns:
+        0 for a count of 1 or less, 100 for ``max_count`` or more, and a
+        linear interpolation in between.
+    """
+    if duplicate_count <= 1:
+        return 0.0
+    # With max_count <= 1 there is no interpolation range: any repeat is
+    # already saturated. This also avoids dividing by zero (max_count == 1)
+    # and negative scores (max_count < 1).
+    if max_count <= 1:
+        return 100.0
+    score = (duplicate_count - 1) / (max_count - 1) * 100.0
+    return min(score, 100.0)
+
+
+def compute_composite_score(heat: float, importance: float, duplication: float) -> float:
+    """Return the weighted sum of the three sub-scores, rounded to two decimals."""
+    weighted_sum = (
+        heat * COMPOSITE_WEIGHT_HEAT
+        + importance * COMPOSITE_WEIGHT_IMPORTANCE
+        + duplication * COMPOSITE_WEIGHT_DUPLICATION
+    )
+    return round(weighted_sum, 2)
+
+
+def score_articles(
+    db: Session,
+    article_ids: List[int] = None,
+    update_duplication: bool = False,
+) -> int:
+    """
+    Compute heat, importance and composite scores for articles.
+
+    Args:
+        db: SQLAlchemy session; committed every 50 articles and at the end.
+        article_ids: Restrict scoring to these ids. ``None`` or an empty
+            list scores every article.
+        update_duplication: Also recompute the duplication score from the
+            article's duplicate group.
+
+    Returns:
+        The number of articles processed.
+    """
+    heat_rules = db.query(Taxonomy).filter(Taxonomy.kind == "heat_rule").all()
+    importance_rules = db.query(Taxonomy).filter(Taxonomy.kind == "importance_rule").all()
+
+    query = db.query(EnrichedArticle)
+    if article_ids:
+        query = query.filter(EnrichedArticle.id.in_(article_ids))
+
+    count = 0
+    for article in query.all():
+        article.heat_score = compute_heat_score(article, heat_rules)
+        article.importance_score = compute_importance_score(article, importance_rules)
+
+        if update_duplication:
+            dup_count = 0
+            if article.duplicate_group_id:
+                group = article.duplicate_group
+                if group and group.member_article_ids:
+                    # Number of group members other than the article itself.
+                    # NOTE(review): compute_duplication_score returns 0 for
+                    # a count of 1, so a two-member group scores 0 —
+                    # confirm that this offset is intended.
+                    dup_count = max(len(group.member_article_ids) - 1, 0)
+            article.duplication_score = compute_duplication_score(dup_count)
+
+        article.composite_score = compute_composite_score(
+            article.heat_score,
+            article.importance_score,
+            # The duplication score may still be unset (None) when it is not
+            # recomputed in this run; count it as 0 instead of failing in
+            # the weighted sum.
+            article.duplication_score or 0.0,
+        )
+        count += 1
+        # Commit in batches of 50 to keep transactions short.
+        if count % 50 == 0:
+            db.commit()
+
+    db.commit()
+    logger.info("打分完成: %d 篇文章", count)
+    return count
@@ -0,0 +1,188 @@
+"""运行时配置管理：支持环境变量作为默认值，数据库覆盖"""
+import logging
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from sqlalchemy.orm import Session
+
+from config import settings
+from models import AppSetting
+
+logger = logging.getLogger(__name__)
+
+
+# Settings that may be edited in the Web UI. Each key maps to a description
+# and a ``sensitive`` flag; values flagged sensitive are masked by
+# list_settings when masking is enabled.
+EDITABLE_SETTINGS = {
+    "RSSKEEPER_BASE_URL": {"description": "rssKeeper 服务地址", "sensitive": False},
+    "OPENAI_API_KEY": {"description": "LLM API Key", "sensitive": True},
+    "OPENAI_BASE_URL": {"description": "LLM API 基础地址", "sensitive": False},
+    "OPENAI_MODEL": {"description": "LLM 模型名", "sensitive": False},
+    "OPENAI_TIMEOUT": {"description": "LLM 调用超时（秒）", "sensitive": False},
+    "OPENAI_MAX_RETRIES": {"description": "LLM 调用最大重试次数", "sensitive": False},
+    "SUMMARIZE_INTERVAL_MINUTES": {"description": "摘要任务间隔（分钟）", "sensitive": False},
+    "TAG_SCORE_INTERVAL_MINUTES": {"description": "分类/打分/去重任务间隔（分钟）", "sensitive": False},
+    "DAILY_BRIEF_HOUR": {"description": "每日简报生成小时", "sensitive": False},
+    "DAILY_BRIEF_MINUTE": {"description": "每日简报生成分钟", "sensitive": False},
+    "TITLE_SIMILARITY_THRESHOLD": {"description": "标题相似度阈值", "sensitive": False},
+    "CONTENT_SIMILARITY_THRESHOLD": {"description": "内容相似度阈值", "sensitive": False},
+    "MAX_AI_SUMMARY_LENGTH": {"description": "AI 摘要最大长度", "sensitive": False},
+    "MIN_ORIGINAL_SUMMARY_LENGTH": {"description": "原始摘要最小长度", "sensitive": False},
+    "BRIEF_TOP_N_PER_CATEGORY": {"description": "简报每分类显示文章数", "sensitive": False},
+    "LOG_LEVEL": {"description": "日志级别", "sensitive": False},
+    "API_TOKEN": {"description": "API 鉴权 Token（为空时不启用鉴权）", "sensitive": True},
+    "CORS_ALLOWED_ORIGINS": {"description": "CORS 允许来源（逗号分隔）", "sensitive": False},
+}
+
+
+def _get_env_default(key: str) -> str:
+    """Return ``settings.<key>`` as a string.
+
+    A missing attribute or a ``None`` value yields ``""``.
+    """
+    raw = getattr(settings, key, "")
+    if raw is None:
+        return ""
+    return str(raw)
+
+
+def _mask_sensitive(value: str) -> str:
+    """Partially mask a sensitive value for display.
+
+    Empty input gives ``""``; values of up to 8 characters are replaced by
+    asterisks entirely; longer values keep their first and last four
+    characters around ``...``.
+    """
+    if not value:
+        return ""
+    if len(value) > 8:
+        return f"{value[:4]}...{value[-4:]}"
+    return "*" * len(value)
+
+
+def init_default_settings(db: Session) -> None:
+    """Seed the settings table from the environment defaults.
+
+    Does nothing when the table already holds at least one row; otherwise
+    one row per entry of ``EDITABLE_SETTINGS`` is added and committed.
+    """
+    if db.query(AppSetting).count() > 0:
+        return
+
+    for name, meta in EDITABLE_SETTINGS.items():
+        row = AppSetting(
+            key=name,
+            value=_get_env_default(name),
+            description=meta["description"],
+            is_sensitive=meta["sensitive"],
+        )
+        db.add(row)
+
+    db.commit()
+    logger.info("已初始化默认配置项: %d 条", len(EDITABLE_SETTINGS))
+
+
+def get_setting(db: Session, key: str, default: Any = None) -> Any:
+    """Read a setting value from the database.
+
+    When no row exists for ``key``, return ``default`` if one was given,
+    otherwise the environment default for the key.
+    """
+    row = db.query(AppSetting).filter(AppSetting.key == key).first()
+    if row:
+        return row.value
+    if default is None:
+        return _get_env_default(key)
+    return default
+
+
+def get_setting_value(key: str, default: Any = None) -> Any:
+    """Read a setting through a temporary session that is always closed afterwards."""
+    from database import SessionLocal
+
+    session = SessionLocal()
+    try:
+        value = get_setting(session, key, default)
+    finally:
+        session.close()
+    return value
+
+
+def set_setting(db: Session, key: str, value: str) -> bool:
+    """Create or update one editable setting and commit.
+
+    Returns:
+        ``False`` (without touching the database) when ``key`` is not in
+        ``EDITABLE_SETTINGS``; ``True`` after the change was committed.
+    """
+    if key not in EDITABLE_SETTINGS:
+        return False
+
+    row = db.query(AppSetting).filter(AppSetting.key == key).first()
+    if not row:
+        # No row yet: create it with the catalogue's metadata.
+        meta = EDITABLE_SETTINGS[key]
+        db.add(
+            AppSetting(
+                key=key,
+                value=str(value),
+                description=meta["description"],
+                is_sensitive=meta["sensitive"],
+            )
+        )
+    else:
+        row.value = str(value)
+        row.updated_at = datetime.now(timezone.utc)
+
+    db.commit()
+    logger.info("配置已更新: %s", key)
+    return True
+
+
+def list_settings(db: Session, mask_sensitive: bool = True) -> List[Dict[str, Any]]:
+    """List every editable setting with its current value.
+
+    Args:
+        db: SQLAlchemy session used to read the stored settings.
+        mask_sensitive: Mask values flagged as sensitive and withhold
+            ``real_value``.
+
+    Returns:
+        One dict per entry of ``EDITABLE_SETTINGS`` with the keys ``key``,
+        ``value``, ``real_value``, ``description``, ``is_sensitive``,
+        ``is_masked`` and ``updated_at``. Settings without a database row
+        show their environment default.
+    """
+    db_settings = {s.key: s for s in db.query(AppSetting).all()}
+    result = []
+
+    for key, meta in EDITABLE_SETTINGS.items():
+        setting = db_settings.get(key)
+        value = setting.value if setting else _get_env_default(key)
+        is_sensitive = meta["sensitive"]
+
+        is_masked = bool(is_sensitive and mask_sensitive)
+        display_value = _mask_sensitive(value) if is_masked else value
+
+        # A row may exist without a timestamp; report None in that case
+        # instead of calling isoformat() on None.
+        updated_at = setting.updated_at if setting else None
+
+        result.append({
+            "key": key,
+            "value": display_value,
+            "real_value": value if not mask_sensitive else None,
+            "description": meta["description"],
+            "is_sensitive": is_sensitive,
+            "is_masked": is_masked,
+            "updated_at": updated_at.isoformat() if updated_at else None,
+        })
+
+    return result
+
+
+def reset_settings(db: Session) -> None:
+    """Write the environment default back into every editable setting.
+
+    Each key goes through ``set_setting``, which commits per key.
+    """
+    defaults = ((name, _get_env_default(name)) for name in EDITABLE_SETTINGS)
+    for name, default_value in defaults:
+        set_setting(db, name, default_value)
+    logger.info("配置已重置为环境变量默认值")
+
+
+def apply_db_settings_to_config(db: Session = None) -> None:
+    """Copy the stored settings onto the global ``settings`` object.
+
+    For every key in ``EDITABLE_SETTINGS`` with a non-empty stored value and
+    a matching field on the settings model, the value is converted to the
+    field's annotated type (``int``, ``float``, ``bool`` or ``Path``,
+    including their ``Optional[...]`` forms) and assigned with ``setattr``.
+
+    Args:
+        db: Session to read from; when omitted a temporary session is
+            opened and closed again.
+
+    Raises:
+        ValueError: If a stored value cannot be converted or assigned.
+    """
+    import types
+    from typing import Union, get_args, get_origin
+
+    # ``X | None`` annotations report types.UnionType where that exists.
+    union_origins = (Union, getattr(types, "UnionType", Union))
+
+    close_db = False
+    if db is None:
+        from database import SessionLocal
+        db = SessionLocal()
+        close_db = True
+    try:
+        # Read the field definitions from the model class rather than the
+        # instance.
+        model_fields = type(settings).model_fields
+        for key in EDITABLE_SETTINGS:
+            db_value = get_setting(db, key)
+            if db_value is None or db_value == "":
+                continue
+            field_info = model_fields.get(key)
+            if field_info is None:
+                continue
+            target_type = field_info.annotation
+            # Unwrap Optional[X] so the identity checks below still match.
+            if get_origin(target_type) in union_origins:
+                inner = [arg for arg in get_args(target_type) if arg is not type(None)]
+                if len(inner) == 1:
+                    target_type = inner[0]
+            try:
+                if target_type is int:
+                    converted = int(db_value)
+                elif target_type is float:
+                    converted = float(db_value)
+                elif target_type is bool:
+                    converted = db_value.lower() in ("true", "1", "yes")
+                elif target_type is Path:
+                    converted = Path(db_value)
+                else:
+                    converted = db_value
+                setattr(settings, key, converted)
+                logger.debug("已应用配置: %s=%s", key, converted)
+            except Exception as exc:
+                logger.error("应用配置 %s=%s 失败: %s", key, db_value, exc)
+                raise ValueError(f"配置项 {key} 的值无效: {db_value}") from exc
+    finally:
+        if close_db:
+            db.close()
@@ -0,0 +1,154 @@
+"""文章摘要生成器：对无摘要或短摘要文章调用 LLM 生成 AI 摘要"""
+import logging
+from datetime import datetime, timezone
+from typing import List, Dict, Any
+
+from sqlalchemy.orm import Session
+
+from app.ai_client import ai_client
+from app.rss_client import rss_client
+from config import settings
+from models import EnrichedArticle
+
+logger = logging.getLogger(__name__)
+
+
+# System prompt for summary generation. ``{max_length}`` is filled in with
+# str.format by _generate_summary.
+SUMMARY_SYSTEM_PROMPT = """你是一位擅长阅读 RSS 新闻并提炼摘要的助手。
+请用简洁流畅的中文总结文章核心内容，要求：
+1. 长度控制在 {max_length} 个汉字以内。
+2. 包含文章最重要的 1-3 个要点。
+3. 不要添加个人评价，不要复述原文标题。
+4. 若原文是英文，请用中文输出摘要。
+"""
+
+
+# User prompt template. ``{title}``, ``{author}``, ``{feed_title}`` and
+# ``{content}`` are filled in with str.format by _generate_summary.
+SUMMARY_USER_PROMPT_TEMPLATE = """请为以下文章生成摘要。
+
+标题：{title}
+作者：{author}
+来源：{feed_title}
+
+正文：
+{content}
+"""
+
+
+def _needs_summary(article: EnrichedArticle) -> bool:
+    """Decide whether an AI summary should be generated for the article.
+
+    An article needs one when it has no AI summary yet and its original
+    summary is missing or shorter than
+    ``settings.MIN_ORIGINAL_SUMMARY_LENGTH`` (after stripping whitespace).
+    """
+    # Never regenerate: an existing AI summary is final, otherwise the same
+    # article would be sent to the LLM again on every run.
+    if article.ai_summary:
+        return False
+    original = (article.original_summary or "").strip()
+    return len(original) < settings.MIN_ORIGINAL_SUMMARY_LENGTH
+
+
+def _prepare_content(raw_content: str, max_chars: int = 8000) -> str:
+    """Collapse all whitespace runs to single spaces and cut to ``max_chars``.
+
+    Falsy input yields ``""``.
+    """
+    collapsed = " ".join((raw_content or "").split())
+    return collapsed[:max_chars]
+
+
+def _generate_summary(article: EnrichedArticle) -> str:
+    """Ask the LLM for a summary of one article.
+
+    The prompt body is the cleaned article content, falling back to the
+    original summary and finally to the title.
+
+    Returns:
+        The reply cut to ``settings.MAX_AI_SUMMARY_LENGTH`` characters, or
+        ``""`` when the call fails (the failure is logged).
+    """
+    body = _prepare_content(article.content or article.original_summary or "")
+    if not body.strip():
+        # Neither content nor original summary: summarise from the title.
+        body = article.title or ""
+
+    prompts = {
+        "system_prompt": SUMMARY_SYSTEM_PROMPT.format(
+            max_length=settings.MAX_AI_SUMMARY_LENGTH
+        ),
+        "user_prompt": SUMMARY_USER_PROMPT_TEMPLATE.format(
+            title=article.title or "",
+            author=article.author or "",
+            feed_title=article.feed_title or "",
+            content=body,
+        ),
+    }
+
+    try:
+        reply = ai_client.chat_completion(temperature=0.3, **prompts)
+        return reply[: settings.MAX_AI_SUMMARY_LENGTH]
+    except Exception as exc:
+        logger.error("生成 article_id=%d 摘要失败: %s", article.rk_article_id, exc)
+        return ""
+
+
+def _article_from_rss(raw: Dict[str, Any]) -> Dict[str, Any]:
+    """Map an rssKeeper article payload to the fields of an enriched article.
+
+    A string ``published_at`` is parsed as ISO-8601 (a trailing ``Z`` is read
+    as ``+00:00``); unparsable strings become ``None``. Missing or falsy text
+    fields become ``""`` and a missing ``feed_id`` becomes ``0``.
+
+    Raises:
+        KeyError: If ``raw`` has no ``id``.
+    """
+    def text(field: str) -> str:
+        return raw.get(field, "") or ""
+
+    published_at = raw.get("published_at")
+    if isinstance(published_at, str):
+        try:
+            published_at = datetime.fromisoformat(published_at.replace("Z", "+00:00"))
+        except Exception:
+            published_at = None
+
+    record: Dict[str, Any] = {"rk_article_id": raw["id"]}
+    record["title"] = text("title")
+    record["link"] = text("link")
+    record["feed_id"] = raw.get("feed_id", 0)
+    record["feed_title"] = text("feed_title")
+    # rssKeeper's ``category`` / ``summary`` map to differently named fields.
+    record["feed_category"] = text("category")
+    record["author"] = text("author")
+    record["published_at"] = published_at
+    record["original_summary"] = text("summary")
+    record["content"] = text("content")
+    return record
+
+
+def fetch_and_summarize(db: Session, hours: int = 24, limit: int = 200) -> Dict[str, int]:
+    """
+    Fetch recent articles from rssKeeper, upsert them and add AI summaries.
+
+    Args:
+        db: SQLAlchemy session; committed every 10 processed articles and
+            once more at the end.
+        hours: Look-back window passed to ``rss_client.fetch_recent``.
+        limit: Maximum number of articles to request.
+
+    Returns:
+        ``{"fetched": x, "created": y, "summarized": z}``.
+    """
+    articles = rss_client.fetch_recent(hours=hours, limit=limit)
+    if not articles:
+        logger.info("未拉取到新文章")
+        return {"fetched": 0, "created": 0, "summarized": 0}
+
+    stats = {"fetched": len(articles), "created": 0, "summarized": 0}
+
+    for processed, raw in enumerate(articles, start=1):
+        data = _article_from_rss(raw)
+        article = db.query(EnrichedArticle).filter(
+            EnrichedArticle.rk_article_id == data["rk_article_id"]
+        ).first()
+
+        if article is None:
+            article = EnrichedArticle(**data)
+            db.add(article)
+            db.flush()
+            stats["created"] += 1
+        else:
+            # Refresh the base fields of an existing record; empty incoming
+            # values never overwrite stored ones.
+            article.title = data["title"] or article.title
+            article.link = data["link"] or article.link
+            article.feed_title = data["feed_title"] or article.feed_title
+            article.feed_category = data["feed_category"] or article.feed_category
+            article.author = data["author"] or article.author
+            article.published_at = data["published_at"] or article.published_at
+            article.original_summary = data["original_summary"] or article.original_summary
+            article.content = data["content"] or article.content
+            article.fetched_at = datetime.now(timezone.utc)
+
+        if _needs_summary(article):
+            ai_summary = _generate_summary(article)
+            if ai_summary:
+                article.ai_summary = ai_summary
+                stats["summarized"] += 1
+
+        # Commit every 10 processed articles to keep transactions short.
+        # Keyed on the loop position rather than the summarised counter,
+        # which would trigger a commit on every iteration while it sits on
+        # 0 or a multiple of 10.
+        if processed % 10 == 0:
+            db.commit()
+
+    db.commit()
+    logger.info(
+        "摘要任务完成: fetched=%d, created=%d, summarized=%d",
+        stats["fetched"], stats["created"], stats["summarized"]
+    )
+    return stats
@@ -0,0 +1,116 @@
+"""基于规则给文章分类、打标签"""
+import logging
+import re
+from typing import List, Dict, Any, Tuple
+
+from sqlalchemy.orm import Session
+
+from models import EnrichedArticle, Taxonomy
+
+logger = logging.getLogger(__name__)
+
+
+def _normalize(text: str) -> str:
+    """规范化文本用于关键词匹配"""
+    if not text:
+        return ""
+    # 去除多余空白，统一小写
+    text = " ".join(text.split())
+    return text.lower()
+
+
+def _count_matches(text: str, keywords: List[str]) -> int:
+    """统计关键词在文本中的命中次数（不区分大小写）"""
+    if not text or not keywords:
+        return 0
+    text_norm = _normalize(text)
+    count = 0
+    for kw in keywords:
+        if not kw:
+            continue
+        kw_norm = _normalize(kw)
+        # 简单子串匹配；中文关键词也适用
+        count += text_norm.count(kw_norm)
+    return count
+
+
+def classify_article(article: EnrichedArticle, categories: List[Taxonomy]) -> str:
+    """为文章选择最匹配的分类"""
+    text = " ".join([
+        article.title or "",
+        article.ai_summary or article.original_summary or "",
+        article.content or "",
+    ])
+
+    best_category = ""
+    best_score = 0
+
+    for cat in categories:
+        score = _count_matches(text, cat.keywords or [])
+        # 如果文章来自某个 Feed 分类，给予少量加成
+        if article.feed_category and article.feed_category == cat.name:
+            score += 2
+        if score > best_score:
+            best_score = score
+            best_category = cat.name
+
+    # 若完全没有命中，回退到源分类
+    if not best_category and article.feed_category:
+        best_category = article.feed_category
+
+    if not best_category:
+        best_category = "未分类"
+
+    return best_category
+
+
+def tag_article(article: EnrichedArticle, tags: List[Taxonomy]) -> List[str]:
+    """为文章打上命中的标签"""
+    text = " ".join([
+        article.title or "",
+        article.ai_summary or article.original_summary or "",
+        article.content or "",
+    ])
+
+    matched = []
+    for tag in tags:
+        if _count_matches(text, tag.keywords or []) > 0:
+            matched.append(tag.name)
+
+    # 去重并保持顺序
+    return list(dict.fromkeys(matched))
+
+
+def tag_articles(db: Session, article_ids: List[int] = None) -> int:
+    """
+    对文章进行分类和打标签。
+    若指定 article_ids 则只处理这些文章；否则处理所有未分类或没有标签的文章。
+    返回处理数量。
+    """
+    categories = db.query(Taxonomy).filter(Taxonomy.kind == "category").all()
+    tags = db.query(Taxonomy).filter(Taxonomy.kind == "tag").all()
+
+    if not categories:
+        logger.warning("taxonomy 中无 category 数据，跳过分类")
+        return 0
+
+    query = db.query(EnrichedArticle)
+    if article_ids:
+        query = query.filter(EnrichedArticle.id.in_(article_ids))
+    else:
+        query = query.filter(
+            (EnrichedArticle.category == "") | (EnrichedArticle.category == None)
+        )
+
+    articles = query.all()
+    count = 0
+    for article in articles:
+        article.category = classify_article(article, categories)
+        article.tags = tag_article(article, tags)
+        count += 1
+        if count % 50 == 0:
+            db.commit()
+
+    db.commit()
+    logger.info("分类/打标签完成: %d 篇文章", count)
+    return count
@@ -0,0 +1,140 @@
+"""分类/标签/打分规则体系的初始化与维护"""
+import json
+import logging
+from typing import List, Dict, Any
+
+from sqlalchemy.orm import Session
+
+from app.ai_client import ai_client
+from app.rss_client import rss_client
+from models import Taxonomy
+
+logger = logging.getLogger(__name__)
+
+
+# System prompt used by bootstrap_taxonomy when asking the LLM to design the
+# taxonomy. It requests a single JSON object whose top-level keys
+# ("categories", "tags", "heat_rules", "importance_rules",
+# "duplication_indicators") are the ones _save_taxonomy reads, so the key
+# names here and there must stay in sync.
+TAXONOMY_SYSTEM_PROMPT = """你是一位专业的信息分类与内容分析专家。
+请根据用户提供的 RSS 文章样本，生成一套适合的中文内容分类体系、标签体系和打分规则。
+
+输出必须是合法的 JSON，格式如下：
+{
+  "categories": [
+    {"name": "科技", "description": "人工智能、芯片、互联网、软件等", "keywords": ["AI", "芯片", "大模型", ...]}
+  ],
+  "tags": [
+    {"name": "人工智能", "description": "...", "keywords": ["AI", "人工智能", "大模型", ...]}
+  ],
+  "heat_rules": [
+    {"name": "热点事件", "keywords": ["突发", "重磅", "刚刚", "发布"], "weight": 1.5}
+  ],
+  "importance_rules": [
+    {"name": "政策法规", "keywords": ["政策", "监管", "法规", "征求意见"], "weight": 1.5}
+  ],
+  "duplication_indicators": [
+    {"name": "同一事件", "keywords": ["宣布", "发布", "推出"], "weight": 1.0}
+  ]
+}
+
+要求：
+1. categories 数量控制在 8-12 个，覆盖科技、财经、新闻、设计、生活等常见 RSS 主题。
+2. tags 数量控制在 30-50 个，尽量细化但避免过度重叠。
+3. heat_rules 和 importance_rules 各 10-20 条，weight 范围 0.5-2.0。
+4. 所有 keywords 用中文或中英双语，便于后续关键词匹配。
+5. 不要输出任何解释文字，只输出 JSON。
+"""
+
+
+def _build_sample_prompt(articles: List[Dict[str, Any]]) -> str:
+    lines = [f"共有 {len(articles)} 篇文章样本："]
+    for idx, art in enumerate(articles[:50], 1):
+        title = art.get("title", "")
+        summary = art.get("summary", "") or art.get("content", "")[:300]
+        feed = art.get("feed_title", "")
+        cat = art.get("category", "")
+        lines.append(f"\n[{idx}] 标题：{title}")
+        lines.append(f"    来源：{feed} | 源分类：{cat}")
+        lines.append(f"    摘要：{summary[:400]}")
+    return "\n".join(lines)
+
+
+def bootstrap_taxonomy(db: Session, force: bool = False) -> bool:
+    """
+    初始化分类/标签/打分规则。
+    若 force=True 则清空后重建；否则仅在表为空时初始化。
+    """
+    existing = db.query(Taxonomy).first()
+    if existing and not force:
+        logger.info("taxonomy 表已存在，跳过初始化")
+        return False
+
+    if force:
+        db.query(Taxonomy).delete()
+        db.commit()
+        logger.info("强制重新初始化 taxonomy")
+
+    logger.info("开始从 rssKeeper 拉取样本文章并生成分类体系...")
+    articles = rss_client.fetch_recent(hours=24 * 7, limit=200)
+    if not articles:
+        logger.warning("未获取到样本文章，无法生成分类体系")
+        return False
+
+    user_prompt = _build_sample_prompt(articles)
+    try:
+        result = ai_client.chat_completion_json(
+            system_prompt=TAXONOMY_SYSTEM_PROMPT,
+            user_prompt=user_prompt,
+            temperature=0.5,
+        )
+    except Exception as exc:
+        logger.error("生成分类体系失败: %s", exc)
+        return False
+
+    _save_taxonomy(db, result)
+    logger.info("taxonomy 初始化完成，共写入 %d 条规则", db.query(Taxonomy).count())
+    return True
+
+
+def _save_taxonomy(db: Session, data: Dict[str, Any]) -> None:
+    """把 LLM 返回的分类体系写入数据库"""
+
+    def _add(kind: str, items: List[Dict[str, Any]], default_weight: float = 1.0):
+        for item in items:
+            name = item.get("name", "").strip()
+            if not name:
+                continue
+            keywords = item.get("keywords", [])
+            if isinstance(keywords, str):
+                keywords = [keywords]
+            db.add(
+                Taxonomy(
+                    name=name,
+                    kind=kind,
+                    description=item.get("description", ""),
+                    keywords=keywords,
+                    weight=float(item.get("weight", default_weight)),
+                    created_by_ai=True,
+                )
+            )
+
+    _add("category", data.get("categories", []))
+    _add("tag", data.get("tags", []))
+    _add("heat_rule", data.get("heat_rules", []), default_weight=1.0)
+    _add("importance_rule", data.get("importance_rules", []), default_weight=1.0)
+    _add("duplication_rule", data.get("duplication_indicators", []), default_weight=1.0)
+
+    db.commit()
+
+
+def ensure_taxonomy(db: Session) -> bool:
+    """确保 taxonomy 表非空，若为空则触发初始化"""
+    existing = db.query(Taxonomy).first()
+    if existing:
+        return True
+    return bootstrap_taxonomy(db)
+
+
+def list_taxonomy(db: Session, kind: str = None) -> List[Taxonomy]:
+    """列出分类体系规则"""
+    query = db.query(Taxonomy)
+    if kind:
+        query = query.filter(Taxonomy.kind == kind)
+    return query.order_by(Taxonomy.kind, Taxonomy.name).all()