feat: 修复代码审核报告问题

This commit is contained in:
congsh
2026-06-12 16:04:03 +08:00
commit bae47a2411
46 changed files with 6231 additions and 0 deletions
View File
+92
View File
@@ -0,0 +1,92 @@
"""LLM API 客户端,兼容 OpenAI API 格式"""
import json
import logging
from typing import Optional
from openai import OpenAI, APIError
from config import settings
logger = logging.getLogger(__name__)
class AIClient:
    """Thin wrapper around an OpenAI-compatible chat-completions client.

    Constructor arguments left as ``None`` fall back to the matching
    ``settings.OPENAI_*`` value. The underlying ``OpenAI`` client is built
    lazily the first time :attr:`client` is accessed.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        base_url: Optional[str] = None,
        model: Optional[str] = None,
        timeout: Optional[int] = None,
        max_retries: Optional[int] = None,
    ):
        self.api_key = api_key or settings.OPENAI_API_KEY
        self.base_url = base_url or settings.OPENAI_BASE_URL
        self.model = model or settings.OPENAI_MODEL
        self.timeout = timeout or settings.OPENAI_TIMEOUT
        # Compare against None instead of testing truthiness so that an
        # explicit ``max_retries=0`` (retries disabled) is honoured rather
        # than silently replaced by the configured default.
        self.max_retries = (
            settings.OPENAI_MAX_RETRIES if max_retries is None else max_retries
        )
        self._client: Optional["OpenAI"] = None

    @property
    def client(self) -> "OpenAI":
        """Return the cached ``OpenAI`` client, creating it on first use."""
        if self._client is None:
            self._client = OpenAI(
                api_key=self.api_key,
                base_url=self.base_url,
                timeout=self.timeout,
                max_retries=self.max_retries,
            )
        return self._client

    def chat_completion(
        self,
        system_prompt: str,
        user_prompt: str,
        temperature: float = 0.3,
        json_mode: bool = False,
    ) -> str:
        """Send one system + user exchange and return the reply text.

        Args:
            system_prompt: Content of the ``system`` message.
            user_prompt: Content of the ``user`` message.
            temperature: Sampling temperature forwarded to the API.
            json_mode: When true, request ``response_format=json_object``.

        Returns:
            The stripped message content, or ``""`` when the response has no
            choices or the content is empty.

        Raises:
            APIError: Logged and re-raised when the API call fails.
        """
        request = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            "temperature": temperature,
        }
        if json_mode:
            request["response_format"] = {"type": "json_object"}
        try:
            resp = self.client.chat.completions.create(**request)
        except APIError as exc:
            logger.error("LLM API 调用失败: %s", exc)
            raise
        # A response without choices is treated like an empty reply instead
        # of failing with an IndexError.
        if not resp.choices:
            return ""
        return (resp.choices[0].message.content or "").strip()

    def chat_completion_json(
        self,
        system_prompt: str,
        user_prompt: str,
        temperature: float = 0.3,
    ) -> dict:
        """Call the model in JSON mode and parse the reply.

        Returns:
            The value produced by ``json.loads`` on the reply text.

        Raises:
            json.JSONDecodeError: Logged (with the first 500 characters of
                the reply) and re-raised when the reply is not valid JSON.
            APIError: Propagated from :meth:`chat_completion`.
        """
        content = self.chat_completion(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            temperature=temperature,
            json_mode=True,
        )
        try:
            return json.loads(content)
        except json.JSONDecodeError as exc:
            logger.error("LLM 返回不是合法 JSON: %s - content=%s", exc, content[:500])
            raise
# Shared module-level client. It is built without arguments, so every option
# is taken from `settings`.
ai_client = AIClient()
+168
View File
@@ -0,0 +1,168 @@
"""每日简报生成"""
import json
import logging
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Dict, Any, List
from sqlalchemy.orm import Session
from config import settings
from models import EnrichedArticle, DailyBrief
logger = logging.getLogger(__name__)
def _format_article(article: EnrichedArticle) -> Dict[str, Any]:
    """Convert an article record into the plain dict stored in a brief.

    Text fields fall back to ``""`` and ``tags`` to ``[]``. ``summary``
    prefers the AI summary over the original one, and ``published_at`` is
    rendered as ISO-8601 text (``None`` when the article has no date).
    """
    published = article.published_at
    entry: Dict[str, Any] = {
        "id": article.id,
        "rk_article_id": article.rk_article_id,
    }
    for text_field in ("title", "link", "author", "feed_title"):
        entry[text_field] = getattr(article, text_field) or ""
    entry["summary"] = article.ai_summary or article.original_summary or ""
    entry["tags"] = article.tags or []
    for score_field in (
        "heat_score",
        "importance_score",
        "duplication_score",
        "composite_score",
    ):
        entry[score_field] = getattr(article, score_field)
    entry["published_at"] = published.isoformat() if published else None
    return entry
def _build_markdown(date_str: str, by_category: Dict[str, List[Dict[str, Any]]], stats: Dict[str, int]) -> str:
"""生成 Markdown 简报"""
lines = [
f"# RSS 每日简报 ({date_str})",
"",
f"- 去重前文章数: {stats['total_articles']}",
f"- 去重后文章数: {stats['unique_articles']}",
f"- 生成分类数: {len(by_category)}",
"",
"---",
"",
]
for category, items in sorted(by_category.items(), key=lambda x: x[0]):
lines.append(f"## {category}")
lines.append("")
for item in items:
tags = " ".join([f"`{t}`" for t in item["tags"]]) if item["tags"] else ""
lines.append(f"### {item['title']}")
lines.append(f"- 来源: {item['feed_title']} | 作者: {item.get('author') or '未知'}")
lines.append(f"- 标签: {tags}")
lines.append(f"- 热度: {item['heat_score']:.1f} | 重要性: {item['importance_score']:.1f} | 重复度: {item['duplication_score']:.1f} | 综合: {item['composite_score']:.1f}")
if item["summary"]:
lines.append(f"- 摘要: {item['summary']}")
if item["link"]:
lines.append(f"- [阅读原文]({item['link']})")
lines.append("")
return "\n".join(lines)
def generate_daily_brief(db: Session, date_str: str = None, force: bool = False) -> Dict[str, Any]:
    """Generate (or reuse) the daily brief for one date.

    Args:
        db: Database session used for all reads and writes.
        date_str: Date in ``YYYY-MM-DD`` form; defaults to today's UTC date.
        force: Regenerate even when a brief already exists for the date.

    Returns:
        A dict with ``date``, ``total_articles``, ``unique_articles``,
        ``by_category`` and ``markdown_path``. The same keys are returned
        whether the brief was just generated or already existed.

    Raises:
        ValueError: If ``date_str`` does not match ``%Y-%m-%d``.
    """
    if date_str is None:
        date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    # Reuse an existing brief unless the caller forces regeneration.
    existing = db.query(DailyBrief).filter(DailyBrief.brief_date == date_str).first()
    if existing and not force:
        logger.info("日期 %s 简报已存在,跳过生成", date_str)
        return {
            "date": date_str,
            "total_articles": existing.total_articles,
            "unique_articles": existing.unique_articles,
            # Included so that both return paths expose the same keys.
            "by_category": existing.by_category,
            "markdown_path": existing.markdown_path,
        }

    day_start = datetime.strptime(date_str, "%Y-%m-%d")
    day_end = day_start + timedelta(days=1)

    # All articles fetched within the day.
    query = db.query(EnrichedArticle).filter(
        EnrichedArticle.fetched_at >= day_start,
        EnrichedArticle.fetched_at < day_end,
    )
    # Keep group representatives plus articles that belong to no duplicate
    # group, best composite score first.
    representative_articles = (
        query.filter(
            (EnrichedArticle.is_representative == True)
            | (EnrichedArticle.duplicate_group_id == None)
        )
        .order_by(EnrichedArticle.composite_score.desc())
        .all()
    )

    # Group by category; the score ordering from the query is preserved.
    by_category: Dict[str, List[Dict[str, Any]]] = {}
    for art in representative_articles:
        by_category.setdefault(art.category or "未分类", []).append(_format_article(art))

    # Only the top N entries of each category make it into the brief.
    top_n = settings.BRIEF_TOP_N_PER_CATEGORY
    by_category = {cat: items[:top_n] for cat, items in by_category.items()}

    stats = {
        "total_articles": query.count(),
        "unique_articles": sum(len(items) for items in by_category.values()),
    }

    # Write the Markdown file.
    output_dir = settings.brief_output_dir_path / date_str
    output_dir.mkdir(parents=True, exist_ok=True)
    markdown_path = output_dir / "daily-brief.md"
    markdown_path.write_text(_build_markdown(date_str, by_category, stats), encoding="utf-8")

    # Stamp every selected article with the brief date.
    for art in representative_articles:
        art.brief_date = date_str

    brief_data = {
        "date": date_str,
        "total_articles": stats["total_articles"],
        "unique_articles": stats["unique_articles"],
        "by_category": by_category,
        "markdown_path": str(markdown_path),
    }
    if existing:
        existing.total_articles = stats["total_articles"]
        existing.unique_articles = stats["unique_articles"]
        existing.by_category = by_category
        existing.markdown_path = str(markdown_path)
        existing.updated_at = datetime.now(timezone.utc)
    else:
        db.add(
            DailyBrief(
                brief_date=date_str,
                total_articles=stats["total_articles"],
                unique_articles=stats["unique_articles"],
                by_category=by_category,
                markdown_path=str(markdown_path),
            )
        )
    db.commit()
    logger.info("简报生成完成: 日期=%s, 去重前=%d, 去重后=%d", date_str, stats["total_articles"], stats["unique_articles"])
    return brief_data
+223
View File
@@ -0,0 +1,223 @@
"""文章去重:URL 精确去重 + 标题/内容相似度去重"""
import logging
import re
from datetime import datetime, timedelta, timezone
from difflib import SequenceMatcher
from typing import List, Dict, Tuple, Set
from sqlalchemy.orm import Session
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from config import settings
from models import EnrichedArticle, DuplicateGroup
logger = logging.getLogger(__name__)
def _normalize_title(title: str) -> str:
"""标题规范化:去除标点和多余空格,小写,保留中英文数字"""
if not title:
return ""
# 保留:单词字符、CJK 统一表意符号(含扩展 A/B/C/D/E)
title = re.sub(
r"[^\w一-鿿㐀-䶿\U00020000-\U0002a6df\U0002a700-\U0002b73f\U0002b740-\U0002b81f]",
" ",
title,
)
title = " ".join(title.split())
return title.lower()
def _title_similarity(a: str, b: str) -> float:
    """Return the ``SequenceMatcher`` ratio of the two normalised titles.

    The result is ``0.0`` when either title normalises to an empty string.
    """
    left, right = _normalize_title(a), _normalize_title(b)
    if left and right:
        return SequenceMatcher(None, left, right).ratio()
    return 0.0
def _content_similarity_matrix(contents: List[str]) -> np.ndarray:
    """Return the pairwise TF-IDF cosine-similarity matrix of ``contents``.

    Fewer than two documents, or any failure while vectorising, yields an
    all-zero square matrix of matching size.
    """
    size = len(contents)
    if size < 2:
        return np.zeros((size, size))
    # Falsy entries (None, "") become empty documents; nothing is dropped,
    # so row/column indices keep matching the input positions.
    documents = [text if text else "" for text in contents]
    try:
        tfidf = TfidfVectorizer(
            max_features=5000,
            stop_words="english",
            ngram_range=(1, 2),
            min_df=1,
        ).fit_transform(documents)
        return cosine_similarity(tfidf)
    except Exception as exc:
        logger.warning("TF-IDF 相似度计算失败: %s", exc)
        return np.zeros((size, size))
def _find_duplicate_clusters(
    articles: List[EnrichedArticle],
    title_threshold: float = None,
    content_threshold: float = None,
) -> List[Set[int]]:
    """Group articles into duplicate clusters by title or content similarity.

    Two articles are linked when their title similarity reaches
    ``title_threshold`` or their content similarity reaches
    ``content_threshold``; clusters are the connected components of those
    links, found with a breadth-first search.

    Args:
        articles: Candidate articles.
        title_threshold: Defaults to ``settings.TITLE_SIMILARITY_THRESHOLD``
            when ``None``.
        content_threshold: Defaults to
            ``settings.CONTENT_SIMILARITY_THRESHOLD`` when ``None``.

    Returns:
        A list of index sets into ``articles``; only clusters with more than
        one member are returned.
    """
    from collections import deque  # local import: O(1) pops from the left

    # ``is None`` (not ``or``) so that an explicit 0.0 threshold is respected.
    if title_threshold is None:
        title_threshold = settings.TITLE_SIMILARITY_THRESHOLD
    if content_threshold is None:
        content_threshold = settings.CONTENT_SIMILARITY_THRESHOLD

    n = len(articles)
    if n < 2:
        return []

    contents = []
    for art in articles:
        text = " ".join([
            art.title or "",
            art.ai_summary or art.original_summary or "",
            art.content or "",
        ])
        contents.append(text[:2000])  # cap the length to keep TF-IDF fast
    content_sim = _content_similarity_matrix(contents)

    visited = [False] * n
    clusters: List[Set[int]] = []
    for start in range(n):
        if visited[start]:
            continue
        visited[start] = True
        cluster = {start}
        queue = deque([start])
        while queue:
            cur = queue.popleft()
            cur_title = articles[cur].title or ""
            for j in range(n):
                # ``cur`` is always visited already, so this also skips j == cur.
                if visited[j]:
                    continue
                # Check the precomputed matrix first; the SequenceMatcher
                # comparison only runs when content alone does not match.
                is_duplicate = (
                    content_sim[cur][j] >= content_threshold
                    or _title_similarity(cur_title, articles[j].title or "") >= title_threshold
                )
                if is_duplicate:
                    visited[j] = True
                    cluster.add(j)
                    queue.append(j)
        if len(cluster) > 1:
            clusters.append(cluster)
    return clusters
def _pick_representative(articles: List[EnrichedArticle], indices: Set[int]) -> EnrichedArticle:
"""从重复组中选择代表文章:优先选有 AI 摘要、来源 Feed 分类明确、发布时间最早的"""
candidates = [articles[i] for i in indices]
# 排序:有 AI 摘要优先,然后有 Feed 分类,然后发布时间早
candidates.sort(
key=lambda a: (
bool(a.ai_summary),
bool(a.feed_category),
a.published_at or datetime.min,
),
reverse=True,
)
return candidates[0]
def deduplicate_articles(
    db: Session,
    date_str: str = None,
    title_threshold: float = None,
    content_threshold: float = None,
) -> Dict[str, int]:
    """Rebuild the duplicate groups for the articles fetched on one date.

    Articles sharing a link with an earlier article are attached to that
    article's group, so they are flagged as non-representative duplicates
    rather than left ungrouped. The remaining articles are clustered by
    title/content similarity.

    Args:
        db: Database session.
        date_str: ``YYYY-MM-DD``; defaults to today's UTC date.
        title_threshold: Forwarded to ``_find_duplicate_clusters``.
        content_threshold: Forwarded to ``_find_duplicate_clusters``.

    Returns:
        ``{"total": ..., "duplicate_groups": ..., "representatives": ...}``.

    Raises:
        ValueError: If ``date_str`` does not match ``%Y-%m-%d``.
    """
    if date_str is None:
        date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    day_start = datetime.strptime(date_str, "%Y-%m-%d")
    day_end = day_start + timedelta(days=1)

    # Remove only the groups of this date so that history stays intact, and
    # clear the flags on their member articles.
    old_groups = db.query(DuplicateGroup).filter(DuplicateGroup.brief_date == date_str).all()
    for old_group in old_groups:
        for member in old_group.articles:
            member.duplicate_group_id = None
            member.is_representative = False
        db.delete(old_group)
    db.commit()

    articles = (
        db.query(EnrichedArticle)
        .filter(
            EnrichedArticle.fetched_at >= day_start,
            EnrichedArticle.fetched_at < day_end,
        )
        .order_by(EnrichedArticle.published_at)
        .all()
    )
    if not articles:
        logger.info("日期 %s 无文章可去重", date_str)
        return {"total": 0, "duplicate_groups": 0, "representatives": 0}

    # URL pass: the first article seen for a link is kept for similarity
    # clustering; later ones are remembered against its index.
    unique_articles: List[EnrichedArticle] = []
    first_index_by_link: Dict[str, int] = {}
    url_duplicates: Dict[int, List[EnrichedArticle]] = {}
    for art in articles:
        link = (art.link or "").strip()
        first_index = first_index_by_link.get(link) if link else None
        if first_index is not None:
            url_duplicates.setdefault(first_index, []).append(art)
            continue
        if link:
            first_index_by_link[link] = len(unique_articles)
        unique_articles.append(art)
    url_dup_count = sum(len(dups) for dups in url_duplicates.values())

    clusters = _find_duplicate_clusters(
        unique_articles,
        title_threshold=title_threshold,
        content_threshold=content_threshold,
    )
    # An article that only has URL duplicates still needs a group of its own,
    # otherwise its duplicates would remain unflagged.
    clustered: Set[int] = set()
    for cluster in clusters:
        clustered |= cluster
    clusters.extend({idx} for idx in url_duplicates if idx not in clustered)

    stats = {"total": len(articles), "duplicate_groups": len(clusters), "representatives": 0}
    for cluster in clusters:
        representative = _pick_representative(unique_articles, cluster)
        members: List[EnrichedArticle] = []
        for idx in cluster:
            members.append(unique_articles[idx])
            members.extend(url_duplicates.get(idx, []))
        group = DuplicateGroup(
            representative_article_id=representative.id,
            member_article_ids=[member.id for member in members],
            similarity_matrix={},  # left empty; can be filled in later
            brief_date=date_str,
        )
        db.add(group)
        db.flush()  # assigns group.id before it is written onto the members
        for member in members:
            member.duplicate_group_id = group.id
            member.is_representative = (member.id == representative.id)
        stats["representatives"] += 1
    db.commit()
    logger.info(
        "去重完成: 日期=%s, 总文章=%d, 重复组=%d, URL 重复=%d",
        date_str, stats["total"], stats["duplicate_groups"], url_dup_count
    )
    return stats
+104
View File
@@ -0,0 +1,104 @@
"""调用 rssKeeper 外部 API"""
from datetime import datetime, timedelta
from typing import List, Optional, Dict, Any
import logging
import requests
from config import settings
logger = logging.getLogger(__name__)
class RSSKeeperClient:
    """HTTP client for the rssKeeper external API."""

    def __init__(self, base_url: Optional[str] = None, timeout: int = 30):
        root = base_url or settings.RSSKEEPER_BASE_URL
        # Trailing slashes are dropped because every path starts with "/".
        self.base_url = root.rstrip("/")
        self.timeout = timeout

    def _get(self, path: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """GET ``base_url + path`` and return the decoded JSON body.

        Any ``requests.RequestException`` is logged and re-raised.
        """
        url = f"{self.base_url}{path}"
        try:
            response = requests.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()
            payload = response.json()
        except requests.RequestException as exc:
            logger.error("请求 rssKeeper 失败: %s - %s", url, exc)
            raise
        return payload

    def fetch_recent(
        self,
        hours: int = 24,
        limit: int = 200,
        feed_id: Optional[int] = None,
        category: Optional[str] = None,
        search: Optional[str] = None,
        unread_only: bool = False,
    ) -> List[Dict[str, Any]]:
        """Return the articles of the last ``hours`` hours.

        Optional filters are only sent when they are not ``None``. The
        ``articles`` list of the response is returned (``[]`` if absent).
        """
        params = {"hours": hours, "limit": limit, "unread_only": unread_only}
        optional = (("feed_id", feed_id), ("category", category), ("search", search))
        params.update({name: value for name, value in optional if value is not None})
        return self._get("/api/v1/external/recent", params=params).get("articles", [])

    def fetch_by_date(self, date: str, category: Optional[str] = None) -> Dict[str, Any]:
        """Return the aggregated article summary for one date."""
        params: Dict[str, Any] = {"date": date}
        if category is not None:
            params["category"] = category
        return self._get("/api/v1/external/summary", params=params)

    def fetch_feeds(
        self,
        health_status: Optional[str] = None,
        category: Optional[str] = None,
        error_type: Optional[str] = None,
        is_active: Optional[bool] = True,
    ) -> List[Dict[str, Any]]:
        """Return the feed list, filtered by whichever arguments are not ``None``."""
        candidates = (
            ("health_status", health_status),
            ("category", category),
            ("error_type", error_type),
            ("is_active", is_active),
        )
        params: Dict[str, Any] = {
            name: value for name, value in candidates if value is not None
        }
        return self._get("/api/v1/external/feeds", params=params).get("feeds", [])

    def fulltext_search(
        self,
        q: str,
        limit: int = 50,
        offset: int = 0,
        category: Optional[str] = None,
        feed_id: Optional[int] = None,
    ) -> Dict[str, Any]:
        """Run a full-text article search and return the raw response body."""
        params: Dict[str, Any] = {"q": q, "limit": limit, "offset": offset}
        for name, value in (("category", category), ("feed_id", feed_id)):
            if value is not None:
                params[name] = value
        return self._get("/api/v1/external/search", params=params)
# Shared module-level client; with no arguments the base URL comes from
# settings.RSSKEEPER_BASE_URL and the timeout is the 30-second default.
rss_client = RSSKeeperClient()
+147
View File
@@ -0,0 +1,147 @@
"""基于规则计算文章热度、重要性、重复性分数"""
import logging
import math
from datetime import datetime, timedelta, timezone
from typing import List
from sqlalchemy.orm import Session
from config import settings
from models import EnrichedArticle, Taxonomy
from app.tagger import _count_matches, _normalize
logger = logging.getLogger(__name__)
# Composite score weights: heat 30%, importance 50%, duplication 20%.
COMPOSITE_WEIGHT_HEAT = 0.3
COMPOSITE_WEIGHT_IMPORTANCE = 0.5
COMPOSITE_WEIGHT_DUPLICATION = 0.2
def _build_text(article: EnrichedArticle) -> str:
    """Join title, summary (AI first, else original) and content with spaces."""
    title = article.title or ""
    summary = article.ai_summary or article.original_summary or ""
    body = article.content or ""
    return " ".join((title, summary, body))
def _score_by_rules(article: EnrichedArticle, rules: List[Taxonomy]) -> float:
    """Score an article by keyword hits against weighted rules.

    Each rule with at least one hit contributes
    ``min(hits, 5) * rule.weight * 10``; the total is capped at 100.
    Blank text or an empty rule list scores ``0.0``.
    """
    text = _build_text(article)
    if not (text.strip() and rules):
        return 0.0
    total = 0.0
    for rule in rules:
        hits = _count_matches(text, rule.keywords or [])
        if hits <= 0:
            continue
        # Hits per rule are capped at 5 so a single rule cannot dominate.
        total += min(hits, 5) * rule.weight * 10
    return min(total, 100.0)
def _freshness_score(article: EnrichedArticle) -> float:
    """Return the freshness bonus derived from the publication time.

    20 points up to 24 hours old, falling linearly to 0 at 72 hours.
    Articles without a publication time get 0; future timestamps count as
    age zero.
    """
    now = datetime.now(timezone.utc)
    published = article.published_at
    if not published:
        return 0.0
    # A naive timestamp read back from the database is treated as UTC.
    if published.tzinfo is None:
        published = published.replace(tzinfo=timezone.utc)
    age_hours = (now - published).total_seconds() / 3600
    if age_hours < 0:
        age_hours = 0
    if age_hours <= 24:
        return 20.0
    if age_hours >= 72:
        return 0.0
    return 20.0 * (1 - (age_hours - 24) / 48)
def compute_heat_score(article: EnrichedArticle, heat_rules: List[Taxonomy]) -> float:
    """Heat score: rule keyword hits plus the freshness bonus, capped at 100."""
    keyword_part = _score_by_rules(article, heat_rules)
    freshness_part = _freshness_score(article)
    combined = keyword_part + freshness_part
    return min(combined, 100.0)
def compute_importance_score(article: EnrichedArticle, importance_rules: List[Taxonomy]) -> float:
    """Importance score: keyword hits against the importance rules only."""
    importance = _score_by_rules(article, importance_rules)
    return importance
def compute_duplication_score(duplicate_count: int, max_count: int = 5) -> float:
    """Map an occurrence count onto a 0-100 duplication score.

    Args:
        duplicate_count: How many times the topic was seen.
        max_count: Count at which the score saturates.

    Returns:
        ``0.0`` for a count of 1 or less, ``100.0`` for counts at or above
        ``max_count``, and a linear interpolation in between.
    """
    if duplicate_count <= 1:
        return 0.0
    # ``max_count <= 1`` would divide by zero (or flip the sign) in the
    # interpolation below; any count above 1 then already saturates.
    if max_count <= 1 or duplicate_count >= max_count:
        return 100.0
    return (duplicate_count - 1) / (max_count - 1) * 100.0
def compute_composite_score(heat: float, importance: float, duplication: float) -> float:
    """Return the weighted sum of the three scores, rounded to two decimals."""
    heat_part = heat * COMPOSITE_WEIGHT_HEAT
    importance_part = importance * COMPOSITE_WEIGHT_IMPORTANCE
    duplication_part = duplication * COMPOSITE_WEIGHT_DUPLICATION
    total = heat_part + importance_part + duplication_part
    return round(total, 2)
def score_articles(
    db: Session,
    article_ids: List[int] = None,
    update_duplication: bool = False,
) -> int:
    """Compute heat, importance and composite scores for articles.

    Args:
        db: Database session.
        article_ids: When truthy, only these article ids are scored;
            otherwise every article is.
            NOTE(review): an empty list is falsy and therefore also selects
            every article - confirm callers expect that.
        update_duplication: Also refresh ``duplication_score`` from the
            article's duplicate group.

    Returns:
        The number of articles processed.
    """

    def _rules_of(kind: str):
        return db.query(Taxonomy).filter(Taxonomy.kind == kind).all()

    heat_rules = _rules_of("heat_rule")
    importance_rules = _rules_of("importance_rule")

    query = db.query(EnrichedArticle)
    if article_ids:
        query = query.filter(EnrichedArticle.id.in_(article_ids))

    count = 0
    for count, article in enumerate(query.all(), start=1):
        article.heat_score = compute_heat_score(article, heat_rules)
        article.importance_score = compute_importance_score(article, importance_rules)
        if update_duplication:
            group = article.duplicate_group if article.duplicate_group_id else None
            member_ids = group.member_article_ids if group else None
            # The count passed on is the group size minus one.
            # NOTE(review): compute_duplication_score returns 0 for a count
            # of 1, so a two-member group scores 0 - confirm this is intended.
            dup_count = max(len(member_ids) - 1, 0) if member_ids else 0
            article.duplication_score = compute_duplication_score(dup_count)
        article.composite_score = compute_composite_score(
            article.heat_score,
            article.importance_score,
            article.duplication_score,
        )
        # Commit in batches of 50.
        if count % 50 == 0:
            db.commit()
    db.commit()
    logger.info("打分完成: %d 篇文章", count)
    return count
+188
View File
@@ -0,0 +1,188 @@
"""运行时配置管理:支持环境变量作为默认值,数据库覆盖"""
import logging
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
from sqlalchemy.orm import Session
from config import settings
from models import AppSetting
logger = logging.getLogger(__name__)
# Settings that can be edited from the Web UI. Each key maps to a
# human-readable description and a flag marking the value as sensitive
# (sensitive values are masked by list_settings when masking is requested).
EDITABLE_SETTINGS = {
"RSSKEEPER_BASE_URL": {"description": "rssKeeper 服务地址", "sensitive": False},
"OPENAI_API_KEY": {"description": "LLM API Key", "sensitive": True},
"OPENAI_BASE_URL": {"description": "LLM API 基础地址", "sensitive": False},
"OPENAI_MODEL": {"description": "LLM 模型名", "sensitive": False},
"OPENAI_TIMEOUT": {"description": "LLM 调用超时(秒)", "sensitive": False},
"OPENAI_MAX_RETRIES": {"description": "LLM 调用最大重试次数", "sensitive": False},
"SUMMARIZE_INTERVAL_MINUTES": {"description": "摘要任务间隔(分钟)", "sensitive": False},
"TAG_SCORE_INTERVAL_MINUTES": {"description": "分类/打分/去重任务间隔(分钟)", "sensitive": False},
"DAILY_BRIEF_HOUR": {"description": "每日简报生成小时", "sensitive": False},
"DAILY_BRIEF_MINUTE": {"description": "每日简报生成分钟", "sensitive": False},
"TITLE_SIMILARITY_THRESHOLD": {"description": "标题相似度阈值", "sensitive": False},
"CONTENT_SIMILARITY_THRESHOLD": {"description": "内容相似度阈值", "sensitive": False},
"MAX_AI_SUMMARY_LENGTH": {"description": "AI 摘要最大长度", "sensitive": False},
"MIN_ORIGINAL_SUMMARY_LENGTH": {"description": "原始摘要最小长度", "sensitive": False},
"BRIEF_TOP_N_PER_CATEGORY": {"description": "简报每分类显示文章数", "sensitive": False},
"LOG_LEVEL": {"description": "日志级别", "sensitive": False},
"API_TOKEN": {"description": "API 鉴权 Token(为空时不启用鉴权)", "sensitive": True},
"CORS_ALLOWED_ORIGINS": {"description": "CORS 允许来源(逗号分隔)", "sensitive": False},
}
def _get_env_default(key: str) -> str:
    """Return the value of ``settings.<key>`` as text.

    A missing attribute or a ``None`` value yields ``""``.
    """
    value = getattr(settings, key, "")
    if value is None:
        return ""
    return str(value)
def _mask_sensitive(value: str) -> str:
"""对敏感值做部分脱敏"""
if not value:
return ""
if len(value) <= 8:
return "*" * len(value)
return value[:4] + "..." + value[-4:]
def init_default_settings(db: Session) -> None:
    """Seed the settings table from ``settings`` when it is empty.

    Nothing is written if the table already holds at least one row.
    """
    if db.query(AppSetting).count() > 0:
        return
    for key, meta in EDITABLE_SETTINGS.items():
        row = AppSetting(
            key=key,
            value=_get_env_default(key),
            description=meta["description"],
            is_sensitive=meta["sensitive"],
        )
        db.add(row)
    db.commit()
    logger.info("已初始化默认配置项: %d", len(EDITABLE_SETTINGS))
def get_setting(db: Session, key: str, default: Any = None) -> Any:
    """Read one setting, preferring the database row.

    When no row exists, ``default`` is returned if it is not ``None``;
    otherwise the value from ``settings`` (as text) is returned.
    """
    row = db.query(AppSetting).filter(AppSetting.key == key).first()
    if row:
        return row.value
    if default is None:
        return _get_env_default(key)
    return default
def get_setting_value(key: str, default: Any = None) -> Any:
    """Read one setting through a short-lived session of its own.

    The session is opened here and always closed before returning.
    """
    from database import SessionLocal

    session = SessionLocal()
    try:
        value = get_setting(session, key, default)
    finally:
        session.close()
    return value
def set_setting(db: Session, key: str, value: str) -> bool:
    """Create or update one editable setting and commit.

    Returns ``False`` without touching the database when ``key`` is not
    listed in ``EDITABLE_SETTINGS``; ``True`` after a successful write.
    """
    if key not in EDITABLE_SETTINGS:
        return False
    text = str(value)
    row = db.query(AppSetting).filter(AppSetting.key == key).first()
    if not row:
        meta = EDITABLE_SETTINGS[key]
        db.add(
            AppSetting(
                key=key,
                value=text,
                description=meta["description"],
                is_sensitive=meta["sensitive"],
            )
        )
    else:
        row.value = text
        row.updated_at = datetime.now(timezone.utc)
    db.commit()
    logger.info("配置已更新: %s", key)
    return True
def list_settings(db: Session, mask_sensitive: bool = True) -> List[Dict[str, Any]]:
    """List every editable setting with its current value.

    Args:
        db: Database session.
        mask_sensitive: When true, sensitive values are masked in ``value``
            and ``real_value`` is ``None``.

    Returns:
        One dict per key of ``EDITABLE_SETTINGS`` with ``key``, ``value``,
        ``real_value``, ``description``, ``is_sensitive``, ``is_masked`` and
        ``updated_at`` (ISO-8601 text, or ``None`` when there is no row or
        the row has no timestamp).
    """
    stored = {row.key: row for row in db.query(AppSetting).all()}
    result = []
    for key, meta in EDITABLE_SETTINGS.items():
        row = stored.get(key)
        value = row.value if row else _get_env_default(key)
        is_sensitive = meta["sensitive"]
        is_masked = bool(is_sensitive and mask_sensitive)
        # A row may exist without a timestamp; guard before formatting it.
        updated_at = row.updated_at if row else None
        result.append({
            "key": key,
            "value": _mask_sensitive(value) if is_masked else value,
            "real_value": value if not mask_sensitive else None,
            "description": meta["description"],
            "is_sensitive": is_sensitive,
            "is_masked": is_masked,
            "updated_at": updated_at.isoformat() if updated_at else None,
        })
    return result
def reset_settings(db: Session) -> None:
    """Overwrite every editable setting with its value from ``settings``."""
    for key in EDITABLE_SETTINGS:
        default_value = _get_env_default(key)
        set_setting(db, key, default_value)
    logger.info("配置已重置为环境变量默认值")
def apply_db_settings_to_config(db: Session = None) -> None:
    """Copy database-stored settings onto the global ``settings`` object.

    The values are assigned to the live object with ``setattr``. Empty
    values and keys that are not declared fields of the settings model are
    skipped. Text is converted for ``int``, ``float``, ``bool`` and ``Path``
    fields; any other annotation receives the stored text unchanged.

    Args:
        db: Session to read from. When ``None``, a temporary session is
            opened and closed by this function.

    Raises:
        ValueError: If a stored value cannot be converted or assigned; the
            remaining keys are then not applied.
    """
    close_db = False
    if db is None:
        from database import SessionLocal
        db = SessionLocal()
        close_db = True
    try:
        # Read the field map from the model class: instance access to
        # ``model_fields`` is deprecated in recent Pydantic releases.
        model_fields = type(settings).model_fields
        for key in EDITABLE_SETTINGS:
            db_value = get_setting(db, key)
            if db_value is None or db_value == "":
                continue
            field_info = model_fields.get(key)
            if field_info is None:
                continue
            target_type = field_info.annotation
            try:
                if target_type is int:
                    converted = int(db_value)
                elif target_type is float:
                    converted = float(db_value)
                elif target_type is bool:
                    converted = str(db_value).lower() in ("true", "1", "yes")
                elif target_type is Path:
                    converted = Path(db_value)
                else:
                    converted = db_value
                setattr(settings, key, converted)
                logger.debug("已应用配置: %s=%s", key, converted)
            except Exception as exc:
                logger.error("应用配置 %s=%s 失败: %s", key, db_value, exc)
                raise ValueError(f"配置项 {key} 的值无效: {db_value}") from exc
    finally:
        if close_db:
            db.close()
+154
View File
@@ -0,0 +1,154 @@
"""文章摘要生成器:对无摘要或短摘要文章调用 LLM 生成 AI 摘要"""
import logging
from datetime import datetime, timezone
from typing import List, Dict, Any
from sqlalchemy.orm import Session
from app.ai_client import ai_client
from app.rss_client import rss_client
from config import settings
from models import EnrichedArticle
logger = logging.getLogger(__name__)
# System prompt for summarisation; the `{max_length}` placeholder is filled in
# with str.format by _generate_summary.
SUMMARY_SYSTEM_PROMPT = """你是一位擅长阅读 RSS 新闻并提炼摘要的助手。
请用简洁流畅的中文总结文章核心内容,要求:
1. 长度控制在 {max_length} 个汉字以内。
2. 包含文章最重要的 1-3 个要点。
3. 不要添加个人评价,不要复述原文标题。
4. 若原文是英文,请用中文输出摘要。
"""
# User prompt template; `{title}`, `{author}`, `{feed_title}` and `{content}`
# are filled in with str.format by _generate_summary.
SUMMARY_USER_PROMPT_TEMPLATE = """请为以下文章生成摘要。
标题:{title}
作者:{author}
来源:{feed_title}
正文:
{content}
"""
def _needs_summary(article: EnrichedArticle) -> bool:
    """Tell whether an AI summary should be generated for the article.

    An article needs one only when it has no AI summary yet and its original
    summary is missing or shorter than
    ``settings.MIN_ORIGINAL_SUMMARY_LENGTH`` (measured after stripping).
    """
    # An existing AI summary is never regenerated; previously a short
    # original summary caused a new LLM call on every run.
    if article.ai_summary:
        return False
    original = (article.original_summary or "").strip()
    return len(original) < settings.MIN_ORIGINAL_SUMMARY_LENGTH
def _prepare_content(raw_content: str, max_chars: int = 8000) -> str:
"""清洗并截断正文,避免超过 LLM 上下文"""
text = raw_content or ""
# 简单去除多余空白
text = " ".join(text.split())
return text[:max_chars]
def _generate_summary(article: EnrichedArticle) -> str:
    """Ask the LLM for a summary of one article.

    The prompt body is the cleaned content (or the original summary when
    there is no content); if that is blank, the title is used instead. The
    reply is cut to ``settings.MAX_AI_SUMMARY_LENGTH`` characters. Any
    failure is logged and reported as ``""``.
    """
    body = _prepare_content(article.content or article.original_summary or "")
    if not body.strip():
        # Nothing but the title is available to summarise.
        body = article.title or ""
    request = {
        "system_prompt": SUMMARY_SYSTEM_PROMPT.format(max_length=settings.MAX_AI_SUMMARY_LENGTH),
        "user_prompt": SUMMARY_USER_PROMPT_TEMPLATE.format(
            title=article.title or "",
            author=article.author or "",
            feed_title=article.feed_title or "",
            content=body,
        ),
        "temperature": 0.3,
    }
    try:
        reply = ai_client.chat_completion(**request)
        return reply[: settings.MAX_AI_SUMMARY_LENGTH]
    except Exception as exc:
        logger.error("生成 article_id=%d 摘要失败: %s", article.rk_article_id, exc)
        return ""
def _article_from_rss(raw: Dict[str, Any]) -> Dict[str, Any]:
"""把 rssKeeper 返回的文章转换为可写入 enriched 表的字典"""
published_at = raw.get("published_at")
if isinstance(published_at, str):
try:
published_at = datetime.fromisoformat(published_at.replace("Z", "+00:00"))
except Exception:
published_at = None
return {
"rk_article_id": raw["id"],
"title": raw.get("title", "") or "",
"link": raw.get("link", "") or "",
"feed_id": raw.get("feed_id", 0),
"feed_title": raw.get("feed_title", "") or "",
"feed_category": raw.get("category", "") or "",
"author": raw.get("author", "") or "",
"published_at": published_at,
"original_summary": raw.get("summary", "") or "",
"content": raw.get("content", "") or "",
}
def fetch_and_summarize(db: Session, hours: int = 24, limit: int = 200) -> Dict[str, int]:
    """Pull recent articles from rssKeeper, upsert them and add AI summaries.

    Args:
        db: Database session.
        hours: Look-back window passed to rssKeeper.
        limit: Maximum number of articles to request.

    Returns:
        ``{"fetched": ..., "created": ..., "summarized": ...}``.
    """
    articles = rss_client.fetch_recent(hours=hours, limit=limit)
    if not articles:
        logger.info("未拉取到新文章")
        return {"fetched": 0, "created": 0, "summarized": 0}

    stats = {"fetched": len(articles), "created": 0, "summarized": 0}
    refreshable = (
        "title",
        "link",
        "feed_title",
        "feed_category",
        "author",
        "published_at",
        "original_summary",
        "content",
    )
    for raw in articles:
        data = _article_from_rss(raw)
        article = db.query(EnrichedArticle).filter(
            EnrichedArticle.rk_article_id == data["rk_article_id"]
        ).first()
        if article is None:
            article = EnrichedArticle(**data)
            db.add(article)
            db.flush()
            stats["created"] += 1
        else:
            # Refresh base fields, keeping the stored value when the new
            # payload carries an empty one.
            for field in refreshable:
                setattr(article, field, data[field] or getattr(article, field))
            # NOTE(review): this moves an already-stored article into the
            # current fetch day - confirm that is intended.
            article.fetched_at = datetime.now(timezone.utc)

        if not _needs_summary(article):
            continue
        ai_summary = _generate_summary(article)
        if not ai_summary:
            continue
        article.ai_summary = ai_summary
        stats["summarized"] += 1
        # Commit once per 10 new summaries. Checking right after the
        # increment avoids committing on every iteration while the counter
        # sits at a multiple of 10 (including 0).
        if stats["summarized"] % 10 == 0:
            db.commit()
    db.commit()
    logger.info(
        "摘要任务完成: fetched=%d, created=%d, summarized=%d",
        stats["fetched"], stats["created"], stats["summarized"]
    )
    return stats
+116
View File
@@ -0,0 +1,116 @@
"""基于规则给文章分类、打标签"""
import logging
import re
from typing import List, Dict, Any, Tuple
from sqlalchemy.orm import Session
from models import EnrichedArticle, Taxonomy
logger = logging.getLogger(__name__)
def _normalize(text: str) -> str:
"""规范化文本用于关键词匹配"""
if not text:
return ""
# 去除多余空白,统一小写
text = " ".join(text.split())
return text.lower()
def _count_matches(text: str, keywords: List[str]) -> int:
    """Count case-insensitive substring occurrences of the keywords.

    Args:
        text: Text to search.
        keywords: Keywords to look for. Entries that are not strings, or
            that are empty after normalisation, are ignored.

    Returns:
        The sum of the non-overlapping occurrence counts of every keyword.
    """
    if not text or not keywords:
        return 0
    haystack = _normalize(text)
    total = 0
    for keyword in keywords:
        needle = _normalize(keyword) if isinstance(keyword, str) else ""
        # Skip empty needles *after* normalisation: a whitespace-only keyword
        # normalises to "" and str.count("") would return len(haystack) + 1.
        if not needle:
            continue
        # Plain substring matching, which also suits Chinese keywords.
        total += haystack.count(needle)
    return total
def classify_article(article: EnrichedArticle, categories: List[Taxonomy]) -> str:
    """Pick the best-matching category name for an article.

    Each category scores its keyword hit count, plus 2 when its name equals
    the article's feed category. The first category with the highest
    positive score wins; without any positive score the feed category is
    used, and failing that ``"未分类"``.
    """
    text = " ".join((
        article.title or "",
        article.ai_summary or article.original_summary or "",
        article.content or "",
    ))
    feed_category = article.feed_category
    winner, top_score = "", 0
    for cat in categories:
        score = _count_matches(text, cat.keywords or [])
        # Small bonus when the source feed already carries this category.
        if feed_category and feed_category == cat.name:
            score += 2
        if score > top_score:
            winner, top_score = cat.name, score
    return winner or feed_category or "未分类"
def tag_article(article: EnrichedArticle, tags: List[Taxonomy]) -> List[str]:
    """Return the names of all tags whose keywords occur in the article.

    Names are returned in rule order with duplicates removed.
    """
    text = " ".join((
        article.title or "",
        article.ai_summary or article.original_summary or "",
        article.content or "",
    ))
    hit_names = (
        tag.name for tag in tags if _count_matches(text, tag.keywords or []) > 0
    )
    # dict.fromkeys removes duplicates while preserving first-seen order.
    return list(dict.fromkeys(hit_names))
def tag_articles(db: Session, article_ids: List[int] = None) -> int:
    """Assign a category and tags to articles.

    With a truthy ``article_ids`` only those articles are processed;
    otherwise the articles whose category is empty or NULL are. Nothing is
    done (and 0 is returned) when no category rules exist.

    Returns:
        The number of articles processed.
    """
    categories = db.query(Taxonomy).filter(Taxonomy.kind == "category").all()
    tags = db.query(Taxonomy).filter(Taxonomy.kind == "tag").all()
    if not categories:
        logger.warning("taxonomy 中无 category 数据,跳过分类")
        return 0

    if article_ids:
        selection = EnrichedArticle.id.in_(article_ids)
    else:
        selection = (EnrichedArticle.category == "") | (EnrichedArticle.category == None)
    pending = db.query(EnrichedArticle).filter(selection).all()

    count = 0
    for count, article in enumerate(pending, start=1):
        article.category = classify_article(article, categories)
        article.tags = tag_article(article, tags)
        # Commit in batches of 50.
        if count % 50 == 0:
            db.commit()
    db.commit()
    logger.info("分类/打标签完成: %d 篇文章", count)
    return count
+140
View File
@@ -0,0 +1,140 @@
"""分类/标签/打分规则体系的初始化与维护"""
import json
import logging
from typing import List, Dict, Any
from sqlalchemy.orm import Session
from app.ai_client import ai_client
from app.rss_client import rss_client
from models import Taxonomy
logger = logging.getLogger(__name__)
# System prompt asking the LLM for a JSON taxonomy. The top-level keys it
# describes (categories, tags, heat_rules, importance_rules,
# duplication_indicators) are the ones _save_taxonomy reads.
TAXONOMY_SYSTEM_PROMPT = """你是一位专业的信息分类与内容分析专家。
请根据用户提供的 RSS 文章样本,生成一套适合的中文内容分类体系、标签体系和打分规则。
输出必须是合法的 JSON,格式如下:
{
"categories": [
{"name": "科技", "description": "人工智能、芯片、互联网、软件等", "keywords": ["AI", "芯片", "大模型", ...]}
],
"tags": [
{"name": "人工智能", "description": "...", "keywords": ["AI", "人工智能", "大模型", ...]}
],
"heat_rules": [
{"name": "热点事件", "keywords": ["突发", "重磅", "刚刚", "发布"], "weight": 1.5}
],
"importance_rules": [
{"name": "政策法规", "keywords": ["政策", "监管", "法规", "征求意见"], "weight": 1.5}
],
"duplication_indicators": [
{"name": "同一事件", "keywords": ["宣布", "发布", "推出"], "weight": 1.0}
]
}
要求:
1. categories 数量控制在 8-12 个,覆盖科技、财经、新闻、设计、生活等常见 RSS 主题。
2. tags 数量控制在 30-50 个,尽量细化但避免过度重叠。
3. heat_rules 和 importance_rules 各 10-20 条,weight 范围 0.5-2.0。
4. 所有 keywords 用中文或中英双语,便于后续关键词匹配。
5. 不要输出任何解释文字,只输出 JSON。
"""
def _build_sample_prompt(articles: List[Dict[str, Any]]) -> str:
lines = [f"共有 {len(articles)} 篇文章样本:"]
for idx, art in enumerate(articles[:50], 1):
title = art.get("title", "")
summary = art.get("summary", "") or art.get("content", "")[:300]
feed = art.get("feed_title", "")
cat = art.get("category", "")
lines.append(f"\n[{idx}] 标题:{title}")
lines.append(f" 来源:{feed} | 源分类:{cat}")
lines.append(f" 摘要:{summary[:400]}")
return "\n".join(lines)
def bootstrap_taxonomy(db: Session, force: bool = False) -> bool:
    """Generate the category/tag/scoring rules with the LLM and store them.

    Args:
        db: Database session.
        force: Rebuild even when rules already exist. The old rules are
            replaced only after a new taxonomy has been generated, so a
            failed fetch or LLM call leaves the existing rules untouched.

    Returns:
        ``True`` when a new taxonomy was written; ``False`` when the step
        was skipped, no sample articles were available, or generation
        failed.
    """
    existing = db.query(Taxonomy).first()
    if existing and not force:
        logger.info("taxonomy 表已存在,跳过初始化")
        return False

    logger.info("开始从 rssKeeper 拉取样本文章并生成分类体系...")
    articles = rss_client.fetch_recent(hours=24 * 7, limit=200)
    if not articles:
        logger.warning("未获取到样本文章,无法生成分类体系")
        return False

    user_prompt = _build_sample_prompt(articles)
    try:
        result = ai_client.chat_completion_json(
            system_prompt=TAXONOMY_SYSTEM_PROMPT,
            user_prompt=user_prompt,
            temperature=0.5,
        )
    except Exception as exc:
        logger.error("生成分类体系失败: %s", exc)
        return False
    if not isinstance(result, dict):
        # Valid JSON that is not an object cannot be saved as a taxonomy.
        logger.error("生成分类体系失败: %s", "LLM 返回的 JSON 不是对象")
        return False

    try:
        if force:
            # The delete is left uncommitted on purpose: _save_taxonomy
            # commits it together with the new rows.
            db.query(Taxonomy).delete()
            logger.info("强制重新初始化 taxonomy")
        _save_taxonomy(db, result)
    except Exception:
        # Undo the pending delete so the previous rules survive a failure.
        db.rollback()
        raise
    logger.info("taxonomy 初始化完成,共写入 %d 条规则", db.query(Taxonomy).count())
    return True
def _save_taxonomy(db: Session, data: Dict[str, Any]) -> None:
    """Write the taxonomy returned by the LLM into the database and commit.

    The payload is model output, so it is validated defensively: sections
    that are not lists, entries that are not dicts and entries without a
    usable name are skipped, and an unusable weight falls back to the
    default instead of raising.
    """

    def _add(kind: str, items: Any, default_weight: float = 1.0) -> None:
        if not isinstance(items, list):
            return
        for item in items:
            if not isinstance(item, dict):
                continue
            name = str(item.get("name") or "").strip()
            if not name:
                continue
            keywords = item.get("keywords") or []
            if isinstance(keywords, str):
                # A single keyword given as a bare string.
                keywords = [keywords]
            elif not isinstance(keywords, list):
                keywords = []
            try:
                weight = float(item.get("weight", default_weight))
            except (TypeError, ValueError):
                weight = default_weight
            db.add(
                Taxonomy(
                    name=name,
                    kind=kind,
                    description=item.get("description") or "",
                    keywords=keywords,
                    weight=weight,
                    created_by_ai=True,
                )
            )

    _add("category", data.get("categories"))
    _add("tag", data.get("tags"))
    _add("heat_rule", data.get("heat_rules"), default_weight=1.0)
    _add("importance_rule", data.get("importance_rules"), default_weight=1.0)
    _add("duplication_rule", data.get("duplication_indicators"), default_weight=1.0)
    db.commit()
def ensure_taxonomy(db: Session) -> bool:
    """Make sure taxonomy rules exist, bootstrapping them when the table is empty.

    Returns ``True`` if a rule already exists, otherwise the result of
    ``bootstrap_taxonomy``.
    """
    has_rules = db.query(Taxonomy).first()
    return True if has_rules else bootstrap_taxonomy(db)
def list_taxonomy(db: Session, kind: str = None) -> List[Taxonomy]:
    """Return taxonomy rules ordered by kind, then name.

    A truthy ``kind`` restricts the result to that kind.
    """
    rules = db.query(Taxonomy)
    if kind:
        rules = rules.filter(Taxonomy.kind == kind)
    ordered = rules.order_by(Taxonomy.kind, Taxonomy.name)
    return ordered.all()