feat: 修复代码审核报告问题
This commit is contained in:
@@ -0,0 +1,92 @@
|
||||
"""LLM API 客户端,兼容 OpenAI API 格式"""
|
||||
import json
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from openai import OpenAI, APIError
|
||||
|
||||
from config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AIClient:
    """Client for an OpenAI-compatible chat-completions endpoint.

    Connection parameters that are not passed explicitly fall back to the
    matching ``settings.OPENAI_*`` value.  The underlying ``OpenAI`` SDK
    object is built lazily the first time :attr:`client` is accessed.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        base_url: Optional[str] = None,
        model: Optional[str] = None,
        timeout: Optional[int] = None,
        max_retries: Optional[int] = None,
    ):
        """Store connection parameters; no SDK client is created here.

        Args:
            api_key: API key; falsy values select ``settings.OPENAI_API_KEY``.
            base_url: Endpoint base URL; falsy values select
                ``settings.OPENAI_BASE_URL``.
            model: Model name; falsy values select ``settings.OPENAI_MODEL``.
            timeout: Timeout handed to the SDK client; falsy values select
                ``settings.OPENAI_TIMEOUT``.
            max_retries: Retry count handed to the SDK client.  An explicit
                ``0`` is honoured; only ``None`` selects
                ``settings.OPENAI_MAX_RETRIES``.
        """
        self.api_key = api_key or settings.OPENAI_API_KEY
        self.base_url = base_url or settings.OPENAI_BASE_URL
        self.model = model or settings.OPENAI_MODEL
        self.timeout = timeout or settings.OPENAI_TIMEOUT
        # ``0`` is a meaningful retry count, so it must not be replaced by the
        # configured default the way a plain ``or`` fallback would do.
        self.max_retries = (
            settings.OPENAI_MAX_RETRIES if max_retries is None else max_retries
        )

        self._client: Optional["OpenAI"] = None

    @property
    def client(self) -> "OpenAI":
        """Return the SDK client, creating and caching it on first access."""
        if self._client is None:
            self._client = OpenAI(
                api_key=self.api_key,
                base_url=self.base_url,
                timeout=self.timeout,
                max_retries=self.max_retries,
            )
        return self._client

    def chat_completion(
        self,
        system_prompt: str,
        user_prompt: str,
        temperature: float = 0.3,
        json_mode: bool = False,
    ) -> str:
        """Send one system + user message pair and return the reply text.

        Args:
            system_prompt: Content of the ``system`` message.
            user_prompt: Content of the ``user`` message.
            temperature: Sampling temperature forwarded to the API.
            json_mode: When true, request ``{"type": "json_object"}`` as the
                response format.

        Returns:
            The first choice's message content with surrounding whitespace
            stripped, or ``""`` when the content is empty or ``None``.

        Raises:
            APIError: Logged and re-raised unchanged.
        """
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]

        kwargs = {
            "model": self.model,
            "messages": messages,
            "temperature": temperature,
        }
        if json_mode:
            kwargs["response_format"] = {"type": "json_object"}

        try:
            resp = self.client.chat.completions.create(**kwargs)
            content = resp.choices[0].message.content or ""
            return content.strip()
        except APIError as exc:
            logger.error("LLM API 调用失败: %s", exc)
            raise

    def chat_completion_json(
        self,
        system_prompt: str,
        user_prompt: str,
        temperature: float = 0.3,
    ) -> dict:
        """Call :meth:`chat_completion` in JSON mode and parse the reply.

        Returns:
            The value produced by ``json.loads`` on the reply text.

        Raises:
            json.JSONDecodeError: The reply is not valid JSON.  The first 500
                characters of the reply are logged before re-raising.
            APIError: Propagated from :meth:`chat_completion`.
        """
        content = self.chat_completion(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            temperature=temperature,
            json_mode=True,
        )
        try:
            return json.loads(content)
        except json.JSONDecodeError as exc:
            logger.error("LLM 返回不是合法 JSON: %s - content=%s", exc, content[:500])
            raise
|
||||
|
||||
|
||||
# Shared module-level client, built at import time with every parameter taken
# from the settings fallbacks (no arguments are passed).
ai_client = AIClient()
|
||||
+168
@@ -0,0 +1,168 @@
|
||||
"""每日简报生成"""
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, List
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from config import settings
|
||||
from models import EnrichedArticle, DailyBrief
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _format_article(article: EnrichedArticle) -> Dict[str, Any]:
    """Turn an article row into the plain dict stored as a brief entry.

    Text fields default to ``""`` and ``tags`` to ``[]`` when falsy; the
    summary prefers the AI summary over the original one.  Score fields are
    copied unchanged and ``published_at`` is rendered as ISO-8601 or ``None``.
    """
    entry: Dict[str, Any] = {
        "id": article.id,
        "rk_article_id": article.rk_article_id,
    }
    for text_field in ("title", "link", "author", "feed_title"):
        entry[text_field] = getattr(article, text_field) or ""

    entry["summary"] = article.ai_summary or article.original_summary or ""
    entry["tags"] = article.tags or []

    for score_field in (
        "heat_score",
        "importance_score",
        "duplication_score",
        "composite_score",
    ):
        entry[score_field] = getattr(article, score_field)

    published = article.published_at
    entry["published_at"] = published.isoformat() if published else None
    return entry
|
||||
|
||||
|
||||
def _build_markdown(date_str: str, by_category: Dict[str, List[Dict[str, Any]]], stats: Dict[str, int]) -> str:
|
||||
"""生成 Markdown 简报"""
|
||||
lines = [
|
||||
f"# RSS 每日简报 ({date_str})",
|
||||
"",
|
||||
f"- 去重前文章数: {stats['total_articles']}",
|
||||
f"- 去重后文章数: {stats['unique_articles']}",
|
||||
f"- 生成分类数: {len(by_category)}",
|
||||
"",
|
||||
"---",
|
||||
"",
|
||||
]
|
||||
|
||||
for category, items in sorted(by_category.items(), key=lambda x: x[0]):
|
||||
lines.append(f"## {category}")
|
||||
lines.append("")
|
||||
for item in items:
|
||||
tags = " ".join([f"`{t}`" for t in item["tags"]]) if item["tags"] else ""
|
||||
lines.append(f"### {item['title']}")
|
||||
lines.append(f"- 来源: {item['feed_title']} | 作者: {item.get('author') or '未知'}")
|
||||
lines.append(f"- 标签: {tags}")
|
||||
lines.append(f"- 热度: {item['heat_score']:.1f} | 重要性: {item['importance_score']:.1f} | 重复度: {item['duplication_score']:.1f} | 综合: {item['composite_score']:.1f}")
|
||||
if item["summary"]:
|
||||
lines.append(f"- 摘要: {item['summary']}")
|
||||
if item["link"]:
|
||||
lines.append(f"- [阅读原文]({item['link']})")
|
||||
lines.append("")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def generate_daily_brief(db: Session, date_str: str = None, force: bool = False) -> Dict[str, Any]:
    """Generate (or return the existing) daily brief for one date.

    Args:
        db: SQLAlchemy session used for all reads and writes.
        date_str: Date as ``YYYY-MM-DD``; defaults to today's UTC date.
        force: Regenerate even when a brief for that date already exists.

    Returns:
        A dict with the keys ``date``, ``total_articles``,
        ``unique_articles``, ``by_category`` and ``markdown_path``.  The same
        keys are returned whether the brief was just generated or already
        existed.

    Raises:
        ValueError: ``date_str`` does not match ``%Y-%m-%d``.
    """
    if date_str is None:
        date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    # Reuse an existing brief unless the caller forces regeneration.
    existing = db.query(DailyBrief).filter(DailyBrief.brief_date == date_str).first()
    if existing and not force:
        logger.info("日期 %s 简报已存在,跳过生成", date_str)
        return {
            "date": date_str,
            "total_articles": existing.total_articles,
            "unique_articles": existing.unique_articles,
            # Included so both return paths expose the same keys.
            "by_category": existing.by_category,
            "markdown_path": existing.markdown_path,
        }

    day_start = datetime.strptime(date_str, "%Y-%m-%d")
    day_end = day_start + timedelta(days=1)

    # Every article fetched within the day.
    query = db.query(EnrichedArticle).filter(
        EnrichedArticle.fetched_at >= day_start,
        EnrichedArticle.fetched_at < day_end,
    )

    # Keep group representatives and articles that belong to no duplicate
    # group, best composite score first.
    representative_articles = (
        query.filter(
            (EnrichedArticle.is_representative == True)  # noqa: E712 - SQL expression
            | (EnrichedArticle.duplicate_group_id == None)  # noqa: E711 - SQL expression
        )
        .order_by(EnrichedArticle.composite_score.desc())
        .all()
    )

    # Group by category; the score ordering carries over into each list.
    grouped: Dict[str, List[Dict[str, Any]]] = {}
    for art in representative_articles:
        grouped.setdefault(art.category or "未分类", []).append(_format_article(art))

    # Keep only the top N entries per category.
    top_n = settings.BRIEF_TOP_N_PER_CATEGORY
    by_category = {cat: items[:top_n] for cat, items in grouped.items()}

    # NOTE(review): "unique_articles" counts entries left after the top-N cut,
    # not every deduplicated article of the day - confirm this is intended.
    stats = {
        "total_articles": query.count(),
        "unique_articles": sum(len(items) for items in by_category.values()),
    }

    # Write the Markdown file.
    output_dir = settings.brief_output_dir_path / date_str
    output_dir.mkdir(parents=True, exist_ok=True)
    markdown_path = output_dir / "daily-brief.md"
    markdown_path.write_text(_build_markdown(date_str, by_category, stats), encoding="utf-8")

    # Stamp every selected article with the brief date.
    for art in representative_articles:
        art.brief_date = date_str

    brief_data = {
        "date": date_str,
        "total_articles": stats["total_articles"],
        "unique_articles": stats["unique_articles"],
        "by_category": by_category,
        "markdown_path": str(markdown_path),
    }

    # Update the existing row or insert a new one.
    if existing:
        existing.total_articles = stats["total_articles"]
        existing.unique_articles = stats["unique_articles"]
        existing.by_category = by_category
        existing.markdown_path = str(markdown_path)
        existing.updated_at = datetime.now(timezone.utc)
    else:
        db.add(
            DailyBrief(
                brief_date=date_str,
                total_articles=stats["total_articles"],
                unique_articles=stats["unique_articles"],
                by_category=by_category,
                markdown_path=str(markdown_path),
            )
        )

    db.commit()
    logger.info("简报生成完成: 日期=%s, 去重前=%d, 去重后=%d", date_str, stats["total_articles"], stats["unique_articles"])
    return brief_data
|
||||
@@ -0,0 +1,223 @@
|
||||
"""文章去重:URL 精确去重 + 标题/内容相似度去重"""
|
||||
import logging
|
||||
import re
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from difflib import SequenceMatcher
|
||||
from typing import List, Dict, Tuple, Set
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
import numpy as np
|
||||
|
||||
from config import settings
|
||||
from models import EnrichedArticle, DuplicateGroup
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _normalize_title(title: str) -> str:
|
||||
"""标题规范化:去除标点和多余空格,小写,保留中英文数字"""
|
||||
if not title:
|
||||
return ""
|
||||
# 保留:单词字符、CJK 统一表意符号(含扩展 A/B/C/D/E)
|
||||
title = re.sub(
|
||||
r"[^\w一-鿿㐀-䶿\U00020000-\U0002a6df\U0002a700-\U0002b73f\U0002b740-\U0002b81f]",
|
||||
" ",
|
||||
title,
|
||||
)
|
||||
title = " ".join(title.split())
|
||||
return title.lower()
|
||||
|
||||
|
||||
def _title_similarity(a: str, b: str) -> float:
    """Return the ``SequenceMatcher`` ratio of the two normalized titles.

    The result is ``0.0`` when either title normalizes to an empty string.
    """
    left, right = _normalize_title(a), _normalize_title(b)
    if left and right:
        return SequenceMatcher(None, left, right).ratio()
    return 0.0
|
||||
|
||||
|
||||
def _content_similarity_matrix(contents: List[str]) -> np.ndarray:
|
||||
"""使用 TF-IDF + 余弦相似度计算内容相似度矩阵"""
|
||||
if len(contents) < 2:
|
||||
return np.zeros((len(contents), len(contents)))
|
||||
|
||||
# 过滤空内容
|
||||
valid_contents = [c or "" for c in contents]
|
||||
try:
|
||||
vectorizer = TfidfVectorizer(
|
||||
max_features=5000,
|
||||
stop_words="english",
|
||||
ngram_range=(1, 2),
|
||||
min_df=1,
|
||||
)
|
||||
tfidf = vectorizer.fit_transform(valid_contents)
|
||||
return cosine_similarity(tfidf)
|
||||
except Exception as exc:
|
||||
logger.warning("TF-IDF 相似度计算失败: %s", exc)
|
||||
return np.zeros((len(contents), len(contents)))
|
||||
|
||||
|
||||
def _find_duplicate_clusters(
    articles: List[EnrichedArticle],
    title_threshold: float = None,
    content_threshold: float = None,
) -> List[Set[int]]:
    """Group articles into duplicate clusters by title or content similarity.

    Two articles are linked when their title similarity reaches
    ``title_threshold`` or their content similarity reaches
    ``content_threshold``.  Clusters are the connected components of that
    relation, found with a breadth-first search.

    Args:
        articles: Articles to compare.
        title_threshold: Minimum title similarity; ``None`` selects
            ``settings.TITLE_SIMILARITY_THRESHOLD``.
        content_threshold: Minimum content similarity; ``None`` selects
            ``settings.CONTENT_SIMILARITY_THRESHOLD``.

    Returns:
        Index sets into ``articles``; only clusters with more than one member
        are returned.
    """
    from collections import deque  # local import: used only by this function

    # Only a missing threshold falls back to the configured default, so an
    # explicit 0.0 is not silently replaced.
    if title_threshold is None:
        title_threshold = settings.TITLE_SIMILARITY_THRESHOLD
    if content_threshold is None:
        content_threshold = settings.CONTENT_SIMILARITY_THRESHOLD

    n = len(articles)
    if n < 2:
        return []

    titles = [art.title or "" for art in articles]
    # Title + summary + body, capped at 2000 characters to bound TF-IDF cost.
    contents = [
        " ".join([
            art.title or "",
            art.ai_summary or art.original_summary or "",
            art.content or "",
        ])[:2000]
        for art in articles
    ]
    content_sim = _content_similarity_matrix(contents)

    visited = [False] * n
    clusters: List[Set[int]] = []

    for seed in range(n):
        if visited[seed]:
            continue
        visited[seed] = True
        cluster = {seed}
        queue = deque([seed])

        while queue:
            cur = queue.popleft()
            for j in range(n):
                # ``cur`` itself is always visited, so this also skips j == cur.
                if visited[j]:
                    continue
                # Check the precomputed matrix first; the title comparison is
                # the expensive part and is skipped when content already matches.
                if (
                    content_sim[cur][j] >= content_threshold
                    or _title_similarity(titles[cur], titles[j]) >= title_threshold
                ):
                    visited[j] = True
                    cluster.add(j)
                    queue.append(j)

        if len(cluster) > 1:
            clusters.append(cluster)

    return clusters
|
||||
|
||||
|
||||
def _pick_representative(articles: List[EnrichedArticle], indices: Set[int]) -> EnrichedArticle:
|
||||
"""从重复组中选择代表文章:优先选有 AI 摘要、来源 Feed 分类明确、发布时间最早的"""
|
||||
candidates = [articles[i] for i in indices]
|
||||
# 排序:有 AI 摘要优先,然后有 Feed 分类,然后发布时间早
|
||||
candidates.sort(
|
||||
key=lambda a: (
|
||||
bool(a.ai_summary),
|
||||
bool(a.feed_category),
|
||||
a.published_at or datetime.min,
|
||||
),
|
||||
reverse=True,
|
||||
)
|
||||
return candidates[0]
|
||||
|
||||
|
||||
def deduplicate_articles(
    db: Session,
    date_str: str = None,
    title_threshold: float = None,
    content_threshold: float = None,
) -> Dict[str, int]:
    """Rebuild the duplicate groups for the articles fetched on one date.

    Existing groups for the date are removed first.  Articles sharing a link
    are collapsed onto the first one seen, the remaining articles are
    clustered by similarity, and one ``DuplicateGroup`` is stored per cluster.
    Link duplicates are attached to the group of the article they collapse
    onto, so every non-representative article carries a
    ``duplicate_group_id``.

    Args:
        db: SQLAlchemy session used for all reads and writes.
        date_str: Date as ``YYYY-MM-DD``; defaults to today's UTC date.
        title_threshold: Forwarded to ``_find_duplicate_clusters``.
        content_threshold: Forwarded to ``_find_duplicate_clusters``.

    Returns:
        ``{"total": ..., "duplicate_groups": ..., "representatives": ...}``.

    Raises:
        ValueError: ``date_str`` does not match ``%Y-%m-%d``.
    """
    if date_str is None:
        date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    day_start = datetime.strptime(date_str, "%Y-%m-%d")
    day_end = day_start + timedelta(days=1)

    # Remove only this date's groups so other dates stay untouched, and clear
    # the flags on their members.
    old_groups = db.query(DuplicateGroup).filter(DuplicateGroup.brief_date == date_str).all()
    for og in old_groups:
        for art in og.articles:
            art.duplicate_group_id = None
            art.is_representative = False
        db.delete(og)
    db.commit()

    articles = (
        db.query(EnrichedArticle)
        .filter(
            EnrichedArticle.fetched_at >= day_start,
            EnrichedArticle.fetched_at < day_end,
        )
        .order_by(EnrichedArticle.published_at)
        .all()
    )

    if not articles:
        logger.info("日期 %s 无文章可去重", date_str)
        return {"total": 0, "duplicate_groups": 0, "representatives": 0}

    # Exact-URL pass: keep the first article per link and remember the later
    # ones under the index of the article they duplicate.
    unique_articles: List[EnrichedArticle] = []
    first_index_by_link: Dict[str, int] = {}
    url_duplicates: Dict[int, List[EnrichedArticle]] = {}
    url_dup_count = 0
    for art in articles:
        link = (art.link or "").strip()
        if link and link in first_index_by_link:
            url_dup_count += 1
            url_duplicates.setdefault(first_index_by_link[link], []).append(art)
            continue
        if link:
            first_index_by_link[link] = len(unique_articles)
        unique_articles.append(art)

    clusters = list(
        _find_duplicate_clusters(
            unique_articles,
            title_threshold=title_threshold,
            content_threshold=content_threshold,
        )
    )

    # A kept article with link duplicates but no similarity cluster still
    # needs a group; otherwise its duplicates stay unflagged and pass any
    # "duplicate_group_id is NULL" filter as if they were unique articles.
    clustered: Set[int] = set()
    for cluster in clusters:
        clustered |= cluster
    clusters.extend({idx} for idx in url_duplicates if idx not in clustered)

    stats = {"total": len(articles), "duplicate_groups": len(clusters), "representatives": 0}

    for cluster in clusters:
        representative = _pick_representative(unique_articles, cluster)
        members: List[EnrichedArticle] = []
        for idx in cluster:
            members.append(unique_articles[idx])
            members.extend(url_duplicates.get(idx, ()))

        group = DuplicateGroup(
            representative_article_id=representative.id,
            member_article_ids=[member.id for member in members],
            similarity_matrix={},  # left empty; may be filled in later
            brief_date=date_str,
        )
        db.add(group)
        db.flush()  # assigns group.id before it is copied onto the members

        for member in members:
            member.duplicate_group_id = group.id
            member.is_representative = (member.id == representative.id)

        stats["representatives"] += 1

    db.commit()
    logger.info(
        "去重完成: 日期=%s, 总文章=%d, 重复组=%d, URL 重复=%d",
        date_str, stats["total"], stats["duplicate_groups"], url_dup_count
    )
    return stats
|
||||
@@ -0,0 +1,104 @@
|
||||
"""调用 rssKeeper 外部 API"""
|
||||
from datetime import datetime, timedelta
|
||||
from typing import List, Optional, Dict, Any
|
||||
import logging
|
||||
|
||||
import requests
|
||||
|
||||
from config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RSSKeeperClient:
    """HTTP client for the rssKeeper external API (``/api/v1/external/*``)."""

    def __init__(self, base_url: Optional[str] = None, timeout: int = 30):
        """Remember the base URL (trailing slashes removed) and the timeout.

        A falsy ``base_url`` selects ``settings.RSSKEEPER_BASE_URL``.
        """
        root = base_url or settings.RSSKEEPER_BASE_URL
        self.base_url = root.rstrip("/")
        self.timeout = timeout

    @staticmethod
    def _without_none(optional: Dict[str, Any]) -> Dict[str, Any]:
        """Return ``optional`` minus the entries whose value is ``None``."""
        return {name: value for name, value in optional.items() if value is not None}

    def _get(self, path: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """GET ``base_url + path`` and return the decoded JSON body.

        Any ``requests.RequestException`` is logged and re-raised.
        """
        url = f"{self.base_url}{path}"
        try:
            response = requests.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()
            return response.json()
        except requests.RequestException as exc:
            logger.error("请求 rssKeeper 失败: %s - %s", url, exc)
            raise

    def fetch_recent(
        self,
        hours: int = 24,
        limit: int = 200,
        feed_id: Optional[int] = None,
        category: Optional[str] = None,
        search: Optional[str] = None,
        unread_only: bool = False,
    ) -> List[Dict[str, Any]]:
        """Return the ``articles`` list for the last ``hours`` hours.

        Optional filters are sent only when they are not ``None``.
        """
        params = {
            "hours": hours,
            "limit": limit,
            "unread_only": unread_only,
            **self._without_none(
                {"feed_id": feed_id, "category": category, "search": search}
            ),
        }
        payload = self._get("/api/v1/external/recent", params=params)
        return payload.get("articles", [])

    def fetch_by_date(self, date: str, category: Optional[str] = None) -> Dict[str, Any]:
        """Return the summary endpoint's response for ``date``."""
        params: Dict[str, Any] = {"date": date, **self._without_none({"category": category})}
        return self._get("/api/v1/external/summary", params=params)

    def fetch_feeds(
        self,
        health_status: Optional[str] = None,
        category: Optional[str] = None,
        error_type: Optional[str] = None,
        is_active: Optional[bool] = True,
    ) -> List[Dict[str, Any]]:
        """Return the ``feeds`` list; every filter is sent only when not ``None``."""
        params: Dict[str, Any] = self._without_none(
            {
                "health_status": health_status,
                "category": category,
                "error_type": error_type,
                "is_active": is_active,
            }
        )
        payload = self._get("/api/v1/external/feeds", params=params)
        return payload.get("feeds", [])

    def fulltext_search(
        self,
        q: str,
        limit: int = 50,
        offset: int = 0,
        category: Optional[str] = None,
        feed_id: Optional[int] = None,
    ) -> Dict[str, Any]:
        """Return the search endpoint's response for the query ``q``."""
        params: Dict[str, Any] = {
            "q": q,
            "limit": limit,
            "offset": offset,
            **self._without_none({"category": category, "feed_id": feed_id}),
        }
        return self._get("/api/v1/external/search", params=params)
|
||||
|
||||
|
||||
# Shared module-level client, built at import time with the base URL from
# settings and the default 30-second timeout.
rss_client = RSSKeeperClient()
|
||||
+147
@@ -0,0 +1,147 @@
|
||||
"""基于规则计算文章热度、重要性、重复性分数"""
|
||||
import logging
|
||||
import math
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import List
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from config import settings
|
||||
from models import EnrichedArticle, Taxonomy
|
||||
from app.tagger import _count_matches, _normalize
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Composite score weights: heat 30%, importance 50%, duplication 20%.
|
||||
COMPOSITE_WEIGHT_HEAT = 0.3
|
||||
COMPOSITE_WEIGHT_IMPORTANCE = 0.5
|
||||
COMPOSITE_WEIGHT_DUPLICATION = 0.2
|
||||
|
||||
|
||||
def _build_text(article: EnrichedArticle) -> str:
    """Return title, summary and body joined by single spaces.

    The summary is the AI summary when present, otherwise the original one;
    missing parts contribute an empty string.
    """
    title = article.title or ""
    summary = article.ai_summary or article.original_summary or ""
    body = article.content or ""
    return f"{title} {summary} {body}"
|
||||
|
||||
|
||||
def _score_by_rules(article: EnrichedArticle, rules: List[Taxonomy]) -> float:
    """Score an article by keyword hits against ``rules``, capped at 100.

    Each rule with at least one hit adds ``min(hits, 5) * rule.weight * 10``.
    Blank text or an empty rule list scores ``0.0``.
    """
    text = _build_text(article)
    if not (text.strip() and rules):
        return 0.0

    total = 0.0
    for rule in rules:
        hits = _count_matches(text, rule.keywords or [])
        if hits > 0:
            # At most five hits per rule are counted.
            total += min(hits, 5) * rule.weight * 10

    return min(total, 100.0)
|
||||
|
||||
|
||||
def _freshness_score(article: EnrichedArticle) -> float:
    """Return a freshness bonus between 0 and 20 based on ``published_at``.

    Up to 24 hours old scores 20, 72 hours or older scores 0, and the value
    falls linearly in between.  No publication time scores 0; future
    timestamps count as age zero.
    """
    now = datetime.now(timezone.utc)
    published = article.published_at
    if not published:
        return 0.0

    # A naive timestamp is interpreted as UTC.
    if published.tzinfo is None:
        published = published.replace(tzinfo=timezone.utc)

    age_hours = max((now - published).total_seconds() / 3600, 0)

    if age_hours <= 24:
        return 20.0
    if age_hours >= 72:
        return 0.0
    return 20.0 * (1 - (age_hours - 24) / 48)
|
||||
|
||||
|
||||
def compute_heat_score(article: EnrichedArticle, heat_rules: List[Taxonomy]) -> float:
    """Return the heat score: keyword score plus freshness bonus, capped at 100."""
    keyword_part = _score_by_rules(article, heat_rules)
    freshness_part = _freshness_score(article)
    total = keyword_part + freshness_part
    return min(total, 100.0)
|
||||
|
||||
|
||||
def compute_importance_score(article: EnrichedArticle, importance_rules: List[Taxonomy]) -> float:
    """Return the importance score, which is the keyword score for the given rules."""
    importance = _score_by_rules(article, importance_rules)
    return importance
|
||||
|
||||
|
||||
def compute_duplication_score(duplicate_count: int, max_count: int = 5) -> float:
    """Map an occurrence count onto a 0-100 duplication score.

    A count of 1 or less scores 0, a count of ``max_count`` or more scores
    100, and the score rises linearly in between.

    Args:
        duplicate_count: Number of occurrences of the topic.
        max_count: Count at which the score saturates at 100.

    Returns:
        The score in the range ``[0.0, 100.0]``.
    """
    if duplicate_count <= 1:
        return 0.0
    # With max_count <= 1 every count above 1 already saturates; returning
    # here also avoids dividing by zero when max_count == 1.
    if max_count <= 1:
        return 100.0
    score = (duplicate_count - 1) / (max_count - 1) * 100.0
    return min(score, 100.0)
|
||||
|
||||
|
||||
def compute_composite_score(heat: float, importance: float, duplication: float) -> float:
    """Return the weighted sum of the three scores, rounded to two decimals."""
    total = heat * COMPOSITE_WEIGHT_HEAT
    total += importance * COMPOSITE_WEIGHT_IMPORTANCE
    total += duplication * COMPOSITE_WEIGHT_DUPLICATION
    return round(total, 2)
|
||||
|
||||
|
||||
def score_articles(
    db: Session,
    article_ids: List[int] = None,
    update_duplication: bool = False,
) -> int:
    """Compute heat, importance and composite scores for articles.

    Args:
        db: SQLAlchemy session used for all reads and writes.
        article_ids: Restrict scoring to these ids.  ``None`` scores every
            article; an empty list scores nothing.
        update_duplication: Also recompute ``duplication_score`` from the
            article's duplicate group.

    Returns:
        The number of articles processed.
    """
    # An empty id list means "nothing to score"; it must not fall through to
    # the unfiltered query and rescore the whole table.
    if article_ids is not None and not article_ids:
        logger.info("打分完成: %d 篇文章", 0)
        return 0

    heat_rules = db.query(Taxonomy).filter(Taxonomy.kind == "heat_rule").all()
    importance_rules = db.query(Taxonomy).filter(Taxonomy.kind == "importance_rule").all()

    query = db.query(EnrichedArticle)
    if article_ids is not None:
        query = query.filter(EnrichedArticle.id.in_(article_ids))

    count = 0
    for article in query.all():
        article.heat_score = compute_heat_score(article, heat_rules)
        article.importance_score = compute_importance_score(article, importance_rules)

        if update_duplication:
            dup_count = 0
            if article.duplicate_group_id:
                group = article.duplicate_group
                if group and group.member_article_ids:
                    # The count is the number of non-representative members.
                    # NOTE(review): compute_duplication_score returns 0 for a
                    # count of 1, so a two-member group scores 0 - confirm
                    # this is the intended scale.
                    dup_count = max(len(group.member_article_ids) - 1, 0)
            article.duplication_score = compute_duplication_score(dup_count)

        article.composite_score = compute_composite_score(
            article.heat_score,
            article.importance_score,
            article.duplication_score,
        )
        count += 1
        # Commit in batches of 50.
        if count % 50 == 0:
            db.commit()

    db.commit()
    logger.info("打分完成: %d 篇文章", count)
    return count
|
||||
@@ -0,0 +1,188 @@
|
||||
"""运行时配置管理:支持环境变量作为默认值,数据库覆盖"""
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from config import settings
|
||||
from models import AppSetting
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Settings that can be edited in the Web UI, keyed by setting name.  Each entry
# holds a description and a "sensitive" flag.
|
||||
EDITABLE_SETTINGS = {
|
||||
"RSSKEEPER_BASE_URL": {"description": "rssKeeper 服务地址", "sensitive": False},
|
||||
"OPENAI_API_KEY": {"description": "LLM API Key", "sensitive": True},
|
||||
"OPENAI_BASE_URL": {"description": "LLM API 基础地址", "sensitive": False},
|
||||
"OPENAI_MODEL": {"description": "LLM 模型名", "sensitive": False},
|
||||
"OPENAI_TIMEOUT": {"description": "LLM 调用超时(秒)", "sensitive": False},
|
||||
"OPENAI_MAX_RETRIES": {"description": "LLM 调用最大重试次数", "sensitive": False},
|
||||
"SUMMARIZE_INTERVAL_MINUTES": {"description": "摘要任务间隔(分钟)", "sensitive": False},
|
||||
"TAG_SCORE_INTERVAL_MINUTES": {"description": "分类/打分/去重任务间隔(分钟)", "sensitive": False},
|
||||
"DAILY_BRIEF_HOUR": {"description": "每日简报生成小时", "sensitive": False},
|
||||
"DAILY_BRIEF_MINUTE": {"description": "每日简报生成分钟", "sensitive": False},
|
||||
"TITLE_SIMILARITY_THRESHOLD": {"description": "标题相似度阈值", "sensitive": False},
|
||||
"CONTENT_SIMILARITY_THRESHOLD": {"description": "内容相似度阈值", "sensitive": False},
|
||||
"MAX_AI_SUMMARY_LENGTH": {"description": "AI 摘要最大长度", "sensitive": False},
|
||||
"MIN_ORIGINAL_SUMMARY_LENGTH": {"description": "原始摘要最小长度", "sensitive": False},
|
||||
"BRIEF_TOP_N_PER_CATEGORY": {"description": "简报每分类显示文章数", "sensitive": False},
|
||||
"LOG_LEVEL": {"description": "日志级别", "sensitive": False},
|
||||
"API_TOKEN": {"description": "API 鉴权 Token(为空时不启用鉴权)", "sensitive": True},
|
||||
"CORS_ALLOWED_ORIGINS": {"description": "CORS 允许来源(逗号分隔)", "sensitive": False},
|
||||
}
|
||||
|
||||
|
||||
def _get_env_default(key: str) -> str:
    """Return ``settings.<key>`` as a string, or ``""`` when missing or ``None``."""
    raw = getattr(settings, key, "")
    if raw is None:
        return ""
    return str(raw)
|
||||
|
||||
|
||||
def _mask_sensitive(value: str) -> str:
|
||||
"""对敏感值做部分脱敏"""
|
||||
if not value:
|
||||
return ""
|
||||
if len(value) <= 8:
|
||||
return "*" * len(value)
|
||||
return value[:4] + "..." + value[-4:]
|
||||
|
||||
|
||||
def init_default_settings(db: Session) -> None:
    """Seed the settings table from the environment defaults if it is empty.

    Nothing is written when the table already holds at least one row.
    """
    if db.query(AppSetting).count() > 0:
        return

    for name, meta in EDITABLE_SETTINGS.items():
        row = AppSetting(
            key=name,
            value=_get_env_default(name),
            description=meta["description"],
            is_sensitive=meta["sensitive"],
        )
        db.add(row)

    db.commit()
    logger.info("已初始化默认配置项: %d 条", len(EDITABLE_SETTINGS))
|
||||
|
||||
|
||||
def get_setting(db: Session, key: str, default: Any = None) -> Any:
    """Return the stored value for ``key``.

    When no row exists, return ``default`` if it is not ``None``, otherwise
    the environment default for ``key``.
    """
    row = db.query(AppSetting).filter(AppSetting.key == key).first()
    if row:
        return row.value
    if default is not None:
        return default
    return _get_env_default(key)
|
||||
|
||||
|
||||
def get_setting_value(key: str, default: Any = None) -> Any:
    """Read one setting through a short-lived session that is always closed."""
    from database import SessionLocal

    session = SessionLocal()
    try:
        value = get_setting(session, key, default)
    finally:
        session.close()
    return value
|
||||
|
||||
|
||||
def set_setting(db: Session, key: str, value: str) -> bool:
    """Store ``value`` (as a string) for ``key`` and commit.

    Returns ``False`` without touching the database when ``key`` is not in
    ``EDITABLE_SETTINGS``, otherwise ``True``.
    """
    meta = EDITABLE_SETTINGS.get(key)
    if meta is None:
        return False

    row = db.query(AppSetting).filter(AppSetting.key == key).first()
    text = str(value)
    if row:
        row.value = text
        row.updated_at = datetime.now(timezone.utc)
    else:
        db.add(
            AppSetting(
                key=key,
                value=text,
                description=meta["description"],
                is_sensitive=meta["sensitive"],
            )
        )

    db.commit()
    logger.info("配置已更新: %s", key)
    return True
|
||||
|
||||
|
||||
def list_settings(db: Session, mask_sensitive: bool = True) -> List[Dict[str, Any]]:
    """List every editable setting with its current value.

    Args:
        db: SQLAlchemy session used to read the stored settings.
        mask_sensitive: Mask the values of sensitive settings and withhold
            ``real_value`` for every entry.

    Returns:
        One dict per key of ``EDITABLE_SETTINGS`` with the keys ``key``,
        ``value``, ``real_value``, ``description``, ``is_sensitive``,
        ``is_masked`` and ``updated_at`` (ISO-8601 string or ``None``).
    """
    stored = {row.key: row for row in db.query(AppSetting).all()}
    result = []

    for key, meta in EDITABLE_SETTINGS.items():
        setting = stored.get(key)
        # Keys without a stored row fall back to the environment default.
        value = setting.value if setting else _get_env_default(key)
        is_sensitive = meta["sensitive"]
        is_masked = bool(is_sensitive and mask_sensitive)

        # A stored row may have no update timestamp yet; report None for it
        # instead of failing on ``None.isoformat()``.
        updated_at = setting.updated_at if setting else None

        result.append({
            "key": key,
            "value": _mask_sensitive(value) if is_masked else value,
            "real_value": value if not mask_sensitive else None,
            "description": meta["description"],
            "is_sensitive": is_sensitive,
            "is_masked": is_masked,
            "updated_at": updated_at.isoformat() if updated_at else None,
        })

    return result
|
||||
|
||||
|
||||
def reset_settings(db: Session) -> None:
    """Write the environment default back for every editable setting."""
    for name in EDITABLE_SETTINGS:
        env_default = _get_env_default(name)
        set_setting(db, name, env_default)
    logger.info("配置已重置为环境变量默认值")
|
||||
|
||||
|
||||
def apply_db_settings_to_config(db: Session = None) -> None:
    """Copy stored settings onto the global ``settings`` object.

    Each non-empty value is converted to the type annotated on the matching
    settings field (``int``, ``float``, ``bool`` or ``Path``; anything else
    stays a string) and assigned with ``setattr``.  Keys that are not
    fields of the settings model are skipped.

    NOTE(review): the original docstring said changes take effect after a
    restart, yet this function updates the in-process object directly.
    Presumably that note refers to objects that copied a setting earlier and
    are not refreshed here - confirm.

    Args:
        db: Session to read from; a temporary one is opened and closed when
            omitted.

    Raises:
        ValueError: A stored value cannot be converted or assigned.  Values
            of sensitive settings appear masked in the message.
    """
    close_db = False
    if db is None:
        from database import SessionLocal

        db = SessionLocal()
        close_db = True
    try:
        # Read the field map from the class: instance access to
        # ``model_fields`` is deprecated in recent Pydantic releases.
        model_fields = type(settings).model_fields
        for key, meta in EDITABLE_SETTINGS.items():
            db_value = get_setting(db, key)
            if db_value is None or db_value == "":
                continue
            field_info = model_fields.get(key)
            if field_info is None:
                continue
            target_type = field_info.annotation

            def _shown(value: Any, sensitive: bool = meta["sensitive"]) -> Any:
                # Secrets must never reach the logs or error text in clear.
                return _mask_sensitive(str(value)) if sensitive else value

            try:
                if target_type is int:
                    converted = int(db_value)
                elif target_type is float:
                    converted = float(db_value)
                elif target_type is bool:
                    converted = db_value.lower() in ("true", "1", "yes")
                elif target_type is Path:
                    converted = Path(db_value)
                else:
                    converted = db_value
                setattr(settings, key, converted)
                logger.debug("已应用配置: %s=%s", key, _shown(converted))
            except Exception as exc:
                logger.error("应用配置 %s=%s 失败: %s", key, _shown(db_value), exc)
                raise ValueError(f"配置项 {key} 的值无效: {_shown(db_value)}") from exc
    finally:
        if close_db:
            db.close()
|
||||
@@ -0,0 +1,154 @@
|
||||
"""文章摘要生成器:对无摘要或短摘要文章调用 LLM 生成 AI 摘要"""
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from typing import List, Dict, Any
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.ai_client import ai_client
|
||||
from app.rss_client import rss_client
|
||||
from config import settings
|
||||
from models import EnrichedArticle
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# System prompt for summarisation; ``{max_length}`` is filled in with str.format.
SUMMARY_SYSTEM_PROMPT = (
    "你是一位擅长阅读 RSS 新闻并提炼摘要的助手。\n"
    "请用简洁流畅的中文总结文章核心内容,要求:\n"
    "1. 长度控制在 {max_length} 个汉字以内。\n"
    "2. 包含文章最重要的 1-3 个要点。\n"
    "3. 不要添加个人评价,不要复述原文标题。\n"
    "4. 若原文是英文,请用中文输出摘要。\n"
)
|
||||
|
||||
|
||||
# User prompt template; the placeholders are title, author, feed_title and content.
SUMMARY_USER_PROMPT_TEMPLATE = (
    "请为以下文章生成摘要。\n"
    "\n"
    "标题:{title}\n"
    "作者:{author}\n"
    "来源:{feed_title}\n"
    "\n"
    "正文:\n"
    "{content}\n"
)
|
||||
|
||||
|
||||
def _needs_summary(article: EnrichedArticle) -> bool:
    """Tell whether an AI summary should be generated for ``article``.

    True when there is no AI summary yet, or when the stripped original
    summary is shorter than ``settings.MIN_ORIGINAL_SUMMARY_LENGTH``.
    """
    if not article.ai_summary:
        return True
    # NOTE(review): this branch is reached only when an AI summary already
    # exists, so a short original summary keeps returning True - confirm that
    # repeated regeneration is intended.
    original_length = len((article.original_summary or "").strip())
    if original_length < settings.MIN_ORIGINAL_SUMMARY_LENGTH:
        return True
    return False
|
||||
|
||||
|
||||
def _prepare_content(raw_content: str, max_chars: int = 8000) -> str:
|
||||
"""清洗并截断正文,避免超过 LLM 上下文"""
|
||||
text = raw_content or ""
|
||||
# 简单去除多余空白
|
||||
text = " ".join(text.split())
|
||||
return text[:max_chars]
|
||||
|
||||
|
||||
def _generate_summary(article: EnrichedArticle) -> str:
    """Ask the LLM for a summary of one article.

    The prompt body is the cleaned article content, else the original
    summary, else the title.  The reply is cut to
    ``settings.MAX_AI_SUMMARY_LENGTH`` characters.  Any failure is logged and
    reported as ``""``.
    """
    source_text = _prepare_content(article.content or article.original_summary or "")
    if not source_text.strip():
        # Neither body nor original summary is available; use the title.
        source_text = article.title or ""

    system_prompt = SUMMARY_SYSTEM_PROMPT.format(max_length=settings.MAX_AI_SUMMARY_LENGTH)
    prompt_fields = {
        "title": article.title or "",
        "author": article.author or "",
        "feed_title": article.feed_title or "",
        "content": source_text,
    }
    user_prompt = SUMMARY_USER_PROMPT_TEMPLATE.format(**prompt_fields)

    try:
        reply = ai_client.chat_completion(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            temperature=0.3,
        )
        return reply[: settings.MAX_AI_SUMMARY_LENGTH]
    except Exception as exc:
        logger.error("生成 article_id=%d 摘要失败: %s", article.rk_article_id, exc)
        return ""
|
||||
|
||||
|
||||
def _article_from_rss(raw: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""把 rssKeeper 返回的文章转换为可写入 enriched 表的字典"""
|
||||
published_at = raw.get("published_at")
|
||||
if isinstance(published_at, str):
|
||||
try:
|
||||
published_at = datetime.fromisoformat(published_at.replace("Z", "+00:00"))
|
||||
except Exception:
|
||||
published_at = None
|
||||
|
||||
return {
|
||||
"rk_article_id": raw["id"],
|
||||
"title": raw.get("title", "") or "",
|
||||
"link": raw.get("link", "") or "",
|
||||
"feed_id": raw.get("feed_id", 0),
|
||||
"feed_title": raw.get("feed_title", "") or "",
|
||||
"feed_category": raw.get("category", "") or "",
|
||||
"author": raw.get("author", "") or "",
|
||||
"published_at": published_at,
|
||||
"original_summary": raw.get("summary", "") or "",
|
||||
"content": raw.get("content", "") or "",
|
||||
}
|
||||
|
||||
|
||||
def fetch_and_summarize(db: Session, hours: int = 24, limit: int = 200) -> Dict[str, int]:
    """Fetch recent articles, upsert them, and fill in AI summaries.

    Args:
        db: SQLAlchemy session used for all reads and writes.
        hours: Look-back window passed to ``rss_client.fetch_recent``.
        limit: Maximum number of articles passed to ``rss_client.fetch_recent``.

    Returns:
        Counters ``{"fetched": x, "created": y, "summarized": z}``.
    """
    articles = rss_client.fetch_recent(hours=hours, limit=limit)
    if not articles:
        logger.info("未拉取到新文章")
        return {"fetched": 0, "created": 0, "summarized": 0}

    stats = {"fetched": len(articles), "created": 0, "summarized": 0}
    commit_every = 10
    # Fields refreshed on an existing row, and only when the feed sent a value.
    refreshable_fields = (
        "title",
        "link",
        "feed_title",
        "feed_category",
        "author",
        "published_at",
        "original_summary",
        "content",
    )

    for processed, raw in enumerate(articles, start=1):
        data = _article_from_rss(raw)
        article = (
            db.query(EnrichedArticle)
            .filter(EnrichedArticle.rk_article_id == data["rk_article_id"])
            .first()
        )

        if article is None:
            article = EnrichedArticle(**data)
            db.add(article)
            db.flush()
            stats["created"] += 1
        else:
            for field in refreshable_fields:
                if data[field]:
                    setattr(article, field, data[field])
            article.fetched_at = datetime.now(timezone.utc)

        if _needs_summary(article):
            ai_summary = _generate_summary(article)
            if ai_summary:
                article.ai_summary = ai_summary
                stats["summarized"] += 1

        # Commit once per batch of processed articles to keep transactions
        # short. The batch is keyed on the loop index rather than on
        # stats["summarized"]: that counter does not advance on every
        # iteration, so a modulo test on it is true for every article while it
        # sits at 0 or at any multiple of the batch size.
        if processed % commit_every == 0:
            db.commit()

    db.commit()
    logger.info(
        "摘要任务完成: fetched=%d, created=%d, summarized=%d",
        stats["fetched"], stats["created"], stats["summarized"]
    )
    return stats
|
||||
+116
@@ -0,0 +1,116 @@
|
||||
"""基于规则给文章分类、打标签"""
|
||||
import logging
|
||||
import re
|
||||
from typing import List, Dict, Any, Tuple
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from models import EnrichedArticle, Taxonomy
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _normalize(text: str) -> str:
|
||||
"""规范化文本用于关键词匹配"""
|
||||
if not text:
|
||||
return ""
|
||||
# 去除多余空白,统一小写
|
||||
text = " ".join(text.split())
|
||||
return text.lower()
|
||||
|
||||
|
||||
def _count_matches(text: str, keywords: List[str]) -> int:
    """Count case-insensitive keyword hits in ``text``.

    Args:
        text: Text to search; normalized with ``_normalize`` before matching.
        keywords: Candidate keywords. Entries that are not strings, or that
            are empty after normalization, are ignored.

    Returns:
        The sum over all usable keywords of their non-overlapping substring
        occurrences in the normalized text; 0 when either argument is empty.
    """
    if not text or not keywords:
        return 0

    haystack = _normalize(text)
    total = 0
    for kw in keywords:
        if not isinstance(kw, str):
            continue
        needle = _normalize(kw)
        # A whitespace-only keyword normalizes to "", and str.count("")
        # returns len(haystack) + 1, which would inflate the score.
        if not needle:
            continue
        # Plain substring matching; it works for Chinese keywords as well.
        total += haystack.count(needle)
    return total
|
||||
|
||||
|
||||
def classify_article(article: EnrichedArticle, categories: List[Taxonomy]) -> str:
    """Pick the best-matching category name for an article.

    Each category is scored by keyword hits over the title, the summary (AI
    summary if present, otherwise the original one) and the content, plus a
    bonus of 2 when the category name equals the article's feed category. The
    first category with the strictly highest positive score wins. With no
    winner the feed category is returned, and failing that "未分类".
    """
    haystack = " ".join(
        (
            article.title or "",
            article.ai_summary or article.original_summary or "",
            article.content or "",
        )
    )

    winner, top_score = "", 0
    for candidate in categories:
        score = _count_matches(haystack, candidate.keywords or [])
        # Small boost when the source feed is already filed under this category.
        if article.feed_category and article.feed_category == candidate.name:
            score += 2
        if score > top_score:
            winner, top_score = candidate.name, score

    # Fallback chain: scored winner, then feed category, then "uncategorized".
    return winner or article.feed_category or "未分类"
|
||||
|
||||
|
||||
def tag_article(article: EnrichedArticle, tags: List[Taxonomy]) -> List[str]:
    """Return the names of all tags whose keywords occur in the article.

    The searched text is the title, the summary (AI summary if present,
    otherwise the original one) and the content. Names are de-duplicated
    while keeping the order in which the tags were supplied.
    """
    haystack = " ".join(
        (
            article.title or "",
            article.ai_summary or article.original_summary or "",
            article.content or "",
        )
    )
    hits = [
        tag.name
        for tag in tags
        if _count_matches(haystack, tag.keywords or []) > 0
    ]
    # dict.fromkeys drops duplicates and preserves first-seen order.
    return list(dict.fromkeys(hits))
|
||||
|
||||
|
||||
def tag_articles(db: Session, article_ids: List[int] = None) -> int:
    """Classify and tag articles, committing in batches of 50.

    Args:
        db: SQLAlchemy session used for all reads and writes.
        article_ids: When non-empty, only articles with these ``id`` values
            are processed (and re-classified even if already categorized).
            When ``None`` or empty, every article whose category is empty or
            NULL is processed; articles are not selected by missing tags.

    Returns:
        The number of articles processed; 0 when no category rules exist.
    """
    categories = db.query(Taxonomy).filter(Taxonomy.kind == "category").all()
    if not categories:
        logger.warning("taxonomy 中无 category 数据,跳过分类")
        return 0

    # Loaded only after the early return above, to avoid a wasted query.
    tags = db.query(Taxonomy).filter(Taxonomy.kind == "tag").all()

    query = db.query(EnrichedArticle)
    if article_ids:
        query = query.filter(EnrichedArticle.id.in_(article_ids))
    else:
        query = query.filter(
            (EnrichedArticle.category == "") | EnrichedArticle.category.is_(None)
        )

    count = 0
    for article in query.all():
        article.category = classify_article(article, categories)
        article.tags = tag_article(article, tags)
        count += 1
        if count % 50 == 0:
            db.commit()

    db.commit()
    logger.info("分类/打标签完成: %d 篇文章", count)
    return count
|
||||
+140
@@ -0,0 +1,140 @@
|
||||
"""分类/标签/打分规则体系的初始化与维护"""
|
||||
import json
|
||||
import logging
|
||||
from typing import List, Dict, Any
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.ai_client import ai_client
|
||||
from app.rss_client import rss_client
|
||||
from models import Taxonomy
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# System prompt asking the LLM to emit a JSON taxonomy with the keys
# categories, tags, heat_rules, importance_rules and duplication_indicators.
# bootstrap_taxonomy passes it without str.format, so the literal braces in
# the JSON example need no escaping.
# NOTE(review): the lone `|` / `||||` lines inside this literal look like
# diff-viewer residue from extraction, and the JSON example's indentation
# appears to have been lost — confirm against the repository.
TAXONOMY_SYSTEM_PROMPT = """你是一位专业的信息分类与内容分析专家。
|
||||
请根据用户提供的 RSS 文章样本,生成一套适合的中文内容分类体系、标签体系和打分规则。
|
||||
|
||||
输出必须是合法的 JSON,格式如下:
|
||||
{
|
||||
"categories": [
|
||||
{"name": "科技", "description": "人工智能、芯片、互联网、软件等", "keywords": ["AI", "芯片", "大模型", ...]}
|
||||
],
|
||||
"tags": [
|
||||
{"name": "人工智能", "description": "...", "keywords": ["AI", "人工智能", "大模型", ...]}
|
||||
],
|
||||
"heat_rules": [
|
||||
{"name": "热点事件", "keywords": ["突发", "重磅", "刚刚", "发布"], "weight": 1.5}
|
||||
],
|
||||
"importance_rules": [
|
||||
{"name": "政策法规", "keywords": ["政策", "监管", "法规", "征求意见"], "weight": 1.5}
|
||||
],
|
||||
"duplication_indicators": [
|
||||
{"name": "同一事件", "keywords": ["宣布", "发布", "推出"], "weight": 1.0}
|
||||
]
|
||||
}
|
||||
|
||||
要求:
|
||||
1. categories 数量控制在 8-12 个,覆盖科技、财经、新闻、设计、生活等常见 RSS 主题。
|
||||
2. tags 数量控制在 30-50 个,尽量细化但避免过度重叠。
|
||||
3. heat_rules 和 importance_rules 各 10-20 条,weight 范围 0.5-2.0。
|
||||
4. 所有 keywords 用中文或中英双语,便于后续关键词匹配。
|
||||
5. 不要输出任何解释文字,只输出 JSON。
|
||||
"""
|
||||
|
||||
|
||||
def _build_sample_prompt(articles: List[Dict[str, Any]]) -> str:
|
||||
lines = [f"共有 {len(articles)} 篇文章样本:"]
|
||||
for idx, art in enumerate(articles[:50], 1):
|
||||
title = art.get("title", "")
|
||||
summary = art.get("summary", "") or art.get("content", "")[:300]
|
||||
feed = art.get("feed_title", "")
|
||||
cat = art.get("category", "")
|
||||
lines.append(f"\n[{idx}] 标题:{title}")
|
||||
lines.append(f" 来源:{feed} | 源分类:{cat}")
|
||||
lines.append(f" 摘要:{summary[:400]}")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def bootstrap_taxonomy(db: Session, force: bool = False) -> bool:
    """Initialize the category / tag / scoring rules from an LLM proposal.

    Args:
        db: SQLAlchemy session used for all reads and writes.
        force: When True, existing rules are replaced; otherwise
            initialization only happens when the table is empty.

    Returns:
        True when a new taxonomy was written. False when initialization was
        skipped, no sample articles were available, or the LLM call failed;
        in all of those cases the existing rules are left untouched.

    Raises:
        Exception: whatever ``_save_taxonomy`` (or the delete before it)
            raises, after the session has been rolled back.
    """
    if not force and db.query(Taxonomy).first():
        logger.info("taxonomy 表已存在,跳过初始化")
        return False

    logger.info("开始从 rssKeeper 拉取样本文章并生成分类体系...")
    articles = rss_client.fetch_recent(hours=24 * 7, limit=200)
    if not articles:
        logger.warning("未获取到样本文章,无法生成分类体系")
        return False

    user_prompt = _build_sample_prompt(articles)
    try:
        result = ai_client.chat_completion_json(
            system_prompt=TAXONOMY_SYSTEM_PROMPT,
            user_prompt=user_prompt,
            temperature=0.5,
        )
    except Exception as exc:
        logger.error("生成分类体系失败: %s", exc)
        return False

    try:
        if force:
            # Delete only once a replacement exists, and do not commit here:
            # _save_taxonomy commits, so the delete and the inserts land in
            # one transaction and a failure cannot leave the table empty.
            db.query(Taxonomy).delete()
            logger.info("强制重新初始化 taxonomy")
        _save_taxonomy(db, result)
    except Exception:
        db.rollback()
        raise

    logger.info("taxonomy 初始化完成,共写入 %d 条规则", db.query(Taxonomy).count())
    return True
|
||||
|
||||
|
||||
def _save_taxonomy(db: Session, data: Dict[str, Any]) -> None:
    """Persist the LLM-generated taxonomy and commit.

    ``data`` comes from model output and is treated as untrusted. Sections
    that are missing or null are skipped, entries that are not objects or
    have no usable name are skipped, a single string under ``keywords`` is
    wrapped in a list, and a missing or non-numeric ``weight`` falls back to
    the section default.

    Args:
        db: SQLAlchemy session; one ``Taxonomy`` row is added per valid entry.
        data: Parsed JSON object with the optional keys ``categories``,
            ``tags``, ``heat_rules``, ``importance_rules`` and
            ``duplication_indicators``.
    """

    def _add(kind: str, items: List[Dict[str, Any]], default_weight: float = 1.0):
        # Stage one Taxonomy row per well-formed item of the given kind.
        for item in items or []:
            if not isinstance(item, dict):
                continue
            name = str(item.get("name") or "").strip()
            if not name:
                continue
            keywords = item.get("keywords") or []
            if isinstance(keywords, str):
                keywords = [keywords]
            try:
                weight = float(item.get("weight", default_weight))
            except (TypeError, ValueError):
                # One malformed weight must not abort the whole save.
                weight = default_weight
            db.add(
                Taxonomy(
                    name=name,
                    kind=kind,
                    description=item.get("description") or "",
                    keywords=keywords,
                    weight=weight,
                    created_by_ai=True,
                )
            )

    _add("category", data.get("categories"))
    _add("tag", data.get("tags"))
    _add("heat_rule", data.get("heat_rules"), default_weight=1.0)
    _add("importance_rule", data.get("importance_rules"), default_weight=1.0)
    _add("duplication_rule", data.get("duplication_indicators"), default_weight=1.0)

    db.commit()
|
||||
|
||||
|
||||
def ensure_taxonomy(db: Session) -> bool:
    """Make sure the taxonomy table has rows, bootstrapping it when empty.

    Returns True when at least one row already exists; otherwise returns the
    result of ``bootstrap_taxonomy(db)``.
    """
    if not db.query(Taxonomy).first():
        return bootstrap_taxonomy(db)
    return True
|
||||
|
||||
|
||||
def list_taxonomy(db: Session, kind: str = None) -> List[Taxonomy]:
    """List taxonomy rules ordered by kind and then name.

    When ``kind`` is truthy, only rules of that kind are returned.
    """
    rules = db.query(Taxonomy)
    rules = rules.filter(Taxonomy.kind == kind) if kind else rules
    return rules.order_by(Taxonomy.kind, Taxonomy.name).all()
|
||||
Reference in New Issue
Block a user