feat: 修复代码审核报告问题
This commit is contained in:
@@ -0,0 +1,154 @@
|
||||
"""文章摘要生成器:对无摘要或短摘要文章调用 LLM 生成 AI 摘要"""
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from typing import List, Dict, Any
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.ai_client import ai_client
|
||||
from app.rss_client import rss_client
|
||||
from config import settings
|
||||
from models import EnrichedArticle
|
||||
|
||||
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# System prompt for the summarization request. The ``{max_length}``
# placeholder is filled with ``str.format`` before the prompt is sent.
# Stray "|" / "||||" lines that had leaked into the literal are removed so
# they are no longer part of the text the model receives.
SUMMARY_SYSTEM_PROMPT = """你是一位擅长阅读 RSS 新闻并提炼摘要的助手。
请用简洁流畅的中文总结文章核心内容,要求:
1. 长度控制在 {max_length} 个汉字以内。
2. 包含文章最重要的 1-3 个要点。
3. 不要添加个人评价,不要复述原文标题。
4. 若原文是英文,请用中文输出摘要。
"""
|
||||
|
||||
|
||||
# User prompt template for one article. Placeholders filled with
# ``str.format``: ``{title}``, ``{author}``, ``{feed_title}``, ``{content}``.
# Stray "|" / "||||" lines that had leaked into the literal are removed; the
# two intentional blank lines (after the intro and before the body) are kept.
SUMMARY_USER_PROMPT_TEMPLATE = """请为以下文章生成摘要。

标题:{title}
作者:{author}
来源:{feed_title}

正文:
{content}
"""
|
||||
|
||||
|
||||
def _needs_summary(article: EnrichedArticle) -> bool:
    """Decide whether an article still needs an AI-generated summary.

    An article qualifies only when it has no AI summary yet and its original
    summary is missing or, after stripping whitespace, shorter than
    ``settings.MIN_ORIGINAL_SUMMARY_LENGTH`` characters.

    Args:
        article: Record exposing ``ai_summary`` and ``original_summary``.

    Returns:
        True when a summary should be generated, False otherwise.
    """
    # An existing AI summary is final. Without this guard, every article with
    # a short original summary would be sent to the LLM again on each run.
    if article.ai_summary:
        return False

    original = (article.original_summary or "").strip()
    return len(original) < settings.MIN_ORIGINAL_SUMMARY_LENGTH
|
||||
|
||||
|
||||
def _prepare_content(raw_content: str, max_chars: int = 8000) -> str:
|
||||
"""清洗并截断正文,避免超过 LLM 上下文"""
|
||||
text = raw_content or ""
|
||||
# 简单去除多余空白
|
||||
text = " ".join(text.split())
|
||||
return text[:max_chars]
|
||||
|
||||
|
||||
def _generate_summary(article: EnrichedArticle) -> str:
    """Ask the LLM for a summary of a single article.

    The prompt body is the cleaned article content, falling back to the
    original summary and finally to the title when nothing else is available.

    Args:
        article: Record exposing ``content``, ``original_summary``, ``title``,
            ``author``, ``feed_title`` and ``rk_article_id``.

    Returns:
        The reply stripped of surrounding whitespace and truncated to
        ``settings.MAX_AI_SUMMARY_LENGTH`` characters, or ``""`` when the
        request fails or the reply is empty.
    """
    max_length = settings.MAX_AI_SUMMARY_LENGTH

    content = _prepare_content(article.content or article.original_summary or "")
    if not content.strip():
        # Neither body nor original summary: the title is all there is.
        content = article.title or ""

    system_prompt = SUMMARY_SYSTEM_PROMPT.format(max_length=max_length)
    user_prompt = SUMMARY_USER_PROMPT_TEMPLATE.format(
        title=article.title or "",
        author=article.author or "",
        feed_title=article.feed_title or "",
        content=content,
    )

    try:
        summary = ai_client.chat_completion(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            temperature=0.3,
        )
        # Strip before truncating so a whitespace-only reply becomes "" (and
        # is therefore not stored as a summary) and the length budget is not
        # spent on padding. A None reply is treated as empty.
        return (summary or "").strip()[:max_length]
    except Exception as exc:
        # Best-effort boundary: a failure for one article must not abort the
        # batch. %s (not %d) because the id is not guaranteed to be an int,
        # and a bad placeholder would make logging drop this message.
        logger.error("生成 article_id=%s 摘要失败: %s", article.rk_article_id, exc)
        return ""
|
||||
|
||||
|
||||
def _article_from_rss(raw: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""把 rssKeeper 返回的文章转换为可写入 enriched 表的字典"""
|
||||
published_at = raw.get("published_at")
|
||||
if isinstance(published_at, str):
|
||||
try:
|
||||
published_at = datetime.fromisoformat(published_at.replace("Z", "+00:00"))
|
||||
except Exception:
|
||||
published_at = None
|
||||
|
||||
return {
|
||||
"rk_article_id": raw["id"],
|
||||
"title": raw.get("title", "") or "",
|
||||
"link": raw.get("link", "") or "",
|
||||
"feed_id": raw.get("feed_id", 0),
|
||||
"feed_title": raw.get("feed_title", "") or "",
|
||||
"feed_category": raw.get("category", "") or "",
|
||||
"author": raw.get("author", "") or "",
|
||||
"published_at": published_at,
|
||||
"original_summary": raw.get("summary", "") or "",
|
||||
"content": raw.get("content", "") or "",
|
||||
}
|
||||
|
||||
|
||||
def fetch_and_summarize(db: Session, hours: int = 24, limit: int = 200) -> Dict[str, int]:
    """Fetch recent articles, upsert them and fill in AI summaries.

    Each fetched article is looked up by ``rk_article_id``. Unknown articles
    are inserted; known ones have their basic fields refreshed (a field is
    only overwritten by a truthy incoming value). Articles for which
    ``_needs_summary`` is true are then sent to ``_generate_summary``.

    Args:
        db: SQLAlchemy session used for lookups and writes.
        hours: Look-back window passed to ``rss_client.fetch_recent``.
        limit: Maximum article count passed to ``rss_client.fetch_recent``.

    Returns:
        Counters: ``{"fetched": x, "created": y, "summarized": z}``.
    """
    commit_every = 10
    # Fields refreshed on an existing record; feed_id is deliberately absent,
    # matching the set of fields the update path has always touched.
    refreshed_fields = (
        "title",
        "link",
        "feed_title",
        "feed_category",
        "author",
        "published_at",
        "original_summary",
        "content",
    )

    articles = rss_client.fetch_recent(hours=hours, limit=limit)
    if not articles:
        logger.info("未拉取到新文章")
        return {"fetched": 0, "created": 0, "summarized": 0}

    stats = {"fetched": len(articles), "created": 0, "summarized": 0}

    for processed, raw in enumerate(articles, start=1):
        data = _article_from_rss(raw)
        article = (
            db.query(EnrichedArticle)
            .filter(EnrichedArticle.rk_article_id == data["rk_article_id"])
            .first()
        )

        if article is None:
            article = EnrichedArticle(**data)
            db.add(article)
            db.flush()
            stats["created"] += 1
        else:
            # Keep the stored value whenever the incoming one is empty.
            for field in refreshed_fields:
                setattr(article, field, data[field] or getattr(article, field))
            article.fetched_at = datetime.now(timezone.utc)

        if _needs_summary(article):
            ai_summary = _generate_summary(article)
            if ai_summary:
                article.ai_summary = ai_summary
                stats["summarized"] += 1

        # Commit once per `commit_every` processed articles to keep
        # transactions short. Keyed on the processed count: the summarized
        # counter is 0 (and stays on multiples of 10) across articles that
        # need no summary, which made the old check fire on nearly every
        # iteration.
        if processed % commit_every == 0:
            db.commit()

    db.commit()
    logger.info(
        "摘要任务完成: fetched=%d, created=%d, summarized=%d",
        stats["fetched"], stats["created"], stats["summarized"]
    )
    return stats
|
||||
Reference in New Issue
Block a user