"""Article summary generator.

Calls an LLM to produce an AI summary for articles that have no summary or
only a short one.
"""
|
|
import logging
|
|
from datetime import datetime, timezone
|
|
from typing import List, Dict, Any
|
|
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.ai_client import ai_client
|
|
from app.rss_client import rss_client
|
|
from config import settings
|
|
from models import EnrichedArticle
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
SUMMARY_SYSTEM_PROMPT = """你是一位擅长阅读 RSS 新闻并提炼摘要的助手。
|
|
请用简洁流畅的中文总结文章核心内容,要求:
|
|
1. 长度控制在 {max_length} 个汉字以内。
|
|
2. 包含文章最重要的 1-3 个要点。
|
|
3. 不要添加个人评价,不要复述原文标题。
|
|
4. 若原文是英文,请用中文输出摘要。
|
|
"""
|
|
|
|
|
|
SUMMARY_USER_PROMPT_TEMPLATE = """请为以下文章生成摘要。
|
|
|
|
标题:{title}
|
|
作者:{author}
|
|
来源:{feed_title}
|
|
|
|
正文:
|
|
{content}
|
|
"""
|
|
|
|
|
|
def _needs_summary(article: EnrichedArticle) -> bool:
    """Decide whether an AI summary should be generated for ``article``.

    A summary is requested only when the article has no AI summary yet AND
    its original (feed-provided) summary is missing or, after stripping
    surrounding whitespace, shorter than
    ``settings.MIN_ORIGINAL_SUMMARY_LENGTH`` characters.

    Args:
        article: The enriched article record to inspect. Only its
            ``ai_summary`` and ``original_summary`` attributes are read.

    Returns:
        True if the LLM should be asked for a summary, False otherwise.
    """
    # An existing AI summary is final. Without this guard, every article whose
    # original summary is short would be sent to the LLM again on each run,
    # because the length check below stays true forever.
    if article.ai_summary:
        return False

    original = (article.original_summary or "").strip()
    return len(original) < settings.MIN_ORIGINAL_SUMMARY_LENGTH
|
|
|
|
|
|
def _prepare_content(raw_content: str, max_chars: int = 8000) -> str:
    """Normalize whitespace in the article body and cap its length.

    Every run of whitespace (spaces, tabs, newlines) is collapsed into a
    single space and leading/trailing whitespace is removed; the result is
    then cut to at most ``max_chars`` characters so the prompt stays within
    the LLM context budget.

    Args:
        raw_content: The raw article text; a falsy value yields ``""``.
        max_chars: Maximum number of characters to keep.

    Returns:
        The cleaned, truncated text.
    """
    if not raw_content:
        return ""
    # str.split() with no argument splits on any whitespace run and drops
    # empty pieces, so re-joining with one space collapses the text.
    collapsed = " ".join(raw_content.split())
    return collapsed[:max_chars]
|
|
|
|
|
|
def _generate_summary(article: EnrichedArticle) -> str:
    """Ask the LLM for a summary of a single article.

    The text to summarize is the article content, falling back to the
    original summary and finally to the title. The request is best-effort:
    any failure is logged and reported to the caller as an empty string.

    Args:
        article: The article to summarize. Reads ``content``,
            ``original_summary``, ``title``, ``author``, ``feed_title`` and
            ``rk_article_id``.

    Returns:
        The summary with surrounding whitespace removed and truncated to
        ``settings.MAX_AI_SUMMARY_LENGTH`` characters, or ``""`` when there is
        nothing to summarize, the LLM call fails, or the reply is empty or
        not a string.
    """
    content = _prepare_content(article.content or article.original_summary or "")
    if not content.strip():
        # Neither body nor original summary is available: the title is all
        # we can base a summary on.
        content = (article.title or "").strip()
    if not content:
        # Nothing at all to summarize; skip the pointless LLM round-trip.
        return ""

    system_prompt = SUMMARY_SYSTEM_PROMPT.format(
        max_length=settings.MAX_AI_SUMMARY_LENGTH
    )
    user_prompt = SUMMARY_USER_PROMPT_TEMPLATE.format(
        title=article.title or "",
        author=article.author or "",
        feed_title=article.feed_title or "",
        content=content,
    )

    try:
        summary = ai_client.chat_completion(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            temperature=0.3,
        )
    except Exception as exc:
        # Deliberately broad: one failed LLM call must not abort the batch.
        # %s (not %d) so a non-integer or missing id cannot break the log
        # record itself.
        logger.error("生成 article_id=%s 摘要失败: %s", article.rk_article_id, exc)
        return ""

    if not isinstance(summary, str):
        # Treat a None / non-text reply as "no summary" explicitly instead of
        # relying on the slice below to raise.
        logger.warning(
            "article_id=%s 的摘要响应无效: %r", article.rk_article_id, summary
        )
        return ""

    # Strip first so a whitespace-only reply is not stored as a real summary.
    return summary.strip()[: settings.MAX_AI_SUMMARY_LENGTH]
|
|
|
|
|
|
def _article_from_rss(raw: Dict[str, Any]) -> Dict[str, Any]:
    """Convert an article dict returned by rssKeeper into enriched-table fields.

    Missing or ``None`` text fields become ``""`` and a missing or ``None``
    ``feed_id`` becomes ``0``. A string ``published_at`` is parsed as ISO 8601
    (a "Z" suffix is read as UTC); an unparseable string becomes ``None``.
    A non-string ``published_at`` is passed through unchanged.

    Args:
        raw: One article as returned by the RSS client. Must contain ``"id"``.

    Returns:
        A dict with the keys ``rk_article_id``, ``title``, ``link``,
        ``feed_id``, ``feed_title``, ``feed_category``, ``author``,
        ``published_at``, ``original_summary`` and ``content``.

    Raises:
        KeyError: If ``raw`` has no ``"id"`` entry.
    """
    published_at = raw.get("published_at")
    if isinstance(published_at, str):
        try:
            # Rewrite the "Z" suffix as an explicit offset, since
            # datetime.fromisoformat() does not accept "Z" on older Pythons.
            published_at = datetime.fromisoformat(
                published_at.strip().replace("Z", "+00:00")
            )
        except ValueError:
            # Malformed timestamp: store no publication time rather than fail.
            published_at = None

    def _text(key: str) -> Any:
        # A missing key and an explicit None both collapse to "".
        return raw.get(key) or ""

    return {
        "rk_article_id": raw["id"],
        "title": _text("title"),
        "link": _text("link"),
        # `or 0` so an explicit None is normalized like the text fields are.
        "feed_id": raw.get("feed_id") or 0,
        "feed_title": _text("feed_title"),
        "feed_category": _text("category"),
        "author": _text("author"),
        "published_at": published_at,
        "original_summary": _text("summary"),
        "content": _text("content"),
    }
|
|
|
|
|
|
def fetch_and_summarize(
    db: Session,
    hours: int = 24,
    limit: int = 200,
    commit_every: int = 10,
) -> Dict[str, int]:
    """Fetch recent articles, upsert them and fill in missing AI summaries.

    Each fetched article is looked up by ``rk_article_id``. Unknown articles
    are inserted; known ones have their basic fields refreshed (an empty
    incoming value never overwrites a stored one). Articles for which
    ``_needs_summary`` is true are then sent to the LLM.

    Args:
        db: SQLAlchemy session used for all reads and writes.
        hours: Forwarded to ``rss_client.fetch_recent`` as ``hours``.
        limit: Forwarded to ``rss_client.fetch_recent`` as ``limit``.
        commit_every: Commit each time this many new summaries have been
            generated, to keep transactions short. A non-positive value
            disables intermediate commits (only the final commit runs).

    Returns:
        Counters ``{"fetched": x, "created": y, "summarized": z}``.

    Raises:
        Exception: Any error raised while processing or committing is
            re-raised after the session has been rolled back.
    """
    articles = rss_client.fetch_recent(hours=hours, limit=limit)
    if not articles:
        logger.info("未拉取到新文章")
        return {"fetched": 0, "created": 0, "summarized": 0}

    stats = {"fetched": len(articles), "created": 0, "summarized": 0}

    try:
        for raw in articles:
            try:
                data = _article_from_rss(raw)
            except KeyError:
                # A record without an id cannot be matched or stored; skip it
                # instead of aborting the whole batch.
                logger.warning("跳过缺少 id 的文章: %r", raw.get("title"))
                continue

            article = (
                db.query(EnrichedArticle)
                .filter(EnrichedArticle.rk_article_id == data["rk_article_id"])
                .first()
            )

            if article is None:
                article = EnrichedArticle(**data)
                db.add(article)
                # Flush so the new row is visible to later lookups in this
                # batch.
                db.flush()
                stats["created"] += 1
            else:
                # Refresh basic fields; `or` keeps the stored value whenever
                # the incoming one is empty.
                article.title = data["title"] or article.title
                article.link = data["link"] or article.link
                article.feed_title = data["feed_title"] or article.feed_title
                article.feed_category = (
                    data["feed_category"] or article.feed_category
                )
                article.author = data["author"] or article.author
                article.published_at = data["published_at"] or article.published_at
                article.original_summary = (
                    data["original_summary"] or article.original_summary
                )
                article.content = data["content"] or article.content
                # NOTE(review): new rows do not set fetched_at here;
                # presumably the model supplies a default. Confirm.
                article.fetched_at = datetime.now(timezone.utc)

            if _needs_summary(article):
                ai_summary = _generate_summary(article)
                if ai_summary:
                    article.ai_summary = ai_summary
                    stats["summarized"] += 1
                    # Check only right after the counter changes. Testing it
                    # on every iteration would commit each time while the
                    # count sits at 0 or at a multiple of the batch size.
                    if commit_every > 0 and stats["summarized"] % commit_every == 0:
                        db.commit()

        db.commit()
    except Exception:
        # Leave the session usable for the caller, then surface the error.
        db.rollback()
        raise

    logger.info(
        "摘要任务完成: fetched=%d, created=%d, summarized=%d",
        stats["fetched"],
        stats["created"],
        stats["summarized"],
    )
    return stats
|