Files
dataClean/app/summarizer.py
T

155 lines
5.3 KiB
Python
Raw Normal View History

2026-06-12 16:04:03 +08:00
"""文章摘要生成器:对无摘要或短摘要文章调用 LLM 生成 AI 摘要"""
import logging
from datetime import datetime, timezone
from typing import List, Dict, Any
from sqlalchemy.orm import Session
from app.ai_client import ai_client
from app.rss_client import rss_client
from config import settings
from models import EnrichedArticle
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# System prompt for the summarization request. It tells the model to write a
# concise Chinese summary containing the 1-3 most important points, without
# personal opinions and without repeating the title, and to answer in Chinese
# even when the article is in English. The ``{max_length}`` placeholder (the
# maximum number of Chinese characters) must be filled with ``str.format``.
SUMMARY_SYSTEM_PROMPT = """你是一位擅长阅读 RSS 新闻并提炼摘要的助手。
请用简洁流畅的中文总结文章核心内容,要求:
1. 长度控制在 {max_length} 个汉字以内。
2. 包含文章最重要的 1-3 个要点。
3. 不要添加个人评价,不要复述原文标题。
4. 若原文是英文,请用中文输出摘要。
"""
# User prompt template carrying the article itself. Placeholders filled with
# ``str.format``: ``{title}``, ``{author}``, ``{feed_title}`` and ``{content}``.
SUMMARY_USER_PROMPT_TEMPLATE = """请为以下文章生成摘要。
标题:{title}
作者:{author}
来源:{feed_title}
正文:
{content}
"""
def _needs_summary(article: EnrichedArticle) -> bool:
    """Decide whether an AI summary should be generated for *article*.

    Returns True when the article has no AI summary yet, or when its
    original summary (ignoring surrounding whitespace) is shorter than
    ``settings.MIN_ORIGINAL_SUMMARY_LENGTH``; otherwise False.
    """
    # NOTE(review): the length check also applies to articles that already
    # carry an AI summary, so those are reported as needing one on every
    # call -- confirm that regenerating them is intended.
    if article.ai_summary:
        stripped_original = (article.original_summary or "").strip()
        return len(stripped_original) < settings.MIN_ORIGINAL_SUMMARY_LENGTH
    return True
def _prepare_content(raw_content: str, max_chars: int = 8000) -> str:
"""清洗并截断正文,避免超过 LLM 上下文"""
text = raw_content or ""
# 简单去除多余空白
text = " ".join(text.split())
return text[:max_chars]
def _generate_summary(article: EnrichedArticle) -> str:
    """Ask the LLM for a summary of a single article.

    The prompt body is the first of the article's ``content``,
    ``original_summary`` and ``title`` that contains visible text, after
    whitespace normalization and truncation by ``_prepare_content``.

    Args:
        article: The article to summarize. Its ``title``, ``author`` and
            ``feed_title`` are also placed in the prompt.

    Returns:
        The summary with surrounding whitespace removed, truncated to
        ``settings.MAX_AI_SUMMARY_LENGTH`` characters, or ``""`` when the LLM
        call fails or yields no text. Failures are logged, not raised.
    """
    # Take the first source with visible text. A whitespace-only body is
    # truthy, so a plain ``a or b`` chain would never reach the fallbacks.
    content = ""
    for candidate in (article.content, article.original_summary, article.title):
        content = _prepare_content(candidate or "")
        if content.strip():
            break

    max_length = settings.MAX_AI_SUMMARY_LENGTH
    system_prompt = SUMMARY_SYSTEM_PROMPT.format(max_length=max_length)
    user_prompt = SUMMARY_USER_PROMPT_TEMPLATE.format(
        title=article.title or "",
        author=article.author or "",
        feed_title=article.feed_title or "",
        content=content,
    )

    try:
        summary = ai_client.chat_completion(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            temperature=0.3,
        )
    except Exception as exc:
        # Deliberately broad: summarization is best-effort, and any client
        # failure is reported to the caller as "no summary".
        # %s rather than %d so a non-integer id cannot break log formatting.
        logger.error("生成 article_id=%s 摘要失败: %s", article.rk_article_id, exc)
        return ""

    # Treat None, non-text and blank responses as "no summary" explicitly
    # instead of relying on a TypeError from slicing.
    if not isinstance(summary, str) or not summary.strip():
        logger.error(
            "生成 article_id=%s 摘要失败: %s",
            article.rk_article_id,
            "LLM 未返回文本内容",
        )
        return ""

    return summary.strip()[:max_length]
def _article_from_rss(raw: Dict[str, Any]) -> Dict[str, Any]:
"""把 rssKeeper 返回的文章转换为可写入 enriched 表的字典"""
published_at = raw.get("published_at")
if isinstance(published_at, str):
try:
published_at = datetime.fromisoformat(published_at.replace("Z", "+00:00"))
except Exception:
published_at = None
return {
"rk_article_id": raw["id"],
"title": raw.get("title", "") or "",
"link": raw.get("link", "") or "",
"feed_id": raw.get("feed_id", 0),
"feed_title": raw.get("feed_title", "") or "",
"feed_category": raw.get("category", "") or "",
"author": raw.get("author", "") or "",
"published_at": published_at,
"original_summary": raw.get("summary", "") or "",
"content": raw.get("content", "") or "",
}
# Pending work is committed each time this many more articles were summarized.
_SUMMARY_COMMIT_BATCH = 10

# Fields of an existing row that are overwritten when the feed supplies a
# non-empty value; an empty incoming value keeps what is already stored.
_REFRESHED_FIELDS = (
    "title",
    "link",
    "feed_title",
    "feed_category",
    "author",
    "published_at",
    "original_summary",
    "content",
)


def _upsert_article(db: Session, data: Dict[str, Any]) -> "tuple[EnrichedArticle, bool]":
    """Find or create the enriched row for *data*; return ``(article, created)``."""
    article = db.query(EnrichedArticle).filter(
        EnrichedArticle.rk_article_id == data["rk_article_id"]
    ).first()
    created = article is None
    if created:
        article = EnrichedArticle(**data)
        db.add(article)
    else:
        for field in _REFRESHED_FIELDS:
            if data[field]:
                setattr(article, field, data[field])
    # Record when this article was last seen in a fetch (timezone-aware UTC).
    article.fetched_at = datetime.now(timezone.utc)
    if created:
        # Flush so the new row is sent to the database right away; a later
        # payload with the same id in this batch is then found by the query.
        db.flush()
    return article, created


def fetch_and_summarize(db: Session, hours: int = 24, limit: int = 200) -> Dict[str, int]:
    """Fetch recent articles, store them and add AI summaries where needed.

    Each fetched article is inserted into, or refreshed in, the enriched
    table. Articles selected by ``_needs_summary`` are sent to the LLM, and a
    non-empty result is stored as ``ai_summary``. Work is committed after
    every ``_SUMMARY_COMMIT_BATCH`` stored summaries to keep transactions
    short, and once more at the end.

    Args:
        db: SQLAlchemy session used for all reads and writes. This function
            commits on it and rolls it back on failure.
        hours: Passed to ``rss_client.fetch_recent`` as ``hours``.
        limit: Passed to ``rss_client.fetch_recent`` as ``limit``.

    Returns:
        ``{"fetched": x, "created": y, "summarized": z}``: the number of
        articles fetched, rows newly created and summaries stored.

    Raises:
        Exception: Any error raised while processing or committing is
            re-raised after the session is rolled back. Batches that were
            already committed stay committed.
    """
    articles = rss_client.fetch_recent(hours=hours, limit=limit)
    if not articles:
        logger.info("未拉取到新文章")
        return {"fetched": 0, "created": 0, "summarized": 0}

    stats = {"fetched": len(articles), "created": 0, "summarized": 0}
    try:
        for raw in articles:
            article, created = _upsert_article(db, _article_from_rss(raw))
            if created:
                stats["created"] += 1

            if not _needs_summary(article):
                continue
            ai_summary = _generate_summary(article)
            if not ai_summary:
                # Summarization is best-effort; the row is still stored.
                continue

            article.ai_summary = ai_summary
            stats["summarized"] += 1
            # Checked only right after the counter changes, so a count that
            # sits on a multiple of the batch size (including 0) does not
            # trigger a commit on every following iteration.
            if stats["summarized"] % _SUMMARY_COMMIT_BATCH == 0:
                db.commit()
        db.commit()
    except Exception:
        # Leave the session usable for the caller instead of stuck in a
        # failed transaction, then let the error propagate.
        db.rollback()
        raise

    logger.info(
        "摘要任务完成: fetched=%d, created=%d, summarized=%d",
        stats["fetched"], stats["created"], stats["summarized"]
    )
    return stats