"""文章摘要生成器:对无摘要或短摘要文章调用 LLM 生成 AI 摘要""" import logging from datetime import datetime, timezone from typing import List, Dict, Any from sqlalchemy.orm import Session from app.ai_client import ai_client from app.rss_client import rss_client from config import settings from models import EnrichedArticle logger = logging.getLogger(__name__) SUMMARY_SYSTEM_PROMPT = """你是一位擅长阅读 RSS 新闻并提炼摘要的助手。 请用简洁流畅的中文总结文章核心内容,要求: 1. 长度控制在 {max_length} 个汉字以内。 2. 包含文章最重要的 1-3 个要点。 3. 不要添加个人评价,不要复述原文标题。 4. 若原文是英文,请用中文输出摘要。 """ SUMMARY_USER_PROMPT_TEMPLATE = """请为以下文章生成摘要。 标题:{title} 作者:{author} 来源:{feed_title} 正文: {content} """ def _needs_summary(article: EnrichedArticle) -> bool: """判断是否需要生成 AI 摘要""" if not article.ai_summary: return True original = article.original_summary or "" if len(original.strip()) < settings.MIN_ORIGINAL_SUMMARY_LENGTH: return True return False def _prepare_content(raw_content: str, max_chars: int = 8000) -> str: """清洗并截断正文,避免超过 LLM 上下文""" text = raw_content or "" # 简单去除多余空白 text = " ".join(text.split()) return text[:max_chars] def _generate_summary(article: EnrichedArticle) -> str: """调用 LLM 生成单篇文章摘要""" content = _prepare_content(article.content or article.original_summary or "") if not content.strip(): # 如果连原始摘要都没有,只能基于标题生成 content = article.title or "" system_prompt = SUMMARY_SYSTEM_PROMPT.format(max_length=settings.MAX_AI_SUMMARY_LENGTH) user_prompt = SUMMARY_USER_PROMPT_TEMPLATE.format( title=article.title or "", author=article.author or "", feed_title=article.feed_title or "", content=content, ) try: summary = ai_client.chat_completion( system_prompt=system_prompt, user_prompt=user_prompt, temperature=0.3, ) return summary[: settings.MAX_AI_SUMMARY_LENGTH] except Exception as exc: logger.error("生成 article_id=%d 摘要失败: %s", article.rk_article_id, exc) return "" def _article_from_rss(raw: Dict[str, Any]) -> Dict[str, Any]: """把 rssKeeper 返回的文章转换为可写入 enriched 表的字典""" published_at = raw.get("published_at") if isinstance(published_at, str): try: published_at = datetime.fromisoformat(published_at.replace("Z", "+00:00")) except Exception: published_at = None return { "rk_article_id": raw["id"], "title": raw.get("title", "") or "", "link": raw.get("link", "") or "", "feed_id": raw.get("feed_id", 0), "feed_title": raw.get("feed_title", "") or "", "feed_category": raw.get("category", "") or "", "author": raw.get("author", "") or "", "published_at": published_at, "original_summary": raw.get("summary", "") or "", "content": raw.get("content", "") or "", } def fetch_and_summarize(db: Session, hours: int = 24, limit: int = 200) -> Dict[str, int]: """ 拉取最近文章,补充 AI 摘要。 返回统计信息:{"fetched": x, "created": y, "summarized": z} """ articles = rss_client.fetch_recent(hours=hours, limit=limit) if not articles: logger.info("未拉取到新文章") return {"fetched": 0, "created": 0, "summarized": 0} stats = {"fetched": len(articles), "created": 0, "summarized": 0} for raw in articles: data = _article_from_rss(raw) article = db.query(EnrichedArticle).filter( EnrichedArticle.rk_article_id == data["rk_article_id"] ).first() if article is None: article = EnrichedArticle(**data) db.add(article) db.flush() stats["created"] += 1 else: # 更新已有记录的基础字段 article.title = data["title"] or article.title article.link = data["link"] or article.link article.feed_title = data["feed_title"] or article.feed_title article.feed_category = data["feed_category"] or article.feed_category article.author = data["author"] or article.author article.published_at = data["published_at"] or article.published_at article.original_summary = data["original_summary"] or article.original_summary article.content = data["content"] or article.content article.fetched_at = datetime.now(timezone.utc) if _needs_summary(article): ai_summary = _generate_summary(article) if ai_summary: article.ai_summary = ai_summary stats["summarized"] += 1 # 每 10 篇提交一次,避免长时间事务 if stats["summarized"] % 10 == 0: db.commit() db.commit() logger.info( "摘要任务完成: fetched=%d, created=%d, summarized=%d", stats["fetched"], stats["created"], stats["summarized"] ) return stats