feat: init rssKeeper - RSS 抓取、管理与检索系统

完整功能包括: - FastAPI 后端 + SQLite + FTS5 全文搜索 - RSS 源管理、自动发现、OPML 导入导出 - 文章抓取、去重、分类、全文检索 - RSS 源健康度监控 - Vue 3 + Element Plus 暗色主题 Web UI - 对外 REST API 供 AI 分析调用 - Docker + docker-compose 部署
2026-06-11 14:03:36 +08:00
commit 54e7db0ef0
28 changed files with 2915 additions and 0 deletions
@@ -0,0 +1,298 @@
+"""RSS 抓取核心逻辑"""
+import time
+import re
+import html
+import hashlib
+from datetime import datetime, timezone
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from urllib.parse import urljoin
+import requests
+import feedparser
+from bs4 import BeautifulSoup
+from sqlalchemy.orm import Session
+from models import Feed, Article, FetchLog
+from database import SessionLocal
+import config
+
+
+def fetch_feed(url: str, timeout: int = config.FETCH_TIMEOUT) -> dict:
+    """抓取单个 RSS 源
+    返回 {"success": bool, "feed_data": parsed, "error": str, "response_time_ms": int}
+    """
+    start_time = time.time()
+    try:
+        headers = {
+            "User-Agent": "rssKeeper/1.0 (+https://github.com/rssKeeper)",
+            "Accept": "application/rss+xml, application/atom+xml, application/xml, text/xml, */*",
+        }
+        response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
+        response.raise_for_status()
+
+        # 解析 RSS
+        parsed = feedparser.parse(response.content)
+
+        response_time_ms = int((time.time() - start_time) * 1000)
+
+        if parsed.bozo and hasattr(parsed, 'bozo_exception'):
+            # 有解析警告但可能仍然可用
+            pass
+
+        return {
+            "success": True,
+            "feed_data": parsed,
+            "error": None,
+            "response_time_ms": response_time_ms,
+        }
+    except requests.exceptions.RequestException as e:
+        return {"success": False, "feed_data": None, "error": str(e), "response_time_ms": None}
+    except Exception as e:
+        return {"success": False, "feed_data": None, "error": str(e), "response_time_ms": None}
+
+
+def discover_feed_url(url: str, timeout: int = 15) -> list:
+    """从任意网页自动发现 RSS/Atom feed URL
+    返回找到的 feed URL 列表
+    """
+    try:
+        headers = {
+            "User-Agent": "rssKeeper/1.0 (+https://github.com/rssKeeper)",
+        }
+        response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.content, "html.parser")
+        feed_urls = []
+
+        # 查找 <link rel="alternate"> 标签
+        for link in soup.find_all("link", rel="alternate"):
+            link_type = link.get("type", "").lower()
+            href = link.get("href", "")
+            if href and any(t in link_type for t in ["rss", "atom", "xml"]):
+                full_url = urljoin(response.url, href)
+                feed_urls.append(full_url)
+
+        # 也查找常见的 RSS 链接
+        common_patterns = [
+            "/rss", "/feed", "/feeds", "/atom.xml", "/rss.xml",
+            "/index.xml", "/feed.xml", "/?feed=rss2",
+        ]
+        for pattern in common_patterns:
+            candidate = urljoin(response.url, pattern)
+            if candidate not in feed_urls:
+                # 验证是否是有效的 feed
+                try:
+                    resp = requests.head(candidate, headers=headers, timeout=5, allow_redirects=True)
+                    content_type = resp.headers.get("Content-Type", "").lower()
+                    if any(t in content_type for t in ["rss", "atom", "xml"]):
+                        feed_urls.append(candidate)
+                except Exception:
+                    pass
+
+        return list(dict.fromkeys(feed_urls))  # 去重保持顺序
+    except Exception:
+        return []
+
+
+def parse_article(entry, feed_id: int) -> dict:
+    """从 feedparser entry 解析文章数据"""
+    title = entry.get("title", "")
+    link = entry.get("link", "")
+    author = entry.get("author", "")
+
+    # 发布时间
+    published_at = None
+    if hasattr(entry, "published_parsed") and entry.published_parsed:
+        try:
+            published_at = datetime(*entry.published_parsed[:6], tzinfo=timezone.utc).replace(tzinfo=None)
+        except (ValueError, TypeError):
+            pass
+    if not published_at and hasattr(entry, "updated_parsed") and entry.updated_parsed:
+        try:
+            published_at = datetime(*entry.updated_parsed[:6], tzinfo=timezone.utc).replace(tzinfo=None)
+        except (ValueError, TypeError):
+            pass
+
+    # 内容：优先 summary，其次 content
+    content = ""
+    if hasattr(entry, "content") and entry.content:
+        content = entry.content[0].value
+    elif hasattr(entry, "summary"):
+        content = entry.summary
+
+    # 清洗 HTML
+    content = clean_html(content)
+
+    # 生成摘要
+    summary = generate_summary(content)
+
+    return {
+        "feed_id": feed_id,
+        "title": title[:1024],
+        "link": link[:2048],
+        "author": author[:256],
+        "published_at": published_at,
+        "content": content[:config.MAX_ARTICLE_CONTENT_LENGTH],
+        "summary": summary[:config.MAX_SUMMARY_LENGTH],
+    }
+
+
+def clean_html(html_text: str) -> str:
+    """清洗 HTML，去除 script/style 标签，转为安全文本"""
+    if not html_text:
+        return ""
+
+    # 先解码 HTML 实体
+    text = html.unescape(html_text)
+
+    # 用 BeautifulSoup 清理
+    soup = BeautifulSoup(text, "html.parser")
+
+    # 移除 script 和 style
+    for tag in soup(["script", "style", "iframe", "object", "embed"]):
+        tag.decompose()
+
+    # 获取纯文本
+    cleaned = soup.get_text(separator="\n")
+
+    # 压缩空白行
+    cleaned = re.sub(r"\n\s*\n+", "\n\n", cleaned)
+    cleaned = cleaned.strip()
+
+    return cleaned
+
+
+def generate_summary(content: str, max_length: int = 300) -> str:
+    """从内容生成摘要"""
+    if not content:
+        return ""
+
+    # 去掉多余空白
+    text = re.sub(r"\s+", " ", content).strip()
+
+    if len(text) <= max_length:
+        return text
+
+    # 在句子边界截断
+    truncated = text[:max_length]
+    last_period = max(truncated.rfind("。"), truncated.rfind(". "), truncated.rfind("! "), truncated.rfind("? "))
+    if last_period > max_length * 0.5:
+        return truncated[:last_period + 1]
+
+    return truncated + "..."
+
+
+def fetch_and_store_feed(feed_id: int) -> dict:
+    """抓取指定 RSS 源并存储文章
+    返回抓取结果统计
+    """
+    db = SessionLocal()
+    try:
+        feed = db.query(Feed).filter(Feed.id == feed_id).first()
+        if not feed:
+            return {"success": False, "error": "Feed not found", "articles_count": 0}
+
+        result = fetch_feed(feed.url)
+
+        if not result["success"]:
+            # 记录失败
+            feed.last_fetch_at = datetime.utcnow()
+            feed.last_fetch_status = "fail"
+            feed.last_error = result["error"]
+            feed.fail_count += 1
+
+            log = FetchLog(
+                feed_id=feed_id,
+                status="fail",
+                error_message=result["error"],
+                response_time_ms=result.get("response_time_ms"),
+            )
+            db.add(log)
+            db.commit()
+            return {"success": False, "error": result["error"], "articles_count": 0}
+
+        parsed = result["feed_data"]
+
+        # 更新 feed 元信息
+        if hasattr(parsed.feed, "title"):
+            feed.title = parsed.feed.title[:512]
+        if hasattr(parsed.feed, "description"):
+            feed.description = parsed.feed.description[:1000]
+
+        # 存储文章
+        new_count = 0
+        for entry in parsed.entries:
+            article_data = parse_article(entry, feed_id)
+            if not article_data["link"]:
+                continue
+
+            # 检查是否已存在（基于 link）
+            existing = db.query(Article).filter(Article.link == article_data["link"]).first()
+            if existing:
+                # 更新已有文章
+                existing.title = article_data["title"] or existing.title
+                existing.content = article_data["content"] or existing.content
+                existing.summary = article_data["summary"] or existing.summary
+                existing.author = article_data["author"] or existing.author
+                if article_data["published_at"]:
+                    existing.published_at = article_data["published_at"]
+            else:
+                article = Article(**article_data)
+                db.add(article)
+                new_count += 1
+
+        # 更新 feed 统计
+        feed.last_fetch_at = datetime.utcnow()
+        feed.last_fetch_status = "success"
+        feed.last_error = ""
+        feed.success_count += 1
+        feed.article_count = db.query(Article).filter(Article.feed_id == feed_id).count()
+
+        log = FetchLog(
+            feed_id=feed_id,
+            status="success",
+            articles_fetched=new_count,
+            response_time_ms=result.get("response_time_ms"),
+        )
+        db.add(log)
+        db.commit()
+
+        return {
+            "success": True,
+            "articles_count": new_count,
+            "feed_title": feed.title,
+        }
+    except Exception as e:
+        db.rollback()
+        return {"success": False, "error": str(e), "articles_count": 0}
+    finally:
+        db.close()
+
+
+def fetch_all_feeds(feed_ids: list = None) -> list:
+    """并发抓取多个 RSS 源
+    返回每个源的抓取结果列表
+    """
+    db = SessionLocal()
+    try:
+        query = db.query(Feed).filter(Feed.is_active == True)
+        if feed_ids:
+            query = query.filter(Feed.id.in_(feed_ids))
+        feeds = query.all()
+    finally:
+        db.close()
+
+    results = []
+    with ThreadPoolExecutor(max_workers=config.FETCH_CONCURRENCY) as executor:
+        future_to_feed = {
+            executor.submit(fetch_and_store_feed, feed.id): feed
+            for feed in feeds
+        }
+        for future in as_completed(future_to_feed):
+            feed = future_to_feed[future]
+            try:
+                result = future.result()
+                results.append({"feed_id": feed.id, **result})
+            except Exception as e:
+                results.append({"feed_id": feed.id, "success": False, "error": str(e), "articles_count": 0})
+
+    return results