54e7db0ef0
完整功能包括: - FastAPI 后端 + SQLite + FTS5 全文搜索 - RSS 源管理、自动发现、OPML 导入导出 - 文章抓取、去重、分类、全文检索 - RSS 源健康度监控 - Vue 3 + Element Plus 暗色主题 Web UI - 对外 REST API 供 AI 分析调用 - Docker + docker-compose 部署
299 lines
9.7 KiB
Python
299 lines
9.7 KiB
Python
"""RSS 抓取核心逻辑"""
|
||
import time
|
||
import re
|
||
import html
|
||
import hashlib
|
||
from datetime import datetime, timezone
|
||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
from urllib.parse import urljoin
|
||
import requests
|
||
import feedparser
|
||
from bs4 import BeautifulSoup
|
||
from sqlalchemy.orm import Session
|
||
from models import Feed, Article, FetchLog
|
||
from database import SessionLocal
|
||
import config
|
||
|
||
|
||
def fetch_feed(url: str, timeout: int = config.FETCH_TIMEOUT) -> dict:
    """Download and parse a single RSS/Atom feed.

    Args:
        url: Address of the feed to download.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        A dict with four keys:

        * ``success`` -- ``True`` when the download and parse completed.
        * ``feed_data`` -- the ``feedparser.parse`` result, ``None`` on failure.
        * ``error`` -- the exception message on failure, ``None`` on success.
        * ``response_time_ms`` -- elapsed milliseconds as an int; it is
          reported for failures as well so that failed fetches can be logged
          with their duration.

    This function never raises; every failure is reported through the
    returned dict.
    """
    # A monotonic clock is not affected by wall-clock adjustments, so the
    # measured duration can never come out negative or wildly off.
    started = time.monotonic()

    def elapsed_ms() -> int:
        return int((time.monotonic() - started) * 1000)

    headers = {
        "User-Agent": "rssKeeper/1.0 (+https://github.com/rssKeeper)",
        "Accept": "application/rss+xml, application/atom+xml, application/xml, text/xml, */*",
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
        response.raise_for_status()
        parsed = feedparser.parse(response.content)
    except Exception as exc:
        # Deliberately broad: network errors and unexpected parser errors are
        # both turned into a failure result instead of propagating.
        return {
            "success": False,
            "feed_data": None,
            "error": str(exc),
            "response_time_ms": elapsed_ms(),
        }

    # A set ``parsed.bozo`` flag only signals a parse warning; the result is
    # still returned as a success, exactly as before.
    return {
        "success": True,
        "feed_data": parsed,
        "error": None,
        "response_time_ms": elapsed_ms(),
    }
|
||
|
||
|
||
def discover_feed_url(url: str, timeout: int = 15) -> list:
    """Discover RSS/Atom feed URLs advertised by, or conventionally located near, a web page.

    Two strategies are combined:

    1. ``<link rel="alternate">`` tags of the page whose ``type`` looks like a
       feed MIME type.
    2. A fixed list of common feed paths, each probed with a ``HEAD`` request
       and accepted only when the response is successful and its
       ``Content-Type`` looks like a feed.

    Args:
        url: Page to inspect.
        timeout: Timeout in seconds for downloading the page itself (each
            probe of a common path uses a fixed 5 second timeout).

    Returns:
        A list of absolute feed URLs without duplicates, in discovery order.
        Any failure while loading or parsing the page yields an empty list.
    """
    headers = {
        "User-Agent": "rssKeeper/1.0 (+https://github.com/rssKeeper)",
    }
    common_patterns = [
        "/rss", "/feed", "/feeds", "/atom.xml", "/rss.xml",
        "/index.xml", "/feed.xml", "/?feed=rss2",
    ]

    def looks_like_feed(mime: str) -> bool:
        mime = mime.lower()
        # "application/xhtml+xml" contains "xml" but denotes a web page.
        if "xhtml" in mime:
            return False
        return any(marker in mime for marker in ("rss", "atom", "xml"))

    try:
        response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")
        feed_urls = []

        # Feeds the page advertises itself.
        for link in soup.find_all("link", rel="alternate"):
            href = link.get("href", "")
            if href and looks_like_feed(link.get("type", "")):
                # Resolve against the final URL so redirects and relative
                # hrefs are handled.
                feed_urls.append(urljoin(response.url, href))

        # Conventional feed locations.
        for pattern in common_patterns:
            candidate = urljoin(response.url, pattern)
            if candidate in feed_urls:
                continue
            try:
                resp = requests.head(candidate, headers=headers, timeout=5, allow_redirects=True)
            except requests.exceptions.RequestException:
                # An unreachable candidate simply is not a feed.
                continue
            # Require a successful status: error pages may carry an XML
            # content type too.
            if resp.ok and looks_like_feed(resp.headers.get("Content-Type", "")):
                feed_urls.append(candidate)

        # dict.fromkeys drops duplicates while keeping first-seen order.
        return list(dict.fromkeys(feed_urls))
    except Exception:
        # Best effort: discovery problems are reported as "nothing found".
        return []
|
||
|
||
|
||
def _entry_timestamp(entry):
    """Return the entry's published (else updated) time as a naive UTC datetime, or None."""
    for key in ("published_parsed", "updated_parsed"):
        parsed_time = entry.get(key)
        if not parsed_time:
            continue
        try:
            # The first six fields are year..second; the value is treated as
            # UTC and stored without tzinfo.
            return datetime(*parsed_time[:6], tzinfo=timezone.utc).replace(tzinfo=None)
        except (ValueError, TypeError):
            continue
    return None


def parse_article(entry, feed_id: int) -> dict:
    """Convert a feedparser entry into a dict of article column values.

    Args:
        entry: A feedparser entry (mapping with ``get`` support).
        feed_id: Id of the feed the entry belongs to.

    Returns:
        A dict with the keys ``feed_id``, ``title``, ``link``, ``author``,
        ``published_at``, ``content`` and ``summary``. Text fields are never
        ``None`` and are truncated to their maximum lengths; ``published_at``
        is a naive UTC datetime or ``None``.
    """
    # ``or ""`` guards against keys that are present but hold None, which
    # would otherwise break the slicing below.
    title = entry.get("title") or ""
    link = entry.get("link") or ""
    author = entry.get("author") or ""

    # Body text: the full content is preferred; the summary is the fallback,
    # also when the first content item is present but empty.
    content = ""
    content_items = entry.get("content")
    if content_items:
        content = getattr(content_items[0], "value", None) or ""
    if not content:
        content = entry.get("summary") or ""

    content = clean_html(content)
    summary = generate_summary(content)

    return {
        "feed_id": feed_id,
        "title": title[:1024],
        "link": link[:2048],
        "author": author[:256],
        "published_at": _entry_timestamp(entry),
        "content": content[:config.MAX_ARTICLE_CONTENT_LENGTH],
        "summary": summary[:config.MAX_SUMMARY_LENGTH],
    }
|
||
|
||
|
||
def clean_html(html_text: str) -> str:
    """Reduce an HTML fragment to plain text.

    HTML entities are decoded first, then the result is parsed and every
    ``script``, ``style``, ``iframe``, ``object`` and ``embed`` element is
    removed together with its contents. The remaining text nodes are joined
    with newlines, runs of blank lines are collapsed to a single blank line,
    and surrounding whitespace is stripped.

    Returns an empty string for empty or ``None`` input.
    """
    if not html_text:
        return ""

    document = BeautifulSoup(html.unescape(html_text), "html.parser")

    for node in document.find_all(["script", "style", "iframe", "object", "embed"]):
        node.decompose()

    plain = document.get_text(separator="\n")
    return re.sub(r"\n\s*\n+", "\n\n", plain).strip()
|
||
|
||
|
||
def generate_summary(content: str, max_length: int = 300) -> str:
|
||
"""从内容生成摘要"""
|
||
if not content:
|
||
return ""
|
||
|
||
# 去掉多余空白
|
||
text = re.sub(r"\s+", " ", content).strip()
|
||
|
||
if len(text) <= max_length:
|
||
return text
|
||
|
||
# 在句子边界截断
|
||
truncated = text[:max_length]
|
||
last_period = max(truncated.rfind("。"), truncated.rfind(". "), truncated.rfind("! "), truncated.rfind("? "))
|
||
if last_period > max_length * 0.5:
|
||
return truncated[:last_period + 1]
|
||
|
||
return truncated + "..."
|
||
|
||
|
||
def fetch_and_store_feed(feed_id: int) -> dict:
    """Fetch one feed, store its articles and record the outcome.

    The feed row is loaded in a session of its own, downloaded via
    ``fetch_feed`` and each entry is converted with ``parse_article``.
    Articles are matched by ``link``: a known link updates the stored
    article, an unknown one is inserted. A link that occurs more than once in
    the same fetch is processed only once. The feed's status fields are
    updated and a ``FetchLog`` row is written for both success and failure.

    Args:
        feed_id: Primary key of the feed to fetch.

    Returns:
        On success ``{"success": True, "articles_count": <new articles>,
        "feed_title": <title>}``; otherwise ``{"success": False, "error":
        <message>, "articles_count": 0}``. Unexpected exceptions roll the
        session back and are reported the same way instead of being raised.
    """
    db = SessionLocal()
    try:
        feed = db.query(Feed).filter(Feed.id == feed_id).first()
        if not feed:
            return {"success": False, "error": "Feed not found", "articles_count": 0}

        result = fetch_feed(feed.url)

        if not result["success"]:
            # Record the failure on the feed and in the fetch log.
            # Naive UTC timestamp without the deprecated datetime.utcnow().
            feed.last_fetch_at = datetime.now(timezone.utc).replace(tzinfo=None)
            feed.last_fetch_status = "fail"
            feed.last_error = result["error"]
            feed.fail_count += 1

            db.add(FetchLog(
                feed_id=feed_id,
                status="fail",
                error_message=result["error"],
                response_time_ms=result.get("response_time_ms"),
            ))
            db.commit()
            return {"success": False, "error": result["error"], "articles_count": 0}

        parsed = result["feed_data"]

        # Refresh feed metadata, but never replace stored values with an
        # empty or missing title/description.
        feed_title = getattr(parsed.feed, "title", None)
        if feed_title:
            feed.title = feed_title[:512]
        feed_description = getattr(parsed.feed, "description", None)
        if feed_description:
            feed.description = feed_description[:1000]

        new_count = 0
        # Links already handled in this fetch. A pending insert is only
        # visible to the lookup below if the session autoflushes, so repeated
        # links are filtered here rather than relying on the query.
        seen_links = set()
        for entry in parsed.entries:
            article_data = parse_article(entry, feed_id)
            link = article_data["link"]
            if not link or link in seen_links:
                continue
            seen_links.add(link)

            existing = db.query(Article).filter(Article.link == link).first()
            if existing:
                # Update the stored article; empty new values keep the old ones.
                existing.title = article_data["title"] or existing.title
                existing.content = article_data["content"] or existing.content
                existing.summary = article_data["summary"] or existing.summary
                existing.author = article_data["author"] or existing.author
                if article_data["published_at"]:
                    existing.published_at = article_data["published_at"]
            else:
                db.add(Article(**article_data))
                new_count += 1

        # Update feed statistics.
        feed.last_fetch_at = datetime.now(timezone.utc).replace(tzinfo=None)
        feed.last_fetch_status = "success"
        feed.last_error = ""
        feed.success_count += 1
        feed.article_count = db.query(Article).filter(Article.feed_id == feed_id).count()

        db.add(FetchLog(
            feed_id=feed_id,
            status="success",
            articles_fetched=new_count,
            response_time_ms=result.get("response_time_ms"),
        ))
        db.commit()

        return {
            "success": True,
            "articles_count": new_count,
            "feed_title": feed.title,
        }
    except Exception as e:
        db.rollback()
        return {"success": False, "error": str(e), "articles_count": 0}
    finally:
        db.close()
|
||
|
||
|
||
def fetch_all_feeds(feed_ids: list = None) -> list:
    """Fetch several feeds concurrently.

    Active feeds are selected (restricted to ``feed_ids`` when that argument
    is non-empty), then ``fetch_and_store_feed`` is run for each of them on a
    thread pool sized by ``config.FETCH_CONCURRENCY``.

    Returns:
        One dict per feed, in completion order: the result of
        ``fetch_and_store_feed`` extended with ``feed_id``. A worker that
        raises is reported as ``{"feed_id", "success": False, "error",
        "articles_count": 0}``.
    """
    session = SessionLocal()
    try:
        active_feeds = session.query(Feed).filter(Feed.is_active == True)
        if feed_ids:
            active_feeds = active_feeds.filter(Feed.id.in_(feed_ids))
        # Only the ids are needed once the session is closed.
        target_ids = [feed.id for feed in active_feeds.all()]
    finally:
        session.close()

    outcomes = []
    with ThreadPoolExecutor(max_workers=config.FETCH_CONCURRENCY) as pool:
        pending = {
            pool.submit(fetch_and_store_feed, target_id): target_id
            for target_id in target_ids
        }
        for finished in as_completed(pending):
            target_id = pending[finished]
            try:
                outcome = {"feed_id": target_id, **finished.result()}
            except Exception as exc:
                outcome = {
                    "feed_id": target_id,
                    "success": False,
                    "error": str(exc),
                    "articles_count": 0,
                }
            outcomes.append(outcome)

    return outcomes
|