# rssKeeper/backend/rss_fetcher.py

"""Core RSS fetching logic."""
import time
import re
import html
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin
import requests
import feedparser
from bs4 import BeautifulSoup
from sqlalchemy.orm import Session
from models import Feed, Article, FetchLog
from database import SessionLocal
import config

# Domestic (mainland China) domain suffixes / keywords — hosts matching one of
# these are reached directly; every other host goes through the proxy.
CN_DOMAINS = (
    ".cn", ".com.cn", ".org.cn", ".net.cn",
    "36kr.com", "zhihu.com", "weibo.com", "douban.com", "bilibili.com",
    "tmtpost.com", "ifanr.com", "geekpark.net", "pingwest.com",
    "juejin.cn", "segmentfault.com", "cnblogs.com", "csdn.net",
    "qq.com", "163.com", "sohu.com", "sina.com.cn", "baidu.com",
    "taobao.com", "jd.com", "aliyun.com",
    "xinhuanet.com", "people.com.cn", "sciencenet.cn",
    # Local / private-network hosts. Note that "192.168." is the leading part
    # of an address (a prefix), unlike the domain suffixes above.
    "localhost", "127.0.0.1", "192.168.",
)


def _get_proxies(url: str) -> dict:
    """Return the ``proxies`` mapping to pass to ``requests`` for *url*.

    Hosts listed in ``CN_DOMAINS`` are reached directly; every other host is
    sent through the proxy configured in ``config``.

    Matching rules for a ``CN_DOMAINS`` entry:

    * an entry starting with "." (e.g. ".cn") is a plain hostname suffix;
    * an entry ending with "." (e.g. "192.168.") is an address prefix;
    * any other entry matches the host itself or one of its subdomains, so
      "qq.com" matches "qq.com" and "mail.qq.com" but not "notqq.com".

    Args:
        url: The URL that is about to be requested.

    Returns:
        An empty dict when no explicit proxy should be used (no proxy is
        configured, or the host is a direct-connect host); otherwise a dict
        with "http" and "https" proxy URLs.
    """
    if not config.HTTPS_PROXY:
        return {}
    from urllib.parse import urlparse
    host = urlparse(url).hostname or ""
    # Domestic / local hosts connect directly.
    for entry in CN_DOMAINS:
        if entry.startswith("."):
            is_direct = host.endswith(entry)
        elif entry.endswith("."):
            # Address prefix such as "192.168." — a suffix test can never hit.
            is_direct = host.startswith(entry)
        else:
            # Require a label boundary so unrelated domains that merely end
            # with the same characters are not treated as domestic.
            is_direct = host == entry or host.endswith("." + entry)
        if is_direct:
            return {}
    # Everything else goes through the proxy.
    return {
        "http": config.HTTP_PROXY or config.HTTPS_PROXY,
        "https": config.HTTPS_PROXY,
    }


def classify_error(error: str) -> str:
    """Map a raw fetch error message to a coarse error category.

    The checks run in order and the first match wins, so more specific causes
    (DNS failure, refused connection, SSL) are tested before the generic
    "unreachable" fallback that wraps them in ``requests`` messages.

    HTTP status codes are matched only as standalone numbers: "404" inside a
    longer number such as a port ("4043") no longer counts as a status code.

    Args:
        error: The error message text; may be empty or ``None``.

    Returns:
        ``""`` when *error* is empty, otherwise one of: "url_invalid",
        "forbidden", "rate_limited", "timeout", "dns_failure",
        "connection_refused", "connection_reset", "ssl_error", "unreachable",
        "url_malformed", "server_error" or "unknown".
    """
    if not error:
        return ""
    err = error.lower()

    def has_code(pattern: str) -> bool:
        # Digit look-arounds keep a code from matching inside a longer number.
        return re.search(r"(?<!\d)(?:" + pattern + r")(?!\d)", error) is not None

    def mentions(*needles: str) -> bool:
        return any(needle in err for needle in needles)

    if has_code("404") or mentions("not found"):
        return "url_invalid"
    if has_code("403") or mentions("forbidden"):
        return "forbidden"
    if has_code("429") or mentions("too many request"):
        return "rate_limited"
    # "timeout" also covers ConnectTimeout / ConnectionTimeout messages.
    if mentions("timeout", "timed out"):
        return "timeout"
    if mentions("could not resolve", "name or service not known", "nodename nor servname"):
        return "dns_failure"
    if mentions("connection refused"):
        return "connection_refused"
    if mentions("connection aborted", "remotedisconnected", "remote end closed"):
        return "connection_reset"
    if mentions("ssl", "certificate", "certifi"):
        return "ssl_error"
    if mentions("max retries", "newconnectionerror"):
        return "unreachable"
    if mentions("invalid url", "no host", "missing scheme"):
        return "url_malformed"
    if has_code(r"5\d\d") and mentions("server error"):
        return "server_error"
    return "unknown"


def fetch_feed(url: str, timeout: int = config.FETCH_TIMEOUT) -> dict:
    """Download and parse a single RSS/Atom feed.

    Args:
        url: Feed URL to request.
        timeout: Request timeout in seconds, passed to ``requests.get``.

    Returns:
        A dict with the keys:

        * "success": ``True`` when the response was downloaded and parsed.
        * "feed_data": the ``feedparser`` result, or ``None`` on failure.
        * "error": the error message, or ``None`` on success.
        * "response_time_ms": elapsed milliseconds since the call started.
          It is reported on failure as well, so failed attempts (timeouts in
          particular) can be logged with their duration.

    No exception is raised; every error is reported through the dict.
    """
    started = time.monotonic()

    def elapsed_ms() -> int:
        return int((time.monotonic() - started) * 1000)

    def failure(message: str) -> dict:
        return {
            "success": False,
            "feed_data": None,
            "error": message,
            "response_time_ms": elapsed_ms(),
        }

    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0 Safari/537.36",
            "Accept": "application/rss+xml, application/atom+xml, application/xml, text/xml, */*",
        }
        response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True, proxies=_get_proxies(url))
        response.raise_for_status()

        # Parse the feed body.
        parsed = feedparser.parse(response.content)
    except Exception as e:
        # Network errors and anything unexpected are reported the same way.
        return failure(str(e))

    response_time_ms = elapsed_ms()

    # A "bozo" result only carries parse warnings and is usually still usable.
    # It is rejected only when nothing usable came out of it: no entries and
    # no recognised feed format (e.g. an HTML page returned with status 200).
    if parsed.get("bozo") and not parsed.get("entries") and not parsed.get("version"):
        reason = parsed.get("bozo_exception") or "response is not a recognised feed"
        return failure(f"Feed parse error: {reason}")

    return {
        "success": True,
        "feed_data": parsed,
        "error": None,
        "response_time_ms": response_time_ms,
    }


def discover_feed_url(url: str, timeout: int = 15) -> list:
    """Discover RSS/Atom feed URLs advertised by, or conventionally hosted
    next to, an arbitrary web page.

    Two sources are combined, in this order:

    1. ``<link rel="alternate">`` tags whose ``type`` mentions rss/atom/xml.
    2. A fixed list of common feed paths, each probed with a HEAD request and
       kept only when the probe succeeds with a feed-like ``Content-Type``.

    Args:
        url: Page URL to inspect.
        timeout: Timeout in seconds for fetching the page itself (each probe
            uses a fixed 5-second timeout).

    Returns:
        De-duplicated list of absolute feed URLs in discovery order; an empty
        list when the page cannot be fetched or parsed.
    """
    feed_markers = ("rss", "atom", "xml")
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0 Safari/537.36",
        }
        response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True, proxies=_get_proxies(url))
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")
        feed_urls = []

        # Feeds declared by the page itself.
        for link in soup.find_all("link", rel="alternate"):
            link_type = link.get("type", "").lower()
            href = link.get("href", "")
            if href and any(t in link_type for t in feed_markers):
                feed_urls.append(urljoin(response.url, href))

        # Well-known feed locations relative to the final (redirected) URL.
        common_patterns = [
            "/rss", "/feed", "/feeds", "/atom.xml", "/rss.xml",
            "/index.xml", "/feed.xml", "/?feed=rss2",
        ]
        for pattern in common_patterns:
            candidate = urljoin(response.url, pattern)
            if candidate in feed_urls:
                continue
            try:
                # Probe with the same proxy policy as the main request.
                resp = requests.head(
                    candidate,
                    headers=headers,
                    timeout=5,
                    allow_redirects=True,
                    proxies=_get_proxies(candidate),
                )
            except Exception:
                # Best effort: an unreachable candidate is simply skipped.
                continue
            if not resp.ok:
                # Error pages can carry an XML-ish content type
                # (e.g. application/xhtml+xml); do not report them as feeds.
                continue
            content_type = resp.headers.get("Content-Type", "").lower()
            if any(t in content_type for t in feed_markers):
                feed_urls.append(candidate)

        return list(dict.fromkeys(feed_urls))  # de-duplicate, keep order
    except Exception:
        # Discovery is best effort; callers get an empty list on any failure.
        return []


def parse_article(entry, feed_id: int) -> dict:
    """Convert one feedparser entry into a dict of article fields.

    Args:
        entry: A feedparser entry (supports ``.get`` and attribute access).
        feed_id: Id of the feed the entry belongs to; copied into the result.

    Returns:
        Dict with the keys "feed_id", "title", "link", "author",
        "published_at", "content" and "summary". Text fields are truncated to
        fixed maximum lengths; "published_at" is ``None`` when the entry has
        no usable timestamp.
    """
    # Timestamp: try the published time first, then the updated time. The
    # result is a naive datetime built from the first six fields of the
    # parsed time tuple (presumably UTC, as feedparser normalises — confirm).
    published_at = None
    for field_name in ("published_parsed", "updated_parsed"):
        stamp = getattr(entry, field_name, None)
        if not stamp:
            continue
        try:
            published_at = datetime(*stamp[:6])
        except (ValueError, TypeError):
            continue
        break

    # Body: the full content block wins; the summary is the fallback.
    bodies = getattr(entry, "content", None)
    if bodies:
        raw_body = bodies[0].value
    elif hasattr(entry, "summary"):
        raw_body = entry.summary
    else:
        raw_body = ""

    # Strip markup, then derive the short summary from the cleaned text.
    text = clean_html(raw_body)
    digest = generate_summary(text)

    return {
        "feed_id": feed_id,
        "title": entry.get("title", "")[:1024],
        "link": entry.get("link", "")[:2048],
        "author": entry.get("author", "")[:256],
        "published_at": published_at,
        "content": text[:config.MAX_ARTICLE_CONTENT_LENGTH],
        "summary": digest[:config.MAX_SUMMARY_LENGTH],
    }


def clean_html(html_text: str) -> str:
    """Reduce an HTML fragment to plain text.

    HTML entities are decoded first, then script/style/iframe/object/embed
    elements are dropped together with their contents, and the remaining text
    is extracted with one line break between text nodes. Runs of blank lines
    are collapsed to a single blank line and the result is stripped.

    Args:
        html_text: Raw HTML (may be empty or ``None``).

    Returns:
        The cleaned plain text, or ``""`` for empty input.
    """
    if not html_text:
        return ""

    # NOTE(review): entities are decoded *before* parsing, so escaped markup
    # such as "&lt;b&gt;" is parsed as a real tag — confirm this is intended.
    soup = BeautifulSoup(html.unescape(html_text), "html.parser")

    # Drop elements whose contents must never reach the output.
    for node in soup.find_all(["script", "style", "iframe", "object", "embed"]):
        node.decompose()

    plain = soup.get_text(separator="\n")

    # Collapse consecutive blank lines into one.
    return re.sub(r"\n\s*\n+", "\n\n", plain).strip()


def generate_summary(content: str, max_length: int = 300) -> str:
    """Build a short summary from article text.

    Whitespace runs are collapsed to single spaces. Text that already fits in
    *max_length* characters is returned unchanged. Longer text is cut at
    *max_length* and then, if a sentence-ending mark (Chinese or English)
    occurs in the second half of that cut, shortened to end right after the
    last such mark; otherwise the cut text is returned with "..." appended.

    Args:
        content: Source text (may be empty or ``None``).
        max_length: Maximum number of characters taken from the text.

    Returns:
        The summary string, or ``""`` for empty input.
    """
    if not content:
        return ""

    normalized = re.sub(r"\s+", " ", content).strip()

    if len(normalized) > max_length:
        head = normalized[:max_length]
        # English marks require a following space so "3.14" is not a boundary.
        sentence_marks = ("。", ". ", "! ", "? ", "？", "！", "；")
        cut = max(head.rfind(mark) for mark in sentence_marks)
        if cut > max_length * 0.5:
            return head[:cut + 1]
        return head + "..."

    return normalized


def fetch_and_store_feed(feed_id: int) -> dict:
    """Fetch one feed and persist its articles.

    Opens its own database session, downloads the feed, inserts articles whose
    link is not stored yet, refreshes the ones that are, updates the feed's
    bookkeeping fields and writes a ``FetchLog`` row.

    Args:
        feed_id: Primary key of the ``Feed`` to fetch.

    Returns:
        On success: {"success": True, "articles_count": <new articles>,
        "feed_title": <title>}. On failure: {"success": False,
        "error": <message>, "articles_count": 0}.

    Any exception raised while working is caught; the session is rolled back
    and the failure is reported through the returned dict.
    """
    db = SessionLocal()
    try:
        feed = db.query(Feed).filter(Feed.id == feed_id).first()
        if not feed:
            return {"success": False, "error": "Feed not found", "articles_count": 0}

        result = fetch_feed(feed.url)

        if not result["success"]:
            # Record the failed attempt on the feed and in the fetch log.
            feed.last_fetch_at = datetime.utcnow()
            feed.last_fetch_status = "fail"
            feed.last_error = result["error"]
            feed.error_type = classify_error(result["error"])
            feed.fail_count += 1

            log = FetchLog(
                feed_id=feed_id,
                status="fail",
                error_message=result["error"],
                response_time_ms=result.get("response_time_ms"),
            )
            db.add(log)
            db.commit()
            return {"success": False, "error": result["error"], "articles_count": 0}

        parsed = result["feed_data"]

        # Refresh feed metadata from the parsed document.
        if hasattr(parsed.feed, "title"):
            feed.title = parsed.feed.title[:512]
        if hasattr(parsed.feed, "description"):
            feed.description = parsed.feed.description[:1000]

        # Collect the entries, de-duplicated by link (first occurrence wins);
        # entries without a link are skipped.
        incoming = {}
        for entry in parsed.entries:
            article_data = parse_article(entry, feed_id)
            link = article_data.get("link", "")
            if link and link not in incoming:
                incoming[link] = article_data

        # Must be bound even when the feed has no usable entries; otherwise
        # the bookkeeping below fails and an empty feed is reported as an error.
        new_count = 0

        if incoming:
            # One query loads every already-stored article for these links.
            existing_by_link = {}
            stored = db.query(Article).filter(Article.link.in_(list(incoming))).all()
            for row in stored:
                existing_by_link.setdefault(row.link, row)

            for link, article_data in incoming.items():
                existing = existing_by_link.get(link)
                if existing is None:
                    db.add(Article(**article_data))
                    new_count += 1
                    continue
                # Refresh a stored article, never overwriting with empty values.
                existing.title = article_data["title"] or existing.title
                existing.content = article_data["content"] or existing.content
                existing.summary = article_data["summary"] or existing.summary
                existing.author = article_data["author"] or existing.author
                if article_data["published_at"]:
                    existing.published_at = article_data["published_at"]

        # Update feed statistics.
        feed.last_fetch_at = datetime.utcnow()
        feed.last_fetch_status = "success"
        feed.last_error = ""
        feed.error_type = ""
        feed.success_count += 1
        feed.article_count += new_count

        log = FetchLog(
            feed_id=feed_id,
            status="success",
            articles_fetched=new_count,
            response_time_ms=result.get("response_time_ms"),
        )
        db.add(log)
        db.commit()

        return {
            "success": True,
            "articles_count": new_count,
            "feed_title": feed.title,
        }
    except Exception as e:
        db.rollback()
        return {"success": False, "error": str(e), "articles_count": 0}
    finally:
        db.close()


def fetch_all_feeds(feed_ids: list = None) -> list:
    """Fetch several feeds concurrently.

    Active feeds are loaded first (restricted to *feed_ids* when a non-empty
    list is given), then each one is handed to ``fetch_and_store_feed`` on a
    thread pool sized by ``config.FETCH_CONCURRENCY``.

    Args:
        feed_ids: Optional list of feed ids to restrict the run to. ``None``
            or an empty list means every active feed.

    Returns:
        One result dict per feed, in completion order. Each dict carries
        "feed_id" plus the fields returned by ``fetch_and_store_feed``; a feed
        whose worker raised gets a failure dict with the exception text.
    """
    session = SessionLocal()
    try:
        active = session.query(Feed).filter(Feed.is_active == True)
        if feed_ids:
            active = active.filter(Feed.id.in_(feed_ids))
        feeds = active.all()
    finally:
        session.close()

    outcomes = []
    with ThreadPoolExecutor(max_workers=config.FETCH_CONCURRENCY) as pool:
        # Remember which feed each future belongs to.
        pending = {}
        for feed in feeds:
            pending[pool.submit(fetch_and_store_feed, feed.id)] = feed

        for finished in as_completed(pending):
            owner = pending[finished]
            try:
                outcome = finished.result()
                outcomes.append({"feed_id": owner.id, **outcome})
            except Exception as exc:
                outcomes.append({"feed_id": owner.id, "success": False, "error": str(exc), "articles_count": 0})

    return outcomes