commit ba6e7669e8e166143109555669ffc7bb710fc311 Author: congsh Date: Mon Jun 15 17:01:57 2026 +0800 Initial commit: RSS platform phase 1 skeleton with code review fixes Features: - FastAPI + SQLAlchemy 2.0 async + PostgreSQL/pgvector + Redis backend - Vue 3 + TypeScript + Element Plus frontend - JWT auth with access/refresh tokens and revocation - Admin/member RBAC - RSS feed CRUD and article listing - Settings management with Fernet encryption for sensitive values - Redis distributed lock service - Alembic initial migration - Docker Compose development environment Fixes from code review: - Fix DB session leak in dependency injection - Restrict registration to admin only - Add default admin password warning - Implement JWT refresh tokens and jti blacklist - Strengthen password policy - Use func.count for pagination totals - Replace NullPool with AsyncAdaptedQueuePool - Remove init_db from lifespan to enforce alembic migrations - Add request_id middleware and logging filter - Fix vite.config.ts env loading - Add frontend token refresh interceptor - Add Vue error handler Co-Authored-By: Claude Opus 4.8 (1M context) diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..79302bd --- /dev/null +++ b/.env.example @@ -0,0 +1,47 @@ +# 数据库 +DATABASE_URL=postgresql+asyncpg://rss:rss@postgres:5432/rss_platform + +# Redis +REDIS_URL=redis://redis:6379/0 + +# JWT +# 生产环境必须使用随机生成的、长度 >= 32 的字符串 +SECRET_KEY=change-me-in-production-min-32-chars-long-required +ACCESS_TOKEN_EXPIRE_MINUTES=15 +REFRESH_TOKEN_EXPIRE_DAYS=7 + +# AI(占位,后续阶段启用) +AI_DEFAULT_PROVIDER=openai +AI_DEFAULT_MODEL=gpt-4o-mini + +# 对象存储 +STORAGE_TYPE=minio +MINIO_ENDPOINT=minio:9000 +MINIO_ACCESS_KEY=minioadmin +MINIO_SECRET_KEY=minioadmin +MINIO_BUCKET=rss-platform + +# CORS +CORS_ALLOWED_ORIGINS=http://localhost:5173,http://127.0.0.1:5173 + +# 敏感设置加密密钥(可选但强烈建议) +# 生成方式:python -c "from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())" +# 为空时敏感配置以明文存储 
+SETTINGS_ENCRYPTION_KEY= + +# 日志 +LOG_LEVEL=INFO + +# RSS 抓取 +FETCH_CONCURRENCY=10 +FETCH_TIMEOUT=30 +DEFAULT_FETCH_INTERVAL=60 +MIN_FETCH_INTERVAL=15 + +# 端口(开发环境) +BACKEND_PORT=8000 +FRONTEND_PORT=5173 +POSTGRES_PORT=5432 +REDIS_PORT=6379 +MINIO_API_PORT=9000 +MINIO_CONSOLE_PORT=9001 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7e6c92c --- /dev/null +++ b/.gitignore @@ -0,0 +1,72 @@ +.env +.env.local +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual environments +.venv +venv/ +ENV/ +env/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo +*~ + +# Logs +*.log +logs/ + +# Database +*.db +*.sqlite +*.sqlite3 + +# Docker +data/ +postgres_data/ +redis_data/ +minio_data/ +platform_data/ + +# Frontend +frontend/node_modules/ +frontend/dist/ +frontend/.vite/ +*.local + +# OS +.DS_Store +Thumbs.db + +# pytest +.pytest_cache/ +.coverage +htmlcov/ + +# Alembic +backend/alembic/versions/*.pyc + +# Minio +.minio.sys/ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..395e526 --- /dev/null +++ b/Makefile @@ -0,0 +1,45 @@ +.PHONY: help dev up down build migrate test lint format clean + +help: + @echo "RSS Platform 开发命令:" + @echo " make dev - 启动开发环境 (docker-compose up -d)" + @echo " make up - 启动服务" + @echo " make down - 停止服务" + @echo " make build - 重新构建镜像" + @echo " make migrate - 执行数据库迁移" + @echo " make test - 运行后端测试" + @echo " make lint - 代码检查" + @echo " make format - 代码格式化" + @echo " make clean - 清理容器与数据卷" + +dev: up migrate + @echo "开发环境已启动" + @echo "后端: http://localhost:8000" + @echo "前端: http://localhost:5173" + +up: + docker-compose up -d + +down: + docker-compose down + +build: + docker-compose build + +migrate: + docker-compose exec backend alembic upgrade head + +test: + docker-compose exec backend pytest tests/ -v + +lint: + docker-compose exec backend ruff check app tests + 
+format: + docker-compose exec backend ruff format app tests + +clean: + docker-compose down -v + find . -type d -name __pycache__ -exec rm -rf {} + + find . -type d -name node_modules -exec rm -rf {} + + find . -type d -name .pytest_cache -exec rm -rf {} + diff --git a/README.md b/README.md new file mode 100644 index 0000000..99ba194 --- /dev/null +++ b/README.md @@ -0,0 +1,67 @@ +# RSS 信息处理平台 + +模块化、工业化、AI 驱动的 RSS 信息处理平台,统一承接 RSS 抓取、数据清洗、AI 分类/摘要/打分、去重、日报产出、聊天问答等能力。 + +## 快速开始 + +### 1. 环境准备 + +复制示例配置: + +```bash +cp .env.example .env +``` + +根据需要修改 `.env` 中的配置,尤其是 `SECRET_KEY`。 + +### 2. 启动开发环境 + +```bash +make dev +``` + +服务将启动: +- 后端:`http://localhost:8000` +- 前端:`http://localhost:5173` +- PostgreSQL:`localhost:5432` +- Redis:`localhost:6379` +- MinIO:`http://localhost:9000` + +### 3. 首次使用 + +默认会创建一个管理员账号,其用户名和密码由 `DEFAULT_ADMIN_USERNAME` / `DEFAULT_ADMIN_PASSWORD` 决定;`.env.example` 中未列出这两个变量,如需覆盖默认值,请在 `.env` 中自行添加。使用该账号登录前端。 + +**安全提示**:生产环境务必修改 `SECRET_KEY` 和默认管理员密码;如未修改,`/health` 接口会返回安全警告。 + +### 4. 停止环境 + +```bash +make down +``` + +## 开发命令 + +```bash +make migrate # 执行数据库迁移 +make test # 运行测试 +make lint # 代码检查 +make format # 代码格式化 +``` + +## 项目结构 + +``` +rss-platform/ +├── backend/ # FastAPI 后端 +├── frontend/ # Vue 3 + TypeScript 前端 +├── plugins/ # 可插拔插件(去重算法等) +├── docker/ # Docker 相关文件 +├── scripts/ # 工具脚本 +└── docs/ # 文档 +``` + +## 配套文档 + +- 架构设计:`docs/design.md` +- 开发步骤:`docs/dev-plan.md` +- 代码审核与修复记录:`docs/code-review.md` diff --git a/backend/alembic.ini b/backend/alembic.ini new file mode 100644 index 0000000..433dec0 --- /dev/null +++ b/backend/alembic.ini @@ -0,0 +1,110 @@ +# A generic, single database configuration. 
+ +[alembic] +# path to migration scripts +script_location = alembic + +# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s +# Add the appropriate environment variable or run the following command to set: +# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. +prepend_sys_path = . + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the python-dateutil library. +# timezone = + +# max length of characters to apply to the +# "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to alembic/versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "version_path_separator" below. +# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions + +# version path separator; As mentioned above, this is the character used to split +# version_locations. The default within new alembic.ini files is 'os', which uses os.pathsep. +# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. +# Valid values for version_path_separator are: +# +# version_path_separator = : +# version_path_separator = ; +# version_path_separator = space +version_path_separator = os # Use os.pathsep. Default as used by the template. + +# set to 'true' to search source files recursively for each version package +# in src/ layout projects. 
+# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +sqlalchemy.url = postgresql+asyncpg://rss:rss@postgres:5432/rss_platform + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# lint with attempts to fix using "ruff" - use the exec runner, against a Python environment +# hooks = ruff +# ruff.type = exec +# ruff.executable = %(here)s/.venv/bin/ruff +# ruff.options = --fix REVISION_SCRIPT_FILENAME + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/backend/alembic/env.py b/backend/alembic/env.py new file mode 100644 index 0000000..686ec9b --- /dev/null +++ b/backend/alembic/env.py @@ -0,0 +1,85 @@ +"""Alembic environment configuration.""" +import asyncio +from logging.config import fileConfig + +from sqlalchemy import pool +from sqlalchemy.engine import Connection +from sqlalchemy.ext.asyncio import async_engine_from_config + +from alembic import context + +from app.models.base import Base +from app.core.config import settings + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. 
+config = context.config + +# Interpret the config file for Python logging. +# This line sets up loggers basically. +if config.config_file_name is not None: + fileConfig(config.config_file_name) + +# add your model's MetaData object here +# for 'autogenerate' support +target_metadata = Base.metadata + +# other values from the config, defined by the needs of env.py, +# can be acquired: +# my_important_option = config.get_main_option("my_important_option") +# ... etc. + + +def get_url(): + return settings.DATABASE_URL + + +def run_migrations_offline() -> None: + """Run migrations in 'offline' mode.""" + url = get_url() + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + ) + + with context.begin_transaction(): + context.run_migrations() + + +def do_run_migrations(connection: Connection) -> None: + context.configure(connection=connection, target_metadata=target_metadata) + + with context.begin_transaction(): + context.run_migrations() + + +async def run_async_migrations() -> None: + """In this scenario we need to create an Engine + and associate a connection with the context. 
+ """ + + configuration = config.get_section(config.config_ini_section, {}) + configuration["sqlalchemy.url"] = get_url() + connectable = async_engine_from_config( + configuration, + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + + async with connectable.connect() as connection: + await connection.run_sync(do_run_migrations) + + await connectable.dispose() + + +def run_migrations_online() -> None: + """Run migrations in 'online' mode.""" + asyncio.run(run_async_migrations()) + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/backend/alembic/script.py.mako b/backend/alembic/script.py.mako new file mode 100644 index 0000000..fbc4b07 --- /dev/null +++ b/backend/alembic/script.py.mako @@ -0,0 +1,26 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. +revision: str = ${repr(up_revision)} +down_revision: Union[str, None] = ${repr(down_revision)} +branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} +depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} + + +def upgrade() -> None: + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + ${downgrades if downgrades else "pass"} diff --git a/backend/alembic/versions/001_initial_schema.py b/backend/alembic/versions/001_initial_schema.py new file mode 100644 index 0000000..3ee425a --- /dev/null +++ b/backend/alembic/versions/001_initial_schema.py @@ -0,0 +1,367 @@ +"""Initial schema. + +Revision ID: 001 +Revises: +Create Date: 2026-06-15 00:00:00.000000 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. 
+revision: str = "001" +down_revision: Union[str, None] = None +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Users + op.create_table( + "users", + sa.Column("id", postgresql.UUID(as_uuid=True), server_default=sa.text("uuid_generate_v4()"), nullable=False), + sa.Column("username", sa.String(64), nullable=False), + sa.Column("password_hash", sa.String(255), nullable=False), + sa.Column("role", sa.String(32), nullable=False), + sa.Column("is_active", sa.Boolean(), nullable=False), + sa.Column("last_login_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint("username"), + ) + op.create_index("ix_users_username", "users", ["username"], unique=False) + op.create_index("ix_users_role", "users", ["role"], unique=False) + + # Feeds + op.create_table( + "feeds", + sa.Column("id", postgresql.UUID(as_uuid=True), server_default=sa.text("uuid_generate_v4()"), nullable=False), + sa.Column("url", sa.String(2048), nullable=False), + sa.Column("title", sa.String(512), nullable=True), + sa.Column("description", sa.Text(), nullable=True), + sa.Column("category", sa.String(128), nullable=True), + sa.Column("is_active", sa.Boolean(), nullable=False), + sa.Column("fetch_interval_minutes", sa.Integer(), nullable=False), + sa.Column("priority", sa.Integer(), nullable=False), + sa.Column("parser_config", postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.Column("proxy_policy", sa.String(32), nullable=False), + sa.Column("last_fetch_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("last_fetch_status", sa.String(32), nullable=True), + sa.Column("last_error", sa.Text(), nullable=True), + sa.Column("error_type", 
sa.String(64), nullable=True), + sa.Column("success_count", sa.Integer(), nullable=False), + sa.Column("fail_count", sa.Integer(), nullable=False), + sa.Column("article_count", sa.Integer(), nullable=False), + sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint("url"), + ) + op.create_index("ix_feeds_url", "feeds", ["url"], unique=False) + op.create_index("ix_feeds_is_active", "feeds", ["is_active"], unique=False) + + # Raw articles + op.create_table( + "raw_articles", + sa.Column("id", postgresql.UUID(as_uuid=True), server_default=sa.text("uuid_generate_v4()"), nullable=False), + sa.Column("feed_id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column("external_id", sa.String(255), nullable=True), + sa.Column("title", sa.String(1024), nullable=True), + sa.Column("link", sa.String(2048), nullable=False), + sa.Column("author", sa.String(256), nullable=True), + sa.Column("published_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("fetched_at", sa.DateTime(timezone=True), nullable=False), + sa.Column("content", sa.Text(), nullable=True), + sa.Column("summary", sa.Text(), nullable=True), + sa.Column("raw_html", sa.Text(), nullable=True), + sa.Column("content_hash", sa.String(64), nullable=True), + sa.Column("language", sa.String(16), nullable=True), + sa.Column("status", sa.String(32), nullable=False), + sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.ForeignKeyConstraint(["feed_id"], ["feeds.id"], ondelete="CASCADE"), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index("ix_raw_articles_feed_id", "raw_articles", ["feed_id"], unique=False) + op.create_index("ix_raw_articles_link", 
"raw_articles", ["link"], unique=False) + op.create_index("ix_raw_articles_external_id", "raw_articles", ["external_id"], unique=False) + op.create_index("ix_raw_articles_published_at", "raw_articles", ["published_at"], unique=False) + op.create_index("ix_raw_articles_fetched_at", "raw_articles", ["fetched_at"], unique=False) + op.create_index("ix_raw_articles_status", "raw_articles", ["status"], unique=False) + + # Duplicate groups (created first, FK to cleaned_articles added later) + op.create_table( + "duplicate_groups", + sa.Column("id", postgresql.UUID(as_uuid=True), server_default=sa.text("uuid_generate_v4()"), nullable=False), + sa.Column("representative_article_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("member_article_ids", postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.Column("similarity_matrix", postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.Column("brief_date", sa.String(10), nullable=True), + sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index("ix_duplicate_groups_brief_date", "duplicate_groups", ["brief_date"], unique=False) + + # Cleaned articles + op.create_table( + "cleaned_articles", + sa.Column("id", postgresql.UUID(as_uuid=True), server_default=sa.text("uuid_generate_v4()"), nullable=False), + sa.Column("raw_article_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("feed_id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column("title", sa.String(1024), nullable=True), + sa.Column("link", sa.String(2048), nullable=False), + sa.Column("author", sa.String(256), nullable=True), + sa.Column("feed_title", sa.String(512), nullable=True), + sa.Column("feed_category", sa.String(128), nullable=True), + sa.Column("published_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("fetched_at", sa.DateTime(timezone=True), nullable=False), + sa.Column("content", sa.Text(), 
nullable=True), + sa.Column("content_length", sa.Integer(), nullable=False), + sa.Column("original_summary", sa.Text(), nullable=True), + sa.Column("ai_summary", sa.Text(), nullable=True), + sa.Column("category", sa.String(128), nullable=True), + sa.Column("tags", postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.Column("heat_score", sa.Float(), nullable=False), + sa.Column("importance_score", sa.Float(), nullable=False), + sa.Column("duplication_score", sa.Float(), nullable=False), + sa.Column("composite_score", sa.Float(), nullable=False), + sa.Column("duplicate_group_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("is_representative", sa.Boolean(), nullable=False), + sa.Column("reference_links", postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.Column("processing_status", sa.String(32), nullable=False), + sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.ForeignKeyConstraint(["raw_article_id"], ["raw_articles.id"], ondelete="SET NULL"), + sa.ForeignKeyConstraint(["feed_id"], ["feeds.id"], ondelete="CASCADE"), + sa.ForeignKeyConstraint(["duplicate_group_id"], ["duplicate_groups.id"], ondelete="SET NULL"), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index("ix_cleaned_articles_raw_article_id", "cleaned_articles", ["raw_article_id"], unique=False) + op.create_index("ix_cleaned_articles_feed_id", "cleaned_articles", ["feed_id"], unique=False) + op.create_index("ix_cleaned_articles_link", "cleaned_articles", ["link"], unique=False) + op.create_index("ix_cleaned_articles_title", "cleaned_articles", ["title"], unique=False) + op.create_index("ix_cleaned_articles_published_at", "cleaned_articles", ["published_at"], unique=False) + op.create_index("ix_cleaned_articles_fetched_at", "cleaned_articles", ["fetched_at"], unique=False) + 
op.create_index("ix_cleaned_articles_category", "cleaned_articles", ["category"], unique=False) + op.create_index("ix_cleaned_articles_duplicate_group_id", "cleaned_articles", ["duplicate_group_id"], unique=False) + op.create_index("ix_cleaned_articles_is_representative", "cleaned_articles", ["is_representative"], unique=False) + op.create_index("ix_cleaned_articles_processing_status", "cleaned_articles", ["processing_status"], unique=False) + op.create_index("ix_cleaned_articles_tags", "cleaned_articles", ["tags"], postgresql_using="gin") + op.create_index("ix_cleaned_articles_reference_links", "cleaned_articles", ["reference_links"], postgresql_using="gin") + + # Add FK from duplicate_groups to cleaned_articles (circular dependency resolution) + op.create_foreign_key( + "fk_duplicate_groups_representative_article_id", + "duplicate_groups", + "cleaned_articles", + ["representative_article_id"], + ["id"], + ondelete="SET NULL", + ) + + # Article references + op.create_table( + "article_references", + sa.Column("id", postgresql.UUID(as_uuid=True), server_default=sa.text("uuid_generate_v4()"), nullable=False), + sa.Column("source_article_id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column("referenced_article_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("reference_type", sa.String(64), nullable=False), + sa.Column("reference_link", sa.String(2048), nullable=True), + sa.Column("reference_title", sa.String(1024), nullable=True), + sa.Column("similarity", sa.Float(), nullable=True), + sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.ForeignKeyConstraint(["source_article_id"], ["cleaned_articles.id"], ondelete="CASCADE"), + sa.ForeignKeyConstraint(["referenced_article_id"], ["cleaned_articles.id"], ondelete="SET NULL"), + sa.PrimaryKeyConstraint("id"), + ) + 
op.create_index("ix_article_references_source_article_id", "article_references", ["source_article_id"], unique=False) + op.create_index("ix_article_references_referenced_article_id", "article_references", ["referenced_article_id"], unique=False) + op.create_index("ix_article_references_reference_type", "article_references", ["reference_type"], unique=False) + + # Skills + op.create_table( + "skills", + sa.Column("id", postgresql.UUID(as_uuid=True), server_default=sa.text("uuid_generate_v4()"), nullable=False), + sa.Column("name", sa.String(128), nullable=False), + sa.Column("slug", sa.String(128), nullable=False), + sa.Column("description", sa.Text(), nullable=True), + sa.Column("type", sa.String(32), nullable=False), + sa.Column("version", sa.Integer(), nullable=False), + sa.Column("is_default", sa.Boolean(), nullable=False), + sa.Column("system_prompt", sa.Text(), nullable=False), + sa.Column("output_schema", postgresql.JSONB(astext_type=sa.Text()), nullable=True), + sa.Column("tools", postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.Column("input_schema", postgresql.JSONB(astext_type=sa.Text()), nullable=True), + sa.Column("example_inputs", postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.Column("created_by", sa.String(64), nullable=True), + sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint("slug"), + ) + op.create_index("ix_skills_slug", "skills", ["slug"], unique=False) + op.create_index("ix_skills_type", "skills", ["type"], unique=False) + + # AI provider configs + op.create_table( + "ai_provider_configs", + sa.Column("id", postgresql.UUID(as_uuid=True), server_default=sa.text("uuid_generate_v4()"), nullable=False), + sa.Column("name", sa.String(128), nullable=False), + sa.Column("provider", sa.String(64), nullable=False), + 
sa.Column("base_url", sa.String(512), nullable=True), + sa.Column("api_key_encrypted", sa.Text(), nullable=True), + sa.Column("default_model", sa.String(128), nullable=True), + sa.Column("timeout", sa.Integer(), nullable=False), + sa.Column("max_retries", sa.Integer(), nullable=False), + sa.Column("rate_limit_rpm", sa.Integer(), nullable=False), + sa.Column("is_active", sa.Boolean(), nullable=False), + sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index("ix_ai_provider_configs_provider", "ai_provider_configs", ["provider"], unique=False) + + # AI task configs + op.create_table( + "ai_task_configs", + sa.Column("id", postgresql.UUID(as_uuid=True), server_default=sa.text("uuid_generate_v4()"), nullable=False), + sa.Column("task_type", sa.String(64), nullable=False), + sa.Column("name", sa.String(128), nullable=False), + sa.Column("provider_config_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("model", sa.String(128), nullable=False), + sa.Column("skill_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("temperature", sa.Float(), nullable=False), + sa.Column("max_tokens", sa.Integer(), nullable=True), + sa.Column("top_p", sa.Float(), nullable=False), + sa.Column("system_prompt_override", sa.Text(), nullable=True), + sa.Column("fallback_config_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("enabled", sa.Boolean(), nullable=False), + sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.ForeignKeyConstraint(["provider_config_id"], ["ai_provider_configs.id"], ondelete="SET NULL"), + sa.ForeignKeyConstraint(["skill_id"], ["skills.id"], ondelete="SET NULL"), + 
sa.ForeignKeyConstraint(["fallback_config_id"], ["ai_task_configs.id"], ondelete="SET NULL"), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index("ix_ai_task_configs_task_type", "ai_task_configs", ["task_type"], unique=False) + + # Output tasks + op.create_table( + "output_tasks", + sa.Column("id", postgresql.UUID(as_uuid=True), server_default=sa.text("uuid_generate_v4()"), nullable=False), + sa.Column("name", sa.String(128), nullable=False), + sa.Column("task_type", sa.String(64), nullable=False), + sa.Column("skill_id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column("schedule", sa.String(128), nullable=True), + sa.Column("filter_config", postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.Column("output_config", postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.Column("is_active", sa.Boolean(), nullable=False), + sa.Column("last_run_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("last_output_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.ForeignKeyConstraint(["skill_id"], ["skills.id"], ondelete="CASCADE"), + sa.ForeignKeyConstraint(["last_output_id"], ["outputs.id"], ondelete="SET NULL"), + sa.PrimaryKeyConstraint("id"), + ) + + # Outputs + op.create_table( + "outputs", + sa.Column("id", postgresql.UUID(as_uuid=True), server_default=sa.text("uuid_generate_v4()"), nullable=False), + sa.Column("output_task_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("content", sa.Text(), nullable=True), + sa.Column("content_html", sa.Text(), nullable=True), + sa.Column("references", postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.Column("metadata", postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.Column("created_at", sa.DateTime(timezone=True), 
server_default=sa.text("now()"), nullable=False), + sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.ForeignKeyConstraint(["output_task_id"], ["output_tasks.id"], ondelete="SET NULL"), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index("ix_outputs_output_task_id", "outputs", ["output_task_id"], unique=False) + + # Chat sessions + op.create_table( + "chat_sessions", + sa.Column("id", postgresql.UUID(as_uuid=True), server_default=sa.text("uuid_generate_v4()"), nullable=False), + sa.Column("user_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("title", sa.String(256), nullable=True), + sa.Column("skill_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("context_window", sa.Integer(), nullable=False), + sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.ForeignKeyConstraint(["user_id"], ["users.id"], ondelete="CASCADE"), + sa.ForeignKeyConstraint(["skill_id"], ["skills.id"], ondelete="SET NULL"), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index("ix_chat_sessions_user_id", "chat_sessions", ["user_id"], unique=False) + + # Chat messages + op.create_table( + "chat_messages", + sa.Column("id", postgresql.UUID(as_uuid=True), server_default=sa.text("uuid_generate_v4()"), nullable=False), + sa.Column("session_id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column("role", sa.String(32), nullable=False), + sa.Column("content", sa.Text(), nullable=True), + sa.Column("tool_calls", postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.Column("tool_results", postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.Column("references", postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.Column("token_usage", postgresql.JSONB(astext_type=sa.Text()), nullable=True), + sa.Column("created_at", 
sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.ForeignKeyConstraint(["session_id"], ["chat_sessions.id"], ondelete="CASCADE"), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index("ix_chat_messages_session_id", "chat_messages", ["session_id"], unique=False) + op.create_index("ix_chat_messages_role", "chat_messages", ["role"], unique=False) + + # Locks + op.create_table( + "locks", + sa.Column("id", postgresql.UUID(as_uuid=True), server_default=sa.text("uuid_generate_v4()"), nullable=False), + sa.Column("lock_name", sa.String(128), nullable=False), + sa.Column("owner_id", sa.String(128), nullable=True), + sa.Column("acquired_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.Column("expires_at", sa.DateTime(timezone=True), nullable=True), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint("lock_name"), + ) + + # App settings + op.create_table( + "app_settings", + sa.Column("id", postgresql.UUID(as_uuid=True), server_default=sa.text("uuid_generate_v4()"), nullable=False), + sa.Column("key", sa.String(128), nullable=False), + sa.Column("value", sa.Text(), nullable=False), + sa.Column("description", sa.Text(), nullable=True), + sa.Column("is_sensitive", sa.Boolean(), nullable=False), + sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint("key"), + ) + op.create_index("ix_app_settings_key", "app_settings", ["key"], unique=False) + + +def downgrade() -> None: + op.drop_table("app_settings") + op.drop_table("locks") + op.drop_table("chat_messages") + op.drop_table("chat_sessions") + op.drop_table("outputs") + op.drop_table("output_tasks") + op.drop_table("ai_task_configs") + op.drop_table("ai_provider_configs") + op.drop_table("skills") + op.drop_table("article_references") + 
async def get_current_user(
    credentials: HTTPAuthorizationCredentials | None = Depends(security),
    db: AsyncSession = Depends(get_db),
) -> User:
    """Resolve the request's bearer access token to an active ``User``.

    Raises:
        HTTPException: 401 when the header is missing, the token fails
            decoding/validation, lacks ``sub``/``jti``, has been revoked,
            or references a user that does not exist; 403 when the user
            exists but is inactive.
    """

    def unauthorized(detail: str) -> HTTPException:
        # Every 401 from this dependency carries the same challenge header.
        return HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail=detail,
            headers={"WWW-Authenticate": "Bearer"},
        )

    if not credentials:
        raise unauthorized("Not authenticated")

    try:
        claims = TokenPayload(
            **decode_token(credentials.credentials, expected_type="access")
        )
    except ValueError as exc:
        raise unauthorized(f"Invalid authentication credentials: {exc}") from exc

    if not (claims.sub and claims.jti):
        raise unauthorized("Invalid token payload")

    if await is_token_revoked(claims.jti):
        raise unauthorized("Token has been revoked")

    account = await db.get(User, claims.sub)
    if account is None:
        raise unauthorized("User not found")

    if not account.is_active:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Inactive user",
        )

    return account
@router.get("", response_model=PaginatedResponse)
async def list_articles(
    params: ArticleListParams = Depends(),
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """List cleaned articles with optional filters and pagination.

    Filters (all optional, combined with AND): ``feed_id``, ``category``,
    ``tag`` (array containment) and ``search`` (case-insensitive substring
    match on title or AI summary).

    Returns:
        A dict with ``total`` (row count matching the filters, before
        pagination) and ``items`` (the requested page as ``ArticleOut``).
    """
    query = select(CleanedArticle)

    if params.feed_id:
        query = query.where(CleanedArticle.feed_id == params.feed_id)
    if params.category:
        query = query.where(CleanedArticle.category == params.category)
    if params.tag:
        query = query.where(CleanedArticle.tags.contains([params.tag]))
    if params.search:
        # Escape LIKE metacharacters so the user's text is matched literally;
        # otherwise "%" or "_" in the search box act as wildcards.
        escaped = (
            params.search.replace("\\", "\\\\")
            .replace("%", "\\%")
            .replace("_", "\\_")
        )
        pattern = f"%{escaped}%"
        query = query.where(
            CleanedArticle.title.ilike(pattern, escape="\\")
            | CleanedArticle.ai_summary.ilike(pattern, escape="\\")
        )
    if params.is_read is not None:
        # CleanedArticle doesn't have is_read in current schema; placeholder
        pass

    # Count the filtered set before pagination is applied.
    count_query = select(func.count()).select_from(query.subquery())
    total = (await db.execute(count_query)).scalar_one()

    # Newest first; articles without a publication date sort last.
    query = (
        query.order_by(CleanedArticle.published_at.desc().nulls_last())
        .offset(params.skip)
        .limit(params.limit)
    )
    result = await db.execute(query)
    items = result.scalars().all()

    return {
        "total": total,
        "items": [ArticleOut.model_validate(item) for item in items],
    }
@router.post("/register", response_model=UserOut)
async def register(
    user_in: UserCreate,
    db: AsyncSession = Depends(get_db),
    _: User = Depends(get_current_admin),
):
    """Register a new user (admin only).

    Raises:
        HTTPException: 409 when the username is already taken, whether
            detected by the pre-check or by the database on commit.
    """
    # Local import: the router module does not import it at file level.
    from sqlalchemy.exc import IntegrityError

    # Fast path: report an existing username without attempting the insert.
    result = await db.execute(select(User).where(User.username == user_in.username))
    if result.scalar_one_or_none():
        raise HTTPException(
            status_code=status.HTTP_409_CONFLICT,
            detail="Username already exists",
        )

    user = User(
        username=user_in.username,
        password_hash=get_password_hash(user_in.password),
        role=user_in.role,
        is_active=user_in.is_active,
    )
    db.add(user)
    try:
        await db.commit()
    except IntegrityError as exc:
        # A concurrent request can insert the same username between the
        # pre-check and this commit; report it as a conflict, not a 500.
        # NOTE(review): assumes the username uniqueness constraint is the
        # only integrity rule this insert can violate - confirm against the
        # users table definition.
        await db.rollback()
        raise HTTPException(
            status_code=status.HTTP_409_CONFLICT,
            detail="Username already exists",
        ) from exc
    await db.refresh(user)
    return user
@router.post("/refresh", response_model=TokenResponse)
async def refresh(
    req: RefreshTokenRequest,
    db: AsyncSession = Depends(get_db),
):
    """Exchange a valid refresh token for a new token pair.

    Raises:
        HTTPException: 401 when the refresh token fails decoding, has been
            revoked (e.g. via ``/logout``), or belongs to a missing or
            inactive user.
    """
    # Local import: this router's file-level import list does not include it.
    from app.core.auth import is_token_revoked

    try:
        payload = decode_token(req.refresh_token, expected_type="refresh")
    except ValueError as exc:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail=f"Invalid refresh token: {exc}",
            headers={"WWW-Authenticate": "Bearer"},
        ) from exc

    # /logout revokes the refresh token's jti; without this check a
    # logged-out token could still be exchanged until it expires.
    if await is_token_revoked(payload["jti"]):
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Refresh token has been revoked",
            headers={"WWW-Authenticate": "Bearer"},
        )

    user = await db.get(User, payload["sub"])
    if user is None or not user.is_active:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid user",
            headers={"WWW-Authenticate": "Bearer"},
        )

    access_token, _ = create_access_token(sub=str(user.id), role=user.role)
    refresh_token, _ = create_refresh_token(sub=str(user.id))
    return {
        "access_token": access_token,
        "refresh_token": refresh_token,
        "token_type": "bearer",
    }
@router.get("", response_model=PaginatedResponse)
async def list_feeds(
    pagination: PaginationParams = Depends(),
    category: str | None = Query(None),
    search: str | None = Query(None),
    is_active: bool | None = Query(None),
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """List RSS feeds with pagination and filters.

    Filters (all optional, combined with AND): exact ``category``,
    ``is_active`` flag, and ``search`` (case-insensitive substring match on
    title, URL or description).

    Returns:
        A dict with ``total`` (rows matching the filters, before
        pagination) and ``items`` (the requested page as ``FeedOut``).
    """
    query = select(Feed)

    if category:
        query = query.where(Feed.category == category)
    if search:
        # Escape LIKE metacharacters so the user's text is matched literally;
        # otherwise "%" or "_" in the search box act as wildcards.
        escaped = (
            search.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        )
        pattern = f"%{escaped}%"
        query = query.where(
            Feed.title.ilike(pattern, escape="\\")
            | Feed.url.ilike(pattern, escape="\\")
            | Feed.description.ilike(pattern, escape="\\")
        )
    if is_active is not None:
        query = query.where(Feed.is_active == is_active)

    # Get total count of the filtered set before pagination.
    count_query = select(func.count()).select_from(query.subquery())
    total = (await db.execute(count_query)).scalar_one()

    # Get paginated items, newest first.
    query = (
        query.order_by(Feed.created_at.desc())
        .offset(pagination.skip)
        .limit(pagination.limit)
    )
    result = await db.execute(query)
    items = result.scalars().all()

    return {
        "total": total,
        "items": [FeedOut.model_validate(item) for item in items],
    }
@router.post("", response_model=FeedOut, status_code=status.HTTP_201_CREATED)
async def create_feed(
    feed_in: FeedCreate,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Create a new RSS feed.

    Raises:
        HTTPException: 409 when a feed with the same URL already exists,
            whether detected by the pre-check or by the database on commit.
    """
    # Local import: the router module does not import it at file level.
    from sqlalchemy.exc import IntegrityError

    url = str(feed_in.url)

    # Check URL uniqueness up front for the common case.
    result = await db.execute(select(Feed).where(Feed.url == url))
    if result.scalar_one_or_none():
        raise HTTPException(
            status_code=status.HTTP_409_CONFLICT,
            detail="Feed with this URL already exists",
        )

    feed = Feed(
        url=url,
        title=feed_in.title or "",
        description=feed_in.description or "",
        category=feed_in.category or "",
        is_active=feed_in.is_active,
        fetch_interval_minutes=feed_in.fetch_interval_minutes,
        priority=feed_in.priority,
        parser_config=feed_in.parser_config,
        proxy_policy=feed_in.proxy_policy,
    )
    db.add(feed)
    try:
        await db.commit()
    except IntegrityError as exc:
        # Two concurrent requests can both pass the pre-check; the loser
        # should get a conflict response rather than a 500.
        # NOTE(review): assumes feeds.url carries a unique constraint and is
        # the only integrity rule this insert can violate - confirm against
        # the migration.
        await db.rollback()
        raise HTTPException(
            status_code=status.HTTP_409_CONFLICT,
            detail="Feed with this URL already exists",
        ) from exc
    await db.refresh(feed)
    return FeedOut.model_validate(feed)
@router.get("")
async def health_check(request: Request, db: AsyncSession = Depends(get_db)):
    """Report overall service health plus per-dependency status.

    Probes the database with ``SELECT 1`` and Redis via
    ``check_redis_health``. ``status`` is ``"ok"`` only when both probes
    succeed, otherwise ``"degraded"``. Any ``startup_warnings`` stored on
    the application state are echoed under ``warnings``.
    """
    try:
        await db.execute(text("SELECT 1"))
    except Exception:
        # Any failure of the probe query counts as "database down".
        database_up = False
    else:
        database_up = True

    cache_up = await check_redis_health()

    report = {
        "status": "ok" if (database_up and cache_up) else "degraded",
        "service": "rss-platform",
        "db": "ok" if database_up else "error",
        "redis": "ok" if cache_up else "error",
    }

    startup_warnings = getattr(request.app.state, "startup_warnings", None)
    if startup_warnings:
        report["warnings"] = startup_warnings
    return report
"error", "component": "redis"} diff --git a/backend/app/api/v1/settings.py b/backend/app/api/v1/settings.py new file mode 100644 index 0000000..b64d683 --- /dev/null +++ b/backend/app/api/v1/settings.py @@ -0,0 +1,92 @@ +"""Settings router.""" +from typing import Any + +from fastapi import APIRouter, Depends, HTTPException, status +from sqlalchemy.ext.asyncio import AsyncSession + +from app.api.deps import get_current_admin, get_current_user, get_db +from app.models.user import User +from app.schemas.common import MessageResponse +from app.services.settings_service import ( + apply_db_settings_to_config, + list_settings, + reset_settings, + set_setting, +) + +router = APIRouter(prefix="/settings", tags=["settings"]) + + +@router.get("") +async def get_settings( + db: AsyncSession = Depends(get_db), + current_user: User = Depends(get_current_user), +): + """List all settings.""" + return await list_settings(db, mask_sensitive=current_user.role != "admin") + + +@router.put("/{key}") +async def update_setting( + key: str, + value: dict[str, Any], + db: AsyncSession = Depends(get_db), + current_user: User = Depends(get_current_admin), +): + """Update a single setting.""" + if "value" not in value: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Request body must contain 'value' field", + ) + + success = await set_setting(db, key, value["value"]) + if not success: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"Invalid setting key: {key}", + ) + + await apply_db_settings_to_config(db) + return {"message": "Setting updated", "key": key} + + +@router.put("") +async def batch_update_settings( + data: dict[str, Any], + db: AsyncSession = Depends(get_db), + current_user: User = Depends(get_current_admin), +): + """Update multiple settings.""" + settings_data = data.get("settings", {}) + if not settings_data: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Request body must contain 'settings' 
def _create_token(data: dict[str, Any], expires_delta: timedelta, token_type: str) -> tuple[str, str]:
    """Create a signed JWT carrying ``exp``/``iat``/``jti``/``type`` claims.

    Args:
        data: Base claims (e.g. ``sub``, ``role``); not mutated.
        expires_delta: Lifetime of the token, measured from ``iat``.
        token_type: Value stored in the ``type`` claim.

    Returns:
        ``(token, jti)`` - the encoded JWT and its freshly generated id.
    """
    jti = str(uuid4())
    # Read the clock once so that exp - iat is exactly expires_delta.
    issued_at = datetime.now(timezone.utc)
    to_encode = data.copy()
    to_encode.update({
        "exp": issued_at + expires_delta,
        "iat": issued_at,
        "jti": jti,
        "type": token_type,
    })
    encoded_jwt = jwt.encode(to_encode, settings.SECRET_KEY, algorithm=ALGORITHM)
    return encoded_jwt, jti
async def revoke_token(jti: str, expires_at: datetime) -> None:
    """Add ``jti`` to the revoked-token set until the token would expire.

    All revoked ids share one Redis set, so the set must live at least as
    long as the longest-lived token in it. The set's TTL is therefore only
    ever extended, never shortened: revoking a token that expires soon must
    not cause earlier, longer-lived revocations to be forgotten.

    Args:
        jti: The token id to revoke.
        expires_at: Timezone-aware expiry of the token being revoked.
    """
    redis = await get_redis()
    if redis is None:
        return

    ttl_seconds = int((expires_at - datetime.now(timezone.utc)).total_seconds())
    if ttl_seconds <= 0:
        # Token is already expired; nothing to remember.
        return

    await redis.sadd(REVOKED_JTIS_KEY, jti)

    # TTL returns -1 when the key has no expiry and -2 when it is missing,
    # so "remaining < ttl_seconds" also covers a freshly created set.
    remaining = await redis.ttl(REVOKED_JTIS_KEY)
    if remaining < ttl_seconds:
        await redis.expire(REVOKED_JTIS_KEY, ttl_seconds)
async def get_db() -> AsyncSession:
    """Yield one ``AsyncSession`` per request for use as a FastAPI dependency.

    The session is created from ``AsyncSessionLocal`` and closed when the
    consumer finishes, including when it raises.
    """
    # NOTE(review): the body is an async generator, so the return annotation
    # would more accurately be AsyncGenerator[AsyncSession, None]; changing it
    # requires importing AsyncGenerator at file level.
    async with AsyncSessionLocal() as session:
        try:
            yield session
        finally:
            # Explicit close in addition to the context manager's own exit.
            await session.close()
# Per-request correlation id; empty string outside of a request context.
request_id_var: ContextVar[str] = ContextVar("request_id", default="")


def configure_logging(log_level: str = "INFO") -> None:
    """Install a single stdout handler on the root logger.

    Args:
        log_level: Level name (case-insensitive). Unknown names fall back
            to ``INFO``.
    """
    handler = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - [%(request_id)s] %(message)s"
    )
    handler.setFormatter(formatter)
    # The format string needs %(request_id)s on every record, including
    # those emitted by third-party loggers, so filter at the handler.
    handler.addFilter(RequestIdFilter())

    root_logger = logging.getLogger()
    root_logger.setLevel(getattr(logging, log_level.upper(), logging.INFO))
    # Replace any previously installed handlers to avoid duplicate output.
    root_logger.handlers = []
    root_logger.addHandler(handler)

    # Reduce noise from third-party libraries
    logging.getLogger("sqlalchemy.engine").setLevel(logging.WARNING)
    logging.getLogger("uvicorn.access").setLevel(logging.WARNING)


class RequestIdFilter(logging.Filter):
    """Inject the current ``request_id`` into every log record."""

    def filter(self, record: logging.LogRecord) -> bool:
        """Attach ``request_id`` to ``record``; never drops the record."""
        record.request_id = request_id_var.get()  # type: ignore[attr-defined]
        return True


def get_logger(name: str) -> logging.Logger:
    """Return the named logger with a ``RequestIdFilter`` attached once.

    ``logging.getLogger`` returns the same object for the same name, so the
    filter is added only if the logger does not already carry one; repeated
    calls no longer pile up duplicate filters.
    """
    logger = logging.getLogger(name)
    if not any(isinstance(existing, RequestIdFilter) for existing in logger.filters):
        logger.addFilter(RequestIdFilter())
    return logger
async def check_redis_health() -> bool:
    """Return ``True`` when a Redis ``PING`` succeeds, ``False`` otherwise.

    Any exception raised while obtaining the client or pinging is treated
    as "unreachable" rather than propagated.
    """
    try:
        client = await get_redis()
        await client.ping()
    except Exception:
        return False
    else:
        return True
class AITaskConfig(Base, UUIDMixin, TimestampMixin):
    """AI task configuration (which model/skill for which task).

    Each row binds a task type to a model name plus sampling parameters,
    optionally pointing at a provider configuration, a skill and another
    task configuration to fall back to.
    """

    __tablename__ = "ai_task_configs"

    # Indexed discriminator for the kind of task this row configures.
    task_type: Mapped[str] = mapped_column(String(64), nullable=False, index=True)
    name: Mapped[str] = mapped_column(String(128), nullable=False)
    # Optional provider; the reference is nulled when the provider row is deleted.
    provider_config_id: Mapped[str | None] = mapped_column(
        ForeignKey("ai_provider_configs.id", ondelete="SET NULL"), nullable=True
    )
    model: Mapped[str] = mapped_column(String(128), nullable=False)
    # Optional skill; the reference is nulled when the skill row is deleted.
    skill_id: Mapped[str | None] = mapped_column(
        ForeignKey("skills.id", ondelete="SET NULL"), nullable=True
    )
    # Sampling parameters; max_tokens has no default and may be left unset.
    temperature: Mapped[float] = mapped_column(Float, default=0.3, nullable=False)
    max_tokens: Mapped[int | None] = mapped_column(Integer, nullable=True)
    top_p: Mapped[float] = mapped_column(Float, default=1.0, nullable=False)
    system_prompt_override: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Self-referential link to another row of this same table.
    fallback_config_id: Mapped[str | None] = mapped_column(
        ForeignKey("ai_task_configs.id", ondelete="SET NULL"), nullable=True
    )
    enabled: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False)
class CleanedArticle(Base, UUIDMixin, TimestampMixin):
    """Cleaned and AI-enriched article.

    Carries denormalised feed information, the cleaned content, AI output
    (summary, category, tags), ranking scores and de-duplication state.
    """

    __tablename__ = "cleaned_articles"

    # The raw row may be deleted independently; the link is then nulled.
    raw_article_id: Mapped[str | None] = mapped_column(
        ForeignKey("raw_articles.id", ondelete="SET NULL"), nullable=True, index=True
    )
    # Cleaned articles are removed together with their feed.
    feed_id: Mapped[str] = mapped_column(
        ForeignKey("feeds.id", ondelete="CASCADE"), nullable=False, index=True
    )

    title: Mapped[str | None] = mapped_column(String(1024), default="", index=True)
    link: Mapped[str] = mapped_column(String(2048), default="", index=True)
    author: Mapped[str | None] = mapped_column(String(256), default="")
    feed_title: Mapped[str | None] = mapped_column(String(512), default="")
    feed_category: Mapped[str | None] = mapped_column(String(128), default="")

    published_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True, index=True)
    fetched_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False, index=True)

    content: Mapped[str | None] = mapped_column(Text, default="")
    content_length: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    original_summary: Mapped[str | None] = mapped_column(Text, default="")
    ai_summary: Mapped[str | None] = mapped_column(Text, default="")

    category: Mapped[str | None] = mapped_column(String(128), default="", index=True)
    tags: Mapped[list] = mapped_column(JSON, default=list, nullable=False)

    heat_score: Mapped[float] = mapped_column(Float, default=0.0, nullable=False)
    importance_score: Mapped[float] = mapped_column(Float, default=0.0, nullable=False)
    duplication_score: Mapped[float] = mapped_column(Float, default=0.0, nullable=False)
    composite_score: Mapped[float] = mapped_column(Float, default=0.0, nullable=False)

    duplicate_group_id: Mapped[str | None] = mapped_column(
        ForeignKey("duplicate_groups.id", ondelete="SET NULL"), nullable=True, index=True
    )
    is_representative: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False, index=True)
    reference_links: Mapped[list] = mapped_column(JSON, default=list, nullable=False)
    processing_status: Mapped[str] = mapped_column(String(32), default="pending", nullable=False, index=True)

    raw_article: Mapped["RawArticle | None"] = relationship("RawArticle", back_populates="cleaned_article")
    # cleaned_articles and duplicate_groups reference each other
    # (cleaned_articles.duplicate_group_id and
    # duplicate_groups.representative_article_id). With two foreign-key paths
    # SQLAlchemy cannot infer the join and mapper configuration fails, so the
    # column that drives this relationship is named explicitly.
    duplicate_group: Mapped["DuplicateGroup | None"] = relationship(
        "DuplicateGroup",
        back_populates="articles",
        foreign_keys=[duplicate_group_id],
    )
class Feed(Base, UUIDMixin, TimestampMixin):
    """RSS feed source.

    Stores the feed URL and fetch configuration together with running
    fetch statistics, from which ``health_status`` derives a coarse
    health label.
    """

    __tablename__ = "feeds"

    url: Mapped[str] = mapped_column(String(2048), unique=True, nullable=False, index=True)
    title: Mapped[str | None] = mapped_column(String(512), default="")
    description: Mapped[str | None] = mapped_column(Text, default="")
    category: Mapped[str | None] = mapped_column(String(128), default="")
    is_active: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False, index=True)
    fetch_interval_minutes: Mapped[int] = mapped_column(Integer, default=60, nullable=False)
    priority: Mapped[int] = mapped_column(Integer, default=5, nullable=False)
    parser_config: Mapped[dict] = mapped_column(JSON, default=dict, nullable=False)
    proxy_policy: Mapped[str] = mapped_column(String(32), default="auto", nullable=False)

    # Fetch statistics
    last_fetch_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
    last_fetch_status: Mapped[str | None] = mapped_column(String(32), default="")
    last_error: Mapped[str | None] = mapped_column(Text, default="")
    error_type: Mapped[str | None] = mapped_column(String(64), default="")
    success_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    fail_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)
    article_count: Mapped[int] = mapped_column(Integer, default=0, nullable=False)

    raw_articles: Mapped[list["RawArticle"]] = relationship(
        "RawArticle", back_populates="feed", cascade="all, delete-orphan"
    )

    def health_status(self, now: datetime | None = None) -> str:
        """Compute feed health status.

        Args:
            now: Reference time; defaults to the current UTC time.

        Returns:
            ``"unknown"`` when no fetch has been recorded, ``"healthy"``
            for a success rate of at least 90 %, ``"warning"`` for at
            least 50 % (both only while the last fetch is at most 7 days
            old or was never recorded), otherwise ``"unhealthy"``.
        """
        if now is None:
            now = datetime.now(timezone.utc)

        # Column defaults are only applied at flush time, so a freshly
        # constructed instance still carries None here; treat it as zero
        # instead of failing on ``None + None``.
        successes = self.success_count or 0
        failures = self.fail_count or 0
        total = successes + failures
        if total == 0:
            return "unknown"

        success_rate = successes / total
        days_since = None
        if self.last_fetch_at:
            days_since = (now - self.last_fetch_at).days
        recent = days_since is None or days_since <= 7

        if success_rate >= 0.9 and recent:
            return "healthy"
        if success_rate >= 0.5 and recent:
            return "warning"
        return "unhealthy"

    def __repr__(self) -> str:
        # The previous implementation returned an empty string, which made
        # instances indistinguishable in logs and debugger output.
        return f"<Feed id={self.id} url={self.url!r}>"
class Lock(Base, UUIDMixin):
    """Distributed lock record (fallback when Redis is unavailable).

    One row per lock name; the unique constraint on ``lock_name`` is what
    prevents two rows from representing the same lock.
    """

    __tablename__ = "locks"

    # Name of the lock; unique, so at most one row can exist per lock.
    lock_name: Mapped[str] = mapped_column(String(128), unique=True, nullable=False, index=True)
    # Identifier of the holder; nullable.
    owner_id: Mapped[str | None] = mapped_column(String(128), nullable=True)
    # Defaults to the current timezone-aware UTC time when not supplied.
    acquired_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), nullable=False, default=_utc_now
    )
    # Nullable expiry timestamp.
    expires_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
class Output(Base, UUIDMixin, TimestampMixin):
    """Generated output record.

    Holds the rendered content of one output run, optionally linked to
    the task that produced it.
    """

    __tablename__ = "outputs"

    # The originating task may be deleted; the link is then nulled.
    output_task_id: Mapped[str | None] = mapped_column(
        ForeignKey("output_tasks.id", ondelete="SET NULL"), nullable=True, index=True
    )
    content: Mapped[str | None] = mapped_column(Text, default="")
    content_html: Mapped[str | None] = mapped_column(Text, default="")
    references: Mapped[list] = mapped_column(JSON, default=list, nullable=False)
    # ``metadata`` is reserved by SQLAlchemy's Declarative API (it is the
    # MetaData registry on the base class); declaring an attribute with that
    # name raises InvalidRequestError when the class is defined. The Python
    # attribute is therefore ``metadata_`` while the database column keeps
    # the name "metadata".
    metadata_: Mapped[dict] = mapped_column("metadata", JSON, default=dict, nullable=False)
class DuplicateGroup(Base, UUIDMixin, TimestampMixin):
    """Group of duplicate articles.

    Records which cleaned articles were judged duplicates of each other
    and which one represents the group.
    """

    __tablename__ = "duplicate_groups"

    # The representative may be deleted; the link is then nulled.
    representative_article_id: Mapped[str | None] = mapped_column(
        ForeignKey("cleaned_articles.id", ondelete="SET NULL"), nullable=True, index=True
    )
    member_article_ids: Mapped[list] = mapped_column(JSON, default=list, nullable=False)
    similarity_matrix: Mapped[dict] = mapped_column(JSON, default=dict, nullable=False)
    brief_date: Mapped[str | None] = mapped_column(String(10), default="", index=True)

    # duplicate_groups and cleaned_articles reference each other
    # (representative_article_id above and cleaned_articles.duplicate_group_id).
    # With two foreign-key paths SQLAlchemy cannot infer the join and mapper
    # configuration fails, so the membership column is named explicitly.
    # NOTE(review): the reverse relationship, CleanedArticle.duplicate_group,
    # needs the same disambiguation.
    articles: Mapped[list["CleanedArticle"]] = relationship(
        "CleanedArticle",
        back_populates="duplicate_group",
        foreign_keys="CleanedArticle.duplicate_group_id",
    )
class Skill(Base, UUIDMixin, TimestampMixin):
    """Reusable skill configuration for AI outputs.

    A skill bundles a system prompt with optional input/output schemas,
    a tool list and example inputs under a unique slug.
    """

    __tablename__ = "skills"

    name: Mapped[str] = mapped_column(String(128), nullable=False)
    # Unique, indexed identifier of the skill.
    slug: Mapped[str] = mapped_column(String(128), unique=True, nullable=False, index=True)
    description: Mapped[str | None] = mapped_column(Text, default="")
    type: Mapped[str] = mapped_column(String(32), nullable=False, index=True)  # output / tool / agent
    version: Mapped[int] = mapped_column(Integer, default=1, nullable=False)
    is_default: Mapped[bool] = mapped_column(Boolean, default=False, nullable=False)

    # Prompt is mandatory; both schemas are optional JSON documents.
    system_prompt: Mapped[str] = mapped_column(Text, nullable=False)
    output_schema: Mapped[dict | None] = mapped_column(JSON, nullable=True)
    tools: Mapped[list] = mapped_column(JSON, default=list, nullable=False)
    input_schema: Mapped[dict | None] = mapped_column(JSON, nullable=True)
    example_inputs: Mapped[list] = mapped_column(JSON, default=list, nullable=False)

    # Plain string column, not a foreign key.
    created_by: Mapped[str | None] = mapped_column(String(64), nullable=True)
class ArticleOut(BaseModel):
    """Cleaned article output schema.

    Identifiers and timestamps are exposed as strings; ``model_validate``
    converts them from the source object before validating.
    """

    model_config = ConfigDict(from_attributes=True)

    id: str
    raw_article_id: str | None = None
    feed_id: str
    title: str | None = None
    link: str
    author: str | None = None
    feed_title: str | None = None
    feed_category: str | None = None
    published_at: str | None = None
    fetched_at: str
    content: str | None = None
    original_summary: str | None = None
    ai_summary: str | None = None
    category: str | None = None
    tags: list[str] = []
    heat_score: float = 0.0
    importance_score: float = 0.0
    duplication_score: float = 0.0
    composite_score: float = 0.0
    is_representative: bool = True
    reference_links: list[dict] = []
    processing_status: str = "pending"
    created_at: str
    updated_at: str

    @classmethod
    def model_validate(cls, obj, *args, **kwargs):
        """Build the schema from *obj*, stringifying ids and datetimes.

        Dicts and pydantic models are handed to the standard validator
        unchanged. Any other object is read through its instance
        ``__dict__``: only attributes that are declared schema fields are
        copied, datetime fields are rendered with ``isoformat()`` and
        identifier fields with ``str()``. The result is then validated
        normally, so a missing required field raises a ``ValidationError``
        instead of yielding a half-populated object.

        Extra positional/keyword arguments (``strict``, ``context``, ...)
        are forwarded, keeping the override call-compatible with
        ``BaseModel.model_validate``.
        """
        if isinstance(obj, (dict, BaseModel)):
            return super().model_validate(obj, *args, **kwargs)

        datetime_fields = ("created_at", "updated_at", "published_at", "fetched_at")
        id_fields = ("id", "raw_article_id", "feed_id")

        # Only attributes already present on the instance are read, as
        # before; anything that is not a declared field is ignored.
        present = vars(obj)
        data = {}
        for name in cls.model_fields:
            if name not in present:
                continue
            value = getattr(obj, name)
            if value is not None:
                if name in datetime_fields:
                    value = value.isoformat()
                elif name in id_fields and not isinstance(value, str):
                    # e.g. uuid.UUID primary/foreign keys
                    value = str(value)
            data[name] = value
        return super().model_validate(data, *args, **kwargs)
class FeedOut(FeedBase):
    """Feed output schema.

    Extends the base feed fields with the identifier, fetch statistics,
    the computed health label and string timestamps.
    """

    model_config = ConfigDict(from_attributes=True)

    id: str
    last_fetch_at: str | None = None
    last_fetch_status: str | None = None
    last_error: str | None = None
    error_type: str | None = None
    success_count: int = 0
    fail_count: int = 0
    article_count: int = 0
    health_status: str = "unknown"
    created_at: str
    updated_at: str

    @classmethod
    def model_validate(cls, obj, *args, **kwargs):
        """Build the schema from *obj*, computing ``health_status``.

        Dicts and pydantic models are handed to the standard validator
        unchanged. Any other object is read through its instance
        ``__dict__``: only attributes that are declared schema fields are
        copied, datetime fields are rendered with ``isoformat()``, the
        identifier with ``str()``, and ``health_status`` is taken from
        ``obj.health_status()``. The result is then validated normally
        rather than constructed unchecked.

        Extra positional/keyword arguments (``strict``, ``context``, ...)
        are forwarded, keeping the override call-compatible with
        ``BaseModel.model_validate``.
        """
        if isinstance(obj, (dict, BaseModel)):
            return super().model_validate(obj, *args, **kwargs)

        datetime_fields = ("created_at", "updated_at", "last_fetch_at")

        # Only attributes already present on the instance are read, as
        # before; anything that is not a declared field is ignored.
        present = vars(obj)
        data = {}
        for name in cls.model_fields:
            if name not in present:
                continue
            value = getattr(obj, name)
            if value is not None:
                if name in datetime_fields:
                    value = value.isoformat()
                elif name == "id" and not isinstance(value, str):
                    # e.g. a uuid.UUID primary key
                    value = str(value)
            data[name] = value
        # health_status is a method on the source object, not stored state.
        data["health_status"] = obj.health_status()
        return super().model_validate(data, *args, **kwargs)
@field_validator("password") + @classmethod + def _validate_password_strength(cls, value: str) -> str: + if not _PASSWORD_RE.match(value): + raise ValueError( + "password must be 8-128 characters and contain at least one letter and one number" + ) + return value + + +class UserOut(UserBase): + """User output schema.""" + + model_config = ConfigDict(from_attributes=True) + + id: str + + +class UserLogin(BaseModel): + """User login schema.""" + + username: str + password: str + + +class TokenResponse(BaseModel): + """Token response schema.""" + + access_token: str + refresh_token: str + token_type: str = "bearer" + + +class TokenPayload(BaseModel): + """JWT token payload.""" + + sub: str | None = None + role: str | None = None + jti: str | None = None + type: str | None = None + exp: int | None = None + + +class RefreshTokenRequest(BaseModel): + """Refresh token request schema.""" + + refresh_token: str diff --git a/backend/app/services/__init__.py b/backend/app/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/app/services/lock_service.py b/backend/app/services/lock_service.py new file mode 100644 index 0000000..2e0bd7c --- /dev/null +++ b/backend/app/services/lock_service.py @@ -0,0 +1,153 @@ +"""Distributed lock service with Redis and DB fallback.""" +from datetime import datetime, timedelta, timezone +from uuid import uuid4 + +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.core.database import AsyncSessionLocal +from app.core.logging import get_logger +from app.core.redis import get_redis +from app.models.lock import Lock + +logger = get_logger(__name__) + + +class LockService: + """Distributed lock service.""" + + def __init__(self, owner_id: str | None = None): + self.owner_id = owner_id or str(uuid4()) + + async def acquire(self, lock_name: str, ttl: int = 60) -> bool: + """Acquire a lock with given TTL in seconds.""" + # Try Redis first + try: + redis = await get_redis() + 
acquired = await redis.set(lock_name, self.owner_id, nx=True, ex=ttl) + if acquired: + return True + except Exception as exc: + logger.warning("Redis lock failed, falling back to DB: %s", exc) + + # Fallback to DB + return await self._acquire_db(lock_name, ttl) + + async def release(self, lock_name: str) -> bool: + """Release a lock.""" + # Try Redis first + try: + redis = await get_redis() + # Only release if we own it + current_owner = await redis.get(lock_name) + if current_owner == self.owner_id: + await redis.delete(lock_name) + return True + except Exception as exc: + logger.warning("Redis unlock failed, falling back to DB: %s", exc) + + return await self._release_db(lock_name) + + async def extend(self, lock_name: str, ttl: int = 60) -> bool: + """Extend lock TTL.""" + try: + redis = await get_redis() + current_owner = await redis.get(lock_name) + if current_owner == self.owner_id: + await redis.expire(lock_name, ttl) + return True + except Exception as exc: + logger.warning("Redis extend failed: %s", exc) + + return await self._extend_db(lock_name, ttl) + + async def is_locked(self, lock_name: str) -> bool: + """Check if a lock is held.""" + try: + redis = await get_redis() + exists = await redis.exists(lock_name) + if exists: + return True + except Exception: + pass + + async with AsyncSessionLocal() as db: + result = await db.execute(select(Lock).where(Lock.lock_name == lock_name)) + lock = result.scalar_one_or_none() + if not lock: + return False + if lock.expires_at and lock.expires_at < datetime.now(timezone.utc): + return False + return True + + async def _acquire_db(self, lock_name: str, ttl: int) -> bool: + async with AsyncSessionLocal() as db: + now = datetime.now(timezone.utc) + expires_at = now + timedelta(seconds=ttl) + + # Try to update expired lock + result = await db.execute( + select(Lock).where( + Lock.lock_name == lock_name, + Lock.expires_at < now, + ) + ) + lock = result.scalar_one_or_none() + if lock: + lock.owner_id = self.owner_id + 
lock.acquired_at = now + lock.expires_at = expires_at + await db.commit() + return True + + # Try to insert new lock + lock = Lock( + lock_name=lock_name, + owner_id=self.owner_id, + acquired_at=now, + expires_at=expires_at, + ) + db.add(lock) + try: + await db.commit() + return True + except Exception: + await db.rollback() + return False + + async def _release_db(self, lock_name: str) -> bool: + async with AsyncSessionLocal() as db: + result = await db.execute( + select(Lock).where( + Lock.lock_name == lock_name, + Lock.owner_id == self.owner_id, + ) + ) + lock = result.scalar_one_or_none() + if not lock: + return False + + await db.delete(lock) + await db.commit() + return True + + async def _extend_db(self, lock_name: str, ttl: int) -> bool: + async with AsyncSessionLocal() as db: + result = await db.execute( + select(Lock).where( + Lock.lock_name == lock_name, + Lock.owner_id == self.owner_id, + ) + ) + lock = result.scalar_one_or_none() + if not lock: + return False + + lock.expires_at = datetime.now(timezone.utc) + timedelta(seconds=ttl) + await db.commit() + return True + + +async def get_lock_service(owner_id: str | None = None) -> LockService: + """Get a lock service instance.""" + return LockService(owner_id=owner_id) diff --git a/backend/app/services/settings_service.py b/backend/app/services/settings_service.py new file mode 100644 index 0000000..20f1df1 --- /dev/null +++ b/backend/app/services/settings_service.py @@ -0,0 +1,227 @@ +"""Application settings management service.""" +from typing import Any + +from cryptography.fernet import Fernet, InvalidToken +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.core.config import settings +from app.core.logging import get_logger +from app.models.setting import AppSetting + +logger = get_logger(__name__) + +EDITABLE_SETTINGS = { + "RSSKEEPER_BASE_URL": {"description": "rssKeeper 服务地址", "sensitive": False}, + "OPENAI_API_KEY": {"description": "LLM API Key", 
def _encrypt(value: str) -> str:
    """Return *value* encrypted and prefixed, or unchanged.

    Empty values are returned as-is without consulting the key. When no
    usable Fernet instance is available the value is also returned
    unchanged, i.e. stored in plain text.
    """
    cipher = _get_fernet() if value else None
    if cipher is None:
        return value
    token = cipher.encrypt(value.encode()).decode()
    return _ENC_PREFIX + token
value or not value.startswith(_ENC_PREFIX): + return value + fernet = _get_fernet() + if fernet is None: + logger.warning("发现加密配置值但 SETTINGS_ENCRYPTION_KEY 未配置,无法解密") + return value + try: + ciphertext = value[len(_ENC_PREFIX):].encode() + return fernet.decrypt(ciphertext).decode() + except InvalidToken: + logger.warning("配置值解密失败(token 无效)") + return value + except Exception as exc: + logger.error("配置值解密失败: %s", exc) + return value + + +def _get_env_default(key: str) -> str: + """Get default value from environment/settings.""" + value = getattr(settings, key, "") + return str(value) if value is not None else "" + + +def _mask_sensitive(value: str) -> str: + """Mask sensitive value for display.""" + if not value: + return "" + if len(value) <= 8: + return "*" * len(value) + return f"{value[:4]}...{value[-4:]}" + + +async def init_default_settings(db: AsyncSession) -> None: + """Initialize default settings from environment if table is empty.""" + result = await db.execute(select(AppSetting)) + existing = result.scalars().first() + if existing: + return + + for key, meta in EDITABLE_SETTINGS.items(): + default_value = _get_env_default(key) + stored_value = _encrypt(default_value) if meta["sensitive"] else default_value + db.add( + AppSetting( + key=key, + value=stored_value, + description=meta["description"], + is_sensitive=meta["sensitive"], + ) + ) + + await db.commit() + logger.info("已初始化默认配置项: %d 条", len(EDITABLE_SETTINGS)) + + +async def _get_raw_setting(db: AsyncSession, key: str) -> AppSetting | None: + """Get setting row from DB.""" + result = await db.execute(select(AppSetting).where(AppSetting.key == key)) + return result.scalar_one_or_none() + + +async def get_setting(db: AsyncSession, key: str, default: Any = None) -> Any: + """Get decrypted setting value from DB or env default.""" + setting = await _get_raw_setting(db, key) + if setting: + return _decrypt(setting.value) if setting.is_sensitive else setting.value + return _get_env_default(key) if default 
is None else default + + +async def set_setting(db: AsyncSession, key: str, value: str) -> bool: + """Update a setting (encrypt sensitive values).""" + if key not in EDITABLE_SETTINGS: + return False + + meta = EDITABLE_SETTINGS[key] + stored_value = _encrypt(str(value)) if meta["sensitive"] else str(value) + + setting = await _get_raw_setting(db, key) + if setting: + setting.value = stored_value + else: + setting = AppSetting( + key=key, + value=stored_value, + description=meta["description"], + is_sensitive=meta["sensitive"], + ) + db.add(setting) + + await db.commit() + logger.info("配置已更新: %s", key) + return True + + +async def list_settings(db: AsyncSession, mask_sensitive: bool = True) -> list[dict[str, Any]]: + """List all settings.""" + result = await db.execute(select(AppSetting)) + db_settings = {s.key: s for s in result.scalars().all()} + + output = [] + for key, meta in EDITABLE_SETTINGS.items(): + setting = db_settings.get(key) + is_sensitive = meta["sensitive"] + + if setting: + raw_value = setting.value + updated_at = setting.updated_at.isoformat() if setting.updated_at else None + else: + raw_value = _get_env_default(key) + updated_at = None + + decrypted_value = _decrypt(raw_value) if is_sensitive else raw_value + + if is_sensitive and mask_sensitive: + display_value = _mask_sensitive(decrypted_value) + is_masked = True + else: + display_value = decrypted_value + is_masked = False + + output.append({ + "key": key, + "value": display_value, + "real_value": decrypted_value if not mask_sensitive else None, + "description": meta["description"], + "is_sensitive": is_sensitive, + "is_masked": is_masked, + "updated_at": updated_at, + }) + + return output + + +async def apply_db_settings_to_config(db: AsyncSession) -> None: + """Apply DB settings to runtime config.""" + for key in EDITABLE_SETTINGS: + db_value = await get_setting(db, key) + if db_value is None or db_value == "": + continue + + field_info = settings.model_fields.get(key) + if field_info is 
None: + continue + + target_type = field_info.annotation + try: + if target_type is int: + converted = int(db_value) + elif target_type is float: + converted = float(db_value) + elif target_type is bool: + converted = db_value.lower() in ("true", "1", "yes") + else: + converted = db_value + setattr(settings, key, converted) + except Exception as exc: + logger.error("应用配置 %s=%s 失败: %s", key, db_value, exc) + raise ValueError(f"配置项 {key} 的值无效: {db_value}") from exc + + +async def reset_settings(db: AsyncSession) -> None: + """Reset all settings to env defaults.""" + for key in EDITABLE_SETTINGS: + await set_setting(db, key, _get_env_default(key)) + logger.info("配置已重置为环境变量默认值") diff --git a/backend/app/services/task_runtime.py b/backend/app/services/task_runtime.py new file mode 100644 index 0000000..5344d04 --- /dev/null +++ b/backend/app/services/task_runtime.py @@ -0,0 +1,118 @@ +"""Task runtime progress tracking service.""" +from datetime import datetime, timezone +from typing import Any + +from app.core.logging import get_logger +from app.core.redis import get_redis + +logger = get_logger(__name__) + +TASK_STATUS_IDLE = "idle" +TASK_STATUS_RUNNING = "running" +TASK_STATUS_SUCCESS = "success" +TASK_STATUS_ERROR = "error" + + +class TaskRuntime: + """Runtime task progress tracker using Redis.""" + + def __init__(self): + self._redis = None + + async def _get_redis(self): + if self._redis is None: + self._redis = await get_redis() + return self._redis + + def _key(self, task_key: str) -> str: + return f"task_progress:{task_key}" + + async def update_progress( + self, + task_key: str, + *, + status: str | None = None, + stage: str | None = None, + current: int | None = None, + total: int | None = None, + message: str | None = None, + trigger: str | None = None, + ) -> None: + """Update task progress.""" + try: + redis = await self._get_redis() + key = self._key(task_key) + + existing = await redis.hgetall(key) + data = dict(existing) if existing else {} + + if 
status: + data["status"] = status + if stage: + data["stage"] = stage + if current is not None: + data["current"] = str(current) + if total is not None: + data["total"] = str(total) + if message is not None: + data["message"] = message + if trigger: + data["trigger"] = trigger + + data["updated_at"] = datetime.now(timezone.utc).isoformat() + if status == TASK_STATUS_RUNNING and "started_at" not in data: + data["started_at"] = data["updated_at"] + if status in (TASK_STATUS_SUCCESS, TASK_STATUS_ERROR): + data["finished_at"] = data["updated_at"] + + await redis.hset(key, mapping=data) + except Exception as exc: + logger.warning("Failed to update task progress: %s", exc) + + async def get_progress(self, task_key: str) -> dict[str, Any]: + """Get task progress.""" + try: + redis = await self._get_redis() + data = await redis.hgetall(self._key(task_key)) + if not data: + return self._empty_progress(task_key) + return { + "task_key": task_key, + "status": data.get("status", TASK_STATUS_IDLE), + "stage": data.get("stage", ""), + "current": int(data.get("current", 0)), + "total": int(data.get("total", 0)), + "message": data.get("message"), + "trigger": data.get("trigger"), + "started_at": data.get("started_at"), + "updated_at": data.get("updated_at"), + "finished_at": data.get("finished_at"), + } + except Exception as exc: + logger.warning("Failed to get task progress: %s", exc) + return self._empty_progress(task_key) + + async def reset_progress(self, task_key: str) -> None: + """Reset task progress to idle.""" + try: + redis = await self._get_redis() + await redis.delete(self._key(task_key)) + except Exception as exc: + logger.warning("Failed to reset task progress: %s", exc) + + def _empty_progress(self, task_key: str) -> dict[str, Any]: + return { + "task_key": task_key, + "status": TASK_STATUS_IDLE, + "stage": "", + "current": 0, + "total": 0, + "message": None, + "trigger": None, + "started_at": None, + "updated_at": None, + "finished_at": None, + } + + +task_runtime 
= TaskRuntime() diff --git a/backend/app/tasks/__init__.py b/backend/app/tasks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/main.py b/backend/main.py new file mode 100644 index 0000000..8996e36 --- /dev/null +++ b/backend/main.py @@ -0,0 +1,131 @@ +"""RSS Platform FastAPI application.""" +from contextlib import asynccontextmanager +from uuid import uuid4 + +from fastapi import FastAPI, Request +from fastapi.middleware.cors import CORSMiddleware +from fastapi.staticfiles import StaticFiles + +from app.api.v1 import auth, articles, feeds, health, settings +from app.api.v1.admin import locks +from app.core.config import settings +from app.core.database import close_db +from app.core.exceptions import add_exception_handlers +from app.core.logging import configure_logging, request_id_var +from app.core.redis import close_redis +from app.services.settings_service import apply_db_settings_to_config, init_default_settings + +configure_logging(settings.LOG_LEVEL) + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Application lifespan manager.""" + from app.core.database import AsyncSessionLocal + + app.state.startup_warnings = [] + + async with AsyncSessionLocal() as db: + await init_default_settings(db) + await apply_db_settings_to_config(db) + warnings = await _create_default_admin(db) + app.state.startup_warnings.extend(warnings) + + yield + + # Shutdown + await close_db() + await close_redis() + + +async def _create_default_admin(db) -> list[str]: + """Create default admin user if no users exist.""" + from sqlalchemy import select + + from app.core.auth import get_password_hash + from app.models.user import User + + warnings: list[str] = [] + result = await db.execute(select(User)) + if result.scalar_one_or_none(): + return warnings + + if ( + settings.DEFAULT_ADMIN_USERNAME == "admin" + and settings.DEFAULT_ADMIN_PASSWORD == "admin" + ): + warnings.append( + "Default admin credentials are admin/admin. 
Please change the password immediately." + ) + + admin = User( + username=settings.DEFAULT_ADMIN_USERNAME, + password_hash=get_password_hash(settings.DEFAULT_ADMIN_PASSWORD), + role="admin", + is_active=True, + ) + db.add(admin) + await db.commit() + return warnings + + +app = FastAPI( + title="RSS Platform", + description="模块化、工业化、AI 驱动的 RSS 信息处理平台", + version="0.1.0", + lifespan=lifespan, +) + +# CORS +cors_origins = settings.cors_origins +if not cors_origins: + # In production, CORS_ALLOWED_ORIGINS must be configured explicitly. + # Dev fallback uses the known frontend origin instead of wildcard. + cors_origins = ["http://localhost:5173"] + +app.add_middleware( + CORSMiddleware, + allow_origins=cors_origins, + allow_credentials=False, + allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"], + allow_headers=["Content-Type", "Authorization", "X-API-Key", "X-Request-ID"], +) + + +@app.middleware("http") +async def request_id_middleware(request: Request, call_next): + """Attach request_id from header or generate a new one for logging.""" + request_id = request.headers.get("X-Request-ID") or str(uuid4()) + token = request_id_var.set(request_id) + try: + response = await call_next(request) + response.headers["X-Request-ID"] = request_id + return response + finally: + request_id_var.reset(token) + + +# Exception handlers +add_exception_handlers(app) + +# API routers +app.include_router(auth.router, prefix="/api/v1") +app.include_router(feeds.router, prefix="/api/v1") +app.include_router(articles.router, prefix="/api/v1") +app.include_router(health.router, prefix="/api/v1") +app.include_router(settings.router, prefix="/api/v1") +app.include_router(locks.router, prefix="/api/v1/admin") + + +@app.get("/") +async def root(): + """Root endpoint.""" + return {"message": "RSS Platform API", "version": "0.1.0"} + + +# Static files (frontend build) +import os + +static_dir = os.path.join(os.path.dirname(__file__), "static") +if os.path.isdir(static_dir): + app.mount("/", 
StaticFiles(directory=static_dir, html=True), name="static") diff --git a/backend/pyproject.toml b/backend/pyproject.toml new file mode 100644 index 0000000..3c4a8c4 --- /dev/null +++ b/backend/pyproject.toml @@ -0,0 +1,70 @@ +[project] +name = "rss-platform" +version = "0.1.0" +description = "模块化、工业化、AI 驱动的 RSS 信息处理平台" +requires-python = ">=3.12" +dependencies = [ + "fastapi==0.115.0", + "uvicorn[standard]==0.30.0", + "sqlalchemy[asyncio]==2.0.31", + "asyncpg==0.29.0", + "alembic==1.13.2", + "psycopg2-binary==2.9.9", + "pydantic==2.8.2", + "pydantic-settings==2.3.4", + "python-jose[cryptography]==3.3.0", + "passlib[bcrypt]==1.7.4", + "cryptography==42.0.8", + "python-multipart==0.0.9", + "httpx==0.27.0", + "feedparser==6.0.11", + "beautifulsoup4==4.12.3", + "lxml==5.2.2", + "redis==5.0.7", + "celery==5.4.0", + "langdetect==1.0.9", + "prometheus-client==0.20.0", + "sentry-sdk==2.7.0", +] + +[project.optional-dependencies] +dev = [ + "pytest==8.2.2", + "pytest-asyncio==0.23.7", + "aiosqlite==0.20.0", + "ruff==0.5.0", + "black==24.4.2", +] + +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools] +py-modules = [] + +[tool.ruff] +line-length = 120 +target-version = "py312" + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "F", # Pyflakes + "I", # isort + "N", # pep8-naming + "W", # pycodestyle warnings +] + +[tool.ruff.format] +quote-style = "double" +indent-style = "space" + +[tool.pytest.ini_options] +asyncio_mode = "auto" +testpaths = ["tests"] +pythonpath = ["."] + +[tool.black] +line-length = 120 +target-version = ["py312"] diff --git a/backend/tests/__init__.py b/backend/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py new file mode 100644 index 0000000..e10c6a3 --- /dev/null +++ b/backend/tests/conftest.py @@ -0,0 +1,33 @@ +"""Test configuration.""" +import pytest +from sqlalchemy.ext.asyncio import AsyncSession, 
create_async_engine +from sqlalchemy.orm import sessionmaker + +from app.models.base import Base + + +@pytest.fixture(scope="function") +async def db(): + """Create a fresh in-memory SQLite database for each test.""" + engine = create_async_engine( + "sqlite+aiosqlite:///:memory:", + future=True, + echo=False, + ) + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.create_all) + + AsyncSessionLocal = sessionmaker( + engine, + class_=AsyncSession, + expire_on_commit=False, + autoflush=False, + autocommit=False, + ) + + async with AsyncSessionLocal() as session: + yield session + + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.drop_all) + await engine.dispose() diff --git a/backend/tests/test_auth.py b/backend/tests/test_auth.py new file mode 100644 index 0000000..f34b33d --- /dev/null +++ b/backend/tests/test_auth.py @@ -0,0 +1,36 @@ +"""Authentication tests.""" +import pytest +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.core.auth import get_password_hash, verify_password +from app.models.user import User + + +@pytest.mark.asyncio +async def test_password_hash(): + """Test password hashing and verification.""" + password = "testpassword" + hashed = get_password_hash(password) + assert verify_password(password, hashed) + assert not verify_password("wrongpassword", hashed) + + +@pytest.mark.asyncio +async def test_user_creation(db: AsyncSession): + """Test user creation.""" + user = User( + username="testuser", + password_hash=get_password_hash("testpass"), + role="member", + is_active=True, + ) + db.add(user) + await db.commit() + await db.refresh(user) + + result = await db.execute(select(User).where(User.username == "testuser")) + fetched = result.scalar_one_or_none() + assert fetched is not None + assert fetched.username == "testuser" + assert fetched.role == "member" diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..c43efa4 --- 
/dev/null +++ b/docker-compose.yml @@ -0,0 +1,100 @@ +services: + postgres: + image: ankane/pgvector:latest + container_name: rss-platform-postgres + environment: + POSTGRES_USER: rss + POSTGRES_PASSWORD: rss + POSTGRES_DB: rss_platform + volumes: + - postgres_data:/var/lib/postgresql/data + - ./docker/init-scripts:/docker-entrypoint-initdb.d:ro + ports: + - "${POSTGRES_PORT:-5432}:5432" + healthcheck: + test: ["CMD-SHELL", "pg_isready -U rss -d rss_platform"] + interval: 5s + timeout: 5s + retries: 5 + + redis: + image: redis:7-alpine + container_name: rss-platform-redis + volumes: + - redis_data:/data + ports: + - "${REDIS_PORT:-6379}:6379" + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 5s + retries: 5 + + minio: + image: minio/minio:latest + container_name: rss-platform-minio + command: server /data --console-address ":9001" + environment: + MINIO_ROOT_USER: ${MINIO_ACCESS_KEY:-minioadmin} + MINIO_ROOT_PASSWORD: ${MINIO_SECRET_KEY:-minioadmin} + volumes: + - minio_data:/data + ports: + - "${MINIO_API_PORT:-9000}:9000" + - "${MINIO_CONSOLE_PORT:-9001}:9001" + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + interval: 10s + timeout: 5s + retries: 5 + + backend: + build: + context: . 
+ dockerfile: docker/backend.Dockerfile + target: development + container_name: rss-platform-backend + env_file: + - .env + environment: + - DATABASE_URL=${DATABASE_URL:-postgresql+asyncpg://rss:rss@postgres:5432/rss_platform} + - REDIS_URL=${REDIS_URL:-redis://redis:6379/0} + volumes: + - ./backend:/app:cached + - platform_data:/app/data + ports: + - "${BACKEND_PORT:-8000}:8000" + depends_on: + postgres: + condition: service_healthy + redis: + condition: service_healthy + command: uvicorn main:app --host 0.0.0.0 --port 8000 --reload --log-level ${UVICORN_LOG_LEVEL:-info} # uvicorn accepts lowercase levels only; LOG_LEVEL=INFO from .env would be rejected + healthcheck: + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/api/v1/health')"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 10s + + frontend: + build: + context: ./frontend + dockerfile: ../docker/frontend.Dockerfile + container_name: rss-platform-frontend + volumes: + - ./frontend:/app:cached + - /app/node_modules + ports: + - "${FRONTEND_PORT:-5173}:5173" + environment: + - VITE_API_BASE_URL=http://localhost:${BACKEND_PORT:-8000}/api/v1 + command: npm run dev -- --host + depends_on: + - backend + +volumes: + postgres_data: + redis_data: + minio_data: + platform_data: diff --git a/docker/backend.Dockerfile b/docker/backend.Dockerfile new file mode 100644 index 0000000..8a9768e --- /dev/null +++ b/docker/backend.Dockerfile @@ -0,0 +1,76 @@ +# syntax=docker/dockerfile:1 + +# ---------- Builder stage ---------- +FROM python:3.12-slim AS builder + +WORKDIR /build + +# Install build dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc \ + libpq-dev \ + && rm -rf /var/lib/apt/lists/* + +# Copy dependency file +COPY backend/pyproject.toml ./ + +# Install dependencies into a virtual environment +RUN python -m venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -e . 
+ +# ---------- Development stage ---------- +FROM python:3.12-slim AS development + +WORKDIR /app + +# Create non-root user +RUN useradd --create-home --uid 1000 app && \ + mkdir -p /app/data && \ + chown -R app:app /app + +# Install runtime dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + libpq5 \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Copy virtual environment from builder +COPY --from=builder /opt/venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" + +# Switch to non-root user +USER app + +EXPOSE 8000 + +# ---------- Production stage ---------- +FROM python:3.12-slim AS production + +WORKDIR /app + +# Create non-root user +RUN useradd --create-home --uid 1000 app && \ + mkdir -p /app/data && \ + chown -R app:app /app + +# Install runtime dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + libpq5 \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Copy virtual environment from builder +COPY --from=builder /opt/venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" + +# Copy backend code +COPY --chown=app:app backend/ /app/ + +# Switch to non-root user +USER app + +EXPOSE 8000 + +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/docker/frontend.Dockerfile b/docker/frontend.Dockerfile new file mode 100644 index 0000000..5e74922 --- /dev/null +++ b/docker/frontend.Dockerfile @@ -0,0 +1,10 @@ +FROM node:20-alpine + +WORKDIR /app + +# Install dependencies for dev hot reload +RUN apk add --no-cache git + +EXPOSE 5173 + +CMD ["sh", "-c", "npm install && npm run dev -- --host"] diff --git a/docker/init-scripts/01-init-pgvector.sql b/docker/init-scripts/01-init-pgvector.sql new file mode 100644 index 0000000..e64d1b9 --- /dev/null +++ b/docker/init-scripts/01-init-pgvector.sql @@ -0,0 +1,2 @@ +CREATE EXTENSION IF NOT EXISTS vector; -- the pgvector project registers its extension under the name "vector" +CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; diff --git a/docs/code-review.md b/docs/code-review.md new file mode 100644 index 0000000..d369a6f --- 
/dev/null +++ b/docs/code-review.md @@ -0,0 +1,390 @@ +# RSS 平台代码审核报告 + +- 审核对象:`/home/congsh/workspace/dev/rssWorkFlow` +- 技术栈:FastAPI + Vue 3 + PostgreSQL + Redis + MinIO +- 审核范围:后端核心 / API / 模型 / 服务 / 前端 / 部署配置 +- 评级标准:P0(必须立即修复) / P1(生产前修复) / P2(建议优化) + +--- + +## 一、P0:严重缺陷(必须立即修复) + +### 1. 数据库会话资源泄露 + +`backend/app/api/deps.py:18-21` + +```python +async def get_db() -> AsyncSession: + async for session in _get_db(): + return session +``` + +`_get_db()` 是 `async generator`,正常依赖注入会通过 `yield` 让 FastAPI 接管 session 生命周期。但这里用 `async for` + `return`,`finally` 中的 `await session.close()` 永远不会被执行,session 一直被占用,连接池最终耗尽。 + +修复: + +```python +async def get_db(): + async for session in _get_db(): + yield session +``` + +或者直接复用 `database.py:23` 的 `get_db`,避免二次包装。 + +### 2. `/auth/register` 接口存在越权 + +`backend/app/api/v1/auth.py:16-40` + +- 任何未登录用户都可调用注册接口 +- `UserCreate.role` 未做限制,可直接传 `"admin"` 创建管理员账号 + +修复:在 `register` 加上 `current_user: User = Depends(get_current_admin)`,或对 `role` 字段做白名单(普通用户只能注册 `member`),并限制注册端点的访问(首次部署后关闭或改为邀请制)。 + +### 3. 默认管理员弱口令 + 凭证入仓 + +- `.env` 与 `.env.example` 内容完全一致,且包含 `SECRET_KEY=change-me-...` 和 `DEFAULT_ADMIN_PASSWORD=admin` +- `.env` 已存在本地仓库(虽然 `.gitignore` 写了 `.env`,但若曾误提交则泄露) +- `main.py:40-58` 启动时如果无用户则创建 `admin/admin`,且无强制改密流程 + +修复: +1. 在 `main.py:40-58` 增加「首次启动时检查默认密码,若仍为 `admin/admin` 则在 health 端点暴露 warning」 +2. 增加「首次登录强制修改密码」逻辑 +3. 仓库 `.env` 不可提交任何真实凭证;CI 校验 `SECRET_KEY` 不能等于占位值 + +### 4. JWT 设计与刷新机制缺失 + +`backend/app/core/auth.py` + +- 无 refresh token +- `ACCESS_TOKEN_EXPIRE_MINUTES=480`(8 小时)过长且无续签 +- payload 没有 `iat` / `jti`,无主动吊销机制(用户被禁后 token 仍可用 8 小时) +- payload 中带 `role` 字段(`auth.py:68`)后端已查 DB 重新加载,覆盖逻辑没问题,但字段冗余且易引发误解 + +修复:拆分 access (15min) + refresh (7d) + 引入 `jti` 维护吊销集合(Redis 黑名单)。 + +--- + +## 二、P1:安全风险(生产前需修复) + +### 5. CORS dev fallback 使用通配 + +`backend/main.py:78-85` + +```python +else: + app.add_middleware(CORSMiddleware, allow_origins=["*"], ...) 
+``` + +只要 `CORS_ALLOWED_ORIGINS` 为空就放开 `*`,生产环境如果忘记配置即全放开。`allow_credentials=False` 缓解了,但 `allow_methods=["*"]` + `allow_headers=["*"]` 仍过于宽松。 + +修复:开发态也强制要求配置 `CORS_ALLOWED_ORIGINS`,缺失时启动失败或显式 `WARNING`。 + +### 6. 密码强度弱 + +`backend/app/schemas/user.py:16` + +```python +password: str = Field(..., min_length=6, max_length=128) +``` + +6 位纯数字即可。 + +修复:最少 8/10 位 + 至少字母+数字的复杂度校验。 + +### 7. 缺少登录限流与审计 + +- 无失败次数限制(存在暴力破解风险) +- 无登录/关键操作审计日志 + +修复:基于 Redis 接入 slowapi 或自实现 IP+用户维度限流;关键操作(改密、改 admin、删除 feed)落审计表。 + +### 8. 敏感设置未加密落库 + +`backend/app/services/settings_service.py:105-133` + +非 admin 调用 `list_settings(mask_sensitive=True)` 时返回 `real_value=null`,逻辑正确。但 `OPENAI_API_KEY` 等敏感值在 `apply_db_settings_to_config` 写入内存时是明文——OK;问题是未在 settings_service 中对 `value` 做加密落库,DB 泄露即明文 API Key 泄露。 + +修复:使用 `cryptography.fernet` 加密存储敏感设置。 + +### 9. RSS Feed URL 缺少 SSRF 防护 + +`feed.py:15` `String(2048)` 看似够用,但 `HttpUrl` 在 `feed.py:8` 校验仅做 URL 格式校验,没有 `https://` 强制,没有 `SSRF` 防护(后端抓取时可能请求内网地址)。 + +修复:抓取时强制 `https`/`http`,并维护 IP 黑名单或代理出口策略。 + +--- + +## 三、P1:性能与正确性 + +### 10. 分页计数全量拉取 + +`backend/app/api/v1/feeds.py:39-40` 与 `articles.py:40-41` + +```python +count_result = await db.execute(select(Feed.id).select_from(query.subquery())) +total = len(count_result.scalars().all()) +``` + +把每条 ID 都取回 Python 再 `len()`,表大时是灾难。应该用 `func.count()`: + +```python +from sqlalchemy import func +count_query = select(func.count()).select_from(query.subquery()) +total = (await db.execute(count_query)).scalar_one() +``` + +### 11. 数据库连接池策略不当 + +`backend/app/core/database.py:7-12` + +```python +engine = create_async_engine(..., poolclass=NullPool) +``` + +`NullPool` 适合 serverless/单次执行,但 Docker 长驻服务中会每次请求都创建/销毁 Postgres 连接,TPS 高时延显著。应改为默认 `AsyncAdaptedQueuePool`(带 `pool_size` + `max_overflow`)。 + +### 12. 
lifespan 中 init_db 与 alembic 冲突 + +`backend/main.py:24` + +```python +await init_db() # Base.metadata.create_all +``` + +启动时用 `create_all` 自动建表,会绕过 alembic 迁移;表结构偏离后将无法再用 `alembic upgrade` 演进。 + +修复:移除 `init_db()` 调用,统一通过 `make migrate` 走 alembic。`init_default_settings` 也应在迁移脚本或独立 seed 任务中执行。 + +### 13. 数据库迁移命名不合规 + +`backend/alembic/versions/001_initial_schema.py` + +Alembic 推荐 `xxxxxx_initial_schema.py` 哈希前缀。缺少前缀会导致 `alembic history` 出现歧义、`autogenerate` 比较报错。 + +### 14. task_runtime Redis hash 字段类型不严谨 + +`app/services/task_runtime.py` + +`update_progress` 把 `current`/`total` 存为字符串,`get_progress` 再 `int()`。如果 key 在 `update_progress` 之前被外部写为 `int`,会被覆盖为字符串,行为不统一。建议明确约定。 + +--- + +## 四、P2:代码质量 + +### 15. get_current_user 双重读取 Authorization + +`backend/app/api/deps.py:30-37` + +`HTTPBearer(auto_error=False)` 已处理,又手动再 `request.headers.get("Authorization")`。是冗余逻辑,建议只保留一种。 + +### 16. rbac.has_permission 语义模糊 + +`backend/app/core/rbac.py:26-30` + +```python +def has_permission(user: User, required_role: Role) -> bool: + if user.role == Role.ADMIN: + return True + return user.role == required_role +``` + +返回 `True` 当用户是 admin,无论 `required_role` 是什么——这其实是「admin 拥有所有权限」的语义,但函数命名是「是否有某角色权限」,容易误用。需在 docstring 中明确:返回值等价于 `user.role == ADMIN or user.role == required_role`。 + +### 17. require_admin 与 get_current_admin 重复实现 + +`rbac.py:16-23` 与 `deps.py:80-87` 做同一件事,应统一从一处导入。 + +### 18. 缺少统一日志 request_id 注入 + +`backend/app/core/logging.py:6` 定义了 `request_id_var: ContextVar`,但没有 ASGI 中间件去赋值 `request_id`,所有日志的 `[%s]` 始终是空字符串,request_id 形同虚设。 + +修复:添加 ASGI middleware 从 header `X-Request-ID` 读取或生成 UUID,写入 `ContextVar`。 + +### 19. 健康检查端点无认证 + +`backend/app/api/v1/health.py` + +返回了 `db`、`redis` 状态字符串,可被外部用于探测内网拓扑。建议: + +- 基础 `/health` 仅返回 `ok/degraded` +- 详细诊断使用 `/health/db`、`/health/redis` 并在生产加 `Depends(get_current_admin)` + +### 20. 
前端 vite.config.ts 误用 process.env + +`frontend/vite.config.ts:17,22` + +```ts +target: process.env.VITE_API_BASE_URL || 'http://localhost:8000' +``` + +Vite 在 Node 端运行时 `process.env` 通常取不到 `VITE_*` 变量(这些只在客户端 `import.meta.env` 中存在)。应改用 `loadEnv`: + +```ts +import { defineConfig, loadEnv } from 'vite' +export default defineConfig(({ mode }) => { + const env = loadEnv(mode, process.cwd(), '') + return { server: { proxy: { '/api': { target: env.VITE_API_BASE_URL || 'http://localhost:8000' } } } } +}) +``` + +### 21. 前端路由守卫在 token 失效但 user 缓存存在时的中间态 + +`frontend/src/router/index.ts:38-57` + +`authStore.isAuthenticated` 要求 `token && user` 都有值。F5 刷新时:localStorage 有 token 但 Pinia 中 `user` 为 null——通过 `fetchUser()` 重新拉取,正常。 + +但如果 **user 已被禁用** 且 token 仍有效,则 `get_current_user` 抛 403,前端会 `authStore.logout()`。**如果服务器端 `is_active` 没及时同步**(集群场景),会出现 token 在内存有效、DB 无效的中间态。建议每次路由切换都强制重新校验。 + +### 22. 前端 token 存 localStorage + +`frontend/src/stores/auth.ts:7,19` + +XSS 风险:恶意脚本可读 `localStorage.getItem('token')`。生产建议:access token 走 httpOnly cookie,或限制 token 权限到只读。 + +### 23. axios 拦截器 401 硬跳页 + +`frontend/src/api/index.ts:29-33` + +401 时直接 `window.location.href = '/login'`,硬刷路由。如果某些请求是登录后业务请求(如 fetchMe 失败),会突然跳页而不是返回错误。考虑在 store 层处理跳转。 + +### 24. Dashboard 拉 1000 条数据统计 + +`frontend/src/views/DashboardView.vue:46` + +```ts +const res = await feedsApi.list({ limit: 1000 }) +``` + +后端 `PaginationParams.limit` 没有上限,前端传多少就返回多少。数据大时慢。应该让后端提供专用统计接口。 + +### 25. 前端无全局错误边界 + +没有 `app.config.errorHandler`、没有 `<ErrorBoundary>` 组件,单个组件报错会白屏。 + +修复: + +```ts +app.config.errorHandler = (err, instance, info) => { + console.error('Unhandled error:', err, info) + // 接入 Sentry 等 +} +``` + +### 26. 分页参数未做边界校验 + +`feeds.py` 等处的 `PaginationParams.skip/limit` 无 max 限制,恶意请求 `limit=99999999` 会拉全表。 + +修复:`limit: int = Field(50, ge=1, le=200)`。 + +### 27. 
Dockerfile 细节 + +- `backend.Dockerfile` `pip install -e .` 但 production 阶段又 `COPY backend/`,无 `.dockerignore` 容易把 .env/数据卷带进去 +- `frontend.Dockerfile` 用 `npm install` 应改 `npm ci` 以保证 lock 一致 +- 无 production 多阶段构建(目前 `Dockerfile` 有 production stage 但 `docker-compose.yml` 只用 `development`) + +### 28. tests/conftest.py 自定义 event_loop 已弃用 + +`backend/tests/conftest.py:11-15` + +```python +@pytest.fixture(scope="session") +def event_loop(): + loop = asyncio.get_event_loop_policy().new_event_loop() + ... +``` + +pytest-asyncio 0.23+ 已标记为 deprecated,改为 `asyncio_mode = "auto"` + `event_loop_policy` fixture 或升级到 `pytest-asyncio` 0.23+ 推荐做法。 + +### 29. 测试覆盖率几乎为零 + +仅 2 个测试(密码哈希 + 用户创建),没有 API endpoint 集成测试、没有任何前端测试。CI 缺失。 + +建议:补 FastAPI `TestClient` 集成测试 + 至少 RBAC 权限矩阵测试;前端至少加 Vitest 单测覆盖 stores + 一个 E2E(Playwright)。 + +### 30. 文档缺失 + +`docs/` 目录为空,README 引用的「设计文档」「开发步骤」路径指向 `/home/congsh/workspace/chat/`,仓内 `docs/` 没文件——文档随项目仓库丢失风险。 + +修复:将 `/home/congsh/workspace/chat/rss-platform-design.md` 与 `rss-platform-dev-plan.md` 移到 `docs/` 下并更新 README 引用路径。 + +--- + +## 五、修复优先级汇总 + +| 优先级 | 文件 | 问题 | 建议改动 | +|--------|------|------|----------| +| P0 | `backend/app/api/deps.py:18-21` | DB session 泄露 | 改为 `async for ... 
yield session` 或直接复用 `database.get_db` | +| P0 | `backend/app/api/v1/auth.py:16-40` | 注册越权 | 加 admin 鉴权 + role 白名单 | +| P0 | `backend/main.py:40-58` | 默认弱口令 | 首次启动检查、强制改密 | +| P0 | `backend/app/core/auth.py` | 无 refresh / jti | 拆分 access/refresh + Redis 黑名单 | +| P1 | `backend/main.py:78-85` | CORS 通配 fallback | 移除或启动 fail-fast | +| P1 | `backend/app/schemas/user.py:16` | 密码长度 6 | 改为 10 + 复杂度校验 | +| P1 | `backend/app/api/v1/feeds.py:39` & `articles.py:40` | `len()` 计数 | 改 `func.count()` | +| P1 | `backend/app/core/database.py:7-12` | NullPool | 改默认连接池 | +| P1 | `backend/main.py:24` | `init_db` 绕过 alembic | 删除 init_db,统一 alembic | +| P2 | `frontend/vite.config.ts:17,22` | `process.env` 错用 | 改 `loadEnv` | +| P2 | `backend/app/core/logging.py` | request_id 形同虚设 | 加 ASGI 中间件赋值 | +| P2 | `frontend/src/stores/auth.ts` | localStorage 存 token | 改 httpOnly cookie 或加 CSP | +| P2 | 整体 | 缺少测试与 CI | 补 pytest + Vitest + Playwright + GH Actions | + +--- + +## 六、值得肯定的点 + +- 整体结构清晰,模块边界(core / models / schemas / services / api)划分合理 +- 模型层 `UUIDMixin` + `TimestampMixin` 抽象良好 +- Alembic + Pydantic Settings + async SQLAlchemy 2.0 类型注解到位 +- 前端 Pinia + 路由守卫 + axios 拦截器规范 +- 用 Pydantic 的 `HttpUrl`、`Field(ge/le)`、`min_length` 在请求侧就校验 +- 使用 `lifespan` 替代 deprecated 的 `on_event` + +--- + +## 七、修复记录(2026-06-15) + +本次根据本报告对 `rssWorkFlow` 代码进行了选择性修复,覆盖 P0 / P1 / 部分 P2 项。 + +### 已修复 + +| 优先级 | 问题 | 修复文件 | 修复内容 | +|--------|------|----------|----------| +| P0 | DB session 资源泄露 | `backend/app/api/deps.py` | `get_db` 改为 `async for ... 
yield session`,由 FastAPI 管理生命周期 | +| P0 | `/auth/register` 越权 | `backend/app/api/v1/auth.py` | 注册接口改为仅 admin 可调用;schema 限制 role 白名单 | +| P0 | 默认管理员弱口令 | `backend/main.py`, `.env.example`, `backend/app/api/v1/health.py` | 启动时检测 admin/admin 并在 `/health` 返回 warning;文档增加安全提示 | +| P0 | JWT 无 refresh/jti | `backend/app/core/auth.py`, `backend/app/api/v1/auth.py`, `backend/app/schemas/user.py`, `backend/app/core/config.py` | access/refresh 双 token;token 携带 jti/type/iat;支持 Redis 黑名单吊销 | +| P1 | CORS dev fallback 通配 | `backend/main.py` | 移除 `*` fallback,默认使用 `http://localhost:5173`;生产必须显式配置 | +| P1 | 密码强度不足 | `backend/app/schemas/user.py` | 密码要求 8-128 位且至少包含字母和数字 | +| P1 | 分页计数全量拉取 | `backend/app/api/v1/feeds.py`, `backend/app/api/v1/articles.py` | 改用 `select(func.count())` | +| P1 | 数据库连接池策略 | `backend/app/core/database.py` | 移除 `NullPool`,改用 `AsyncAdaptedQueuePool`(pool_size=10, max_overflow=20) | +| P1 | lifespan 中 init_db 绕过 alembic | `backend/main.py`, `backend/app/core/database.py` | 移除 `init_db()` 调用与函数定义,统一由 alembic 管理 schema | +| P1 | 敏感设置未加密 | `backend/app/services/settings_service.py`, `backend/app/core/config.py`, `backend/pyproject.toml` | 使用 Fernet 加密敏感配置项(`OPENAI_API_KEY`、`API_TOKEN`),明文/加密兼容 | +| P1 | 健康检查暴露拓扑 | `backend/app/api/v1/health.py` | `/health/db`、`/health/redis` 增加 admin 鉴权 | +| P2 | vite.config.ts 误用 process.env | `frontend/vite.config.ts` | 改用 `loadEnv` 读取环境变量 | +| P2 | request_id 未注入 | `backend/app/core/logging.py`, `backend/main.py` | 新增 ASGI middleware,从 header 读取或生成 request_id | +| P2 | pytest event_loop 弃用 | `backend/tests/conftest.py` | 移除自定义 `event_loop` fixture,依赖 `asyncio_mode = "auto"` | +| P2 | 前端无错误边界 | `frontend/src/main.ts` | 添加 `app.config.errorHandler` | +| P2 | 分页参数无上限 | `backend/app/schemas/common.py` | `skip >= 0`、`1 <= limit <= 200` | +| P2 | 文档未随仓库管理 | `docs/design.md`, `docs/dev-plan.md`, `README.md` | 将设计文档与开发步骤复制到 `docs/`,README 引用更新 | +| P2 | 前端 token 无刷新 | `frontend/src/stores/auth.ts`, `frontend/src/api/index.ts`, 
`frontend/src/api/auth.ts`, `frontend/src/types/index.ts` | 存储 refresh_token;401 时自动刷新,失败再跳转登录 | + +### 未修复 / 留待后续 + +- 登录限流与审计日志(P1):建议后续接入 slowapi 或自实现 Redis 限流,并新增 `audit_logs` 表。 +- Dockerfile 多阶段与 `.dockerignore` 优化(P2):当前阶段先用 dev 构建,production 镜像与构建 CI 留待部署阶段完善。 +- 测试覆盖率与 CI(P2):已留好 pytest / Vitest / Playwright 接入点,后续阶段补充集成测试与 GitHub Actions。 +- SSRF 防护(P1):RSS 抓取任务尚未实现;待后续实现抓取模块时,需同步加入 URL 校验、IP 黑名单、代理出口策略。 + +### 验证 + +- 后端全部 Python 文件 `py_compile` 通过 +- `init_db` 已无引用 +- 前端 TypeScript 文件已完成同步修改(因无 node_modules,未运行 `tsc`) + +### 运行建议 + +1. `cd /home/congsh/workspace/dev/rssWorkFlow` +2. `cp .env.example .env` 并修改 `SECRET_KEY`、默认管理员密码、生成 `SETTINGS_ENCRYPTION_KEY` +3. `make dev` +4. `docker-compose exec backend alembic upgrade head` +5. 访问前端登录,验证 `/health` 无安全警告 +- Dockerfile 多阶段构建思路正确 diff --git a/docs/design.md b/docs/design.md new file mode 100644 index 0000000..80a6a6d --- /dev/null +++ b/docs/design.md @@ -0,0 +1,1120 @@ +# RSS 信息处理平台:完整设计文档 + +> 版本:v1.0 +> 日期:2026-06-15 +> 基于仓库:dataClean (778ccfb) + rssKeeper (4286731) + +--- + +## 一、项目背景与目标 + +### 1.1 背景 + +现有两个项目: + +- **rssKeeper**:RSS 抓取与原始文章管理服务,负责 RSS 源管理、定时抓取、文章存储、全文检索(FTS5)、健康度监控。 +- **dataClean**:rssKeeper 的下游清洗服务,负责 AI 摘要、分类/标签/打分、URL+内容去重、每日简报生成。 + +两者已通过 `/api/v1/external/*` 接口松耦合,dataClean 只读调用 rssKeeper。当前部署为单容器 + SQLite,适合小数据量和个人使用。 + +### 1.2 新平台目标 + +构建一个 **模块化、工业化、AI 驱动** 的 RSS 信息处理平台,统一承接 rssKeeper + dataClean 的能力,并扩展: + +- 聊天式 AI 知识产出(带引用链接) +- Skill 化产出编排(日报、聊天、自定义任务) +- AI 自优化 Prompt / 去重算法 +- 多供应商/多模型 AI 配置 +- 团队级简单鉴权与锁机制 +- 面向 30万~100万 文章量的可扩展架构 + +### 1.3 设计原则 + +| 原则 | 说明 | +|------|------| +| **模块化** | 抓取、清洗、AI 处理、检索、聊天、自优化、Skills 均为独立模块,接口契约清晰 | +| **可插拔** | 去重算法以外挂文件形式存在,Skills 可导入/覆盖/恢复默认 | +| **AI 原生** | 分类、Tag、摘要、打分、日报、聊天、自优化均由 AI 驱动,规则仅作兜底 | +| **配置即代码** | Prompt、AI 配置、Skills、去重算法均版本化管理,变更留痕 | +| **工业化** | Docker 部署、日志/监控、任务锁、失败重试、数据分区预留 | +| **渐进扩展** | 初期单库单服务,预留分库分表、独立检索、向量库扩展路径 | + +--- + +## 二、现状分析 + +### 2.1 rssKeeper 现状 + +**优势:** +- FastAPI + SQLAlchemy + APScheduler 技术栈成熟 
+- RSS 源管理完整:增删改查、OPML 导入导出、自动发现 +- 文章抓取并发(ThreadPoolExecutor) +- 健康度监控与抓取日志 +- SQLite FTS5 全文搜索 +- Docker 部署就绪 + +**问题:** +- 无鉴权/限流,CORS 在部分版本宽松 +- 时区处理不一致(`datetime.utcnow()` 混用) +- 添加源时同步抓取会阻塞 HTTP +- 导入 OPML 接口 body/query 不一致(已知 bug) +- 无测试、无 CI +- SQLite 单库,百万级文章需迁移 + +### 2.2 dataClean 现状 + +**优势:** +- 模块划分清晰:`summarizer`、`tagger`、`scorer`、`deduplicator`、`brief`、`taxonomy` +- AI 调用封装 `AIClient` 支持运行时配置覆盖 +- 配置双层管理:环境变量 + 数据库 +- 任务进度可视化 +- 去重算法已考虑 URL + 标题 + 内容相似度 +- Docker 部署就绪 + +**问题:** +- 去重算法 O(n²) + SequenceMatcher,数据量大时性能差 +- 去重历史数据有破坏风险(近期版本已修复为按日期清空) +- 分类/标签基于规则,未充分发挥 AI +- 仅支持单一 LLM 配置 +- 无 Skill 概念,日报格式固定 +- 无自优化机制 +- 无聊天/产出能力 +- 无引用链接的 AI 产出 + +### 2.3 可复用资产 + +| 资产 | 来源 | 复用方式 | +|------|------|----------| +| RSS 抓取逻辑 | rssKeeper `rss_fetcher.py` | 迁移至 `feeds/fetcher.py`,解耦数据库操作 | +| Feed 模型与健康度 | rssKeeper `models.py` | 复用并扩展字段 | +| FTS5 搜索 | rssKeeper `fulltext_search.py` | 作为检索 V1,后续抽象接口 | +| 外部 API 设计 | rssKeeper `external_api.py` | 演变为平台内部检索/文章接口 | +| AI 客户端 | dataClean `app/ai_client.py` | 扩展为多 Provider 路由 | +| 去重算法 | dataClean `app/deduplicator.py` | 改造为插件接口,保留默认实现 | +| 任务进度 | dataClean `app/task_progress.py` | 复用并扩展为任务运行时 | +| 配置管理 | dataClean `app/settings_manager.py` | 复用并扩展为配置中心 | +| 简报生成 | dataClean `app/brief.py` | 改造为 Skill 驱动 | +| Taxonomy | dataClean `app/taxonomy.py` | 演变为 Skill + AI 分类体系 | + +--- + +## 三、总体架构 + +### 3.1 服务边界 + +采用 **单体模块化架构(Modular Monolith)**,而非微服务。原因: +- 当前团队规模小,单体降低运维复杂度 +- 模块间接口清晰,未来可拆分为独立服务 +- 350 源 / 日增 1000+ 的规模单体完全可承受 + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ 前端应用层 (Web UI) │ +│ RSS 源管理 │ 文章库 │ 检索 │ 日报任务 │ AI 聊天 │ Skills 管理 │ 配置 │ +└────────────────────────────────┬────────────────────────────────────┘ + │ +┌────────────────────────────────▼────────────────────────────────────┐ +│ API Gateway / FastAPI │ +│ JWT 鉴权 │ 限流 │ 路由 │ 锁服务 │ Skills 注册中心 │ +└────────────────────────────────┬────────────────────────────────────┘ + │ 
+┌────────────────────────────────▼────────────────────────────────────┐ +│ 核心业务模块 │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌─────────┐ │ +│ │ 抓取调度模块 │ │ 清洗流水线 │ │ AI 处理中心 │ │ 检索服务 │ │ +│ │ Feeds │ │ Pipeline │ │ Processor │ │ Search │ │ +│ └──────────────┘ │ - 去重插件 │ │ - 分类 │ └─────────┘ │ +│ │ - 数据校验 │ │ - Tag │ │ +│ └──────┬───────┘ │ - 摘要 │ │ +│ │ │ - 打分 │ │ +│ ┌──────▼──────────┼──┴───────────┤ │ +│ │ 任务调度器 │ │ │ +│ │ Celery + Redis │ │ │ +│ └─────────────────┘ │ │ +│ ┌──────────────────────────────────────────────────────────────┐ │ +│ │ 自优化调度器 │ │ +│ │ 每日 02:00:Prompt/Skills 自优化 + 去重算法自优化 + 留痕 │ │ +│ └──────────────────────────────────────────────────────────────┘ │ +│ ┌──────────────────────────────────────────────────────────────┐ │ +│ │ 聊天与产出引擎 │ │ +│ │ 对话上下文 + Skills 调用 + 检索工具 + 引用链接渲染 │ │ +│ └──────────────────────────────────────────────────────────────┘ │ +└────────────────────────────────┬────────────────────────────────────┘ + │ +┌────────────────────────────────▼────────────────────────────────────┐ +│ 数据与存储层 │ +│ PostgreSQL 16 + pgvector │ Redis │ 对象存储 (MinIO) │ 日志/监控 │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +### 3.2 技术栈 + +| 层级 | 选型 | 理由 | +|------|------|------| +| 后端框架 | Python 3.12 + FastAPI | AI 生态最全,异步性能好,Pydantic 适合做配置/schema | +| 任务队列 | Celery + Redis | 成熟稳定,支持定时任务、任务锁、进度追踪 | +| 数据库 | PostgreSQL 16 + pgvector | 关系数据 + 向量检索一体,支持分区表 | +| 全文检索 | PostgreSQL tsvector (初期) → OpenSearch (扩展) | 初期降低复杂度,抽象检索接口便于切换 | +| 对象存储 | MinIO / 云 OSS | 存原始 HTML、图片、导出产物 | +| 前端 | Vue 3 + Element Plus + TypeScript | 团队后台管理 + 聊天界面 | +| AI 调用 | LiteLLM + 自封装 Provider 路由 | 统一接口支持多供应商、多模型、Fallback | +| 缓存/锁 | Redis | 分布式锁、任务状态、会话缓存 | +| 监控 | Prometheus + Grafana + Sentry | 工业化可观测性 | +| 部署 | Docker + Docker Compose | 用户指定 Docker 化 | + +--- + +## 四、模块详细设计 + +### 4.1 抓取调度模块(Feeds) + +**职责:** 管理 RSS 源、定时抓取、解析、原始文章入库。 + +**设计要点:** +- 每个 Feed 独立配置:URL、抓取频率、优先级、解析规则、代理策略、启用状态 +- 调度器从 APScheduler 迁移到 Celery 
Beat,支持任务持久化和水平扩展 +- 抓取 Worker 与主服务分离,未来可独立扩容 +- 原始文章内容存入对象存储,元数据入 `raw_articles` 表 +- 失败重试 + 死信队列 + 健康度统计 + +**核心模型:** +```python +class Feed: + id: int + url: str + title: str + description: str + category: str # 源预设分类 + is_active: bool + fetch_interval_minutes: int + priority: int # 抓取优先级 + parser_config: dict # 自定义解析规则 + proxy_policy: str # auto / direct / proxy + last_fetch_at: datetime + last_fetch_status: str + last_error: str + error_type: str + success_count: int + fail_count: int + article_count: int + health_status: str +``` + +**抓取流程:** +``` +Celery Beat 触发 → 按优先级/间隔筛选 Feed → 分发 fetch_feed_task +→ 下载 RSS → feedparser 解析 → clean_html → 生成 summary +→ 按 URL 去重 → 批量写入 raw_articles → 更新 Feed 统计 +``` + +### 4.2 清洗流水线(Pipeline) + +**职责:** 对原始文章进行标准化、去重、校验,产出清洗后的文章。 + +**流水线阶段:** + +``` +raw_articles + → normalize(URL 规范化、编码统一、时间标准化) + → extract(正文提取,若 RSS 内容不全则 fetch 原文页) + → dedup_exact(URL/标题精确去重) + → dedup_similar(调用外挂算法,相似度去重) + → enrich(补充语言、字数、内容 hash 等元数据) + → cleaned_articles +``` + +**去重插件接口(核心):** + +```python +# plugins/deduplication/current.py +from dataclasses import dataclass +from typing import List, Dict, Set + +@dataclass +class DedupInput: + article_id: str + title: str + link: str + content: str + content_length: int + published_at: str + feed_id: str + +@dataclass +class DuplicateGroup: + representative_id: str # 保留的主文章 ID + member_ids: List[str] # 重复文章 IDs + reason: str # 去重原因:url_exact / title_exact / content_similar + similarity_scores: Dict[str, float] + +class DeduplicationPlugin: + name: str = "default_tfidf" + version: str = "1.0.0" + + def find_duplicates(self, articles: List[DedupInput]) -> List[DuplicateGroup]: + """ + 输入:待去重文章列表 + 输出:重复组列表,每组指定 representative 和 members + 规则: + 1. URL 完全相同 → 直接归为一组 + 2. 标题完全相同 → 归为一组 + 3. 内容相似度 >= threshold → 归为一组,保留 content_length 最大的 + """ + ... 
+``` + +**引用关系:** +- 主文章保留完整内容,生成 `cleaned_article` 记录 +- 重复文章不丢弃,而是在 `article_references` 表中记录: + - `source_article_id`:主文章 + - `referenced_article_id`:重复文章 + - `reference_type`:`duplicate_url` / `duplicate_title` / `duplicate_content` + - `link`:重复文章原始链接 + - `similarity`:相似度分数 +- 主文章的 `reference_links` JSON 字段聚合所有引用链接,便于前端/AI 使用 + +### 4.3 AI 处理中心(AI Processor) + +**职责:** 管理所有 AI 任务,支持多 Provider/多模型/多配置。 + +**AI 任务类型:** + +| 任务 | 输入 | 输出 | 默认模型 | +|------|------|------|----------| +| `summarize` | 标题+正文 | AI 摘要 | 轻量模型 | +| `classify` | 标题+摘要+正文 | category + 置信度 | 推理模型 | +| `tag` | 标题+摘要+正文 | tags[] | 轻量模型 | +| `score` | 标题+摘要+正文+元数据 | heat/importance/composite | 推理模型 | +| `daily_brief` | 当日高分文章 + Skill | Markdown/JSON 日报 | 强模型 | +| `chat` | 对话上下文 + 检索结果 + Skill | 带引用链接的回答 | 强模型 | +| `optimize` | 当前 Prompt/算法 + 历史样例 | 优化建议/新 Prompt | 强模型 | + +**AI 配置模型:** + +```python +class AIProviderConfig: + id: str + name: str + provider: str # openai / anthropic / gemini / local + base_url: str + api_key: str # 加密存储 + default_model: str + timeout: int + max_retries: int + rate_limit_rpm: int + is_active: bool + +class AITaskConfig: + id: str + task_type: str # summarize / classify / tag / score / daily_brief / chat / optimize + name: str + provider_config_id: str + model: str + skill_id: str # 绑定的 Skill + temperature: float + max_tokens: int + top_p: float + system_prompt_override: str | None + enabled: bool +``` + +**多供应商路由:** +- 使用 LiteLLM 统一封装,或自实现 Provider 适配器 +- 每个任务独立配置,支持复制配置(Clone) +- 支持 Fallback:主模型失败时自动切换到备用模型 + +### 4.4 Skills 系统 + +**定义:** Skill = 指导 AI 产出的结构化配置,包括: +- 系统提示词(system prompt) +- 输出模板(output template/schema) +- 可用工具集(tools) +- 输入参数 schema +- 版本与默认值标记 + +**Skill 类型:** + +| 类型 | 用途 | 示例 | +|------|------|------| +| `output` | 产出型 Skill,直接生成内容 | 日报、行业观察、摘要 | +| `tool` | 工具型 Skill,AI 可调用的能力 | 搜索文章、获取文章详情、调用外部 API | +| `agent` | 复合型 Skill,组合多个工具 | 先搜索再总结再生成报告 | + +**Skill 模型:** + +```python +class Skill: + id: str + name: str + slug: str + description: str 
+ type: str # output / tool / agent + version: int + is_default: bool # 是否系统默认,不可删除 + system_prompt: str + output_schema: dict # JSON Schema + tools: List[str] # 可调用的 tool_id 列表 + input_schema: dict # 输入参数 JSON Schema + example_inputs: List[dict] + created_by: str + created_at: datetime + updated_at: datetime +``` + +**Skill 管理:** +- 内置默认 Skills(不可删除,可恢复默认) +- 用户可修改、新增、导入、导出 +- 导入时支持覆盖(overwrite)或新建版本 +- 修改后自动生效(无重启) + +**默认内置 Skills:** + +| Skill | 用途 | +|-------|------| +| `daily-brief-default` | 默认日报生成 | +| `daily-tech-watch` | 科技观察日报 | +| `chat-researcher` | 研究型聊天助手 | +| `chat-summarizer` | 摘要型聊天助手 | +| `article-classifier` | 文章分类 | +| `article-tagger` | 文章打标签 | +| `article-scorer` | 文章打分 | +| `article-summarizer` | 文章摘要 | +| `search-articles` | 检索文章工具 | +| `fetch-external-api` | 调用外部 API 工具 | + +### 4.5 日报任务系统 + +**核心概念:** 日报 = 任务 + Skill + 数据筛选 + 定时/手动触发 + +```python +class OutputTask: + id: str + name: str + task_type: str # daily_brief / chat / custom + skill_id: str # 调用哪个 Skill + schedule: str # cron 表达式,为空则仅手动 + filter_config: dict # 数据筛选条件 + output_config: dict # 输出方式:web / email / file / webhook + is_active: bool + last_run_at: datetime + last_output_id: str +``` + +**数据筛选配置示例:** +```json +{ + "time_range": "last_24h", + "categories": ["科技", "AI"], + "min_composite_score": 70, + "tags": ["大模型"], + "exclude_duplicates": true, + "top_n": 50 +} +``` + +**新增日报流程:** +1. 用户在 UI 创建 OutputTask +2. 选择 Skill(可复用已有或新建) +3. 配置筛选条件 +4. 配置定时/手动触发 +5. 
系统按 cron 自动执行,或用户手动触发 + +### 4.6 聊天与产出引擎 + +**聊天模型:** + +```python +class ChatSession: + id: str + user_id: str + title: str + skill_id: str # 当前会话使用的默认 Skill + context_messages: List[dict] + created_at: datetime + updated_at: datetime + +class ChatMessage: + id: str + session_id: str + role: str # user / assistant / tool + content: str + tool_calls: List[dict] # AI 调用的工具 + tool_results: List[dict] # 工具返回结果 + references: List[dict] # 引用文章链接 + created_at: datetime +``` + +**聊天流程:** +``` +用户输入 + → 意图识别(由 Skill 系统提示词决定) + → 如需检索:调用 search-articles tool + → 如需外部数据:调用 fetch-external-api tool + → 组装上下文(含检索结果/工具返回) + → 调用 LLM 生成回答 + → 解析引用标记,映射为文章链接 + → 返回带引用链接的 Markdown +``` + +**引用链接规范:** +- AI 输出中使用 `[^1^]`, `[^2^]` 等标记 +- 后端解析标记,从 `references` 中生成真实链接 +- 前端渲染为可点击脚注/卡片 + +示例输出: +```markdown +今日 AI 领域最重要的消息是 OpenAI 发布了新模型 [^1^], +同时 Google 也公布了相关进展 [^2^]。 + +[^1^]: [OpenAI 官方公告](https://openai.com/...) +[^2^]: [Google Blog](https://blog.google/...) +``` + +### 4.7 检索服务 + +**检索维度:** + +| 类型 | 实现 | 说明 | +|------|------|------| +| 全文检索 | PostgreSQL tsvector / OpenSearch | 标题、正文、摘要 | +| 语义检索 | pgvector / 独立向量库 | 基于 Embedding 的相似文章 | +| 元数据过滤 | SQL | 分类、Tag、分数、时间、Feed、来源 | +| 混合检索 | 全文 + 语义 + 元数据 | 未来扩展 | + +**检索接口抽象:** + +```python +class SearchEngine(ABC): + @abstractmethod + def search(self, query: SearchQuery) -> SearchResult: + ... + +class PostgresSearchEngine(SearchEngine): ... +class OpenSearchEngine(SearchEngine): ... 
+``` + +**SearchQuery 模型:** +```python +class SearchQuery: + q: str | None + semantic_q: str | None + category: str | None + tags: List[str] + feed_id: str | None + min_score: float | None + since: datetime + until: datetime + sort_by: str = "published_at_desc" + limit: int = 50 + offset: int = 0 +``` + +--- + +## 五、数据模型设计 + +### 5.1 核心实体关系 + +``` +users + └── chat_sessions + └── chat_messages + +feeds + └── raw_articles + └── cleaned_articles + ├── article_references (duplicate links) + ├── article_embeddings + └── article_versions + +skills + └── skill_versions + +ai_task_configs + └── ai_provider_configs + +output_tasks + └── outputs + +optimization_logs +``` + +### 5.2 表结构 + +#### users(用户) +```sql +CREATE TABLE users ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + username VARCHAR(64) UNIQUE NOT NULL, + password_hash VARCHAR(255) NOT NULL, + role VARCHAR(32) DEFAULT 'member', -- admin / member + is_active BOOLEAN DEFAULT TRUE, + created_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW() +); +``` + +#### feeds(RSS 源) +```sql +CREATE TABLE feeds ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + url VARCHAR(2048) UNIQUE NOT NULL, + title VARCHAR(512), + description TEXT, + category VARCHAR(128), + is_active BOOLEAN DEFAULT TRUE, + fetch_interval_minutes INT DEFAULT 60, + priority INT DEFAULT 5, + parser_config JSONB DEFAULT '{}', + proxy_policy VARCHAR(32) DEFAULT 'auto', + last_fetch_at TIMESTAMPTZ, + last_fetch_status VARCHAR(32), + last_error TEXT, + error_type VARCHAR(32), + success_count INT DEFAULT 0, + fail_count INT DEFAULT 0, + article_count INT DEFAULT 0, + created_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW() +); +``` + +#### raw_articles(原始文章) +```sql +CREATE TABLE raw_articles ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + feed_id UUID NOT NULL REFERENCES feeds(id) ON DELETE CASCADE, + external_id VARCHAR(255), -- 源系统 ID(如 rssKeeper article id) + title VARCHAR(1024), + link VARCHAR(2048) NOT 
NULL, + author VARCHAR(256), + published_at TIMESTAMPTZ, + fetched_at TIMESTAMPTZ DEFAULT NOW(), + content TEXT, + summary TEXT, + raw_html TEXT, -- 原始 HTML,存对象存储或分表 + content_hash VARCHAR(64), + language VARCHAR(16), + status VARCHAR(32) DEFAULT 'pending', -- pending / processed / failed + created_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW() +) PARTITION BY RANGE (fetched_at); +``` + +#### cleaned_articles(清洗后文章) +```sql +CREATE TABLE cleaned_articles ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + raw_article_id UUID REFERENCES raw_articles(id), + feed_id UUID NOT NULL REFERENCES feeds(id), + title VARCHAR(1024), + link VARCHAR(2048) NOT NULL, + author VARCHAR(256), + feed_title VARCHAR(512), + feed_category VARCHAR(128), + published_at TIMESTAMPTZ, + fetched_at TIMESTAMPTZ, + content TEXT, + content_length INT DEFAULT 0, + original_summary TEXT, + ai_summary TEXT, + category VARCHAR(128), + tags JSONB DEFAULT '[]', + heat_score FLOAT DEFAULT 0, + importance_score FLOAT DEFAULT 0, + duplication_score FLOAT DEFAULT 0, + composite_score FLOAT DEFAULT 0, + duplicate_group_id UUID, + is_representative BOOLEAN DEFAULT TRUE, + reference_links JSONB DEFAULT '[]', -- 重复文章引用链接 + processing_status VARCHAR(32) DEFAULT 'pending', + created_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW() +) PARTITION BY RANGE (fetched_at); +``` + +#### article_references(文章引用关系) +```sql +CREATE TABLE article_references ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + source_article_id UUID NOT NULL REFERENCES cleaned_articles(id), + referenced_article_id UUID REFERENCES cleaned_articles(id), + reference_type VARCHAR(64), -- duplicate_url / duplicate_title / duplicate_content + reference_link VARCHAR(2048), + reference_title VARCHAR(1024), + similarity FLOAT, + created_at TIMESTAMPTZ DEFAULT NOW() +); +``` + +#### duplicate_groups(重复组) +```sql +CREATE TABLE duplicate_groups ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + 
representative_article_id UUID REFERENCES cleaned_articles(id), + member_article_ids JSONB DEFAULT '[]', + similarity_matrix JSONB DEFAULT '{}', + brief_date DATE, + created_at TIMESTAMPTZ DEFAULT NOW() +); +``` + +#### skills(技能库) +```sql +CREATE TABLE skills ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + name VARCHAR(128) NOT NULL, + slug VARCHAR(128) UNIQUE NOT NULL, + description TEXT, + type VARCHAR(32) NOT NULL, -- output / tool / agent + version INT DEFAULT 1, + is_default BOOLEAN DEFAULT FALSE, + system_prompt TEXT NOT NULL, + output_schema JSONB, + tools JSONB DEFAULT '[]', + input_schema JSONB, + example_inputs JSONB DEFAULT '[]', + created_by VARCHAR(64), + created_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW() +); +``` + +#### ai_provider_configs(AI 供应商配置) +```sql +CREATE TABLE ai_provider_configs ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + name VARCHAR(128) NOT NULL, + provider VARCHAR(64) NOT NULL, -- openai / anthropic / gemini / local + base_url VARCHAR(512), + api_key_encrypted TEXT, + default_model VARCHAR(128), + timeout INT DEFAULT 60, + max_retries INT DEFAULT 3, + rate_limit_rpm INT DEFAULT 60, + is_active BOOLEAN DEFAULT TRUE, + created_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW() +); +``` + +#### ai_task_configs(AI 任务配置) +```sql +CREATE TABLE ai_task_configs ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + task_type VARCHAR(64) NOT NULL, -- summarize / classify / tag / score / daily_brief / chat / optimize + name VARCHAR(128) NOT NULL, + provider_config_id UUID REFERENCES ai_provider_configs(id), + model VARCHAR(128) NOT NULL, + skill_id UUID REFERENCES skills(id), + temperature FLOAT DEFAULT 0.3, + max_tokens INT, + top_p FLOAT DEFAULT 1.0, + system_prompt_override TEXT, + fallback_config_id UUID REFERENCES ai_task_configs(id), + enabled BOOLEAN DEFAULT TRUE, + created_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW() +); +``` + +#### output_tasks(产出任务) 
+```sql +CREATE TABLE output_tasks ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + name VARCHAR(128) NOT NULL, + task_type VARCHAR(64) DEFAULT 'daily_brief', + skill_id UUID NOT NULL REFERENCES skills(id), + schedule VARCHAR(128), -- cron 表达式 + filter_config JSONB DEFAULT '{}', + output_config JSONB DEFAULT '{}', + is_active BOOLEAN DEFAULT TRUE, + last_run_at TIMESTAMPTZ, + last_output_id UUID, + created_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW() +); +``` + +#### outputs(产出记录) +```sql +CREATE TABLE outputs ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + output_task_id UUID REFERENCES output_tasks(id), + content TEXT, + content_html TEXT, + references JSONB DEFAULT '[]', + metadata JSONB DEFAULT '{}', + created_at TIMESTAMPTZ DEFAULT NOW() +); +``` + +#### chat_sessions(聊天会话) +```sql +CREATE TABLE chat_sessions ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + user_id UUID REFERENCES users(id), + title VARCHAR(256), + skill_id UUID REFERENCES skills(id), + context_window INT DEFAULT 10, + created_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW() +); +``` + +#### chat_messages(聊天消息) +```sql +CREATE TABLE chat_messages ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + session_id UUID NOT NULL REFERENCES chat_sessions(id) ON DELETE CASCADE, + role VARCHAR(32) NOT NULL, -- user / assistant / tool + content TEXT, + tool_calls JSONB DEFAULT '[]', + tool_results JSONB DEFAULT '[]', + references JSONB DEFAULT '[]', + token_usage JSONB, + created_at TIMESTAMPTZ DEFAULT NOW() +); +``` + +#### optimization_logs(自优化日志) +```sql +CREATE TABLE optimization_logs ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + optimization_type VARCHAR(64), -- prompt / skill / dedup_algorithm + target_id UUID, + target_name VARCHAR(128), + previous_version INT, + new_version INT, + previous_content TEXT, + new_content TEXT, + evaluation_reason TEXT, + is_applied BOOLEAN DEFAULT FALSE, + applied_at TIMESTAMPTZ, + rollback_to_version 
INT, + created_at TIMESTAMPTZ DEFAULT NOW() +); +``` + +#### locks(分布式锁记录) +```sql +CREATE TABLE locks ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + lock_name VARCHAR(128) UNIQUE NOT NULL, + owner_id UUID, + acquired_at TIMESTAMPTZ DEFAULT NOW(), + expires_at TIMESTAMPTZ, + created_at TIMESTAMPTZ DEFAULT NOW() +); +``` + +### 5.3 分区策略 + +**按时间分区表:** +- `raw_articles` 按 `fetched_at` 月分区 +- `cleaned_articles` 按 `fetched_at` 月分区 +- `chat_messages` 按 `created_at` 月分区 + +**索引规划:** +```sql +-- 文章核心查询索引 +CREATE INDEX idx_cleaned_articles_fetched_at ON cleaned_articles(fetched_at DESC); +CREATE INDEX idx_cleaned_articles_category_score ON cleaned_articles(category, composite_score DESC); +CREATE INDEX idx_cleaned_articles_published_at ON cleaned_articles(published_at DESC); +CREATE INDEX idx_cleaned_articles_link ON cleaned_articles(link); + +-- GIN 索引用于 JSONB 标签 +CREATE INDEX idx_cleaned_articles_tags ON cleaned_articles USING GIN (tags); +CREATE INDEX idx_cleaned_articles_reference_links ON cleaned_articles USING GIN (reference_links); + +-- 全文搜索索引(可切 OpenSearch) +CREATE INDEX idx_cleaned_articles_fts ON cleaned_articles +USING GIN (to_tsvector('chinese', COALESCE(title,'') || ' ' || COALESCE(ai_summary,'') || ' ' || COALESCE(content,''))); +``` + +--- + +## 六、自优化系统设计 + +### 6.1 优化范围 + +每日凌晨 2:00 自动执行: +1. **Prompt/Skill 自优化**:摘要、分类、Tag、打分、日报、聊天 Skills +2. **去重算法自优化**:阈值、特征、算法策略 + +### 6.2 自优化标准 + +由 **AI 自己评审优化点**,无需人工标注。评审维度: + +| 优化对象 | 评审维度 | +|----------|----------| +| 摘要 Skill | 信息覆盖率、简洁性、是否保留关键数字/实体 | +| 分类 Skill | 分类边界清晰度、与样本的一致性 | +| Tag Skill | Tag 相关性、避免过度标签、Tag 粒度一致性 | +| 打分 Skill | 评分标准是否稳定、高分文章是否确实重要 | +| 日报 Skill | 结构清晰度、信息密度、引用完整性 | +| 去重算法 | 召回率、精确率、运行效率 | + +### 6.3 自优化流程 + +``` +1. 收集近 N 天数据 + - 最近执行的输入/输出样例 + - 优化历史(避免重复尝试) + - 当前 Prompt/Skill/算法 + +2. 优化器模型生成候选 + - 对每个 Skill:分析弱点 → 生成候选 Prompt + - 对去重算法:分析误判/漏判 → 生成候选算法 + +3. 优化器模型自评 + - 候选 vs 当前版本在历史样例上的模拟表现 + - 输出评审理由和评分 + +4. 自动发布 + - Skill:保存为新版本,自动生效 + - 去重算法:写入新版本文件,自动热加载 + +5. 
留痕 + - 写入 optimization_logs + - 保留旧版本以便回滚 +``` + +### 6.4 去重算法版本管理 + +**目录结构:** +``` +platform/ +├── plugins/ +│ └── deduplication/ +│ ├── current.py # 当前生效 +│ ├── current.py.bak.1 # 上一个版本 +│ ├── current.py.bak.2 +│ └── versions/ +│ ├── dedup_v1_20260615_020000.py +│ ├── dedup_v2_20260616_020000.py +│ └── dedup_v3_20260617_020000.py +│ └── metadata.json # 版本历史、回滚点 +``` + +**热加载机制:** +- 使用 `watchdog` 监听 `current.py` 变更 +- 变更时尝试 `importlib.reload` 或动态加载 +- 加载失败时自动回滚到 `current.py.bak.1` +- 通过 API `/api/admin/deduplication/rollback?version=X` 可手动回滚 + +**去重算法自优化输入:** +```python +@dataclass +class DedupOptimizationInput: + current_algorithm_code: str + current_metadata: dict + sample_duplicate_groups: List[DuplicateGroup] + sample_false_positives: List[Tuple[str, str]] # 误判样例 + sample_false_negatives: List[Tuple[str, str]] # 漏判样例 + performance_metrics: dict # 运行时间、内存 +``` + +--- + +## 七、鉴权与锁机制 + +### 7.1 鉴权 + +- **JWT Token**:用户登录后发放 access_token +- **API Key**:供外部系统/脚本调用,与用户绑定 +- **简单 RBAC**: + - `admin`:管理用户、配置、Skills、任务 + - `member`:使用聊天、查看文章、触发个人任务 +- **接口保护**:写入/管理类接口需要 `admin`;读取类接口需要登录 + +### 7.2 锁机制 + +| 锁类型 | 实现 | 用途 | +|--------|------|------| +| **任务级锁** | Redis 分布式锁 | 防止同一任务(如去重、日报生成)并发执行 | +| **文章级锁** | Redis 分布式锁 + DB locks 表 | 防止多人同时编辑同一篇文章的元数据 | +| **调度锁** | Celery `max_instances=1` + Redis | 防止定时任务与手动任务冲突 | + +**锁工具接口:** +```python +class LockService: + async def acquire(self, lock_name: str, ttl: int, owner_id: str) -> bool + async def release(self, lock_name: str, owner_id: str) -> bool + async def extend(self, lock_name: str, ttl: int, owner_id: str) -> bool +``` + +--- + +## 八、Docker 部署架构 + +### 8.1 服务组成 + +```yaml +services: + web: # FastAPI 主服务 + worker-default: # Celery 默认队列 Worker + worker-ai: # Celery AI 任务队列 Worker + worker-fetch: # Celery RSS 抓取队列 Worker + beat: # Celery Beat 定时调度 + redis: # 缓存 + 锁 + 消息队列 + postgres: # 主数据库 + pgvector + minio: # 对象存储(可选) +``` + +### 8.2 目录挂载 + +``` +/data/ + ├── postgres/ # PostgreSQL 数据 + ├── redis/ # Redis 
持久化 + ├── minio/ # 对象存储 + ├── logs/ # 应用日志 + ├── plugins/ # 插件目录(去重算法等) + └── backups/ # 自动备份 +``` + +### 8.3 环境变量 + +```env +# 数据库 +DATABASE_URL=postgresql+asyncpg://rss:rss@postgres:5432/rss_platform + +# Redis +REDIS_URL=redis://redis:6379/0 + +# AI +AI_OPTIMIZER_PROVIDER_ID=... +AI_CHAT_PROVIDER_ID=... + +# 安全 +SECRET_KEY=... +ACCESS_TOKEN_EXPIRE_MINUTES=480 + +# 存储 +STORAGE_TYPE=minio +MINIO_ENDPOINT=minio:9000 +MINIO_BUCKET=rss-platform + +# 调度 +SELF_OPTIMIZE_CRON=0 2 * * * +``` + +--- + +## 九、扩展性规划 + +### 9.1 数据量增长路径 + +| 阶段 | 文章量 | 数据库 | 检索 | 去重 | +|------|--------|--------|------|------| +| 阶段一 | < 10万 | PostgreSQL 单库 + 分区表 | PG tsvector | 内存+插件 | +| 阶段二 | 10万~50万 | PostgreSQL 分区表 | OpenSearch | 分桶+Embedding | +| 阶段三 | 50万~100万 | 读写分离 / 按时间分库 | OpenSearch + 向量库 | LSH/MinHash | +| 阶段四 | > 100万 | 分库分表 + 冷热分离 | 专用检索集群 | 近似最近邻 | + +### 9.2 服务拆分预留 + +未来可按模块拆分为独立服务: +- `feed-service`:RSS 抓取 +- `ai-service`:AI 处理 +- `search-service`:检索 +- `chat-service`:聊天与产出 +- `optimization-service`:自优化 + +--- + +## 十、关键接口概览 + +### 10.1 文章与检索 + +| 方法 | 路径 | 说明 | +|------|------|------| +| GET | `/api/articles` | 文章列表(分页、过滤) | +| GET | `/api/articles/{id}` | 文章详情 | +| POST | `/api/articles/{id}/lock` | 获取文章编辑锁 | +| DELETE | `/api/articles/{id}/lock` | 释放文章编辑锁 | +| GET | `/api/search` | 混合检索 | +| POST | `/api/search/semantic` | 语义检索 | + +### 10.2 Skills 与任务 + +| 方法 | 路径 | 说明 | +|------|------|------| +| GET | `/api/skills` | 列出 Skills | +| GET | `/api/skills/{id}` | Skill 详情 | +| POST | `/api/skills` | 创建 Skill | +| PUT | `/api/skills/{id}` | 更新 Skill | +| POST | `/api/skills/{id}/restore-default` | 恢复默认 | +| POST | `/api/skills/import` | 导入 Skill | +| GET | `/api/output-tasks` | 产出任务列表 | +| POST | `/api/output-tasks` | 创建产出任务 | +| POST | `/api/output-tasks/{id}/run` | 手动运行 | + +### 10.3 AI 配置 + +| 方法 | 路径 | 说明 | +|------|------|------| +| GET | `/api/ai/providers` | AI 供应商配置 | +| POST | `/api/ai/providers` | 新增供应商 | +| POST | `/api/ai/providers/{id}/clone` | 复制配置 | +| GET | 
`/api/ai/task-configs` | AI 任务配置 | +| POST | `/api/ai/task-configs` | 新增任务配置 | +| POST | `/api/ai/task-configs/{id}/clone` | 复制任务配置 | + +### 10.4 聊天 + +| 方法 | 路径 | 说明 | +|------|------|------| +| GET | `/api/chat/sessions` | 会话列表 | +| POST | `/api/chat/sessions` | 创建会话 | +| POST | `/api/chat/sessions/{id}/messages` | 发送消息 | +| GET | `/api/chat/sessions/{id}/messages` | 获取历史 | + +### 10.5 自优化与插件 + +| 方法 | 路径 | 说明 | +|------|------|------| +| GET | `/api/admin/optimization-logs` | 自优化日志 | +| POST | `/api/admin/optimization/run` | 手动触发自优化 | +| GET | `/api/admin/deduplication/versions` | 去重算法版本 | +| POST | `/api/admin/deduplication/rollback` | 回滚算法 | +| POST | `/api/admin/deduplication/reload` | 热加载当前算法 | + +--- + +## 十一、风险与应对 + +| 风险 | 影响 | 应对 | +|------|------|------| +| AI 自优化产生劣质 Prompt | 中 | 优化器自评 + 版本化 + 自动回滚 | +| 去重算法热加载失败 | 高 | 失败自动回滚上一个 `.bak` | +| 数据量激增导致查询慢 | 中 | 分区表 + 异步迁移路径 | +| AI 调用成本高 | 中 | 多供应商路由 + Fallback + 缓存 | +| 并发任务冲突 | 高 | Redis 分布式锁 + 任务级互斥 | +| 团队成员误操作 | 中 | 简单 RBAC + 文章级锁 | +| SQLite 迁移到 PostgreSQL | 中 | 一次性迁移脚本 + 数据校验 | + +--- + +## 十二、与现有仓库的关系 + +新平台不是完全重写,而是: + +1. **继承**:复用 rssKeeper 的抓取逻辑和 Feed 管理,dataClean 的 AI 调用和任务进度机制 +2. **升级**:数据库从 SQLite 升级到 PostgreSQL,引入分区表和向量扩展 +3. **重构**:将分类/标签/打分从规则驱动改为 AI + Skill 驱动 +4. **新增**:聊天产出、Skill 系统、自优化、多 AI 配置、团队鉴权 +5. 
**合并**:最终用一个统一平台替代两个分离项目 + +--- + +## 十三、文档清单 + +- [x] 本设计文档 +- [ ] 开发步骤文档(见 `docs/dev-plan.md`) +- [ ] API 接口详细文档 +- [ ] Skill 开发指南 +- [ ] 自优化机制说明 +- [ ] Docker 部署手册 +- [ ] 数据迁移手册 diff --git a/docs/dev-plan.md b/docs/dev-plan.md new file mode 100644 index 0000000..287b066 --- /dev/null +++ b/docs/dev-plan.md @@ -0,0 +1,536 @@ +# RSS 信息处理平台:开发步骤文档 + +> 版本:v1.0 +> 日期:2026-06-15 +> 配套文档:`docs/design.md` + +--- + +## 一、总体策略 + +### 1.1 开发原则 + +| 原则 | 说明 | +|------|------| +| **渐进式重构** | 不是一次性推翻重写,而是逐步迁移现有能力 | +| **先跑通再优化** | 先让核心流程(抓取→清洗→AI→存储)跑起来,再扩展高级功能 | +| **数据先行** | 先确定数据模型和迁移方案,再写业务逻辑 | +| **可回滚** | 每个阶段都保留回退到旧系统的能力 | +| **测试护航** | 关键模块必须有单元测试和接口测试 | + +### 1.2 阶段划分 + +``` +阶段一:基础骨架(4-5 周) + → 技术选型确认、项目结构、PostgreSQL 迁移、Docker 环境、基础 API + +阶段二:核心流程(4-5 周) + → RSS 抓取、清洗流水线、去重插件、AI 摘要/分类/Tag/打分、任务调度 + +阶段三:产出与聊天(3-4 周) + → Skill 系统、日报任务、聊天窗口、引用链接 + +阶段四:自优化与工业化(3-4 周) + → 自优化调度器、去重算法自优化、监控、日志、备份 + +阶段五:规模化与优化(持续) + → 性能优化、OpenSearch、分库分表、服务拆分 +``` + +--- + +## 二、阶段一:基础骨架(Week 1-5) + +### 目标 +搭建可运行的基础平台,完成数据库迁移、Docker 环境、用户鉴权、基础 API。 + +### 任务清单 + +#### Week 1:项目初始化与技术选型确认 + +| 任务 | 说明 | 产出 | +|------|------|------| +| 2.1.1 创建 monorepo | 在 `/home/congsh/workspace/dev/rss-platform` 初始化项目 | `README.md`、`Makefile`、`docker-compose.yml` | +| 2.1.2 确定目录结构 | 按模块划分目录 | 见下方目录结构 | +| 2.1.3 选择并锁定依赖版本 | Python 3.12、FastAPI、SQLAlchemy 2.0、Celery、PostgreSQL、Redis | `backend/requirements.txt` | +| 2.1.4 配置代码质量工具 | ruff、black、pytest、pre-commit | `.pre-commit-config.yaml`、`pyproject.toml` | +| 2.1.5 创建 Docker 基础镜像 | 后端、前端、PostgreSQL、Redis、MinIO | `Dockerfile`、`docker-compose.dev.yml` | + +**推荐目录结构:** +``` +rss-platform/ +├── backend/ +│ ├── app/ +│ │ ├── core/ # 配置、日志、异常、鉴权 +│ │ ├── models/ # SQLAlchemy 模型 +│ │ ├── schemas/ # Pydantic 模型 +│ │ ├── api/ # API 路由 +│ │ ├── services/ # 业务服务 +│ │ ├── tasks/ # Celery 任务 +│ │ ├── plugins/ # 插件接口与加载器 +│ │ ├── ai/ # AI 调用、Provider 路由 +│ │ ├── skills/ # Skill 加载与执行 +│ │ ├── search/ # 检索服务 +│ │ ├── chat/ # 聊天引擎 +│ │ └── 
optimization/ # 自优化 +│ ├── alembic/ # 数据库迁移 +│ ├── tests/ +│ └── main.py +├── frontend/ +│ ├── src/ +│ │ ├── views/ +│ │ ├── components/ +│ │ ├── api/ +│ │ ├── stores/ +│ │ └── router/ +│ └── package.json +├── plugins/ +│ └── deduplication/ +├── docker/ +│ ├── backend.Dockerfile +│ ├── frontend.Dockerfile +│ └── docker-compose.yml +├── docs/ +└── scripts/ + ├── migrate_from_sqlite.py + └── init_dev_env.sh +``` + +#### Week 2:PostgreSQL 数据模型与迁移 + +| 任务 | 说明 | 产出 | +|------|------|------| +| 2.2.1 设计并创建核心表 | users、feeds、raw_articles、cleaned_articles、article_references、skills 等 | `backend/app/models/` | +| 2.2.2 配置 Alembic | 初始化迁移工具 | `alembic.ini`、baseline migration | +| 2.2.3 编写 SQLite → PostgreSQL 迁移脚本 | 从 rssKeeper + dataClean 导出并导入 | `scripts/migrate_from_sqlite.py` | +| 2.2.4 验证迁移数据完整性 | 对比条数、关键字段抽样 | 迁移报告 | +| 2.2.5 配置 pgvector 扩展 | 安装并创建向量表 | `article_embeddings` 表 | + +#### Week 3:FastAPI 基础与鉴权 + +| 任务 | 说明 | 产出 | +|------|------|------| +| 2.3.1 FastAPI 项目骨架 | lifespan、中间件、异常处理、日志 | `backend/main.py` | +| 2.3.2 JWT 鉴权 | 登录、注册、Token 刷新、API Key | `backend/app/core/auth.py` | +| 2.3.3 简单 RBAC | admin / member 角色 | `backend/app/core/rbac.py` | +| 2.3.4 CORS 安全配置 | 白名单、关闭 credentials | `backend/main.py` | +| 2.3.5 健康检查与 OpenAPI | `/health`、`/openapi.json` | API 文档 | + +#### Week 4:配置中心与锁服务 + +| 任务 | 说明 | 产出 | +|------|------|------| +| 2.4.1 配置中心 | 环境变量 + DB 覆盖 + 版本化 | `backend/app/core/settings.py` | +| 2.4.2 Redis 连接 | 缓存、锁、消息队列 | `backend/app/core/redis.py` | +| 2.4.3 分布式锁服务 | 任务锁、文章锁 | `backend/app/services/lock_service.py` | +| 2.4.4 任务运行时状态 | 进度追踪、任务日志 | `backend/app/services/task_runtime.py` | +| 2.4.5 日志与请求 ID | 结构化日志、request_id | `backend/app/core/logging.py` | + +#### Week 5:前端基础与管理后台 + +| 任务 | 说明 | 产出 | +|------|------|------| +| 2.5.1 Vue 3 + TS 项目初始化 | Vite、Element Plus、Pinia、Vue Router | `frontend/` | +| 2.5.2 登录页 | JWT 登录、API Key 管理 | `LoginView.vue` | +| 2.5.3 RSS 源管理页 | 列表、添加、编辑、删除 | `FeedsView.vue` | +| 2.5.4 文章列表页 | 分页、筛选 | `ArticlesView.vue` | 
+| 2.5.5 Docker Compose 联调 | 全栈本地启动 | `docker-compose.yml` | + +### 阶段一交付物 +- [ ] 可本地 `docker-compose up` 运行的基础平台 +- [ ] PostgreSQL 数据库 + 迁移脚本 +- [ ] JWT 鉴权 + 简单 RBAC +- [ ] RSS 源 CRUD + 文章列表 +- [ ] 分布式锁服务 + +### 阶段一里程碑 +**M1:基础平台骨架完成,可管理 RSS 源和查看文章列表。** + +--- + +## 三、阶段二:核心流程(Week 6-10) + +### 目标 +实现抓取→清洗→去重→AI 处理→存储的完整流水线。 + +### 任务清单 + +#### Week 6:RSS 抓取迁移与增强 + +| 任务 | 说明 | 产出 | +|------|------|------| +| 3.6.1 迁移抓取逻辑 | 从 rssKeeper `rss_fetcher.py` 迁移 | `backend/app/services/feed_fetcher.py` | +| 3.6.2 Celery 抓取任务 | `fetch_feed_task`、`fetch_all_feeds_task` | `backend/app/tasks/feeds.py` | +| 3.6.3 Celery Beat 调度 | 按 Feed 间隔注册定时任务 | `backend/app/tasks/scheduler.py` | +| 3.6.4 抓取失败重试 | 指数退避、死信队列 | 重试装饰器 | +| 3.6.5 Feed 健康度统计 | 复用 rssKeeper 逻辑 | `backend/app/services/health_checker.py` | + +#### Week 7:清洗流水线 + +| 任务 | 说明 | 产出 | +|------|------|------| +| 3.7.1 数据标准化 | URL 规范化、HTML 清洗、时间标准化 | `backend/app/services/normalizer.py` | +| 3.7.2 正文提取 | 优先 RSS content,fallback 原文页 | `backend/app/services/content_extractor.py` | +| 3.7.3 清洗任务 Celery 化 | `process_raw_article_task` | `backend/app/tasks/pipeline.py` | +| 3.7.4 内容 hash 与语言检测 | 用于后续去重 | 辅助字段 | +| 3.7.5 清洗状态机 | pending → processing → cleaned / failed | `processing_status` | + +#### Week 8:去重插件系统 + +| 任务 | 说明 | 产出 | +|------|------|------| +| 3.8.1 定义插件接口 | `DeduplicationPlugin` | `backend/app/plugins/base.py` | +| 3.8.2 实现默认去重插件 | URL + 标题 exact + TF-IDF 内容相似 | `plugins/deduplication/current.py` | +| 3.8.3 插件加载器 | 动态 import + 热加载 | `backend/app/plugins/loader.py` | +| 3.8.4 引用关系写入 | `article_references` + `reference_links` | 去重服务 | +| 3.8.5 去重任务接口 | 手动触发、查看结果 | `/api/tasks/deduplicate` | +| 3.8.6 插件版本元数据 | `metadata.json` | 版本管理 | + +#### Week 9:AI 处理中心 + +| 任务 | 说明 | 产出 | +|------|------|------| +| 3.9.1 AI Provider 配置模型 | openai / anthropic / gemini / local | `backend/app/models/ai_provider_config.py` | +| 3.9.2 AI 任务配置模型 | 每个任务独立配置 | `backend/app/models/ai_task_config.py` | +| 3.9.3 LiteLLM/自封装多供应商路由 | 
统一调用接口 | `backend/app/ai/client.py` | +| 3.9.4 AI 摘要任务 | 复用 dataClean `summarizer.py` | `backend/app/tasks/ai_tasks.py` | +| 3.9.5 AI 分类任务 | 从规则分类迁移到 AI 分类 | `backend/app/tasks/ai_tasks.py` | +| 3.9.6 AI Tag 任务 | AI 打标签 | `backend/app/tasks/ai_tasks.py` | +| 3.9.7 AI 打分任务 | heat/importance/composite | `backend/app/tasks/ai_tasks.py` | + +#### Week 10:流水线编排与任务调度 + +| 任务 | 说明 | 产出 | +|------|------|------| +| 3.10.1 流水线编排器 | 组合抓取→清洗→去重→AI 处理 | `backend/app/services/pipeline_orchestrator.py` | +| 3.10.2 任务进度追踪 | 复用 dataClean `task_progress.py` | `backend/app/services/task_progress.py` | +| 3.10.3 手动/定时任务互斥 | Redis 锁 + Celery | 调度服务 | +| 3.10.4 仪表盘统计 API | 文章数、分类分布、任务状态 | `/api/stats` | +| 3.10.5 前端仪表盘 | 统计卡片、任务进度 | `DashboardView.vue` | + +### 阶段二交付物 +- [ ] RSS 自动抓取 + Celery 调度 +- [ ] 清洗流水线 +- [ ] 去重插件系统(默认实现 + 热加载) +- [ ] AI 摘要/分类/Tag/打分 +- [ ] 流水线编排与任务进度 + +### 阶段二里程碑 +**M2:核心流程跑通,一篇文章从 RSS 抓取到 AI 处理完成可自动化执行。** + +--- + +## 四、阶段三:产出与聊天(Week 11-14) + +### 目标 +实现 Skill 系统、日报任务、聊天窗口、引用链接。 + +### 任务清单 + +#### Week 11:Skill 系统基础 + +| 任务 | 说明 | 产出 | +|------|------|------| +| 4.11.1 Skill 数据模型 | skills、skill_versions | 数据库表 | +| 4.11.2 Skill 加载与执行 | 解析 prompt、schema、tools | `backend/app/skills/loader.py` | +| 4.11.3 内置默认 Skills | 摘要、分类、Tag、打分、日报、聊天 | `backend/app/skills/defaults/` | +| 4.11.4 Skill CRUD API | 创建、修改、导入、导出、恢复默认 | `/api/skills/*` | +| 4.11.5 Skill 管理前端 | 编辑器、导入导出 | `SkillsView.vue` | + +#### Week 12:日报任务系统 + +| 任务 | 说明 | 产出 | +|------|------|------| +| 4.12.1 OutputTask 模型 | task + skill + filter + schedule | 数据库表 | +| 4.12.2 日报生成任务 | 复用 dataClean `brief.py`,改为 Skill 驱动 | `backend/app/tasks/output_tasks.py` | +| 4.12.3 数据筛选 DSL | 时间、分类、Tag、分数过滤 | Filter builder | +| 4.12.4 多种日报类型 | 科技观察、综合日报等 | 多个 OutputTask | +| 4.12.5 日报前端 | 列表、详情、重新生成 | `BriefsView.vue` | + +#### Week 13:聊天引擎 + +| 任务 | 说明 | 产出 | +|------|------|------| +| 4.13.1 聊天数据模型 | sessions、messages | 数据库表 | +| 4.13.2 聊天 API | 创建会话、发送消息、历史记录 | `/api/chat/*` | +| 4.13.3 Tool 调用框架 | 搜索文章、获取文章详情、调用外部 
API | `backend/app/chat/tools.py` | +| 4.13.4 引用链接解析 | 从 AI 输出提取标记并映射链接 | `backend/app/chat/references.py` | +| 4.13.5 聊天前端 | 对话界面、引用渲染 | `ChatView.vue` | + +#### Week 14:工具与产出集成 + +| 任务 | 说明 | 产出 | +|------|------|------| +| 4.14.1 外部 API Tool | 调用用户配置的 HTTP API | `fetch_external_api` tool | +| 4.14.2 产出导出 | Markdown、JSON、邮件 | `backend/app/services/exporters.py` | +| 4.14.3 聊天中运行 Skill | 用户选择 Skill 进行专项对话 | Skill selector | +| 4.14.4 会话上下文管理 | 窗口大小、历史摘要 | Context manager | +| 4.14.5 集成测试 | 端到端聊天流程 | `tests/test_chat.py` | + +### 阶段三交付物 +- [ ] Skill 系统(CRUD + 默认 Skills) +- [ ] 可配置的日报任务系统 +- [ ] 聊天窗口(支持 Tool 调用和引用链接) +- [ ] 外部 API Tool + +### 阶段三里程碑 +**M3:平台具备 AI 产出能力,日报和聊天均可调用 Skills 生成带引用链接的内容。** + +--- + +## 五、阶段四:自优化与工业化(Week 15-18) + +### 目标 +实现每日凌晨 2 点自优化、去重算法自优化、监控、备份、日志。 + +### 任务清单 + +#### Week 15:自优化调度器 + +| 任务 | 说明 | 产出 | +|------|------|------| +| 5.15.1 自优化数据收集 | 收集最近执行样例、历史优化记录 | `backend/app/optimization/collector.py` | +| 5.15.2 Prompt/Skill 优化器 | 用 AI 评审并生成候选 Prompt | `backend/app/optimization/prompt_optimizer.py` | +| 5.15.3 优化器自评逻辑 | 候选 vs 当前版本评分 | 评估服务 | +| 5.15.4 自动发布机制 | 保存新版本、自动生效 | 发布服务 | +| 5.15.5 优化日志 | `optimization_logs` 表 + API | 留痕 | + +#### Week 16:去重算法自优化 + +| 任务 | 说明 | 产出 | +|------|------|------| +| 5.16.1 去重算法优化器 | 分析样例生成候选算法代码 | `backend/app/optimization/dedup_optimizer.py` | +| 5.16.2 算法版本备份 | 自动生成 `.bak` | 版本管理 | +| 5.16.3 算法热加载与回滚 | 失败自动回滚 | 插件加载器增强 | +| 5.16.4 算法性能评估 | 运行时间、内存、误判漏判 | 评估服务 | +| 5.16.5 手动回滚 API | `/api/admin/deduplication/rollback` | 管理接口 | + +#### Week 17:监控与可观测性 + +| 任务 | 说明 | 产出 | +|------|------|------| +| 5.17.1 Prometheus 指标 | 任务数、AI 调用数、延迟、失败率 | `/metrics` | +| 5.17.2 Grafana 仪表盘 | 系统状态、任务状态、AI 成本 | `docker/grafana/` | +| 5.17.3 Sentry 集成 | 错误追踪 | 配置 | +| 5.17.4 日志聚合 | Loki 或文件日志 | 日志配置 | +| 5.17.5 告警规则 | 任务失败、AI 调用异常 | Alert rules | + +#### Week 18:备份、安全与部署 + +| 任务 | 说明 | 产出 | +|------|------|------| +| 5.18.1 PostgreSQL 自动备份 | 每日备份、保留策略 | `scripts/backup_db.sh` | +| 5.18.2 对象存储备份 | 插件、导出产物备份 | 备份脚本 | +| 
5.18.3 安全加固 | API 限流、密码策略、敏感配置加密 | 安全文档 | +| 5.18.4 生产 Docker Compose | 多 Worker 配置 | `docker-compose.prod.yml` | +| 5.18.5 部署文档 | 完整部署步骤 | `docs/deployment.md` | + +### 阶段四交付物 +- [ ] 每日 2:00 自优化调度器 +- [ ] 去重算法自优化 + 版本回滚 +- [ ] Prometheus + Grafana 监控 +- [ ] 自动备份方案 +- [ ] 生产部署文档 + +### 阶段四里程碑 +**M4:平台具备工业化能力,可无人值守运行并自动优化。** + +--- + +## 六、阶段五:规模化与持续优化(Week 19+) + +### 目标 +根据实际数据量和负载进行性能优化和架构扩展。 + +### 任务清单 + +| 任务 | 说明 | 时机 | +|------|------|------| +| 6.1 迁移到 OpenSearch | 当 PG tsvector 性能不足时 | 10万+ 文章 | +| 6.2 向量检索优化 | 使用专用向量库(Qdrant/Milvus) | 需要语义检索时 | +| 6.3 去重算法升级 | LSH/MinHash/Embedding | 日增 5000+ 时 | +| 6.4 数据库读写分离 | 主从复制 | 读压力高时 | +| 6.5 按时间分库 | 历史数据归档 | 50万+ 文章 | +| 6.6 服务拆分 | 拆分为 feed-service、ai-service 等 | 团队扩大时 | +| 6.7 缓存层优化 | Redis 缓存热门查询、文章 | 读压力大时 | +| 6.8 AI 调用成本优化 | 缓存 Embedding、批量调用、模型降级 | AI 成本高时 | + +--- + +## 七、关键依赖清单 + +### 7.1 Python 后端 + +```txt +fastapi==0.115.0 +uvicorn[standard]==0.30.0 +sqlalchemy[asyncio]==2.0.31 +asyncpg==0.29.0 +alembic==1.13.2 +psycopg2-binary==2.9.9 +celery==5.4.0 +redis==5.0.7 +pydantic==2.8.2 +pydantic-settings==2.3.4 +python-jose[cryptography]==3.3.0 +passlib[bcrypt]==1.7.4 +python-multipart==0.0.9 +httpx==0.27.0 +feedparser==6.0.11 +beautifulsoup4==4.12.3 +lxml==5.2.2 +scikit-learn==1.5.1 +numpy==1.26.4 +openai==1.35.0 +litellm==1.41.0 +langdetect==1.0.9 +watchdog==4.0.1 +prometheus-client==0.20.0 +sentry-sdk==2.7.0 +pytest==8.2.2 +pytest-asyncio==0.23.7 +``` + +### 7.2 前端 + +```json +{ + "vue": "^3.4.0", + "vue-router": "^4.3.0", + "pinia": "^2.1.0", + "element-plus": "^2.7.0", + "axios": "^1.7.0", + "typescript": "^5.4.0", + "marked": "^13.0.0", + "dompurify": "^3.1.0" +} +``` + +### 7.3 基础设施 + +| 组件 | 版本 | 用途 | +|------|------|------| +| PostgreSQL | 16+ | 主数据库 | +| Redis | 7+ | 缓存、锁、消息队列 | +| MinIO | latest | 对象存储 | +| Prometheus | latest | 指标采集 | +| Grafana | latest | 监控仪表盘 | + +--- + +## 八、测试策略 + +### 8.1 单元测试 + +| 模块 | 测试重点 | +|------|----------| +| `plugins/deduplication` | URL/标题/内容去重、代表文章选择 | +| 
`app/services/normalizer` | URL 规范化、HTML 清洗 | +| `app/ai/client` | 多 Provider 路由、Fallback | +| `app/skills/loader` | Skill 解析、Schema 校验 | +| `app/chat/references` | 引用标记解析与链接映射 | +| `app/services/lock_service` | 锁获取/释放/超时 | + +### 8.2 集成测试 + +| 模块 | 测试重点 | +|------|----------| +| `/api/feeds` | CRUD、抓取触发 | +| `/api/tasks/deduplicate` | 去重任务全流程 | +| `/api/chat/sessions` | 创建会话、发送消息 | +| `/api/output-tasks` | 创建任务、手动运行 | +| `/api/admin/deduplication/rollback` | 算法回滚 | + +### 8.3 端到端测试 + +- 发布 RSS → 抓取 → 清洗 → 去重 → AI 处理 → 日报生成 +- 聊天中提问 → 检索文章 → AI 回答 → 验证引用链接 + +--- + +## 九、里程碑与验收标准 + +| 里程碑 | 时间 | 验收标准 | +|--------|------|----------| +| M1 基础骨架 | Week 5 | `docker-compose up` 可运行;可管理 Feed;可查看文章列表;JWT 鉴权可用 | +| M2 核心流程 | Week 10 | 一条 RSS 文章能自动完成:抓取→清洗→去重→AI 摘要/分类/Tag/打分 | +| M3 产出与聊天 | Week 14 | 可创建日报任务并生成日报;可在聊天中提问并获得带引用链接的回答 | +| M4 自优化与工业化 | Week 18 | 每日 2:00 自优化成功执行并留痕;去重算法可热加载和回滚;监控可用 | +| M5 规模化 | 持续 | 根据数据量完成 OpenSearch/分库等扩展 | + +--- + +## 十、风险与应对 + +| 风险 | 阶段 | 影响 | 应对 | +|------|------|------|------| +| SQLite → PostgreSQL 迁移复杂 | 阶段一 | 高 | 编写迁移脚本 + 数据校验 + 保留旧系统 | +| Celery 任务状态丢失 | 阶段二 | 中 | 使用 Redis 持久化 + 任务结果 backend | +| 去重插件热加载失败 | 阶段二/四 | 高 | 失败自动回滚上一个 `.bak` | +| AI 自优化产生劣质结果 | 阶段四 | 中 | 优化器自评 + 版本化 + 不自动删除旧版本 | +| AI 调用成本高 | 全程 | 中 | 多供应商路由 + Fallback + 结果缓存 | +| 前端 TypeScript 迁移成本 | 阶段一 | 低 | 新平台前端直接用 TS,旧前端代码逐步重写 | +| 数据量增长超预期 | 阶段五 | 中 | 分区表 + 预留 OpenSearch/向量库扩展路径 | + +--- + +## 十一、交付文档清单 + +| 文档 | 负责阶段 | 状态 | +|------|----------|------| +| 本开发步骤文档 | 全程 | ✅ | +| 架构设计文档 | 全程 | ✅ `rss-platform-design.md` | +| API 接口详细文档 | 阶段一~三 | 待编写 | +| Skill 开发指南 | 阶段三 | 待编写 | +| 自优化机制说明 | 阶段四 | 待编写 | +| Docker 部署手册 | 阶段一/四 | 待编写 | +| 数据迁移手册 | 阶段一 | 待编写 | +| 运维监控手册 | 阶段四 | 待编写 | + +--- + +## 十二、后续建议 + +1. **先验证核心假设**:在阶段二结束前,用真实 RSS 源跑一周,观察去重效果和 AI 输出质量。 +2. **Prompt 工程优先于自优化**:初期先人工调优 Prompt,阶段四再引入自优化。 +3. **保留旧系统并行运行**:直到阶段三结束,旧 rssKeeper + dataClean 可继续服务,降低切换风险。 +4. **建立反馈闭环**:即使不自优化,也要先收集用户对 AI 产出的反馈数据。 +5. 
**控制 AI 成本**:为每个任务设置预算上限和模型降级策略。 + +--- + +## 十三、快速启动命令(预期) + +```bash +# 1. 克隆项目 +cd /home/congsh/workspace/dev +git clone <repo-url> rss-platform +cd rss-platform + +# 2. 启动基础设施 +docker-compose -f docker-compose.dev.yml up -d postgres redis minio + +# 3. 初始化数据库 +cd backend +alembic upgrade head +python ../scripts/migrate_from_sqlite.py \ + --rsskeeper-db /path/to/rsskeeper.db \ + --dataclean-db /path/to/dataclean.db \ + --pg-url postgresql://rss:rss@localhost:5432/rss_platform + +# 4. 启动后端 +uvicorn main:app --reload --port 8000 + +# 5. 启动前端 +cd ../frontend +npm install +npm run dev + +# 6. 启动 Celery Worker +cd ../backend +celery -A app.tasks worker -Q default,fetch,ai -l info +celery -A app.tasks beat -l info +``` + +--- + +*本文档与 `rss-platform-design.md` 配套使用,开发过程中应根据实际情况迭代更新。* diff --git a/frontend/index.html b/frontend/index.html new file mode 100644 index 0000000..64c28c4 --- /dev/null +++ b/frontend/index.html @@ -0,0 +1,13 @@ + + + + + + + RSS Platform + + +
+ + + diff --git a/frontend/package.json b/frontend/package.json new file mode 100644 index 0000000..3fbe639 --- /dev/null +++ b/frontend/package.json @@ -0,0 +1,26 @@ +{ + "name": "rss-platform-frontend", + "private": true, + "version": "0.1.0", + "type": "module", + "scripts": { + "dev": "vite", + "build": "vue-tsc --noEmit && vite build", + "preview": "vite preview", + "lint": "eslint . --ext .vue,.ts,.tsx --fix" + }, + "dependencies": { + "vue": "^3.4.0", + "vue-router": "^4.3.0", + "pinia": "^2.1.0", + "element-plus": "^2.7.0", + "axios": "^1.7.0", + "@element-plus/icons-vue": "^2.3.0" + }, + "devDependencies": { + "@vitejs/plugin-vue": "^5.0.4", + "typescript": "^5.4.0", + "vite": "^5.2.0", + "vue-tsc": "^2.0.0" + } +} diff --git a/frontend/src/api/articles.ts b/frontend/src/api/articles.ts new file mode 100644 index 0000000..80a390a --- /dev/null +++ b/frontend/src/api/articles.ts @@ -0,0 +1,22 @@ +import api from './index' +import type { Article, PaginatedResponse } from '@/types' + +export interface ArticleListParams { + skip?: number + limit?: number + feed_id?: string + category?: string + tag?: string + search?: string +} + +export const articlesApi = { + list: (params: ArticleListParams = {}): Promise> => + api.get('/articles', { params }), + + get: (id: string): Promise
=> + api.get(`/articles/${id}`), + + markRead: (id: string): Promise<{ message: string }> => + api.put(`/articles/${id}/read`), +} diff --git a/frontend/src/api/auth.ts b/frontend/src/api/auth.ts new file mode 100644 index 0000000..9794c17 --- /dev/null +++ b/frontend/src/api/auth.ts @@ -0,0 +1,26 @@ +import api from './index' +import axios from 'axios' +import type { LoginCredentials, TokenResponse, User } from '@/types' + +export const authApi = { + login: (credentials: LoginCredentials): Promise => + api.post('/auth/login', credentials), + + register: (data: { username: string; password: string; role?: string }): Promise => + api.post('/auth/register', data), + + getMe: (): Promise => + api.get('/auth/me'), + + refresh: (refreshToken: string): Promise => + axios.post( + `${import.meta.env.VITE_API_BASE_URL || '/api/v1'}/auth/refresh`, + { refresh_token: refreshToken } + ).then((res) => res.data), + + logout: (refreshToken: string): Promise => + axios.post( + `${import.meta.env.VITE_API_BASE_URL || '/api/v1'}/auth/logout`, + { refresh_token: refreshToken } + ).then(() => undefined), +} diff --git a/frontend/src/api/feeds.ts b/frontend/src/api/feeds.ts new file mode 100644 index 0000000..ef99a64 --- /dev/null +++ b/frontend/src/api/feeds.ts @@ -0,0 +1,32 @@ +import api from './index' +import type { + Feed, + FeedCreateRequest, + FeedUpdateRequest, + PaginatedResponse, +} from '@/types' + +export interface FeedListParams { + skip?: number + limit?: number + category?: string + search?: string + is_active?: boolean +} + +export const feedsApi = { + list: (params: FeedListParams = {}): Promise> => + api.get('/feeds', { params }), + + get: (id: string): Promise => + api.get(`/feeds/${id}`), + + create: (data: FeedCreateRequest): Promise => + api.post('/feeds', data), + + update: (id: string, data: FeedUpdateRequest): Promise => + api.put(`/feeds/${id}`, data), + + delete: (id: string): Promise<{ message: string }> => + api.delete(`/feeds/${id}`), +} diff --git 
a/frontend/src/api/index.ts b/frontend/src/api/index.ts new file mode 100644 index 0000000..35e4f70 --- /dev/null +++ b/frontend/src/api/index.ts @@ -0,0 +1,94 @@ +import axios from 'axios' +import type { AxiosError, AxiosInstance, AxiosResponse, InternalAxiosRequestConfig } from 'axios' +import { useAuthStore } from '@/stores/auth' + +const api: AxiosInstance = axios.create({ + baseURL: import.meta.env.VITE_API_BASE_URL || '/api/v1', + timeout: 30000, +}) + +api.interceptors.request.use( + (config: InternalAxiosRequestConfig) => { + const token = localStorage.getItem('token') + if (token && config.headers) { + config.headers.Authorization = `Bearer ${token}` + } + return config + }, + (error: AxiosError) => { + return Promise.reject(error) + } +) + +let isRefreshing = false +let refreshSubscribers: Array<(token: string) => void> = [] + +function onRefreshed(token: string) { + refreshSubscribers.forEach((callback) => callback(token)) + refreshSubscribers = [] +} + +function addRefreshSubscriber(callback: (token: string) => void) { + refreshSubscribers.push(callback) +} + +function rejectRefreshSubscribers() { + refreshSubscribers = [] +} + +api.interceptors.response.use( + (response: AxiosResponse) => response.data, + async (error: AxiosError) => { + const originalRequest = error.config as InternalAxiosRequestConfig & { _retry?: boolean } + const status = error.response?.status + const detail = (error.response?.data as any)?.detail || error.message + + if (status === 401 && originalRequest && !originalRequest._retry) { + const authStore = useAuthStore() + const refreshToken = authStore.refreshToken + + if (!refreshToken) { + authStore.logout() + window.location.href = '/login' + return Promise.reject(new Error(detail)) + } + + if (isRefreshing) { + return new Promise((resolve) => { + addRefreshSubscriber((newToken: string) => { + if (originalRequest.headers) { + originalRequest.headers.Authorization = `Bearer ${newToken}` + } + resolve(api(originalRequest)) + }) + 
}) + } + + originalRequest._retry = true + isRefreshing = true + + try { + const refreshed = await authStore.refreshAccessToken() + if (!refreshed || !authStore.token) { + throw new Error('Refresh failed') + } + onRefreshed(authStore.token) + if (originalRequest.headers) { + originalRequest.headers.Authorization = `Bearer ${authStore.token}` + } + return api(originalRequest) + } catch (refreshError) { + rejectRefreshSubscribers() + authStore.logout() + window.location.href = '/login' + return Promise.reject(refreshError) + } finally { + isRefreshing = false + } + } + + return Promise.reject(new Error(detail)) + } +) + +export default api diff --git a/frontend/src/components/Layout.vue b/frontend/src/components/Layout.vue new file mode 100644 index 0000000..b54e76f --- /dev/null +++ b/frontend/src/components/Layout.vue @@ -0,0 +1,112 @@ + + + + + diff --git a/frontend/src/main.ts b/frontend/src/main.ts new file mode 100644 index 0000000..fe98beb --- /dev/null +++ b/frontend/src/main.ts @@ -0,0 +1,24 @@ +import { createApp } from 'vue' +import { createPinia } from 'pinia' +import ElementPlus from 'element-plus' +import * as ElementPlusIconsVue from '@element-plus/icons-vue' +import 'element-plus/dist/index.css' + +import App from './App.vue' +import router from './router' + +const app = createApp(App) + +app.config.errorHandler = (err, _instance, info) => { + console.error('Unhandled Vue error:', err, info) +} + +app.use(createPinia()) +app.use(router) +app.use(ElementPlus) + +for (const [key, component] of Object.entries(ElementPlusIconsVue)) { + app.component(key, component) +} + +app.mount('#app') diff --git a/frontend/src/router/index.ts b/frontend/src/router/index.ts new file mode 100644 index 0000000..2f1044c --- /dev/null +++ b/frontend/src/router/index.ts @@ -0,0 +1,59 @@ +import { createRouter, createWebHistory } from 'vue-router' +import { useAuthStore } from '@/stores/auth' + +const router = createRouter({ + history: createWebHistory(), + routes: [ + { + 
path: '/login', + name: 'Login', + component: () => import('@/views/LoginView.vue'), + meta: { public: true }, + }, + { + path: '/', + component: () => import('@/components/Layout.vue'), + redirect: '/dashboard', + children: [ + { + path: 'dashboard', + name: 'Dashboard', + component: () => import('@/views/DashboardView.vue'), + }, + { + path: 'feeds', + name: 'Feeds', + component: () => import('@/views/FeedsView.vue'), + }, + { + path: 'articles', + name: 'Articles', + component: () => import('@/views/ArticlesView.vue'), + }, + ], + }, + ], +}) + +router.beforeEach(async (to) => { + const authStore = useAuthStore() + + if (to.meta.public) { + return true + } + + if (!authStore.isAuthenticated) { + const hasToken = !!localStorage.getItem('token') + if (hasToken) { + const ok = await authStore.fetchUser() + if (ok) { + return true + } + } + return '/login' + } + + return true +}) + +export default router diff --git a/frontend/src/stores/auth.ts b/frontend/src/stores/auth.ts new file mode 100644 index 0000000..d0b20af --- /dev/null +++ b/frontend/src/stores/auth.ts @@ -0,0 +1,86 @@ +import { defineStore } from 'pinia' +import { computed, ref } from 'vue' +import { authApi } from '@/api/auth' +import type { User } from '@/types' + +const TOKEN_KEY = 'token' +const REFRESH_TOKEN_KEY = 'refresh_token' + +export const useAuthStore = defineStore('auth', () => { + const token = ref(localStorage.getItem(TOKEN_KEY)) + const refreshToken = ref(localStorage.getItem(REFRESH_TOKEN_KEY)) + const user = ref(null) + const loading = ref(false) + + const isAuthenticated = computed(() => !!token.value && !!user.value) + const isAdmin = computed(() => user.value?.role === 'admin') + + async function login(username: string, password: string) { + loading.value = true + try { + const response = await authApi.login({ username, password }) + setTokens(response.access_token, response.refresh_token) + await fetchUser() + return true + } catch (error) { + throw error + } finally { + 
loading.value = false + } + } + + async function fetchUser() { + try { + const response = await authApi.getMe() + user.value = response + return true + } catch (error) { + logout() + return false + } + } + + async function refreshAccessToken() { + const currentRefresh = refreshToken.value + if (!currentRefresh) { + logout() + return false + } + try { + const response = await authApi.refresh(currentRefresh) + setTokens(response.access_token, response.refresh_token) + return true + } catch (error) { + logout() + return false + } + } + + function setTokens(access: string, refresh: string) { + token.value = access + refreshToken.value = refresh + localStorage.setItem(TOKEN_KEY, access) + localStorage.setItem(REFRESH_TOKEN_KEY, refresh) + } + + function logout() { + token.value = null + refreshToken.value = null + user.value = null + localStorage.removeItem(TOKEN_KEY) + localStorage.removeItem(REFRESH_TOKEN_KEY) + } + + return { + token, + refreshToken, + user, + loading, + isAuthenticated, + isAdmin, + login, + fetchUser, + refreshAccessToken, + logout, + } +}) diff --git a/frontend/src/types/index.ts b/frontend/src/types/index.ts new file mode 100644 index 0000000..8822064 --- /dev/null +++ b/frontend/src/types/index.ts @@ -0,0 +1,95 @@ +export interface User { + id: string + username: string + role: 'admin' | 'member' + is_active: boolean +} + +export interface Feed { + id: string + url: string + title: string + description: string + category: string + is_active: boolean + fetch_interval_minutes: number + priority: number + parser_config: Record + proxy_policy: string + last_fetch_at: string | null + last_fetch_status: string | null + last_error: string | null + error_type: string | null + success_count: number + fail_count: number + article_count: number + health_status: string + created_at: string + updated_at: string +} + +export interface Article { + id: string + raw_article_id: string | null + feed_id: string + title: string | null + link: string + author: 
string | null + feed_title: string | null + feed_category: string | null + published_at: string | null + fetched_at: string + content: string | null + original_summary: string | null + ai_summary: string | null + category: string | null + tags: string[] + heat_score: number + importance_score: number + duplication_score: number + composite_score: number + is_representative: boolean + reference_links: any[] + processing_status: string + created_at: string + updated_at: string +} + +export interface PaginatedResponse { + total: number + items: T[] +} + +export interface LoginCredentials { + username: string + password: string +} + +export interface TokenResponse { + access_token: string + refresh_token: string + token_type: string +} + +export interface FeedCreateRequest { + url: string + title?: string + description?: string + category?: string + is_active?: boolean + fetch_interval_minutes?: number + priority?: number + parser_config?: Record + proxy_policy?: string +} + +export interface FeedUpdateRequest { + title?: string + description?: string + category?: string + is_active?: boolean + fetch_interval_minutes?: number + priority?: number + parser_config?: Record + proxy_policy?: string +} diff --git a/frontend/src/views/ArticlesView.vue b/frontend/src/views/ArticlesView.vue new file mode 100644 index 0000000..922af94 --- /dev/null +++ b/frontend/src/views/ArticlesView.vue @@ -0,0 +1,162 @@ + + + + + diff --git a/frontend/src/views/DashboardView.vue b/frontend/src/views/DashboardView.vue new file mode 100644 index 0000000..dc83930 --- /dev/null +++ b/frontend/src/views/DashboardView.vue @@ -0,0 +1,67 @@ + + + + + diff --git a/frontend/src/views/FeedsView.vue b/frontend/src/views/FeedsView.vue new file mode 100644 index 0000000..93bd2c6 --- /dev/null +++ b/frontend/src/views/FeedsView.vue @@ -0,0 +1,273 @@ + + + + + diff --git a/frontend/src/views/LoginView.vue b/frontend/src/views/LoginView.vue new file mode 100644 index 0000000..4084519 --- /dev/null +++ 
b/frontend/src/views/LoginView.vue @@ -0,0 +1,116 @@ + + + + + diff --git a/frontend/tsconfig.json b/frontend/tsconfig.json new file mode 100644 index 0000000..e367497 --- /dev/null +++ b/frontend/tsconfig.json @@ -0,0 +1,25 @@ +{ + "compilerOptions": { + "target": "ES2020", + "useDefineForClassFields": true, + "module": "ESNext", + "lib": ["ES2020", "DOM", "DOM.Iterable"], + "skipLibCheck": true, + "moduleResolution": "bundler", + "allowImportingTsExtensions": true, + "resolveJsonModule": true, + "isolatedModules": true, + "noEmit": true, + "jsx": "preserve", + "strict": true, + "noUnusedLocals": false, + "noUnusedParameters": false, + "noFallthroughCasesInSwitch": true, + "baseUrl": ".", + "paths": { + "@/*": ["src/*"] + } + }, + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.vue"], + "references": [{ "path": "./tsconfig.node.json" }] +} diff --git a/frontend/tsconfig.node.json b/frontend/tsconfig.node.json new file mode 100644 index 0000000..42872c5 --- /dev/null +++ b/frontend/tsconfig.node.json @@ -0,0 +1,10 @@ +{ + "compilerOptions": { + "composite": true, + "skipLibCheck": true, + "module": "ESNext", + "moduleResolution": "bundler", + "allowSyntheticDefaultImports": true + }, + "include": ["vite.config.ts"] +} diff --git a/frontend/vite.config.ts b/frontend/vite.config.ts new file mode 100644 index 0000000..43f31be --- /dev/null +++ b/frontend/vite.config.ts @@ -0,0 +1,31 @@ +import { defineConfig, loadEnv } from 'vite' +import vue from '@vitejs/plugin-vue' +import { resolve } from 'path' + +export default defineConfig(({ mode }) => { + const env = loadEnv(mode, process.cwd(), '') + const apiTarget = env.VITE_API_BASE_URL || 'http://localhost:8000' + + return { + plugins: [vue()], + resolve: { + alias: { + '@': resolve(__dirname, 'src'), + }, + }, + server: { + port: 5173, + host: true, + proxy: { + '/api': { + target: apiTarget, + changeOrigin: true, + }, + '/health': { + target: apiTarget, + changeOrigin: true, + }, + }, + }, + } +}) diff --git 
# ---------------------------------------------------------------------------
# plugins/deduplication/__init__.py
#   New, empty file (package marker) -- no content.
# ---------------------------------------------------------------------------

# ---------------------------------------------------------------------------
# plugins/deduplication/current.py
#   Default deduplication plugin placeholder.
# ---------------------------------------------------------------------------
from dataclasses import dataclass
from typing import Dict, List


@dataclass
class DedupInput:
    """One article handed to a deduplication plugin."""

    article_id: str
    title: str
    link: str
    content: str
    # Carried as a separate field rather than derived from ``content`` here.
    content_length: int
    # Stored as a string; the expected timestamp format is not defined in
    # this file -- TODO confirm against the caller.
    published_at: str
    feed_id: str


@dataclass
class DuplicateGroup:
    """A set of articles that a plugin considers duplicates of each other."""

    # Identifier of the article chosen to represent the group.
    representative_id: str
    member_ids: List[str]
    # Free-form explanation of why the members were grouped.
    reason: str
    # Maps a string key to a score; presumably keyed by member article id --
    # verify against the first real plugin implementation.
    similarity_scores: Dict[str, float]


class DeduplicationPlugin:
    """Default deduplication plugin.

    This is a placeholder: it exposes the plugin surface (``name``,
    ``version``, ``find_duplicates``) but never reports any duplicates.
    """

    name = "default_placeholder"
    version = "0.1.0"

    def find_duplicates(self, articles: List[DedupInput]) -> List[DuplicateGroup]:
        """Find duplicate articles.

        Args:
            articles: Candidate articles to compare.

        Returns:
            Always an empty list; the input is not inspected.
        """
        # Placeholder implementation
        return []


# ---------------------------------------------------------------------------
# scripts/migrate_from_sqlite.py
#   Placeholder for SQLite to PostgreSQL migration script.
# ---------------------------------------------------------------------------
import argparse
from typing import Optional, Sequence


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Parse the migration options and report what would be migrated.

    No data is read or written yet; the function only echoes its arguments.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``, which is
            what the original zero-argument ``main()`` did.

    Returns:
        Process exit code, always ``0`` when argument parsing succeeds.

    Raises:
        SystemExit: Raised by argparse on ``--help`` or when a required
            option (``--rsskeeper-db``, ``--dataclean-db``, ``--pg-url``)
            is missing.
    """
    parser = argparse.ArgumentParser(description="Migrate SQLite data to PostgreSQL")
    parser.add_argument("--rsskeeper-db", required=True, help="Path to rssKeeper SQLite database")
    parser.add_argument("--dataclean-db", required=True, help="Path to dataClean SQLite database")
    parser.add_argument("--pg-url", required=True, help="PostgreSQL connection URL")
    parser.add_argument("--dry-run", action="store_true", help="Dry run without writing")
    args = parser.parse_args(argv)

    print(f"Migrating from {args.rsskeeper_db} and {args.dataclean_db} to {args.pg_url}")
    if args.dry_run:
        # The flag used to be parsed and then ignored; acknowledge it so the
        # operator can see that it was understood.
        print("Dry run: no data will be written.")
    print("This is a placeholder. Full implementation will be added in a later phase.")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())