feat: 修复代码审核报告问题
This commit is contained in:
@@ -0,0 +1,15 @@
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.pyd
|
||||
.Python
|
||||
.env
|
||||
.env.local
|
||||
.venv/
|
||||
venv/
|
||||
*.egg-info/
|
||||
.pytest_cache/
|
||||
.mypy_cache/
|
||||
data/
|
||||
*.db
|
||||
.DS_Store
|
||||
@@ -0,0 +1,42 @@
|
||||
# RSSKeeper 连接地址
|
||||
RSSKEEPER_BASE_URL=http://localhost:7329
|
||||
|
||||
# LLM API(兼容 OpenAI 格式)
|
||||
OPENAI_API_KEY=sk-xxx
|
||||
OPENAI_BASE_URL=https://api.openai.com/v1
|
||||
OPENAI_MODEL=gpt-4o-mini
|
||||
OPENAI_TIMEOUT=60
|
||||
OPENAI_MAX_RETRIES=3
|
||||
|
||||
# dataClean 数据目录
|
||||
DATA_DIR=/app/data
|
||||
DATABASE_URL=/app/data/dataclean.db
|
||||
|
||||
# 简报输出目录
|
||||
BRIEF_OUTPUT_DIR=/app/data/briefs
|
||||
|
||||
# 调度时间(分钟)
|
||||
SUMMARIZE_INTERVAL_MINUTES=60
|
||||
TAG_SCORE_INTERVAL_MINUTES=1440
|
||||
DAILY_BRIEF_HOUR=8
|
||||
DAILY_BRIEF_MINUTE=0
|
||||
|
||||
# 去重阈值
|
||||
TITLE_SIMILARITY_THRESHOLD=0.85
|
||||
CONTENT_SIMILARITY_THRESHOLD=0.80
|
||||
|
||||
# 摘要长度
|
||||
MAX_AI_SUMMARY_LENGTH=300
|
||||
MIN_ORIGINAL_SUMMARY_LENGTH=100
|
||||
|
||||
# 每篇简报每个分类显示文章数
|
||||
BRIEF_TOP_N_PER_CATEGORY=10
|
||||
|
||||
# 日志级别
|
||||
LOG_LEVEL=INFO
|
||||
|
||||
# Web UI / API 安全(生产环境务必设置)
|
||||
# 为空时不启用 API Token 鉴权,仅建议在内网使用
|
||||
API_TOKEN=
|
||||
# CORS 允许来源,逗号分隔;生产环境请填写具体域名
|
||||
CORS_ALLOWED_ORIGINS=
|
||||
+53
@@ -0,0 +1,53 @@
|
||||
# Python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
*.so
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
|
||||
# 虚拟环境
|
||||
.venv/
|
||||
venv/
|
||||
ENV/
|
||||
env/
|
||||
|
||||
# 环境配置
|
||||
.env
|
||||
.env.local
|
||||
|
||||
# IDE
|
||||
.idea/
|
||||
.vscode/
|
||||
*.swp
|
||||
*.swo
|
||||
|
||||
# 测试与缓存
|
||||
.pytest_cache/
|
||||
.mypy_cache/
|
||||
.ruff_cache/
|
||||
|
||||
# 前端
|
||||
frontend/node_modules/
|
||||
frontend/dist/
|
||||
|
||||
# 数据目录
|
||||
data/
|
||||
*.db
|
||||
|
||||
# 系统文件
|
||||
.DS_Store
|
||||
+459
@@ -0,0 +1,459 @@
|
||||
# dataClean 代码审核报告
|
||||
|
||||
> 审核日期:2026-06-12
|
||||
> 审核范围:后端(FastAPI + SQLAlchemy + APScheduler) / 前端(Vue 3 + Element Plus) / 配置与部署
|
||||
> 审核人:opencode
|
||||
|
||||
## 项目概览
|
||||
|
||||
- **技术栈**:FastAPI 0.115 + SQLAlchemy 2.0 + SQLite + APScheduler 3.10(后端) / Vue 3.4 + Element Plus 2.6 + Vite 5(前端) / OpenAI 兼容 LLM
|
||||
- **代码规模**:约 1.5k 行 Python + 1.2k 行 Vue
|
||||
- **目标**:从 rssKeeper 拉取文章,做摘要/分类/打分/去重/简报生成,提供 Web UI
|
||||
- **整体评价**:模块化清晰、`README.md` 完整可读,但存在安全、性能与正确性方面的隐患。
|
||||
|
||||
---
|
||||
|
||||
## 审核结论一览
|
||||
|
||||
| 严重等级 | 数量 | 含义 |
|
||||
|----------|------|------|
|
||||
| 🔴 严重 | 7 | 影响线上数据安全与正确性,上线前必须修复 |
|
||||
| 🟡 中等 | 13 | 影响可维护性、时序正确性、可观测性,建议近期修复 |
|
||||
| 🟢 轻量 | 10 | 代码风格、健壮性细节,可持续改进 |
|
||||
|
||||
---
|
||||
|
||||
## 🔴 严重问题(上线前必须修复)
|
||||
|
||||
### 1. CORS 配置错误且过于宽松
|
||||
|
||||
**文件**:`main.py:72-78`
|
||||
|
||||
```python
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
```
|
||||
|
||||
- `allow_origins=["*"]` 与 `allow_credentials=True` 同时启用不符合 CORS 规范(浏览器不接受通配符 `*` 搭配凭据);Starlette 在这种配置下会改为回显请求的 `Origin`,实际效果是允许任意来源携带凭据访问。
|
||||
- 后端无任何鉴权(见 #2),任何网站都能通过浏览器代表"已登录用户"调用 API。
|
||||
|
||||
**建议**:生产环境收敛到具体域名,关闭 credentials,或删除 CORS(Web UI 走同源代理)。
|
||||
|
||||
---
|
||||
|
||||
### 2. 后端 API 无任何鉴权
|
||||
|
||||
所有接口(`/api/settings`、`/api/tasks/summarize`、`/api/taxonomy/bootstrap?force=true`)公开可访问:
|
||||
|
||||
- `Settings.vue:24-35` 可在 Web UI 直接改写 LLM API Key。
|
||||
- `Tasks.vue:18-26` 可未经授权立即触发高额 LLM 调用。
|
||||
- 两者叠加,**任何能访问 7331 端口的访客都能改 key、消耗 token**。
|
||||
|
||||
**建议**:反代层加 BasicAuth,或在 `main.py` 加 `Depends(verify_token)`。
|
||||
|
||||
---
|
||||
|
||||
### 3. 去重任务破坏历史数据
|
||||
|
||||
**文件**:`app/deduplicator.py:146-152`
|
||||
|
||||
```python
|
||||
old_groups = db.query(DuplicateGroup).all() # 拉取全部
|
||||
for og in old_groups:
|
||||
for art in og.articles:
|
||||
art.duplicate_group_id = None
|
||||
art.is_representative = False
|
||||
db.delete(og)
|
||||
db.commit()
|
||||
```
|
||||
|
||||
去重仅按"当天"过滤文章(line 158-165),但**清空阶段删除的是所有日期的 `DuplicateGroup`**,且把历史上所有文章的 `is_representative` 重置为 `False`。
|
||||
|
||||
- 后果:每日 8:00 简报生成后,**所有历史文章的重复组信息都被清空**。
|
||||
- `brief.py:99-106` 依靠 `is_representative=True OR duplicate_group_id IS NULL` 取代表文章,缺一会导致简报里出现全部 N 篇文章。
|
||||
|
||||
**建议**:只删除 `representative_article_id` 属于当天文章的去重组,或在 `DuplicateGroup` 上加 `brief_date` 字段。
|
||||
|
||||
---
|
||||
|
||||
### 4. `_with_db` 装饰器静默吞掉所有异常
|
||||
|
||||
**文件**:`scheduler.py:40-51`
|
||||
|
||||
```python
|
||||
except Exception as exc:
|
||||
logger.error("定时任务 %s 执行失败: %s", func.__name__, exc)
|
||||
```
|
||||
|
||||
任务失败仅有日志,**没有**:
|
||||
- 任务状态持久化(前端无法知道哪些任务最近失败过)。
|
||||
- 告警 / 通知。
|
||||
- 失败指标(Prometheus 等)。
|
||||
|
||||
如果 LLM 配额耗尽或 rssKeeper 挂掉,**服务会假装正常跑了 N 天**。
|
||||
|
||||
**建议**:建 `JobRunLog` 表记录 `(job_id, start, end, status, error)`,或在 Web UI 暴露上次运行结果。
|
||||
|
||||
---
|
||||
|
||||
### 5. 手动任务与定时任务可并发执行
|
||||
|
||||
**文件**:`main.py:248-267`、`scheduler.py:104-133`
|
||||
|
||||
`max_instances=1` 仅对 APScheduler 注册的实例生效,不约束 `POST /api/tasks/summarize`。一旦同时执行,`fetch_and_summarize` 内部有重复 `commit()`,可能引发 unique 约束冲突或写脏数据。
|
||||
|
||||
**建议**:在 `main.py` 用全局 `threading.Lock` 包裹任务函数。
|
||||
|
||||
---
|
||||
|
||||
### 6. 去重算法 O(n²) 性能
|
||||
|
||||
**文件**:`app/deduplicator.py:88-113`
|
||||
|
||||
对 `n` 篇文章做 BFS 嵌套循环,每对调用 `SequenceMatcher`(也是 O(L²))。200 篇时是 4 万次 `SequenceMatcher` + TF-IDF 矩阵计算,**单日任务常常跑 5–10 分钟**。
|
||||
|
||||
**建议**:
|
||||
- 标题长度 hash → 桶聚类后再做 pair 比较(minhash / LSH 更佳)。
|
||||
- 内容相似度先按 TF-IDF 矩阵做阈值筛选 top-K,再做精确比较。
|
||||
|
||||
---
|
||||
|
||||
### 7. Dockerfile 以 root 运行且未指定 USER
|
||||
|
||||
**文件**:`Dockerfile:10-26`
|
||||
|
||||
`FROM python:3.12-slim` 后未建非 root 用户,gunicorn/uvicorn 全部以 root 跑。一旦 Web 漏洞被利用,攻击者直接拿到容器 root。
|
||||
|
||||
**建议**:
|
||||
|
||||
```dockerfile
|
||||
RUN useradd --create-home --uid 1000 app
|
||||
USER app
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🟡 中等问题(影响正确性 / 可维护性)
|
||||
|
||||
### 8. 时区处理混乱
|
||||
|
||||
- `scheduler.py:35` 用 `timezone="Asia/Shanghai"`。
|
||||
- `scorer.py:49`、`brief.py:73` 等都用 `datetime.utcnow()`。
|
||||
- `summarizer.py:86` 把 ISO 时间解析为带 tzinfo,但 `scorer.py:55-58` 又 `replace(tzinfo=None)` 强行丢掉。
|
||||
|
||||
`score_articles` 内部用 UTC 当前时间,`_freshness_score` 在 24 小时分界点附近会因 tzinfo 一致性问题差几个小时。
|
||||
|
||||
**建议**:统一用 `datetime.now(timezone.utc)` 持久化,明确表里存的时区。
|
||||
|
||||
---
|
||||
|
||||
### 9. `datetime.utcnow()` 已被弃用
|
||||
|
||||
Python 3.12+ 标注 `datetime.utcnow()` 为 deprecated。
|
||||
|
||||
涉及文件:
|
||||
- `models.py:25,45`
|
||||
- `summarizer.py:137`
|
||||
- `scorer.py:49`
|
||||
- `brief.py:73,154`
|
||||
- `settings_manager.py:98`
|
||||
|
||||
**建议**:替换为 `datetime.now(timezone.utc)`。
|
||||
|
||||
---
|
||||
|
||||
### 10. 重复性分数公式与文档不符
|
||||
|
||||
**文件**:`app/scorer.py:83-91` + `deduplicator.py:194`
|
||||
|
||||
```python
|
||||
member_ids = [unique_articles[i].id for i in cluster] # 包含代表,最少 2
|
||||
...
|
||||
dup_count = max(len(group.member_article_ids), 1) # >= 2
|
||||
compute_duplication_score(2) -> 25.0 # 不是 0
|
||||
```
|
||||
|
||||
注释说 "1 次为 0 分",实际最小是 2,永远不会得 0。
|
||||
|
||||
**建议**:用 `len(member_article_ids) - 1`(非代表成员数),或调整公式。
|
||||
|
||||
---
|
||||
|
||||
### 11. 标签筛选性能差且语义不严谨
|
||||
|
||||
**文件**:`main.py:179-180`
|
||||
|
||||
```python
|
||||
if tag:
|
||||
query = query.filter(EnrichedArticle.tags.contains([tag]))
|
||||
```
|
||||
|
||||
SQLAlchemy 会把整个 JSON 列 `json.dumps` 后做字符串包含比较,**无法走索引**。表大时会全表扫描,且若文章有 `["人工智能"]`,匹配 "人工" 也会命中。
|
||||
|
||||
**建议**:建关联表 `article_tags(article_id, tag_name)`,或使用 SQLite JSON 函数 `json_each`。
|
||||
|
||||
---
|
||||
|
||||
### 12. Pydantic v1 风格 Config
|
||||
|
||||
**文件**:`main.py:99-125`
|
||||
|
||||
```python
|
||||
class Config:
|
||||
from_attributes = True
|
||||
```
|
||||
|
||||
应改为 Pydantic v2 风格:
|
||||
|
||||
```python
|
||||
model_config = ConfigDict(from_attributes=True)
|
||||
```
|
||||
|
||||
并需 `from pydantic import ConfigDict`。`ArticleOut.tags: list` 也应改为 `List[str]`,否则对 SQLAlchemy JSON 列不会做反序列化。
|
||||
|
||||
---
|
||||
|
||||
### 13. `_with_db` 装饰器未保留元信息
|
||||
|
||||
**文件**:`scheduler.py:40-51`
|
||||
|
||||
手写 `wrapper.__name__ = func.__name__`,但缺 `__doc__`、`__wrapped__`。改用 `@functools.wraps(func)` 更标准。
|
||||
|
||||
---
|
||||
|
||||
### 14. 前端串行保存 17 个配置项
|
||||
|
||||
**文件**:`Settings.vue:68-80`
|
||||
|
||||
```js
|
||||
for (const item of settings.value) {
|
||||
await datacleanApi.updateSetting(item.key, item.value)
|
||||
}
|
||||
```
|
||||
|
||||
17 个 PUT 串行,任何一个失败就中断且不提示哪些失败。
|
||||
|
||||
**建议**:后端加 `PUT /api/settings` 批量接口;前端用 `Promise.allSettled` 或事务式调用。
|
||||
|
||||
---
|
||||
|
||||
### 15. 分页 total 是 hack
|
||||
|
||||
**文件**:`Articles.vue:108`
|
||||
|
||||
```js
|
||||
pagination.total = res.length === pagination.size
|
||||
? pagination.page * pagination.size + 1
|
||||
: (pagination.page - 1) * pagination.size + res.length
|
||||
```
|
||||
|
||||
`+1` 是为了让 el-pagination 多显示一页按钮的粗暴 hack,**末页判断会出错**(恰好填满时 total 比真实多 1)。
|
||||
|
||||
**建议**:后端响应里加 `total` 字段(`/api/articles` 改为 `{items, total}`),前端用真实 total。
|
||||
|
||||
---
|
||||
|
||||
### 16. 缺数据库迁移
|
||||
|
||||
`database.py:34-35` 仅 `Base.metadata.create_all`:
|
||||
|
||||
- 加列(如 `EnrichedArticle.is_hidden`)会无报错地忽略。
|
||||
- 类型变更(`String(128)` → `String(256)`)会保留旧列。
|
||||
- 删字段不会清理。
|
||||
|
||||
**建议**:引入 Alembic,至少 `alembic init` 起一个 baseline。
|
||||
|
||||
---
|
||||
|
||||
### 17. `_normalize_title` 字符范围偏窄
|
||||
|
||||
**文件**:`deduplicator.py:23`
|
||||
|
||||
```python
|
||||
title = re.sub(r"[^\w一-鿿]", " ", title)
|
||||
```
|
||||
|
||||
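- Python 3 的 `str` 正则中 `\w` 默认按 Unicode 匹配,本身已包含中文等文字字符,显式追加的 CJK 区间是冗余的,逻辑可接受。
|
||||
- `鿿` 是 U+9FFF,显式区间只覆盖 CJK 基本区(U+4E00–U+9FFF),不含扩展 A 区(U+3400–U+4DBF)和扩展 B 区(U+20000–U+2A6DF)等生僻字;**若今后去掉 `\w` 或改用 ASCII 模式,这些字符会被误删**。可显式补充这些区间,或改用 Python `regex` 库的 `\p{Han}`。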
|
||||
|
||||
---
|
||||
|
||||
### 18. Docker 构建镜像源硬编码
|
||||
|
||||
**文件**:`Dockerfile:5,20`
|
||||
|
||||
- `npmmirror.com` 镜像在国内可用,海外构建会慢或超时。
|
||||
- `tuna.tsinghua.edu.cn` 同上。
|
||||
|
||||
**建议**:用 `ARG REGISTRY_MIRROR=...` + `--build-arg` 注入,或在 CI/海外构建时覆盖。
|
||||
|
||||
---
|
||||
|
||||
### 19. LLM 客户端无 token 计数 / 限流
|
||||
|
||||
`ai_client.py` 每次失败抛异常就完事。`fetch_and_summarize`(`summarizer.py:139-143`)对每篇文章都重试,没有:
|
||||
- 失败后 cooldown。
|
||||
- Token 用量统计。
|
||||
- 限速(OpenAI tier 限流会导致 429)。
|
||||
|
||||
**建议**:加 `tenacity` 做指数退避、记录 429 重试、保存 token 消耗日志。
|
||||
|
||||
---
|
||||
|
||||
### 20. `_get_env_default` 强转字符串丢失类型
|
||||
|
||||
**文件**:`settings_manager.py:36-39`
|
||||
|
||||
```python
|
||||
return str(value) if value is not None else ""
|
||||
```
|
||||
|
||||
`OPENAI_TIMEOUT=60` 写入数据库变成 `"60"`,再 `apply_db_settings_to_config` 里 `int(db_value)` 还原——逻辑 OK,**但**如果用户直接编辑 DB 写入非数字字符串,启动时 `apply_db_settings_to_config` 会捕获失败(`logger.warning` 不会中断),**线上的 `settings.OPENAI_TIMEOUT` 仍是默认值**,行为不可见。
|
||||
|
||||
**建议**:失败时启动失败或返回 HTTP 503 明确告知。
|
||||
|
||||
---
|
||||
|
||||
## 🟢 轻量问题(可优化)
|
||||
|
||||
### 21. 前端无错误边界
|
||||
|
||||
`App.vue` 没 `errorCaptured`,任一视图抛错都白屏。
|
||||
|
||||
### 22. 测试覆盖度不足
|
||||
|
||||
- `test_deduplicator.py` 测了单簇简单情况,但未覆盖:
|
||||
- 跨日期去重
|
||||
- URL 重复但内容不同
|
||||
- 大簇(>5 篇)
|
||||
- `deduplicate_articles` 中 `old_groups` 清空逻辑(**这是严重 bug**)
|
||||
- `test_scorer.py` 没测 `_freshness_score`。
|
||||
- 没有 `test_taxonomy.py`、`test_summarizer.py`、`test_brief.py`、`test_settings_manager.py`。
|
||||
- 没有 HTTP 接口测试(`fastapi.testclient`)。
|
||||
|
||||
### 23. 日志可观测性
|
||||
|
||||
仅 `logging.basicConfig` 文本格式,**没有 request_id、没有结构化字段**。多 worker 时难以追踪。
|
||||
|
||||
### 24. `config.py:60` 路径创建副作用
|
||||
|
||||
`@property database_path` 在 `Settings()` 实例化时 `mkdir`,导入 `config` 就改文件系统。**测试或 CLI 工具 import 该模块就会创建目录**。
|
||||
|
||||
**建议**:把目录创建放到 `database.init_db()` 里。
|
||||
|
||||
### 25. `feed_category` 字段名耦合假设
|
||||
|
||||
**文件**:`summarizer.py:96`
|
||||
|
||||
假设 rssKeeper 返回字段 `category`,但 README 没写明 rssKeeper 接口契约。应加注释或 Pydantic 模型校验。
|
||||
|
||||
### 26. 简报输出目录嵌套过深
|
||||
|
||||
**文件**:`brief.py:130`
|
||||
|
||||
写到 `BRIEF_OUTPUT_DIR/2024-01-01/daily-brief.md`,日期子目录无必要。
|
||||
|
||||
### 27. 静态文件兜底逻辑奇怪
|
||||
|
||||
**文件**:`main.py:330-338`
|
||||
|
||||
```python
|
||||
if not os.path.isdir(static_dir):
|
||||
frontend_dist = os.path.join(os.path.dirname(__file__), "frontend", "dist")
|
||||
if os.path.isdir(frontend_dist):
|
||||
static_dir = frontend_dist
|
||||
```
|
||||
|
||||
- 本地开发用 `npm run dev` 走 Vite 代理,**`frontend/dist` 几乎不存在**,这段代码不工作。
|
||||
- `app.mount("/", ...)` 会接管所有未被更早注册的路由匹配到的路径。路由按注册顺序匹配,只要 `app.mount` 位于 `/health` 和 `/api/*` 之后(当前放在最末),这些接口就不会被拦截;但仍建议在使用 `html=True` 做静态文件 fallback 时显式跳过 `/api` 与 `/health`。
|
||||
|
||||
### 28. README 写"重启后生效"但接口无重启能力
|
||||
|
||||
- `main.py:282` 写 "配置已保存,重启服务后生效"。
|
||||
- 调度间隔是**启动时读取**的(`scheduler.py:97-100`),所以改 `SUMMARIZE_INTERVAL_MINUTES` 真的需要重启。
|
||||
- 应当提供 `POST /api/restart` 或在 `apply_db_settings_to_config` 之后重新注册 job。
|
||||
|
||||
### 29. `models.py:32` `default=list` 是可变默认值陷阱
|
||||
|
||||
SQLAlchemy 会克隆 default callable,但**仍建议写成 `default=lambda: list()`** 或在 Python 3.11+ 改用不可变 sentinel。
|
||||
|
||||
### 30. 前端无 TypeScript
|
||||
|
||||
所有 API 调用都没有类型提示,重构后端响应字段前端不会报错。建议至少加 jsdoc 或逐步迁移到 TS。
|
||||
|
||||
---
|
||||
|
||||
## 重点修复清单(按 ROI 排序)
|
||||
|
||||
| 优先级 | 修复项 | 估计工时 | 风险等级 |
|
||||
|--------|--------|----------|----------|
|
||||
| P0 | 加最小化鉴权(BasicAuth 或 token) | 1h | 高 |
|
||||
| P0 | 修复去重 `old_groups` 清空范围 | 30min | 高 |
|
||||
| P0 | CORS 收敛到生产域名 | 10min | 高 |
|
||||
| P0 | Dockerfile 加 `USER` | 5min | 高 |
|
||||
| P1 | 修复分页 total 逻辑(后端 + 前端) | 2h | 中 |
|
||||
| P1 | 加任务运行日志表 | 3h | 中 |
|
||||
| P1 | 手动 / 定时任务互斥锁 | 1h | 中 |
|
||||
| P1 | 修复 `compute_duplication_score` 公式 | 15min | 中 |
|
||||
| P1 | 前端批量保存配置 | 30min | 中 |
|
||||
| P2 | 引入 Alembic | 4h | 中 |
|
||||
| P2 | 去重算法优化(桶聚类 / minhash) | 1d | 中 |
|
||||
| P2 | 统一时区到 UTC | 1h | 低 |
|
||||
| P2 | LLM 限流 + token 统计 | 4h | 低 |
|
||||
| P3 | 前端错误边界 + TypeScript | 1d | 低 |
|
||||
|
||||
---
|
||||
|
||||
## 总评
|
||||
|
||||
**项目优点**:
|
||||
- 模块切分清晰(`app/` 下每个职责一个文件)。
|
||||
- 关键业务逻辑都有单元测试基础。
|
||||
- 配置双层(env + DB)设计合理。
|
||||
- 日志、错误信息友好。
|
||||
- Docker 部署文档完整。
|
||||
|
||||
**主要风险**:
|
||||
- **鉴权 + CORS** 双重缺失 → 任何公网访问都是灾难。
|
||||
- **去重任务数据破坏** → 每日 8:00 简报会持续错误。
|
||||
- **去重算法性能** → 数据量上来后 O(n²) 不可持续。
|
||||
|
||||
**建议路径**:
|
||||
1. **第一步**:修复 P0 安全 / 数据正确性问题(鉴权、CORS、去重 bug、Dockerfile)。
|
||||
2. **第二步**:补全可观测性(任务运行日志、token 统计、失败告警)。
|
||||
3. **第三步**:性能优化(去重算法、分页、并发锁、LLM 限流)。
|
||||
4. **持续改进**:迁移到 TypeScript、引入 Alembic、统一时区、补全测试覆盖。
|
||||
|
||||
---
|
||||
|
||||
## 附录:文件清单
|
||||
|
||||
| 文件 | 行数 | 状态 |
|
||||
|------|------|------|
|
||||
| `main.py` | 343 | 需修复(CORS、分页响应、锁、Auth) |
|
||||
| `config.py` | 63 | 可优化(路径创建副作用) |
|
||||
| `database.py` | 36 | 建议(Alembic 迁移) |
|
||||
| `models.py` | 104 | 可优化(JSON 默认值、UTC) |
|
||||
| `scheduler.py` | 151 | 需修复(异常吞掉、时区、互斥) |
|
||||
| `app/rss_client.py` | 104 | 正常 |
|
||||
| `app/ai_client.py` | 92 | 建议(限流、重试) |
|
||||
| `app/taxonomy.py` | 140 | 正常 |
|
||||
| `app/summarizer.py` | 154 | 可优化(提交边界、重试) |
|
||||
| `app/tagger.py` | 116 | 正常 |
|
||||
| `app/scorer.py` | 146 | 需修复(duplication 公式、时区) |
|
||||
| `app/deduplicator.py` | 216 | 需修复(清空范围、性能) |
|
||||
| `app/brief.py` | 168 | 可优化(时区、目录嵌套) |
|
||||
| `app/settings_manager.py` | 185 | 需修复(类型校验失败处理) |
|
||||
| `tests/conftest.py` | 21 | 正常 |
|
||||
| `tests/test_deduplicator.py` | 78 | 覆盖不足 |
|
||||
| `tests/test_scorer.py` | 46 | 覆盖不足 |
|
||||
| `tests/test_tagger.py` | 43 | 覆盖不足 |
|
||||
| `Dockerfile` | 27 | 需修复(USER) |
|
||||
| `docker-compose.yml` | 19 | 正常 |
|
||||
| `frontend/src/api/index.js` | 47 | 正常 |
|
||||
| `frontend/src/views/*.vue` | - | 需修复(分页、批量保存、错误边界) |
|
||||
+38
@@ -0,0 +1,38 @@
|
||||
# Stage 1: 构建前端
|
||||
FROM node:20-alpine AS frontend-builder
|
||||
|
||||
ARG NPM_REGISTRY=https://registry.npmmirror.com
|
||||
|
||||
WORKDIR /app/frontend
|
||||
COPY frontend/package*.json ./
|
||||
RUN npm install --registry=${NPM_REGISTRY}
|
||||
COPY frontend/ .
|
||||
RUN npm run build
|
||||
|
||||
# Stage 2: Python 后端
|
||||
FROM python:3.12-slim
|
||||
|
||||
ARG PIP_INDEX=https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# 安装构建依赖(部分 Python 包可能需要),并创建非 root 用户
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
gcc \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& useradd --create-home --uid 1000 app
|
||||
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt -i ${PIP_INDEX}
|
||||
|
||||
COPY . .
|
||||
COPY --from=frontend-builder /app/frontend/dist ./static
|
||||
|
||||
# 确保数据目录对 app 用户可写
|
||||
RUN mkdir -p /app/data && chown -R app:app /app/data
|
||||
|
||||
USER app
|
||||
|
||||
EXPOSE 7331
|
||||
|
||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7331", "--workers", "1"]
|
||||
@@ -0,0 +1,197 @@
|
||||
# dataClean
|
||||
|
||||
`dataClean` 是 `rssKeeper` 的下游数据清洗与加工服务,负责:
|
||||
|
||||
- 为无摘要或摘要过短的 RSS 文章生成 **AI 摘要**
|
||||
- 维护本地 **分类/标签/打分规则表**,初始由 AI 生成,后续按规则执行
|
||||
- 对文章自动 **分类、打标签**
|
||||
- 计算三维度分数:**热度、重要性、多源重复性**
|
||||
- 基于 URL 和 **内容相似度去重**
|
||||
- 生成每日简报(**Markdown 文件 + 结构化 JSON/API**)
|
||||
- 提供 **Web UI** 可视化展示结果并管理配置
|
||||
|
||||
## 技术栈
|
||||
|
||||
- 后端:Python 3.12 + FastAPI + SQLAlchemy 2.0 + SQLite + APScheduler
|
||||
- 前端:Vue 3.4 + Element Plus 2.6 + Vite 5 + Axios
|
||||
- AI:OpenAI API 兼容客户端
|
||||
- 去重:scikit-learn(TF-IDF 相似度)
|
||||
|
||||
## 快速开始
|
||||
|
||||
### 1. 环境配置
|
||||
|
||||
复制示例配置并修改:
|
||||
|
||||
```bash
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
编辑 `.env`,至少配置:
|
||||
|
||||
```bash
|
||||
RSSKEEPER_BASE_URL=http://localhost:7329
|
||||
OPENAI_API_KEY=sk-xxx
|
||||
OPENAI_BASE_URL=https://api.openai.com/v1
|
||||
OPENAI_MODEL=gpt-4o-mini
|
||||
|
||||
# 生产环境务必设置 API Token,Web UI 右上角可输入该 Token 后调用受保护接口
|
||||
API_TOKEN=your-strong-token-here
|
||||
# CORS 允许来源,逗号分隔;生产环境请填写具体域名
|
||||
CORS_ALLOWED_ORIGINS=https://dataclean.example.com
|
||||
```
|
||||
|
||||
### 2. Docker 运行(推荐)
|
||||
|
||||
```bash
|
||||
docker-compose up -d --build
|
||||
```
|
||||
|
||||
服务将运行在 `http://localhost:7331`,Web UI 直接通过该地址访问。
|
||||
|
||||
### 3. 本地开发
|
||||
|
||||
启动后端:
|
||||
|
||||
```bash
|
||||
python -m venv .venv
|
||||
source .venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
uvicorn main:app --reload --port 7331
|
||||
```
|
||||
|
||||
启动前端(新终端):
|
||||
|
||||
```bash
|
||||
cd frontend
|
||||
npm install
|
||||
npm run dev
|
||||
```
|
||||
|
||||
前端开发服务器运行在 `http://localhost:7332`,代理到后端 `http://localhost:7331`。
|
||||
|
||||
## 核心流程
|
||||
|
||||
服务启动后:
|
||||
|
||||
1. 自动初始化 SQLite 数据库。
|
||||
2. 若 `app_settings` 表为空,使用 `.env` 中的值初始化默认配置。
|
||||
3. 若 `taxonomy` 表为空,调用 LLM 生成分类/标签/打分规则(仅一次)。
|
||||
4. 启动定时任务:
|
||||
- **摘要任务**:每 `SUMMARIZE_INTERVAL_MINUTES` 分钟(默认 60)拉取最近文章并补充 AI 摘要。
|
||||
- **分类/打分/去重任务**:每 `TAG_SCORE_INTERVAL_MINUTES` 分钟(默认 1440,即 24 小时)执行。
|
||||
- **每日简报**:每天 `08:00` 生成昨日/当日简报。
|
||||
|
||||
## Web UI 功能
|
||||
|
||||
| 页面 | 功能 |
|
||||
|------|------|
|
||||
| 仪表盘 | 统计卡片、分类分布、最近简报、定时任务下次执行时间 |
|
||||
| 文章列表 | 搜索、分类/标签筛选、代表文章过滤、分页、综合分排序 |
|
||||
| 文章详情 | AI 摘要、标签分类、热度/重要性/重复度/综合分、原文链接 |
|
||||
| 每日简报 | 简报列表、按分类聚合展示、重新生成 |
|
||||
| 分类体系 | 分类/标签/打分规则查看、手动触发 AI 重新生成 |
|
||||
| 任务管理 | 手动触发摘要/分类/去重/简报任务 |
|
||||
| 系统配置 | 查看和修改所有配置项,保存到数据库,重启后生效 |
|
||||
|
||||
## API 接口
|
||||
|
||||
| 接口 | 说明 |
|
||||
|------|------|
|
||||
| `GET /health` | 健康检查 |
|
||||
| `GET /api/articles` | 查询加工后文章(返回 `{items, total}`) |
|
||||
| `GET /api/articles/{id}` | 单篇详情 |
|
||||
| `GET /api/briefs` | 简报列表 |
|
||||
| `GET /api/briefs/{date}` | 指定日期简报(YYYY-MM-DD) |
|
||||
| `POST /api/briefs/{date}/regenerate` | 手动重新生成简报(需 Token) |
|
||||
| `GET /api/taxonomy` | 分类/标签/规则列表 |
|
||||
| `POST /api/taxonomy/bootstrap?force=true` | 手动触发/重置分类体系(需 Token) |
|
||||
| `POST /api/tasks/summarize` | 手动触发摘要任务(需 Token,互斥锁) |
|
||||
| `POST /api/tasks/tag-score-dedup` | 手动触发分类/去重/打分任务(需 Token,互斥锁) |
|
||||
| `POST /api/tasks/brief` | 手动触发简报生成任务(需 Token,互斥锁) |
|
||||
| `GET /api/settings` | 获取所有可编辑配置(需 Token) |
|
||||
| `PUT /api/settings/{key}` | 更新单个配置(需 Token) |
|
||||
| `PUT /api/settings` | 批量更新配置(需 Token) |
|
||||
| `POST /api/settings/reset` | 重置为 `.env` 默认值(需 Token) |
|
||||
| `GET /api/stats` | 仪表盘统计数据 |
|
||||
|
||||
## 目录结构
|
||||
|
||||
```
|
||||
dataClean/
|
||||
├── main.py # FastAPI 入口
|
||||
├── config.py # 环境变量配置
|
||||
├── database.py # SQLite 连接
|
||||
├── models.py # SQLAlchemy 模型
|
||||
├── scheduler.py # APScheduler 定时任务
|
||||
├── Dockerfile # 多阶段构建(含前端)
|
||||
├── docker-compose.yml
|
||||
├── requirements.txt
|
||||
├── .env.example
|
||||
├── README.md
|
||||
├── app/ # 后端业务模块
|
||||
│ ├── rss_client.py
|
||||
│ ├── ai_client.py
|
||||
│ ├── taxonomy.py
|
||||
│ ├── summarizer.py
|
||||
│ ├── tagger.py
|
||||
│ ├── scorer.py
|
||||
│ ├── deduplicator.py
|
||||
│ ├── brief.py
|
||||
│ └── settings_manager.py
|
||||
├── tests/ # 后端测试
|
||||
└── frontend/ # Vue 3 Web UI
|
||||
├── package.json
|
||||
├── vite.config.js
|
||||
├── index.html
|
||||
└── src/
|
||||
├── main.js
|
||||
├── App.vue
|
||||
├── router/
|
||||
├── api/
|
||||
├── style.css
|
||||
└── views/
|
||||
```
|
||||
|
||||
## 安全说明
|
||||
|
||||
- **API Token**:生产环境请务必设置 `API_TOKEN`。所有写入类接口(修改配置、触发任务、重新生成分类/简报)都需要在请求头携带 `Authorization: Bearer <token>`。Web UI 右上角提供 Token 输入框。
|
||||
- **CORS**:默认不启用跨域 credentials。生产环境请通过 `CORS_ALLOWED_ORIGINS` 设置具体域名,避免 `*` + `allow_credentials=True` 的安全风险。
|
||||
- **容器权限**:Dockerfile 已使用非 root 用户 `app`(uid=1000)运行服务。
|
||||
|
||||
## 与 rssKeeper 的关系
|
||||
|
||||
- dataClean **只读调用** rssKeeper 的外部 API(`/api/v1/external/*`)。
|
||||
- 所有加工结果(AI 摘要、标签、分数、去重组、简报)存储在 dataClean 本地 SQLite 中。
|
||||
- 不回写 rssKeeper,避免耦合。
|
||||
|
||||
## 配置说明
|
||||
|
||||
配置分两层:
|
||||
|
||||
1. **环境变量(`.env`)**:首次启动时的默认值,Docker 运行时使用。
|
||||
2. **数据库配置(`app_settings` 表)**:通过 Web UI 修改后保存到这里,重启服务后生效。
|
||||
|
||||
详见 `.env.example`。关键配置:
|
||||
|
||||
| 变量 | 默认值 | 说明 |
|
||||
|------|--------|------|
|
||||
| `RSSKEEPER_BASE_URL` | `http://localhost:7329` | rssKeeper 服务地址 |
|
||||
| `OPENAI_API_KEY` | - | LLM API Key |
|
||||
| `OPENAI_MODEL` | `gpt-4o-mini` | 模型名 |
|
||||
| `SUMMARIZE_INTERVAL_MINUTES` | 60 | 摘要任务间隔 |
|
||||
| `TAG_SCORE_INTERVAL_MINUTES` | 1440 | 分类/打分/去重任务间隔 |
|
||||
| `DAILY_BRIEF_HOUR` / `MINUTE` | 8 / 0 | 简报生成时间 |
|
||||
| `TITLE_SIMILARITY_THRESHOLD` | 0.85 | 标题相似度阈值 |
|
||||
| `CONTENT_SIMILARITY_THRESHOLD` | 0.80 | 内容相似度阈值 |
|
||||
| `API_TOKEN` | - | API 鉴权 Token(为空不启用鉴权) |
|
||||
| `CORS_ALLOWED_ORIGINS` | - | CORS 允许来源,逗号分隔 |
|
||||
|
||||
## 后续扩展
|
||||
|
||||
- 接入 rssKeeper 前端展示 enriched 数据
|
||||
- 支持多语言摘要
|
||||
- 接入向量数据库做语义检索
|
||||
- 根据用户反馈调整 taxonomy 规则
|
||||
- 引入 Alembic 数据库迁移
|
||||
- Web UI 迁移到 TypeScript
|
||||
@@ -0,0 +1,92 @@
|
||||
"""LLM API 客户端,兼容 OpenAI API 格式"""
|
||||
import json
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from openai import OpenAI, APIError
|
||||
|
||||
from config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AIClient:
    """Wrapper around an OpenAI-compatible chat completions API.

    Constructor arguments left unset fall back to the matching values in
    ``settings``. The underlying ``OpenAI`` client is created lazily on first
    access of :attr:`client`.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        base_url: Optional[str] = None,
        model: Optional[str] = None,
        timeout: Optional[int] = None,
        max_retries: Optional[int] = None,
    ):
        """Store connection parameters, defaulting each one from ``settings``.

        Args:
            api_key: API key; falls back to ``settings.OPENAI_API_KEY``.
            base_url: API base URL; falls back to ``settings.OPENAI_BASE_URL``.
            model: Model name; falls back to ``settings.OPENAI_MODEL``.
            timeout: Request timeout; falls back to ``settings.OPENAI_TIMEOUT``.
            max_retries: Retry count handed to the ``OpenAI`` client. ``0`` is
                honoured (it is not replaced by the configured default).
        """
        self.api_key = api_key or settings.OPENAI_API_KEY
        self.base_url = base_url or settings.OPENAI_BASE_URL
        self.model = model or settings.OPENAI_MODEL
        self.timeout = timeout or settings.OPENAI_TIMEOUT
        # Compare against None instead of relying on truthiness: 0 is a
        # legitimate value here and must not fall back to the default.
        self.max_retries = (
            max_retries if max_retries is not None else settings.OPENAI_MAX_RETRIES
        )

        # Created on demand by the ``client`` property.
        self._client: Optional[OpenAI] = None

    @property
    def client(self) -> OpenAI:
        """Return the ``OpenAI`` client, building it on first use."""
        if self._client is None:
            self._client = OpenAI(
                api_key=self.api_key,
                base_url=self.base_url,
                timeout=self.timeout,
                max_retries=self.max_retries,
            )
        return self._client

    def chat_completion(
        self,
        system_prompt: str,
        user_prompt: str,
        temperature: float = 0.3,
        json_mode: bool = False,
    ) -> str:
        """Send one system + user prompt pair and return the reply text.

        Args:
            system_prompt: Content of the ``system`` message.
            user_prompt: Content of the ``user`` message.
            temperature: Sampling temperature passed to the API.
            json_mode: When true, request ``response_format`` of type
                ``json_object``.

        Returns:
            The first choice's message content, stripped of surrounding
            whitespace; an empty string when the content is empty or ``None``.

        Raises:
            APIError: Logged and re-raised when the API call fails.
        """
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]

        kwargs = {
            "model": self.model,
            "messages": messages,
            "temperature": temperature,
        }
        if json_mode:
            kwargs["response_format"] = {"type": "json_object"}

        try:
            resp = self.client.chat.completions.create(**kwargs)
            content = resp.choices[0].message.content or ""
            return content.strip()
        except APIError as exc:
            logger.error("LLM API 调用失败: %s", exc)
            raise

    def chat_completion_json(
        self,
        system_prompt: str,
        user_prompt: str,
        temperature: float = 0.3,
    ) -> dict:
        """Call the LLM in JSON mode and parse the reply.

        Args:
            system_prompt: Content of the ``system`` message.
            user_prompt: Content of the ``user`` message.
            temperature: Sampling temperature passed to the API.

        Returns:
            The value produced by ``json.loads`` on the reply text.

        Raises:
            APIError: Propagated from :meth:`chat_completion`.
            json.JSONDecodeError: Logged (with the first 500 characters of the
                reply) and re-raised when the reply is not valid JSON.
        """
        content = self.chat_completion(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            temperature=temperature,
            json_mode=True,
        )
        try:
            return json.loads(content)
        except json.JSONDecodeError as exc:
            logger.error("LLM 返回不是合法 JSON: %s - content=%s", exc, content[:500])
            raise
|
||||
|
||||
|
||||
# Shared module-level client; built with no arguments, so every connection
# parameter comes from `settings`.
ai_client = AIClient()
|
||||
+168
@@ -0,0 +1,168 @@
|
||||
"""每日简报生成"""
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, List
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from config import settings
|
||||
from models import EnrichedArticle, DailyBrief
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _format_article(article: EnrichedArticle) -> Dict[str, Any]:
    """Turn an article into the plain dict used for one brief entry.

    Text fields default to an empty string, ``tags`` defaults to an empty
    list, scores are copied as-is, and ``published_at`` is rendered in ISO
    format (``None`` when missing).
    """
    entry: Dict[str, Any] = {
        "id": article.id,
        "rk_article_id": article.rk_article_id,
    }

    # Plain text columns: fall back to "" when the value is empty or None.
    for text_field in ("title", "link", "author", "feed_title"):
        entry[text_field] = getattr(article, text_field) or ""

    # Prefer the AI summary, then the original summary.
    entry["summary"] = article.ai_summary or article.original_summary or ""
    entry["tags"] = article.tags or []

    for score_field in (
        "heat_score",
        "importance_score",
        "duplication_score",
        "composite_score",
    ):
        entry[score_field] = getattr(article, score_field)

    published = article.published_at
    entry["published_at"] = published.isoformat() if published else None
    return entry
|
||||
|
||||
|
||||
def _build_markdown(date_str: str, by_category: Dict[str, List[Dict[str, Any]]], stats: Dict[str, int]) -> str:
|
||||
"""生成 Markdown 简报"""
|
||||
lines = [
|
||||
f"# RSS 每日简报 ({date_str})",
|
||||
"",
|
||||
f"- 去重前文章数: {stats['total_articles']}",
|
||||
f"- 去重后文章数: {stats['unique_articles']}",
|
||||
f"- 生成分类数: {len(by_category)}",
|
||||
"",
|
||||
"---",
|
||||
"",
|
||||
]
|
||||
|
||||
for category, items in sorted(by_category.items(), key=lambda x: x[0]):
|
||||
lines.append(f"## {category}")
|
||||
lines.append("")
|
||||
for item in items:
|
||||
tags = " ".join([f"`{t}`" for t in item["tags"]]) if item["tags"] else ""
|
||||
lines.append(f"### {item['title']}")
|
||||
lines.append(f"- 来源: {item['feed_title']} | 作者: {item.get('author') or '未知'}")
|
||||
lines.append(f"- 标签: {tags}")
|
||||
lines.append(f"- 热度: {item['heat_score']:.1f} | 重要性: {item['importance_score']:.1f} | 重复度: {item['duplication_score']:.1f} | 综合: {item['composite_score']:.1f}")
|
||||
if item["summary"]:
|
||||
lines.append(f"- 摘要: {item['summary']}")
|
||||
if item["link"]:
|
||||
lines.append(f"- [阅读原文]({item['link']})")
|
||||
lines.append("")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def generate_daily_brief(db: Session, date_str: str = None, force: bool = False) -> Dict[str, Any]:
    """Generate (or reuse) the daily brief for one date.

    Args:
        db: Database session used for all queries and the final commit.
        date_str: Target date as ``YYYY-MM-DD``; when ``None`` the current
            UTC date is used.
        force: When true, regenerate even if a brief for the date exists.

    Returns:
        A dict describing the brief. When an existing brief is reused it
        holds ``date``, ``total_articles``, ``unique_articles`` and
        ``markdown_path``; a freshly generated brief additionally holds
        ``by_category``.
    """
    if date_str is None:
        date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    # Reuse the stored brief unless regeneration was requested.
    existing = db.query(DailyBrief).filter(DailyBrief.brief_date == date_str).first()
    if existing and not force:
        logger.info("日期 %s 简报已存在,跳过生成", date_str)
        return {
            "date": date_str,
            "total_articles": existing.total_articles,
            "unique_articles": existing.unique_articles,
            "markdown_path": existing.markdown_path,
        }

    # Half-open window [window_start, window_end) covering the target date.
    # NOTE(review): both bounds are naive datetimes; presumably `fetched_at`
    # is stored in a comparable form - confirm against the model.
    window_start = datetime.strptime(date_str, "%Y-%m-%d")
    window_end = window_start + timedelta(days=1)
    day_query = db.query(EnrichedArticle).filter(
        EnrichedArticle.fetched_at >= window_start,
        EnrichedArticle.fetched_at < window_end,
    )

    # Keep group representatives plus articles that belong to no group.
    keep_condition = (
        (EnrichedArticle.is_representative == True)
        | (EnrichedArticle.duplicate_group_id == None)
    )
    representative_articles = (
        day_query.filter(keep_condition)
        .order_by(EnrichedArticle.composite_score.desc())
        .all()
    )

    # Bucket by category; the score ordering from the query is preserved
    # inside each bucket.
    grouped: Dict[str, List[Dict[str, Any]]] = {}
    for article in representative_articles:
        grouped.setdefault(article.category or "未分类", []).append(_format_article(article))

    # Only the first N entries of every category make it into the brief.
    limit = settings.BRIEF_TOP_N_PER_CATEGORY
    by_category = {name: entries[:limit] for name, entries in grouped.items()}

    stats = {
        "total_articles": day_query.count(),
        "unique_articles": sum(map(len, by_category.values())),
    }

    # Write the Markdown file under <brief output dir>/<date>/daily-brief.md.
    brief_dir = settings.brief_output_dir_path / date_str
    brief_dir.mkdir(parents=True, exist_ok=True)
    markdown_path = brief_dir / "daily-brief.md"
    markdown_path.write_text(_build_markdown(date_str, by_category, stats), encoding="utf-8")

    # Stamp every selected article (not only the top-N ones) with the date.
    for article in representative_articles:
        article.brief_date = date_str

    stored_fields = {
        "total_articles": stats["total_articles"],
        "unique_articles": stats["unique_articles"],
        "by_category": by_category,
        "markdown_path": str(markdown_path),
    }
    brief_data = {"date": date_str, **stored_fields}

    if existing:
        # Forced regeneration: overwrite the stored row in place.
        for field_name, field_value in stored_fields.items():
            setattr(existing, field_name, field_value)
        existing.updated_at = datetime.now(timezone.utc)
    else:
        db.add(DailyBrief(brief_date=date_str, **stored_fields))

    db.commit()
    logger.info("简报生成完成: 日期=%s, 去重前=%d, 去重后=%d", date_str, stats["total_articles"], stats["unique_articles"])
    return brief_data
|
||||
@@ -0,0 +1,223 @@
|
||||
"""文章去重:URL 精确去重 + 标题/内容相似度去重"""
|
||||
import logging
|
||||
import re
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from difflib import SequenceMatcher
|
||||
from typing import List, Dict, Tuple, Set
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
import numpy as np
|
||||
|
||||
from config import settings
|
||||
from models import EnrichedArticle, DuplicateGroup
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _normalize_title(title: str) -> str:
    """Return a comparison-friendly form of ``title``.

    Characters that are neither word characters nor CJK ideographs are
    replaced by a space, whitespace runs are collapsed to single spaces and
    the result is lower-cased. A falsy title yields an empty string.
    """
    if title:
        # Keep word characters and CJK unified ideographs (incl. extensions).
        cleaned = re.sub(
            r"[^\w一-鿿㐀-䶿\U00020000-\U0002a6df\U0002a700-\U0002b73f\U0002b740-\U0002b81f]",
            " ",
            title,
        )
        return " ".join(cleaned.split()).lower()
    return ""
|
||||
|
||||
|
||||
def _title_similarity(a: str, b: str) -> float:
    """Return the SequenceMatcher ratio of the two normalized titles.

    The result is 0.0 when either title normalizes to an empty string.
    """
    left, right = _normalize_title(a), _normalize_title(b)
    if left and right:
        return SequenceMatcher(None, left, right).ratio()
    return 0.0
|
||||
|
||||
|
||||
def _content_similarity_matrix(contents: List[str]) -> np.ndarray:
    """Return the pairwise TF-IDF cosine-similarity matrix of ``contents``.

    With fewer than two documents, or when vectorization fails for any
    reason, an all-zero square matrix of matching size is returned instead.
    """
    size = len(contents)
    if size < 2:
        return np.zeros((size, size))

    # Falsy entries (None / "") are fed to the vectorizer as empty strings.
    documents = [text if text else "" for text in contents]
    try:
        tfidf_matrix = TfidfVectorizer(
            max_features=5000,
            stop_words="english",
            ngram_range=(1, 2),
            min_df=1,
        ).fit_transform(documents)
        return cosine_similarity(tfidf_matrix)
    except Exception as exc:
        # Best effort: a failed vectorization means "no content match".
        logger.warning("TF-IDF 相似度计算失败: %s", exc)
        return np.zeros((size, size))
|
||||
|
||||
|
||||
def _find_duplicate_clusters(
    articles: List[EnrichedArticle],
    title_threshold: float = None,
    content_threshold: float = None,
) -> List[Set[int]]:
    """Group articles into duplicate clusters by title or content similarity.

    Two articles are linked when their title similarity reaches
    ``title_threshold`` or their content similarity reaches
    ``content_threshold``; clusters are the connected components of that
    relation, found with a breadth-first search.

    Args:
        articles: Articles to compare.
        title_threshold: Minimum title ratio; ``None`` falls back to
            ``settings.TITLE_SIMILARITY_THRESHOLD``.
        content_threshold: Minimum content cosine similarity; ``None`` falls
            back to ``settings.CONTENT_SIMILARITY_THRESHOLD``.

    Returns:
        A list of index sets into ``articles``; only clusters with more than
        one member are returned.
    """
    from collections import deque

    # Compare against None so that an explicit 0.0 threshold is honoured.
    if title_threshold is None:
        title_threshold = settings.TITLE_SIMILARITY_THRESHOLD
    if content_threshold is None:
        content_threshold = settings.CONTENT_SIMILARITY_THRESHOLD

    n = len(articles)
    if n < 2:
        return []

    # Title + summary + body, capped at 2000 characters to keep TF-IDF fast.
    contents = [
        " ".join([
            art.title or "",
            art.ai_summary or art.original_summary or "",
            art.content or "",
        ])[:2000]
        for art in articles
    ]
    content_sim = _content_similarity_matrix(contents)

    visited = [False] * n
    clusters: List[Set[int]] = []

    for i in range(n):
        if visited[i]:
            continue
        cluster = {i}
        queue = deque([i])
        visited[i] = True

        while queue:
            cur = queue.popleft()
            cur_title = articles[cur].title or ""
            for j in range(n):
                # ``cur`` itself is always visited, so this also skips j == cur.
                if visited[j]:
                    continue
                # Cheap matrix lookup first; the title ratio is only computed
                # when the content similarity alone does not qualify.
                if (
                    content_sim[cur][j] >= content_threshold
                    or _title_similarity(cur_title, articles[j].title or "") >= title_threshold
                ):
                    cluster.add(j)
                    queue.append(j)
                    visited[j] = True

        if len(cluster) > 1:
            clusters.append(cluster)

    return clusters
|
||||
|
||||
|
||||
def _pick_representative(articles: List[EnrichedArticle], indices: Set[int]) -> EnrichedArticle:
    """Choose the representative article of a duplicate cluster.

    Preference order: has an AI summary, has a feed category, then the
    earliest ``published_at``. Articles without a publication time rank after
    dated ones. Naive datetimes are treated as UTC so naive and aware values
    can be compared.

    Args:
        articles: The list the cluster indices refer to.
        indices: Indices of the cluster members.

    Returns:
        The preferred article of the cluster.

    Raises:
        IndexError: If ``indices`` is empty.
    """
    latest = datetime.max.replace(tzinfo=timezone.utc)

    def _priority(article):
        published = article.published_at
        if published is not None and published.tzinfo is None:
            published = published.replace(tzinfo=timezone.utc)
        # Tuples sort ascending, so False (= "has it") comes first.
        return (
            not article.ai_summary,
            not article.feed_category,
            published is None,
            published or latest,
        )

    # Sorted indices make the tie-break deterministic (lowest index wins).
    candidates = [articles[i] for i in sorted(indices)]
    candidates.sort(key=_priority)
    return candidates[0]
|
||||
|
||||
|
||||
def deduplicate_articles(
    db: Session,
    date_str: str = None,
    title_threshold: float = None,
    content_threshold: float = None,
) -> Dict[str, int]:
    """Rebuild the duplicate groups for the articles fetched on one day.

    Existing groups recorded for ``date_str`` are removed first, then the
    day's articles are de-duplicated by exact link and clustered by
    title/content similarity. One ``DuplicateGroup`` row is written per
    cluster and its members are flagged accordingly.

    Args:
        db: Database session; it is committed by this function.
        date_str: Day to process as ``YYYY-MM-DD``; defaults to today (UTC).
        title_threshold: Forwarded to ``_find_duplicate_clusters``.
        content_threshold: Forwarded to ``_find_duplicate_clusters``.

    Returns:
        ``{"total": ..., "duplicate_groups": ..., "representatives": ...}``.
    """
    if date_str is None:
        date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    window_start = datetime.strptime(date_str, "%Y-%m-%d")
    window_end = window_start + timedelta(days=1)

    # Only the groups of this date are dropped; other days stay untouched.
    stale_groups = (
        db.query(DuplicateGroup)
        .filter(DuplicateGroup.brief_date == date_str)
        .all()
    )
    for stale in stale_groups:
        for member in stale.articles:
            member.duplicate_group_id = None
            member.is_representative = False
        db.delete(stale)
    db.commit()

    day_articles = (
        db.query(EnrichedArticle)
        .filter(
            EnrichedArticle.fetched_at >= window_start,
            EnrichedArticle.fetched_at < window_end,
        )
        .order_by(EnrichedArticle.published_at)
        .all()
    )
    if not day_articles:
        logger.info("日期 %s 无文章可去重", date_str)
        return {"total": 0, "duplicate_groups": 0, "representatives": 0}

    # Exact-URL pass: keep the first article seen for every non-empty link.
    kept: List[EnrichedArticle] = []
    known_links: set = set()
    url_dup_count = 0
    for candidate in day_articles:
        url = (candidate.link or "").strip()
        if url:
            if url in known_links:
                url_dup_count += 1
                continue
            known_links.add(url)
        kept.append(candidate)

    clusters = _find_duplicate_clusters(
        kept,
        title_threshold=title_threshold,
        content_threshold=content_threshold,
    )

    stats = {
        "total": len(day_articles),
        "duplicate_groups": len(clusters),
        "representatives": 0,
    }

    for members in clusters:
        chosen = _pick_representative(kept, members)
        group = DuplicateGroup(
            representative_article_id=chosen.id,
            member_article_ids=[kept[pos].id for pos in members],
            similarity_matrix={},  # left empty; may be filled in later
            brief_date=date_str,
        )
        db.add(group)
        db.flush()  # populates group.id before members reference it

        for pos in members:
            member = kept[pos]
            member.duplicate_group_id = group.id
            member.is_representative = (member.id == chosen.id)

        stats["representatives"] += 1

    db.commit()
    logger.info(
        "去重完成: 日期=%s, 总文章=%d, 重复组=%d, URL 重复=%d",
        date_str, stats["total"], stats["duplicate_groups"], url_dup_count
    )
    return stats
|
||||
@@ -0,0 +1,104 @@
|
||||
"""调用 rssKeeper 外部 API"""
|
||||
from datetime import datetime, timedelta
|
||||
from typing import List, Optional, Dict, Any
|
||||
import logging
|
||||
|
||||
import requests
|
||||
|
||||
from config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RSSKeeperClient:
    """HTTP client for the rssKeeper external API."""

    def __init__(self, base_url: Optional[str] = None, timeout: int = 30):
        """Remember the base URL (trailing slashes removed) and the timeout.

        A falsy ``base_url`` falls back to ``settings.RSSKEEPER_BASE_URL``.
        """
        root = base_url or settings.RSSKEEPER_BASE_URL
        self.base_url = root.rstrip("/")
        self.timeout = timeout

    @staticmethod
    def _without_none(**candidates: Any) -> Dict[str, Any]:
        """Return the keyword arguments that are not ``None``, order kept."""
        return {key: value for key, value in candidates.items() if value is not None}

    def _get(self, path: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """GET ``base_url + path`` and return the decoded JSON body.

        Any ``requests.RequestException`` is logged and re-raised.
        """
        endpoint = f"{self.base_url}{path}"
        try:
            response = requests.get(endpoint, params=params, timeout=self.timeout)
            response.raise_for_status()
            return response.json()
        except requests.RequestException as exc:
            logger.error("请求 rssKeeper 失败: %s - %s", endpoint, exc)
            raise

    def fetch_recent(
        self,
        hours: int = 24,
        limit: int = 200,
        feed_id: Optional[int] = None,
        category: Optional[str] = None,
        search: Optional[str] = None,
        unread_only: bool = False,
    ) -> List[Dict[str, Any]]:
        """Return the ``articles`` list of the "recent" endpoint.

        Optional filters are only sent when they are not ``None``.
        """
        query: Dict[str, Any] = {
            "hours": hours,
            "limit": limit,
            "unread_only": unread_only,
        }
        query.update(self._without_none(feed_id=feed_id, category=category, search=search))
        payload = self._get("/api/v1/external/recent", params=query)
        return payload.get("articles", [])

    def fetch_by_date(self, date: str, category: Optional[str] = None) -> Dict[str, Any]:
        """Return the "summary" endpoint payload for ``date``."""
        query: Dict[str, Any] = {"date": date}
        query.update(self._without_none(category=category))
        return self._get("/api/v1/external/summary", params=query)

    def fetch_feeds(
        self,
        health_status: Optional[str] = None,
        category: Optional[str] = None,
        error_type: Optional[str] = None,
        is_active: Optional[bool] = True,
    ) -> List[Dict[str, Any]]:
        """Return the ``feeds`` list of the "feeds" endpoint.

        Every filter, including ``is_active``, is omitted when ``None``.
        """
        query = self._without_none(
            health_status=health_status,
            category=category,
            error_type=error_type,
            is_active=is_active,
        )
        payload = self._get("/api/v1/external/feeds", params=query)
        return payload.get("feeds", [])

    def fulltext_search(
        self,
        q: str,
        limit: int = 50,
        offset: int = 0,
        category: Optional[str] = None,
        feed_id: Optional[int] = None,
    ) -> Dict[str, Any]:
        """Return the "search" endpoint payload for query ``q``."""
        query: Dict[str, Any] = {"q": q, "limit": limit, "offset": offset}
        query.update(self._without_none(category=category, feed_id=feed_id))
        return self._get("/api/v1/external/search", params=query)
|
||||
|
||||
|
||||
rss_client = RSSKeeperClient()
|
||||
+147
@@ -0,0 +1,147 @@
|
||||
"""基于规则计算文章热度、重要性、重复性分数"""
|
||||
import logging
|
||||
import math
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import List
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from config import settings
|
||||
from models import EnrichedArticle, Taxonomy
|
||||
from app.tagger import _count_matches, _normalize
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Composite-score weights: heat 30%, importance 50%, duplication 20%.
COMPOSITE_WEIGHT_HEAT = 0.3
COMPOSITE_WEIGHT_IMPORTANCE = 0.5
COMPOSITE_WEIGHT_DUPLICATION = 0.2
|
||||
|
||||
|
||||
def _build_text(article: EnrichedArticle) -> str:
    """Return the text used for scoring: title, summary and body.

    The summary is the AI summary when present, otherwise the original one.
    Missing parts contribute an empty string; parts are joined by one space.
    """
    heading = article.title or ""
    summary = article.ai_summary or article.original_summary or ""
    body = article.content or ""
    return " ".join((heading, summary, body))
|
||||
|
||||
|
||||
def _score_by_rules(article: EnrichedArticle, rules: List[Taxonomy]) -> float:
    """Return a 0-100 keyword score of ``article`` against ``rules``.

    Each rule with at least one keyword hit adds
    ``min(hits, 5) * rule.weight * 10``; the sum is capped at 100.
    Blank text or an empty rule list scores 0.0.
    """
    text = _build_text(article)
    if not (text.strip() and rules):
        return 0.0

    total = 0.0
    for rule in rules:
        hit_count = _count_matches(text, rule.keywords or [])
        if hit_count > 0:
            # At most five hits per rule are counted.
            total += min(hit_count, 5) * rule.weight * 10
    return min(total, 100.0)
|
||||
|
||||
|
||||
def _freshness_score(article: EnrichedArticle) -> float:
    """Return the freshness bonus (0-20) derived from ``published_at``.

    Articles up to 24 hours old get 20 points, articles 72 hours or older
    get 0, and the bonus decreases linearly in between. A missing
    publication time scores 0; future timestamps count as age zero.
    """
    reference = datetime.now(timezone.utc)
    stamp = article.published_at
    if not stamp:
        return 0.0

    # Naive values are interpreted as UTC.
    if stamp.tzinfo is None:
        stamp = stamp.replace(tzinfo=timezone.utc)

    age_hours = (reference - stamp).total_seconds() / 3600
    if age_hours < 0:
        age_hours = 0

    if age_hours <= 24:
        return 20.0
    if age_hours >= 72:
        return 0.0
    return 20.0 * (1 - (age_hours - 24) / 48)
|
||||
|
||||
|
||||
def compute_heat_score(article: EnrichedArticle, heat_rules: List[Taxonomy]) -> float:
    """Return the heat score: keyword score plus freshness bonus, capped at 100."""
    keyword_part = _score_by_rules(article, heat_rules)
    freshness_part = _freshness_score(article)
    combined = keyword_part + freshness_part
    return min(combined, 100.0)
|
||||
|
||||
|
||||
def compute_importance_score(article: EnrichedArticle, importance_rules: List[Taxonomy]) -> float:
    """Return the importance score: the keyword score against ``importance_rules``."""
    importance = _score_by_rules(article, importance_rules)
    return importance
|
||||
|
||||
|
||||
def compute_duplication_score(duplicate_count: int, max_count: int = 5) -> float:
    """Return a 0-100 score that grows with the number of occurrences.

    A count of 1 or less scores 0; ``max_count`` or more scores 100; values
    in between are interpolated linearly.

    Args:
        duplicate_count: How many times the topic occurred.
        max_count: Count at which the score saturates. Values of 1 or less
            make every count above 1 score 100 (instead of dividing by zero).

    Returns:
        The duplication score in the range [0.0, 100.0].
    """
    if duplicate_count <= 1:
        return 0.0
    if max_count <= 1:
        # No interpolation range left: any repetition is already the maximum.
        return 100.0
    score = (duplicate_count - 1) / (max_count - 1) * 100.0
    return min(score, 100.0)
|
||||
|
||||
|
||||
def compute_composite_score(heat: float, importance: float, duplication: float) -> float:
    """Return the weighted sum of the three scores, rounded to two decimals.

    The weights are the module-level ``COMPOSITE_WEIGHT_*`` constants.
    """
    weighted_total = (
        heat * COMPOSITE_WEIGHT_HEAT
        + importance * COMPOSITE_WEIGHT_IMPORTANCE
        + duplication * COMPOSITE_WEIGHT_DUPLICATION
    )
    return round(weighted_total, 2)
|
||||
|
||||
|
||||
def score_articles(
    db: Session,
    article_ids: List[int] = None,
    update_duplication: bool = False,
) -> int:
    """Compute heat, importance and composite scores for articles.

    Args:
        db: Database session; committed every 50 articles and at the end.
        article_ids: Restrict scoring to these ids; a falsy value scores
            every article.
        update_duplication: When true, the duplication score is recomputed
            from the size of the article's duplicate group.

    Returns:
        The number of articles processed.
    """
    heat_rules = db.query(Taxonomy).filter(Taxonomy.kind == "heat_rule").all()
    importance_rules = db.query(Taxonomy).filter(Taxonomy.kind == "importance_rule").all()

    selection = db.query(EnrichedArticle)
    if article_ids:
        selection = selection.filter(EnrichedArticle.id.in_(article_ids))

    count = 0
    for count, article in enumerate(selection.all(), start=1):
        article.heat_score = compute_heat_score(article, heat_rules)
        article.importance_score = compute_importance_score(article, importance_rules)

        if update_duplication:
            group = article.duplicate_group if article.duplicate_group_id else None
            members = group.member_article_ids if group else None
            # Members other than one are counted as repetitions.
            # NOTE(review): compute_duplication_score scores a count of 1 as 0,
            # so a two-member group yields 0 here - confirm this is intended.
            repeats = max(len(members) - 1, 0) if members else 0
            article.duplication_score = compute_duplication_score(repeats)

        article.composite_score = compute_composite_score(
            article.heat_score,
            article.importance_score,
            article.duplication_score,
        )
        if count % 50 == 0:
            db.commit()

    db.commit()
    logger.info("打分完成: %d 篇文章", count)
    return count
|
||||
@@ -0,0 +1,188 @@
|
||||
"""运行时配置管理:支持环境变量作为默认值,数据库覆盖"""
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from config import settings
|
||||
from models import AppSetting
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# 可在 Web UI 中编辑的配置项清单
|
||||
EDITABLE_SETTINGS = {
|
||||
"RSSKEEPER_BASE_URL": {"description": "rssKeeper 服务地址", "sensitive": False},
|
||||
"OPENAI_API_KEY": {"description": "LLM API Key", "sensitive": True},
|
||||
"OPENAI_BASE_URL": {"description": "LLM API 基础地址", "sensitive": False},
|
||||
"OPENAI_MODEL": {"description": "LLM 模型名", "sensitive": False},
|
||||
"OPENAI_TIMEOUT": {"description": "LLM 调用超时(秒)", "sensitive": False},
|
||||
"OPENAI_MAX_RETRIES": {"description": "LLM 调用最大重试次数", "sensitive": False},
|
||||
"SUMMARIZE_INTERVAL_MINUTES": {"description": "摘要任务间隔(分钟)", "sensitive": False},
|
||||
"TAG_SCORE_INTERVAL_MINUTES": {"description": "分类/打分/去重任务间隔(分钟)", "sensitive": False},
|
||||
"DAILY_BRIEF_HOUR": {"description": "每日简报生成小时", "sensitive": False},
|
||||
"DAILY_BRIEF_MINUTE": {"description": "每日简报生成分钟", "sensitive": False},
|
||||
"TITLE_SIMILARITY_THRESHOLD": {"description": "标题相似度阈值", "sensitive": False},
|
||||
"CONTENT_SIMILARITY_THRESHOLD": {"description": "内容相似度阈值", "sensitive": False},
|
||||
"MAX_AI_SUMMARY_LENGTH": {"description": "AI 摘要最大长度", "sensitive": False},
|
||||
"MIN_ORIGINAL_SUMMARY_LENGTH": {"description": "原始摘要最小长度", "sensitive": False},
|
||||
"BRIEF_TOP_N_PER_CATEGORY": {"description": "简报每分类显示文章数", "sensitive": False},
|
||||
"LOG_LEVEL": {"description": "日志级别", "sensitive": False},
|
||||
"API_TOKEN": {"description": "API 鉴权 Token(为空时不启用鉴权)", "sensitive": True},
|
||||
"CORS_ALLOWED_ORIGINS": {"description": "CORS 允许来源(逗号分隔)", "sensitive": False},
|
||||
}
|
||||
|
||||
|
||||
def _get_env_default(key: str) -> str:
    """Return ``settings.<key>`` as a string.

    An attribute that is missing or ``None`` yields an empty string.
    """
    raw = getattr(settings, key, "")
    if raw is None:
        return ""
    return str(raw)
|
||||
|
||||
|
||||
def _mask_sensitive(value: str) -> str:
    """Return a partially masked form of a sensitive value.

    Values of up to 8 characters are replaced entirely by ``*``. Longer
    values keep a short prefix and suffix around ``...``; at most a quarter
    of the value is shown on each side (capped at 4 characters), so that
    secrets only slightly longer than 8 characters are not almost fully
    disclosed. Values of 16 characters or more show 4 + 4 characters.

    Args:
        value: The secret to mask; a falsy value yields "".

    Returns:
        The masked string.
    """
    if not value:
        return ""
    length = len(value)
    if length <= 8:
        return "*" * length
    visible = min(4, length // 4)
    return value[:visible] + "..." + value[-visible:]
|
||||
|
||||
|
||||
def init_default_settings(db: Session) -> None:
    """Seed the settings table from the environment defaults.

    Does nothing when the table already contains at least one row;
    otherwise one row per entry of ``EDITABLE_SETTINGS`` is inserted and
    committed.
    """
    if db.query(AppSetting).count() > 0:
        return

    for name, info in EDITABLE_SETTINGS.items():
        row = AppSetting(
            key=name,
            value=_get_env_default(name),
            description=info["description"],
            is_sensitive=info["sensitive"],
        )
        db.add(row)

    db.commit()
    logger.info("已初始化默认配置项: %d 条", len(EDITABLE_SETTINGS))
|
||||
|
||||
|
||||
def get_setting(db: Session, key: str, default: Any = None) -> Any:
    """Return the stored value of ``key``.

    When no row exists, ``default`` is returned if it is not ``None``;
    otherwise the environment default of ``key`` is returned.
    """
    row = db.query(AppSetting).filter(AppSetting.key == key).first()
    if not row:
        return default if default is not None else _get_env_default(key)
    return row.value
|
||||
|
||||
|
||||
def get_setting_value(key: str, default: Any = None) -> Any:
    """Read a setting through a short-lived session of its own.

    A session is created from ``database.SessionLocal``, used for a single
    ``get_setting`` call and always closed afterwards.
    """
    from database import SessionLocal

    session = SessionLocal()
    try:
        value = get_setting(session, key, default)
        return value
    finally:
        session.close()
|
||||
|
||||
|
||||
def set_setting(db: Session, key: str, value: str) -> bool:
    """Create or update one editable setting and commit.

    Returns:
        ``False`` when ``key`` is not listed in ``EDITABLE_SETTINGS``
        (nothing is written), ``True`` after a successful write.
    """
    if key not in EDITABLE_SETTINGS:
        return False

    text = str(value)
    row = db.query(AppSetting).filter(AppSetting.key == key).first()
    if not row:
        info = EDITABLE_SETTINGS[key]
        db.add(
            AppSetting(
                key=key,
                value=text,
                description=info["description"],
                is_sensitive=info["sensitive"],
            )
        )
    else:
        row.value = text
        row.updated_at = datetime.now(timezone.utc)

    db.commit()
    logger.info("配置已更新: %s", key)
    return True
|
||||
|
||||
|
||||
def list_settings(db: Session, mask_sensitive: bool = True) -> List[Dict[str, Any]]:
    """List every editable setting with its current value.

    Values come from the database when a row exists and from the environment
    defaults otherwise.

    Args:
        db: Database session used to read the stored settings.
        mask_sensitive: When true, sensitive values are masked in ``value``
            and ``real_value`` is ``None`` for every entry.

    Returns:
        One dict per entry of ``EDITABLE_SETTINGS`` with the keys ``key``,
        ``value``, ``real_value``, ``description``, ``is_sensitive``,
        ``is_masked`` and ``updated_at`` (ISO string, or ``None`` when the
        setting has no row or no timestamp).
    """
    db_settings = {s.key: s for s in db.query(AppSetting).all()}
    result = []

    for key, meta in EDITABLE_SETTINGS.items():
        setting = db_settings.get(key)
        value = setting.value if setting else _get_env_default(key)
        is_sensitive = meta["sensitive"]
        is_masked = bool(is_sensitive and mask_sensitive)
        display_value = _mask_sensitive(value) if is_masked else value

        # A row may exist without a timestamp; do not call isoformat on None.
        updated_at = setting.updated_at if setting else None

        result.append({
            "key": key,
            "value": display_value,
            "real_value": value if not mask_sensitive else None,
            "description": meta["description"],
            "is_sensitive": is_sensitive,
            "is_masked": is_masked,
            "updated_at": updated_at.isoformat() if updated_at else None,
        })

    return result
|
||||
|
||||
|
||||
def reset_settings(db: Session) -> None:
    """Write the environment default back to every editable setting.

    Each key is written through ``set_setting``.
    """
    for name in EDITABLE_SETTINGS:
        env_value = _get_env_default(name)
        set_setting(db, name, env_value)
    logger.info("配置已重置为环境变量默认值")
|
||||
|
||||
|
||||
def apply_db_settings_to_config(db: Session = None) -> None:
    """Copy the stored settings onto the global ``settings`` object.

    Empty values and keys that are not fields of the settings model are
    skipped. Values are converted according to the field annotation
    (``int``, ``float``, ``bool``, ``Path``; anything else stays a string).
    Sensitive values are masked in log output and in error messages.

    Args:
        db: Session to read from; when ``None`` a temporary session is
            created and closed again.

    Raises:
        ValueError: If a stored value cannot be converted or assigned.
    """
    close_db = False
    if db is None:
        from database import SessionLocal

        db = SessionLocal()
        close_db = True
    try:
        # Read the field table from the class; instance access is deprecated
        # in recent pydantic releases.
        model_fields = type(settings).model_fields
        for key, meta in EDITABLE_SETTINGS.items():
            db_value = get_setting(db, key)
            if db_value is None or db_value == "":
                continue
            field_info = model_fields.get(key)
            if field_info is None:
                continue
            target_type = field_info.annotation
            sensitive = meta["sensitive"]
            # Never write secrets (API key, token) to logs or exceptions.
            shown_raw = _mask_sensitive(str(db_value)) if sensitive else db_value
            try:
                if target_type is int:
                    converted = int(db_value)
                elif target_type is float:
                    converted = float(db_value)
                elif target_type is bool:
                    converted = db_value.lower() in ("true", "1", "yes")
                elif target_type is Path:
                    converted = Path(db_value)
                else:
                    converted = db_value
                setattr(settings, key, converted)
                logger.debug("已应用配置: %s=%s", key, shown_raw if sensitive else converted)
            except Exception as exc:
                logger.error("应用配置 %s=%s 失败: %s", key, shown_raw, exc)
                raise ValueError(f"配置项 {key} 的值无效: {shown_raw}") from exc
    finally:
        if close_db:
            db.close()
|
||||
@@ -0,0 +1,154 @@
|
||||
"""文章摘要生成器:对无摘要或短摘要文章调用 LLM 生成 AI 摘要"""
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from typing import List, Dict, Any
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.ai_client import ai_client
|
||||
from app.rss_client import rss_client
|
||||
from config import settings
|
||||
from models import EnrichedArticle
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
SUMMARY_SYSTEM_PROMPT = """你是一位擅长阅读 RSS 新闻并提炼摘要的助手。
|
||||
请用简洁流畅的中文总结文章核心内容,要求:
|
||||
1. 长度控制在 {max_length} 个汉字以内。
|
||||
2. 包含文章最重要的 1-3 个要点。
|
||||
3. 不要添加个人评价,不要复述原文标题。
|
||||
4. 若原文是英文,请用中文输出摘要。
|
||||
"""
|
||||
|
||||
|
||||
SUMMARY_USER_PROMPT_TEMPLATE = """请为以下文章生成摘要。
|
||||
|
||||
标题:{title}
|
||||
作者:{author}
|
||||
来源:{feed_title}
|
||||
|
||||
正文:
|
||||
{content}
|
||||
"""
|
||||
|
||||
|
||||
def _needs_summary(article: EnrichedArticle) -> bool:
    """Decide whether an AI summary should be generated for ``article``.

    A summary is needed only when the article has no AI summary yet and its
    original summary is missing or shorter than
    ``settings.MIN_ORIGINAL_SUMMARY_LENGTH``. An article that already has an
    AI summary is never summarized again, so repeated fetches do not trigger
    repeated LLM calls.
    """
    if article.ai_summary:
        return False
    original = article.original_summary or ""
    return len(original.strip()) < settings.MIN_ORIGINAL_SUMMARY_LENGTH
|
||||
|
||||
|
||||
def _prepare_content(raw_content: str, max_chars: int = 8000) -> str:
    """Collapse whitespace in ``raw_content`` and cut it to ``max_chars``.

    A falsy input is treated as an empty string. The cut keeps the prompt
    within the LLM context size.
    """
    words = (raw_content or "").split()
    collapsed = " ".join(words)
    return collapsed[:max_chars]
|
||||
|
||||
|
||||
def _generate_summary(article: EnrichedArticle) -> str:
    """Ask the LLM for a summary of one article.

    The prompt body is the article content, else its original summary, else
    its title. The answer is cut to ``settings.MAX_AI_SUMMARY_LENGTH``
    characters. Any failure is logged and reported as an empty string.
    """
    body = _prepare_content(article.content or article.original_summary or "")
    if not body.strip():
        # Nothing but the title is available to summarize.
        body = article.title or ""

    system_prompt = SUMMARY_SYSTEM_PROMPT.format(max_length=settings.MAX_AI_SUMMARY_LENGTH)
    user_prompt = SUMMARY_USER_PROMPT_TEMPLATE.format(
        title=article.title or "",
        author=article.author or "",
        feed_title=article.feed_title or "",
        content=body,
    )

    try:
        answer = ai_client.chat_completion(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            temperature=0.3,
        )
        return answer[: settings.MAX_AI_SUMMARY_LENGTH]
    except Exception as exc:
        logger.error("生成 article_id=%d 摘要失败: %s", article.rk_article_id, exc)
        return ""
|
||||
|
||||
|
||||
def _article_from_rss(raw: Dict[str, Any]) -> Dict[str, Any]:
    """Convert an rssKeeper article dict into ``EnrichedArticle`` fields.

    ``published_at`` strings are parsed as ISO-8601 (a trailing ``Z`` is
    read as UTC). Unparseable strings and values that are neither a string
    nor a ``datetime`` become ``None``. Missing or ``None`` text fields
    become empty strings; a missing ``feed_id`` becomes 0.

    Args:
        raw: One article as returned by rssKeeper; must contain ``id``.

    Returns:
        A dict of keyword arguments for ``EnrichedArticle``.

    Raises:
        KeyError: If ``raw`` has no ``id`` entry.
    """
    published_at = raw.get("published_at")
    if isinstance(published_at, str):
        try:
            published_at = datetime.fromisoformat(published_at.replace("Z", "+00:00"))
        except ValueError:
            published_at = None
    elif not isinstance(published_at, datetime):
        # Only datetime values are passed on to the date column.
        published_at = None

    return {
        "rk_article_id": raw["id"],
        "title": raw.get("title", "") or "",
        "link": raw.get("link", "") or "",
        "feed_id": raw.get("feed_id", 0),
        "feed_title": raw.get("feed_title", "") or "",
        "feed_category": raw.get("category", "") or "",
        "author": raw.get("author", "") or "",
        "published_at": published_at,
        "original_summary": raw.get("summary", "") or "",
        "content": raw.get("content", "") or "",
    }
|
||||
|
||||
|
||||
def fetch_and_summarize(db: Session, hours: int = 24, limit: int = 200) -> Dict[str, int]:
    """Pull recent articles from rssKeeper and add AI summaries.

    New articles are inserted; existing ones (matched by ``rk_article_id``)
    have their base fields refreshed with any non-empty incoming value and
    their ``fetched_at`` set to now. Articles selected by ``_needs_summary``
    are summarized through the LLM.

    Args:
        db: Database session; committed every 10 processed articles and once
            more at the end.
        hours: Look-back window passed to rssKeeper.
        limit: Maximum number of articles to pull.

    Returns:
        ``{"fetched": ..., "created": ..., "summarized": ...}``.
    """
    articles = rss_client.fetch_recent(hours=hours, limit=limit)
    if not articles:
        logger.info("未拉取到新文章")
        return {"fetched": 0, "created": 0, "summarized": 0}

    stats = {"fetched": len(articles), "created": 0, "summarized": 0}
    refreshed_fields = (
        "title",
        "link",
        "feed_title",
        "feed_category",
        "author",
        "published_at",
        "original_summary",
        "content",
    )

    for processed, raw in enumerate(articles, start=1):
        data = _article_from_rss(raw)
        article = db.query(EnrichedArticle).filter(
            EnrichedArticle.rk_article_id == data["rk_article_id"]
        ).first()

        if article is None:
            article = EnrichedArticle(**data)
            db.add(article)
            db.flush()
            stats["created"] += 1
        else:
            # Refresh base fields; empty incoming values keep the stored one.
            for field in refreshed_fields:
                if data[field]:
                    setattr(article, field, data[field])
            article.fetched_at = datetime.now(timezone.utc)

        if _needs_summary(article):
            ai_summary = _generate_summary(article)
            if ai_summary:
                article.ai_summary = ai_summary
                stats["summarized"] += 1

        # Commit every 10 processed articles to keep transactions short.
        # Counting processed articles (not summaries) avoids committing on
        # every iteration while the summary counter stays at a multiple of 10.
        if processed % 10 == 0:
            db.commit()

    db.commit()
    logger.info(
        "摘要任务完成: fetched=%d, created=%d, summarized=%d",
        stats["fetched"], stats["created"], stats["summarized"]
    )
    return stats
|
||||
+116
@@ -0,0 +1,116 @@
|
||||
"""基于规则给文章分类、打标签"""
|
||||
import logging
|
||||
import re
|
||||
from typing import List, Dict, Any, Tuple
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from models import EnrichedArticle, Taxonomy
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _normalize(text: str) -> str:
    """Normalize text for keyword matching.

    Whitespace runs are collapsed to single spaces and the text is
    lower-cased. A falsy input yields an empty string.
    """
    if not text:
        return ""
    return " ".join(text.split()).lower()


def _count_matches(text: str, keywords: List[str]) -> int:
    """Count case-insensitive substring occurrences of ``keywords`` in ``text``.

    Both sides are normalized with ``_normalize``. Keywords that are ``None``
    or blank after normalization are ignored (an empty needle would otherwise
    match at every position); non-string keywords are converted with
    ``str``. Plain substring matching also works for Chinese keywords.

    Args:
        text: The text to search.
        keywords: Keywords to look for.

    Returns:
        The total number of non-overlapping occurrences over all keywords.
    """
    if not text or not keywords:
        return 0
    text_norm = _normalize(text)
    count = 0
    for kw in keywords:
        if kw is None:
            continue
        kw_norm = _normalize(str(kw))
        if not kw_norm:
            continue
        count += text_norm.count(kw_norm)
    return count
|
||||
|
||||
|
||||
def classify_article(article: EnrichedArticle, categories: List[Taxonomy]) -> str:
    """Return the name of the best-matching category for ``article``.

    Each category scores its keyword hits in title + summary + body, plus a
    bonus of 2 when its name equals the article's feed category. The first
    category with the strictly highest positive score wins. Without any
    winner the feed category is used, and "未分类" when that is empty too.
    """
    haystack = " ".join([
        article.title or "",
        article.ai_summary or article.original_summary or "",
        article.content or "",
    ])

    winner, top_score = "", 0
    for candidate in categories:
        points = _count_matches(haystack, candidate.keywords or [])
        # Small bonus when the source feed is already filed under this name.
        if article.feed_category and article.feed_category == candidate.name:
            points += 2
        if points > top_score:
            top_score, winner = points, candidate.name

    return winner or article.feed_category or "未分类"
|
||||
|
||||
|
||||
def tag_article(article: EnrichedArticle, tags: List[Taxonomy]) -> List[str]:
    """Return the names of all tags with at least one keyword hit.

    Matching runs over title + summary + body. Names are returned in the
    order of ``tags`` with duplicates removed.
    """
    haystack = " ".join([
        article.title or "",
        article.ai_summary or article.original_summary or "",
        article.content or "",
    ])
    hits = [
        tag.name
        for tag in tags
        if _count_matches(haystack, tag.keywords or []) > 0
    ]
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(hits))
|
||||
|
||||
|
||||
def tag_articles(db: Session, article_ids: List[int] = None) -> int:
    """Assign a category and tags to articles.

    Args:
        db: Database session; committed every 50 articles and at the end.
        article_ids: When truthy, only these articles are processed;
            otherwise every article whose category is empty or NULL.

    Returns:
        The number of articles processed; 0 when no category rules exist.
    """
    categories = db.query(Taxonomy).filter(Taxonomy.kind == "category").all()
    tags = db.query(Taxonomy).filter(Taxonomy.kind == "tag").all()

    if not categories:
        logger.warning("taxonomy 中无 category 数据,跳过分类")
        return 0

    selection = db.query(EnrichedArticle)
    if article_ids:
        selection = selection.filter(EnrichedArticle.id.in_(article_ids))
    else:
        selection = selection.filter(
            (EnrichedArticle.category == "") | (EnrichedArticle.category.is_(None))
        )

    count = 0
    for count, article in enumerate(selection.all(), start=1):
        article.category = classify_article(article, categories)
        article.tags = tag_article(article, tags)
        if count % 50 == 0:
            db.commit()

    db.commit()
    logger.info("分类/打标签完成: %d 篇文章", count)
    return count
|
||||
+140
@@ -0,0 +1,140 @@
|
||||
"""分类/标签/打分规则体系的初始化与维护"""
|
||||
import json
|
||||
import logging
|
||||
from typing import List, Dict, Any
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.ai_client import ai_client
|
||||
from app.rss_client import rss_client
|
||||
from models import Taxonomy
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
TAXONOMY_SYSTEM_PROMPT = """你是一位专业的信息分类与内容分析专家。
|
||||
请根据用户提供的 RSS 文章样本,生成一套适合的中文内容分类体系、标签体系和打分规则。
|
||||
|
||||
输出必须是合法的 JSON,格式如下:
|
||||
{
|
||||
"categories": [
|
||||
{"name": "科技", "description": "人工智能、芯片、互联网、软件等", "keywords": ["AI", "芯片", "大模型", ...]}
|
||||
],
|
||||
"tags": [
|
||||
{"name": "人工智能", "description": "...", "keywords": ["AI", "人工智能", "大模型", ...]}
|
||||
],
|
||||
"heat_rules": [
|
||||
{"name": "热点事件", "keywords": ["突发", "重磅", "刚刚", "发布"], "weight": 1.5}
|
||||
],
|
||||
"importance_rules": [
|
||||
{"name": "政策法规", "keywords": ["政策", "监管", "法规", "征求意见"], "weight": 1.5}
|
||||
],
|
||||
"duplication_indicators": [
|
||||
{"name": "同一事件", "keywords": ["宣布", "发布", "推出"], "weight": 1.0}
|
||||
]
|
||||
}
|
||||
|
||||
要求:
|
||||
1. categories 数量控制在 8-12 个,覆盖科技、财经、新闻、设计、生活等常见 RSS 主题。
|
||||
2. tags 数量控制在 30-50 个,尽量细化但避免过度重叠。
|
||||
3. heat_rules 和 importance_rules 各 10-20 条,weight 范围 0.5-2.0。
|
||||
4. 所有 keywords 用中文或中英双语,便于后续关键词匹配。
|
||||
5. 不要输出任何解释文字,只输出 JSON。
|
||||
"""
|
||||
|
||||
|
||||
def _build_sample_prompt(articles: List[Dict[str, Any]]) -> str:
    """Render up to 50 sample articles as the user prompt for the LLM.

    Each sample shows its title, feed, feed category and a summary. The
    summary falls back to the first 300 characters of the content and is
    capped at 400 characters. Fields that are missing or ``None`` are
    rendered as empty text.

    Args:
        articles: Article dicts as returned by rssKeeper.

    Returns:
        The prompt text; the header line states the total article count.
    """
    lines = [f"共有 {len(articles)} 篇文章样本:"]
    for idx, art in enumerate(articles[:50], 1):
        title = art.get("title") or ""
        # ``content`` may be present but None, so guard before slicing.
        summary = art.get("summary") or (art.get("content") or "")[:300]
        feed = art.get("feed_title") or ""
        cat = art.get("category") or ""
        lines.append(f"\n[{idx}] 标题:{title}")
        lines.append(f" 来源:{feed} | 源分类:{cat}")
        lines.append(f" 摘要:{summary[:400]}")
    return "\n".join(lines)
|
||||
|
||||
|
||||
def bootstrap_taxonomy(db: Session, force: bool = False) -> bool:
    """Generate the category/tag/scoring rules with the LLM and store them.

    Sample articles of the last seven days are pulled from rssKeeper and
    sent to the LLM. With ``force=True`` the existing rules are replaced,
    but only after a usable result has been obtained, and in the same
    transaction as the insert, so a failed fetch or LLM call leaves the
    current rules untouched.

    Args:
        db: Database session.
        force: Rebuild even when rules already exist.

    Returns:
        ``True`` when new rules were written; ``False`` when initialization
        was skipped or no usable result could be produced.
    """
    existing = db.query(Taxonomy).first()
    if existing and not force:
        logger.info("taxonomy 表已存在,跳过初始化")
        return False

    logger.info("开始从 rssKeeper 拉取样本文章并生成分类体系...")
    articles = rss_client.fetch_recent(hours=24 * 7, limit=200)
    if not articles:
        logger.warning("未获取到样本文章,无法生成分类体系")
        return False

    user_prompt = _build_sample_prompt(articles)
    try:
        result = ai_client.chat_completion_json(
            system_prompt=TAXONOMY_SYSTEM_PROMPT,
            user_prompt=user_prompt,
            temperature=0.5,
        )
    except Exception as exc:
        logger.error("生成分类体系失败: %s", exc)
        return False

    if not isinstance(result, dict) or not result:
        logger.error("生成分类体系失败: %s", "LLM 返回结果为空或格式不正确")
        return False

    try:
        if force:
            # Deleted without an intermediate commit: the delete becomes
            # permanent only together with the new rows.
            db.query(Taxonomy).delete()
            logger.info("强制重新初始化 taxonomy")
        _save_taxonomy(db, result)
    except Exception:
        db.rollback()
        raise

    logger.info("taxonomy 初始化完成,共写入 %d 条规则", db.query(Taxonomy).count())
    return True
|
||||
|
||||
|
||||
def _save_taxonomy(db: Session, data: Dict[str, Any]) -> None:
    """Write the taxonomy returned by the LLM to the database and commit.

    The payload is untrusted model output, so malformed entries are skipped
    or defaulted instead of aborting the whole write.

    Args:
        db: Database session the ``Taxonomy`` rows are added to.
        data: Parsed LLM response. The keys read are ``categories``, ``tags``,
            ``heat_rules``, ``importance_rules`` and ``duplication_indicators``.
    """

    def _add(kind: str, items: List[Dict[str, Any]], default_weight: float = 1.0):
        """Add one ``Taxonomy`` row of the given kind per usable item."""
        if not isinstance(items, list):
            # The key was null or not an array in the LLM output.
            return
        for item in items:
            if not isinstance(item, dict):
                continue
            name = str(item.get("name") or "").strip()
            if not name:
                continue
            keywords = item.get("keywords") or []
            if isinstance(keywords, str):
                # A single keyword given as a bare string.
                keywords = [keywords]
            try:
                weight = float(item.get("weight", default_weight))
            except (TypeError, ValueError):
                # Null or non-numeric weight: fall back instead of crashing
                # halfway through the batch.
                weight = default_weight
            db.add(
                Taxonomy(
                    name=name,
                    kind=kind,
                    description=item.get("description") or "",
                    keywords=keywords,
                    weight=weight,
                    created_by_ai=True,
                )
            )

    _add("category", data.get("categories", []))
    _add("tag", data.get("tags", []))
    _add("heat_rule", data.get("heat_rules", []), default_weight=1.0)
    _add("importance_rule", data.get("importance_rules", []), default_weight=1.0)
    # The prompt calls this list "duplication_indicators"; it is stored under
    # the kind "duplication_rule".
    _add("duplication_rule", data.get("duplication_indicators", []), default_weight=1.0)

    db.commit()
|
||||
|
||||
|
||||
def ensure_taxonomy(db: Session) -> bool:
    """Make sure the taxonomy table has rows, bootstrapping it when empty.

    Returns:
        True when at least one row already exists; otherwise the result of
        ``bootstrap_taxonomy(db)``.
    """
    has_rows = bool(db.query(Taxonomy).first())
    return True if has_rows else bootstrap_taxonomy(db)
|
||||
|
||||
|
||||
def list_taxonomy(db: Session, kind: str = None) -> List[Taxonomy]:
    """Return taxonomy rows ordered by kind, then name.

    Args:
        db: Database session to query.
        kind: When given (and non-empty), only rows of this kind are returned.
    """
    ordering = (Taxonomy.kind, Taxonomy.name)
    rows = db.query(Taxonomy)
    if not kind:
        return rows.order_by(*ordering).all()
    return rows.filter(Taxonomy.kind == kind).order_by(*ordering).all()
|
||||
@@ -0,0 +1,66 @@
|
||||
"""配置管理 - 环境变量 + 默认值"""
|
||||
import os
|
||||
from pathlib import Path
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Application settings: values come from the environment / ``.env``, with the defaults below."""

    # Load ``.env`` as UTF-8; ``extra="ignore"`` tolerates variables that are
    # not declared as fields here.
    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")

    # RSSKeeper connection
    RSSKEEPER_BASE_URL: str = "http://localhost:7329"

    # LLM API (OpenAI-compatible format)
    OPENAI_API_KEY: str = ""
    OPENAI_BASE_URL: str = "https://api.openai.com/v1"
    OPENAI_MODEL: str = "gpt-4o-mini"
    OPENAI_TIMEOUT: int = 60
    OPENAI_MAX_RETRIES: int = 3

    # dataClean data directory
    DATA_DIR: Path = Path("/app/data")
    # NOTE: despite the name this is used as a plain filesystem path to the
    # SQLite file (see ``database_path``), not as a connection URL.
    DATABASE_URL: str = "/app/data/dataclean.db"

    # Output directory for generated briefs
    BRIEF_OUTPUT_DIR: Path = Path("/app/data/briefs")

    # Scheduling (intervals in minutes)
    # NOTE(review): DAILY_BRIEF_HOUR / DAILY_BRIEF_MINUTE presumably give the
    # time of day of the daily brief job; confirm against the scheduler.
    SUMMARIZE_INTERVAL_MINUTES: int = 60
    TAG_SCORE_INTERVAL_MINUTES: int = 1440
    DAILY_BRIEF_HOUR: int = 8
    DAILY_BRIEF_MINUTE: int = 0

    # De-duplication thresholds
    TITLE_SIMILARITY_THRESHOLD: float = 0.85
    CONTENT_SIMILARITY_THRESHOLD: float = 0.80

    # Summary lengths
    MAX_AI_SUMMARY_LENGTH: int = 300
    MIN_ORIGINAL_SUMMARY_LENGTH: int = 100

    # Number of articles shown per category in each brief
    BRIEF_TOP_N_PER_CATEGORY: int = 10

    # Log level
    LOG_LEVEL: str = "INFO"

    # Web UI / API security
    # When empty, API token authentication is disabled (only advisable on an
    # internal network); set a strong secret in production.
    API_TOKEN: str = ""
    # Allowed CORS origins, comma-separated; in production list concrete
    # origins, e.g. "https://dataclean.example.com"
    CORS_ALLOWED_ORIGINS: str = ""

    @property
    def database_path(self) -> str:
        """Return the SQLite database path (``DATABASE_URL`` normalised through ``Path``)."""
        return str(Path(self.DATABASE_URL))

    @property
    def brief_output_dir_path(self) -> Path:
        """Return the brief output directory."""
        return self.BRIEF_OUTPUT_DIR
|
||||
|
||||
|
||||
settings = Settings()
|
||||
+39
@@ -0,0 +1,39 @@
|
||||
"""数据库连接与初始化"""
|
||||
from pathlib import Path
|
||||
|
||||
from sqlalchemy import create_engine, event
|
||||
from sqlalchemy.orm import sessionmaker, declarative_base
|
||||
|
||||
from config import settings
|
||||
|
||||
# SQLite engine pointing at the file configured in settings.
engine = create_engine(
    f"sqlite:///{settings.database_path}",
    # Forwarded to the sqlite3 driver: disables its same-thread check so a
    # connection may be used from a thread other than the one that opened it.
    connect_args={"check_same_thread": False},
    echo=False,
)

# Session factory: no autocommit and no autoflush, so callers flush/commit explicitly.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
# Declarative base class for the ORM models.
Base = declarative_base()
|
||||
|
||||
|
||||
@event.listens_for(engine, "connect")
def _set_sqlite_pragma(dbapi_conn, connection_record):
    """Enable SQLite foreign-key enforcement on every new DBAPI connection.

    Args:
        dbapi_conn: The raw DBAPI connection that was just opened.
        connection_record: Pool record supplied by the event; unused here.
    """
    cursor = dbapi_conn.cursor()
    try:
        cursor.execute("PRAGMA foreign_keys=ON")
    finally:
        # Close the cursor even when the PRAGMA fails so it is not leaked.
        cursor.close()
|
||||
|
||||
|
||||
def get_db():
    """Yield a database session for FastAPI dependency injection.

    A fresh session is created per use and is always closed once the
    consumer is done with it, including when the consumer raised.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
|
||||
|
||||
|
||||
def init_db():
    """Ensure the database directory exists, then create all tables."""
    db_file = Path(settings.DATABASE_URL)
    # The SQLite file's parent directory must exist before tables are created.
    db_file.parent.mkdir(parents=True, exist_ok=True)
    Base.metadata.create_all(bind=engine)
|
||||
@@ -0,0 +1,19 @@
|
||||
# Single-service deployment of the dataClean application.
services:
  dataclean:
    # Build the image from the Dockerfile in this directory.
    build: .
    container_name: dataclean
    ports:
      # host:container
      - "7331:7331"
    volumes:
      # Host ./data is mounted at /app/data inside the container.
      - ./data:/app/data
    env_file:
      # Runtime configuration is read from .env.
      - .env
    restart: unless-stopped
    environment:
      # Container time zone.
      - TZ=Asia/Shanghai
    networks:
      - dataclean-net

networks:
  dataclean-net:
    driver: bridge
|
||||
@@ -0,0 +1,13 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="zh-CN">
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<title>dataClean - RSS 数据清洗</title>
|
||||
</head>
|
||||
<body>
|
||||
<div id="app"></div>
|
||||
<script type="module" src="/src/main.js"></script>
|
||||
</body>
|
||||
</html>
|
||||
Generated
+1628
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"name": "dataclean-frontend",
|
||||
"private": true,
|
||||
"version": "1.0.0",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"dev": "vite --host 0.0.0.0",
|
||||
"build": "vite build",
|
||||
"preview": "vite preview"
|
||||
},
|
||||
"dependencies": {
|
||||
"vue": "^3.4.21",
|
||||
"vue-router": "^4.3.0",
|
||||
"element-plus": "^2.6.3",
|
||||
"@element-plus/icons-vue": "^2.3.1",
|
||||
"axios": "^1.6.8"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@vitejs/plugin-vue": "^5.0.4",
|
||||
"vite": "^5.2.0"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,125 @@
|
||||
<template>
|
||||
<el-container class="layout-container">
|
||||
<el-aside width="220px">
|
||||
<div class="logo">
|
||||
<el-icon size="28"><DataLine /></el-icon>
|
||||
<span>dataClean</span>
|
||||
</div>
|
||||
<el-menu
|
||||
:default-active="$route.path"
|
||||
router
|
||||
background-color="transparent"
|
||||
text-color="#a0a0a0"
|
||||
active-text-color="#409eff"
|
||||
>
|
||||
<el-menu-item index="/dashboard">
|
||||
<el-icon><Odometer /></el-icon>
|
||||
<span>仪表盘</span>
|
||||
</el-menu-item>
|
||||
<el-menu-item index="/articles">
|
||||
<el-icon><Document /></el-icon>
|
||||
<span>文章列表</span>
|
||||
</el-menu-item>
|
||||
<el-menu-item index="/briefs">
|
||||
<el-icon><Collection /></el-icon>
|
||||
<span>每日简报</span>
|
||||
</el-menu-item>
|
||||
<el-menu-item index="/taxonomy">
|
||||
<el-icon><CollectionTag /></el-icon>
|
||||
<span>分类体系</span>
|
||||
</el-menu-item>
|
||||
<el-menu-item index="/tasks">
|
||||
<el-icon><Timer /></el-icon>
|
||||
<span>任务管理</span>
|
||||
</el-menu-item>
|
||||
<el-menu-item index="/settings">
|
||||
<el-icon><Setting /></el-icon>
|
||||
<span>系统配置</span>
|
||||
</el-menu-item>
|
||||
</el-menu>
|
||||
</el-aside>
|
||||
|
||||
<el-container>
|
||||
<el-header class="top-header" height="60px">
|
||||
<div class="header-right">
|
||||
<el-input
|
||||
v-model="apiTokenInput"
|
||||
placeholder="API Token(未设置可留空)"
|
||||
size="small"
|
||||
show-password
|
||||
style="width: 260px;"
|
||||
@keyup.enter="saveToken"
|
||||
/>
|
||||
<el-button size="small" type="primary" @click="saveToken">
|
||||
{{ hasToken ? '更新 Token' : '设置 Token' }}
|
||||
</el-button>
|
||||
</div>
|
||||
</el-header>
|
||||
<el-main>
|
||||
<router-view />
|
||||
</el-main>
|
||||
</el-container>
|
||||
</el-container>
|
||||
</template>
|
||||
|
||||
<script setup>
|
||||
import { ref, onMounted } from 'vue'
|
||||
import { ElMessage } from 'element-plus'
|
||||
import { DataLine, Odometer, Document, Collection, CollectionTag, Timer, Setting } from '@element-plus/icons-vue'
|
||||
import { getApiToken, setApiToken } from '@/api'
|
||||
|
||||
// State behind the API-token box in the top header.
const apiTokenInput = ref('')
const hasToken = ref(false)

// Store the trimmed token (an empty value clears it) and update the button label.
function saveToken() {
  const token = apiTokenInput.value.trim()
  setApiToken(token)
  hasToken.value = Boolean(token)
  ElMessage.success('API Token 已保存')
}

// Pre-fill the input with the token that is already stored.
onMounted(() => {
  const stored = getApiToken()
  apiTokenInput.value = stored
  hasToken.value = Boolean(stored)
})
|
||||
</script>
|
||||
|
||||
<style scoped>
|
||||
.layout-container {
|
||||
height: 100vh;
|
||||
}
|
||||
|
||||
.logo {
|
||||
height: 60px;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
gap: 10px;
|
||||
font-size: 20px;
|
||||
font-weight: 600;
|
||||
color: #409eff;
|
||||
border-bottom: 1px solid var(--dc-border);
|
||||
}
|
||||
|
||||
.top-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: flex-end;
|
||||
border-bottom: 1px solid var(--dc-border);
|
||||
background-color: var(--dc-card-bg);
|
||||
}
|
||||
|
||||
.header-right {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
.el-menu-item {
|
||||
height: 50px;
|
||||
line-height: 50px;
|
||||
}
|
||||
|
||||
.el-menu-item .el-icon {
|
||||
margin-right: 8px;
|
||||
}
|
||||
</style>
|
||||
@@ -0,0 +1,74 @@
|
||||
import axios from 'axios'
|
||||
|
||||
// localStorage key under which the API token is kept.
const API_TOKEN_KEY = 'dataclean_api_token'

// Shared axios instance: every request is relative to /api and times out after 30 s.
const api = axios.create({
  baseURL: '/api',
  timeout: 30000,
})
|
||||
|
||||
// Return the stored API token, or an empty string when none is stored.
export function getApiToken() {
  const stored = localStorage.getItem(API_TOKEN_KEY)
  return stored ? stored : ''
}
|
||||
|
||||
// Store the API token; a falsy token removes the stored entry instead.
export function setApiToken(token) {
  if (!token) {
    localStorage.removeItem(API_TOKEN_KEY)
    return
  }
  localStorage.setItem(API_TOKEN_KEY, token)
}
|
||||
|
||||
// Attach the stored token as a Bearer Authorization header; requests go out
// unchanged when no token is stored.
api.interceptors.request.use((request) => {
  const bearer = getApiToken()
  if (!bearer) {
    return request
  }
  request.headers.Authorization = `Bearer ${bearer}`
  return request
})
|
||||
|
||||
// Turn the `detail` field of an error response into readable text.
// `detail` is not always a string: validation errors arrive as a list of
// objects, which would otherwise be shown as "[object Object]".
function describeErrorDetail(rawDetail) {
  if (typeof rawDetail === 'string') {
    return rawDetail
  }
  if (Array.isArray(rawDetail)) {
    return rawDetail
      .map((item) => (typeof item === 'string' ? item : item?.msg || JSON.stringify(item)))
      .join('; ')
  }
  if (rawDetail && typeof rawDetail === 'object') {
    return JSON.stringify(rawDetail)
  }
  return rawDetail ? String(rawDetail) : ''
}

// Success: unwrap to the response body. Failure: reject with an Error whose
// message is the server-provided detail, falling back to the axios message
// and then to a generic text.
api.interceptors.response.use(
  (response) => response.data,
  (error) => {
    const status = error.response?.status
    const detail = describeErrorDetail(error.response?.data?.detail) || error.message || '请求失败'
    if (status === 401 || status === 403) {
      // Authentication / authorization failure: hint at the token setting.
      return Promise.reject(new Error(`${detail},请检查 API Token 是否设置正确`))
    }
    return Promise.reject(new Error(detail))
  }
)
|
||||
|
||||
export default api
|
||||
|
||||
// Typed-by-convention wrappers around the backend endpoints. Values placed in
// a URL path are percent-encoded so characters such as "/" or "?" cannot
// change which endpoint is addressed.
export const datacleanApi = {
  // Health check. Uses the bare axios import rather than the `api` instance,
  // so the request goes to /health without the /api prefix or interceptors.
  health: () => axios.get('/health').then((r) => r.data),

  // Dashboard
  getStats: () => api.get('/stats'),

  // Articles
  getArticles: (params) => api.get('/articles', { params }),
  getArticle: (id) => api.get(`/articles/${encodeURIComponent(id)}`),

  // Briefs
  getBriefs: (params) => api.get('/briefs', { params }),
  getBrief: (date) => api.get(`/briefs/${encodeURIComponent(date)}`),
  regenerateBrief: (date) => api.post(`/briefs/${encodeURIComponent(date)}/regenerate`),

  // Taxonomy
  getTaxonomy: (kind) => api.get('/taxonomy', { params: kind ? { kind } : {} }),
  // `force` is sent as a query parameter; axios serialises it.
  bootstrapTaxonomy: (force = false) => api.post('/taxonomy/bootstrap', null, { params: { force } }),

  // Tasks
  summarize: () => api.post('/tasks/summarize'),
  tagScoreDedup: () => api.post('/tasks/tag-score-dedup'),
  generateBrief: () => api.post('/tasks/brief'),

  // Settings
  getSettings: () => api.get('/settings'),
  updateSetting: (key, value) => api.put(`/settings/${encodeURIComponent(key)}`, { value }),
  updateSettingsBatch: (settings) => api.put('/settings', { settings }),
  resetSettings: () => api.post('/settings/reset'),
}
|
||||
@@ -0,0 +1,20 @@
|
||||
import { createApp } from 'vue'
|
||||
import ElementPlus from 'element-plus'
|
||||
import * as ElementPlusIconsVue from '@element-plus/icons-vue'
|
||||
import 'element-plus/dist/index.css'
|
||||
import 'element-plus/theme-chalk/dark/css-vars.css'
|
||||
|
||||
import App from './App.vue'
|
||||
import router from './router'
|
||||
import './style.css'
|
||||
|
||||
// Create the root application and install the UI library and the router.
const app = createApp(App)
app.use(ElementPlus).use(router)

// Register every Element Plus icon as a global component under its export name.
Object.entries(ElementPlusIconsVue).forEach(([iconName, iconComponent]) => {
  app.component(iconName, iconComponent)
})

app.mount('#app')
|
||||
@@ -0,0 +1,28 @@
|
||||
import { createRouter, createWebHistory } from 'vue-router'
|
||||
import Dashboard from '@/views/Dashboard.vue'
|
||||
import Articles from '@/views/Articles.vue'
|
||||
import ArticleDetail from '@/views/ArticleDetail.vue'
|
||||
import Briefs from '@/views/Briefs.vue'
|
||||
import BriefDetail from '@/views/BriefDetail.vue'
|
||||
import Taxonomy from '@/views/Taxonomy.vue'
|
||||
import Tasks from '@/views/Tasks.vue'
|
||||
import Settings from '@/views/Settings.vue'
|
||||
|
||||
// Route table. The root path redirects to the dashboard; the two detail
// routes set `props: true`, so their `:id` / `:date` params are handed to the
// view component as props.
const routes = [
  { path: '/', redirect: '/dashboard' },
  { path: '/dashboard', name: 'Dashboard', component: Dashboard },
  { path: '/articles', name: 'Articles', component: Articles },
  { path: '/articles/:id', name: 'ArticleDetail', component: ArticleDetail, props: true },
  { path: '/briefs', name: 'Briefs', component: Briefs },
  { path: '/briefs/:date', name: 'BriefDetail', component: BriefDetail, props: true },
  { path: '/taxonomy', name: 'Taxonomy', component: Taxonomy },
  { path: '/tasks', name: 'Tasks', component: Tasks },
  { path: '/settings', name: 'Settings', component: Settings },
]

// Router using HTML5 history mode (clean URLs without a hash).
const router = createRouter({
  history: createWebHistory(),
  routes,
})
|
||||
|
||||
export default router
|
||||
@@ -0,0 +1,164 @@
|
||||
:root {
|
||||
--dc-bg: #0f0f23;
|
||||
--dc-card-bg: #1a1a2e;
|
||||
--dc-border: #2d2d44;
|
||||
--dc-text: #e0e0e0;
|
||||
--dc-text-secondary: #a0a0a0;
|
||||
--dc-primary: #409eff;
|
||||
--dc-success: #67c23a;
|
||||
--dc-warning: #e6a23c;
|
||||
--dc-danger: #f56c6c;
|
||||
}
|
||||
|
||||
* {
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
body {
|
||||
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
|
||||
background-color: var(--dc-bg);
|
||||
color: var(--dc-text);
|
||||
}
|
||||
|
||||
.page-title {
|
||||
font-size: 24px;
|
||||
font-weight: 600;
|
||||
margin-bottom: 20px;
|
||||
color: var(--dc-text);
|
||||
}
|
||||
|
||||
.stat-card {
|
||||
background: var(--dc-card-bg);
|
||||
border: 1px solid var(--dc-border);
|
||||
border-radius: 8px;
|
||||
padding: 20px;
|
||||
transition: transform 0.2s;
|
||||
}
|
||||
|
||||
.stat-card:hover {
|
||||
transform: translateY(-2px);
|
||||
}
|
||||
|
||||
.stat-value {
|
||||
font-size: 28px;
|
||||
font-weight: 700;
|
||||
color: var(--dc-primary);
|
||||
}
|
||||
|
||||
.stat-label {
|
||||
font-size: 14px;
|
||||
color: var(--dc-text-secondary);
|
||||
margin-top: 8px;
|
||||
}
|
||||
|
||||
.dark-card {
|
||||
background: var(--dc-card-bg) !important;
|
||||
border: 1px solid var(--dc-border) !important;
|
||||
color: var(--dc-text) !important;
|
||||
}
|
||||
|
||||
.dark-card .el-card__header {
|
||||
border-bottom: 1px solid var(--dc-border) !important;
|
||||
color: var(--dc-text) !important;
|
||||
}
|
||||
|
||||
.daily-bar-wrap {
|
||||
display: flex;
|
||||
align-items: flex-end;
|
||||
gap: 8px;
|
||||
height: 120px;
|
||||
padding: 10px 0;
|
||||
}
|
||||
|
||||
.daily-bar {
|
||||
flex: 1;
|
||||
background: linear-gradient(to top, var(--dc-primary), #66b1ff);
|
||||
border-radius: 4px 4px 0 0;
|
||||
min-width: 20px;
|
||||
position: relative;
|
||||
transition: opacity 0.2s;
|
||||
}
|
||||
|
||||
.daily-bar:hover {
|
||||
opacity: 0.8;
|
||||
}
|
||||
|
||||
.daily-bar-label {
|
||||
position: absolute;
|
||||
bottom: -20px;
|
||||
left: 50%;
|
||||
transform: translateX(-50%);
|
||||
font-size: 12px;
|
||||
color: var(--dc-text-secondary);
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.daily-bar-value {
|
||||
position: absolute;
|
||||
top: -20px;
|
||||
left: 50%;
|
||||
transform: translateX(-50%);
|
||||
font-size: 12px;
|
||||
color: var(--dc-text);
|
||||
}
|
||||
|
||||
.score-progress {
|
||||
margin-top: 8px;
|
||||
}
|
||||
|
||||
.score-progress .el-progress-bar__outer {
|
||||
background-color: rgba(255, 255, 255, 0.1) !important;
|
||||
}
|
||||
|
||||
.article-link {
|
||||
color: var(--dc-primary);
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
.article-link:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
.tag-item {
|
||||
margin-right: 6px;
|
||||
margin-bottom: 4px;
|
||||
}
|
||||
|
||||
/* Element Plus 暗色覆盖 */
|
||||
.el-menu {
|
||||
border-right: none !important;
|
||||
background-color: transparent !important;
|
||||
}
|
||||
|
||||
.el-aside {
|
||||
background-color: var(--dc-card-bg) !important;
|
||||
border-right: 1px solid var(--dc-border) !important;
|
||||
}
|
||||
|
||||
.el-container {
|
||||
background-color: var(--dc-bg) !important;
|
||||
}
|
||||
|
||||
.el-main {
|
||||
background-color: var(--dc-bg) !important;
|
||||
}
|
||||
|
||||
.el-table {
|
||||
background-color: transparent !important;
|
||||
}
|
||||
|
||||
.el-table th,
|
||||
.el-table tr {
|
||||
background-color: transparent !important;
|
||||
}
|
||||
|
||||
.el-table--enable-row-hover .el-table__body tr:hover > td {
|
||||
background-color: rgba(64, 158, 255, 0.1) !important;
|
||||
}
|
||||
|
||||
.el-input__wrapper,
|
||||
.el-textarea__inner {
|
||||
background-color: rgba(255, 255, 255, 0.05) !important;
|
||||
}
|
||||
@@ -0,0 +1,163 @@
|
||||
<template>
|
||||
<div v-loading="loading">
|
||||
<el-page-header @back="$router.push('/articles')" title="文章详情" />
|
||||
|
||||
<el-card v-if="article" class="dark-card" style="margin-top: 20px;">
|
||||
<template #header>
|
||||
<div class="article-header">
|
||||
<h2>{{ article.title }}</h2>
|
||||
<div class="article-meta">
|
||||
<span><el-icon><OfficeBuilding /></el-icon> {{ article.feed_title }}</span>
|
||||
<span v-if="article.author"><el-icon><User /></el-icon> {{ article.author }}</span>
|
||||
<span><el-icon><Timer /></el-icon> {{ article.published_at }}</span>
|
||||
<el-tag v-if="article.is_representative" type="success">重复组代表</el-tag>
|
||||
</div>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<div class="article-section">
|
||||
<h3>AI 摘要</h3>
|
||||
<p v-if="article.ai_summary" class="ai-summary">{{ article.ai_summary }}</p>
|
||||
<p v-else class="no-data">暂无 AI 摘要</p>
|
||||
</div>
|
||||
|
||||
<div class="article-section">
|
||||
<h3>标签与分类</h3>
|
||||
<div>
|
||||
<span class="section-label">分类:</span>
|
||||
<el-tag type="primary" size="large">{{ article.category }}</el-tag>
|
||||
</div>
|
||||
<div style="margin-top: 10px;">
|
||||
<span class="section-label">标签:</span>
|
||||
<el-tag v-for="tag in article.tags" :key="tag" class="tag-item" type="info">{{ tag }}</el-tag>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="article-section">
|
||||
<h3>评分</h3>
|
||||
<el-row :gutter="20">
|
||||
<el-col :span="6" v-for="score in scoreList" :key="score.label">
|
||||
<div class="score-item">
|
||||
<div class="score-label">{{ score.label }}</div>
|
||||
<div class="score-value">{{ score.value.toFixed(1) }}</div>
|
||||
<el-progress :percentage="Math.round(score.value)" :color="score.color" class="score-progress" />
|
||||
</div>
|
||||
</el-col>
|
||||
</el-row>
|
||||
</div>
|
||||
|
||||
<div class="article-section" v-if="article.link">
|
||||
<h3>原文链接</h3>
|
||||
<a :href="article.link" target="_blank" class="article-link">{{ article.link }}</a>
|
||||
</div>
|
||||
</el-card>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup>
|
||||
import { ref, computed, onMounted } from 'vue'
|
||||
import { ElMessage } from 'element-plus'
|
||||
import { datacleanApi } from '@/api'
|
||||
import { OfficeBuilding, User, Timer } from '@element-plus/icons-vue'
|
||||
|
||||
// The article id arrives from the route as a string.
const props = defineProps({
  id: {
    type: String,
    required: true,
  },
})

const loading = ref(false)
const article = ref(null)

// Coerce a raw score to a finite number. Scores that are missing, null or
// non-numeric become 0, so the template's `toFixed` / `Math.round` calls
// never throw or render NaN.
const toScore = (raw) => {
  const parsed = Number(raw)
  return Number.isFinite(parsed) ? parsed : 0
}

// The four score tiles shown in the detail card; empty until the article has loaded.
const scoreList = computed(() => {
  const current = article.value
  if (!current) return []
  return [
    { label: '热度', value: toScore(current.heat_score), color: '#f56c6c' },
    { label: '重要性', value: toScore(current.importance_score), color: '#e6a23c' },
    { label: '重复度', value: toScore(current.duplication_score), color: '#67c23a' },
    { label: '综合分', value: toScore(current.composite_score), color: '#409eff' },
  ]
})

// Fetch the article for the current id and report failures as a toast.
const loadArticle = async () => {
  loading.value = true
  try {
    article.value = await datacleanApi.getArticle(props.id)
  } catch (err) {
    ElMessage.error(err.message)
  } finally {
    loading.value = false
  }
}

onMounted(loadArticle)
|
||||
</script>
|
||||
|
||||
<style scoped>
|
||||
.article-header h2 {
|
||||
margin-bottom: 12px;
|
||||
color: var(--dc-text);
|
||||
}
|
||||
|
||||
.article-meta {
|
||||
display: flex;
|
||||
gap: 20px;
|
||||
align-items: center;
|
||||
color: var(--dc-text-secondary);
|
||||
font-size: 14px;
|
||||
}
|
||||
|
||||
.article-meta .el-icon {
|
||||
margin-right: 4px;
|
||||
vertical-align: middle;
|
||||
}
|
||||
|
||||
.article-section {
|
||||
margin-bottom: 24px;
|
||||
}
|
||||
|
||||
.article-section h3 {
|
||||
font-size: 16px;
|
||||
margin-bottom: 12px;
|
||||
color: var(--dc-text);
|
||||
border-left: 4px solid var(--dc-primary);
|
||||
padding-left: 10px;
|
||||
}
|
||||
|
||||
.ai-summary {
|
||||
line-height: 1.8;
|
||||
color: var(--dc-text);
|
||||
background: rgba(64, 158, 255, 0.1);
|
||||
padding: 16px;
|
||||
border-radius: 8px;
|
||||
}
|
||||
|
||||
.no-data {
|
||||
color: var(--dc-text-secondary);
|
||||
}
|
||||
|
||||
.section-label {
|
||||
color: var(--dc-text-secondary);
|
||||
margin-right: 8px;
|
||||
}
|
||||
|
||||
.score-item {
|
||||
background: rgba(255, 255, 255, 0.03);
|
||||
padding: 16px;
|
||||
border-radius: 8px;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.score-label {
|
||||
color: var(--dc-text-secondary);
|
||||
font-size: 14px;
|
||||
}
|
||||
|
||||
.score-value {
|
||||
font-size: 24px;
|
||||
font-weight: 700;
|
||||
margin: 8px 0;
|
||||
color: var(--dc-text);
|
||||
}
|
||||
</style>
|
||||
@@ -0,0 +1,117 @@
|
||||
<template>
|
||||
<div>
|
||||
<h1 class="page-title">文章列表</h1>
|
||||
|
||||
<el-card class="dark-card" style="margin-bottom: 20px;">
|
||||
<el-form :inline="true" :model="filters">
|
||||
<el-form-item label="日期">
|
||||
<el-date-picker
|
||||
v-model="filters.date"
|
||||
type="date"
|
||||
value-format="YYYY-MM-DD"
|
||||
placeholder="选择日期"
|
||||
clearable
|
||||
/>
|
||||
</el-form-item>
|
||||
<el-form-item label="分类">
|
||||
<el-input v-model="filters.category" placeholder="分类" clearable />
|
||||
</el-form-item>
|
||||
<el-form-item label="标签">
|
||||
<el-input v-model="filters.tag" placeholder="标签" clearable />
|
||||
</el-form-item>
|
||||
<el-form-item>
|
||||
<el-checkbox v-model="filters.representative_only" label="仅看代表文章" />
|
||||
</el-form-item>
|
||||
<el-form-item>
|
||||
<el-button type="primary" @click="loadArticles">查询</el-button>
|
||||
</el-form-item>
|
||||
</el-form>
|
||||
</el-card>
|
||||
|
||||
<el-card class="dark-card">
|
||||
<el-table :data="articles" v-loading="loading" style="width: 100%">
|
||||
<el-table-column label="标题" min-width="280">
|
||||
<template #default="{ row }">
|
||||
<el-link @click="$router.push(`/articles/${row.id}`)" type="primary">{{ row.title }}</el-link>
|
||||
</template>
|
||||
</el-table-column>
|
||||
<el-table-column prop="feed_title" label="来源" width="160" />
|
||||
<el-table-column prop="category" label="分类" width="120" />
|
||||
<el-table-column label="标签" min-width="180">
|
||||
<template #default="{ row }">
|
||||
<el-tag v-for="tag in row.tags" :key="tag" size="small" class="tag-item">{{ tag }}</el-tag>
|
||||
</template>
|
||||
</el-table-column>
|
||||
<el-table-column label="热度" width="120">
|
||||
<template #default="{ row }">
|
||||
<el-progress :percentage="Math.round(row.heat_score)" :color="scoreColor" />
|
||||
</template>
|
||||
</el-table-column>
|
||||
<el-table-column label="重要性" width="120">
|
||||
<template #default="{ row }">
|
||||
<el-progress :percentage="Math.round(row.importance_score)" :color="scoreColor" />
|
||||
</template>
|
||||
</el-table-column>
|
||||
<el-table-column label="综合分" width="100">
|
||||
<template #default="{ row }">
|
||||
<el-tag :type="row.composite_score >= 60 ? 'danger' : row.composite_score >= 40 ? 'warning' : 'info'">
|
||||
{{ row.composite_score.toFixed(1) }}
|
||||
</el-tag>
|
||||
</template>
|
||||
</el-table-column>
|
||||
<el-table-column prop="published_at" label="发布时间" width="180" />
|
||||
</el-table>
|
||||
|
||||
<el-pagination
|
||||
v-model:current-page="pagination.page"
|
||||
v-model:page-size="pagination.size"
|
||||
:total="pagination.total"
|
||||
layout="total, prev, pager, next"
|
||||
style="margin-top: 20px; justify-content: flex-end;"
|
||||
@change="loadArticles"
|
||||
/>
|
||||
</el-card>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup>
|
||||
import { ref, reactive, onMounted } from 'vue'
|
||||
import { ElMessage } from 'element-plus'
|
||||
import { datacleanApi } from '@/api'
|
||||
|
||||
const loading = ref(false)
const articles = ref([])

// Filter form state; an empty string means "no filter".
const filters = reactive({
  date: '',
  category: '',
  tag: '',
  representative_only: false,
})

const pagination = reactive({
  page: 1,
  size: 20,
  total: 0,
})

// Colour shared by the heat and importance progress bars.
const scoreColor = '#409eff'

// Load the current page of articles using the active filters.
const loadArticles = async () => {
  loading.value = true
  try {
    // Drop unset filters. axios omits null/undefined on its own but would
    // still send empty strings as `date=&category=&tag=`.
    const activeFilters = Object.fromEntries(
      Object.entries(filters).filter(
        ([, value]) => value !== '' && value !== null && value !== undefined
      )
    )
    const params = {
      limit: pagination.size,
      offset: (pagination.page - 1) * pagination.size,
      ...activeFilters,
    }
    const res = await datacleanApi.getArticles(params)
    articles.value = res.items || []
    pagination.total = res.total || 0
  } catch (err) {
    ElMessage.error(err.message)
  } finally {
    loading.value = false
  }
}

onMounted(loadArticles)
|
||||
</script>
|
||||
@@ -0,0 +1,121 @@
|
||||
<template>
|
||||
<div v-loading="loading">
|
||||
<el-page-header @back="$router.push('/briefs')" title="简报详情" />
|
||||
|
||||
<el-card v-if="brief" class="dark-card" style="margin-top: 20px;">
|
||||
<template #header>
|
||||
<div class="brief-header">
|
||||
<h2>{{ brief.brief_date }} 每日简报</h2>
|
||||
<div class="brief-meta">
|
||||
<el-tag type="info">原始文章:{{ brief.total_articles }}</el-tag>
|
||||
<el-tag type="success">去重后:{{ brief.unique_articles }}</el-tag>
|
||||
</div>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<el-collapse v-model="activeCategories">
|
||||
<el-collapse-item
|
||||
v-for="(articles, category) in brief.by_category"
|
||||
:key="category"
|
||||
:title="`${category} (${articles.length})`"
|
||||
:name="category"
|
||||
>
|
||||
<div
|
||||
v-for="article in articles"
|
||||
:key="article.id"
|
||||
class="brief-article"
|
||||
>
|
||||
<div class="brief-article-title">
|
||||
<a :href="article.link" target="_blank" class="article-link">{{ article.title }}</a>
|
||||
<span class="brief-article-feed">{{ article.feed_title }}</span>
|
||||
</div>
|
||||
<div class="brief-article-tags">
|
||||
<el-tag v-for="tag in article.tags" :key="tag" size="small" class="tag-item">{{ tag }}</el-tag>
|
||||
<el-tag size="small" type="warning">综合 {{ article.composite_score.toFixed(1) }}</el-tag>
|
||||
</div>
|
||||
<p v-if="article.summary" class="brief-article-summary">{{ article.summary }}</p>
|
||||
</div>
|
||||
</el-collapse-item>
|
||||
</el-collapse>
|
||||
</el-card>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup>
|
||||
import { ref, onMounted } from 'vue'
|
||||
import { ElMessage } from 'element-plus'
|
||||
import { datacleanApi } from '@/api'
|
||||
|
||||
// The brief date arrives from the route as a string.
const props = defineProps({
  date: { type: String, required: true },
})

const loading = ref(false)
const brief = ref(null)
// Names of the expanded collapse panels.
const activeCategories = ref([])

// Fetch the brief for the given date and expand every category panel.
async function loadBrief() {
  loading.value = true
  try {
    const fetched = await datacleanApi.getBrief(props.date)
    brief.value = fetched
    activeCategories.value = Object.keys(fetched.by_category || {})
  } catch (err) {
    ElMessage.error(err.message)
  } finally {
    loading.value = false
  }
}

onMounted(loadBrief)
|
||||
</script>
|
||||
|
||||
<style scoped>
|
||||
.brief-header h2 {
|
||||
margin-bottom: 10px;
|
||||
color: var(--dc-text);
|
||||
}
|
||||
|
||||
.brief-meta {
|
||||
display: flex;
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
.brief-article {
|
||||
padding: 16px 0;
|
||||
border-bottom: 1px solid var(--dc-border);
|
||||
}
|
||||
|
||||
.brief-article:last-child {
|
||||
border-bottom: none;
|
||||
}
|
||||
|
||||
.brief-article-title {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
margin-bottom: 8px;
|
||||
}
|
||||
|
||||
.brief-article-title a {
|
||||
font-size: 16px;
|
||||
font-weight: 500;
|
||||
}
|
||||
|
||||
.brief-article-feed {
|
||||
color: var(--dc-text-secondary);
|
||||
font-size: 13px;
|
||||
}
|
||||
|
||||
.brief-article-tags {
|
||||
margin-bottom: 8px;
|
||||
}
|
||||
|
||||
.brief-article-summary {
|
||||
color: var(--dc-text-secondary);
|
||||
font-size: 14px;
|
||||
line-height: 1.6;
|
||||
}
|
||||
</style>
|
||||
@@ -0,0 +1,56 @@
|
||||
<template>
|
||||
<div>
|
||||
<h1 class="page-title">每日简报</h1>
|
||||
|
||||
<el-card class="dark-card">
|
||||
<el-table :data="briefs" v-loading="loading">
|
||||
<el-table-column prop="brief_date" label="日期" width="150" />
|
||||
<el-table-column prop="total_articles" label="原始文章数" width="130" />
|
||||
<el-table-column prop="unique_articles" label="去重后文章数" width="140" />
|
||||
<el-table-column label="分类数">
|
||||
<template #default="{ row }">
|
||||
{{ Object.keys(row.by_category || {}).length }}
|
||||
</template>
|
||||
</el-table-column>
|
||||
<el-table-column label="操作" width="200">
|
||||
<template #default="{ row }">
|
||||
<el-button size="small" @click="$router.push(`/briefs/${row.brief_date}`)">查看</el-button>
|
||||
<el-button size="small" type="primary" @click="regenerate(row.brief_date)">重新生成</el-button>
|
||||
</template>
|
||||
</el-table-column>
|
||||
</el-table>
|
||||
</el-card>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup>
|
||||
import { ref, onMounted } from 'vue'
|
||||
import { ElMessage } from 'element-plus'
|
||||
import { datacleanApi } from '@/api'
|
||||
|
||||
const loading = ref(false)
const briefs = ref([])

// Fetch up to 50 briefs for the table.
async function loadBriefs() {
  loading.value = true
  try {
    const fetched = await datacleanApi.getBriefs({ limit: 50 })
    briefs.value = fetched
  } catch (err) {
    ElMessage.error(err.message)
  } finally {
    loading.value = false
  }
}

// Ask the backend to rebuild one brief, then refresh the list without
// waiting for the refresh to finish.
async function regenerate(date) {
  try {
    await datacleanApi.regenerateBrief(date)
  } catch (err) {
    ElMessage.error(err.message)
    return
  }
  ElMessage.success('简报重新生成成功')
  loadBriefs()
}

onMounted(loadBriefs)
|
||||
</script>
|
||||
@@ -0,0 +1,152 @@
|
||||
<template>
|
||||
<div>
|
||||
<h1 class="page-title">仪表盘</h1>
|
||||
|
||||
<!-- 统计卡片 -->
|
||||
<el-row :gutter="20">
|
||||
<el-col :span="6" v-for="stat in stats" :key="stat.label">
|
||||
<div class="stat-card">
|
||||
<div class="stat-value">{{ stat.value }}</div>
|
||||
<div class="stat-label">{{ stat.label }}</div>
|
||||
</div>
|
||||
</el-col>
|
||||
</el-row>
|
||||
|
||||
<!-- 分类分布 + 最近简报 -->
|
||||
<el-row :gutter="20" style="margin-top: 20px;">
|
||||
<el-col :span="16">
|
||||
<el-card class="dark-card">
|
||||
<template #header>
|
||||
<span>分类分布</span>
|
||||
</template>
|
||||
<div v-if="categoryDistribution.length" class="daily-bar-wrap">
|
||||
<div
|
||||
v-for="item in categoryDistribution"
|
||||
:key="item.category"
|
||||
class="daily-bar"
|
||||
:style="{ height: item.percentage + '%' }"
|
||||
:title="`${item.category}: ${item.count}`"
|
||||
>
|
||||
<span class="daily-bar-value">{{ item.count }}</span>
|
||||
<span class="daily-bar-label">{{ item.category }}</span>
|
||||
</div>
|
||||
</div>
|
||||
<el-empty v-else description="暂无数据" />
|
||||
</el-card>
|
||||
</el-col>
|
||||
|
||||
<el-col :span="8">
|
||||
<el-card class="dark-card">
|
||||
<template #header>
|
||||
<span>最近简报</span>
|
||||
</template>
|
||||
<el-timeline v-if="recentBriefs.length">
|
||||
<el-timeline-item
|
||||
v-for="brief in recentBriefs"
|
||||
:key="brief.brief_date"
|
||||
:timestamp="brief.brief_date"
|
||||
>
|
||||
<el-link @click="$router.push(`/briefs/${brief.brief_date}`)">
|
||||
{{ brief.unique_articles }} 篇去重后文章 / {{ brief.total_articles }} 篇原始文章
|
||||
</el-link>
|
||||
</el-timeline-item>
|
||||
</el-timeline>
|
||||
<el-empty v-else description="暂无简报" />
|
||||
</el-card>
|
||||
</el-col>
|
||||
</el-row>
|
||||
|
||||
<!-- 任务状态 -->
|
||||
<el-row :gutter="20" style="margin-top: 20px;">
|
||||
<el-col :span="24">
|
||||
<el-card class="dark-card">
|
||||
<template #header>
|
||||
<span>定时任务状态</span>
|
||||
</template>
|
||||
<el-table :data="jobList" style="width: 100%">
|
||||
<el-table-column prop="id" label="任务" />
|
||||
<el-table-column prop="next_run" label="下次执行时间" />
|
||||
</el-table>
|
||||
</el-card>
|
||||
</el-col>
|
||||
</el-row>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup>
import { ref, computed, onMounted } from 'vue'
import { ElMessage } from 'element-plus'
import { datacleanApi } from '@/api'

// Counters shown in the stat cards; zeroed until the first successful load.
const statsData = ref({
  total_articles: 0,
  today_articles: 0,
  ai_summarized: 0,
  categories: 0,
  tags: 0,
  duplicate_groups: 0,
  briefs: 0,
  next_jobs: {},
})
const recentBriefs = ref([])
const categoryDistribution = ref([])

// Label/value pairs rendered by the stat-card v-for.
const stats = computed(() => [
  { label: '总加工文章', value: statsData.value.total_articles },
  { label: '今日文章', value: statsData.value.today_articles },
  { label: 'AI 摘要覆盖', value: statsData.value.ai_summarized },
  { label: '分类数', value: statsData.value.categories },
  { label: '标签数', value: statsData.value.tags },
  { label: '去重组数', value: statsData.value.duplicate_groups },
  { label: '已生成简报', value: statsData.value.briefs },
])

// Flatten the { jobId: nextRun } map into table rows; a missing time is shown as '未知'.
const jobList = computed(() => {
  return Object.entries(statsData.value.next_jobs || {}).map(([id, next_run]) => ({
    id,
    next_run: next_run || '未知',
  }))
})

/**
 * Build the bar-chart rows for the category distribution card.
 *
 * @param {Array} taxonomy    taxonomy entries; only those with kind === 'category' seed the chart
 * @param {Object} [latestBrief] newest brief, or undefined when no brief exists yet
 * @returns {Array} rows of { category, count, percentage } with zero-count categories removed;
 *                  percentage is relative to the largest count
 */
const buildCategoryDistribution = (taxonomy, latestBrief) => {
  const counts = {}
  taxonomy
    .filter((t) => t.kind === 'category')
    .forEach((c) => {
      counts[c.name] = 0
    })

  const byCategory = (latestBrief && latestBrief.by_category) || {}
  Object.entries(byCategory).forEach(([cat, articles]) => {
    // Guard against a non-array value so one odd entry cannot turn every bar height into NaN.
    counts[cat] = Array.isArray(articles) ? articles.length : 0
  })

  // The trailing 1 keeps the divisor non-zero when every count is 0.
  const maxCount = Math.max(...Object.values(counts), 1)
  return Object.entries(counts)
    .map(([category, count]) => ({
      category,
      count,
      percentage: (count / maxCount) * 100,
    }))
    .filter((item) => item.count > 0)
}

/** Load stats, the five most recent briefs and the taxonomy, then derive the chart rows. */
const loadData = async () => {
  try {
    const [statsRes, briefsRes, taxonomyRes] = await Promise.all([
      datacleanApi.getStats(),
      datacleanApi.getBriefs({ limit: 5 }),
      datacleanApi.getTaxonomy(),
    ])

    statsData.value = statsRes
    recentBriefs.value = briefsRes

    // Each entry of the list response already carries `by_category`, so the newest
    // brief feeds the chart directly and no second request is needed.
    categoryDistribution.value = buildCategoryDistribution(taxonomyRes, briefsRes[0])
  } catch (err) {
    ElMessage.error(err.message)
  }
}

onMounted(loadData)
</script>
|
||||
@@ -0,0 +1,103 @@
|
||||
<template>
|
||||
<div>
|
||||
<h1 class="page-title">系统配置</h1>
|
||||
|
||||
<el-card class="dark-card">
|
||||
<template #header>
|
||||
<div style="display: flex; justify-content: space-between; align-items: center;">
|
||||
<span>配置项</span>
|
||||
<el-button type="danger" @click="resetSettings">重置为默认值</el-button>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<el-alert
|
||||
title="配置修改后会保存到 SQLite 数据库,重启服务后生效。"
|
||||
type="warning"
|
||||
:closable="false"
|
||||
style="margin-bottom: 20px;"
|
||||
/>
|
||||
|
||||
<el-form :model="settings" label-position="top" v-loading="loading">
|
||||
<el-row :gutter="20">
|
||||
<el-col :span="12" v-for="item in settings" :key="item.key">
|
||||
<el-form-item :label="`${item.description} (${item.key})`">
|
||||
<el-input
|
||||
v-if="item.is_sensitive"
|
||||
v-model="item.value"
|
||||
type="password"
|
||||
show-password
|
||||
placeholder="请输入"
|
||||
/>
|
||||
<el-input
|
||||
v-else
|
||||
v-model="item.value"
|
||||
placeholder="请输入"
|
||||
/>
|
||||
</el-form-item>
|
||||
</el-col>
|
||||
</el-row>
|
||||
|
||||
<el-form-item>
|
||||
<el-button type="primary" size="large" :loading="saving" @click="saveSettings">保存配置</el-button>
|
||||
</el-form-item>
|
||||
</el-form>
|
||||
</el-card>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup>
import { ref, onMounted } from 'vue'
import { ElMessage, ElMessageBox } from 'element-plus'
import { datacleanApi } from '@/api'

const settings = ref([])
const loading = ref(false)
const saving = ref(false)

// key -> value as last received from (or successfully sent to) the server.
// Used to detect which fields the user actually edited.
let loadedValues = {}

/** Fetch all settings and remember their values as the "unchanged" baseline. */
const loadSettings = async () => {
  loading.value = true
  try {
    const res = await datacleanApi.getSettings()
    settings.value = res
    loadedValues = Object.fromEntries(res.map((item) => [item.key, item.value]))
  } catch (err) {
    ElMessage.error(err.message)
  } finally {
    loading.value = false
  }
}

/**
 * Save only the fields whose value differs from the loaded baseline.
 *
 * Sensitive settings are returned by the server in masked form; posting every
 * field back would submit those placeholders as if they were new values.
 * NOTE(review): whether the backend ignores masked placeholders is not visible
 * from this component, so untouched fields are simply never sent.
 */
const saveSettings = async () => {
  const payload = {}
  for (const item of settings.value) {
    if (item.value !== loadedValues[item.key]) {
      payload[item.key] = item.value
    }
  }
  if (Object.keys(payload).length === 0) {
    ElMessage.info('没有需要保存的修改')
    return
  }

  saving.value = true
  try {
    await datacleanApi.updateSettingsBatch(payload)
    // The submitted values are now the baseline for the next save.
    Object.assign(loadedValues, payload)
    ElMessage.success('配置已保存,请重启服务后生效')
  } catch (err) {
    ElMessage.error(err.message)
  } finally {
    saving.value = false
  }
}

/** Reset every setting to its default after the user confirms, then reload the form. */
const resetSettings = async () => {
  try {
    await ElMessageBox.confirm('确定要重置所有配置为环境变量默认值吗?', '提示', {
      confirmButtonText: '确定',
      cancelButtonText: '取消',
      type: 'warning',
    })
    await datacleanApi.resetSettings()
    ElMessage.success('配置已重置,请重启服务后生效')
    loadSettings()
  } catch (err) {
    // Dismissing the dialog rejects with the string 'cancel'; that is not an error.
    if (err !== 'cancel') {
      ElMessage.error(err.message)
    }
  }
}

onMounted(loadSettings)
</script>
|
||||
@@ -0,0 +1,116 @@
|
||||
<template>
|
||||
<div>
|
||||
<h1 class="page-title">任务管理</h1>
|
||||
|
||||
<el-row :gutter="20">
|
||||
<el-col :span="8" v-for="task in tasks" :key="task.id">
|
||||
<el-card class="dark-card" style="margin-bottom: 20px;">
|
||||
<template #header>
|
||||
<div class="task-header">
|
||||
<el-icon size="24"><component :is="task.icon" /></el-icon>
|
||||
<span>{{ task.title }}</span>
|
||||
</div>
|
||||
</template>
|
||||
<p class="task-desc">{{ task.description }}</p>
|
||||
<div v-if="task.nextRun" class="task-next-run">
|
||||
下次执行:{{ task.nextRun }}
|
||||
</div>
|
||||
<el-button
|
||||
type="primary"
|
||||
style="margin-top: 16px;"
|
||||
:loading="task.loading"
|
||||
@click="runTask(task)"
|
||||
>
|
||||
立即执行
|
||||
</el-button>
|
||||
</el-card>
|
||||
</el-col>
|
||||
</el-row>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup>
import { ref, markRaw, onMounted } from 'vue'
import { ElMessage } from 'element-plus'
import { Document, CollectionTag, Collection } from '@element-plus/icons-vue'
import { datacleanApi } from '@/api'

// One card per manually triggerable task.
// `icon` holds the imported component itself: a string passed to <component :is>
// is looked up among registered components and does not see <script setup> imports.
// markRaw keeps Vue from making the component object reactive inside the ref.
// `action` is wrapped in an arrow function so the API method is always invoked
// on `datacleanApi` rather than detached from it.
const tasks = ref([
  {
    id: 'summarize',
    title: '生成 AI 摘要',
    description: '拉取 rssKeeper 最近文章,为无摘要或短摘要文章生成 AI 摘要。',
    icon: markRaw(Document),
    nextRun: '',
    loading: false,
    action: () => datacleanApi.summarize(),
  },
  {
    id: 'tag_score_deduplicate',
    title: '分类 / 打分 / 去重',
    description: '对当天文章进行分类、打标签、计算分数并生成重复组。',
    icon: markRaw(CollectionTag),
    nextRun: '',
    loading: false,
    action: () => datacleanApi.tagScoreDedup(),
  },
  {
    id: 'generate_daily_brief',
    title: '生成每日简报',
    description: '基于当天去重后的代表文章生成每日简报。',
    icon: markRaw(Collection),
    nextRun: '',
    loading: false,
    action: () => datacleanApi.generateBrief(),
  },
])

/** Refresh each card's next-run text from the stats endpoint's `next_jobs` map. */
const loadStats = async () => {
  try {
    const stats = await datacleanApi.getStats()
    const nextJobs = stats.next_jobs || {}
    tasks.value.forEach((task) => {
      // A job id missing from the map (or with no next run) is shown as '未调度'.
      task.nextRun = nextJobs[task.id] || '未调度'
    })
  } catch (err) {
    ElMessage.error(err.message)
  }
}

/** Run one task now, showing its button spinner until the request settles. */
const runTask = async (task) => {
  task.loading = true
  try {
    const res = await task.action()
    ElMessage.success(res.message)
    loadStats()
  } catch (err) {
    ElMessage.error(err.message)
  } finally {
    task.loading = false
  }
}

onMounted(loadStats)
</script>
|
||||
|
||||
<style scoped>
|
||||
.task-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 10px;
|
||||
font-size: 16px;
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.task-desc {
|
||||
color: var(--dc-text-secondary);
|
||||
line-height: 1.6;
|
||||
min-height: 60px;
|
||||
}
|
||||
|
||||
.task-next-run {
|
||||
margin-top: 12px;
|
||||
color: var(--dc-text-secondary);
|
||||
font-size: 13px;
|
||||
}
|
||||
</style>
|
||||
@@ -0,0 +1,110 @@
|
||||
<template>
|
||||
<div>
|
||||
<h1 class="page-title">分类体系</h1>
|
||||
|
||||
<el-card class="dark-card" style="margin-bottom: 20px;">
|
||||
<el-alert
|
||||
title="分类体系在首次启动时由 AI 根据样本文章生成,后续可通过编辑数据库调整。"
|
||||
type="info"
|
||||
:closable="false"
|
||||
/>
|
||||
<div style="margin-top: 16px;">
|
||||
<el-button type="primary" @click="bootstrap(false)" :loading="bootstrapping">
|
||||
检查/初始化分类体系
|
||||
</el-button>
|
||||
<el-button type="danger" @click="bootstrap(true)" :loading="bootstrapping">
|
||||
强制重新生成
|
||||
</el-button>
|
||||
</div>
|
||||
</el-card>
|
||||
|
||||
<el-tabs v-model="activeTab" class="dark-tabs">
|
||||
<el-tab-pane label="分类" name="category">
|
||||
<TaxonomyTable :data="taxonomyByKind.category" />
|
||||
</el-tab-pane>
|
||||
<el-tab-pane label="标签" name="tag">
|
||||
<TaxonomyTable :data="taxonomyByKind.tag" />
|
||||
</el-tab-pane>
|
||||
<el-tab-pane label="热度规则" name="heat_rule">
|
||||
<TaxonomyTable :data="taxonomyByKind.heat_rule" show-weight />
|
||||
</el-tab-pane>
|
||||
<el-tab-pane label="重要性规则" name="importance_rule">
|
||||
<TaxonomyTable :data="taxonomyByKind.importance_rule" show-weight />
|
||||
</el-tab-pane>
|
||||
<el-tab-pane label="重复性规则" name="duplication_rule">
|
||||
<TaxonomyTable :data="taxonomyByKind.duplication_rule" show-weight />
|
||||
</el-tab-pane>
|
||||
</el-tabs>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup>
import { computed, onMounted, ref } from 'vue'
import { ElMessage } from 'element-plus'
import { datacleanApi } from '@/api'
import TaxonomyTable from './TaxonomyTable.vue'

const activeTab = ref('category')
const taxonomy = ref([])
const bootstrapping = ref(false)

// Taxonomy entries bucketed by `kind`, one bucket per tab; entries of any other kind are dropped.
const taxonomyByKind = computed(() => {
  const buckets = {
    category: [],
    tag: [],
    heat_rule: [],
    importance_rule: [],
    duplication_rule: [],
  }
  for (const entry of taxonomy.value) {
    const bucket = buckets[entry.kind]
    if (bucket) {
      bucket.push(entry)
    }
  }
  return buckets
})

/** Load the full taxonomy list; a failure is shown as an error toast. */
async function loadTaxonomy() {
  try {
    const entries = await datacleanApi.getTaxonomy()
    taxonomy.value = entries
  } catch (err) {
    ElMessage.error(err.message)
  }
}

/** Trigger taxonomy bootstrap (`force` is passed through), then reload the tables. */
async function bootstrap(force) {
  bootstrapping.value = true
  try {
    const { message } = await datacleanApi.bootstrapTaxonomy(force)
    ElMessage.success(message)
    loadTaxonomy()
  } catch (err) {
    ElMessage.error(err.message)
  } finally {
    bootstrapping.value = false
  }
}

onMounted(loadTaxonomy)
</script>
|
||||
|
||||
<style scoped>
|
||||
.dark-tabs {
|
||||
background: var(--dc-card-bg);
|
||||
border: 1px solid var(--dc-border);
|
||||
border-radius: 8px;
|
||||
padding: 20px;
|
||||
}
|
||||
|
||||
.dark-tabs :deep(.el-tabs__item) {
|
||||
color: var(--dc-text-secondary);
|
||||
}
|
||||
|
||||
.dark-tabs :deep(.el-tabs__item.is-active) {
|
||||
color: var(--dc-primary);
|
||||
}
|
||||
|
||||
.dark-tabs :deep(.el-tabs__active-bar) {
|
||||
background-color: var(--dc-primary);
|
||||
}
|
||||
</style>
|
||||
@@ -0,0 +1,30 @@
|
||||
<template>
|
||||
<el-table :data="data" style="width: 100%">
|
||||
<el-table-column prop="name" label="名称" width="160" />
|
||||
<el-table-column prop="description" label="描述" min-width="200" />
|
||||
<el-table-column label="关键词" min-width="250">
|
||||
<template #default="{ row }">
|
||||
<el-tag v-for="kw in row.keywords" :key="kw" size="small" class="tag-item" type="info">{{ kw }}</el-tag>
|
||||
</template>
|
||||
</el-table-column>
|
||||
<el-table-column v-if="showWeight" prop="weight" label="权重" width="100" />
|
||||
<el-table-column label="来源" width="120">
|
||||
<template #default="{ row }">
|
||||
<el-tag :type="row.created_by_ai ? 'success' : 'info'">{{ row.created_by_ai ? 'AI 生成' : '手动' }}</el-tag>
|
||||
</template>
|
||||
</el-table-column>
|
||||
</el-table>
|
||||
</template>
|
||||
|
||||
<script setup>
// data:       rows rendered by the table (defaults to an empty list)
// showWeight: when true, the weight column is rendered as well
defineProps({
  data: { type: Array, default: () => [] },
  showWeight: { type: Boolean, default: false },
})
</script>
|
||||
@@ -0,0 +1,30 @@
|
||||
import { defineConfig } from 'vite'
|
||||
import vue from '@vitejs/plugin-vue'
|
||||
import { resolve } from 'path'
|
||||
|
||||
// https://vitejs.dev/config/

// Backend origin that the dev server forwards API and health-check requests to.
const BACKEND_ORIGIN = 'http://localhost:7331'

// Fresh proxy-rule object per route so the two rules never share state.
const toBackend = () => ({
  target: BACKEND_ORIGIN,
  changeOrigin: true,
})

export default defineConfig({
  plugins: [vue()],
  resolve: {
    // '@' resolves to the src directory next to this config file.
    alias: { '@': resolve(__dirname, 'src') },
  },
  server: {
    port: 7332,
    proxy: {
      '/api': toBackend(),
      '/health': toBackend(),
    },
  },
  build: {
    outDir: 'dist',
    assetsDir: 'assets',
  },
})
|
||||
@@ -0,0 +1,426 @@
|
||||
"""dataClean FastAPI 入口"""
|
||||
import logging
|
||||
import os
|
||||
from contextlib import asynccontextmanager
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Optional, List
|
||||
|
||||
from fastapi import FastAPI, Depends, HTTPException, Query, Body, Security, status
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from config import settings
|
||||
from database import init_db, get_db, SessionLocal
|
||||
from scheduler import init_scheduler, stop_scheduler, get_scheduler, get_task_lock
|
||||
from app.taxonomy import bootstrap_taxonomy, list_taxonomy, ensure_taxonomy
|
||||
from app.summarizer import fetch_and_summarize
|
||||
from app.tagger import tag_articles
|
||||
from app.deduplicator import deduplicate_articles
|
||||
from app.scorer import score_articles
|
||||
from app.brief import generate_daily_brief
|
||||
from app.settings_manager import (
|
||||
init_default_settings,
|
||||
list_settings,
|
||||
get_setting,
|
||||
set_setting,
|
||||
reset_settings,
|
||||
apply_db_settings_to_config,
|
||||
)
|
||||
from models import EnrichedArticle, DailyBrief, Taxonomy, DuplicateGroup, AppSetting
|
||||
|
||||
# Configure logging once at import time; a LOG_LEVEL name that is not an
# attribute of the logging module falls back to INFO.
logging.basicConfig(
    level=getattr(logging, settings.LOG_LEVEL.upper(), logging.INFO),
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# API token authentication (enforced only when a token is configured).
# auto_error=False leaves the handling of a missing Authorization header to
# verify_token instead of having the scheme reject the request itself.
security_scheme = HTTPBearer(auto_error=False)
|
||||
|
||||
|
||||
def _get_allowed_origins() -> List[str]:
    """Parse the comma-separated CORS_ALLOWED_ORIGINS setting.

    Returns:
        The configured origins with surrounding whitespace stripped and empty
        entries dropped; an empty list when the setting is unset or blank.
    """
    configured = settings.CORS_ALLOWED_ORIGINS
    if not configured:
        # Nothing configured: the caller decides which fallback policy applies.
        return []

    origins = []
    for piece in configured.split(","):
        cleaned = piece.strip()
        if cleaned:
            origins.append(cleaned)
    return origins
|
||||
|
||||
|
||||
def verify_token(credentials: Optional[HTTPAuthorizationCredentials] = Security(security_scheme)):
    """Validate the bearer token of a request against settings.API_TOKEN.

    Authentication is skipped entirely when no API token is configured.

    Args:
        credentials: Parsed Authorization header, or None when it is absent.

    Returns:
        The presented token on success, or None when authentication is disabled.

    Raises:
        HTTPException: 401 when the header is missing, 403 when the scheme is
            not Bearer or the token does not match.
    """
    # Function-scope import keeps this block self-contained.
    import secrets

    token = settings.API_TOKEN
    if not token:
        return None
    if not credentials:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="缺少 Authorization 请求头",
            headers={"WWW-Authenticate": "Bearer"},
        )

    # Auth scheme names are case-insensitive, so "bearer" must be accepted too.
    scheme_ok = credentials.scheme.lower() == "bearer"
    # Constant-time comparison avoids leaking how much of the token matched.
    # Comparing bytes also keeps compare_digest from rejecting non-ASCII text.
    token_ok = secrets.compare_digest(
        credentials.credentials.encode("utf-8"),
        token.encode("utf-8"),
    )
    if not (scheme_ok and token_ok):
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="无效的 API Token",
        )
    return credentials.credentials
|
||||
|
||||
|
||||
def _run_task_locked(func, db: Session):
    """Run ``func(db)`` while holding the task lock.

    Args:
        func: Callable taking the database session.
        db: Session passed through to ``func``.

    Returns:
        Whatever ``func`` returns.

    Raises:
        HTTPException: 409 when the lock is already held by another task.
    """
    # Bind the lock once so release() always targets the object that was
    # acquired, independent of what a second accessor call would return.
    lock = get_task_lock()
    if not lock.acquire(blocking=False):
        raise HTTPException(status_code=409, detail="已有任务正在执行,请稍后再试")
    try:
        return func(db)
    finally:
        lock.release()
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: initialise storage and scheduler, stop it on exit.

    Startup creates the database, seeds default settings, overlays the
    database settings onto the global config and ensures the taxonomy. A
    failure in those steps is logged and startup continues (best effort).
    """
    logger.info("启动 dataClean 服务")
    init_db()

    db = SessionLocal()
    try:
        # Seed default settings.
        init_default_settings(db)
        # Override the global settings object with values stored in the database.
        apply_db_settings_to_config(db)
        # Make sure the taxonomy is in place on first start.
        ensure_taxonomy(db)
    except Exception as exc:
        # logger.exception keeps the traceback, which a plain error() call drops.
        logger.exception("启动初始化失败: %s", exc)
    finally:
        db.close()

    init_scheduler()
    try:
        yield
    finally:
        # Stop the scheduler even when shutdown is reached through an exception.
        stop_scheduler()
|
||||
|
||||
|
||||
# The ASGI application; startup and shutdown work is delegated to `lifespan`.
app = FastAPI(
    title="dataClean",
    description="RSS 数据清洗、摘要、分类、打分与简报生成服务",
    version="1.0.0",
    lifespan=lifespan,
)

# CORS: production deployments should list concrete origins, and a wildcard is
# never combined with credentials.
# NOTE: when no origins are configured the allow-list falls back to ["*"] and
# credentials are disabled, because bool([]) is False.
_allowed_origins = _get_allowed_origins()
app.add_middleware(
    CORSMiddleware,
    allow_origins=_allowed_origins or ["*"],
    allow_credentials=bool(_allowed_origins),
    allow_methods=["*"],
    allow_headers=["*"],
)
|
||||
|
||||
|
||||
# ---------- Pydantic 模型 ----------
|
||||
|
||||
# Response schema for one enriched article, populated from EnrichedArticle rows.
class ArticleOut(BaseModel):
    id: int
    rk_article_id: int
    title: str
    link: str
    feed_title: str
    category: str
    tags: List[str]
    heat_score: float
    importance_score: float
    duplication_score: float
    composite_score: float
    ai_summary: str
    is_representative: bool
    # Declared as datetime, not str: the ORM column is a DateTime, and pydantic
    # v2 does not coerce datetime objects into str fields, so a str annotation
    # fails response validation for every article that has a publication time.
    # The value is still emitted as an ISO 8601 string in the JSON response.
    published_at: Optional[datetime]

    # Allow validation directly from ORM attribute access.
    model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
|
||||
# One page of articles plus the total number of rows matching the filters.
class ArticleListOut(BaseModel):
    items: List[ArticleOut]
    total: int
|
||||
|
||||
|
||||
# Response schema for a daily brief, populated from DailyBrief rows.
class BriefOut(BaseModel):
    id: int
    brief_date: str
    total_articles: int
    unique_articles: int
    by_category: dict
    markdown_path: str

    # Allow validation directly from ORM attribute access.
    model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
|
||||
# Response schema for one taxonomy entry (category, tag or scoring rule).
class TaxonomyOut(BaseModel):
    id: int
    name: str
    kind: str
    description: str
    keywords: List[str]
    weight: float
    created_by_ai: bool

    # Allow validation directly from ORM attribute access.
    model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
|
||||
# Response schema for one configuration entry as returned by GET /api/settings.
class SettingOut(BaseModel):
    key: str
    value: str
    description: str
    is_sensitive: bool
    # presumably True when `value` was replaced by a mask — confirm in settings_manager
    is_masked: bool
    updated_at: Optional[str]
|
||||
|
||||
|
||||
# Request body for updating a single setting; the key comes from the URL path.
class SettingUpdate(BaseModel):
    value: str
|
||||
|
||||
|
||||
# Request body for a batch update: a mapping of setting key -> new value.
class BatchSettingsUpdate(BaseModel):
    settings: dict
|
||||
|
||||
|
||||
# Response schema for the dashboard counters returned by GET /api/stats.
class StatsOut(BaseModel):
    total_articles: int
    today_articles: int
    ai_summarized: int
    categories: int
    tags: int
    duplicate_groups: int
    briefs: int
    # Scheduler job id -> next run time as an ISO string, or None when unscheduled.
    next_jobs: dict
|
||||
|
||||
|
||||
# ---------- 健康检查 ----------
|
||||
|
||||
@app.get("/health")
def health():
    """Liveness probe: report a fixed OK payload naming this service."""
    payload = {"status": "ok", "service": "dataClean"}
    return payload
|
||||
|
||||
|
||||
# ---------- 文章接口 ----------
|
||||
|
||||
@app.get("/api/articles", response_model=ArticleListOut)
def list_articles(
    date: Optional[str] = Query(None, description="日期 YYYY-MM-DD"),
    category: Optional[str] = Query(None),
    tag: Optional[str] = Query(None),
    representative_only: bool = Query(False, description="仅返回重复组代表文章"),
    limit: int = Query(50, ge=1, le=200),
    offset: int = Query(0, ge=0),
    db: Session = Depends(get_db),
):
    """List enriched articles, highest composite score first.

    Args:
        date: Optional day (YYYY-MM-DD); keeps articles fetched on that day.
        category: Optional exact category match.
        tag: Optional tag the article must carry.
        representative_only: Keep only group representatives and articles
            that belong to no duplicate group.
        limit: Page size.
        offset: Number of rows to skip.
        db: Database session.

    Returns:
        A dict with the page of ``items`` and the ``total`` match count.

    Raises:
        HTTPException: 400 when ``date`` is not a valid YYYY-MM-DD date.
    """
    query = db.query(EnrichedArticle)

    if date:
        try:
            day = datetime.strptime(date, "%Y-%m-%d")
        except ValueError as exc:
            # A malformed date is a client error, not an internal server error.
            raise HTTPException(
                status_code=400, detail="日期格式无效,应为 YYYY-MM-DD"
            ) from exc
        next_day = day + timedelta(days=1)
        query = query.filter(
            EnrichedArticle.fetched_at >= day,
            EnrichedArticle.fetched_at < next_day,
        )
    if category:
        query = query.filter(EnrichedArticle.category == category)
    if tag:
        # NOTE(review): the original comment says this uses SQLite json_each
        # for exact matching, but no json_each call is visible here — confirm
        # which SQL `.contains([tag])` emits for the JSON column and that it
        # matches articles carrying more than one tag.
        query = query.filter(EnrichedArticle.tags.contains([tag]))
    if representative_only:
        query = query.filter(
            EnrichedArticle.is_representative.is_(True)
            | EnrichedArticle.duplicate_group_id.is_(None)
        )

    total = query.count()
    items = (
        query
        # The id tie-breaker keeps paging stable when scores are equal.
        .order_by(EnrichedArticle.composite_score.desc(), EnrichedArticle.id.asc())
        .offset(offset)
        .limit(limit)
        .all()
    )
    return {"items": items, "total": total}
|
||||
|
||||
|
||||
@app.get("/api/articles/{article_id}", response_model=ArticleOut)
def get_article(article_id: int, db: Session = Depends(get_db)):
    """Return the enriched article with the given id, or respond with 404."""
    by_id = db.query(EnrichedArticle).filter(EnrichedArticle.id == article_id)
    found = by_id.first()
    if found:
        return found
    raise HTTPException(status_code=404, detail="文章不存在")
|
||||
|
||||
|
||||
# ---------- 简报接口 ----------
|
||||
|
||||
@app.get("/api/briefs", response_model=List[BriefOut])
def list_briefs(
    limit: int = Query(30, ge=1, le=100),
    db: Session = Depends(get_db),
):
    """Return up to ``limit`` daily briefs, newest brief date first."""
    newest_first = db.query(DailyBrief).order_by(DailyBrief.brief_date.desc())
    page = newest_first.limit(limit).all()
    return page
|
||||
|
||||
|
||||
@app.get("/api/briefs/{date}", response_model=BriefOut)
def get_brief(date: str, db: Session = Depends(get_db)):
    """Return the brief stored for ``date``, or respond with 404."""
    by_date = db.query(DailyBrief).filter(DailyBrief.brief_date == date)
    found = by_date.first()
    if found:
        return found
    raise HTTPException(status_code=404, detail="简报不存在")
|
||||
|
||||
|
||||
@app.post("/api/briefs/{date}/regenerate")
def regenerate_brief(date: str, db: Session = Depends(get_db), _=Depends(verify_token)):
    """Force-regenerate the daily brief for ``date``.

    Runs under the shared task lock, like the manual brief task, so it cannot
    overlap with another running task.

    Raises:
        HTTPException: 400 when ``date`` is not a valid YYYY-MM-DD date,
            409 when another task holds the lock, 500 when generation fails.
    """
    try:
        datetime.strptime(date, "%Y-%m-%d")
    except ValueError as exc:
        raise HTTPException(
            status_code=400, detail="日期格式无效,应为 YYYY-MM-DD"
        ) from exc

    try:
        data = _run_task_locked(
            lambda session: generate_daily_brief(session, date_str=date, force=True),
            db,
        )
    except HTTPException:
        # Keep the 409 from the lock helper instead of turning it into a 500.
        raise
    except Exception as exc:
        # logger.exception records the traceback as well as the message.
        logger.exception("重新生成简报失败: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    return {"message": "简报已重新生成", "data": data}
|
||||
|
||||
|
||||
# ---------- 分类体系接口 ----------
|
||||
|
||||
@app.get("/api/taxonomy", response_model=List[TaxonomyOut])
def get_taxonomy(kind: Optional[str] = Query(None), db: Session = Depends(get_db)):
    """Return taxonomy entries, passing the optional ``kind`` filter through."""
    entries = list_taxonomy(db, kind=kind)
    return entries
|
||||
|
||||
|
||||
@app.post("/api/taxonomy/bootstrap")
def trigger_taxonomy_bootstrap(
    force: bool = False,
    db: Session = Depends(get_db),
    _=Depends(verify_token),
):
    """Run taxonomy bootstrap and report the outcome as a message.

    A falsy result from ``bootstrap_taxonomy`` is reported as "already exists
    or failed"; the response does not distinguish between the two.
    """
    if bootstrap_taxonomy(db, force=force):
        return {"message": "taxonomy 初始化成功"}
    return {"message": "taxonomy 已存在或初始化失败,请检查日志"}
|
||||
|
||||
|
||||
# ---------- 手动触发任务接口 ----------
|
||||
|
||||
@app.post("/api/tasks/summarize")
def task_summarize(db: Session = Depends(get_db), _=Depends(verify_token)):
    """Manually run the summarise task under the task lock."""

    def _summarize(session):
        # Same window and cap as the original call: last 24 hours, at most 200.
        return fetch_and_summarize(session, hours=24, limit=200)

    result = _run_task_locked(_summarize, db)
    return {"message": "摘要任务完成", "stats": result}
|
||||
|
||||
|
||||
@app.post("/api/tasks/tag-score-dedup")
def task_tag_score_dedup(db: Session = Depends(get_db), _=Depends(verify_token)):
    """Manually run tagging, de-duplication and scoring under the task lock."""

    def _pipeline(session):
        # Order matters: tag first, then group today's duplicates, then score.
        tag_articles(session)
        utc_today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
        deduplicate_articles(session, date_str=utc_today)
        score_articles(session, update_duplication=True)

    _run_task_locked(_pipeline, db)
    return {"message": "分类/去重/打分任务完成"}
|
||||
|
||||
|
||||
@app.post("/api/tasks/brief")
def task_brief(db: Session = Depends(get_db), _=Depends(verify_token)):
    """Manually force-generate today's (UTC) daily brief under the task lock."""
    # The date is computed inside the callable, i.e. only once the lock is held.
    result = _run_task_locked(
        lambda session: generate_daily_brief(
            session,
            date_str=datetime.now(timezone.utc).strftime("%Y-%m-%d"),
            force=True,
        ),
        db,
    )
    return {"message": "简报生成任务完成", "data": result}
|
||||
|
||||
|
||||
# ---------- 配置管理接口 ----------
|
||||
|
||||
@app.get("/api/settings", response_model=List[SettingOut])
def get_settings(db: Session = Depends(get_db), _=Depends(verify_token)):
    """Return all settings, requesting masking of sensitive values."""
    entries = list_settings(db, mask_sensitive=True)
    return entries
|
||||
|
||||
|
||||
@app.put("/api/settings/{key}")
def update_setting(
    key: str,
    body: SettingUpdate,
    db: Session = Depends(get_db),
    _=Depends(verify_token),
):
    """Store a new value for one setting; respond 400 when it is rejected."""
    if set_setting(db, key, body.value):
        return {"message": "配置已保存,重启服务后生效"}
    raise HTTPException(status_code=400, detail="无效的配置项")
|
||||
|
||||
|
||||
@app.put("/api/settings")
def update_settings_batch(
    body: BatchSettingsUpdate,
    db: Session = Depends(get_db),
    _=Depends(verify_token),
):
    """Store several settings at once.

    Every key is attempted in order; the keys that were rejected are reported
    together in a single 400 response.
    """
    rejected = [
        key
        for key, value in body.settings.items()
        if not set_setting(db, key, value)
    ]
    if not rejected:
        return {"message": "配置已保存,重启服务后生效"}
    raise HTTPException(status_code=400, detail=f"以下配置项无效: {', '.join(rejected)}")
|
||||
|
||||
|
||||
@app.post("/api/settings/reset")
def reset_all_settings(db: Session = Depends(get_db), _=Depends(verify_token)):
    """Reset all stored settings and confirm with a message."""
    reset_settings(db)
    confirmation = {"message": "配置已重置为环境变量默认值,重启服务后生效"}
    return confirmation
|
||||
|
||||
|
||||
# ---------- 仪表盘统计接口 ----------
|
||||
|
||||
@app.get("/api/stats", response_model=StatsOut)
def get_stats(db: Session = Depends(get_db)):
    """Collect the dashboard counters and each scheduler job's next run time."""
    # Midnight of the current UTC day, as a naive datetime.
    utc_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    window_start = datetime.strptime(utc_date, "%Y-%m-%d")
    window_end = window_start + timedelta(days=1)

    counters = {}
    counters["total_articles"] = db.query(EnrichedArticle).count()
    fetched_today = db.query(EnrichedArticle).filter(
        EnrichedArticle.fetched_at >= window_start,
        EnrichedArticle.fetched_at < window_end,
    )
    counters["today_articles"] = fetched_today.count()
    with_summary = db.query(EnrichedArticle).filter(EnrichedArticle.ai_summary != "")
    counters["ai_summarized"] = with_summary.count()
    counters["categories"] = db.query(Taxonomy).filter(Taxonomy.kind == "category").count()
    counters["tags"] = db.query(Taxonomy).filter(Taxonomy.kind == "tag").count()
    counters["duplicate_groups"] = db.query(DuplicateGroup).count()
    counters["briefs"] = db.query(DailyBrief).count()

    # Job id -> ISO timestamp of the next run, or None when the job has none.
    counters["next_jobs"] = {
        job.id: (job.next_run_time.isoformat() if job.next_run_time else None)
        for job in get_scheduler().get_jobs()
    }
    return counters
|
||||
|
||||
|
||||
# ---------- 静态文件托管(生产环境) ----------
|
||||
|
||||
# Serve the built frontend from ./static next to this file when it exists.
static_dir = os.path.join(os.path.dirname(__file__), "static")
if not os.path.isdir(static_dir):
    # For local builds, frontend/dist can serve as the static file source instead.
    frontend_dist = os.path.join(os.path.dirname(__file__), "frontend", "dist")
    if os.path.isdir(frontend_dist):
        static_dir = frontend_dist

# Mounted only when one of the directories exists; html=True is passed so the
# mount serves HTML pages. The mount sits at "/" and is added after the API
# routes above have been declared.
if os.path.isdir(static_dir):
    app.mount("/", StaticFiles(directory=static_dir, html=True), name="static")


# Running this module directly starts uvicorn on all interfaces, port 7331.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7331)
|
||||
@@ -0,0 +1,109 @@
|
||||
"""SQLAlchemy 数据模型"""
|
||||
from datetime import datetime, timezone
|
||||
from sqlalchemy import Column, Integer, String, Text, Boolean, DateTime, Float, ForeignKey, JSON
|
||||
from sqlalchemy.orm import relationship
|
||||
|
||||
from database import Base
|
||||
|
||||
|
||||
def _utc_now():
|
||||
return datetime.now(timezone.utc)
|
||||
|
||||
|
||||
class EnrichedArticle(Base):
    """Processed article: AI summary, category, tags, scores and de-duplication info."""

    __tablename__ = "articles_enriched"

    id = Column(Integer, primary_key=True, index=True)
    rk_article_id = Column(Integer, unique=True, nullable=False, index=True)

    # Source metadata.
    title = Column(String(1024), default="", index=True)
    link = Column(String(2048), default="", index=True)
    feed_id = Column(Integer, nullable=False, index=True)
    feed_title = Column(String(512), default="")
    feed_category = Column(String(128), default="")
    author = Column(String(256), default="")

    published_at = Column(DateTime, nullable=True, index=True)
    fetched_at = Column(DateTime, default=_utc_now, index=True)

    # Text payloads.
    original_summary = Column(Text, default="")
    content = Column(Text, default="")
    ai_summary = Column(Text, default="")

    # Classification.
    category = Column(String(128), default="", index=True)
    tags = Column(JSON, default=list)

    # Scores.
    heat_score = Column(Float, default=0.0)
    importance_score = Column(Float, default=0.0)
    duplication_score = Column(Float, default=0.0)
    composite_score = Column(Float, default=0.0)

    # De-duplication.
    duplicate_group_id = Column(
        Integer,
        ForeignKey("duplicate_groups.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )
    is_representative = Column(Boolean, default=False, index=True)

    brief_date = Column(String(10), default="", index=True)

    created_at = Column(DateTime, default=_utc_now)
    updated_at = Column(DateTime, default=_utc_now, onupdate=_utc_now)

    # articles_enriched and duplicate_groups reference each other
    # (duplicate_group_id here, representative_article_id there), so the join
    # column must be named explicitly or the relationship is ambiguous.
    duplicate_group = relationship(
        "DuplicateGroup",
        back_populates="articles",
        foreign_keys=[duplicate_group_id],
    )


class Taxonomy(Base):
    """Categories, tags and scoring rules."""

    __tablename__ = "taxonomy"

    id = Column(Integer, primary_key=True, index=True)
    name = Column(String(128), nullable=False, index=True)
    kind = Column(String(32), nullable=False, index=True)  # category/tag/heat_rule/importance_rule/duplication_rule
    description = Column(Text, default="")
    keywords = Column(JSON, default=list)  # keyword or rule list
    weight = Column(Float, default=1.0)
    created_by_ai = Column(Boolean, default=False)
    created_at = Column(DateTime, default=_utc_now)


class DuplicateGroup(Base):
    """Group of duplicate articles."""

    __tablename__ = "duplicate_groups"

    id = Column(Integer, primary_key=True, index=True)
    representative_article_id = Column(
        Integer,
        ForeignKey("articles_enriched.id", ondelete="SET NULL"),
        nullable=True,
    )
    member_article_ids = Column(JSON, default=list)
    similarity_matrix = Column(JSON, default=dict)
    brief_date = Column(String(10), default="", index=True)
    created_at = Column(DateTime, default=_utc_now)

    # Members are the articles whose duplicate_group_id points at this group.
    # The foreign key is named explicitly because representative_article_id is
    # a second foreign-key path between the same two tables.
    articles = relationship(
        "EnrichedArticle",
        back_populates="duplicate_group",
        foreign_keys="EnrichedArticle.duplicate_group_id",
    )
|
||||
|
||||
|
||||
class DailyBrief(Base):
    """Daily brief record; ``brief_date`` is unique, so there is one row per date."""

    __tablename__ = "daily_briefs"

    id = Column(Integer, primary_key=True, index=True)
    # Date key of the brief (max 10 chars; presumably "YYYY-MM-DD" — confirm against callers).
    brief_date = Column(String(10), unique=True, nullable=False, index=True)
    total_articles = Column(Integer, default=0)
    unique_articles = Column(Integer, default=0)
    # Callable default so every row gets its own fresh dict.
    by_category = Column(JSON, default=lambda: dict())
    # Presumably the location of the rendered Markdown file — confirm against the brief generator.
    markdown_path = Column(String(512), default="")
    created_at = Column(DateTime, default=_utc_now)
    # Refreshed on every UPDATE through ``onupdate``.
    updated_at = Column(DateTime, default=_utc_now, onupdate=_utc_now)
|
||||
|
||||
|
||||
class AppSetting(Base):
    """Runtime configuration table: one row per unique setting ``key``."""

    __tablename__ = "app_settings"

    id = Column(Integer, primary_key=True, index=True)
    key = Column(String(128), unique=True, nullable=False, index=True)
    # Values are stored as text; any type conversion is left to the reader of the setting.
    value = Column(Text, default="")
    description = Column(Text, default="")
    # NOTE(review): presumably marks values that must be masked when displayed — confirm with the settings API.
    is_sensitive = Column(Boolean, default=False)
    # Refreshed on every UPDATE through ``onupdate``.
    updated_at = Column(DateTime, default=_utc_now, onupdate=_utc_now)
|
||||
@@ -0,0 +1,10 @@
|
||||
fastapi==0.115.0
|
||||
uvicorn[standard]==0.32.0
|
||||
sqlalchemy==2.0.36
|
||||
pydantic==2.9.2
|
||||
pydantic-settings==2.6.1
|
||||
requests==2.32.3
|
||||
apscheduler==3.10.4
|
||||
openai==1.55.3
|
||||
scikit-learn==1.5.2
|
||||
python-dateutil==2.9.0.post0
|
||||
+166
@@ -0,0 +1,166 @@
|
||||
"""APScheduler 定时任务"""
|
||||
import functools
|
||||
import logging
|
||||
import threading
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from apscheduler.schedulers.background import BackgroundScheduler
|
||||
from apscheduler.triggers.date import DateTrigger
|
||||
from apscheduler.triggers.interval import IntervalTrigger
|
||||
from apscheduler.triggers.cron import CronTrigger
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from config import settings
|
||||
from database import SessionLocal
|
||||
from app.taxonomy import ensure_taxonomy, bootstrap_taxonomy
|
||||
from app.summarizer import fetch_and_summarize
|
||||
from app.tagger import tag_articles
|
||||
from app.deduplicator import deduplicate_articles
|
||||
from app.scorer import score_articles
|
||||
from app.brief import generate_daily_brief
|
||||
from app.settings_manager import get_setting_value
|
||||
|
||||
logger = logging.getLogger(__name__)

# Process-wide scheduler instance; created on first use by get_scheduler()
# and reset to None by stop_scheduler().
_scheduler: BackgroundScheduler | None = None

# Task mutex: prevents manually triggered tasks and scheduled tasks from
# running concurrently.
_task_lock = threading.Lock()
|
||||
|
||||
|
||||
def get_scheduler() -> BackgroundScheduler:
    """Return the shared ``BackgroundScheduler``, creating it on first call.

    The instance is cached in the module-level ``_scheduler`` and configured
    with the job defaults below and the ``Asia/Shanghai`` timezone. It is only
    constructed here, not started.
    """
    global _scheduler
    if _scheduler is not None:
        return _scheduler

    defaults = {
        "coalesce": True,
        "max_instances": 1,
        "misfire_grace_time": 300,
    }
    _scheduler = BackgroundScheduler(job_defaults=defaults, timezone="Asia/Shanghai")
    return _scheduler
|
||||
|
||||
|
||||
def get_task_lock():
    """Return the global task mutex so manual-task endpoints can share it."""
    lock = _task_lock
    return lock
|
||||
|
||||
|
||||
def _with_db(func):
    """Decorator: run a job with a database session under the global task lock.

    The returned zero-argument wrapper:

    * tries to take ``_task_lock`` without blocking and skips the run (with a
      warning) when another task already holds it;
    * opens a session with ``SessionLocal()`` and passes it to ``func``;
    * logs any exception instead of propagating it;
    * always closes the session and releases the lock.

    The session is created inside the ``try`` block so that a failure in
    ``SessionLocal()`` can no longer leave the lock held forever, which would
    make every later job skip itself.
    """
    @functools.wraps(func)
    def wrapper():
        if not _task_lock.acquire(blocking=False):
            logger.warning("定时任务 %s 跳过:已有其他任务正在执行", func.__name__)
            return

        db = None
        try:
            db = SessionLocal()
            func(db)
        except Exception as exc:
            logger.error("定时任务 %s 执行失败: %s", func.__name__, exc, exc_info=True)
        finally:
            try:
                if db is not None:
                    db.close()
            finally:
                # Release even if closing the session itself fails.
                _task_lock.release()

    return wrapper
|
||||
|
||||
|
||||
@_with_db
def job_bootstrap_taxonomy(db: Session):
    """Run the taxonomy initialisation check.

    Delegates to ``ensure_taxonomy``; per the original note it is expected to
    do work only when the taxonomy table is empty.
    """
    logger.info("执行 taxonomy 初始化检查")
    ensure_taxonomy(db)
|
||||
|
||||
|
||||
@_with_db
def job_fetch_and_summarize(db: Session):
    """Fetch articles and generate their summaries.

    Calls ``fetch_and_summarize`` with ``hours=24`` and ``limit=200``.
    """
    logger.info("执行摘要生成任务")
    fetch_options = {"hours": 24, "limit": 200}
    fetch_and_summarize(db, **fetch_options)
|
||||
|
||||
|
||||
@_with_db
def job_tag_score_deduplicate(db: Session):
    """Classify, de-duplicate and score the current day's articles.

    Steps run in order: tagging, de-duplication for today's date, then score
    recomputation including the duplication score.
    """
    logger.info("执行分类/打分/去重任务")
    # NOTE(review): the date is taken in UTC while the scheduler itself is
    # configured for Asia/Shanghai — confirm this is the intended day boundary.
    today = datetime.now(timezone.utc).date().isoformat()

    # 1. Tag articles that have not been classified yet.
    tag_articles(db)

    # 2. De-duplicate today's articles.
    deduplicate_articles(db, date_str=today)

    # 3. Recompute scores, duplication score included.
    score_articles(db, update_duplication=True)
|
||||
|
||||
|
||||
@_with_db
def job_generate_daily_brief(db: Session):
    """Generate the daily brief for the current UTC date, forcing regeneration."""
    logger.info("执行每日简报生成任务")
    # NOTE(review): UTC date, whereas the cron trigger fires in the
    # scheduler's Asia/Shanghai timezone — confirm the intended brief date.
    brief_date = datetime.now(timezone.utc).date().isoformat()
    generate_daily_brief(db, date_str=brief_date, force=True)
|
||||
|
||||
|
||||
def init_scheduler():
    """Register every scheduled job and start the scheduler.

    Schedule parameters are read through ``get_setting_value`` with the
    values from ``settings`` as fallbacks (per the original note: database /
    environment variables) and coerced to ``int``. Four jobs are registered
    with ``replace_existing=True``: a one-shot taxonomy bootstrap dated "now",
    two interval jobs and the daily cron job.
    """
    scheduler = get_scheduler()

    def _read_int(key, fallback):
        return int(get_setting_value(key, fallback))

    def _register(job, trigger, job_id, **extra):
        scheduler.add_job(job, trigger=trigger, id=job_id, replace_existing=True, **extra)

    summarize_interval = _read_int("SUMMARIZE_INTERVAL_MINUTES", settings.SUMMARIZE_INTERVAL_MINUTES)
    tag_score_interval = _read_int("TAG_SCORE_INTERVAL_MINUTES", settings.TAG_SCORE_INTERVAL_MINUTES)
    brief_hour = _read_int("DAILY_BRIEF_HOUR", settings.DAILY_BRIEF_HOUR)
    brief_minute = _read_int("DAILY_BRIEF_MINUTE", settings.DAILY_BRIEF_MINUTE)

    # Taxonomy bootstrap: runs once, right after the service starts.
    _register(
        job_bootstrap_taxonomy,
        DateTrigger(run_date=datetime.now()),
        "bootstrap_taxonomy",
        max_instances=1,
    )

    # Summarisation job.
    _register(
        job_fetch_and_summarize,
        IntervalTrigger(minutes=summarize_interval),
        "fetch_and_summarize",
    )

    # Tagging / scoring / de-duplication job.
    _register(
        job_tag_score_deduplicate,
        IntervalTrigger(minutes=tag_score_interval),
        "tag_score_deduplicate",
    )

    # Daily brief.
    _register(
        job_generate_daily_brief,
        CronTrigger(hour=brief_hour, minute=brief_minute),
        "generate_daily_brief",
    )

    scheduler.start()
    logger.info(
        "调度器已启动: summarize=%d分钟, tag_score=%d分钟, brief=%02d:%02d",
        summarize_interval,
        tag_score_interval,
        brief_hour,
        brief_minute,
    )
|
||||
|
||||
|
||||
def stop_scheduler():
    """Stop the scheduler, if there is one, and drop the cached instance.

    ``get_scheduler()`` can create the instance without starting it, and
    APScheduler's ``shutdown()`` raises when the scheduler is not running.
    ``shutdown(wait=False)`` is therefore only called on a running scheduler;
    in every case the module-level reference is reset so a later
    ``get_scheduler()`` builds a fresh instance. Calling this when no
    scheduler exists is a no-op.
    """
    global _scheduler
    if _scheduler is None:
        return

    scheduler, _scheduler = _scheduler, None
    if scheduler.running:
        scheduler.shutdown(wait=False)
    logger.info("调度器已停止")
|
||||
@@ -0,0 +1,21 @@
|
||||
"""测试配置"""
|
||||
import pytest
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
from database import Base
|
||||
from models import EnrichedArticle, Taxonomy, DuplicateGroup, DailyBrief
|
||||
|
||||
|
||||
TEST_DATABASE_URL = "sqlite:///:memory:"
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
def db():
    """Yield a session bound to a fresh in-memory SQLite database.

    All tables registered on ``Base.metadata`` are created before the test
    and dropped afterwards. Teardown sits in ``finally`` and also disposes of
    the engine, so each test releases its connection pool instead of leaving
    it to garbage collection.
    """
    engine = create_engine(TEST_DATABASE_URL, connect_args={"check_same_thread": False})
    Base.metadata.create_all(bind=engine)
    session = sessionmaker(bind=engine)()
    try:
        yield session
    finally:
        session.close()
        Base.metadata.drop_all(bind=engine)
        engine.dispose()
|
||||
@@ -0,0 +1,78 @@
|
||||
"""去重模块测试"""
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
from app.deduplicator import _title_similarity, _find_duplicate_clusters, deduplicate_articles
|
||||
from models import EnrichedArticle
|
||||
|
||||
|
||||
def test_title_similarity_identical():
    """Two identical titles must score above 0.95."""
    title = "OpenAI 发布 GPT-5"
    similarity = _title_similarity(title, title)
    assert similarity > 0.95
|
||||
|
||||
|
||||
def test_title_similarity_different():
    """Unrelated titles must score below 0.5."""
    first, second = "OpenAI 发布 GPT-5", "苹果发布新款 iPhone"
    similarity = _title_similarity(first, second)
    assert similarity < 0.5
|
||||
|
||||
|
||||
def test_find_duplicate_clusters(db):
    """The two near-identical GPT-5 articles form the only cluster (indices 0 and 1)."""
    samples = (
        (1, "OpenAI 发布 GPT-5,性能大幅提升", "OpenAI 今天发布了 GPT-5,性能大幅提升。"),
        (2, "OpenAI 发布 GPT-5 性能大幅提升", "OpenAI 发布了 GPT-5,性能提升明显。"),
        (3, "苹果发布新款 iPhone", "苹果公司发布了新款 iPhone。"),
    )
    # Objects are built in memory only; nothing is written to the database.
    articles = [
        EnrichedArticle(rk_article_id=article_id, title=title, content=content)
        for article_id, title, content in samples
    ]

    clusters = _find_duplicate_clusters(articles, title_threshold=0.85, content_threshold=0.80)

    assert len(clusters) == 1
    assert {0, 1} in clusters
|
||||
|
||||
|
||||
def test_deduplicate_articles(db):
    """End-to-end de-duplication of three persisted articles.

    Two GPT-5 articles are expected to end up in one duplicate group with a
    single representative; the iPhone article stays on its own.

    ``feed_id`` is set on every row because the column is ``nullable=False``
    without a default; omitting it makes the commit fail with an
    IntegrityError before the code under test ever runs.
    """
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    day_start = datetime.strptime(today, "%Y-%m-%d")

    a1 = EnrichedArticle(
        rk_article_id=1,
        feed_id=1,
        title="OpenAI 发布 GPT-5",
        content="OpenAI 今天发布了 GPT-5。",
        fetched_at=day_start,
    )
    a2 = EnrichedArticle(
        rk_article_id=2,
        feed_id=1,
        title="OpenAI 发布 GPT-5 性能提升",
        content="OpenAI 发布了 GPT-5,性能提升。",
        fetched_at=day_start + timedelta(minutes=10),
    )
    a3 = EnrichedArticle(
        rk_article_id=3,
        feed_id=1,
        title="苹果发布新款 iPhone",
        content="苹果发布了 iPhone。",
        fetched_at=day_start + timedelta(minutes=20),
    )

    db.add_all([a1, a2, a3])
    db.commit()

    stats = deduplicate_articles(db, date_str=today, title_threshold=0.85, content_threshold=0.80)

    assert stats["total"] == 3
    assert stats["duplicate_groups"] == 1
    assert stats["representatives"] == 1

    # Reload the rows so the assertions see what was written to the database.
    for article in (a1, a2, a3):
        db.refresh(article)

    representatives = [a for a in [a1, a2, a3] if a.is_representative]
    assert len(representatives) == 1
    assert representatives[0].duplicate_group_id is not None
|
||||
@@ -0,0 +1,46 @@
|
||||
"""打分模块测试"""
|
||||
from datetime import datetime
|
||||
|
||||
from app.scorer import compute_heat_score, compute_importance_score, compute_duplication_score, compute_composite_score, score_articles
|
||||
from models import EnrichedArticle, Taxonomy, DuplicateGroup
|
||||
|
||||
|
||||
def test_compute_heat_score():
    """A title containing a heat-rule keyword yields a positive heat score."""
    heat_rule = Taxonomy(name="AI", kind="heat_rule", keywords=["AI", "大模型"], weight=1.5)
    article = EnrichedArticle(title="OpenAI 发布 GPT-5 大模型")

    assert compute_heat_score(article, [heat_rule]) > 0
|
||||
|
||||
|
||||
def test_compute_importance_score():
    """A title containing importance-rule keywords yields a positive importance score."""
    importance_rule = Taxonomy(name="政策", kind="importance_rule", keywords=["政策", "监管"], weight=2.0)
    article = EnrichedArticle(title="新政策发布,加强 AI 监管")

    assert compute_importance_score(article, [importance_rule]) > 0
|
||||
|
||||
|
||||
def test_compute_duplication_score():
    """Check the duplication score at the two reference inputs (1 -> 0.0, 5 -> 100.0)."""
    expectations = ((1, 0.0), (5, 100.0))
    for value, expected in expectations:
        assert compute_duplication_score(value) == expected
|
||||
|
||||
|
||||
def test_compute_composite_score():
    """The composite score equals the 0.3 / 0.5 / 0.2 weighted sum, rounded to 2 places."""
    heat, importance, duplication = 50, 80, 30
    expected = heat * 0.3 + importance * 0.5 + duplication * 0.2

    score = compute_composite_score(heat, importance, duplication)

    assert score == round(expected, 2)
|
||||
|
||||
|
||||
def test_score_articles_integration(db):
    """Scoring a persisted article against stored rules yields positive scores.

    ``feed_id`` is supplied because the column is ``nullable=False`` without a
    default; without it the commit fails with an IntegrityError before
    ``score_articles`` is called.
    """
    db.add_all([
        Taxonomy(name="AI", kind="heat_rule", keywords=["AI"], weight=1.5),
        Taxonomy(name="政策", kind="importance_rule", keywords=["政策"], weight=2.0),
    ])
    article = EnrichedArticle(rk_article_id=1, feed_id=1, title="AI 新政策发布")
    db.add(article)
    db.commit()

    score_articles(db, article_ids=[article.id])

    assert article.heat_score > 0
    assert article.importance_score > 0
    assert article.composite_score > 0
|
||||
@@ -0,0 +1,43 @@
|
||||
"""分类/标签模块测试"""
|
||||
from app.tagger import classify_article, tag_article, tag_articles
|
||||
from models import EnrichedArticle, Taxonomy
|
||||
|
||||
|
||||
def test_classify_article(db):
    """An article whose title hits a tech keyword is classified as "科技"."""
    tech = Taxonomy(name="科技", kind="category", keywords=["AI", "大模型", "芯片"])
    finance = Taxonomy(name="财经", kind="category", keywords=["股市", "基金", "财报"])
    article = EnrichedArticle(rk_article_id=1, title="OpenAI 发布新一代大模型")

    category = classify_article(article, [tech, finance])

    assert category == "科技"
|
||||
|
||||
|
||||
def test_tag_article(db):
    """A title matching the AI tag's keywords gets the "人工智能" tag."""
    ai_tag = Taxonomy(name="人工智能", kind="tag", keywords=["AI", "人工智能", "大模型"])
    chip_tag = Taxonomy(name="半导体", kind="tag", keywords=["芯片", "半导体"])
    article = EnrichedArticle(rk_article_id=1, title="OpenAI 发布新一代大模型")

    assigned = tag_article(article, [ai_tag, chip_tag])

    assert "人工智能" in assigned
|
||||
|
||||
|
||||
def test_tag_articles_integration(db):
    """Tagging a persisted article assigns both its category and its tags.

    ``feed_id`` is supplied because the column is ``nullable=False`` without a
    default; without it the commit fails with an IntegrityError before
    ``tag_articles`` is called.
    """
    db.add_all([
        Taxonomy(name="科技", kind="category", keywords=["AI", "大模型"]),
        Taxonomy(name="人工智能", kind="tag", keywords=["AI", "大模型"]),
    ])
    article = EnrichedArticle(rk_article_id=1, feed_id=1, title="OpenAI 发布 GPT-5 大模型")
    db.add(article)
    db.commit()

    count = tag_articles(db)

    assert count == 1
    assert article.category == "科技"
    assert "人工智能" in article.tags
|
||||
Reference in New Issue
Block a user