Files
SnapAndAnaly/backend/app/services/ingest.py
T
congsh 5c028d7952 Initial commit: snapAna 截图智能整理工具
包含 FastAPI 后端、React 前端、队列/OCR/标签/待办等完整功能。

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-27 15:45:50 +08:00

148 lines
4.4 KiB
Python

"""将磁盘上的截图文件入库 + 排队分析。"""
from __future__ import annotations
from datetime import datetime
from pathlib import Path
from typing import Iterable, Optional
from PIL import Image
from sqlalchemy import select
from sqlalchemy.orm import Session
from app.core.path_utils import (
is_accessible_dir,
is_accessible_file,
path_from_storage,
path_to_storage,
)
from app.core.logger import get_logger
from app.models.job import Job, JobKind, JobStatus
from app.models.screenshot import ProcessStatus, Screenshot
from app.models.tag import Tag
from app.services.exif_utils import extract_image_metadata
from app.services.thumbnail import file_hash, generate_thumbnail, is_supported
logger = get_logger(__name__)
def ingest_path(session: Session, path: Path) -> Optional[Screenshot]:
    """Register a single screenshot file and queue it for analysis.

    A file whose content hash matches an existing ``Screenshot`` is treated
    as a rename/move of that screenshot: the stored path is refreshed (and
    flushed) and the existing row is returned; no new job is queued.

    Args:
        session: SQLAlchemy session used for lookups and inserts. This
            function only adds/flushes; it never commits.
        path: Location of the image file on disk.

    Returns:
        The newly created ``Screenshot``, the already-known ``Screenshot``
        with the same content hash, or ``None`` when the file is not
        accessible, not a supported type, or cannot be read.
    """
    if not is_accessible_file(path) or not path.is_file():
        return None
    if not is_supported(path):
        return None

    stored_path = path_to_storage(path)
    try:
        digest = file_hash(path)
    except OSError as exc:
        logger.warning("无法读取文件 %s: %s", path, exc)
        return None

    existing = session.scalar(select(Screenshot).where(Screenshot.file_hash == digest))
    if existing is not None:
        # Same content under a new name/location: just update the path.
        if existing.path != stored_path:
            existing.path = stored_path
            session.flush()
        return existing

    # The file can disappear (or a network share can drop) between hashing
    # and this point; treat that like any other unreadable file instead of
    # letting OSError escape and abort the caller's whole batch.
    try:
        stat = path.stat()
    except OSError as exc:
        logger.warning("无法读取文件 %s: %s", path, exc)
        return None

    # Dimensions are best-effort: an unreadable image is still recorded,
    # with 0x0 as the size.
    try:
        with Image.open(path) as img:
            width, height = img.size
    except Exception as exc:  # noqa: BLE001
        logger.warning("无法读取图片尺寸 %s: %s", path, exc)
        width, height = 0, 0

    # Prefer the capture time from image metadata; fall back to file mtime.
    exif_time, location_tags = extract_image_metadata(path)
    if exif_time is not None:
        captured_at = exif_time
    else:
        captured_at = datetime.fromtimestamp(stat.st_mtime)

    # Thumbnail generation is best-effort as well; a failure leaves
    # thumb_path unset.
    try:
        thumb = generate_thumbnail(path)
        thumb_path = thumb.as_posix()
    except Exception as exc:  # noqa: BLE001
        logger.warning("生成缩略图失败 %s: %s", path, exc)
        thumb_path = None

    shot = Screenshot(
        path=stored_path,
        file_hash=digest,
        width=width,
        height=height,
        size=stat.st_size,
        captured_at=captured_at,
        thumb_path=thumb_path,
        ocr_status=ProcessStatus.PENDING.value,
        ai_status=ProcessStatus.PENDING.value,
    )
    session.add(shot)
    # Flush so shot.id is populated before it is referenced below.
    session.flush()

    if location_tags:
        _attach_location_tags(session, shot, location_tags)

    job = Job(screenshot_id=shot.id, kind=JobKind.FULL.value, status=JobStatus.PENDING.value)
    session.add(job)
    logger.info("入库 #%d %s", shot.id, path.name)
    return shot
def _attach_location_tags(session: Session, shot: Screenshot, tag_names: list[str]) -> None:
    """Attach EXIF-derived location tags to a screenshot at ingest time.

    Each name is stripped and truncated to 64 characters; empty names are
    dropped. A ``Tag`` row is looked up by name and created (and flushed)
    when missing. The resulting list replaces ``shot.tags``.

    Args:
        session: SQLAlchemy session used to look up and create ``Tag`` rows.
        shot: Screenshot that receives the tags.
        tag_names: Raw tag names; entries may be empty or ``None``.
    """
    seen_names: set[str] = set()
    tag_objs: list[Tag] = []
    for raw in tag_names:
        # NOTE(review): 64 presumably mirrors the Tag.name column length — confirm.
        name = (raw or "").strip()[:64]
        # Different raw values can collapse to the same name after
        # strip/truncate; attach each tag only once.
        if not name or name in seen_names:
            continue
        seen_names.add(name)

        tag = session.scalar(select(Tag).where(Tag.name == name))
        if tag is None:
            tag = Tag(name=name)
            session.add(tag)
            session.flush()

        # Guard by identity too, in case the database matches two distinct
        # spellings to the same row (e.g. a case-insensitive comparison).
        if not any(tag is known for known in tag_objs):
            tag_objs.append(tag)
    shot.tags = tag_objs
def ingest_directory(
    session: Session,
    root: Path | str,
    recursive: bool = True,
) -> tuple[int, int]:
    """Walk a directory and ingest every supported file found in it.

    Work is committed every 50 processed files, plus once at the end, so a
    large directory does not accumulate into one huge transaction.

    Args:
        session: SQLAlchemy session; this function commits on it.
        root: Directory to scan. A ``str`` is converted with
            ``path_from_storage``; a ``Path`` is used as given.
        recursive: Descend into sub-directories when true, otherwise scan
            only the direct children of ``root``.

    Returns:
        ``(added, skipped)``. A file counts as added when ``ingest_path``
        returned a screenshot and no ``Screenshot`` row carried this stored
        path beforehand; every other supported file counts as skipped.
        ``(0, 0)`` is returned when ``root`` is not an accessible directory.

    Note:
        Novelty is judged by stored path, not by content. A file whose
        content is already known under a different path makes
        ``ingest_path`` return the existing row, and is therefore counted
        as added here.
    """
    root_p = path_from_storage(str(root)) if isinstance(root, str) else root
    if not is_accessible_dir(root_p):
        return 0, 0

    candidates = root_p.rglob("*") if recursive else root_p.iterdir()

    added, skipped = 0, 0
    for path in candidates:
        if not path.is_file() or not is_supported(path):
            continue

        stored = path_to_storage(path)
        before = session.scalar(
            select(Screenshot.id).where(Screenshot.path == stored)
        )
        result = ingest_path(session, path)
        if result is not None and before is None:
            added += 1
        else:
            skipped += 1

        # Commit in batches to avoid one giant transaction. This check runs
        # for every processed file, including ones ingest_path rejected, so
        # a batch boundary is never stepped over.
        if (added + skipped) % 50 == 0:
            session.commit()

    session.commit()
    return added, skipped