SnapAndAnaly/backend/app/services/ingest.py

"""将磁盘上的截图文件入库 + 排队分析。"""
from __future__ import annotations

from datetime import datetime
from pathlib import Path
from typing import Iterable, Optional

from PIL import Image
from sqlalchemy import select
from sqlalchemy.orm import Session

from app.core.path_utils import (
    is_accessible_dir,
    is_accessible_file,
    path_from_storage,
    path_to_storage,
)
from app.core.logger import get_logger
from app.models.job import Job, JobKind, JobStatus
from app.models.screenshot import ProcessStatus, Screenshot
from app.models.tag import Tag
from app.services.exif_utils import extract_image_metadata
from app.services.thumbnail import file_hash, generate_thumbnail, is_supported


# Module-level logger, obtained from the project's logging helper using this module's name.
logger = get_logger(__name__)


def ingest_path(session: Session, path: Path) -> Optional[Screenshot]:
    """Ingest a single image file and queue it for analysis.

    The function flushes the session but does not commit; the caller owns
    the transaction.

    Args:
        session: SQLAlchemy session used for lookups and inserts.
        path: File on disk to ingest.

    Returns:
        The newly created ``Screenshot``; or the already stored
        ``Screenshot`` when a row with the same content hash exists (its
        stored path is updated if the file has moved); or ``None`` when the
        file is not accessible, not a supported type, or cannot be read.
    """
    if not is_accessible_file(path) or not path.is_file():
        return None
    if not is_supported(path):
        return None

    stored_path = path_to_storage(path)

    try:
        digest = file_hash(path)
    except OSError as exc:
        logger.warning("无法读取文件 %s: %s", path, exc)
        return None

    existing = session.scalar(select(Screenshot).where(Screenshot.file_hash == digest))
    if existing:
        # Same content under a new name/location: just repoint the record.
        if existing.path != stored_path:
            existing.path = stored_path
            session.flush()
        return existing

    # The file may vanish or become unreadable between hashing and stat();
    # treat that like any other unreadable file instead of letting the
    # OSError abort the caller's whole scan.
    try:
        stat = path.stat()
    except OSError as exc:
        logger.warning("无法读取文件 %s: %s", path, exc)
        return None

    # Dimensions are best-effort: an undecodable image is still ingested,
    # with 0x0 recorded as its size.
    try:
        with Image.open(path) as img:
            width, height = img.size
    except Exception as exc:  # noqa: BLE001
        logger.warning("无法读取图片尺寸 %s: %s", path, exc)
        width, height = 0, 0

    # Prefer the capture time from image metadata; fall back to file mtime.
    exif_time, location_tags = extract_image_metadata(path)
    captured_at = exif_time if exif_time is not None else datetime.fromtimestamp(stat.st_mtime)

    # Thumbnail generation is best-effort as well.
    try:
        thumb_path = generate_thumbnail(path).as_posix()
    except Exception as exc:  # noqa: BLE001
        logger.warning("生成缩略图失败 %s: %s", path, exc)
        thumb_path = None

    shot = Screenshot(
        path=stored_path,
        file_hash=digest,
        width=width,
        height=height,
        size=stat.st_size,
        captured_at=captured_at,
        thumb_path=thumb_path,
        ocr_status=ProcessStatus.PENDING.value,
        ai_status=ProcessStatus.PENDING.value,
    )
    session.add(shot)
    # Flush so shot.id is assigned before it is referenced below.
    session.flush()

    if location_tags:
        _attach_location_tags(session, shot, location_tags)

    # Queue a full analysis job for the new screenshot.
    session.add(
        Job(screenshot_id=shot.id, kind=JobKind.FULL.value, status=JobStatus.PENDING.value)
    )
    logger.info("入库 #%d %s", shot.id, path.name)
    return shot


def _attach_location_tags(session: Session, shot: Screenshot, tag_names: list[str]) -> None:
    """Assign location tags (taken from image metadata) to a screenshot.

    Each name is stripped and truncated to 64 characters; empty names are
    ignored. Existing ``Tag`` rows are reused by name, missing ones are
    created and flushed. ``shot.tags`` is replaced with the resulting list.

    Args:
        session: SQLAlchemy session used to look up and create tags.
        shot: Screenshot whose ``tags`` collection is overwritten.
        tag_names: Raw tag names; ``None``/blank entries are tolerated.
    """
    seen: set[str] = set()
    tag_objs: list[Tag] = []
    for raw in tag_names:
        name = (raw or "").strip()[:64]
        # Different raw values can normalise to the same name (whitespace,
        # truncation); skip repeats so the same Tag is not put into
        # shot.tags more than once.
        if not name or name in seen:
            continue
        seen.add(name)
        tag = session.scalar(select(Tag).where(Tag.name == name))
        if tag is None:
            tag = Tag(name=name)
            session.add(tag)
            # Flush right away so the new tag is persisted before it is linked.
            session.flush()
        tag_objs.append(tag)
    shot.tags = tag_objs


# Number of processed files after which the session is committed during a scan.
_COMMIT_BATCH_SIZE = 50


def ingest_directory(
    session: Session,
    root: Path | str,
    recursive: bool = True,
) -> tuple[int, int]:
    """Walk a directory and ingest every supported file in it.

    Work is committed in batches to avoid one huge transaction, and once
    more at the end of the scan.

    Args:
        session: SQLAlchemy session; this function commits it.
        root: Directory to scan. A ``str`` is converted with
            ``path_from_storage`` (the original notes that UNC network
            paths are supported).
        recursive: Descend into sub-directories when true; otherwise only
            the direct children of ``root`` are considered.

    Returns:
        ``(added, skipped)``. ``(0, 0)`` when ``root`` is not an accessible
        directory. Unsupported files and non-file entries are not counted.
    """
    root_p = path_from_storage(str(root)) if isinstance(root, str) else root
    if not is_accessible_dir(root_p):
        return 0, 0

    candidates: Iterable[Path] = root_p.rglob("*") if recursive else root_p.iterdir()

    added, skipped = 0, 0
    since_commit = 0
    for path in candidates:
        if not path.is_file() or not is_supported(path):
            continue

        stored = path_to_storage(path)
        # Look up the path *before* ingesting to tell new rows from known ones.
        # NOTE(review): this is path-based only, so a file already known by
        # content hash but found at a new path is still counted as "added",
        # although ingest_path merely repoints the existing row.
        before = session.scalar(
            select(Screenshot.id).where(Screenshot.path == stored)
        )
        result = ingest_path(session, path)
        if result is not None and before is None:
            added += 1
        else:
            skipped += 1

        # Commit per batch of processed files. A dedicated counter is used so
        # that files ingest_path rejects (returns None) cannot make the scan
        # step over a batch boundary without committing.
        since_commit += 1
        if since_commit >= _COMMIT_BATCH_SIZE:
            session.commit()
            since_commit = 0

    session.commit()
    return added, skipped