"""PaddleOCR 本地 OCR(可选依赖)。""" from __future__ import annotations import asyncio from pathlib import Path from .base import OCRProvider class PaddleOCRProvider(OCRProvider): """通过 PaddleOCR 本地识文。需 pip install paddleocr paddlepaddle。""" name = "paddleocr" def __init__(self, lang: str = "ch") -> None: self.lang = lang self._engine = None async def recognize(self, image_path: Path) -> str: return await asyncio.to_thread(self._sync_recognize, image_path) def _sync_recognize(self, image_path: Path) -> str: try: from paddleocr import PaddleOCR # type: ignore except ImportError as exc: raise RuntimeError( "未安装 PaddleOCR,请执行: pip install paddleocr paddlepaddle" ) from exc if self._engine is None: self._engine = PaddleOCR(use_angle_cls=True, lang=self.lang, show_log=False) result = self._engine.ocr(str(image_path), cls=True) lines: list[str] = [] if result and result[0]: for line in result[0]: if line and len(line) >= 2: text_part = line[1] if isinstance(text_part, (list, tuple)) and text_part: lines.append(str(text_part[0])) elif isinstance(text_part, str): lines.append(text_part) return "\n".join(lines).strip()