"""Tesseract 本地 OCR 实现。""" from __future__ import annotations import asyncio from pathlib import Path from typing import Optional from .base import OCRProvider class TesseractOCR(OCRProvider): """通过 pytesseract 调用本地 tesseract。 需提前安装 tesseract-ocr 及中文语言包。 """ name = "tesseract" def __init__(self, lang: str = "chi_sim+eng", cmd: Optional[str] = None) -> None: self.lang = lang self.cmd = cmd async def recognize(self, image_path: Path) -> str: """异步包装:避免阻塞事件循环。""" return await asyncio.to_thread(self._sync_recognize, image_path) def _sync_recognize(self, image_path: Path) -> str: try: import pytesseract from PIL import Image except ImportError as exc: # pragma: no cover raise RuntimeError("未安装 pytesseract / Pillow") from exc if self.cmd: pytesseract.pytesseract.tesseract_cmd = self.cmd with Image.open(image_path) as img: text = pytesseract.image_to_string(img, lang=self.lang) return text.strip()