Files
SnapAndAnaly/backend/app/providers/ocr_tesseract.py
T

40 lines
1.1 KiB
Python
Raw Normal View History

"""Tesseract 本地 OCR 实现。"""
from __future__ import annotations
import asyncio
from pathlib import Path
from typing import Optional
from .base import OCRProvider
class TesseractOCR(OCRProvider):
    """OCR provider that runs a locally installed tesseract via pytesseract.

    The tesseract-ocr binary and the Chinese language pack must be installed
    beforehand.
    """

    name = "tesseract"

    def __init__(self, lang: str = "chi_sim+eng", cmd: Optional[str] = None) -> None:
        """Remember the recognition settings; no OCR work happens here.

        Args:
            lang: Language string forwarded to pytesseract as ``lang``.
            cmd: When truthy, assigned to
                ``pytesseract.pytesseract.tesseract_cmd`` before recognizing.
        """
        self.cmd = cmd
        self.lang = lang

    async def recognize(self, image_path: Path) -> str:
        """Recognize text in the image without blocking the event loop.

        The synchronous OCR call is handed to ``asyncio.to_thread``.

        Args:
            image_path: Path of the image to run OCR on.

        Returns:
            The recognized text with surrounding whitespace stripped.
        """
        recognized = await asyncio.to_thread(self._sync_recognize, image_path)
        return recognized

    def _sync_recognize(self, image_path: Path) -> str:
        """Run tesseract on the image synchronously and return stripped text.

        Raises:
            RuntimeError: If pytesseract or Pillow cannot be imported.
        """
        # Imported lazily so a missing optional dependency only fails when
        # OCR is actually requested.
        try:
            import pytesseract
            from PIL import Image
        except ImportError as import_error:  # pragma: no cover
            raise RuntimeError("未安装 pytesseract / Pillow") from import_error

        tesseract_cmd = self.cmd
        if tesseract_cmd:
            # NOTE: this sets a module-level attribute of pytesseract, so it
            # is not scoped to this instance.
            pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

        with Image.open(image_path) as image:
            return pytesseract.image_to_string(image, lang=self.lang).strip()