Files
SnapAndAnaly/backend/app/providers/ocr_tesseract.py
T
congsh 5c028d7952 Initial commit: snapAna 截图智能整理工具
包含 FastAPI 后端、React 前端、队列/OCR/标签/待办等完整功能。

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-27 15:45:50 +08:00

40 lines
1.1 KiB
Python

"""Tesseract 本地 OCR 实现。"""
from __future__ import annotations
import asyncio
from pathlib import Path
from typing import Optional
from .base import OCRProvider
class TesseractOCR(OCRProvider):
    """OCR provider that calls a locally installed tesseract through pytesseract.

    The tesseract-ocr binary and its Chinese language pack must be installed
    beforehand.
    """

    name = "tesseract"

    def __init__(self, lang: str = "chi_sim+eng", cmd: Optional[str] = None) -> None:
        """Remember the OCR settings used by later recognition calls.

        Args:
            lang: Forwarded as ``lang`` to ``pytesseract.image_to_string``.
            cmd: When truthy, assigned to
                ``pytesseract.pytesseract.tesseract_cmd`` before each
                recognition; otherwise that setting is left alone.
        """
        self.lang = lang
        self.cmd = cmd

    async def recognize(self, image_path: Path) -> str:
        """Async wrapper: run the OCR off the event loop so it is not blocked.

        Args:
            image_path: Image location handed to ``_sync_recognize``.

        Returns:
            Whatever ``_sync_recognize`` returns for ``image_path``.
        """
        recognized = await asyncio.to_thread(self._sync_recognize, image_path)
        return recognized

    def _sync_recognize(self, image_path: Path) -> str:
        """Open the image and return the stripped text reported by pytesseract.

        Args:
            image_path: Image location passed to ``PIL.Image.open``.

        Returns:
            The output of ``pytesseract.image_to_string`` with leading and
            trailing whitespace removed.

        Raises:
            RuntimeError: If pytesseract or Pillow cannot be imported.
        """
        # Imported lazily so a missing optional dependency only surfaces when
        # OCR is actually requested.
        try:
            import pytesseract
            from PIL import Image
        except ImportError as err:  # pragma: no cover
            raise RuntimeError("未安装 pytesseract / Pillow") from err

        configured_cmd = self.cmd
        if configured_cmd:
            # Module-level pytesseract setting; written on every call.
            pytesseract.pytesseract.tesseract_cmd = configured_cmd

        with Image.open(image_path) as image:
            raw_text = pytesseract.image_to_string(image, lang=self.lang)
        return raw_text.strip()