5c028d7952
包含 FastAPI 后端、React 前端、队列/OCR/标签/待办等完整功能。 Co-authored-by: Cursor <cursoragent@cursor.com>
40 lines
1.1 KiB
Python
40 lines
1.1 KiB
Python
"""Tesseract 本地 OCR 实现。"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
from .base import OCRProvider
|
|
|
|
|
|
class TesseractOCR(OCRProvider):
    """OCR provider that drives a locally installed tesseract via pytesseract.

    The tesseract-ocr binary and the Chinese language pack must be installed
    beforehand.
    """

    name = "tesseract"

    def __init__(self, lang: str = "chi_sim+eng", cmd: Optional[str] = None) -> None:
        """Remember the recognition settings.

        Args:
            lang: Language string passed to pytesseract as ``lang``.
            cmd: When truthy, assigned to
                ``pytesseract.pytesseract.tesseract_cmd`` before recognition.
        """
        self.lang = lang
        self.cmd = cmd

    async def recognize(self, image_path: Path) -> str:
        """Async wrapper: run the blocking OCR call off the event loop.

        Args:
            image_path: Path of the image to recognize.

        Returns:
            The recognized text with surrounding whitespace stripped.
        """
        recognized = await asyncio.to_thread(self._sync_recognize, image_path)
        return recognized

    def _sync_recognize(self, image_path: Path) -> str:
        """Open the image and run pytesseract on it synchronously.

        Args:
            image_path: Path of the image to recognize.

        Returns:
            The recognized text with surrounding whitespace stripped.

        Raises:
            RuntimeError: If pytesseract or Pillow cannot be imported.
        """
        # Imported lazily so that a missing optional dependency is reported
        # only when recognition is actually attempted.
        try:
            import pytesseract
            from PIL import Image
        except ImportError as exc:  # pragma: no cover
            raise RuntimeError("未安装 pytesseract / Pillow") from exc

        # NOTE: this writes a module attribute of pytesseract, so the value
        # is shared by every user of that module in the process.
        if self.cmd:
            pytesseract.pytesseract.tesseract_cmd = self.cmd

        with Image.open(image_path) as image:
            raw_text = pytesseract.image_to_string(image, lang=self.lang)
            return raw_text.strip()
|