一、下载并安装 Tesseract-OCR(Windows 安装包见下方链接)
https://github.com/UB-Mannheim/tesseract/wiki
二、找到自己的安装路径,并在其中找到可执行文件 tesseract.exe
例如:D:\Tesseract-OCR
三、查看有哪些接口可以调用
from .pytesseract import ALTONotSupported
from .pytesseract import get_languages
from .pytesseract import get_tesseract_version
from .pytesseract import image_to_alto_xml
from .pytesseract import image_to_boxes
from .pytesseract import image_to_data
from .pytesseract import image_to_osd
from .pytesseract import image_to_pdf_or_hocr
from .pytesseract import image_to_string
from .pytesseract import Output
from .pytesseract import run_and_get_output
from .pytesseract import TesseractError
from .pytesseract import TesseractNotFoundError
from .pytesseract import TSVNotSupported
四、如果要识别中文的话
遇到的问题:识别中文会报错。
原因:缺少简体中文语言包(chi_sim.traineddata)
下载地址:https://gitcode.net/mirrors/tesseract-ocr/tessdata/-/blob/master/chi_sim.traineddata
将下载的 chi_sim.traineddata 放入 Tesseract 安装目录下的 tessdata 文件夹(例如 D:\Tesseract-OCR\tessdata)
然后在识别的函数里面设置lang如下:
print(pytesseract.image_to_string(image=img, lang='chi_sim'))
就可以识别中文了。
五、举例
(1)测试图片
(2)代码
# package
import cv2
import pytesseract
import numpy as np
from PIL import ImageGrab
import time
# Demo script: run Tesseract character detection on one image, draw a labelled
# rectangle around every detected character, and show the annotated image.

# Tesseract install directory: D:\Tesseract-OCR
pytesseract.pytesseract.tesseract_cmd = 'D:\\Tesseract-OCR\\tesseract.exe'

image_path = 'Source\\P1-image2.png'
img = cv2.imread(image_path)
# cv2.imread signals failure by returning None instead of raising, so check
# explicitly to get a meaningful error rather than a crash inside cvtColor.
if img is None:
    raise FileNotFoundError(f'cannot read image: {image_path}')

# OpenCV loads images as BGR, while pytesseract expects RGB. Convert into a
# separate array for OCR and keep `img` in BGR, because the OpenCV drawing and
# display calls below (rectangle / putText / imshow) interpret colours as BGR.
rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# Only the image height is needed, to flip the y axis further down.
height = img.shape[0]

# Detect characters. Pass lang='chi_sim' to recognise Simplified Chinese.
# NOTE: cv2.putText with a Hershey font cannot render non-ASCII glyphs, so
# Chinese labels would need a different text-drawing approach.
boxes = pytesseract.image_to_boxes(rgb)

box_colour = (50, 50, 255)
for line in boxes.splitlines():
    print(line)
    # Each line has the form "<char> <left> <bottom> <right> <top> <page>".
    fields = line.split(' ')
    print(fields)
    if len(fields) < 5:
        # Malformed or empty line: nothing to draw.
        continue
    char = fields[0]
    left, bottom, right, top = (int(value) for value in fields[1:5])
    # Tesseract box coordinates have their origin at the bottom-left corner,
    # OpenCV at the top-left, hence the `height - y` flips.
    cv2.rectangle(img, (left, height - bottom), (right, height - top), box_colour, 2)
    cv2.putText(img, char, (left, height - bottom + 25), cv2.FONT_HERSHEY_SIMPLEX, 1, box_colour, 2)

cv2.imshow('img', img)
cv2.waitKey(0)
(3)结果
文章链接
发表评论