2019年10月11日 星期五

[Python] OCR文字辨識

Install:
pip install pytesseract
pip install pillow

Tesseract-OCR (Windows installer; also install the chi_sim language data, which the script below uses):
https://github.com/UB-Mannheim/tesseract/wiki



Edit:
C:\Python27\Lib\site-packages\pytesseract\pytesseract.py

Modify tesseract_cmd so that it points to the installed tesseract.exe:
tesseract_cmd = 'F:/Program Files/Tesseract-OCR/tesseract.exe'

python script:
#-*-coding:utf-8-*-

from PIL import Image,ImageEnhance
import pytesseract
import time


def pic_to_word(filepath,filename,resize_num,b):
    """
    Run Tesseract OCR on an image file and return the recognised text.

    The image is enlarged, converted to greyscale and contrast-enhanced
    before being handed to ``pytesseract.image_to_string`` with the
    ``chi_sim`` language.

    :param filepath: directory prefix of the image; it is concatenated
        directly with ``filename``, so it must end with a path separator
    :param filename: image file name
    :param resize_num: scale factor applied to both width and height
        (converted with ``int``)
    :param b: contrast factor passed to ``ImageEnhance.Contrast.enhance``
    :return: the recognised text, or an empty string if any step failed
        (the error message is printed)
    """
    # Bind the result up front so the error path returns '' instead of
    # reaching ``return`` with an unbound local.
    content = ''
    try:
        time1 = time.time()
        scale = int(resize_num)
        # The context manager closes the underlying file handle; resize()
        # returns an independent image, so it stays usable afterwards.
        with Image.open(str(filepath)+str(filename)) as source:
            enlarged = source.resize((source.width * scale, source.height * scale))
        # Convert to 8-bit greyscale ('L' mode).
        grey = enlarged.convert('L')
        # Boost the contrast before recognition.
        enhanced = ImageEnhance.Contrast(grey).enhance(b)
        content = pytesseract.image_to_string(enhanced, lang='chi_sim')
        time2 = time.time()
        print('total time%s s' % (time2 - time1))
    except Exception as e:
        # Best-effort: report the problem and fall through to return ''.
        print("{0}".format(str(e)))

    return content


if __name__ == '__main__':
    # Sample run: OCR one image from the desktop at 2x scale and 2.0 contrast.
    filepath, filename = "C:/Users/lc_lee/Desktop/", "a2.png"
    resize_num, b = 2, 2.0
    content = pic_to_word(filepath, filename, resize_num, b)
    # Emit the recognised text as UTF-8 encoded bytes.
    encoded = content.encode(encoding="utf-8", errors="strict")
    print(encoded)
    # Alternative for a cp950 (Traditional Chinese) console:
    # print(content.encode("utf8").decode("cp950", "ignore"))


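# Run `chcp 65001` in the Windows console first to switch its code page to UTF-8, so the output displays correctly.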

• https://www.itread01.com/content/1547721909.html
• https://blog.csdn.net/wwj_748/article/details/78109680

沒有留言:

張貼留言