1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
| from aip import AipOcr import os from pdf2image import convert_from_path
def apiMessage(app_id, app_key, secret_key):
    """Create the OCR client used by the rest of this script.

    Args:
        app_id: Application id, passed as the first ``AipOcr`` argument.
        app_key: API key, passed as the second ``AipOcr`` argument.
        secret_key: Secret key, passed as the third ``AipOcr`` argument.

    Returns:
        The ``AipOcr`` instance built from the three credentials.
    """
    return AipOcr(app_id, app_key, secret_key)
def imgOcr(client, filename):
    """Send one image file to the OCR client and return the recognised lines.

    Args:
        client: Object exposing ``basicGeneral(image_bytes)`` that returns a
            mapping (the script passes an ``AipOcr`` instance).
        filename: Path of the image; it is read in binary mode.

    Returns:
        The value stored under ``'words_result'`` in the client's response.

    Raises:
        KeyError: If the response has no ``'words_result'`` entry. The
            message carries the file name and the full response so the
            reason for the failure is visible to the caller.
        OSError: If ``filename`` cannot be opened or read.
    """
    with open(filename, 'rb') as fp:
        image = fp.read()

    response = client.basicGeneral(image)
    try:
        return response['words_result']
    except KeyError:
        # Keep the exception type callers already see, but do not throw away
        # the response: it is the only clue to why recognition failed.
        raise KeyError(
            "OCR response for {!r} has no 'words_result': {!r}".format(
                filename, response
            )
        ) from None
def imgContent(res, resultpath):
    """Append recognised text to ``result.txt`` inside ``resultpath``.

    Args:
        res: Iterable of mappings; the ``'words'`` value of each one is
            written to the file.
        resultpath: Directory in which ``result.txt`` lives. The file is
            created if it does not exist yet.

    The file is opened in append mode with UTF-8 encoding, so repeated calls
    accumulate. Entries are written back to back with no separator between
    them.

    Raises:
        KeyError: If an entry in ``res`` has no ``'words'`` key.
        OSError: If the file cannot be opened for appending.
    """
    # Build the path with the platform separator; a hard-coded backslash only
    # yields a file inside ``resultpath`` on Windows.
    target = os.path.join(resultpath, 'result.txt')
    with open(target, 'a', encoding='utf-8') as f:
        f.writelines(entry['words'] for entry in res)
def pdfToimg(pdfFile, outputPath,
             poppler_path=r'C:\Users\admin\Documents\workspace\otherapi\poppler-0.68.0\bin'):
    """Render each page of a PDF to a PNG file in ``outputPath``.

    Pages are saved as ``page_1.png``, ``page_2.png``, ... in the order
    ``convert_from_path`` returns them.

    Args:
        pdfFile: Path of the PDF to convert.
        outputPath: Existing directory that receives the PNG files.
        poppler_path: Directory holding the poppler binaries, forwarded to
            ``convert_from_path``. Defaults to the location this script was
            originally written against; pass a different directory on other
            machines (pdf2image documents ``None`` as "look on PATH" --
            confirm against the installed version).
    """
    images = convert_from_path(pdfFile, poppler_path=poppler_path)
    for page_no, img in enumerate(images, start=1):
        # os.path.join keeps the output inside outputPath on every platform.
        img.save(os.path.join(outputPath, f'page_{page_no}.png'), 'PNG')
if __name__ == '__main__':
    # Placeholder credentials for the OCR service; replace before running.
    APP_ID = 'xx'
    API_KEY = 'xx'
    SECRET_KEY = 'x'

    # NOTE(review): pdfFile is never handed to pdfToimg in this script;
    # presumably the pages were rendered into outputPath beforehand -- confirm.
    pdfFile = r'C:\Users\A.pdf'
    outputPath = r'C:\Users'

    # The client does not depend on the file being processed, so build it once
    # instead of once per directory entry.
    client = apiMessage(APP_ID, API_KEY, SECRET_KEY)

    # Only regular image files are sent to OCR. Without this filter the loop
    # would also try sub-directories and the result.txt it is appending to.
    # NOTE(review): suffix list assumed from the OCR service's accepted
    # formats -- confirm.
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp')

    for i in os.listdir(outputPath):
        filenamei = os.path.join(outputPath, i)
        if not os.path.isfile(filenamei):
            continue
        if not i.lower().endswith(image_suffixes):
            continue
        content = imgOcr(client, filenamei)
        imgContent(content, outputPath)
        print('完成{}的处理'.format(i))
|