
| import os from PyPDF2 import PdfReader, PdfWriter import re import sqlite3 import shutil from loguru import logger import datetime
def productQuery(bank_acct):
    """Look up the product a bank account belongs to and build its folder name.

    Queries the ``trust_product_bank`` table for rows whose ``bank_acct``
    equals *bank_acct*. When several rows match, the last one returned wins.

    Args:
        bank_acct: Bank account number to look up.

    Returns:
        ``'<product_code>-<product_jc>'`` when a row with a non-empty product
        code is found, otherwise the fallback name ``'保障基金'``.
    """
    logger.info('----查询项目编号及简称:{}'.format(bank_acct))
    productCode = ''
    productName = ''
    conn = sqlite3.connect(r'\\**********.db')
    try:
        c = conn.cursor()
        querySql = c.execute(
            "select bank_acct,product_code,product_jc from trust_product_bank where bank_acct=?",
            (bank_acct,),
        )
        for row in querySql:
            # NULL columns come back as None; treat them as empty so the
            # fallback below applies instead of a TypeError.
            productCode = row[1] or ''
            productName = row[2] or ''
    finally:
        # Always release the database handle, even when the query fails.
        conn.close()
    if not productCode:
        return '保障基金'
    return productCode + '-' + productName
def pdf_split(pdf_in, pdf_out, list):
    """Copy the selected pages of one PDF into another file.

    Args:
        pdf_in: Path of the PDF to read pages from.
        pdf_out: Path of the file the selected pages are written to.
        list: Iterable of page indexes into the source PDF's ``pages``.
    """
    writer = PdfWriter()
    with open(pdf_in, 'rb') as source_stream:
        reader = PdfReader(source_stream)
        for page_index in list:
            writer.add_page(reader.pages[page_index])
        # Written while the source stream is still open; the writer may still
        # need to read page data from it -- TODO confirm.
        # NOTE(review): 'ab' appends to an existing output file instead of
        # replacing it; presumably meant to accumulate pages, but each write
        # emits a whole document -- confirm the result is what is wanted.
        with open(pdf_out, 'ab') as target_stream:
            writer.write(target_stream)
def getPdfList(filepath, filemonth):
    """List the PDF file names found directly inside a month folder.

    Args:
        filepath: Parent directory that contains the month folders.
        filemonth: Name of the month folder (for example ``'202301'``).

    Returns:
        File names (not full paths) in ``filepath/filemonth`` whose extension
        is ``.pdf`` in any letter case, in the order ``os.listdir`` yields
        them. An empty list when the folder is missing or is not a directory.
    """
    # os.path.join picks the platform separator instead of a hard-coded '\'.
    month_dir = os.path.join(filepath, filemonth)
    try:
        entries = os.listdir(month_dir)
    except (FileNotFoundError, NotADirectoryError):
        print('未找到对应的目录')
        return []
    # Compare the extension case-insensitively so '.PDF' files are not skipped.
    return [name for name in entries
            if os.path.splitext(name)[1].lower() == '.pdf']
def pdf_reader(pdf_in):
    """Return the number of pages in the PDF at *pdf_in*."""
    return len(PdfReader(pdf_in).pages)
def pdf_content(pdf_in, page):
    """Return the text extracted from one page of a PDF.

    Args:
        pdf_in: Path (or stream) handed to ``PdfReader``.
        page: Index of the page within the document's ``pages``.

    Returns:
        Whatever ``extract_text()`` yields for that page.
    """
    reader = PdfReader(pdf_in)
    return reader.pages[page].extract_text()
# (receipt-type keyword, regex for the account number used for that type).
# Keywords are tried in this order and the first one found in the page wins.
_RECEIPT_RULES = (
    (r'存款利息收入回单', r'(?<=收款账号:)[0-9]{1,30}'),
    (r'入账回单', r'(?<=收款账号:)[0-9]{1,30}'),
    (r'出账回单', r'(?<=付款账号:)[0-9]{1,30}'),
    (r'定期收费回单', r'(?<=扣费账号:)[0-9]{1,30}'),
    (r'协议利率利息收入回单', r'(?<=收款账号:)[0-9]{1,30}'),
    (r'对公业务收费回单', r'(?<=客户账号:)[0-9]{1,30}'),
)


def parse_receipt(contents):
    """Identify the receipt type of a page and extract its account number.

    Args:
        contents: Page text with spaces and newlines already removed.

    Returns:
        ``(receipt_type, account)``. ``(None, None)`` when no known receipt
        keyword occurs in the text, and ``(receipt_type, None)`` when the
        keyword is present but the matching account label/number is not.
    """
    for keyword, account_pattern in _RECEIPT_RULES:
        if keyword in contents:
            found = re.search(account_pattern, contents)
            return keyword, (found.group(0) if found else None)
    return None, None


def _split_by_product(month_dir, filename):
    """Split one receipt PDF into per-product PDFs, then archive the original.

    Args:
        month_dir: Folder holding the source PDF; product sub-folders and the
            ``000原始文件`` archive folder are created inside it.
        filename: Name of the source PDF inside *month_dir*.
    """
    pdf_in = os.path.join(month_dir, filename)

    logger.info('----读取文件内容')
    # account number -> real page indexes. Keeping the page index next to the
    # account keeps them aligned even when some pages are skipped.
    pages_by_account = {}
    for page_no in range(pdf_reader(pdf_in)):
        contents = pdf_content(pdf_in, page_no).replace(' ', '').replace('\n', '')
        if not contents:
            continue
        receipt_type, account = parse_receipt(contents)
        if receipt_type is None:
            logger.warning('----第{}页未识别到回单类型,已跳过'.format(page_no + 1))
            continue
        if account is None:
            logger.warning('----第{}页({})未提取到银行账号,已跳过'.format(page_no + 1, receipt_type))
            continue
        pages_by_account.setdefault(account, []).append(page_no)
    logger.info('----完成收付款类型、银行账号提取')

    logger.info('----去重银行账号')
    # Accounts that resolve to the same product share one output file, so
    # their pages are merged here and written with a single pdf_split call.
    pages_by_product = {}
    for account, page_numbers in pages_by_account.items():
        logger.info('----根据银行账号查询项目,并检查子文件夹')
        product_dirname = productQuery(account)
        pages_by_product.setdefault(product_dirname, []).extend(page_numbers)

    for product_dirname, page_numbers in pages_by_product.items():
        product_dir = os.path.join(month_dir, product_dirname)
        os.makedirs(product_dir, exist_ok=True)
        logger.info('----按银行账号提取pdf页面')
        # NOTE(review): filename[-13:-5] is presumably a date embedded in the
        # source file name -- confirm against the bank's naming scheme.
        pdf_out = os.path.join(
            product_dir, '{}-{}.pdf'.format(filename[-13:-5], product_dirname))
        # Sorted so merged pages keep the order they had in the source file.
        pdf_split(pdf_in, pdf_out, sorted(page_numbers))
        logger.info('----拆分pdf完成')

    # The archive folder must exist before the move; otherwise shutil.move
    # would rename the PDF to the folder's name instead of moving it inside.
    archive_dir = os.path.join(month_dir, '000原始文件')
    os.makedirs(archive_dir, exist_ok=True)
    shutil.move(pdf_in, archive_dir)
    logger.info('----将已处理pdf移至已处理文件夹')


if __name__ == '__main__':
    filepath = input('请输入文件夹路径(所在月份文件夹父目录):').strip()
    filemonth = input('请输入处理月份(格式:202301):').strip()
    FileLog = os.path.join(filepath, 'Pdflog{}.log'.format(datetime.date.today()))
    logger.add(FileLog, rotation="500MB", encoding="utf-8", enqueue=True)

    logger.info('----获取{}下的pdf清单'.format(filemonth))
    pdflist = getPdfList(filepath, filemonth)
    logger.info('----获取pdf清单:{}'.format(str(pdflist)))

    month_dir = os.path.join(filepath, filemonth)
    for i in pdflist:
        logger.info('----开始处理文件:{}'.format(i))
        _split_by_product(month_dir, i)
        logger.info('----{}文件处理完成'.format(i))
    logger.info('----{}处理结束'.format(filemonth))
|