1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
| import os from PyPDF2 import PdfReader, PdfWriter import re import sqlite3 import shutil from loguru import logger import datetime
def productQuery(bank_acct):
    """Return the folder label for the product that owns ``bank_acct``.

    Looks the account up in the ``trust_product_bank`` table and returns
    ``'<product_code>-<product_jc>'``.  If several rows match, the values of
    the last row returned by the query are used.

    Args:
        bank_acct: Bank account number to look up.

    Returns:
        ``'<product_code>-<product_jc>'`` for a matching row, or ``'保障基金'``
        when no row matches or the matched product code is empty/NULL.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    logger.info('----查询项目编号及简称:{}'.format(bank_acct))
    productCode = ''
    productName = ''
    conn = sqlite3.connect(r'\\**********.db')
    try:
        c = conn.cursor()
        querySql = c.execute("select bank_acct,product_code,product_jc from trust_product_bank where bank_acct=?",(bank_acct,))
        for row in querySql:
            productCode = row[1]
            productName = row[2]
    finally:
        # Release the connection even when the query raises; this function is
        # called once per account, so a leak here would accumulate.
        conn.close()
    # Truthiness also covers a NULL product_code, which len() would reject.
    if not productCode:
        return '保障基金'
    return productCode + '-' + productName
def pdf_split(pdf_in, pdf_out, list):
    """Copy selected pages of ``pdf_in`` into ``pdf_out``.

    If ``pdf_out`` already exists and is not empty, its pages are kept and the
    newly selected pages are added after them.  The previous implementation
    opened the output in append mode, which writes a second, separate PDF
    document behind the first one instead of adding pages to it.

    Args:
        pdf_in: Path of the source PDF.
        pdf_out: Path of the PDF to create or extend.
        list: Indexes into the source's ``pages`` sequence, in output order.
            The name shadows the builtin and is kept only so existing callers
            keep working.
    """
    import io  # local import: only this function needs in-memory buffers

    page_indexes = list
    output = PdfWriter()

    # Load any existing output fully into memory so the file on disk can be
    # rewritten afterwards without being read and written at the same time.
    existing_bytes = b''
    if os.path.isfile(pdf_out):
        with open(pdf_out, 'rb') as existing_pdf:
            existing_bytes = existing_pdf.read()
    if existing_bytes:
        for existing_page in PdfReader(io.BytesIO(existing_bytes)).pages:
            output.add_page(existing_page)

    serialized = io.BytesIO()
    with open(pdf_in, 'rb') as in_pdf:
        pdf_file = PdfReader(in_pdf)
        for i in page_indexes:
            output.add_page(pdf_file.pages[i])
        # Serialise while the source file is still open.
        output.write(serialized)

    # Touch the output file only after serialisation succeeded, so a failure
    # above leaves an existing output unchanged.
    with open(pdf_out, 'wb') as out_pdf:
        out_pdf.write(serialized.getvalue())
def getPdfList(filepath, filemonth):
    """List the ``.pdf`` file names found directly in the month folder.

    The folder is ``<filepath>\\<filemonth>``.  Only names whose extension is
    exactly ``.pdf`` are returned; the match is case-sensitive.

    Args:
        filepath: Parent directory that contains the month folders.
        filemonth: Name of the month folder.

    Returns:
        A list of file names (not full paths).  The list is empty when the
        folder does not exist, in which case a notice is printed.
    """
    month_dir = r'{}\{}'.format(filepath, filemonth)
    try:
        entries = os.listdir(month_dir)
    except FileNotFoundError:
        print('未找到对应的目录')
        return []
    return [name for name in entries if os.path.splitext(name)[1] == r'.pdf']
def pdf_reader(pdf_in):
    """Return the number of pages in the PDF given by ``pdf_in``."""
    return len(PdfReader(pdf_in).pages)
def pdf_content(pdf_in, page):
    """Return the text extracted from one page of a PDF.

    Args:
        pdf_in: The PDF to read, passed straight to ``PdfReader``.
        page: Index into the reader's ``pages`` sequence.

    Returns:
        Whatever ``extract_text()`` yields for the selected page.
    """
    reader = PdfReader(pdf_in)
    selected_page = reader.pages[page]
    return selected_page.extract_text()
# Receipt titles in the order they are tried, each paired with the regex that
# extracts the account number used to file that kind of receipt.
_RECEIPT_RULES = (
    (r'存款利息收入回单', r'(?<=收款账号:)[0-9]{1,30}'),
    (r'入账回单', r'(?<=收款账号:)[0-9]{1,30}'),
    (r'出账回单', r'(?<=付款账号:)[0-9]{1,30}'),
    (r'定期收费回单', r'(?<=扣费账号:)[0-9]{1,30}'),
    (r'协议利率利息收入回单', r'(?<=收款账号:)[0-9]{1,30}'),
    (r'对公业务收费回单', r'(?<=客户账号:)[0-9]{1,30}'),
)


def _find_receipt_account(contents):
    """Classify one page of text and extract its filing account.

    Args:
        contents: Page text with spaces and newlines already removed.

    Returns:
        ``(title, account)``.  ``title`` is the first receipt title from
        ``_RECEIPT_RULES`` found in the text, or ``None`` if none is present.
        ``account`` is the first match of that title's account regex, or
        ``None`` if the regex does not match.
    """
    for title, account_pattern in _RECEIPT_RULES:
        if re.search(title, contents) is None:
            continue
        account_match = re.search(account_pattern, contents)
        return title, (account_match.group(0) if account_match else None)
    return None, None


if __name__ == '__main__':
    filepath = input('请输入文件夹路径(所在月份文件夹父目录):')
    filemonth = input('请输入处理月份(格式:202301):')
    # Raw string: '\P' is not a valid escape sequence; the resulting path is
    # the same as before.
    FileLog = r'{}\Pdflog{}.log'.format(filepath, datetime.date.today())
    logger.add(FileLog, rotation="500MB", encoding="utf-8", enqueue=True)

    month_dir = r'{}\{}'.format(filepath, filemonth)
    archive_dir = r'{}\000原始文件'.format(month_dir)

    logger.info('----获取{}下的pdf清单'.format(filemonth))
    pdflist = getPdfList(filepath, filemonth)
    logger.info('----获取pdf清单:{}'.format(str(pdflist)))

    for pdf_name in pdflist:
        logger.info('----开始处理文件:{}'.format(pdf_name))
        pdf_in = r'{}\{}'.format(month_dir, pdf_name)

        logger.info('----读取文件内容')
        # Account number -> real page indexes.  Recording the page index
        # itself keeps the mapping correct when blank or unrecognised pages
        # are skipped.
        pages_by_account = {}
        # Parse the PDF once per file instead of once per page.
        source_pdf = PdfReader(pdf_in)
        for page_index, pdf_page in enumerate(source_pdf.pages):
            contents = pdf_page.extract_text().replace(' ', '').replace('\n', '')
            if not contents:
                continue
            title, account = _find_receipt_account(contents)
            if title is None:
                logger.warning('----第{}页未识别到回单类型,已跳过'.format(page_index + 1))
                continue
            if account is None:
                logger.warning('----第{}页({})未提取到银行账号,已跳过'.format(page_index + 1, title))
                continue
            pages_by_account.setdefault(account, []).append(page_index)
        logger.info('----完成收付款类型、银行账号提取')

        logger.info('----去重银行账号')
        # Several accounts can resolve to the same product folder and hence
        # the same output file, so collect pages per product and write each
        # output file once.
        pages_by_product = {}
        for account, account_pages in pages_by_account.items():
            logger.info('----根据银行账号查询项目,并检查子文件夹')
            productDirname = productQuery(account)
            pages_by_product.setdefault(productDirname, []).extend(account_pages)

        for productDirname, product_pages in pages_by_product.items():
            product_dir = r'{}\{}'.format(month_dir, productDirname)
            if not os.path.exists(product_dir):
                os.mkdir(product_dir)
            logger.info('----按银行账号提取pdf页面')
            pdf_out = r'{}\{}'.format(
                product_dir,
                '{}-{}.pdf'.format(pdf_name[-13:-5], productDirname),
            )
            pdf_split(pdf_in, pdf_out, sorted(product_pages))
            logger.info('----拆分pdf完成')

        # Create the archive folder before moving; if it is missing,
        # shutil.move would rename the PDF to that path instead.
        if not os.path.exists(archive_dir):
            os.mkdir(archive_dir)
        shutil.move(pdf_in, archive_dir)
        logger.info('----将已处理pdf移至已处理文件夹')
        logger.info('----{}文件处理完成'.format(pdf_name))
    logger.info('----{}处理结束'.format(filemonth))
|