From 616bd5eb4871f55da847b23dd58a95ae1e33feb6 Mon Sep 17 00:00:00 2001 From: fengyunzaidushi Date: Sat, 29 Jul 2023 11:36:01 +0800 Subject: [PATCH] modify all(cell == '' for cell in col)] --- pdf2txt.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pdf2txt.py b/pdf2txt.py index 227194f..2b0d55c 100644 --- a/pdf2txt.py +++ b/pdf2txt.py @@ -3,7 +3,7 @@ import re from collections import defaultdict import json - +import os class PDFProcessor: def __init__(self, filepath): self.filepath = filepath @@ -54,7 +54,7 @@ def check_lines(self, page, top, buttom): def drop_empty_cols(self, data): # 删除所有列为空数据的列 transposed_data = list(map(list, zip(*data))) - filtered_data = [col for col in transposed_data if not all(cell is '' for cell in col)] + filtered_data = [col for col in transposed_data if not all(cell == '' for cell in col)] result = list(map(list, zip(*filtered_data))) return result @@ -177,6 +177,8 @@ def process_all_pdfs_in_folder(folder_path): try: processor = PDFProcessor(file_path) processor.process_pdf() + if not os.path.exists('alltxt'): + os.makedirs('alltxt') save_path = 'alltxt/' + file_path.split('/')[-1].replace('.pdf', '.txt') processor.save_all_text(save_path) except: