# -*- coding: utf-8 -*- # @file: preprocess.py # PDF文档预处理 import os import json import time
import fitz from PIL import Image
from src.config.config import PROJECT_DIR
# Extract text with the fitz (PyMuPDF) module; no OCR is involved.
def get_pdf_file_text(
        pdf_file_path: str
) -> dict[int, str]:
    """Extract the embedded text of every page of a PDF and save it as JSON.

    The per-page text is also written to
    ``<PROJECT_DIR>/output/<pdf file name>/original_text.json``; the
    directory is created when it does not exist yet.

    :param pdf_file_path: path of the PDF file to read
    :return: mapping of zero-based page index to the text found on that page
    """
    page_result = {}
    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_file_path) as doc:
        for i in range(doc.page_count):
            # In a "blocks" record, item 4 is the block text and the last
            # item is the block type; only records with a falsy type are kept
            # (per the PyMuPDF docs, 0 = text and 1 = image).
            page_result[i] = "".join(
                record[4]
                for record in doc[i].get_text("blocks")
                if not record[-1]
            )

    # Save the extraction result to a JSON file.
    pdf_file_name = pdf_file_path.split('/')[-1].split(".")[0]
    json_output_path = os.path.join(
        PROJECT_DIR, f"output/{pdf_file_name}/original_text.json"
    )
    # Nothing in this function guarantees the output directory exists, so
    # create it instead of failing with FileNotFoundError on open().
    os.makedirs(os.path.dirname(json_output_path), exist_ok=True)
    with open(json_output_path, "w", encoding="utf-8") as f:
        json.dump(page_result, f, ensure_ascii=False, indent=4)
    return page_result
# Convert a PDF file into one image per page.
def convert_pdf_2_img(
        pdf_file: str
) -> list[str]:
    """Render every page of a PDF to a PNG file.

    Images are written to
    ``<PROJECT_DIR>/output/<pdf file name>/<page index>.png``. The output
    directory is created when missing (also for a PDF without pages).

    :param pdf_file: pdf file path
    :return: output image file paths, in page order
    """
    output_image_file_path_list = []
    # The context manager closes the PDF even if rendering or saving raises.
    with fitz.open(pdf_file) as pdf_document:
        # The output location is the same for every page: resolve it once.
        pdf_file_name = pdf_file.split('/')[-1].split(".")[0]
        output_dir = os.path.join(PROJECT_DIR, f"output/{pdf_file_name}")
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(output_dir, exist_ok=True)

        for page_number in range(pdf_document.page_count):
            # Rasterise the page and wrap the raw samples in a Pillow image.
            pix = pdf_document[page_number].get_pixmap()
            image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            save_image_path = os.path.join(output_dir, f"{page_number}.png")
            image.save(save_image_path)
            output_image_file_path_list.append(save_image_path)
    return output_image_file_path_list
# -*- coding: utf-8 -*- # @file: image_ocr.py # 使用PaddleOCR提取图片中的文字 import json import os from paddleocr import PaddleOCR
from src.config.config import PROJECT_DIR
def get_pdf_file_ocr_result(pdf_file_dir_path: str) -> dict[int, str]:
    """Run PaddleOCR over the page images in a directory, collecting text per page.

    :param pdf_file_dir_path: directory holding page images named
        ``<page number>.png``
    :return: mapping of page number to the concatenated text recognised on
        that page
    """
    # The language used by PaddleOCR can be switched through the `lang`
    # parameter, e.g. `ch`, `en`, `fr`, `german`, `korean`, `japan`.
    ocr = PaddleOCR(use_angle_cls=False, lang="ch")
    page_ocr_result = {}
    files = [file for file in os.listdir(pdf_file_dir_path) if file.endswith(".png")]
    # Sort by numeric value so that page 10 comes after page 9, not after 1.
    files.sort(key=lambda x: int(x.split(".")[0]))
    for file in files:
        page_no = int(file.split(".")[0])
        img_path = os.path.join(pdf_file_dir_path, file)
        result = ocr.ocr(img_path, cls=False)
        fragments = []
        # Guard against an empty/None result as well as empty entries, which
        # would otherwise break the iteration.
        for res in result or []:
            if not res:
                continue
            for line in res:
                # line[1][0] is read as the recognised text of the line.
                fragments.append(line[1][0])
                print(f"page: {page_no}, text: {line[1][0]}")
        page_ocr_result[page_no] = "".join(fragments)
    # Return the mapping promised by the signature (it was silently dropped).
    return page_ocr_result
def text_preprocess(text: str) -> str:
    """Remove every line-feed character from the text.

    :param text: original text
    :return: the text with all line feeds removed
    """
    # Splitting on the line feed and re-joining drops exactly those characters.
    return "".join(text.split("\n"))
def get_sentences(text: str) -> list[str]:
    """Split text into sentences using the ``zh`` language setting.

    :param text: original text
    :return: sentences list
    """
    # Materialise the result so callers really receive the list promised by
    # the annotation (supporting len(), indexing and repeated iteration).
    # NOTE(review): `segment` is imported outside this view; presumably it is
    # sentencex's, which yields sentences lazily - confirm.
    return list(segment("zh", text))
def find_similar_sentence(sent: str, candidate_sentences: list[str]) -> str:
    """Find the candidate sentence that matches ``sent``.

    An exact match anywhere in the candidates wins. Otherwise the first
    candidate of the same length whose character-set Jaccard similarity with
    ``sent`` exceeds 0.8 is returned.

    :param sent: sentence
    :param candidate_sentences: candidate sentences
    :return: the matching candidate, or ``""`` when none qualifies
    """
    # Build the character set once instead of once per candidate.
    set_sent = set(sent)
    first_similar = ""
    for candidate_sent in candidate_sentences:
        if sent == candidate_sent:
            # An exact match must not be shadowed by an earlier near match.
            return candidate_sent
        if first_similar or len(sent) != len(candidate_sent):
            continue
        # Jaccard similarity of the two sentences' character sets. The union
        # cannot be empty here: two empty strings are caught as equal above.
        set_candidate_sent = set(candidate_sent)
        jaccard_sim = len(set_sent & set_candidate_sent) / len(set_sent | set_candidate_sent)
        if jaccard_sim > 0.8:
            first_similar = candidate_sent
    return first_similar
# Build the text-correction corpus: for every OCR entry whose key also exists
# in the original-text mapping, generate corpus items from the text pair.
final_corpus_list = []
for key, value in ocr_result_dict.items():
    my_ocr_text = value
    if key not in original_text_dict:
        # No original text to compare against for this key.
        continue
    my_original_text = original_text_dict[key]
    my_corpus_list = get_corpus(my_original_text, my_ocr_text)
    final_corpus_list += my_corpus_list