# -*- coding: utf-8 -*-
# Using OCR, compare the number of characters extracted from a PDF before and
# after OCR. If (characters before OCR / characters after OCR) falls outside a
# given range (e.g. [0.5, 2]), the file can be judged to be a scanned PDF.
import os
import time
import traceback

import fitz
import requests
from PIL import Image
# Extract text with the fitz module, without OCR
def get_pdf_file_text(
        pdf_file_path: str,
        pdf_page_count: int
) -> str:
    """Return the embedded text of the leading pages of a PDF (no OCR).

    :param pdf_file_path: path of the PDF file to open
    :param pdf_page_count: maximum number of leading pages to read; pages
        beyond the end of the document are ignored
    :return: the text of all kept blocks, concatenated in page order
    """
    doc = fitz.open(pdf_file_path)
    try:
        whole_text_list = []
        # Clamp to the real page count so the document is never indexed
        # past its end.
        for i in range(min(pdf_page_count, doc.page_count)):
            # Per PyMuPDF's "blocks" layout, each record ends with the block
            # type (0 = text) and carries the block's text at index 4; only
            # records with a falsy last field are kept.
            whole_text_list.extend(
                record[4]
                for record in doc[i].get_text("blocks")
                if not record[-1]
            )
    finally:
        # Release the file handle even when text extraction raises.
        doc.close()
    return ''.join(whole_text_list)
# Convert a PDF file to images
def convert_pdf_2_img(
        pdf_file: str,
        pages: int
) -> list[str]:
    """
    Render the leading pages of a PDF to PNG files.

    Images are written to ``../output/<pdf name without extension>/<n>.png``,
    where ``n`` is the 1-based page number. The folder name is the file's
    base name with only its final extension removed, so ``a.v1.pdf`` and
    ``a.v2.pdf`` get separate folders.

    :param pdf_file: pdf file path
    :param pages: maximum number of leading pages to convert
    :return: list of the image file paths written, in page order
    """
    output_image_file_path_list = []
    pdf_document = fitz.open(pdf_file)
    try:
        # Clamp to the real page count; nothing to render means no output
        # directory is created either.
        page_total = min(pages, pdf_document.page_count)
        if page_total <= 0:
            return output_image_file_path_list

        # Loop-invariant: derive the output folder once.
        pdf_file_name = os.path.splitext(os.path.basename(pdf_file))[0]
        output_dir = f"../output/{pdf_file_name}"
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(output_dir, exist_ok=True)

        for page_number in range(page_total):
            # Render the page to a pixmap and wrap its raw samples in a
            # Pillow image so it can be saved as PNG.
            pix = pdf_document[page_number].get_pixmap()
            image = Image.frombytes(
                "RGB", (pix.width, pix.height), pix.samples
            )
            save_image_path = f"{output_dir}/{page_number + 1}.png"
            image.save(save_image_path)
            output_image_file_path_list.append(save_image_path)
    finally:
        # Close the PDF file even when rendering or saving fails.
        pdf_document.close()
    return output_image_file_path_list
# Extract text from images using OCR
def get_file_content(file_path: str) -> bytes:
    """Read a file in binary mode and return its entire content.

    :param file_path: path of the file to read
    :return: the raw bytes of the file
    """
    with open(file_path, 'rb') as fp:
        return fp.read()