返回

多线程PDF转Word,60行Python代码轻松搞定!

开发工具

工作中经常会遇到需要提取PDF文件中文字的情况,一个PDF还好,复制粘贴一下也花不了太多时间,如果需要把大量PDF转为Word,怎么办呢?

今天教大家用60行代码实现多线程批量PDF转Word。没兴趣看具体过程的话,可以直接拉到最后,那里有完整代码。

把PDF转为Word,分几步?

两步,第一步PDF转为图片,第二步图片转为Word。

第一步:PDF转为图片

import os
import PyPDF2

def pdf_to_image(pdf_file, output_dir):
    """
    Extract the images embedded in a PDF file and save them to a directory.

    PyPDF2 does not render pages to bitmaps, so only image objects embedded
    in each page (for example the scans in a scanned PDF) are saved. A page
    without embedded images produces no file.

    Files are named ``page_<page>_<index><ext>`` with zero-padded numbers so
    that a plain lexicographic sort keeps them in page order. ``<ext>`` is
    taken from the image name reported by PyPDF2 and falls back to ``.png``.

    NOTE(review): ``page.images`` exists only in newer PyPDF2 releases
    (2.11+ as far as I can tell) -- confirm against the installed version.

    :param pdf_file: path of the PDF file
    :param output_dir: directory that receives the images (created if missing)
    """
    os.makedirs(output_dir, exist_ok=True)

    # Page objects have no getImage() method; PdfReader / pages / images is
    # the API PyPDF2 documents for pulling embedded images out of a page.
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    for page_num, page_object in enumerate(pdf_reader.pages, start=1):
        for image_num, image in enumerate(page_object.images, start=1):
            extension = os.path.splitext(image.name)[1] or ".png"
            image_file = os.path.join(
                output_dir, f"page_{page_num:04d}_{image_num:02d}{extension}"
            )
            with open(image_file, "wb") as f:
                f.write(image.data)

第二步:图片转为Word

import os
import img2pdf

def image_to_word(image_dir, output_file):
    """
    Combine every image in a directory into one document, in page order.

    NOTE(review): img2pdf only produces PDF data, so the bytes written to
    ``output_file`` are a PDF even when the path ends in ``.docx``. Writing a
    real Word document needs a DOCX writer, which this file does not import.

    :param image_dir: directory containing the images
    :param output_file: path of the output file
    :raises ValueError: if ``image_dir`` contains no files
    """
    import re

    def natural_key(name):
        # re.split with a capturing group alternates text / digit runs, so
        # odd positions are always numbers. Comparing them as ints makes
        # "page_10" sort after "page_2".
        return [
            int(part) if index % 2 else part.lower()
            for index, part in enumerate(re.split(r"(\d+)", name))
        ]

    image_paths = [
        os.path.join(image_dir, name)
        for name in sorted(os.listdir(image_dir), key=natural_key)
        if os.path.isfile(os.path.join(image_dir, name))
    ]
    if not image_paths:
        raise ValueError(f"no images found in {image_dir!r}")

    # img2pdf.convert() takes all input images at once and returns the bytes
    # of a single document. Converting before opening the output avoids
    # leaving an empty file behind when the conversion fails.
    document = img2pdf.convert(image_paths)
    with open(output_file, "wb") as f:
        f.write(document)

多线程实现

import threading

def _run_threads(threads):
    # Start every thread, then block until all of them have finished.
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()


def multi_thread_pdf_to_word(pdf_files, output_dir):
    """
    Convert several PDF files concurrently, one thread per file and per step.

    Each PDF gets its own image directory ``<output_dir>/<pdf name>`` so that
    pages of different PDFs cannot overwrite each other. When all images are
    extracted, each non-empty image directory is converted to
    ``<output_dir>/<pdf name>.docx``.

    Only directories created for ``pdf_files`` are converted; unrelated
    sub-directories already present in ``output_dir`` are left alone.

    :param pdf_files: list of PDF file paths
    :param output_dir: output directory (created if missing)
    """
    os.makedirs(output_dir, exist_ok=True)

    # Phase 0: choose a distinct image directory for every PDF.
    jobs = []
    used_names = set()
    for pdf_file in pdf_files:
        stem = os.path.splitext(os.path.basename(pdf_file))[0] or "document"
        name, counter = stem, 1
        # Two inputs may share a file name (different folders); keep both.
        while name in used_names:
            counter += 1
            name = f"{stem}_{counter}"
        used_names.add(name)
        image_dir = os.path.join(output_dir, name)
        os.makedirs(image_dir, exist_ok=True)
        jobs.append((pdf_file, name, image_dir))

    # Phase 1: extract the images of every PDF in parallel.
    _run_threads([
        threading.Thread(target=pdf_to_image, args=(pdf_file, image_dir))
        for pdf_file, _, image_dir in jobs
    ])

    # Phase 2: build one document per PDF, skipping PDFs that yielded no image.
    _run_threads([
        threading.Thread(
            target=image_to_word,
            args=(image_dir, os.path.join(output_dir, f"{name}.docx")),
        )
        for _, name, image_dir in jobs
        if os.listdir(image_dir)
    ])

使用说明

  1. 将需要转换的PDF文件放在同一个目录下。
  2. 运行Python脚本,并指定PDF文件目录和输出目录。
  3. 等待转换完成,输出目录中会生成Word文件。

完整代码

import os
import PyPDF2
import img2pdf
import threading

def pdf_to_image(pdf_file, output_dir):
    """
    Extract the images embedded in a PDF file and save them to a directory.

    PyPDF2 does not render pages to bitmaps, so only image objects embedded
    in each page (for example the scans in a scanned PDF) are saved. A page
    without embedded images produces no file.

    Files are named ``page_<page>_<index><ext>`` with zero-padded numbers so
    that a plain lexicographic sort keeps them in page order. ``<ext>`` is
    taken from the image name reported by PyPDF2 and falls back to ``.png``.

    NOTE(review): ``page.images`` exists only in newer PyPDF2 releases
    (2.11+ as far as I can tell) -- confirm against the installed version.

    :param pdf_file: path of the PDF file
    :param output_dir: directory that receives the images (created if missing)
    """
    os.makedirs(output_dir, exist_ok=True)

    # Page objects have no getImage() method; PdfReader / pages / images is
    # the API PyPDF2 documents for pulling embedded images out of a page.
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    for page_num, page_object in enumerate(pdf_reader.pages, start=1):
        for image_num, image in enumerate(page_object.images, start=1):
            extension = os.path.splitext(image.name)[1] or ".png"
            image_file = os.path.join(
                output_dir, f"page_{page_num:04d}_{image_num:02d}{extension}"
            )
            with open(image_file, "wb") as f:
                f.write(image.data)

def image_to_word(image_dir, output_file):
    """
    Combine every image in a directory into one document, in page order.

    NOTE(review): img2pdf only produces PDF data, so the bytes written to
    ``output_file`` are a PDF even when the path ends in ``.docx``. Writing a
    real Word document needs a DOCX writer, which this file does not import.

    :param image_dir: directory containing the images
    :param output_file: path of the output file
    :raises ValueError: if ``image_dir`` contains no files
    """
    import re

    def natural_key(name):
        # re.split with a capturing group alternates text / digit runs, so
        # odd positions are always numbers. Comparing them as ints makes
        # "page_10" sort after "page_2".
        return [
            int(part) if index % 2 else part.lower()
            for index, part in enumerate(re.split(r"(\d+)", name))
        ]

    image_paths = [
        os.path.join(image_dir, name)
        for name in sorted(os.listdir(image_dir), key=natural_key)
        if os.path.isfile(os.path.join(image_dir, name))
    ]
    if not image_paths:
        raise ValueError(f"no images found in {image_dir!r}")

    # img2pdf.convert() takes all input images at once and returns the bytes
    # of a single document. Converting before opening the output avoids
    # leaving an empty file behind when the conversion fails.
    document = img2pdf.convert(image_paths)
    with open(output_file, "wb") as f:
        f.write(document)

def _run_threads(threads):
    # Start every thread, then block until all of them have finished.
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()


def multi_thread_pdf_to_word(pdf_files, output_dir):
    """
    Convert several PDF files concurrently, one thread per file and per step.

    Each PDF gets its own image directory ``<output_dir>/<pdf name>`` so that
    pages of different PDFs cannot overwrite each other. When all images are
    extracted, each non-empty image directory is converted to
    ``<output_dir>/<pdf name>.docx``.

    Only directories created for ``pdf_files`` are converted; unrelated
    sub-directories already present in ``output_dir`` are left alone.

    :param pdf_files: list of PDF file paths
    :param output_dir: output directory (created if missing)
    """
    os.makedirs(output_dir, exist_ok=True)

    # Phase 0: choose a distinct image directory for every PDF.
    jobs = []
    used_names = set()
    for pdf_file in pdf_files:
        stem = os.path.splitext(os.path.basename(pdf_file))[0] or "document"
        name, counter = stem, 1
        # Two inputs may share a file name (different folders); keep both.
        while name in used_names:
            counter += 1
            name = f"{stem}_{counter}"
        used_names.add(name)
        image_dir = os.path.join(output_dir, name)
        os.makedirs(image_dir, exist_ok=True)
        jobs.append((pdf_file, name, image_dir))

    # Phase 1: extract the images of every PDF in parallel.
    _run_threads([
        threading.Thread(target=pdf_to_image, args=(pdf_file, image_dir))
        for pdf_file, _, image_dir in jobs
    ])

    # Phase 2: build one document per PDF, skipping PDFs that yielded no image.
    _run_threads([
        threading.Thread(
            target=image_to_word,
            args=(image_dir, os.path.join(output_dir, f"{name}.docx")),
        )
        for _, name, image_dir in jobs
        if os.listdir(image_dir)
    ])

if __name__ == "__main__":
    import sys

    # Usage: python <script> [PDF_DIR [OUTPUT_DIR]]
    # PDF_DIR defaults to the current directory, OUTPUT_DIR to "output".
    pdf_dir = sys.argv[1] if len(sys.argv) > 1 else "."
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "output"

    # Convert every PDF found directly inside PDF_DIR, in a stable order.
    pdf_files = sorted(
        os.path.join(pdf_dir, name)
        for name in os.listdir(pdf_dir)
        if name.lower().endswith(".pdf")
    )
    if not pdf_files:
        sys.exit(f"no PDF files found in {pdf_dir!r}")

    multi_thread_pdf_to_word(pdf_files, output_dir)