返回
多线程PDF转Word,60行Python代码轻松搞定!
开发工具
2023-10-20 23:34:16
工作中经常会遇到需要提取PDF文件中文字的情况,一个PDF还好,复制粘贴一下也花不了太多时间,如果需要把大量PDF转为Word,怎么办呢?
今天教大家用60行代码实现多线程批量PDF转Word。没兴趣看具体过程的读者可以直接拉到最后,那里有完整代码。
把PDF转为Word,分几步?
两步,第一步PDF转为图片,第二步图片转为Word。
第一步:PDF转为图片
import os
import PyPDF2
def pdf_to_image(pdf_file, output_dir):
    """
    Extract the images embedded in a PDF and save them into a directory.

    PyPDF2 does not render pages; it only exposes image objects that are
    already embedded in each page. Pages without embedded images produce
    no files.

    :param pdf_file: path of the PDF file to read
    :param output_dir: directory that receives the images (created if missing)
    """
    os.makedirs(output_dir, exist_ok=True)
    # PdfFileReader / numPages / getPage were removed in PyPDF2 3.0 and page
    # objects never had a getImage() method; PdfReader, .pages and .images
    # are the PyPDF2 3.x API for the same job.
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    for page_num, page_object in enumerate(pdf_reader.pages, start=1):
        for image_num, image in enumerate(page_object.images, start=1):
            # Keep the extension PyPDF2 reports instead of forcing ".png":
            # the embedded data is not necessarily PNG.
            extension = os.path.splitext(image.name)[1] or ".png"
            # Zero-padded numbers keep plain lexicographic sorting in page
            # order (page_0002 before page_0010).
            image_file = os.path.join(
                output_dir,
                f"page_{page_num:04d}_{image_num:02d}{extension}",
            )
            with open(image_file, "wb") as f:
                f.write(image.data)
第二步:图片转为Word
import os
import img2pdf
def image_to_word(image_dir, output_file):
    """
    Combine the images of a directory into one output document.

    NOTE: img2pdf produces PDF data. The bytes written to ``output_file``
    are a PDF whatever extension the path carries; img2pdf cannot create a
    real .docx file.

    :param image_dir: directory containing the page images
    :param output_file: path of the file to write
    """
    import re

    def natural_key(name):
        # re.split with a capture group puts the digit runs at odd indexes;
        # comparing them as ints sorts "page_2" before "page_10".
        parts = re.split(r"(\d+)", name)
        return [
            int(part) if index % 2 else part.lower()
            for index, part in enumerate(parts)
        ]

    image_paths = [
        os.path.join(image_dir, name)
        for name in sorted(os.listdir(image_dir), key=natural_key)
        if os.path.isfile(os.path.join(image_dir, name))
    ]
    if not image_paths:
        # Nothing to convert; do not leave an empty output file behind.
        return

    # convert() treats its positional arguments as input images and returns
    # the resulting PDF bytes, so hand over all images at once and write the
    # result ourselves instead of passing the output handle as an argument.
    pdf_bytes = img2pdf.convert(image_paths)
    with open(output_file, "wb") as f:
        f.write(pdf_bytes)
多线程实现
import threading
def multi_thread_pdf_to_word(pdf_files, output_dir):
    """
    Convert several PDFs concurrently, one thread per job in each phase.

    Phase 1 extracts the images of every PDF into its own sub-directory of
    ``output_dir`` (named after the PDF file). Phase 2 converts every
    sub-directory of ``output_dir`` into ``<sub-directory>.docx`` next to it.

    :param pdf_files: list of PDF file paths
    :param output_dir: output directory (created if missing)
    """
    def run_all(jobs):
        # Start every worker before joining any of them so the jobs overlap.
        workers = [
            threading.Thread(target=target, args=args) for target, args in jobs
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

    os.makedirs(output_dir, exist_ok=True)

    # Give each PDF a private image directory. Writing every PDF's pages
    # straight into output_dir makes them overwrite each other's page files
    # and leaves phase 2 without any sub-directory to convert.
    image_dirs = []
    for pdf_file in pdf_files:
        stem = os.path.splitext(os.path.basename(pdf_file))[0]
        image_dir = os.path.join(output_dir, stem)
        suffix = 1
        # Two inputs may share a file name (a/x.pdf and b/x.pdf).
        while image_dir in image_dirs:
            suffix += 1
            image_dir = os.path.join(output_dir, f"{stem}_{suffix}")
        os.makedirs(image_dir, exist_ok=True)
        image_dirs.append(image_dir)

    run_all(
        [
            (pdf_to_image, (pdf_file, image_dir))
            for pdf_file, image_dir in zip(pdf_files, image_dirs)
        ]
    )

    # Phase 2 scans output_dir, so it picks up the directories created above
    # as well as any image directory that was already there.
    word_jobs = []
    for subdir in os.listdir(output_dir):
        subdir_path = os.path.join(output_dir, subdir)
        if os.path.isdir(subdir_path):
            output_file = os.path.join(output_dir, f"{subdir}.docx")
            word_jobs.append((image_to_word, (subdir_path, output_file)))
    run_all(word_jobs)
使用说明
- 将需要转换的PDF文件放在同一个目录下。
- 运行python脚本,指定PDF文件目录和输出目录。
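- 等待转换完成,输出目录中会生成以 .docx 命名的结果文件。注意:示例使用的 img2pdf 库只能把图片打包成PDF格式的数据,并不能生成真正可编辑的Word文档;如果需要可编辑的文字,还需要借助OCR和专门的Word写入库(例如 python-docx)。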
完整代码
import os
import PyPDF2
import img2pdf
import threading
def pdf_to_image(pdf_file, output_dir):
    """
    Extract the images embedded in a PDF and save them into a directory.

    PyPDF2 does not render pages; it only exposes image objects that are
    already embedded in each page. Pages without embedded images produce
    no files.

    :param pdf_file: path of the PDF file to read
    :param output_dir: directory that receives the images (created if missing)
    """
    os.makedirs(output_dir, exist_ok=True)
    # PdfFileReader / numPages / getPage were removed in PyPDF2 3.0 and page
    # objects never had a getImage() method; PdfReader, .pages and .images
    # are the PyPDF2 3.x API for the same job.
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    for page_num, page_object in enumerate(pdf_reader.pages, start=1):
        for image_num, image in enumerate(page_object.images, start=1):
            # Keep the extension PyPDF2 reports instead of forcing ".png":
            # the embedded data is not necessarily PNG.
            extension = os.path.splitext(image.name)[1] or ".png"
            # Zero-padded numbers keep plain lexicographic sorting in page
            # order (page_0002 before page_0010).
            image_file = os.path.join(
                output_dir,
                f"page_{page_num:04d}_{image_num:02d}{extension}",
            )
            with open(image_file, "wb") as f:
                f.write(image.data)
def image_to_word(image_dir, output_file):
    """
    Combine the images of a directory into one output document.

    NOTE: img2pdf produces PDF data. The bytes written to ``output_file``
    are a PDF whatever extension the path carries; img2pdf cannot create a
    real .docx file.

    :param image_dir: directory containing the page images
    :param output_file: path of the file to write
    """
    import re

    def natural_key(name):
        # re.split with a capture group puts the digit runs at odd indexes;
        # comparing them as ints sorts "page_2" before "page_10".
        parts = re.split(r"(\d+)", name)
        return [
            int(part) if index % 2 else part.lower()
            for index, part in enumerate(parts)
        ]

    image_paths = [
        os.path.join(image_dir, name)
        for name in sorted(os.listdir(image_dir), key=natural_key)
        if os.path.isfile(os.path.join(image_dir, name))
    ]
    if not image_paths:
        # Nothing to convert; do not leave an empty output file behind.
        return

    # convert() treats its positional arguments as input images and returns
    # the resulting PDF bytes, so hand over all images at once and write the
    # result ourselves instead of passing the output handle as an argument.
    pdf_bytes = img2pdf.convert(image_paths)
    with open(output_file, "wb") as f:
        f.write(pdf_bytes)
def multi_thread_pdf_to_word(pdf_files, output_dir):
    """
    Convert several PDFs concurrently, one thread per job in each phase.

    Phase 1 extracts the images of every PDF into its own sub-directory of
    ``output_dir`` (named after the PDF file). Phase 2 converts every
    sub-directory of ``output_dir`` into ``<sub-directory>.docx`` next to it.

    :param pdf_files: list of PDF file paths
    :param output_dir: output directory (created if missing)
    """
    def run_all(jobs):
        # Start every worker before joining any of them so the jobs overlap.
        workers = [
            threading.Thread(target=target, args=args) for target, args in jobs
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

    os.makedirs(output_dir, exist_ok=True)

    # Give each PDF a private image directory. Writing every PDF's pages
    # straight into output_dir makes them overwrite each other's page files
    # and leaves phase 2 without any sub-directory to convert.
    image_dirs = []
    for pdf_file in pdf_files:
        stem = os.path.splitext(os.path.basename(pdf_file))[0]
        image_dir = os.path.join(output_dir, stem)
        suffix = 1
        # Two inputs may share a file name (a/x.pdf and b/x.pdf).
        while image_dir in image_dirs:
            suffix += 1
            image_dir = os.path.join(output_dir, f"{stem}_{suffix}")
        os.makedirs(image_dir, exist_ok=True)
        image_dirs.append(image_dir)

    run_all(
        [
            (pdf_to_image, (pdf_file, image_dir))
            for pdf_file, image_dir in zip(pdf_files, image_dirs)
        ]
    )

    # Phase 2 scans output_dir, so it picks up the directories created above
    # as well as any image directory that was already there.
    word_jobs = []
    for subdir in os.listdir(output_dir):
        subdir_path = os.path.join(output_dir, subdir)
        if os.path.isdir(subdir_path):
            output_file = os.path.join(output_dir, f"{subdir}.docx")
            word_jobs.append((image_to_word, (subdir_path, output_file)))
    run_all(word_jobs)
if __name__ == "__main__":
    import sys

    # Usage: python <script> [PDF_DIR [OUTPUT_DIR]]
    # The usage notes ask for a PDF directory and an output directory, so
    # accept both on the command line. Without arguments the script falls
    # back to the sample file names and the "output" directory.
    if len(sys.argv) > 1:
        pdf_dir = sys.argv[1]
        pdf_files = sorted(
            os.path.join(pdf_dir, name)
            for name in os.listdir(pdf_dir)
            if name.lower().endswith(".pdf")
        )
    else:
        pdf_files = ["file1.pdf", "file2.pdf", "file3.pdf"]
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "output"
    multi_thread_pdf_to_word(pdf_files, output_dir)