Artificial Intelligence 5 min read

Splitting PDF Files and Recognizing MP3 Audio with Python

This guide explains how to split a PDF into separate files using PyPDF2 and presents two Python approaches for converting MP3 audio to text: one uses the Google Speech Recognition API for higher accuracy, and the other uses PocketSphinx to obtain a full transcription. Ready‑to‑run code examples are included for each.

Test Development Learning Exchange
Test Development Learning Exchange
Test Development Learning Exchange
Splitting PDF Files and Recognizing MP3 Audio with Python

In this tutorial we demonstrate how to programmatically split a PDF document into separate files—extracting the first page as a cover and grouping subsequent pages in pairs—using the PyPDF2 library.

We then show two approaches for processing an accompanying MP3 audio file: first converting the MP3 to WAV and using the Google Speech‑Recognition API for accurate but potentially incomplete transcription, and second using PocketSphinx for a full transcription that may be less accurate.

Both methods are implemented in Python, with complete code examples provided below.

from PyPDF2 import PdfReader, PdfWriter

def _write_pages(reader, page_indices, output_path):
    """Write the pages of ``reader`` at ``page_indices`` (0-based) to ``output_path``."""
    writer = PdfWriter()
    for page_index in page_indices:
        writer.add_page(reader.pages[page_index])
    with open(output_path, 'wb') as output_file:
        writer.write(output_file)


def split_pdf(input_path, output_folder):
    """Split a PDF into a cover file plus files of up to two pages each.

    The first page is written to ``<output_folder>/1.pdf``. The remaining
    pages are written in consecutive pairs named ``<first>-<last>.pdf`` using
    1-based page numbers (for example ``2-3.pdf``). A trailing unpaired page
    is written on its own with the same number twice (for example
    ``4-4.pdf``).

    Args:
        input_path: Path of the PDF file to split.
        output_folder: Folder that receives the generated PDF files. It is
            created if it does not exist yet.

    Returns:
        None. A PDF without pages produces no output files.
    """
    # Imported locally so the function does not rely on an ``import os``
    # placed elsewhere in the file.
    import os

    with open(input_path, 'rb') as file:
        reader = PdfReader(file)
        total_pages = len(reader.pages)

        # Guard: indexing pages[0] on an empty document would raise IndexError.
        if total_pages == 0:
            return

        # Create the destination only once we know there is something to
        # write; an empty string means "no folder component", so skip it.
        if output_folder:
            os.makedirs(output_folder, exist_ok=True)

        # The first page becomes its own PDF (the cover).
        _write_pages(reader, [0], f"{output_folder}/1.pdf")

        # The remaining pages are grouped two at a time.
        for start_page in range(1, total_pages, 2):
            # Clamp so that an unpaired last page forms a one-page group.
            end_page = min(start_page + 1, total_pages - 1)
            output_path = f"{output_folder}/{start_page + 1}-{end_page + 1}.pdf"
            _write_pages(reader, range(start_page, end_page + 1), output_path)

# Example configuration: the PDF to split and the folder that receives the parts.
input_pdf_path = "XXX.pdf"
output_folder = "/output_folder/pdf"

# Run the split only when this file is executed as a script.
if __name__ == "__main__":
    split_pdf(input_pdf_path, output_folder)
import os
import speech_recognition as sr
from pydub import AudioSegment

def mp3_to_wav(input_path, output_path):
    """Convert the MP3 file at ``input_path`` into a WAV file at ``output_path``."""
    # Read the MP3 and export it in WAV format in one chained call.
    AudioSegment.from_mp3(input_path).export(output_path, format="wav")

def recognize_speech(input_wav_path, output_txt_path):
    """Transcribe a WAV file with Google Speech Recognition and append the text to a file.

    The audio is recognized as US English. If recognition fails, the
    corresponding failure message is appended to the file instead of a
    transcript.

    Args:
        input_wav_path: Path of the WAV file to transcribe.
        output_txt_path: Path of the UTF-8 text file the result is appended to.
    """
    engine = sr.Recognizer()

    # Read the audio data from the WAV file.
    with sr.AudioFile(input_wav_path) as wav_source:
        recorded = engine.record(wav_source)

    # Run recognition; on failure the message below takes the transcript's place.
    try:
        transcript = engine.recognize_google(recorded, language="en-US")
    except sr.UnknownValueError:
        transcript = "Speech Recognition could not understand the audio."
    except sr.RequestError as err:
        transcript = f"Could not request results from Google Speech Recognition service; {err}"

    # Append mode: earlier content of the output file is kept.
    with open(output_txt_path, "a", encoding="utf-8") as sink:
        sink.write(transcript)

if __name__ == "__main__":
    # Source audio, the intermediate WAV file, and the transcript destination.
    input_mp3 = "XXX.mp3"
    output_wav = "/output_folder/output1.wav"
    output_txt = "/output_folder/output1.txt"

    # Convert first, then transcribe the converted file.
    mp3_to_wav(input_mp3, output_wav)
    recognize_speech(output_wav, output_txt)
import speech_recognition as sr
from pydub import AudioSegment

# Input MP3 file and the text file that receives the transcription.
input_file = "XXX.mp3"
output_file = "/output_folder/output.txt"
# Intermediate WAV file; named once so the export and the read below
# always refer to the same path.
temp_wav_file = "/output_folder/temp.wav"

# Convert the MP3 file to WAV format.
audio = AudioSegment.from_mp3(input_file)
audio.export(temp_wav_file, format="wav")

# Create the speech recognizer object.
recognizer = sr.Recognizer()

# Recognize the audio with PocketSphinx.
with sr.AudioFile(temp_wav_file) as source:
    audio_data = recognizer.record(source)
    text = recognizer.recognize_sphinx(audio_data)

# Write the recognized text to the output file, one line per row.
# The encoding is explicit so the result does not depend on the
# platform's default locale encoding.
lines = text.split("\n")
with open(output_file, "w", encoding="utf-8") as file:
    file.writelines(line + "\n" for line in lines)

print("转换完成!")
Automation · PDF · Speech Recognition · PyPDF2 · pydub
Test Development Learning Exchange
Written by

Test Development Learning Exchange

Test Development Learning Exchange

0 followers
Reader feedback

How this landed with the community

login Sign in to like

Rate this article

Was this worth your time?

Sign in to rate
Discussion

0 Comments

Thoughtful readers leave field notes, pushback, and hard-won operational detail here.