Backend Development 8 min read
Python PDF Manipulation Guide: Merging, Splitting, Encrypting, Decrypting, Text Extraction, Adding Text, Watermarking, Page Removal, Rotation, and HTML‑to‑PDF Conversion
This tutorial demonstrates how to use Python libraries such as PyPDF2, ReportLab, and WeasyPrint to merge, split, encrypt, decrypt, extract text from, add text to, watermark, delete, rotate PDF pages, and convert HTML files into PDFs, providing complete code examples for each operation.
Test Development Learning Exchange
Test Development Learning Exchange
Merge multiple PDF files
from PyPDF2 import PdfMerger
def merge_pdfs(paths, output):
    """Concatenate several PDF files into one document.

    Args:
        paths: Iterable of input PDFs; each item is handed to
            ``PdfMerger.append`` in iteration order.
        output: Destination handed to ``PdfMerger.write``.
    """
    merger = PdfMerger()
    try:
        for pdf in paths:
            merger.append(pdf)
        merger.write(output)
    finally:
        # Always release the merger, even when an input is unreadable or
        # the write fails; otherwise its handles stay open.
        merger.close()
paths = ['/path/to/file1.pdf', '/path/to/file2.pdf']
output = '/path/to/merged.pdf'
merge_pdfs(paths, output)

Split a PDF file
from PyPDF2 import PdfReader, PdfWriter
def split_pdf(input_path, start_page, end_page, output_path):
    """Copy an inclusive, 1-based page range of a PDF into a new file.

    Args:
        input_path: Path of the PDF to read.
        start_page: First page to copy (1-based).
        end_page: Last page to copy (1-based, inclusive).
        output_path: Path the extracted pages are written to.

    Raises:
        ValueError: If ``start_page`` is smaller than 1.
        IndexError: If ``end_page`` lies beyond the last page.

    An inverted range (``start_page > end_page``) selects no pages and
    still writes an output file containing none.
    """
    if start_page < 1:
        # start_page - 1 would become a negative index and silently copy
        # pages counted from the END of the document.
        raise ValueError(f"start_page must be >= 1, got {start_page}")

    with open(input_path, 'rb') as file:
        pdf = PdfReader(file)
        total_pages = len(pdf.pages)
        if start_page <= end_page and end_page > total_pages:
            # Fail before anything is written, with a message that names
            # the actual limits.
            raise IndexError(
                f"end_page {end_page} exceeds page count {total_pages}"
            )

        writer = PdfWriter()
        for i in range(start_page - 1, end_page):
            writer.add_page(pdf.pages[i])

        # Write while the input is still open: the writer is given pages
        # that belong to the reader.
        with open(output_path, 'wb') as output:
            writer.write(output)
input_path = '/path/to/input.pdf'
start_page = 1
end_page = 5
output_path = '/path/to/output.pdf'
split_pdf(input_path, start_page, end_page, output_path)

Encrypt a PDF file
from PyPDF2 import PdfWriter, PdfReader
def encrypt_pdf(input_path, output_path, password):
    """Write a password-protected copy of a PDF.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path the encrypted copy is written to.
        password: Password handed to ``PdfWriter.encrypt``.
    """
    with open(input_path, 'rb') as source:
        reader = PdfReader(source)
        protected = PdfWriter()

        # Copy every page across before switching encryption on.
        for src_page in reader.pages:
            protected.add_page(src_page)
        protected.encrypt(password)

        with open(output_path, 'wb') as sink:
            protected.write(sink)
input_path = '/path/to/input.pdf'
output_path = '/path/to/encrypted.pdf'
password = 'mysecretpassword'
encrypt_pdf(input_path, output_path, password)

Decrypt a PDF file
from PyPDF2 import PdfReader, PdfWriter
def decrypt_pdf(input_path, output_path, password):
    """Write an unprotected copy of an encrypted PDF.

    Args:
        input_path: Path of the encrypted PDF.
        output_path: Path the decrypted copy is written to.
        password: Password tried against the document.

    Raises:
        ValueError: If the document is not encrypted, or ``decrypt``
            reports 0 for the supplied password.
    """
    with open(input_path, 'rb') as source:
        reader = PdfReader(source)

        # decrypt() is only attempted on documents flagged as encrypted.
        unlocked = reader.is_encrypted and reader.decrypt(password) != 0
        if not unlocked:
            raise ValueError("Password incorrect or file not encrypted.")

        plain = PdfWriter()
        for src_page in reader.pages:
            plain.add_page(src_page)

        with open(output_path, 'wb') as sink:
            plain.write(sink)
input_path = '/path/to/encrypted.pdf'
output_path = '/path/to/decrypted.pdf'
password = 'mysecretpassword'
decrypt_pdf(input_path, output_path, password)

Extract text from a PDF
from PyPDF2 import PdfReader
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        One string holding each page's extracted text back to back, with
        no separator inserted between pages.
    """
    with open(pdf_path, 'rb') as file:
        pdf = PdfReader(file)
        # join() builds the result in one pass instead of repeated +=.
        # ``or ''`` keeps a page that yields no text (should
        # extract_text() ever return None) from raising TypeError.
        return ''.join(page.extract_text() or '' for page in pdf.pages)
pdf_path = '/path/to/input.pdf'
text = extract_text_from_pdf(pdf_path)
print(text)

Add text to a PDF page (using ReportLab)
import io
from PyPDF2 import PdfReader, PdfWriter
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
def add_text_to_pdf(pdf_path, output_path, text, x, y):
    """Stamp a line of text onto the first page of a PDF.

    The text is drawn on an in-memory, letter-sized ReportLab canvas and
    merged onto page 1. All remaining pages are copied through unchanged,
    so the output has as many pages as the input.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path the stamped copy is written to.
        text: String to draw.
        x: Horizontal position handed to ``Canvas.drawString``.
        y: Vertical position handed to ``Canvas.drawString``.

    Raises:
        IndexError: If the input document has no pages.
    """
    # Build the one-page overlay entirely in memory.
    packet = io.BytesIO()
    can = canvas.Canvas(packet, pagesize=letter)
    can.drawString(x, y, text)
    can.save()
    packet.seek(0)
    overlay_page = PdfReader(packet).pages[0]

    # A context manager guarantees the input handle is closed; the
    # original left the open() result dangling.
    with open(pdf_path, "rb") as in_stream:
        existing_pdf = PdfReader(in_stream)
        output = PdfWriter()

        first_page = existing_pdf.pages[0]
        first_page.merge_page(overlay_page)
        output.add_page(first_page)

        # Keep the rest of the document; previously every page after
        # the first was silently dropped from the output.
        for index, page in enumerate(existing_pdf.pages):
            if index > 0:
                output.add_page(page)

        with open(output_path, "wb") as out_stream:
            output.write(out_stream)
pdf_path = '/path/to/input.pdf'
output_path = '/path/to/output_with_text.pdf'
text = 'This is some text.'
x = 50
y = 750
add_text_to_pdf(pdf_path, output_path, text, x, y)

Add a watermark to a PDF (using ReportLab)
from PyPDF2 import PdfReader, PdfWriter
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
def add_watermark(input_pdf, output_pdf, watermark_text):
    """Overlay a rotated, semi-transparent text watermark on every page.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path the watermarked copy is written to.
        watermark_text: Text drawn as the watermark.
    """
    # Local import so this function is self-contained.
    import io

    # Render the watermark into memory rather than a fixed
    # "/tmp/watermark.pdf": that path is not portable, is predictable,
    # and collides when two runs overlap.
    buffer = io.BytesIO()
    watermark = canvas.Canvas(buffer, pagesize=letter)
    watermark.saveState()
    watermark.setFont("Helvetica", 80)
    watermark.rotate(45)
    watermark.setFillAlpha(0.1)
    watermark.drawString(150, 300, watermark_text)
    watermark.restoreState()
    watermark.save()
    buffer.seek(0)
    watermark_page = PdfReader(buffer).pages[0]

    # The context manager closes the input handle the original leaked.
    with open(input_pdf, "rb") as in_stream:
        input_reader = PdfReader(in_stream)
        output_writer = PdfWriter()
        for page in input_reader.pages:
            page.merge_page(watermark_page)
            output_writer.add_page(page)

        with open(output_pdf, "wb") as out_stream:
            output_writer.write(out_stream)
input_pdf = '/path/to/input.pdf'
output_pdf = '/path/to/output_with_watermark.pdf'
watermark_text = 'Confidential'
add_watermark(input_pdf, output_pdf, watermark_text)

Remove specific pages from a PDF
from PyPDF2 import PdfReader, PdfWriter
def remove_pages(input_pdf, output_pdf, page_numbers):
    """Write a copy of a PDF with the listed pages left out.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path the reduced copy is written to.
        page_numbers: Iterable of 1-based page numbers to drop. Numbers
            that match no page are ignored.
    """
    # Build the lookup once: constant-time membership per page, and safe
    # for one-shot iterables.
    excluded = set(page_numbers)

    # The context manager closes the input handle the original leaked.
    with open(input_pdf, "rb") as in_stream:
        reader = PdfReader(in_stream)
        writer = PdfWriter()
        for number, page in enumerate(reader.pages, start=1):
            if number not in excluded:
                writer.add_page(page)

        with open(output_pdf, "wb") as out_stream:
            writer.write(out_stream)
input_pdf = '/path/to/input.pdf'
output_pdf = '/path/to/output_without_pages.pdf'
page_numbers = [1, 3] # pages to remove
remove_pages(input_pdf, output_pdf, page_numbers)

Rotate selected PDF pages
from PyPDF2 import PdfReader, PdfWriter
def rotate_pages(input_pdf, output_pdf, page_numbers, angle):
    """Write a copy of a PDF with the selected pages rotated.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path the rotated copy is written to.
        page_numbers: Iterable of 1-based page numbers to rotate; every
            other page is copied unchanged.
        angle: Value handed to ``page.rotate``. The accompanying example
            uses 90, 180 or 270; presumably it must be a multiple of 90
            (confirm against the PyPDF2 documentation).
    """
    # Build the lookup once: constant-time membership per page, and safe
    # for one-shot iterables.
    selected = set(page_numbers)

    # The context manager closes the input handle the original leaked.
    with open(input_pdf, "rb") as in_stream:
        reader = PdfReader(in_stream)
        writer = PdfWriter()
        for number, page in enumerate(reader.pages, start=1):
            if number in selected:
                page.rotate(angle)
            writer.add_page(page)

        with open(output_pdf, "wb") as out_stream:
            writer.write(out_stream)
input_pdf = '/path/to/input.pdf'
output_pdf = '/path/to/output_rotated.pdf'
page_numbers = [2, 4] # pages to rotate
angle = 90 # rotation angle (90, 180, 270)
rotate_pages(input_pdf, output_pdf, page_numbers, angle)

Convert HTML to PDF using WeasyPrint
from weasyprint import HTML
def html_to_pdf(html_path, output_path):
    """Render an HTML source to a PDF file with WeasyPrint.

    Args:
        html_path: Source handed to ``weasyprint.HTML`` as its first
            positional argument.
        output_path: Target handed to ``write_pdf``.
    """
    document = HTML(html_path)
    document.write_pdf(output_path)
html_path = '/path/to/input.html'
output_path = '/path/to/output.pdf'
html_to_pdf(html_path, output_path)

Written by
Test Development Learning Exchange
Test Development Learning Exchange
0 followers
Reader feedback
How this landed with the community
Rate this article
Was this worth your time?
Discussion
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.