Backend Development 7 min read
Python PDF Operations: Merging, Splitting, Encryption, Decryption, Text Extraction, Watermarking, Page Removal, Rotation, and HTML-to-PDF Conversion
This guide demonstrates how to use Python libraries such as PyPDF2, ReportLab, and WeasyPrint to merge, split, encrypt, decrypt, extract text, add watermarks, remove or rotate pages, and convert HTML files into PDFs, providing complete code examples for each operation.
Test Development Learning Exchange
Test Development Learning Exchange
Merge Multiple PDF Files
from PyPDF2 import PdfMerger
def merge_pdfs(paths, output):
    """Concatenate several PDF files into one.

    Args:
        paths: Iterable of PDF paths; they are appended in iteration order.
        output: Path (or writable binary stream) the merged PDF is written to.
    """
    merger = PdfMerger()
    try:
        for pdf in paths:
            merger.append(pdf)
        merger.write(output)
    finally:
        # Always release the merger's resources, even when an input is
        # missing or unreadable and append()/write() raises.
        merger.close()
# Example inputs: the PDFs to combine (in this order) and the merged result path.
paths = ['/path/to/file1.pdf', '/path/to/file2.pdf']
output = '/path/to/merged.pdf'
merge_pdfs(paths, output)

Split PDF Files
from PyPDF2 import PdfReader, PdfWriter
def split_pdf(input_path, start_page, end_page, output_path):
    """Copy a contiguous page range of a PDF into a new file.

    Args:
        input_path: Path of the PDF to read.
        start_page: First page to keep, 1-based.
        end_page: Last page to keep, 1-based and inclusive.
        output_path: Path the extracted pages are written to.

    Raises:
        ValueError: If ``start_page`` is less than 1.
    """
    if start_page < 1:
        # A start below 1 becomes a negative index, which would silently
        # pull pages from the END of the document instead of failing.
        raise ValueError("start_page must be >= 1 (pages are 1-based).")

    with open(input_path, 'rb') as file:
        pdf = PdfReader(file)
        writer = PdfWriter()
        for i in range(start_page - 1, end_page):
            writer.add_page(pdf.pages[i])
        # Write while the source file is still open.
        with open(output_path, 'wb') as output:
            writer.write(output)
# Example inputs: source PDF, the page range to keep, and the destination path.
input_path = '/path/to/input.pdf'
start_page = 1
end_page = 5
output_path = '/path/to/output.pdf'
split_pdf(input_path, start_page, end_page, output_path)

Encrypt PDF Files
from PyPDF2 import PdfWriter, PdfReader
def encrypt_pdf(input_path, output_path, password):
    """Write a password-protected copy of a PDF.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path the encrypted PDF is written to.
        password: Password handed to the writer's ``encrypt`` call.
    """
    with open(input_path, 'rb') as source:
        reader = PdfReader(source)
        protected = PdfWriter()
        # Copy every page across, then switch encryption on before writing.
        for source_page in reader.pages:
            protected.add_page(source_page)
        protected.encrypt(password)
        with open(output_path, 'wb') as target:
            protected.write(target)
# Example inputs: PDF to protect, destination path, and the password to set.
input_path = '/path/to/input.pdf'
output_path = '/path/to/encrypted.pdf'
password = 'mysecretpassword'
encrypt_pdf(input_path, output_path, password)

Decrypt PDF Files
from PyPDF2 import PdfReader, PdfWriter
def decrypt_pdf(input_path, output_path, password):
    """Write an unprotected copy of an encrypted PDF.

    Args:
        input_path: Path of the encrypted PDF.
        output_path: Path the decrypted PDF is written to.
        password: Password used to unlock the document.

    Raises:
        ValueError: If the file is not encrypted, or ``decrypt`` returns 0
            for the given password.
    """
    failure = "Password incorrect or file not encrypted."
    with open(input_path, 'rb') as source:
        reader = PdfReader(source)
        # Guard clauses: both failure modes share one error message.
        if not reader.is_encrypted:
            raise ValueError(failure)
        if reader.decrypt(password) == 0:
            raise ValueError(failure)

        plain = PdfWriter()
        for unlocked_page in reader.pages:
            plain.add_page(unlocked_page)
        with open(output_path, 'wb') as target:
            plain.write(target)
# Example inputs: encrypted PDF, destination for the decrypted copy, and its password.
input_path = '/path/to/encrypted.pdf'
output_path = '/path/to/decrypted.pdf'
password = 'mysecretpassword'
decrypt_pdf(input_path, output_path, password)

Extract Text from PDF
from PyPDF2 import PdfReader
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        One string holding each page's extracted text, with no separator
        inserted between pages.
    """
    with open(pdf_path, 'rb') as source:
        reader = PdfReader(source)
        # Consume the generator while the file is still open.
        return ''.join(page.extract_text() for page in reader.pages)
# Example: read all text from the PDF at this path.
pdf_path = '/path/to/input.pdf'
text = extract_text_from_pdf(pdf_path)
print(text)

Add Text to a PDF Page
from PyPDF2 import PdfWriter, PdfReader
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import io
def add_text_to_pdf(pdf_path, output_path, text, x, y):
    """Stamp a line of text onto the first page of a PDF.

    The text is drawn on an in-memory, letter-sized overlay page, which is
    then merged onto page 1 of the source document.

    NOTE(review): only the first page is written to ``output_path``; any
    further pages of the source are not copied. Confirm this is intended.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path the stamped page is written to.
        text: String to draw.
        x: Horizontal position passed to the canvas ``drawString`` call.
        y: Vertical position passed to the canvas ``drawString`` call.
    """
    # Build the one-page overlay entirely in memory.
    packet = io.BytesIO()
    can = canvas.Canvas(packet, pagesize=letter)
    can.drawString(x, y, text)
    can.save()
    packet.seek(0)  # rewind so the reader starts at the beginning
    new_pdf = PdfReader(packet)

    # Context managers guarantee both files are closed, including on error.
    with open(pdf_path, "rb") as source:
        existing_pdf = PdfReader(source)
        output = PdfWriter()
        page = existing_pdf.pages[0]
        page.merge_page(new_pdf.pages[0])
        output.add_page(page)
        with open(output_path, "wb") as output_stream:
            output.write(output_stream)
# Example inputs: source PDF, destination path, the text to draw, and the
# x/y position handed to the drawing call.
pdf_path = '/path/to/input.pdf'
output_path = '/path/to/output_with_text.pdf'
text = 'This is some text.'
x = 50
y = 750
add_text_to_pdf(pdf_path, output_path, text, x, y)

Add Watermark to PDF
from PyPDF2 import PdfReader, PdfWriter
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
def add_watermark(input_pdf, output_pdf, watermark_text):
    """Overlay a diagonal, semi-transparent text watermark on every page.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path the watermarked PDF is written to.
        watermark_text: Text drawn as the watermark.
    """
    # Local import keeps this snippet self-contained.
    import io

    # Render the watermark page in memory rather than to a fixed scratch
    # file, so concurrent runs cannot clobber each other and nothing is
    # left behind on disk.
    buffer = io.BytesIO()
    watermark = canvas.Canvas(buffer, pagesize=letter)
    watermark.saveState()
    watermark.setFont("Helvetica", 80)
    watermark.rotate(45)  # rotate first so the text runs diagonally
    watermark.setFillAlpha(0.1)
    watermark.drawString(150, 300, watermark_text)
    watermark.restoreState()
    watermark.save()
    buffer.seek(0)

    watermark_pdf = PdfReader(buffer)
    watermark_page = watermark_pdf.pages[0]

    # Distinct local names: the path parameters must stay strings so that
    # open(output_pdf, "wb") receives a path, not a PdfWriter.
    writer = PdfWriter()
    with open(input_pdf, "rb") as source:
        reader = PdfReader(source)
        for page in reader.pages:
            page.merge_page(watermark_page)
            writer.add_page(page)
        with open(output_pdf, "wb") as output_stream:
            writer.write(output_stream)
# Example inputs: source PDF, destination path, and the watermark text.
input_pdf = '/path/to/input.pdf'
output_pdf = '/path/to/output_with_watermark.pdf'
watermark_text = 'Confidential'
add_watermark(input_pdf, output_pdf, watermark_text)

Delete PDF Pages
from PyPDF2 import PdfReader, PdfWriter
def remove_pages(input_pdf, output_pdf, page_numbers):
    """Write a copy of a PDF with the listed pages left out.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path the trimmed PDF is written to.
        page_numbers: Iterable of 1-based page numbers to drop.
    """
    # Materialise once: fast membership tests, and safe for one-shot iterables.
    pages_to_drop = set(page_numbers)

    # Distinct local names: the path parameters must stay strings so that
    # open(output_pdf, "wb") receives a path, not a PdfWriter.
    writer = PdfWriter()
    with open(input_pdf, "rb") as source:
        reader = PdfReader(source)
        for i, page in enumerate(reader.pages):
            if i + 1 not in pages_to_drop:
                writer.add_page(page)
        with open(output_pdf, "wb") as output_stream:
            writer.write(output_stream)
# Example inputs: source PDF, destination path, and the pages to leave out.
input_pdf = '/path/to/input.pdf'
output_pdf = '/path/to/output_without_pages.pdf'
page_numbers = [1, 3] # Pages to remove
remove_pages(input_pdf, output_pdf, page_numbers)

Rotate PDF Pages
from PyPDF2 import PdfReader, PdfWriter
def rotate_pages(input_pdf, output_pdf, page_numbers, angle):
    """Write a copy of a PDF with selected pages rotated.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path the rotated PDF is written to.
        page_numbers: Iterable of 1-based page numbers to rotate.
        angle: Rotation passed to each selected page's ``rotate`` call.
    """
    # Materialise once: fast membership tests, and safe for one-shot iterables.
    pages_to_rotate = set(page_numbers)

    # Distinct local names: the path parameters must stay strings so that
    # open(output_pdf, "wb") receives a path, not a PdfWriter.
    writer = PdfWriter()
    with open(input_pdf, "rb") as source:
        reader = PdfReader(source)
        for i, page in enumerate(reader.pages):
            if i + 1 in pages_to_rotate:
                page.rotate(angle)
            # Every page is kept; only the selected ones are rotated.
            writer.add_page(page)
        with open(output_pdf, "wb") as output_stream:
            writer.write(output_stream)
# Example inputs: source PDF, destination path, pages to rotate, and the angle.
input_pdf = '/path/to/input.pdf'
output_pdf = '/path/to/output_rotated.pdf'
page_numbers = [2, 4] # Pages to rotate
angle = 90 # Rotation angle (90, 180, 270)
rotate_pages(input_pdf, output_pdf, page_numbers, angle)

Convert HTML to PDF
from weasyprint import HTML
def html_to_pdf(html_path, output_path):
    """Render an HTML source to a PDF file using WeasyPrint.

    Args:
        html_path: HTML source handed to the ``HTML`` constructor.
        output_path: Path the rendered PDF is written to.
    """
    document = HTML(html_path)
    document.write_pdf(output_path)
# Example inputs: the HTML file to render and the PDF path to write.
html_path = '/path/to/input.html'
output_path = '/path/to/output.pdf'
html_to_pdf(html_path, output_path)

Written by
Test Development Learning Exchange
Test Development Learning Exchange
0 followers
Reader feedback
How this landed with the community
Rate this article
Was this worth your time?
Discussion
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.