Fundamentals 20 min read

Master Bulk Excel Automation with Python: Merge, Filter, Format & More

Learn a collection of Python scripts that automate common Excel operations—including merging multiple files, find‑and‑replace, filtering rows, adding formulas, adjusting formats, extracting images, and encrypting workbooks—providing step‑by‑step code examples for efficient data processing.

Test Development Learning Exchange

Feb 6, 2025

Master Bulk Excel Automation with Python: Merge, Filter, Format & More

Bulk Merging Multiple Excel Files

This script scans a directory for all .xlsx files, reads each into a pandas DataFrame, concatenates them, and writes the combined data to a new workbook.

import pandas as pd
import os

def merge_excel_files(directory, output_file):
    """Stack the rows of every ``.xlsx`` workbook in *directory* into one file.

    Each workbook is read with ``pd.read_excel`` (its default sheet) and the
    resulting frames are concatenated in file-name order with a fresh index.

    Args:
        directory: Folder that is scanned (non-recursively) for ``.xlsx`` files.
        output_file: Path of the workbook the combined rows are written to.
    """
    # Sorted so the row order of the output does not depend on the arbitrary
    # order in which the operating system lists the directory.
    excel_files = sorted(
        f for f in os.listdir(directory)
        # "~$name.xlsx" is the lock file Excel leaves next to an open
        # workbook; it is not a readable workbook and would abort the merge.
        if f.endswith('.xlsx') and not f.startswith('~$')
    )

    frames = []
    for file_name in excel_files:
        print(f"Processing: {file_name}")
        frames.append(pd.read_excel(os.path.join(directory, file_name)))

    # Concatenate once: growing a DataFrame inside the loop re-copies every
    # previously merged row for each additional file.
    # pd.concat rejects an empty list, so keep writing an empty sheet when the
    # directory holds no workbooks.
    all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    all_data.to_excel(output_file, index=False)
    print(f"Merging completed, result saved to: {output_file}")

# Usage example
merge_excel_files('input_directory', 'merged_output.xlsx')

Find and Replace Content in Excel

Using openpyxl, this function iterates every cell in every worksheet, replaces a target string with a new one, and saves the modified workbook.

from openpyxl import load_workbook

def find_and_replace_in_excel(file_path, search_text, replace_text):
    """Replace *search_text* with *replace_text* in every text cell of a workbook.

    All worksheets are scanned. Only cells whose value is a string are
    modified; numbers, dates and other non-text values are left untouched so
    their type is preserved. The result is saved in the current working
    directory as ``replaced_<original file name>``; the input file itself is
    not modified.

    Args:
        file_path: Path of the workbook to read.
        search_text: Substring to look for.
        replace_text: Text that replaces each occurrence of *search_text*.
    """
    wb = load_workbook(file_path)
    for sheet in wb.worksheets:
        print(f"Processing sheet: {sheet.title}")
        for row in sheet.iter_rows():
            for cell in row:
                value = cell.value
                # Non-string values (int, float, datetime, ...) have no
                # text-style ``replace``; matching on ``str(value)`` and then
                # calling ``value.replace`` used to raise for such cells.
                if not isinstance(value, str) or not value:
                    continue
                if search_text not in value:
                    continue
                cell.value = value.replace(search_text, replace_text)
                print(f"In cell {cell.coordinate} found '{search_text}', replaced with '{replace_text}'")
    wb.save('replaced_' + os.path.basename(file_path))
    print("Find and replace completed")

# Usage example
find_and_replace_in_excel('example.xlsx', 'old text', 'new text')

Filter Data by Condition and Save

Read an Excel file with pandas, keep rows where a specified column exceeds a threshold, and write the result to a new file.

import pandas as pd

def filter_excel_data(input_file, output_file, column_name, threshold):
    """Write the rows whose *column_name* value is greater than *threshold*.

    Args:
        input_file: Workbook to read.
        output_file: Workbook the surviving rows are written to (no index column).
        column_name: Column the comparison is applied to.
        threshold: Rows are kept only when their value is strictly greater.
    """
    source = pd.read_excel(input_file)
    above_threshold = source[column_name].gt(threshold)
    source.loc[above_threshold].to_excel(output_file, index=False)
    print(f"Filtering completed, results saved to: {output_file}")

# Usage example
filter_excel_data('data.xlsx', 'filtered_data.xlsx', 'Age', 30)

Add Formula to a Cell

Loads a workbook, writes a formula string into a target cell, and saves the modified file.

from openpyxl import load_workbook

def add_formula_to_excel(file_path, formula, target_cell):
    """Write *formula* into *target_cell* of the active sheet and save a copy.

    The modified workbook is saved in the current working directory as
    ``formulated_<original file name>``.

    Args:
        file_path: Workbook to load.
        formula: Formula text to store, e.g. ``'=SUM(A1:A10)'``.
        target_cell: Cell reference such as ``'B1'``.
    """
    workbook = load_workbook(file_path)
    active_sheet = workbook.active
    active_sheet[target_cell] = formula
    print(f"Formula '{formula}' added to cell {target_cell}")

    output_name = 'formulated_' + os.path.basename(file_path)
    workbook.save(output_name)

# Usage example
add_formula_to_excel('example.xlsx', '=SUM(A1:A10)', 'B1')

Batch Adjust Excel Formatting

Applies a red, bold Arial font and center alignment to every cell in the active worksheet.

from openpyxl import load_workbook
from openpyxl.styles import Font, Alignment

def format_excel_file(file_path):
    """Apply one uniform font and alignment to every cell of the active sheet.

    Every cell gets red, bold, 12-point Arial text, centred horizontally and
    vertically. The result is saved in the current working directory as
    ``formatted_<original file name>``.

    Args:
        file_path: Workbook to load.
    """
    wb = load_workbook(file_path)
    ws = wb.active

    # The style is the same for every cell, so build the objects once instead
    # of allocating a new Font and Alignment per cell. openpyxl style objects
    # can be shared between cells.
    font = Font(name='Arial', size=12, bold=True, color='FF0000')
    alignment = Alignment(horizontal='center', vertical='center')

    for row in ws.iter_rows():
        for cell in row:
            cell.font = font
            cell.alignment = alignment
    wb.save('formatted_' + os.path.basename(file_path))
    print("Formatting completed")

# Usage example
format_excel_file('example.xlsx')

Batch Rename Worksheets

Prepends a user‑defined prefix to each worksheet title in a workbook.

from openpyxl import load_workbook

def rename_sheets(file_path, prefix):
    """Prefix the title of every worksheet with ``<prefix>_`` and save a copy.

    The renamed workbook is saved in the current working directory as
    ``renamed_<original file name>``.

    Args:
        file_path: Workbook to load.
        prefix: Text placed, followed by an underscore, before each sheet title.
    """
    workbook = load_workbook(file_path)
    for worksheet in workbook.worksheets:
        previous_title = worksheet.title
        requested_title = f"{prefix}_{previous_title}"
        worksheet.title = requested_title
        print(f"Worksheet '{previous_title}' renamed to: {requested_title}")

    output_name = 'renamed_' + os.path.basename(file_path)
    workbook.save(output_name)
    print("All worksheets renamed")

# Usage example
rename_sheets('example.xlsx', 'New')

Extract Specific Columns from Multiple Files

Iterates over all Excel files in a folder, selects given columns, concatenates them, and writes the combined result.

import pandas as pd
import os

def extract_columns_from_multiple_files(directory, columns_to_extract, output_file):
    """Collect selected columns from every ``.xlsx`` file in *directory*.

    Each workbook is read with ``pd.read_excel``, reduced to
    *columns_to_extract*, and the pieces are concatenated in file-name order
    with a fresh index.

    Args:
        directory: Folder scanned (non-recursively) for ``.xlsx`` files.
        columns_to_extract: List of column labels to keep from each workbook.
        output_file: Path of the workbook the combined data is written to.

    Raises:
        KeyError: If a workbook lacks one of the requested columns.
    """
    frames = []
    # Sorted so the output row order does not depend on directory listing order.
    for filename in sorted(os.listdir(directory)):
        # "~$name.xlsx" is Excel's lock file for an open workbook, not a
        # readable workbook.
        if not filename.endswith('.xlsx') or filename.startswith('~$'):
            continue
        df = pd.read_excel(os.path.join(directory, filename))
        frames.append(df[columns_to_extract])

    # Concatenate once instead of re-copying the accumulated rows per file.
    # pd.concat rejects an empty list, so fall back to an empty frame when
    # no workbook was found.
    all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    all_data.to_excel(output_file, index=False)
    print(f"Data extraction completed, result saved to: {output_file}")

# Usage example
extract_columns_from_multiple_files('input_directory', ['Name', 'Age'], 'extracted_output.xlsx')

Insert New Row or Column

Depending on the dimension argument, inserts a row or column at the specified index.

from openpyxl import load_workbook

def insert_row_or_column(file_path, location, dimension='row'):
    """Insert an empty row or column into the active sheet and save a copy.

    Any *dimension* other than ``'row'`` or ``'column'`` inserts nothing, but
    the workbook copy is still written. The copy is saved in the current
    working directory as ``inserted_<original file name>``.

    Args:
        file_path: Workbook to load.
        location: 1-based index before which the row or column is inserted.
        dimension: ``'row'`` (default) or ``'column'``.
    """
    workbook = load_workbook(file_path)
    sheet = workbook.active

    if dimension == 'row':
        insert = sheet.insert_rows
    elif dimension == 'column':
        insert = sheet.insert_cols
    else:
        insert = None

    if insert is not None:
        insert(location)
        print(f"Inserted a {dimension} before {dimension} {location}")

    workbook.save('inserted_' + os.path.basename(file_path))

# Usage examples
insert_row_or_column('example.xlsx', 5, 'row')
insert_row_or_column('example.xlsx', 3, 'column')

Remove Duplicate Rows

Loads a workbook with pandas, drops duplicate rows (optionally based on a subset of columns), and saves the cleaned data.

import pandas as pd

def remove_duplicates(input_file, output_file, subset=None):
    """Drop duplicate rows from a workbook and write the remainder.

    Args:
        input_file: Workbook to read.
        output_file: Workbook the de-duplicated rows are written to (no index).
        subset: Optional list of column labels used to decide whether two rows
            are duplicates; ``None`` compares all columns.
    """
    unique_rows = pd.read_excel(input_file).drop_duplicates(subset=subset)
    unique_rows.to_excel(output_file, index=False)
    print(f"Duplicate removal completed, result saved to: {output_file}")

# Usage example
remove_duplicates('data_with_duplicates.xlsx', 'cleaned_data.xlsx', subset=['Name', 'Email'])

Copy a Worksheet Across Workbooks

Copies the contents of a specified sheet from a source workbook into a new sheet of a target workbook.

from openpyxl import load_workbook

def copy_sheet(source_file, target_file, sheet_name):
    """Copy the cell values of one sheet into a new sheet of another workbook.

    Only values are copied; the target workbook is saved in place. When
    *sheet_name* is missing from the source workbook a message is printed and
    nothing is written.

    Args:
        source_file: Workbook that contains the sheet to copy.
        target_file: Workbook that receives the new sheet and is overwritten.
        sheet_name: Title of the sheet to copy; also requested as the title of
            the new sheet.
    """
    src_book = load_workbook(source_file)
    dst_book = load_workbook(target_file)

    if sheet_name not in src_book.sheetnames:
        print(f"Sheet '{sheet_name}' not found in source file")
        return

    src_sheet = src_book[sheet_name]
    dst_sheet = dst_book.create_sheet(title=sheet_name)
    for cells in src_sheet.iter_rows():
        for src_cell in cells:
            dst_sheet[src_cell.coordinate].value = src_cell.value

    dst_book.save(target_file)
    print(f"Sheet '{sheet_name}' copied from {source_file} to {target_file}")

# Usage example
copy_sheet('source.xlsx', 'target.xlsx', 'Sheet1')

Adjust Column Width Automatically

Calculates the maximum string length in each column and sets the column width accordingly.

from openpyxl import load_workbook

def adjust_column_width(file_path):
    """Size each column of the active sheet to fit its longest cell text.

    The width is set to the length of the longest ``str(value)`` in the column
    plus two characters of padding. Empty cells are ignored, so a column with
    no content gets a width of 2. The result is saved in the current working
    directory as ``adjusted_<original file name>``.

    Args:
        file_path: Workbook to load.
    """
    wb = load_workbook(file_path)
    ws = wb.active
    for column in ws.columns:
        column_letter = column[0].column_letter
        # Skip empty cells: str(None) is the 4-character text "None", which
        # previously inflated the width of sparse or empty columns.
        max_length = max(
            (len(str(cell.value)) for cell in column if cell.value is not None),
            default=0,
        )
        ws.column_dimensions[column_letter].width = max_length + 2
    wb.save('adjusted_' + os.path.basename(file_path))
    print("All column widths adjusted")

# Usage example
adjust_column_width('example.xlsx')

Create a Pivot Table

Generates a pivot table that aggregates values by specified index columns using pandas.

import pandas as pd

def create_pivot_table(input_file, output_file, index_columns, values_columns):
    """Sum *values_columns* grouped by *index_columns* and write the pivot table.

    Args:
        input_file: Workbook to read.
        output_file: Workbook the pivot table is written to (index included).
        index_columns: Column label(s) that form the pivot table's rows.
        values_columns: Column label(s) whose values are summed.
    """
    source = pd.read_excel(input_file)
    summary = source.pivot_table(
        values=values_columns,
        index=index_columns,
        aggfunc='sum',
    )
    summary.to_excel(output_file)
    print(f"Pivot table generated and saved to: {output_file}")

# Usage example
create_pivot_table('sales_data.xlsx', 'pivot_output.xlsx', ['Region'], ['Sales'])

Add a Bar Chart to a Worksheet

Creates a bar chart with titles and data range, then inserts it at cell E5.

from openpyxl import load_workbook
from openpyxl.chart import BarChart, Reference

def add_chart_to_excel(file_path, sheet_name, chart_title, x_axis_title, y_axis_title, data_range):
    """Add a bar chart to a worksheet at cell E5 and save a copy of the workbook.

    The copy is saved in the current working directory as
    ``chart_added_<original file name>``.

    Args:
        file_path: Workbook to load.
        sheet_name: Title of the worksheet that holds the data and receives
            the chart.
        chart_title: Title shown on the chart.
        x_axis_title: Title of the x axis.
        y_axis_title: Title of the y axis.
        data_range: Dict with the keys ``min_col``, ``min_row``, ``max_col``
            and ``max_row`` (the data cells, whose first row supplies the
            series titles) plus ``category_col``, ``category_min_row`` and
            ``category_max_row`` (the category labels).
    """
    workbook = load_workbook(file_path)
    sheet = workbook[sheet_name]

    bar_chart = BarChart()
    bar_chart.title = chart_title
    bar_chart.x_axis.title = x_axis_title
    bar_chart.y_axis.title = y_axis_title

    series_cells = Reference(
        sheet,
        min_col=data_range['min_col'],
        min_row=data_range['min_row'],
        max_col=data_range['max_col'],
        max_row=data_range['max_row'],
    )
    label_cells = Reference(
        sheet,
        min_col=data_range['category_col'],
        min_row=data_range['category_min_row'],
        max_row=data_range['category_max_row'],
    )
    bar_chart.add_data(series_cells, titles_from_data=True)
    bar_chart.set_categories(label_cells)

    sheet.add_chart(bar_chart, "E5")
    workbook.save('chart_added_' + os.path.basename(file_path))
    print("Chart addition completed")

# Usage example
add_chart_to_excel('example.xlsx', 'Sheet1', 'Sales Statistics', 'Product', 'Sales',
                   {'min_col': 1, 'min_row': 2, 'max_col': 2, 'max_row': 7,
                    'category_col': 1, 'category_min_row': 2, 'category_max_row': 7})

Merge Adjacent Cells with Identical Content

Scans each column, merges consecutive cells that share the same value, and saves the workbook.

from openpyxl import load_workbook
from openpyxl.utils import get_column_letter

def merge_cells_with_same_content(file_path):
    """Merge vertical runs of equal, non-empty cells in the active sheet.

    Each column is scanned top to bottom; two or more consecutive cells with
    the same value are merged into one range. Runs of empty cells are left
    alone. The result is saved in the current working directory as
    ``merged_<original file name>``.

    Args:
        file_path: Workbook to load.
    """
    wb = load_workbook(file_path)
    ws = wb.active

    # Read the sheet bounds once; the properties are otherwise re-evaluated
    # for every cell visited.
    max_row = ws.max_row
    max_column = ws.max_column

    for col in range(1, max_column + 1):
        letter = get_column_letter(col)
        values = [ws.cell(row=r, column=col).value for r in range(1, max_row + 1)]

        run_start = 1
        # ``row`` walks one past the last row so the final run is closed too.
        for row in range(2, max_row + 2):
            if row <= max_row and values[row - 1] == values[row - 2]:
                continue  # still inside the current run
            run_length = row - run_start
            # Skip runs of empty cells: they carry no content to merge, and
            # cells already covered by a merged range also read as empty, so
            # merging them could create overlapping ranges.
            if run_length > 1 and values[run_start - 1] is not None:
                ws.merge_cells(f"{letter}{run_start}:{letter}{row - 1}")
            run_start = row

    wb.save('merged_' + os.path.basename(file_path))
    print("Merging same-content cells completed")

# Usage example
merge_cells_with_same_content('example.xlsx')

Extract Images Embedded in an Excel File

Iterates over the worksheet's image objects, saves each as a PNG file in the specified output directory.

from openpyxl import load_workbook
from openpyxl.drawing.image import Image
import os

def extract_images_from_excel(file_path, output_dir):
    """Write the images embedded in the active sheet to *output_dir*.

    Files are named ``image_<anchor row>`` with an extension matching the
    image data (``.png``, ``.jpg`` or ``.gif``). When several images share an
    anchor row, the later ones get a ``_<n>`` suffix instead of overwriting
    the first. *output_dir* is created if it does not exist.

    Args:
        file_path: Workbook to load.
        output_dir: Directory that receives the image files.
    """
    wb = load_workbook(file_path, keep_vba=True)
    ws = wb.active
    os.makedirs(output_dir, exist_ok=True)

    # Leading bytes that identify the formats Excel commonly embeds.
    signatures = ((b'\x89PNG', '.png'), (b'\xff\xd8', '.jpg'), (b'GIF8', '.gif'))
    used_names = set()

    # NOTE(review): ``_images`` and ``ref`` are openpyxl internals (the
    # original snippet already relied on them) - re-check on openpyxl upgrades.
    for index, img in enumerate(ws._images, start=1):
        ref = img.ref
        if hasattr(ref, 'read'):
            # Images loaded from a workbook keep their bytes in a file-like
            # object; the openpyxl Image wrapper has no ``save`` method, so
            # the data is copied out directly.
            ref.seek(0)
            data = ref.read()
        else:
            data = img._data()

        extension = next(
            (ext for magic, ext in signatures if data.startswith(magic)),
            '.' + str(getattr(img, 'format', 'png')),
        )

        # Not every anchor type carries a ``_from`` marker; fall back to the
        # running index in that case.
        marker = getattr(img.anchor, '_from', None)
        stem = f"image_{marker.row if marker is not None else index}"
        name = stem + extension
        if name in used_names:
            name = f"{stem}_{index}{extension}"
        used_names.add(name)

        image_path = os.path.join(output_dir, name)
        with open(image_path, 'wb') as handle:
            handle.write(data)
        print(f"Image saved to: {image_path}")

# Usage example
extract_images_from_excel('example_with_images.xlsx', 'output_images')

Batch Convert Date Formats

Converts a date column from one string format to another using pandas' datetime utilities.

import pandas as pd

def convert_date_format(input_file, output_file, date_column, current_format, target_format):
    """Rewrite a date column from one text format to another.

    Args:
        input_file: Workbook to read.
        output_file: Workbook the converted data is written to (no index).
        date_column: Label of the column holding the dates.
        current_format: ``strftime``-style format the column is parsed with.
        target_format: ``strftime``-style format the dates are rendered in.
    """
    frame = pd.read_excel(input_file)
    parsed_dates = pd.to_datetime(frame[date_column], format=current_format)
    frame[date_column] = parsed_dates.dt.strftime(target_format)
    frame.to_excel(output_file, index=False)
    print(f"Date format conversion completed, result saved to: {output_file}")

# Usage example
convert_date_format('dates.xlsx', 'formatted_dates.xlsx', 'Date', '%Y-%m-%d', '%d/%m/%Y')

Split an Excel File by a Condition

Separates rows that match a given value in a specific column from those that do not, saving each subset to its own workbook.

import pandas as pd

def split_excel_by_condition(input_file, column_name, condition_value):
    """Split a workbook into rows that equal a value and rows that do not.

    Matching rows go to ``matching_<condition_value>.xlsx`` and the rest to
    ``non_matching.xlsx``, both in the current working directory and without
    an index column.

    Args:
        input_file: Workbook to read.
        column_name: Column compared against *condition_value*.
        condition_value: Value a row must equal to count as matching.
    """
    frame = pd.read_excel(input_file)
    column = frame[column_name]
    matching_path = f'matching_{condition_value}.xlsx'

    frame[column.eq(condition_value)].to_excel(matching_path, index=False)
    frame[column.ne(condition_value)].to_excel('non_matching.xlsx', index=False)

    print(f"Data matching condition saved to: {matching_path}")
    print("Data not matching condition saved to: non_matching.xlsx")

# Usage example
split_excel_by_condition('data.xlsx', 'Category', 'A')

Add a Text Watermark (Conceptual)

This example shows how to create a text drawing with openpyxl and place it on the worksheet. In practice, additional adjustments may be required, or a library like Pillow might be used for more complex watermarks.

from openpyxl import load_workbook
from openpyxl.drawing.text import TextBlock, Paragraph, ParagraphProperties, CharacterProperties
from openpyxl.drawing.spreadsheet_drawing import OneCellAnchor, AnchorMarker
from openpyxl.utils.units import cm_to_EMU
import os

def add_watermark(file_path, watermark_text):
    """Conceptual sketch: place *watermark_text* on the active sheet as a drawing.

    Loads the workbook, builds a text drawing and an anchor, attaches them to
    the active worksheet and saves the result in the current working directory
    as ``watermarked_<original file name>``.

    NOTE(review): the surrounding article labels this example "conceptual".
    The ``TextBlock(...)`` call, the ``anchorRow``/``anchorCol`` keywords of
    ``OneCellAnchor`` and ``ws.add_drawing`` do not appear to match openpyxl's
    public API - verify against the installed openpyxl version before relying
    on this function.

    Args:
        file_path: Workbook to load.
        watermark_text: Text to display as the watermark.
    """
    wb = load_workbook(file_path)
    ws = wb.active
    # Text drawing: both paragraphs use character properties with sz=400.
    drawing = TextBlock(
        Paragraph(paragraphProperties=ParagraphProperties(defRPr=CharacterProperties(sz=400))),
        [Paragraph(watermark_text, paragraphProperties=ParagraphProperties(defRPr=CharacterProperties(sz=400)))]
    )
    # Anchor values are hard-coded: row/column 5, a 2 cm x 2 cm "_from" marker
    # and a 10 cm x 1 cm "ext" marker, all converted to EMU.
    anchor = OneCellAnchor(anchorRow=5, anchorCol=5,
                          _from=AnchorMarker(x=cm_to_EMU(2), y=cm_to_EMU(2)),
                          ext=AnchorMarker(x=cm_to_EMU(10), y=cm_to_EMU(1)))
    ws.add_drawing(drawing, anchor)
    # Save under a new name so the input workbook is left untouched.
    wb.save('watermarked_' + os.path.basename(file_path))
    print("Watermark addition completed")

# Usage example
add_watermark('example.xlsx', 'CONFIDENTIAL')

Batch Encrypt Excel Workbooks

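Applies a password to each workbook in a directory by setting the WorkbookProtection object. Note that this locks the workbook structure (adding, deleting, moving, or renaming sheets) behind the password; it does not encrypt the file contents, so the data itself remains readable.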

from openpyxl import load_workbook
from openpyxl.workbook.protection import WorkbookProtection
import os

def encrypt_excel_files(directory, password):
    """Password-protect the structure of every ``.xlsx`` workbook in *directory*.

    Each workbook gets a ``WorkbookProtection`` with ``lockStructure=True`` and
    is saved next to the original as ``encrypted_<file name>``. Despite the
    function name, this only sets workbook structure protection; the file
    contents are not encrypted.

    Files whose name already starts with ``encrypted_`` are skipped so that
    running the function again on the same directory does not produce
    ``encrypted_encrypted_...`` copies.

    Args:
        directory: Folder scanned (non-recursively) for ``.xlsx`` files.
        password: Password assigned to the workbook protection.
    """
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.xlsx'):
            continue
        # Skip our own earlier output and Excel's "~$" lock files, which are
        # not readable workbooks.
        if filename.startswith(('encrypted_', '~$')):
            continue
        file_path = os.path.join(directory, filename)
        wb = load_workbook(file_path)
        wb.security = WorkbookProtection(workbookPassword=password, lockStructure=True)
        encrypted_filename = f'encrypted_{filename}'
        wb.save(os.path.join(directory, encrypted_filename))
        print(f"{filename} encrypted and saved as {encrypted_filename}")

# Usage example
encrypt_excel_files('input_directory', 'your_password')

Update Hyperlinks Across a Workbook

Searches for hyperlinks that start with a given old URL and replaces the domain with a new one.

from openpyxl import load_workbook
from openpyxl.worksheet.hyperlink import Hyperlink

def update_hyperlinks(file_path, old_url, new_url):
    """Re-point hyperlinks that start with *old_url* to *new_url*.

    Every worksheet is scanned. For each hyperlink whose target begins with
    *old_url*, that leading part is replaced by *new_url*; the rest of the
    target and the link's other attributes (tooltip, display text) are kept.
    The result is saved in the current working directory as
    ``updated_links_<original file name>``.

    Args:
        file_path: Workbook to load.
        old_url: URL prefix to look for.
        new_url: Replacement for the matched prefix.
    """
    wb = load_workbook(file_path)
    for sheet in wb.worksheets:
        for row in sheet.iter_rows():
            for cell in row:
                link = cell.hyperlink
                if link is None:
                    continue
                target = link.target
                # Links that point inside the workbook have no external
                # target; calling ``startswith`` on None used to crash here.
                if not target or not target.startswith(old_url):
                    continue
                # Swap only the matched prefix; ``str.replace`` would also
                # rewrite later occurrences of old_url (e.g. in a query string).
                # Updating the existing link in place keeps its tooltip and
                # display text, which a freshly built Hyperlink would drop.
                link.target = new_url + target[len(old_url):]
                print(f"Hyperlink in cell {cell.coordinate} updated from {old_url} to {new_url}")
    wb.save('updated_links_' + os.path.basename(file_path))
    print("Hyperlink update completed")

# Usage example
update_hyperlinks('example.xlsx', 'http://oldwebsite.com', 'http://newwebsite.com')

These snippets collectively provide a practical toolbox for automating repetitive Excel tasks, improving productivity, and ensuring consistency across large datasets.

Original Source

Signed-in readers can open the original source through BestHub's protected redirect.

Republication Notice

This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contact us and we will review it promptly.

Python Automation Excel Pandas openpyxl DataProcessing

Written by

Test Development Learning Exchange

0 followers

Reader feedback

How this landed with the community

Rate this article

Was this worth your time?

Discussion

0 Comments

Thoughtful readers leave field notes, pushback, and hard-won operational detail here.