Comprehensive Guide to Excel Data Processing with Pandas in Python
This article provides a step‑by‑step tutorial for reading, filtering, summarizing, grouping, sorting, comparing, merging, cleaning, reshaping, aggregating, and writing Excel data using Python's pandas library, covering a wide range of common data‑processing tasks with complete code examples.
1. Read Excel file
Read an Excel file using pandas.
import pandas as pd
def read_excel(file_path):
    """Load an Excel workbook into a DataFrame.

    Args:
        file_path: Location of the workbook, passed straight to
            ``pandas.read_excel``.

    Returns:
        The DataFrame parsed with the ``openpyxl`` engine.
    """
    return pd.read_excel(file_path, engine='openpyxl')
if __name__ == "__main__":
file_path = "data.xlsx"
df = read_excel(file_path)
print(df)

2. Filter data
Filter rows where a specific column matches a given value.
import pandas as pd
def filter_data(file_path, column, value):
    """Return the rows of a workbook whose ``column`` equals ``value``.

    Args:
        file_path: Workbook to read with the ``openpyxl`` engine.
        column: Name of the column to test.
        value: Value each kept row must equal in ``column``.

    Returns:
        A DataFrame holding only the matching rows.
    """
    table = pd.read_excel(file_path, engine='openpyxl')
    # Build the boolean mask first so the selection step reads on its own.
    matches = table[column] == value
    return table[matches]
if __name__ == "__main__":
file_path = "data.xlsx"
column = "Column1"
value = "D"
filtered_df = filter_data(file_path, column, value)
print(filtered_df)

3. Summarize data
Calculate the sum of a column.
import pandas as pd
def summarize_data(file_path, column):
    """Sum the values of one column of a workbook.

    Args:
        file_path: Workbook to read with the ``openpyxl`` engine.
        column: Name of the column to add up.

    Returns:
        The result of ``Series.sum()`` for that column.
    """
    values = pd.read_excel(file_path, engine='openpyxl')[column]
    return values.sum()
if __name__ == "__main__":
file_path = "data.xlsx"
column = "Column2"
total = summarize_data(file_path, column)
print(total)

4. Group data
Group by a column and compute the mean of another column.
import pandas as pd
def group_data(file_path, by, agg):
    """Compute the per-group mean of one column.

    Args:
        file_path: Workbook to read with the ``openpyxl`` engine.
        by: Column (or columns) whose values define the groups.
        agg: Column whose values are averaged within each group.

    Returns:
        The ``groupby(by)[agg].mean()`` result, indexed by group key.
    """
    table = pd.read_excel(file_path, engine='openpyxl')
    groups = table.groupby(by)
    return groups[agg].mean()
if __name__ == "__main__":
file_path = "data.xlsx"
by = "Column1"
agg = "Column2"
grouped = group_data(file_path, by, agg)
print(grouped)

5. Sort data
Sort the DataFrame by a column in ascending order.
import pandas as pd
def sort_data(file_path, column, ascending=True):
    """Return the workbook's rows ordered by one column.

    Args:
        file_path: Workbook to read with the ``openpyxl`` engine.
        column: Column (or columns) to sort by.
        ascending: Sort direction forwarded to ``sort_values``;
            defaults to ascending order.

    Returns:
        A sorted DataFrame; the original row labels are kept.
    """
    table = pd.read_excel(file_path, engine='openpyxl')
    return table.sort_values(column, ascending=ascending)
if __name__ == "__main__":
file_path = "data.xlsx"
column = "Column2"
sorted_df = sort_data(file_path, column)
print(sorted_df)

6. Compare data
Compare two Excel files and show differences in a specific column.
import pandas as pd
def compare_data(file_path1, file_path2, on):
    """Return the rows whose key appears in only one of two workbooks.

    Both workbooks are outer-merged on ``on`` with a ``_merge`` indicator
    column, and every row marked ``'both'`` is dropped.

    Args:
        file_path1: First (left) workbook.
        file_path2: Second (right) workbook.
        on: Key column(s) used for the merge.

    Returns:
        The merged rows that are not present on both sides, including the
        ``_merge`` indicator column.
    """
    left = pd.read_excel(file_path1, engine='openpyxl')
    right = pd.read_excel(file_path2, engine='openpyxl')
    combined = left.merge(right, on=on, how='outer', indicator=True)
    # Rows flagged 'both' matched on each side; everything else is a difference.
    one_sided = combined['_merge'] != 'both'
    return combined[one_sided]
if __name__ == "__main__":
file_path1 = "data1.xlsx"
file_path2 = "data2.xlsx"
on = "Column1"
diff = compare_data(file_path1, file_path2, on)
print(diff)

7. Merge data
Merge rows from two Excel files that share the same key column.
import pandas as pd
def merge_data(file_path1, file_path2, on):
    """Outer-merge two workbooks on a shared key.

    Args:
        file_path1: First (left) workbook.
        file_path2: Second (right) workbook.
        on: Key column(s) present in both workbooks.

    Returns:
        The outer merge of the two tables, so keys found in only one
        workbook are kept as well.
    """
    left = pd.read_excel(file_path1, engine='openpyxl')
    right = pd.read_excel(file_path2, engine='openpyxl')
    return left.merge(right, on=on, how='outer')
if __name__ == "__main__":
file_path1 = "data1.xlsx"
file_path2 = "data2.xlsx"
on = "Column1"
merged = merge_data(file_path1, file_path2, on)
print(merged)

8. Merge sheets
Combine all sheets of a single Excel workbook into one DataFrame.
import pandas as pd
def merge_sheets(file_path):
    """Stack every worksheet of a workbook into one DataFrame.

    Args:
        file_path: Workbook to read with the ``openpyxl`` engine.

    Returns:
        All sheets concatenated row-wise with a fresh 0..n-1 index.
    """
    # sheet_name=None makes read_excel return a mapping of sheet name -> DataFrame.
    workbook = pd.read_excel(file_path, sheet_name=None, engine='openpyxl')
    frames = list(workbook.values())
    return pd.concat(frames, ignore_index=True)
if __name__ == "__main__":
file_path = "data.xlsx"
combined = merge_sheets(file_path)
print(combined)

9. Merge multiple Excel files
Merge all Excel files matching a pattern into a single DataFrame.
import glob
import os

import pandas as pd
def merge_multiple_files(file_pattern):
    """Concatenate every Excel file matching a glob pattern.

    Args:
        file_pattern: Glob pattern such as ``"*.xlsx"``.

    Returns:
        One DataFrame with the rows of all matched workbooks stacked in
        sorted-path order and re-indexed 0..n-1. An empty DataFrame is
        returned when nothing matches.
    """
    # glob returns paths in arbitrary, OS-dependent order; sort so the row
    # order of the result is reproducible. Names starting with "~$" are the
    # lock files Excel creates next to an open workbook -- they match
    # "*.xlsx" but are not readable workbooks, so skip them.
    files = sorted(
        path
        for path in glob.glob(file_pattern)
        if not os.path.basename(path).startswith("~$")
    )
    if not files:
        # pd.concat([]) raises "No objects to concatenate"; an empty frame
        # is a more useful result for "no input files".
        return pd.DataFrame()
    frames = [pd.read_excel(path, engine='openpyxl') for path in files]
    return pd.concat(frames, ignore_index=True)
if __name__ == "__main__":
file_pattern = "*.xlsx"
combined = merge_multiple_files(file_pattern)
print(combined)

10. Fill missing values
Replace missing values in a column with a specified value.
import pandas as pd
def fill_missing_values(file_path, column, value):
    """Replace missing entries of one column with a fixed value.

    Args:
        file_path: Workbook to read with the ``openpyxl`` engine.
        column: Name of the column whose missing values are filled.
        value: Replacement used for every missing entry.

    Returns:
        The full DataFrame with ``column`` filled.
    """
    df = pd.read_excel(file_path, engine='openpyxl')
    # Assign the result back instead of calling fillna(inplace=True) on
    # df[column]: that form mutates an intermediate object, is deprecated,
    # and leaves df unchanged when pandas Copy-on-Write is active.
    df[column] = df[column].fillna(value)
    return df
if __name__ == "__main__":
file_path = "data.xlsx"
column = "Column2"
value = 0
df = fill_missing_values(file_path, column, value)
print(df)

11. Clean data
Remove whitespace from a column using a regular expression.
import pandas as pd
def clean_data(file_path, column, regex):
    """Delete every match of a regular expression from a text column.

    Args:
        file_path: Workbook to read with the ``openpyxl`` engine.
        column: Name of the string column to clean.
        regex: Regular-expression pattern; each match is replaced with
            the empty string.

    Returns:
        The full DataFrame with ``column`` cleaned.
    """
    df = pd.read_excel(file_path, engine='openpyxl')
    # regex=True is required: Series.str.replace treats the pattern as a
    # literal string by default (pandas >= 2.0), so r'\s+' would match
    # nothing and the column would come back unchanged.
    df[column] = df[column].str.replace(regex, '', regex=True)
    return df
if __name__ == "__main__":
file_path = "data.xlsx"
column = "Column1"
regex = r'\s+'
df = clean_data(file_path, column, regex)
print(df)

12. Pivot table
Create a pivot table that aggregates values by a categorical column.
import pandas as pd
def pivot_table(file_path, index, columns, values):
    """Build a sum-aggregated pivot table from a workbook.

    Args:
        file_path: Workbook to read with the ``openpyxl`` engine.
        index: Column(s) that become the pivot's row labels.
        columns: Column(s) whose values become the pivot's column labels.
        values: Column(s) summed inside each cell.

    Returns:
        The pivoted DataFrame produced with ``aggfunc='sum'``.
    """
    source = pd.read_excel(file_path, engine='openpyxl')
    return source.pivot_table(
        index=index,
        columns=columns,
        values=values,
        aggfunc='sum',
    )
if __name__ == "__main__":
file_path = "data.xlsx"
index = "Column1"
columns = "Column2"
values = "Column3"
pivoted = pivot_table(file_path, index, columns, values)
print(pivoted)

13. Reshape data (melt)
Transform selected columns into a long format.
import pandas as pd
def melt_data(file_path, id_vars, value_vars):
    """Unpivot selected columns of a workbook into long format.

    Args:
        file_path: Workbook to read with the ``openpyxl`` engine.
        id_vars: Column(s) kept as identifiers on every output row.
        value_vars: Column(s) folded into variable/value pairs.

    Returns:
        The melted (long-format) DataFrame.
    """
    wide = pd.read_excel(file_path, engine='openpyxl')
    return wide.melt(id_vars=id_vars, value_vars=value_vars)
if __name__ == "__main__":
file_path = "data.xlsx"
id_vars = "Column1"
value_vars = ["Column2", "Column3"]
melted = melt_data(file_path, id_vars, value_vars)
print(melted)

14. Aggregate data
Group by a column and compute sum and mean for other columns.
import pandas as pd
def aggregate_data(file_path, group_by, agg):
    """Group a workbook's rows and apply an aggregation spec.

    Args:
        file_path: Workbook to read with the ``openpyxl`` engine.
        group_by: Column(s) whose values define the groups.
        agg: Aggregation spec handed to ``GroupBy.agg``, for example a
            mapping of column name to function name.

    Returns:
        The aggregated DataFrame indexed by group key.
    """
    table = pd.read_excel(file_path, engine='openpyxl')
    groups = table.groupby(group_by)
    return groups.agg(agg)
if __name__ == "__main__":
file_path = "data.xlsx"
group_by = "Column1"
agg = {"Column2": "sum", "Column3": "mean"}
aggregated = aggregate_data(file_path, group_by, agg)
print(aggregated)

15. Describe data
Generate descriptive statistics for selected columns.
import pandas as pd
def describe_data(file_path, columns):
    """Produce descriptive statistics for selected columns.

    Args:
        file_path: Workbook to read with the ``openpyxl`` engine.
        columns: Column selection applied before ``describe()``.

    Returns:
        The output of ``describe()`` on the selected columns.
    """
    table = pd.read_excel(file_path, engine='openpyxl')
    selected = table[columns]
    return selected.describe()
if __name__ == "__main__":
file_path = "data.xlsx"
columns = ["Column2", "Column3"]
stats = describe_data(file_path, columns)
print(stats)

16. Convert data types
Change the data type of a column to a specified type.
import pandas as pd
def convert_types(file_path, column, dtype):
    """Cast one column of a workbook to another dtype.

    Args:
        file_path: Workbook to read with the ``openpyxl`` engine.
        column: Name of the column to convert.
        dtype: Target type passed to ``Series.astype``.

    Returns:
        The full DataFrame with ``column`` converted.
    """
    table = pd.read_excel(file_path, engine='openpyxl')
    converted = table[column].astype(dtype)
    table[column] = converted
    return table
if __name__ == "__main__":
file_path = "data.xlsx"
column = "Column2"
dtype = int
df = convert_types(file_path, column, dtype)
print(df)

17. Slice data
Select a range of rows by index.
import pandas as pd
def slice_data(file_path, start, stop):
    """Select a contiguous range of rows from a workbook.

    Args:
        file_path: Workbook to read with the ``openpyxl`` engine.
        start: Lower bound of the slice.
        stop: Upper bound of the slice.

    Returns:
        The rows selected by ``df[start:stop]``.
    """
    table = pd.read_excel(file_path, engine='openpyxl')
    # An explicit slice object is exactly what table[start:stop] builds.
    row_range = slice(start, stop)
    return table[row_range]
if __name__ == "__main__":
file_path = "data.xlsx"
start = 0
stop = 5
sliced_df = slice_data(file_path, start, stop)
print(sliced_df)

18. Rename columns
Rename a column to a new name.
import pandas as pd
def rename_columns(file_path, old_name, new_name):
    """Rename a single column of a workbook.

    Args:
        file_path: Workbook to read with the ``openpyxl`` engine.
        old_name: Current column label.
        new_name: Label to use instead.

    Returns:
        The DataFrame with the column renamed; a label that does not
        exist is left alone, as ``DataFrame.rename`` ignores it.
    """
    table = pd.read_excel(file_path, engine='openpyxl')
    mapping = {old_name: new_name}
    return table.rename(columns=mapping)
if __name__ == "__main__":
file_path = "data.xlsx"
old_name = "Column1"
new_name = "NewColumn1"
df = rename_columns(file_path, old_name, new_name)
print(df)

19. Remove duplicates
Drop duplicate rows based on a subset of columns.
import pandas as pd
def remove_duplicates(file_path, subset=None):
    """Drop repeated rows from a workbook.

    Args:
        file_path: Workbook to read with the ``openpyxl`` engine.
        subset: Column labels that decide whether two rows are
            duplicates; ``None`` is forwarded unchanged to
            ``drop_duplicates``.

    Returns:
        The DataFrame returned by ``drop_duplicates``.
    """
    table = pd.read_excel(file_path, engine='openpyxl')
    return table.drop_duplicates(subset=subset)
if __name__ == "__main__":
file_path = "data.xlsx"
subset = ["Column1"]
unique_df = remove_duplicates(file_path, subset)
print(unique_df)

20. Write Excel file
Save a DataFrame back to an Excel file.
import pandas as pd
def write_excel(df, file_path):
    """Save a DataFrame to an Excel workbook without its index column.

    Args:
        df: DataFrame to export.
        file_path: Destination path of the workbook.
    """
    export_options = {'index': False, 'engine': 'openpyxl'}
    df.to_excel(file_path, **export_options)
if __name__ == "__main__":
df = pd.DataFrame({'Column1': ['A', 'B', 'C'], 'Column2': [10, 20, 30]})
file_path = "output.xlsx"
write_excel(df, file_path)

Signed-in readers can open the original source through BestHub's protected redirect.
This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contact us and we will review it promptly.
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.
