Master Python’s os.walk: Build a Powerful File Management Tool in Minutes
This guide introduces Python’s os.walk function, explains its top‑down and bottom‑up traversal modes, and demonstrates how to list files, filter by extension, enumerate subdirectories, and create a complete log‑organizing script that moves, deduplicates, and records files based on date.
Have you ever wondered how to write a file management program in Python? Thanks to the powerful os.walk function, it’s straightforward. os.walk is a simple, easy‑to‑use generator that traverses a directory tree, enabling efficient handling of file‑system tasks.
1. Basic Introduction
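os.walk() scans a specified directory tree and returns a generator that yields, for each directory it visits, a tuple of the current directory path, the names of its subdirectories, and the names of its files.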
2. Basic Usage
Assume the folder data has the following structure:
2.1 Scan all files
Scanning content:
Subfolders and files
Files within subfolders
Output format:
folder_name/file_name
Scanning direction:
Top‑down (default, topdown=True)
Bottom‑up (topdown=False)
from os import walk
path="data"
for curDir, dirs, files in walk(path):
# for curDir, dirs, files in walk(path, topdown=False):
print("Current directory:", curDir)
print("Subdirectories:", str(dirs))
print("Files:", str(files))
print("*"*20)Top‑down scan result:
Current directory: data
Subdirectories: ['testA', 'testB', 'testC']
Files: ['2020-07-12 - 第一层.xlsx', '2020-07-13 - 第一层.xlsx', '2020-07-14 - 第一层.xlsx']
********************
Current directory: data\testA
Subdirectories: []
Files: ['2020-07-12-A.xlsx', '2020-07-13-A.xlsx', '2020-07-14-A.xlsx']
********************
...

Bottom‑up scan result:
Current directory: data\testA
Subdirectories: []
Files: ['2020-07-12-A.xlsx', '2020-07-13-A.xlsx', '2020-07-14-A.xlsx']
********************
...
Current directory: data
Subdirectories: ['testA', 'testB', 'testC']
Files: ['2020-07-12 - 第一层.xlsx', '2020-07-13 - 第一层.xlsx', '2020-07-14 - 第一层.xlsx']
********************

2.2 Scan and output all file paths
import os
path="data"
for curDir, dirs, files in os.walk(path):
for file in files:
print(os.path.join(curDir, file))Example output:
data\2020-07-12 - 第一层.xlsx
data\2020-07-13 - 第一层.xlsx
...
data\testC\2020-07-14-C.xlsx

2.3 Scan and output all subdirectories
# List all subfolders under a directory
import os
path="data"
for curDir, dirs, files in os.walk(path):
for _dir in dirs:
print(os.path.join(curDir, _dir)) data\testA
data\testB
data\testC

Case code – File organization script based on dates
import pandas as pd
import numpy as np
import os, openpyxl
def move_file(file_path, _new_path, date_xl_str):
    """Move files dated within the requested range out of a tree, delete the rest.

    Every file under ``file_path`` (at any depth) is examined. Its date is
    taken to be the first 10 characters of the part of the file name after
    the last underscore. Files whose date is in ``date_xl_str`` are moved
    flat into ``_new_path``; all other files are deleted. Afterwards every
    subfolder of ``file_path`` is removed, leaving ``file_path`` itself as an
    empty directory.

    Args:
        file_path: Root directory to empty.
        _new_path: Existing directory that receives the matching files.
        date_xl_str: Collection of date strings to keep; membership is
            tested with ``in``.

    Raises:
        OSError: If a file cannot be moved or deleted, or a subfolder cannot
            be removed. A failed move no longer deletes the source file
            unless the failure is a name clash in ``_new_path``.
    """
    for cur_dir, _dirs, files in os.walk(file_path):
        for name in files:
            old_path = os.path.join(cur_dir, name)
            file_date = name.split("_")[-1][:10]
            if file_date not in date_xl_str:
                os.remove(old_path)
                continue
            new_path = os.path.join(_new_path, name)
            try:
                os.rename(old_path, new_path)
            except FileExistsError:
                # Same file name already moved (raised on Windows; POSIX
                # rename silently replaces the target): drop this duplicate.
                os.remove(old_path)

    # Walk bottom-up so each folder is already empty when it is removed, and
    # use rmdir (not removedirs) so nothing above file_path is ever pruned.
    for cur_dir, dirs, _files in os.walk(file_path, topdown=False):
        for sub in dirs:
            os.rmdir(os.path.join(cur_dir, sub))

    # Guarantee the (now empty) source folder exists for the next run.
    os.makedirs(file_path, exist_ok=True)
def qch_date(file_path):
    """Delete duplicate files in a folder and return how many were removed.

    Two files count as duplicates when their names are equal after dropping
    the last 11 characters (presumably a fixed-width suffix plus extension;
    confirm against the real file naming scheme). Names are processed in
    sorted order, so the alphabetically first file of each group is kept.

    Args:
        file_path: Directory whose direct entries are deduplicated.

    Returns:
        The number of files deleted.
    """
    seen = set()
    removed = 0
    for name in sorted(os.listdir(file_path)):
        key = name[:-11]
        if key in seen:
            # os.path.join keeps this working on every platform, unlike a
            # hard-coded backslash separator.
            os.remove(os.path.join(file_path, name))
            removed += 1
        else:
            seen.add(key)
    return removed
def refresh_data(file_path, sheet_name, data):
    """Overwrite one worksheet of an existing workbook with a DataFrame's values.

    The workbook at ``file_path`` is loaded, every existing cell of
    ``sheet_name`` is blanked, and the values of ``data`` are written
    starting at cell A1 (no header or index is added). The workbook is then
    saved back to ``file_path``; other sheets are left as loaded.

    The workbook is saved through openpyxl directly. The previous approach
    of attaching the book to a ``pd.ExcelWriter`` and calling
    ``writer.save()`` does not work on current pandas.

    Args:
        file_path: Path of an existing .xlsx workbook.
        sheet_name: Name of the worksheet to refresh; must already exist.
        data: pandas DataFrame whose values are written row by row.
    """
    book = openpyxl.load_workbook(file_path)
    ws = book[sheet_name]

    # Blank out the old contents so leftover rows do not survive when the
    # new data is smaller than the old.
    for row in ws.iter_rows():
        for cell in row:
            cell.value = None

    row_count, col_count = data.shape
    for i in range(row_count):
        for j in range(col_count):
            # Worksheet coordinates are 1-based, DataFrame positions 0-based.
            ws.cell(row=i + 1, column=j + 1).value = data.iloc[i, j]

    book.save(file_path)
def check_file(file_path, check_file="文件检查.xlsx"):
    """Record the files found in a folder into the check workbook.

    Each file name in ``file_path`` is split on underscores; the parts from
    the third onward are taken as the shop-name and date columns, and the
    date is truncated to its first 10 characters. The table, preceded by a
    row holding the column names, is written to the "数据源" sheet of the
    ``check_file`` workbook via ``refresh_data``.

    Args:
        file_path: Directory whose entries are listed.
        check_file: Path of the workbook that receives the listing.
    """
    column_names = ["店铺名称", "日期"]
    name_parts = [entry.split("_")[2:] for entry in os.listdir(file_path)]
    records = pd.DataFrame(name_parts, columns=column_names)
    records["日期"] = records["日期"].str[:10]

    # refresh_data writes values only, so the header has to be the first row.
    header_row = pd.DataFrame([column_names], columns=column_names)
    sheet_rows = pd.concat([header_row, records], ignore_index=True)

    refresh_data(check_file, "数据源", sheet_rows)
    return None
# Source folder whose files are sorted by date.
file_path = "data"
start_date = input("请输入开始日期:")
end_date = input("请输入结束日期:")
# One "YYYY-MM-DD" string for every day from start_date through end_date.
date_xl_str = [day.strftime("%Y-%m-%d") for day in pd.date_range(start_date, end_date, freq='D')]
# The destination folder is named after the requested date range.
new_path = start_date + "~" + end_date
try:
    os.mkdir(new_path)
except FileExistsError:
    # Only an existing folder is safe to reuse; any other failure (bad name,
    # no permission) must surface instead of being reported as "exists".
    print("文件夹 【%s】 已存在" % new_path)
move_file(file_path, new_path, date_xl_str)
num = qch_date(new_path)
print("去除重复文件 %s 个" % num)
check_file(new_path)

Article source: Python Programming
MaGe Linux Operations
Founded in 2009, MaGe Education is a top Chinese high‑end IT training brand. Its graduates earn 12K+ RMB salaries, and the school has trained tens of thousands of students. It offers high‑pay courses in Linux cloud operations, Python full‑stack, automation, data analysis, AI, and Go high‑concurrency architecture. Thanks to quality courses and a solid reputation, it has talent partnerships with numerous internet firms.
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.
