Complete Machine Learning Project: Data Collection, Cleaning, Feature Engineering, Model Training, Evaluation, and Deployment
This tutorial walks through a complete machine learning project in Python, covering data collection, cleaning, feature engineering, training linear regression, decision tree, and random forest models, evaluating them with cross‑validation, and finally deploying the best model using joblib.
Goal
Complete a full machine learning project.
Learning Content
Data collection, data cleaning, feature engineering, model training, model evaluation, and model deployment.
Code Example
1. Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

2. Data collection
# Load the Boston housing dataset.
# NOTE: sklearn.datasets.load_boston was deprecated in scikit-learn 1.0 and
# REMOVED in 1.2, so `from sklearn.datasets import load_boston` now raises an
# ImportError. We instead rebuild the identical DataFrame from the original
# CMU StatLib source, exactly as sklearn's deprecation notice recommends.
# The raw file stores each record across two physical lines, hence the
# even/odd-row stitching below.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Original feature names, in the same order load_boston used them.
feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',
                 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
df = pd.DataFrame(
    np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
    columns=feature_names,
)
# Median home value (the regression target), from column 2 of the odd rows.
df['PRICE'] = raw_df.values[1::2, 2]
print(f"示例数据集: \n{df.head()}")

3. Data cleaning
Check missing values
# Tally the null entries in every column of the frame (isna == isnull).
missing_values = df.isna().sum()
print(f"每列的缺失值数量: \n{missing_values}")

Handle missing values (optional)
# If there are missing values, fill with mean
# df.fillna(df.mean(), inplace=True)

Check outliers
# One box-and-whisker per column; points beyond the whiskers flag outliers.
ax = sns.boxplot(data=df)
ax.tick_params(axis='x', labelrotation=90)
plt.show()

4. Feature engineering
Standardize features
# Z-score every feature column (zero mean, unit variance). PRICE is the
# regression target, so it is left unscaled and re-attached afterwards.
scaler = StandardScaler()
X = df.drop(columns='PRICE')
df_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
df_scaled['PRICE'] = df['PRICE']
print(f"标准化后的数据集: \n{df_scaled.head()}")

Create new feature
# Interaction feature: average rooms (RM) times % lower-status population (LSTAT).
df_scaled['RM_LSTAT'] = df_scaled['RM'].mul(df_scaled['LSTAT'])
print(f"创建新特征后的数据集: \n{df_scaled.head()}")

5. Model training
Split dataset
# Reserve 20% of the rows as a held-out test set; the fixed seed makes the
# split reproducible across runs.
X = df_scaled.drop(columns='PRICE')
y = df_scaled['PRICE']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"训练集特征: \n{X_train.head()}")
print(f"测试集特征: \n{X_test.head()}")
print(f"训练集标签: \n{y_train.head()}")
print(f"测试集标签: \n{y_test.head()}")

Train Linear Regression
# Fit an ordinary least-squares regressor on the training split
# (fit() returns the estimator itself, so fitting can be chained).
linear_reg = LinearRegression().fit(X_train, y_train)
# Score on the held-out test split.
y_pred_linear = linear_reg.predict(X_test)
mse_linear = mean_squared_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)
print(f"线性回归模型的均方误差 (MSE): {mse_linear:.2f}")
print(f"线性回归模型的决定系数 (R^2): {r2_linear:.2f}")

Train Decision Tree
# Fit a single CART regressor; the seed pins tie-breaking so runs repeat.
decision_tree = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
# Score on the held-out test split.
y_pred_tree = decision_tree.predict(X_test)
mse_tree = mean_squared_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)
print(f"决策树模型的均方误差 (MSE): {mse_tree:.2f}")
print(f"决策树模型的决定系数 (R^2): {r2_tree:.2f}")

Train Random Forest
# Fit a bagged ensemble of trees — typically the strongest of the three here.
random_forest = RandomForestRegressor(random_state=42).fit(X_train, y_train)
# Score on the held-out test split.
y_pred_forest = random_forest.predict(X_test)
mse_forest = mean_squared_error(y_test, y_pred_forest)
r2_forest = r2_score(y_test, y_pred_forest)
print(f"随机森林模型的均方误差 (MSE): {mse_forest:.2f}")
print(f"随机森林模型的决定系数 (R^2): {r2_forest:.2f}")

6. Model evaluation
Use K‑fold cross‑validation
# Re-score each candidate with 5-fold cross-validation on the full dataset,
# which is far less sensitive to one lucky train/test split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
}
for name, model in models.items():
    # sklearn reports *negative* MSE so that larger is always better;
    # negate to recover the conventional (positive) MSE.
    mse_scores = -cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
    r2_scores = cross_val_score(model, X, y, cv=kf, scoring='r2')
    print(f"{name} 交叉验证的 MSE 评分: {mse_scores}")
    print(f"{name} 交叉验证的平均 MSE: {mse_scores.mean():.2f}")
    print(f"{name} 交叉验证的 R2 评分: {r2_scores}")
print(f"{name} 交叉验证的平均 R2: {r2_scores.mean():.2f}\n")

7. Model deployment
Save the best model
import joblib

# Persist the winning model (the random forest, per the scores above) so it
# can be served later without retraining.
best_model = random_forest
model_path = 'best_model.pkl'
joblib.dump(best_model, model_path)
# Round-trip check: reload from disk and predict with the restored model.
loaded_model = joblib.load(model_path)
y_pred_loaded = loaded_model.predict(X_test)
print(f"加载的模型预测结果: \n{y_pred_loaded[:10]}")

Summary
By completing this exercise you should have gone through the entire machine learning workflow—from data collection and cleaning to feature engineering, model training, evaluation, and deployment—while understanding each step through detailed comments and explanations.
Test Development Learning Exchange
Test Development Learning Exchange
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.