Complete Machine Learning Project: Data Collection, Cleaning, Feature Engineering, Model Training, Evaluation, and Deployment
This tutorial walks through a complete machine learning project in Python, covering data collection, cleaning, feature engineering, training linear regression, decision tree, and random forest models, evaluating them with cross‑validation, and finally deploying the best model using joblib.
Goal
Complete a full machine learning project.
Learning Content
Data collection, data cleaning, feature engineering, model training, model evaluation, and model deployment.
Code Example
1. Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

2. Data collection
# Load the Boston housing dataset.
# NOTE: sklearn.datasets.load_boston was deprecated in scikit-learn 1.0 and
# REMOVED in 1.2, so `from sklearn.datasets import load_boston` now raises an
# ImportError. We instead rebuild the identical DataFrame from the original
# CMU StatLib source, exactly as sklearn's deprecation notice recommends.
# The raw file stores each record across two physical lines, hence the
# even/odd-row stitching below.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Original feature names, in the same order load_boston used them.
feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',
                 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
df = pd.DataFrame(
    np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
    columns=feature_names,
)
# Median home value (the regression target), from column 2 of the odd rows.
df['PRICE'] = raw_df.values[1::2, 2]
print(f"示例数据集: \n{df.head()}")

3. Data cleaning
Check missing values
# Tally the null entries in every column of the frame (isna == isnull).
missing_values = df.isna().sum()
print(f"每列的缺失值数量: \n{missing_values}")

Handle missing values (optional)
# If there are missing values, fill with mean
# df.fillna(df.mean(), inplace=True)

Check outliers
# One box-and-whisker per column; points beyond the whiskers flag outliers.
ax = sns.boxplot(data=df)
ax.tick_params(axis='x', labelrotation=90)
plt.show()

4. Feature engineering
Standardize features
# Z-score every feature column (zero mean, unit variance). PRICE is the
# regression target, so it is left unscaled and re-attached afterwards.
scaler = StandardScaler()
X = df.drop(columns='PRICE')
df_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
df_scaled['PRICE'] = df['PRICE']
print(f"标准化后的数据集: \n{df_scaled.head()}")

Create new feature
# Interaction feature: average rooms (RM) times % lower-status population (LSTAT).
df_scaled['RM_LSTAT'] = df_scaled['RM'].mul(df_scaled['LSTAT'])
print(f"创建新特征后的数据集: \n{df_scaled.head()}")

5. Model training
Split dataset
# Reserve 20% of the rows as a held-out test set; the fixed seed makes the
# split reproducible across runs.
X = df_scaled.drop(columns='PRICE')
y = df_scaled['PRICE']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"训练集特征: \n{X_train.head()}")
print(f"测试集特征: \n{X_test.head()}")
print(f"训练集标签: \n{y_train.head()}")
print(f"测试集标签: \n{y_test.head()}")

Train Linear Regression
# Fit an ordinary least-squares regressor on the training split
# (fit() returns the estimator itself, so fitting can be chained).
linear_reg = LinearRegression().fit(X_train, y_train)
# Score on the held-out test split.
y_pred_linear = linear_reg.predict(X_test)
mse_linear = mean_squared_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)
print(f"线性回归模型的均方误差 (MSE): {mse_linear:.2f}")
print(f"线性回归模型的决定系数 (R^2): {r2_linear:.2f}")

Train Decision Tree
# Fit a single CART regressor; the seed pins tie-breaking so runs repeat.
decision_tree = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
# Score on the held-out test split.
y_pred_tree = decision_tree.predict(X_test)
mse_tree = mean_squared_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)
print(f"决策树模型的均方误差 (MSE): {mse_tree:.2f}")
print(f"决策树模型的决定系数 (R^2): {r2_tree:.2f}")

Train Random Forest
# Fit a bagged ensemble of trees — typically the strongest of the three here.
random_forest = RandomForestRegressor(random_state=42).fit(X_train, y_train)
# Score on the held-out test split.
y_pred_forest = random_forest.predict(X_test)
mse_forest = mean_squared_error(y_test, y_pred_forest)
r2_forest = r2_score(y_test, y_pred_forest)
print(f"随机森林模型的均方误差 (MSE): {mse_forest:.2f}")
print(f"随机森林模型的决定系数 (R^2): {r2_forest:.2f}")

6. Model evaluation
Use K‑fold cross‑validation
# Re-score each candidate with 5-fold cross-validation on the full dataset,
# which is far less sensitive to one lucky train/test split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
}
for name, model in models.items():
    # sklearn reports *negative* MSE so that larger is always better;
    # negate to recover the conventional (positive) MSE.
    mse_scores = -cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
    r2_scores = cross_val_score(model, X, y, cv=kf, scoring='r2')
    print(f"{name} 交叉验证的 MSE 评分: {mse_scores}")
    print(f"{name} 交叉验证的平均 MSE: {mse_scores.mean():.2f}")
    print(f"{name} 交叉验证的 R2 评分: {r2_scores}")
print(f"{name} 交叉验证的平均 R2: {r2_scores.mean():.2f}\n")

7. Model deployment
Save the best model
import joblib

# Persist the winning model (the random forest, per the scores above) so it
# can be served later without retraining.
best_model = random_forest
model_path = 'best_model.pkl'
joblib.dump(best_model, model_path)
# Round-trip check: reload from disk and predict with the restored model.
loaded_model = joblib.load(model_path)
y_pred_loaded = loaded_model.predict(X_test)
print(f"加载的模型预测结果: \n{y_pred_loaded[:10]}")

Summary
By completing this exercise you should have gone through the entire machine learning workflow—from data collection and cleaning to feature engineering, model training, evaluation, and deployment—while understanding each step through detailed comments and explanations.
Test Development Learning Exchange
Test Development Learning Exchange
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.