End-to-End House Prices Prediction Project: Data Collection, Preprocessing, Modeling, Evaluation, and Deployment with Python
This tutorial walks through a complete house price prediction project, covering data collection from Kaggle, preprocessing with pandas and scikit‑learn, model training using RandomForestRegressor, evaluation, and deployment of a Flask API for real‑time predictions, providing full code examples.
This guide demonstrates how to complete a full data‑science workflow for predicting house prices, starting from acquiring the Kaggle dataset, cleaning and engineering features, training a RandomForest model, evaluating its performance, and finally exposing the model via a Flask API.
Data Collection
The Kaggle "House Prices" dataset is downloaded and the train.csv and test.csv files are placed in a data directory.
Data Preprocessing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
import joblib

# Load the Kaggle "House Prices" data (train.csv / test.csv under data/).
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Quick look at the raw data.
print("训练数据前几行:")
print(train_data.head())
print("测试数据前几行:")
print(test_data.head())

# Separate features and target; the Kaggle test set has no SalePrice column.
X_train = train_data.drop(columns=['SalePrice'])
y_train = train_data['SalePrice']
X_test = test_data

# Show the 10 columns with the most missing values.
print("训练数据缺失值数量:")
print(X_train.isnull().sum().sort_values(ascending=False).head(10))

# Identify numeric and categorical feature columns.
# BUG FIX: np.object was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin `object` selects the same (string/object) dtype columns.
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_train.select_dtypes(include=[object]).columns.tolist()

# Numeric pipeline: median imputation, then standardization.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
# Categorical pipeline: fill missing with a sentinel, then one-hot encode;
# handle_unknown='ignore' keeps inference from failing on unseen categories.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit on the training data only (avoids leakage), then transform both splits.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Persist the fitted preprocessor (needed again at serving time) and the
# transformed arrays. NOTE(review): with one-hot encoding the output may be a
# scipy sparse matrix, which np.save stores as a pickled object array —
# confirm downstream loaders pass allow_pickle=True or densify first.
joblib.dump(preprocessor, 'models/preprocessor.pkl')
np.save('data/X_train_preprocessed.npy', X_train_preprocessed)
np.save('data/y_train.npy', y_train)
np.save('data/X_test_preprocessed.npy', X_test_preprocessed)
print("数据预处理完成,已保存到文件")
Modeling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Base estimator; the fixed seed makes the search reproducible.
rf = RandomForestRegressor(random_state=42)

# Hyperparameter candidates to search exhaustively.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# 5-fold cross-validated grid search, minimizing MSE, on all CPU cores.
search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
search.fit(X_train_preprocessed, y_train)

# GridSearchCV refits the winning configuration on the full training set.
best_model = search.best_estimator_
print(f"最佳模型参数: {search.best_params_}")

# Save the tuned model for the evaluation and deployment steps.
joblib.dump(best_model, 'models/best_model.pkl')
print("模型训练完成,已保存到文件")
Evaluation
# Evaluate on the training set (an optimistic estimate, since the model
# was fit on this same data).
train_pred = best_model.predict(X_train_preprocessed)
rmse = np.sqrt(mean_squared_error(y_train, train_pred))
print(f"训练集上的 RMSE: {rmse}")

# Score the Kaggle test split with the tuned model.
test_pred = best_model.predict(X_test_preprocessed)

# Build a Kaggle-style submission file (Id + predicted SalePrice).
submission = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': test_pred})
submission.to_csv('data/submission.csv', index=False)
print("测试集预测结果已保存到 submission.csv")
print("测试集预测结果已保存到 submission.csv")
Deployment
from flask import Flask, request, jsonify
import joblib
import numpy as np
# BUG FIX: pd.DataFrame is used in predict() but pandas was never imported,
# which made every request fail with a NameError.
import pandas as pd

app = Flask(__name__)

# Load the fitted preprocessor and tuned model once at startup.
preprocessor = joblib.load('models/preprocessor.pkl')
model = joblib.load('models/best_model.pkl')

@app.route('/predict', methods=['POST'])
def predict():
    """Predict the sale price for a single house.

    Expects a JSON object mapping feature names to values.
    Returns {"prediction": <float>} on success, or
    {"error": <message>} with HTTP 400 on failure.
    """
    try:
        data = request.json
        # Wrap the single record in a one-row DataFrame so the
        # ColumnTransformer can select its input columns by name.
        input_data = pd.DataFrame([data])
        input_data_preprocessed = preprocessor.transform(input_data)
        prediction = model.predict(input_data_preprocessed)
        # BUG FIX: prediction[0] is a numpy scalar, which jsonify cannot
        # serialize on current Flask versions — cast to a plain float.
        return jsonify({'prediction': float(prediction[0])})
    except Exception as e:
        # Return 400 so clients can distinguish bad input from success
        # (the original returned the error payload with HTTP 200).
        return jsonify({'error': str(e)}), 400
if __name__ == '__main__':
    app.run(debug=True)
Run the Flask app by saving the above code to app.py and executing python app.py. Test the API with a curl command such as:
curl -X POST http://127.0.0.1:5000/predict -H "Content-Type: application/json" -d '{"OverallQual":7,"GrLivArea":1710,"GarageCars":2,"TotalBsmtSF":856,"FullBath":2,"YearBuilt":1976,"LotArea":9375,"YearRemodAdd":1976,"1stFlrSF":856,"TotRmsAbvGrd":6,"Fireplaces":1,"BsmtFinSF1":706,"GarageArea":548,"WoodDeckSF":0,"OpenPorchSF":61,"MoSold":2,"YrSold":2010,"OverallCond":5,"MasVnrArea":196,"BsmtUnfSF":150,"HalfBath":1,"BsmtFullBath":1,"BsmtHalfBath":0,"2ndFlrSF":854,"LowQualFinSF":0,"BsmtFinSF2":0,"EnclosedPorch":0,"3SsnPorch":0,"ScreenPorch":0,"PoolArea":0,"MiscVal":0,"LotFrontage":65,"Alley":"NA","MasVnrType":"BrkFace","ExterQual":"Gd","ExterCond":"TA","Foundation":"PConc","BsmtQual":"Gd","BsmtCond":"TA","BsmtExposure":"No","BsmtFinType1":"GLQ","BsmtFinType2":"Unf","HeatingQC":"Ex","CentralAir":"Y","Electrical":"SBrkr","KitchenQual":"TA","Functional":"Typ","FireplaceQu":"TA","GarageType":"Attchd","GarageFinish":"RFn","GarageQual":"TA","GarageCond":"TA","PavedDrive":"Y","Fence":"NA","MiscFeature":"NA","MSZoning":"RL","Street":"Pave","LotShape":"Reg","LandContour":"Lvl","Utilities":"AllPub","LotConfig":"Inside","LandSlope":"Gtl","Neighborhood":"CollgCr","Condition1":"Norm","Condition2":"Norm","BldgType":"1Fam","HouseStyle":"1Story","RoofStyle":"Gable","RoofMatl":"CompShg","Exterior1st":"VinylSd","Exterior2nd":"VinylSd","Heating":"GasA","SaleType":"WD","SaleCondition":"Normal"}'
By following these steps you have completed an end-to-end data-science pipeline from raw data to a deployable prediction service. (Note: the example payload previously repeated many keys, e.g. MasVnrType and ExterQual, with identical values; duplicates have been removed — in JSON the last occurrence silently wins.)
Test Development Learning Exchange
Test Development Learning Exchange
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.